diff --git a/mk/features.mk b/mk/features.mk index 285f62f..26a8ad5 100644 --- a/mk/features.mk +++ b/mk/features.mk @@ -32,6 +32,10 @@ SRCS = $(SRC_DIR)/main.c \ $(SRC_DIR)/seccomp-notify.c \ $(SRC_DIR)/shadow-fd.c \ $(SRC_DIR)/seccomp-dispatch.c \ + $(SRC_DIR)/dispatch-net.c \ + $(SRC_DIR)/dispatch-id.c \ + $(SRC_DIR)/dispatch-exec.c \ + $(SRC_DIR)/dispatch-misc.c \ $(SRC_DIR)/seccomp-supervisor.c \ $(SRC_DIR)/net-slirp.c \ $(SRC_DIR)/web-telemetry.c \ diff --git a/scripts/pre-commit.hook b/scripts/pre-commit.hook index 12da2b7..72a3d3b 100755 --- a/scripts/pre-commit.hook +++ b/scripts/pre-commit.hook @@ -178,6 +178,13 @@ build_cppcheck_suppressions() { "invalidFunctionArg:src/loader-launch.c" "invalidFunctionArg:src/loader-stack.c" "memleak:src/loader-stack.c" + "invalidFunctionArg:src/dispatch-exec.c" + "nullPointerArithmeticOutOfMemory:src/dispatch-exec.c" + "syntaxError:include/kbox/compiler.h" + "syntaxError:src/loader-transfer.c" + "invalidFunctionArg:src/rewrite.c" + "knownConditionTrueFalse" + "usleepCalled" ) local out="--inline-suppr " @@ -501,29 +508,55 @@ re_threadsafe_ok='(strtok_r|asctime_r|ctime_r|gmtime_r|localtime_r)' if [ ${#C_FILES_CHANGED[@]} -gt 0 ] && [ -s "$DIFF_CACHE" ]; then current_file="" sec_reported="" + sec_in_block=0 while IFS= read -r line; do if [[ $line == "+++ b/"* ]]; then current_file="${line#'+++ b/'}" + sec_in_block=0 + elif [[ -n "$current_file" && $line == " "* ]]; then + raw_context="${line:1}" + sanitized=$(strip_strings_and_comments "$raw_context" "$sec_in_block") + sec_in_block=$(printf '%s\n' "$sanitized" | sed -n '1p') elif [[ -n "$current_file" && $line == "+"* && $line != "+++"* ]]; then - added="${line:1}" + raw_added="${line:1}" + sanitized=$(strip_strings_and_comments "$raw_added" "$sec_in_block") + sec_in_block=$(printf '%s\n' "$sanitized" | sed -n '1p') + added=$(printf '%s\n' "$sanitized" | sed -n '2p') # 8a. Non-literal format string (per-family argument position). 
+ # Uses raw_added (not comment-stripped) so string literals + # remain visible. If a printf-family call ends with a + # trailing comma, the format literal is on the next line; + # only skip that specific pattern (not arbitrary commas). if ! is_reported "$sec_reported" "$current_file:fmtstr"; then flagged=0 + trimmed="${raw_added%%+([[:space:]])}" # printf/vprintf: format is arg 1 - if [[ $added =~ $re_printf_call ]] && ! [[ $added =~ $re_printf_lit ]]; then - flagged=1 + if [[ $raw_added =~ $re_printf_call ]] && ! [[ $raw_added =~ $re_printf_lit ]]; then + if [[ "$trimmed" == *, || "$trimmed" == *\\ ]]; then + : # trailing comma: literal on next line + else + flagged=1 + fi fi # fprintf/dprintf/vfprintf: format is arg 2 - if [[ $flagged -eq 0 && $added =~ $re_fprintf_call ]] && ! [[ $added =~ $re_fprintf_lit ]]; then - flagged=1 + if [[ $flagged -eq 0 && $raw_added =~ $re_fprintf_call ]] && ! [[ $raw_added =~ $re_fprintf_lit ]]; then + if [[ "$trimmed" == *, || "$trimmed" == *\\ ]]; then + : # trailing comma: literal on next line + else + flagged=1 + fi fi # snprintf/vsnprintf: format is arg 3 - if [[ $flagged -eq 0 && $added =~ $re_snprintf_call ]] && ! [[ $added =~ $re_snprintf_lit ]]; then - flagged=1 + if [[ $flagged -eq 0 && $raw_added =~ $re_snprintf_call ]] && ! [[ $raw_added =~ $re_snprintf_lit ]]; then + if [[ "$trimmed" == *, || "$trimmed" == *\\ ]]; then + : # trailing comma: literal on next line + else + flagged=1 + fi fi # Allow suppression comment. - if [[ $flagged -eq 1 ]] && [[ $added =~ $re_format_ok ]]; then + if [[ $flagged -eq 1 ]] && [[ $raw_added =~ $re_format_ok ]]; then flagged=0 fi if [[ $flagged -eq 1 ]]; then diff --git a/src/dispatch-exec.c b/src/dispatch-exec.c new file mode 100644 index 0000000..938a704 --- /dev/null +++ b/src/dispatch-exec.c @@ -0,0 +1,974 @@ +/* SPDX-License-Identifier: MIT */ + +/* Exec, mmap, mprotect, and clone3 handlers for the seccomp dispatch engine. 
+ * + * trap_userspace_exec performs in-process binary replacement for trap/rewrite + * mode. forward_execve handles both seccomp-unotify pathname rewriting and + * trap-mode userspace exec. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dispatch-internal.h" +#include "kbox/elf.h" +#include "kbox/path.h" +#include "loader-launch.h" +#include "procmem.h" +#include "rewrite.h" +#include "shadow-fd.h" +#include "syscall-nr.h" +#include "syscall-trap-signal.h" +#include "syscall-trap.h" + +/* AT_EMPTY_PATH flag for execveat: indicates fexecve() usage. Defined here to + * avoid pulling in the full linux/fcntl.h. + */ +#define KBOX_AT_EMPTY_PATH 0x1000 + +/* Load biases for the userspace ELF loader. Must match + * prepare_userspace_launch. The loader places main and interpreter ELFs at + * these fixed virtual addresses, and the stack just below stack_top. + */ +#define KBOX_EXEC_MAIN_LOAD_BIAS 0x600000000000ULL +#define KBOX_EXEC_INTERP_LOAD_BIAS 0x610000000000ULL +#define KBOX_EXEC_STACK_TOP 0x700000010000ULL + +/* Alternate stack region for userspace re-exec. During re-exec SIGSYS handler + * is running on the old guest stack, so we cannot unmap it until after + * transferring to the new binary. Place the new stack at a different address; + * the old stack region is reclaimed by the subsequent munmap in + * teardown_old_guest_mappings during the NEXT re-exec. + */ +#define KBOX_EXEC_REEXEC_STACK_TOP 0x6F0000010000ULL + +/* Maximum entries in argv or envp for userspace exec. */ +#define KBOX_EXEC_MAX_ARGS 4096 + +/* Track which stack region is in use by the current guest. The initial launch + * uses KBOX_EXEC_STACK_TOP; re-exec alternates between the two addresses. The + * signal handler runs on the current guest's stack, so we must not unmap it + * during re-exec. 
+ */ +static uint64_t reexec_current_stack_top; + +/* mmap dispatch: if the FD is a virtual FD with no host shadow, create the + * shadow on demand (lazy shadow) and inject it into the tracee at the same FD + * number, then CONTINUE so the host kernel mmaps the real fd. + * + * Lazy shadow creation avoids the memfd_create + file-copy cost at every open. + * The shadow is only materialized when the guest actually mmaps. + */ +struct kbox_dispatch forward_mmap(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + /* W^X enforcement for mmap in trap/rewrite mode. */ + if (request_uses_trap_signals(req)) { + int prot = (int) kbox_syscall_request_arg(req, 2); + if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC)) { + if (ctx->verbose) + fprintf(stderr, + "kbox: mmap denied: " + "W^X violation (prot=0x%x, pid=%u)\n", + prot, kbox_syscall_request_pid(req)); + return kbox_dispatch_errno(EACCES); + } + } + + long fd = to_dirfd_arg(kbox_syscall_request_arg(req, 4)); + + if (fd == -1) + return kbox_dispatch_continue(); + + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + if (lkl_fd >= 0) { + long host = kbox_fd_table_get_host_fd(ctx->fd_table, fd); + if (host == -1) { + /* Only create lazy shadows for read-only/private mappings. + * Writable MAP_SHARED mappings on LKL files cannot be supported + * via memfd (writes would go to the copy, not LKL). + */ + int mmap_flags = (int) kbox_syscall_request_arg(req, 3); + int mmap_prot = (int) kbox_syscall_request_arg(req, 2); + if ((mmap_flags & MAP_SHARED) && (mmap_prot & PROT_WRITE)) + return kbox_dispatch_errno(ENODEV); + + int memfd = kbox_shadow_create(ctx->sysnrs, lkl_fd); + if (memfd < 0) + return kbox_dispatch_errno(ENODEV); + kbox_shadow_seal(memfd); + int injected = request_addfd_at(ctx, req, memfd, (int) fd, 0); + if (injected < 0) { + close(memfd); + return kbox_dispatch_errno(ENODEV); + } + /* Mark that a shadow was injected so repeated mmaps do not + * re-create it. 
Use -2 as a sentinel: host_fd >= 0 means + * "supervisor-owned shadow fd" (closed on remove). host_fd == -2 + * means "tracee-owned shadow, don't close in supervisor." + * fd_table_remove only closes host_fd when host_fd >= 0 AND + * shadow_sp < 0, so -2 is safe. + */ + kbox_fd_table_set_host_fd(ctx->fd_table, fd, + KBOX_FD_HOST_SAME_FD_SHADOW); + { + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + if (entry) + entry->shadow_sp = memfd; + } + } + } + + return kbox_dispatch_continue(); +} + +/* W^X enforcement for mprotect in trap/rewrite mode. + * + * Reject simultaneous PROT_WRITE|PROT_EXEC to prevent JIT spray attacks. + * On none->X transitions, scan the page for syscall/sysenter/SVC instructions + * and add them to the origin map for rewrite-mode caller validation. + * + * In seccomp mode, this is a no-op: CONTINUE lets the host kernel handle it. + */ +struct kbox_dispatch forward_mprotect(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + uint64_t addr = kbox_syscall_request_arg(req, 0); + uint64_t len = kbox_syscall_request_arg(req, 1); + int prot = (int) kbox_syscall_request_arg(req, 2); + + /* In seccomp mode (supervisor), just pass through. */ + if (!request_uses_trap_signals(req)) + return kbox_dispatch_continue(); + + /* W^X enforcement: reject PROT_WRITE | PROT_EXEC. */ + if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC)) { + if (ctx->verbose) + fprintf(stderr, + "kbox: mprotect denied: W^X violation at 0x%llx len=%llu " + "(pid=%u)\n", + (unsigned long long) addr, (unsigned long long) len, + kbox_syscall_request_pid(req)); + return kbox_dispatch_errno(EACCES); + } + + /* Allow the mprotect to proceed via host kernel. If the page transitions + * to PROT_EXEC, JIT code on it will take the Tier 1 (RET_TRAP) slow path + * because it won't be in the BPF allow ranges. This is safe: un-rewritten + * syscall instructions in JIT pages are caught by the SIGSYS handler. 
+ * + * Full scan-on-X (rewriting JIT pages at mprotect time) is a future + * optimization: it would promote JIT pages from Tier 1 (~3us) to Tier 2 + * (~41ns) but requires synchronous instruction scanning while the page + * is still writable, which adds latency to the mprotect call. + */ + return kbox_dispatch_continue(); +} + +/* clone3 namespace-flag sanitization. */ + +/* CLONE_NEW* flags that clone3 can smuggle in via clone_args.flags. The BPF + * deny-list blocks unshare/setns, but clone3 bypasses it unless we check here. + */ +struct kbox_dispatch forward_clone3(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + uint64_t flags; + int rc; + + /* clone3(struct clone_args *args, size_t size). flags is the first uint64_t + * field in clone_args. We only need to read the first 8 bytes. + */ + rc = + guest_mem_read(ctx, kbox_syscall_request_pid(req), + kbox_syscall_request_arg(req, 0), &flags, sizeof(flags)); + if (rc < 0) { + /* Can't read tracee memory; fail closed with EPERM. + * + * CONTINUE is unsafe here: a tracee can clear dumpability via + * prctl(PR_SET_DUMPABLE, 0), causing process_vm_readv to fail with + * EPERM. If we CONTINUE, clone3 reaches host kernel with unchecked + * namespace flags: a sandbox escape. Returning EPERM is the only safe + * option. + */ + if (ctx->verbose) + fprintf(stderr, + "kbox: clone3 denied: cannot read clone_args " + "(pid=%u, rc=%d)\n", + kbox_syscall_request_pid(req), rc); + return kbox_dispatch_errno(EPERM); + } + + if (flags & CLONE_NEW_MASK) { + if (ctx->verbose) + fprintf(stderr, + "kbox: clone3 denied: namespace flags 0x%llx " + "(pid=%u)\n", + (unsigned long long) (flags & CLONE_NEW_MASK), + kbox_syscall_request_pid(req)); + return kbox_dispatch_errno(EPERM); + } + + /* In trap/rewrite mode, block thread creation (CLONE_THREAD). + * Multi-threaded guests require --syscall-mode=seccomp. 
+ */ + if ((flags & CLONE_THREAD) && request_uses_trap_signals(req)) { + if (ctx->verbose) + fprintf(stderr, + "kbox: clone3 denied: CLONE_THREAD in trap/rewrite mode " + "(pid=%u, use --syscall-mode=seccomp)\n", + kbox_syscall_request_pid(req)); + return kbox_dispatch_errno(EPERM); + } + + return kbox_dispatch_continue(); +} + +/* Safely count a null-terminated pointer array in guest address space. + * Uses process_vm_readv to avoid SIGSEGV on bad guest pointers. + * Returns the count (not including the final NULL), or -EFAULT on bad memory. + */ +static long count_user_ptrs_safe(uint64_t arr_addr, size_t max_count) +{ + size_t n = 0; + uint64_t ptr; + + if (arr_addr == 0) + return -EFAULT; + + while (n < max_count) { + uint64_t offset, probe_addr; + int rc; + if (__builtin_mul_overflow((uint64_t) n, sizeof(uint64_t), &offset) || + __builtin_add_overflow(arr_addr, offset, &probe_addr)) + return -EFAULT; + rc = kbox_current_read(probe_addr, &ptr, sizeof(ptr)); + if (rc < 0) + return -EFAULT; + if (ptr == 0) + return (long) n; + n++; + } + + return -E2BIG; +} + +/* Safely measure the length of a guest string. + * Returns the length (not including NUL), or -EFAULT on bad memory. + */ +static long strlen_user_safe(uint64_t str_addr) +{ + char buf[256]; + size_t total = 0; + + if (str_addr == 0) + return -EFAULT; + + for (;;) { + int rc = kbox_current_read(str_addr + total, buf, sizeof(buf)); + if (rc < 0) + return -EFAULT; + for (size_t i = 0; i < sizeof(buf); i++) { + if (buf[i] == '\0') + return (long) (total + i); + } + total += sizeof(buf); + if (total > (size_t) (256 * 1024)) + return -ENAMETOOLONG; + } +} + +/* Safely read a single guest pointer (8 bytes). */ +static int read_user_ptr(uint64_t addr, uint64_t *out) +{ + return kbox_current_read(addr, out, sizeof(*out)); +} + +/* Safely copy a guest string into a destination buffer. + * Returns the string length (not including NUL), or -EFAULT. 
+ */ +static long copy_user_string(uint64_t str_addr, char *dst, size_t dst_size) +{ + return kbox_current_read_string(str_addr, dst, dst_size); +} + +/* Tear down old guest code/data mappings and the stale stack at the + * new stack address. The current guest stack (which the SIGSYS + * handler is running on) is at the OTHER address and left alone. + * It leaks one stack-sized region until the next re-exec cycle. + */ +static void teardown_old_guest_mappings(uint64_t new_stack_top) +{ + /* Main binary region: up to 256 MB from the load bias. */ + munmap((void *) (uintptr_t) KBOX_EXEC_MAIN_LOAD_BIAS, 256UL * 1024 * 1024); + /* Interpreter region: up to 256 MB from the load bias. */ + munmap((void *) (uintptr_t) KBOX_EXEC_INTERP_LOAD_BIAS, + 256UL * 1024 * 1024); + /* Unmap any stale stack at the new stack address. On the first + * re-exec (new = REEXEC), this is a no-op (nothing mapped there). + * On the second re-exec (new = STACK_TOP), this unmaps the + * initial launch stack. Subsequent cycles alternate and reclaim. + */ + munmap((void *) (uintptr_t) (new_stack_top - 16UL * 1024 * 1024), + 16UL * 1024 * 1024 + 0x10000UL); +} + +/* Perform userspace exec for trap mode. Called from inside the SIGSYS + * handler when the guest calls execve/execveat. This replaces the + * current process image without a real exec syscall, preserving the + * SIGSYS handler and seccomp filter chain. + * + * The function is noreturn on success: it transfers control to the new + * binary's entry point. On failure, it returns a dispatch with errno. 
+ */ +static struct kbox_dispatch trap_userspace_exec( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + int exec_memfd, + const char *pathname, + int is_execveat) +{ + unsigned char *elf_buf = NULL; + size_t elf_buf_len = 0; + char interp_path[256]; + int interp_memfd = -1; + int ilen = 0; + struct kbox_loader_launch_spec spec; + struct kbox_loader_launch launch = {0}; + struct kbox_syscall_trap_ip_range ranges[KBOX_LOADER_MAX_MAPPINGS]; + struct kbox_loader_exec_range exec_ranges[KBOX_LOADER_MAX_MAPPINGS]; + size_t exec_count = 0; + size_t range_count = 0; + unsigned char random_bytes[KBOX_LOADER_RANDOM_SIZE]; + + /* execve(path, argv, envp): argv=args[1], envp=args[2] + * execveat(dirfd, path, argv, envp, flags): argv=args[2], envp=args[3] + * + * In trap mode these are guest pointers in our address space, but still + * guest-controlled. All accesses must use safe reads (process_vm_readv) + * so bad pointers yield -EFAULT instead of crashing the SIGSYS handler. + */ + uint64_t argv_addr = kbox_syscall_request_arg(req, is_execveat ? 2 : 1); + uint64_t envp_addr = kbox_syscall_request_arg(req, is_execveat ? 3 : 2); + long argc_long = count_user_ptrs_safe(argv_addr, KBOX_EXEC_MAX_ARGS); + long envc_long = count_user_ptrs_safe(envp_addr, KBOX_EXEC_MAX_ARGS); + size_t argc, envc; + + if (argc_long < 0) { + close(exec_memfd); + return kbox_dispatch_errno(argc_long == -E2BIG ? EINVAL : EFAULT); + } + if (envc_long < 0) { + close(exec_memfd); + return kbox_dispatch_errno(envc_long == -E2BIG ? EINVAL : EFAULT); + } + argc = (size_t) argc_long; + envc = (size_t) envc_long; + if (argc == 0) { + close(exec_memfd); + return kbox_dispatch_errno(EINVAL); + } + + /* Deep-copy argv and envp into a single mmap'd arena. Using mmap instead of + * malloc/strdup because we are inside the SIGSYS handler and glibc's + * allocator is not async-signal-safe. + * + * Two passes: first measure total size (via safe string length reads), then + * copy. 
All guest pointer reads use process_vm_readv. + */ + size_t arena_size = (argc + envc) * sizeof(char *); + for (size_t i = 0; i < argc; i++) { + uint64_t str_addr; + long slen; + if (read_user_ptr(argv_addr + i * sizeof(uint64_t), &str_addr) < 0) { + close(exec_memfd); + return kbox_dispatch_errno(EFAULT); + } + slen = strlen_user_safe(str_addr); + if (slen < 0) { + close(exec_memfd); + return kbox_dispatch_errno(EFAULT); + } + arena_size += (size_t) slen + 1; + } + for (size_t i = 0; i < envc; i++) { + uint64_t str_addr; + long slen; + if (read_user_ptr(envp_addr + i * sizeof(uint64_t), &str_addr) < 0) { + close(exec_memfd); + return kbox_dispatch_errno(EFAULT); + } + slen = strlen_user_safe(str_addr); + if (slen < 0) { + close(exec_memfd); + return kbox_dispatch_errno(EFAULT); + } + arena_size += (size_t) slen + 1; + } + arena_size = (arena_size + 4095) & ~(size_t) 4095; + + char *arena = mmap(NULL, arena_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (arena == MAP_FAILED) { + close(exec_memfd); + return kbox_dispatch_errno(ENOMEM); + } + size_t arena_used = 0; + char **argv_copy = (char **) (arena + arena_used); + arena_used += argc * sizeof(char *); + char **envp_copy = (char **) (arena + arena_used); + arena_used += envc * sizeof(char *); + for (size_t i = 0; i < argc; i++) { + uint64_t str_addr; + long slen; + if (read_user_ptr(argv_addr + i * sizeof(uint64_t), &str_addr) < 0) + goto fail_arena; + slen = copy_user_string(str_addr, arena + arena_used, + arena_size - arena_used); + if (slen < 0) + goto fail_arena; + argv_copy[i] = arena + arena_used; + arena_used += (size_t) slen + 1; + } + for (size_t i = 0; i < envc; i++) { + uint64_t str_addr; + long slen; + if (read_user_ptr(envp_addr + i * sizeof(uint64_t), &str_addr) < 0) + goto fail_arena; + slen = copy_user_string(str_addr, arena + arena_used, + arena_size - arena_used); + if (slen < 0) + goto fail_arena; + envp_copy[i] = arena + arena_used; + arena_used += (size_t) slen + 1; 
+ } + + /* Check for PT_INTERP (dynamic binary needing an interpreter). */ + if (kbox_read_elf_header_window_fd(exec_memfd, &elf_buf, &elf_buf_len) == + 0) { + uint64_t pt_offset, pt_filesz; + + ilen = kbox_find_elf_interp_loc(elf_buf, elf_buf_len, interp_path, + sizeof(interp_path), &pt_offset, + &pt_filesz); + munmap(elf_buf, elf_buf_len); + elf_buf = NULL; + + if (ilen < 0) { + ilen = -ENOEXEC; + goto fail_early; + } + + if (ilen > 0) { + long interp_lkl = kbox_lkl_openat(ctx->sysnrs, AT_FDCWD_LINUX, + interp_path, O_RDONLY, 0); + if (interp_lkl < 0) { + if (ctx->verbose) + fprintf(stderr, + "kbox: trap exec %s: cannot open " + "interpreter %s: %s\n", + pathname, interp_path, kbox_err_text(interp_lkl)); + ilen = (int) interp_lkl; + goto fail_early; + } + + interp_memfd = kbox_shadow_create(ctx->sysnrs, interp_lkl); + lkl_close_and_invalidate(ctx, interp_lkl); + + if (interp_memfd < 0) { + ilen = interp_memfd; + goto fail_early; + } + } + } + /* else: kbox_read_elf_header_window_fd failed, elf_buf is still NULL. + * Nothing to unmap. Treat as static binary (no interpreter). + */ + + /* Generate random bytes for AT_RANDOM auxv entry. Use the raw syscall to + * avoid depending on sys/random.h availability. + */ + memset(random_bytes, 0x42, sizeof(random_bytes)); +#ifdef __NR_getrandom + { + long gr = + syscall(__NR_getrandom, random_bytes, sizeof(random_bytes), 0); + (void) gr; + } +#endif + + /* Pick a stack address that does not collide with old guest stack (which we + * are currently running on from inside the SIGSYS handler). + * Alternate between two stack tops so the old one survives until the next + * re-exec reclaims it. + */ + uint64_t new_stack_top = + (reexec_current_stack_top == KBOX_EXEC_REEXEC_STACK_TOP) + ? KBOX_EXEC_STACK_TOP + : KBOX_EXEC_REEXEC_STACK_TOP; + + /* Build the loader launch spec. Use the same load biases as initial launch + * so the address space layout is consistent. 
+ */ + memset(&spec, 0, sizeof(spec)); + spec.exec_fd = exec_memfd; + spec.interp_fd = interp_memfd; + spec.argv = (const char *const *) argv_copy; + spec.argc = argc; + spec.envp = (const char *const *) envp_copy; + spec.envc = envc; + spec.execfn = pathname; + spec.random_bytes = random_bytes; + spec.page_size = (uint64_t) sysconf(_SC_PAGESIZE); + spec.stack_top = new_stack_top; + spec.main_load_bias = KBOX_EXEC_MAIN_LOAD_BIAS; + spec.interp_load_bias = KBOX_EXEC_INTERP_LOAD_BIAS; + spec.uid = ctx->root_identity ? 0 : (uint32_t) getuid(); + spec.euid = ctx->root_identity ? 0 : (uint32_t) getuid(); + spec.gid = ctx->root_identity ? 0 : (uint32_t) getgid(); + spec.egid = ctx->root_identity ? 0 : (uint32_t) getgid(); + spec.secure = 0; + + /* Tear down old guest code/data mappings BEFORE materializing new ones + * (MAP_FIXED_NOREPLACE requires the addresses to be free). But do NOT + * teardown before reading the memfds; the reads use pread which doesn't + * depend on the old mappings. + */ + teardown_old_guest_mappings(new_stack_top); + + { + int launch_rc = kbox_loader_prepare_launch(&spec, &launch); + if (launch_rc < 0) { + const char msg[] = "kbox: trap exec: loader prepare failed\n"; + (void) write(STDERR_FILENO, msg, sizeof(msg) - 1); + _exit(127); + } + } + + /* The memfds have been read into launch buffers; close them. */ + close(exec_memfd); + if (interp_memfd >= 0) + close(interp_memfd); + + /* Collect executable ranges from the new layout for the BPF filter. The new + * filter is appended to the filter chain; old filter is harmless (matches + * unmapped addresses). 
+ */ + if (kbox_loader_collect_exec_ranges( + &launch, exec_ranges, KBOX_LOADER_MAX_MAPPINGS, &exec_count) < 0) { + if (ctx->verbose) + fprintf(stderr, "kbox: trap exec %s: cannot collect exec ranges\n", + pathname); + kbox_loader_launch_reset(&launch); + _exit(127); + } + for (size_t i = 0; i < exec_count; i++) { + ranges[i].start = (uintptr_t) exec_ranges[i].start; + ranges[i].end = (uintptr_t) exec_ranges[i].end; + } + range_count = exec_count; + + /* Install a new BPF RET_TRAP filter covering the new binary's executable + * ranges. seccomp filters form a chain; calling seccomp(SET_MODE_FILTER) + * adds to it rather than replacing. + */ + if (kbox_install_seccomp_trap_ranges(ctx->host_nrs, ranges, range_count) < + 0) { + if (ctx->verbose) + fprintf(stderr, + "kbox: trap exec %s: cannot install new BPF filter\n", + pathname); + kbox_loader_launch_reset(&launch); + _exit(127); + } + + /* Clean up CLOEXEC entries from the FD table, matching what a + * real exec would do. + */ + kbox_fd_table_close_cloexec(ctx->fd_table, ctx->sysnrs); + + /* If the original launch used rewrite mode, re-apply binary rewriting to + * the new binary. This patches syscall instructions in the newly loaded + * executable segments and sets up trampoline regions, promoting the new + * binary from Tier 1 (SIGSYS ~3us) to Tier 2 (~41ns) for rewritten sites. + * + * If rewrite installation fails (e.g., trampoline allocation), the binary + * still works correctly via the SIGSYS handler (Tier 1). + */ + if (req->source == KBOX_SYSCALL_SOURCE_REWRITE) { + /* Static: runtime is stored globally via store_active_rewrite_runtime + * and must survive past the noreturn transfer_to_guest. Single-threaded + * trap mode guarantees no concurrent re-exec. 
+ */ + static struct kbox_rewrite_runtime rewrite_rt; + kbox_rewrite_runtime_reset(&rewrite_rt); + if (kbox_rewrite_runtime_install(&rewrite_rt, ctx, &launch) == 0) { + if (ctx->verbose) + fprintf(stderr, + "kbox: trap exec %s: rewrite installed " + "(%zu trampoline regions)\n", + pathname, rewrite_rt.trampoline_region_count); + } else { + if (ctx->verbose) + fprintf(stderr, + "kbox: trap exec %s: rewrite failed, " + "falling back to SIGSYS\n", + pathname); + } + } + +#if defined(__x86_64__) + /* Reset the guest FS base to the host (kbox) FS base. We are inside the + * SIGSYS handler where FS already points to kbox's TLS. The new binary + * starts with no TLS set up; it will call arch_prctl(ARCH_SET_FS) during + * libc init to establish its own. Until then, SIGSYS handler entry should + * see FS == host FS and the save/restore becomes a no-op, which is correct. + */ + { + uint64_t host_fs = 0; + + kbox_syscall_trap_host_arch_prctl_get_fs(&host_fs); + kbox_syscall_trap_set_guest_fs(host_fs); + } +#endif + + if (ctx->verbose) + fprintf(stderr, + "kbox: trap exec %s: transferring to new image " + "pc=0x%llx sp=0x%llx\n", + pathname, (unsigned long long) launch.transfer.pc, + (unsigned long long) launch.transfer.sp); + + /* Record which stack the new guest is using. The next re-exec will pick the + * other address and reclaim this one. + */ + reexec_current_stack_top = new_stack_top; + + /* Free staging buffers before transferring. The image regions (mmap'd guest + * code/data/stack) must survive. + */ + munmap(arena, arena_size); + if (launch.main_elf && launch.main_elf_len > 0) + munmap(launch.main_elf, launch.main_elf_len); + launch.main_elf = NULL; + if (launch.interp_elf && launch.interp_elf_len > 0) + munmap(launch.interp_elf, launch.interp_elf_len); + launch.interp_elf = NULL; + kbox_loader_stack_image_reset(&launch.layout.stack); + + /* Unblock SIGSYS before transferring. We are inside the SIGSYS handler, + * which runs with SIGSYS blocked (SA_SIGINFO default). 
+ * Since we jump to the new entry point instead of returning from the + * handler, the kernel never restores the pre-handler signal mask. The new + * binary needs SIGSYS unblocked so the BPF RET_TRAP filter can deliver it. + */ + { + uint64_t mask[2] = {0, 0}; + unsigned int signo = SIGSYS - 1; + mask[signo / 64] = 1ULL << (signo % 64); + kbox_syscall_trap_host_rt_sigprocmask_unblock(mask, + 8 /* kernel sigset_t */); + } + + /* Transfer control to the new binary. This is noreturn. */ + kbox_loader_transfer_to_guest(&launch.transfer); + +fail_arena: + munmap(arena, arena_size); + close(exec_memfd); + return kbox_dispatch_errno(EFAULT); + +fail_early: + munmap(arena, arena_size); + close(exec_memfd); + if (interp_memfd >= 0) + close(interp_memfd); + return kbox_dispatch_errno((int) (-ilen)); +} + +/* Handle execve/execveat from inside the image. + * + * For fexecve (execveat with AT_EMPTY_PATH on a host memfd): CONTINUE, the host + * kernel handles it directly. This is the initial exec path from image.c. + * + * For in-image exec (e.g. shell runs /bin/ls): + * 1. Read the pathname from tracee memory + * 2. Open the binary from LKL, create a memfd + * 3. Check for PT_INTERP; if dynamic, extract interpreter into a second memfd + * and patch PT_INTERP to /proc/self/fd/N + * 4. Inject memfds into the tracee via ADDFD + * 5. Overwrite the pathname in tracee memory with /proc/self/fd/N + * 6. CONTINUE: kernel re-reads the rewritten path and execs + * + * The seccomp-unotify guarantees the tracee is blocked during steps 1-5, and + * the kernel has not yet copied the pathname (getname happens after the seccomp + * check), so the overwrite is race-free. + */ +struct kbox_dispatch forward_execve(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + int is_execveat) +{ + pid_t pid = kbox_syscall_request_pid(req); + + /* Detect fexecve: execveat(fd, "", argv, envp, AT_EMPTY_PATH). This is the + * initial exec from image.c on the host memfd. 
Let the kernel handle it + * directly. + */ + if (is_execveat) { + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 4)); + if (flags & KBOX_AT_EMPTY_PATH) + return kbox_dispatch_continue(); + } + + /* Read pathname from tracee memory. */ + uint64_t path_addr = is_execveat ? kbox_syscall_request_arg(req, 1) + : kbox_syscall_request_arg(req, 0); + char pathbuf[KBOX_MAX_PATH]; + int rc = + guest_mem_read_string(ctx, pid, path_addr, pathbuf, sizeof(pathbuf)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + /* Translate path for LKL. */ + char translated[KBOX_MAX_PATH]; + rc = kbox_translate_path_for_lkl(pid, pathbuf, ctx->host_root, translated, + sizeof(translated)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + /* Virtual paths (/proc, /sys, /dev): let the host handle them. */ + if (kbox_is_lkl_virtual_path(translated)) + return kbox_dispatch_continue(); + + /* Open the binary from LKL. */ + long lkl_fd = + kbox_lkl_openat(ctx->sysnrs, AT_FDCWD_LINUX, translated, O_RDONLY, 0); + if (lkl_fd < 0) + return kbox_dispatch_errno((int) (-lkl_fd)); + + /* Create a memfd with the binary contents. */ + int exec_memfd = kbox_shadow_create(ctx->sysnrs, lkl_fd); + lkl_close_and_invalidate(ctx, lkl_fd); + + if (exec_memfd < 0) + return kbox_dispatch_errno(-exec_memfd); + + /* Trap mode: the SIGSYS handler and BPF filter do not survive a real exec, + * so perform a userspace exec instead. This replaces the process image + * in-place (unmap old, map new, jump to entry) without invoking kernel's + * execve. On success the function does not return. + */ + if (request_uses_trap_signals(req)) + return trap_userspace_exec(req, ctx, exec_memfd, pathbuf, is_execveat); + + /* Check for PT_INTERP (dynamic binary). 
*/ + { + unsigned char *elf_buf = NULL; + size_t elf_buf_len = 0; + + if (kbox_read_elf_header_window_fd(exec_memfd, &elf_buf, + &elf_buf_len) == 0) { + char interp_path[256]; + uint64_t pt_offset, pt_filesz; + int ilen = kbox_find_elf_interp_loc( + elf_buf, elf_buf_len, interp_path, sizeof(interp_path), + &pt_offset, &pt_filesz); + + munmap(elf_buf, elf_buf_len); + + if (ilen < 0) { + close(exec_memfd); + return kbox_dispatch_errno(ENOEXEC); + } + + if (ilen > 0) { + /* Dynamic binary. Extract the interpreter from LKL and inject + * it into the tracee. + */ + long interp_lkl = kbox_lkl_openat(ctx->sysnrs, AT_FDCWD_LINUX, + interp_path, O_RDONLY, 0); + if (interp_lkl < 0) { + if (ctx->verbose) + fprintf(stderr, + "kbox: exec %s: cannot open " + "interpreter %s: %s\n", + pathbuf, interp_path, + kbox_err_text(interp_lkl)); + close(exec_memfd); + return kbox_dispatch_errno((int) (-interp_lkl)); + } + + int interp_memfd = kbox_shadow_create(ctx->sysnrs, interp_lkl); + lkl_close_and_invalidate(ctx, interp_lkl); + + if (interp_memfd < 0) { + close(exec_memfd); + return kbox_dispatch_errno(-interp_memfd); + } + + /* Inject the interpreter memfd first so we know its FD number + * in the tracee for the PT_INTERP patch. O_CLOEXEC is safe: the + * kernel resolves /proc/self/fd/N via open_exec() before + * begin_new_exec() closes CLOEXEC descriptors. + */ + int tracee_interp_fd = + request_addfd(ctx, req, interp_memfd, O_CLOEXEC); + close(interp_memfd); + + if (tracee_interp_fd < 0) { + close(exec_memfd); + return kbox_dispatch_errno(-tracee_interp_fd); + } + + /* Patch PT_INTERP in the exec memfd to point at the injected + * interpreter: /proc/self/fd/. 
+ */ + char new_interp[64]; + int new_len = snprintf(new_interp, sizeof(new_interp), + "/proc/self/fd/%d", tracee_interp_fd); + + if ((uint64_t) (new_len + 1) > pt_filesz) { + close(exec_memfd); + return kbox_dispatch_errno(ENOMEM); + } + + char patch[256]; + size_t patch_len = (size_t) pt_filesz; + if (patch_len > sizeof(patch)) + patch_len = sizeof(patch); + memset(patch, 0, patch_len); + memcpy(patch, new_interp, (size_t) new_len); + + if (pwrite(exec_memfd, patch, patch_len, (off_t) pt_offset) != + (ssize_t) patch_len) { + close(exec_memfd); + return kbox_dispatch_errno(EIO); + } + + if (ctx->verbose) + fprintf(stderr, + "kbox: exec %s: interpreter %s " + "-> /proc/self/fd/%d\n", + pathbuf, interp_path, tracee_interp_fd); + } + } else { + munmap(elf_buf, elf_buf_len); + } + } + + /* Inject the exec memfd into the tracee. O_CLOEXEC keeps the tracee's FD + * table clean after exec succeeds. + */ + int tracee_exec_fd = request_addfd(ctx, req, exec_memfd, O_CLOEXEC); + close(exec_memfd); + + if (tracee_exec_fd < 0) + return kbox_dispatch_errno(-tracee_exec_fd); + + /* Overwrite the pathname in the tracee's memory with /proc/self/fd/. + * The kernel has not yet copied the pathname (getname happens after the + * seccomp check), so when we CONTINUE, it reads our rewritten path. + * + * argv[0] aliasing: some shells pass the same pointer for pathname and + * argv[0]. If we overwrite the pathname, we corrupt argv[0]. Detect this + * and fix it by writing the original path right after the new path in the + * same buffer, then updating the argv[0] pointer in the argv array. + * + * Try process_vm_writev first (fast path). If that fails (e.g. pathname + * is in .rodata), fall back to /proc/pid/mem which can write through page + * protections. + */ + char new_path[64]; + int new_path_len = snprintf(new_path, sizeof(new_path), "/proc/self/fd/%d", + tracee_exec_fd); + + /* Check if argv[0] is aliased with the pathname. 
argv pointer is args[1] + * for execve, args[2] for execveat. + */ + uint64_t argv_addr = is_execveat ? kbox_syscall_request_arg(req, 2) + : kbox_syscall_request_arg(req, 1); + uint64_t argv0_ptr = 0; + int argv0_aliased = 0; + + if (argv_addr != 0) { + rc = guest_mem_read(ctx, pid, argv_addr, &argv0_ptr, sizeof(argv0_ptr)); + if (rc == 0 && argv0_ptr == path_addr) + argv0_aliased = 1; + } + + /* Build the write buffer: new_path + NUL + original_path + NUL. Original + * path goes right after the new path so we can point argv[0] at it. + */ + size_t orig_len = strlen(pathbuf); + size_t total_write = (size_t) (new_path_len + 1); + + if (argv0_aliased) + total_write += orig_len + 1; + + char write_buf[KBOX_MAX_PATH + 64]; + if (total_write > sizeof(write_buf)) + return kbox_dispatch_errno(ENAMETOOLONG); + + memcpy(write_buf, new_path, (size_t) (new_path_len + 1)); + if (argv0_aliased) + memcpy(write_buf + new_path_len + 1, pathbuf, orig_len + 1); + + rc = guest_mem_write(ctx, pid, path_addr, write_buf, total_write); + if (rc < 0) { + rc = guest_mem_write_force(ctx, pid, path_addr, write_buf, total_write); + if (rc < 0) { + if (ctx->verbose) + fprintf(stderr, + "kbox: exec %s: cannot rewrite " + "pathname: %s\n", + pathbuf, strerror(-rc)); + return kbox_dispatch_errno(ENOEXEC); + } + } + + /* If argv[0] was aliased, update the argv[0] pointer to point at original + * path copy (right after the new path). + */ + if (argv0_aliased) { + uint64_t new_argv0 = path_addr + (uint64_t) (new_path_len + 1); + rc = + guest_mem_write(ctx, pid, argv_addr, &new_argv0, sizeof(new_argv0)); + if (rc < 0) + guest_mem_write_force(ctx, pid, argv_addr, &new_argv0, + sizeof(new_argv0)); + } + + if (ctx->verbose) + fprintf(stderr, "kbox: exec %s -> /proc/self/fd/%d\n", pathbuf, + tracee_exec_fd); + + /* Clean up CLOEXEC entries from the FD table, matching what a successful + * exec will do in the kernel. 
+ * + * This is still conservative: if exec later fails, tracee resumes after we + * have already purged those mappings. That rollback problem is preferable + * to keeping stale mappings alive across a successful exec, which misroutes + * future FD operations in the new image. + */ + kbox_fd_table_close_cloexec(ctx->fd_table, ctx->sysnrs); + + /* Invalidate the cached /proc/pid/mem FD. After exec, the kernel may revoke + * access to the old FD even though the PID is the same (credential check + * against the new binary). Forcing a reopen on the next write ensures we + * have valid access. + */ + if (ctx->proc_mem_fd >= 0) { + close(ctx->proc_mem_fd); + ctx->proc_mem_fd = -1; + } + + return kbox_dispatch_continue(); +} diff --git a/src/dispatch-id.c b/src/dispatch-id.c new file mode 100644 index 0000000..3f6a7ee --- /dev/null +++ b/src/dispatch-id.c @@ -0,0 +1,312 @@ +/* SPDX-License-Identifier: MIT */ + +/* Identity syscall handlers for the seccomp dispatch engine. + * + * UID, GID, supplementary groups, and umask operations. All LKL identity + * calls go through kbox_lkl_* wrappers in lkl-wrap.h. + */ + +#include +#include +#include + +#include "dispatch-internal.h" + +/* Shared getresuid/getresgid implementation. The three getters retrieve the + * real, effective, and saved IDs respectively. LKL has no separate saved ID, + * so callers pass the effective getter for the saved slot. 
+ */ + +typedef long (*lkl_id_getter)(const struct kbox_sysnrs *); + +static struct kbox_dispatch forward_getresid( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + lkl_id_getter get_real, + lkl_id_getter get_effective, + lkl_id_getter get_saved) +{ + pid_t pid = kbox_syscall_request_pid(req); + lkl_id_getter getters[3] = {get_real, get_effective, get_saved}; + int i; + + for (i = 0; i < 3; i++) { + uint64_t ptr = kbox_syscall_request_arg(req, i); + if (ptr == 0) + continue; + long r = getters[i](ctx->sysnrs); + if (r < 0) + return kbox_dispatch_errno((int) (-r)); + unsigned val = (unsigned) r; + int wrc = guest_mem_write(ctx, pid, ptr, &val, sizeof(val)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + } + return kbox_dispatch_value(0); +} + +struct kbox_dispatch forward_getresuid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + return forward_getresid(req, ctx, kbox_lkl_getuid, kbox_lkl_geteuid, + kbox_lkl_geteuid); +} + +struct kbox_dispatch forward_getresgid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + return forward_getresid(req, ctx, kbox_lkl_getgid, kbox_lkl_getegid, + kbox_lkl_getegid); +} + +/* Shared override: write the same value to all three output pointers. 
*/ + +static struct kbox_dispatch forward_getresid_override( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + unsigned id_val) +{ + pid_t pid = kbox_syscall_request_pid(req); + int i; + + for (i = 0; i < 3; i++) { + uint64_t ptr = kbox_syscall_request_arg(req, i); + if (ptr != 0) { + int wrc = guest_mem_write(ctx, pid, ptr, &id_val, sizeof(id_val)); + if (wrc < 0) + return kbox_dispatch_errno(EIO); + } + } + return kbox_dispatch_value(0); +} + +struct kbox_dispatch forward_getresuid_override( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + uid_t uid) +{ + return forward_getresid_override(req, ctx, (unsigned) uid); +} + +struct kbox_dispatch forward_getresgid_override( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + gid_t gid) +{ + return forward_getresid_override(req, ctx, (unsigned) gid); +} + +struct kbox_dispatch forward_getgroups(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long size = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t list = kbox_syscall_request_arg(req, 1); + + if (size < 0) + return kbox_dispatch_errno(EINVAL); + + /* Probe to get actual group count. */ + long count = kbox_lkl_getgroups(ctx->sysnrs, 0, NULL); + if (count < 0) + return kbox_dispatch_errno((int) (-count)); + + if (size == 0) + return kbox_dispatch_value((int64_t) count); + + /* Caller's buffer must be large enough. 
*/ + if (size < count) + return kbox_dispatch_errno(EINVAL); + + size_t byte_len = (size_t) count * sizeof(unsigned); + if (byte_len > KBOX_IO_CHUNK_LEN) + return kbox_dispatch_errno(ENOMEM); + unsigned *buf = (unsigned *) dispatch_scratch; + + long ret = kbox_lkl_getgroups(ctx->sysnrs, count, buf); + if (ret < 0) + return kbox_dispatch_errno((int) (-ret)); + + if (list != 0 && ret > 0) { + size_t write_len = (size_t) ret * sizeof(unsigned); + pid_t pid = kbox_syscall_request_pid(req); + int wrc = guest_mem_write(ctx, pid, list, buf, write_len); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + } + + return kbox_dispatch_value((int64_t) ret); +} + +struct kbox_dispatch forward_getgroups_override( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + gid_t gid) +{ + long size = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + if (size < 0) + return kbox_dispatch_errno(EINVAL); + if (size == 0) + return kbox_dispatch_value(1); + + uint64_t list = kbox_syscall_request_arg(req, 1); + if (list == 0) + return kbox_dispatch_errno(EFAULT); + + pid_t pid = kbox_syscall_request_pid(req); + unsigned val = (unsigned) gid; + int wrc = guest_mem_write(ctx, pid, list, &val, sizeof(val)); + if (wrc < 0) + return kbox_dispatch_errno(EIO); + + return kbox_dispatch_value(1); +} + +/* Identity set forwarders. 
*/ + +struct kbox_dispatch forward_setuid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long uid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + return kbox_dispatch_from_lkl(kbox_lkl_setuid(ctx->sysnrs, uid)); +} + +struct kbox_dispatch forward_setreuid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long ruid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long euid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + return kbox_dispatch_from_lkl(kbox_lkl_setreuid(ctx->sysnrs, ruid, euid)); +} + +struct kbox_dispatch forward_setresuid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long ruid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long euid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long suid = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + return kbox_dispatch_from_lkl( + kbox_lkl_setresuid(ctx->sysnrs, ruid, euid, suid)); +} + +struct kbox_dispatch forward_setgid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long gid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + return kbox_dispatch_from_lkl(kbox_lkl_setgid(ctx->sysnrs, gid)); +} + +struct kbox_dispatch forward_setregid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long rgid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long egid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + return kbox_dispatch_from_lkl(kbox_lkl_setregid(ctx->sysnrs, rgid, egid)); +} + +struct kbox_dispatch forward_setresgid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long rgid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long egid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long sgid = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + return kbox_dispatch_from_lkl( + kbox_lkl_setresgid(ctx->sysnrs, rgid, egid, sgid)); +} + +struct kbox_dispatch forward_setgroups(const 
struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long size = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t list = kbox_syscall_request_arg(req, 1); + + if (size < 0 || size > 65536) + return kbox_dispatch_errno(EINVAL); + + if (size == 0) + return kbox_dispatch_from_lkl(kbox_lkl_setgroups(ctx->sysnrs, 0, NULL)); + + size_t byte_len = (size_t) size * sizeof(unsigned); + if (byte_len > KBOX_IO_CHUNK_LEN) + return kbox_dispatch_errno(ENOMEM); + unsigned *buf = (unsigned *) dispatch_scratch; + + pid_t pid = kbox_syscall_request_pid(req); + int rrc = guest_mem_read(ctx, pid, list, buf, byte_len); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + + long ret = kbox_lkl_setgroups(ctx->sysnrs, size, buf); + return kbox_dispatch_from_lkl(ret); +} + +struct kbox_dispatch forward_setfsgid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long gid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + return kbox_dispatch_from_lkl(kbox_lkl_setfsgid(ctx->sysnrs, gid)); +} + +/* Identity dispatch helpers. + * + * In host+root_identity mode, get* returns 0 and set* returns 0. + * In host+override mode, get* returns the override value. + * In host+neither mode, CONTINUE to host kernel. + * In image mode, forward to LKL. + */ + +/* Shared get-ID dispatcher. has_override + override_val avoid a sentinel + * comparison that breaks when uid_t/gid_t is narrower than unsigned long. 
+ */ +static struct kbox_dispatch dispatch_get_id( + long (*lkl_func)(const struct kbox_sysnrs *), + struct kbox_supervisor_ctx *ctx, + int has_override, + unsigned override_val) +{ + if (ctx->host_root) { + if (ctx->root_identity) + return kbox_dispatch_value(0); + if (has_override) + return kbox_dispatch_value((int64_t) override_val); + return kbox_dispatch_continue(); + } + return kbox_dispatch_from_lkl(lkl_func(ctx->sysnrs)); +} + +struct kbox_dispatch dispatch_get_uid( + long (*lkl_func)(const struct kbox_sysnrs *), + struct kbox_supervisor_ctx *ctx) +{ + int has = ctx->override_uid != (uid_t) -1; + return dispatch_get_id(lkl_func, ctx, has, (unsigned) ctx->override_uid); +} + +struct kbox_dispatch dispatch_get_gid( + long (*lkl_func)(const struct kbox_sysnrs *), + struct kbox_supervisor_ctx *ctx) +{ + int has = ctx->override_gid != (gid_t) -1; + return dispatch_get_id(lkl_func, ctx, has, (unsigned) ctx->override_gid); +} + +struct kbox_dispatch dispatch_set_id( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + struct kbox_dispatch (*lkl_forward)(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx)) +{ + if (ctx->host_root) { + if (ctx->root_identity) + return kbox_dispatch_value(0); + return kbox_dispatch_continue(); + } + return lkl_forward(req, ctx); +} + +struct kbox_dispatch forward_umask(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long mask = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long ret = kbox_lkl_umask(ctx->sysnrs, mask); + return kbox_dispatch_from_lkl(ret); +} diff --git a/src/dispatch-internal.h b/src/dispatch-internal.h new file mode 100644 index 0000000..b02d562 --- /dev/null +++ b/src/dispatch-internal.h @@ -0,0 +1,468 @@ +/* SPDX-License-Identifier: MIT */ + +#ifndef KBOX_DISPATCH_INTERNAL_H +#define KBOX_DISPATCH_INTERNAL_H + +/* Internal header for the seccomp dispatch subsystem. 
Exposes shared helpers
 * from seccomp-dispatch.c so that handler functions extracted into separate
 * translation units (dispatch-net.c, dispatch-id.c, dispatch-exec.c,
 * dispatch-misc.c) can call them.
 */

#include "fd-table.h"
#include "lkl-wrap.h"
#include "seccomp.h"

/* Sentinel values for host_fd and lkl_fd fields in kbox_fd_entry.
 *
 * KBOX_FD_HOST_SAME_FD_SHADOW: host_fd is a same-fd shadow (memfd injected
 * at the tracee's FD number via SECCOMP_IOCTL_NOTIF_ADDFD).
 * KBOX_FD_LOCAL_ONLY_SHADOW: host_fd is a local-only shadow (supervisor
 * holds the memfd; trap/rewrite mode, no tracee injection).
 * KBOX_LKL_FD_SHADOW_ONLY: lkl_fd placeholder for entries that exist only
 * as host shadows with no LKL backing FD.
 */
#define KBOX_FD_HOST_SAME_FD_SHADOW (-2)
#define KBOX_FD_LOCAL_ONLY_SHADOW (-3)
#define KBOX_LKL_FD_SHADOW_ONLY (-2)

/* Shared scratch buffer for I/O dispatch. The dispatcher is single-threaded and
 * non-reentrant: only one syscall is dispatched at a time.
 */
extern uint8_t dispatch_scratch[KBOX_IO_CHUNK_LEN];

/* Argument extraction helpers. */

/* Reinterpret a raw 64-bit seccomp argument as a signed 64-bit value (the
 * C ABI's long on LP64 targets); no truncation is performed here.
 */
static inline int64_t to_c_long_arg(uint64_t v)
{
    return (int64_t) v;
}

/* Sign-extend a 32-bit dirfd argument. seccomp_data.args[] zero-extends 32-bit
 * values, so AT_FDCWD (-100) arrives as 0xFFFFFF9C. This helper restores the
 * sign.
 */
static inline long to_dirfd_arg(uint64_t v)
{
    return (long) (int) (uint32_t) v;
}

/* Return nonzero if req was delivered via SIGSYS trap or syscall rewrite rather
 * than the seccomp user-notification path.
 */
static inline int request_uses_trap_signals(
    const struct kbox_syscall_request *req)
{
    return req && (req->source == KBOX_SYSCALL_SOURCE_TRAP ||
                   req->source == KBOX_SYSCALL_SOURCE_REWRITE);
}

/* Look up the FD table entry for a virtual or low-range FD. Returns NULL if fd
 * is out of range or t is NULL. 
+ */ +static inline struct kbox_fd_entry *fd_table_entry(struct kbox_fd_table *t, + long fd) +{ + if (!t) + return NULL; + if (fd >= KBOX_FD_BASE && fd < KBOX_FD_BASE + KBOX_FD_TABLE_MAX) + return &t->entries[fd - KBOX_FD_BASE]; + if (fd >= 0 && fd < KBOX_LOW_FD_MAX) + return &t->low_fds[fd]; + return NULL; +} + +/* Evict an LKL FD from the stat cache so stale fstat results are never + * returned after the FD is closed and its number reused. + */ +static inline void invalidate_stat_cache_fd(struct kbox_supervisor_ctx *ctx, + long lkl_fd) +{ +#if KBOX_STAT_CACHE_ENABLED + for (int i = 0; i < KBOX_STAT_CACHE_MAX; i++) + if (ctx->stat_cache[i].lkl_fd == lkl_fd) + ctx->stat_cache[i].lkl_fd = -1; +#else + (void) ctx; + (void) lkl_fd; +#endif +} + +/* Close an LKL FD and evict it from the stat cache. Every LKL close in the + * dispatch code should go through this wrapper to prevent stale fstat results + * when the LKL FD number is reused. + */ +static inline long lkl_close_and_invalidate(struct kbox_supervisor_ctx *ctx, + long lkl_fd) +{ + invalidate_stat_cache_fd(ctx, lkl_fd); + return kbox_lkl_close(ctx->sysnrs, lkl_fd); +} + +/* Open-flag ABI translation (aarch64 host <-> asm-generic LKL). + * + * aarch64 and asm-generic define four O_* flags differently: + * + * Flag aarch64 asm-generic (LKL) + * O_DIRECTORY 0x04000 0x10000 + * O_NOFOLLOW 0x08000 0x20000 + * O_DIRECT 0x10000 0x04000 + * O_LARGEFILE 0x20000 0x08000 + * + * x86_64 values already match asm-generic so no translation is needed. 
+ */ +#if defined(__aarch64__) + +#define HOST_O_DIRECTORY 0x04000 +#define HOST_O_NOFOLLOW 0x08000 +#define HOST_O_DIRECT 0x10000 +#define HOST_O_LARGEFILE 0x20000 + +#define LKL_O_DIRECTORY 0x10000 +#define LKL_O_NOFOLLOW 0x20000 +#define LKL_O_DIRECT 0x04000 +#define LKL_O_LARGEFILE 0x08000 + +static inline long host_to_lkl_open_flags(long flags) +{ + long out = flags & ~(HOST_O_DIRECTORY | HOST_O_NOFOLLOW | HOST_O_DIRECT | + HOST_O_LARGEFILE); + if (flags & HOST_O_DIRECTORY) + out |= LKL_O_DIRECTORY; + if (flags & HOST_O_NOFOLLOW) + out |= LKL_O_NOFOLLOW; + if (flags & HOST_O_DIRECT) + out |= LKL_O_DIRECT; + if (flags & HOST_O_LARGEFILE) + out |= LKL_O_LARGEFILE; + return out; +} + +static inline long lkl_to_host_open_flags(long flags) +{ + long out = flags & ~(LKL_O_DIRECTORY | LKL_O_NOFOLLOW | LKL_O_DIRECT | + LKL_O_LARGEFILE); + if (flags & LKL_O_DIRECTORY) + out |= HOST_O_DIRECTORY; + if (flags & LKL_O_NOFOLLOW) + out |= HOST_O_NOFOLLOW; + if (flags & LKL_O_DIRECT) + out |= HOST_O_DIRECT; + if (flags & LKL_O_LARGEFILE) + out |= HOST_O_LARGEFILE; + return out; +} + +#else /* x86_64: flags already match asm-generic */ + +static inline long host_to_lkl_open_flags(long flags) +{ + return flags; +} + +static inline long lkl_to_host_open_flags(long flags) +{ + return flags; +} + +#endif + +/* Guest memory access. 
*/ + +int guest_mem_read(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + void *out, + size_t len); +int guest_mem_write(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + const void *in, + size_t len); +int guest_mem_write_force(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + const void *in, + size_t len); +int guest_mem_read_string(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + char *buf, + size_t max_len); +int guest_mem_read_open_how(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + uint64_t size, + struct kbox_open_how *out); +int guest_mem_write_small_metadata(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + const void *in, + size_t len); +int read_guest_string(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t addr, + char *buf, + size_t size); + +/* FD injection into the tracee. */ + +int request_addfd(const struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + int srcfd, + uint32_t newfd_flags); +int request_addfd_at(const struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + int srcfd, + int target_fd, + uint32_t newfd_flags); + +/* Path resolution and translation. 
*/ + +long resolve_open_dirfd(const char *path, + long dirfd, + const struct kbox_fd_table *table); +int translate_guest_path(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t addr, + const char *host_root, + char *translated, + size_t size); +int translate_request_path(const struct kbox_syscall_request *req, + const struct kbox_supervisor_ctx *ctx, + size_t path_idx, + const char *host_root, + char *translated, + size_t size); +int translate_request_at_path(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + size_t dirfd_idx, + size_t path_idx, + char *translated, + size_t size, + long *lkl_dirfd); +int should_continue_for_dirfd(long lkl_dirfd); +int guest_addr_is_writable(pid_t pid, uint64_t addr); + +/* FD utilities. */ + +int child_fd_is_open(const struct kbox_supervisor_ctx *ctx, long fd); +long allocate_passthrough_hostonly_fd(struct kbox_supervisor_ctx *ctx); +long next_hostonly_fd_hint(const struct kbox_supervisor_ctx *ctx); +int ensure_proc_self_fd_dir(struct kbox_supervisor_ctx *ctx); +int ensure_proc_mem_fd(struct kbox_supervisor_ctx *ctx); + +/* Stat ABI conversion. */ + +void kbox_lkl_stat_to_host(const struct kbox_lkl_stat *src, struct stat *dst); +void normalize_host_stat_if_needed(struct kbox_supervisor_ctx *ctx, + const char *path, + struct stat *host_stat); +void normalize_statx_if_needed(struct kbox_supervisor_ctx *ctx, + const char *path, + uint8_t *statx_buf); + +/* Shadow FD management. 
*/ + +int ensure_same_fd_shadow(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long fd, + long lkl_fd); +int try_cached_shadow_open_dispatch(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long flags, + const char *translated, + struct kbox_dispatch *out); +int try_cached_shadow_stat_dispatch(struct kbox_supervisor_ctx *ctx, + const char *translated, + uint64_t remote_stat, + pid_t pid); +int ensure_path_shadow_cache(struct kbox_supervisor_ctx *ctx, + const char *translated); +void invalidate_path_shadow_cache(struct kbox_supervisor_ctx *ctx); +void invalidate_translated_path_cache(struct kbox_supervisor_ctx *ctx); +void note_shadow_writeback_open(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry); +void note_shadow_writeback_close(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry); +int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long lkl_fd, + long flags, + const char *translated, + struct kbox_dispatch *out); +int sync_shadow_writeback(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry); + +/* Handler functions: dispatch-net.c (networking syscalls). 
*/ + +struct kbox_dispatch forward_socket(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_bind(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_connect(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_sendto(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_recvfrom(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_recvmsg(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_getsockopt(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_setsockopt(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_getsockname(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_getpeername(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_shutdown(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); + +/* Handler functions: dispatch-id.c (identity syscalls). 
*/ + +struct kbox_dispatch dispatch_get_uid( + long (*lkl_func)(const struct kbox_sysnrs *), + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch dispatch_get_gid( + long (*lkl_func)(const struct kbox_sysnrs *), + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch dispatch_set_id( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + struct kbox_dispatch (*lkl_forward)(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx)); +struct kbox_dispatch forward_getresuid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_getresuid_override( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + uid_t override_val); +struct kbox_dispatch forward_getresgid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_getresgid_override( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + gid_t override_val); +struct kbox_dispatch forward_getgroups(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_getgroups_override( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + gid_t override_val); +struct kbox_dispatch forward_setuid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_setreuid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_setresuid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_setgid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_setregid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_setresgid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch 
forward_setgroups(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_setfsgid(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_umask(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); + +/* Handler functions: dispatch-exec.c (exec, mprotect, clone3). */ + +struct kbox_dispatch forward_execve(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + int is_execveat); +struct kbox_dispatch forward_mprotect(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_clone3(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); + +/* Handler functions: dispatch-misc.c (time, info, device, memory). */ + +struct kbox_dispatch forward_clock_gettime( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_clock_getres( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_gettimeofday( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_uname(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_getrandom(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_syslog(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_prctl(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_pipe2(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_ioctl(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_pwrite64(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct 
kbox_dispatch forward_writev(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_readv(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_ftruncate(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_fallocate(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_flock(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_fsync(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_fdatasync(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_sync(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_symlinkat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_linkat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_utimensat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_readlinkat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); +struct kbox_dispatch forward_mmap(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx); + +/* Clone namespace flags (portable fallbacks for older headers). 
 */

#ifndef CLONE_NEWNS
#define CLONE_NEWNS 0x00020000ULL
#endif
#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME 0x00000080ULL
#endif
#ifndef CLONE_NEWCGROUP
#define CLONE_NEWCGROUP 0x02000000ULL
#endif
#ifndef CLONE_NEWUTS
#define CLONE_NEWUTS 0x04000000ULL
#endif
#ifndef CLONE_NEWIPC
#define CLONE_NEWIPC 0x08000000ULL
#endif
#ifndef CLONE_NEWUSER
#define CLONE_NEWUSER 0x10000000ULL
#endif
#ifndef CLONE_NEWPID
#define CLONE_NEWPID 0x20000000ULL
#endif
#ifndef CLONE_NEWNET
#define CLONE_NEWNET 0x40000000ULL
#endif
/* NOTE(review): CLONE_THREAD is defined here but deliberately excluded from
 * CLONE_NEW_MASK below — presumably it is tested separately by the clone3
 * handler; confirm against dispatch-exec.c.
 */
#ifndef CLONE_THREAD
#define CLONE_THREAD 0x00010000ULL
#endif

/* Union of all namespace-creation flags; ULL-typed so the mask can be
 * applied directly to clone3's 64-bit flags field.
 */
#define CLONE_NEW_MASK \
    (CLONE_NEWNS | CLONE_NEWTIME | CLONE_NEWCGROUP | CLONE_NEWUTS | \
     CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)

#endif /* KBOX_DISPATCH_INTERNAL_H */
diff --git a/src/dispatch-misc.c b/src/dispatch-misc.c
new file mode 100644
index 0000000..07a54af
--- /dev/null
+++ b/src/dispatch-misc.c
@@ -0,0 +1,916 @@
/* SPDX-License-Identifier: MIT */

/* Miscellaneous syscall handlers for the seccomp dispatch engine.
 *
 * Time queries, pipe creation, uname, getrandom, prctl, extended I/O
 * (pwrite, writev, readv), truncation, sync, links, and ioctl. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dispatch-internal.h" +#include "kbox/path.h" + +struct kbox_dispatch forward_clock_gettime( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + int clockid = (int) to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t remote_ts = kbox_syscall_request_arg(req, 1); + + if (remote_ts == 0) + return kbox_dispatch_errno(EFAULT); + + struct timespec ts; + if (clock_gettime(clockid, &ts) < 0) + return kbox_dispatch_errno(errno); + + int wrc = guest_mem_write(ctx, pid, remote_ts, &ts, sizeof(ts)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + + return kbox_dispatch_value(0); +} + +struct kbox_dispatch forward_clock_getres( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + int clockid = (int) to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t remote_ts = kbox_syscall_request_arg(req, 1); + + struct timespec ts; + if (clock_getres(clockid, remote_ts ? &ts : NULL) < 0) + return kbox_dispatch_errno(errno); + + if (remote_ts != 0) { + int wrc = guest_mem_write(ctx, pid, remote_ts, &ts, sizeof(ts)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + } + + return kbox_dispatch_value(0); +} + +struct kbox_dispatch forward_gettimeofday( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_tv = kbox_syscall_request_arg(req, 0); + uint64_t remote_tz = kbox_syscall_request_arg(req, 1); + + /* Use clock_gettime(CLOCK_REALTIME) as the underlying source, which works + * on both x86_64 and aarch64. 
+ */ + if (remote_tv != 0) { + struct timespec ts; + if (clock_gettime(CLOCK_REALTIME, &ts) < 0) + return kbox_dispatch_errno(errno); + + struct { + long tv_sec; + long tv_usec; + } tv; + tv.tv_sec = ts.tv_sec; + tv.tv_usec = ts.tv_nsec / 1000; + + int wrc = guest_mem_write(ctx, pid, remote_tv, &tv, sizeof(tv)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + } + + if (remote_tz != 0) { + /* Return zeroed timezone (UTC). */ + struct { + int tz_minuteswest; + int tz_dsttime; + } tz = {0, 0}; + + int wrc = guest_mem_write(ctx, pid, remote_tz, &tz, sizeof(tz)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + } + + return kbox_dispatch_value(0); +} + +struct kbox_dispatch forward_readlinkat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + long dirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 0)); + char pathbuf[KBOX_MAX_PATH]; + int rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 1), + pathbuf, sizeof(pathbuf)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + uint64_t remote_buf = kbox_syscall_request_arg(req, 2); + int64_t bufsiz_raw = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + if (bufsiz_raw < 0) + return kbox_dispatch_errno(EINVAL); + size_t bufsiz = (size_t) bufsiz_raw; + + if (remote_buf == 0) + return kbox_dispatch_errno(EFAULT); + + char translated[KBOX_MAX_PATH]; + rc = kbox_translate_path_for_lkl(pid, pathbuf, ctx->host_root, translated, + sizeof(translated)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + long lkl_dirfd = resolve_open_dirfd(translated, dirfd_raw, ctx->fd_table); + if (lkl_dirfd < 0 && lkl_dirfd != AT_FDCWD_LINUX) + return kbox_dispatch_continue(); + + if (bufsiz > KBOX_MAX_PATH) + bufsiz = KBOX_MAX_PATH; + + char linkbuf[KBOX_MAX_PATH]; + long ret = kbox_lkl_readlinkat(ctx->sysnrs, lkl_dirfd, translated, linkbuf, + (long) bufsiz); + if (ret < 0) + return kbox_dispatch_errno((int) (-ret)); + + size_t n = (size_t) 
ret; + int wrc = guest_mem_write(ctx, pid, remote_buf, linkbuf, n); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + + return kbox_dispatch_value((int64_t) n); +} + +struct kbox_dispatch forward_pipe2(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_pipefd = kbox_syscall_request_arg(req, 0); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + + if (remote_pipefd == 0) + return kbox_dispatch_errno(EFAULT); + + /* Create a real host pipe and inject both ends into the tracee via + * SECCOMP_IOCTL_NOTIF_ADDFD. This makes pipes fully native: + * - dup2/close/read/write on pipe FDs -> CONTINUE (host kernel) + * - Proper fork semantics: both parent and child share the real + * pipe, no virtual FD table conflicts. + * - No LKL overhead for IPC data transfer. + */ + int host_pipefd[2]; + if (pipe2(host_pipefd, (int) flags) < 0) + return kbox_dispatch_errno(errno); + + uint32_t cloexec_flag = (flags & O_CLOEXEC) ? O_CLOEXEC : 0; + + int tracee_fd0 = request_addfd(ctx, req, host_pipefd[0], cloexec_flag); + if (tracee_fd0 < 0) { + close(host_pipefd[0]); + close(host_pipefd[1]); + return kbox_dispatch_errno(-tracee_fd0); + } + + int tracee_fd1 = request_addfd(ctx, req, host_pipefd[1], cloexec_flag); + if (tracee_fd1 < 0) { + close(host_pipefd[0]); + close(host_pipefd[1]); + return kbox_dispatch_errno(-tracee_fd1); + } + + /* Supervisor copies no longer needed; tracee owns its own copies. 
*/ + close(host_pipefd[0]); + close(host_pipefd[1]); + + int guest_fds[2] = {tracee_fd0, tracee_fd1}; + int wrc = + guest_mem_write(ctx, pid, remote_pipefd, guest_fds, sizeof(guest_fds)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + + return kbox_dispatch_value(0); +} + +struct kbox_dispatch forward_uname(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_buf = kbox_syscall_request_arg(req, 0); + + if (remote_buf == 0) + return kbox_dispatch_errno(EFAULT); + + struct utsname uts; + memset(&uts, 0, sizeof(uts)); + snprintf(uts.sysname, sizeof(uts.sysname), "Linux"); + snprintf(uts.nodename, sizeof(uts.nodename), "kbox"); + snprintf(uts.release, sizeof(uts.release), "6.8.0-kbox"); + snprintf(uts.version, sizeof(uts.version), "#1 SMP"); +#if defined(__x86_64__) + snprintf(uts.machine, sizeof(uts.machine), "x86_64"); +#elif defined(__aarch64__) + snprintf(uts.machine, sizeof(uts.machine), "aarch64"); +#else + snprintf(uts.machine, sizeof(uts.machine), "unknown"); +#endif + + int wrc = guest_mem_write(ctx, pid, remote_buf, &uts, sizeof(uts)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + + return kbox_dispatch_value(0); +} + +struct kbox_dispatch forward_getrandom(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_buf = kbox_syscall_request_arg(req, 0); + int64_t buflen_raw = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + + if (buflen_raw < 0) + return kbox_dispatch_errno(EINVAL); + size_t buflen = (size_t) buflen_raw; + + if (remote_buf == 0) + return kbox_dispatch_errno(EFAULT); + if (buflen == 0) + return kbox_dispatch_value(0); + + /* Read from /dev/urandom via LKL. Fall back to host if LKL does not + * have the device available. 
+ */ + size_t max_chunk = 256; + if (buflen > max_chunk) + buflen = max_chunk; + + uint8_t scratch[256]; + long fd = kbox_lkl_openat(ctx->sysnrs, AT_FDCWD_LINUX, "/dev/urandom", + O_RDONLY, 0); + if (fd < 0) { + /* Fallback: let host kernel handle it. */ + return kbox_dispatch_continue(); + } + + long ret = kbox_lkl_read(ctx->sysnrs, fd, scratch, (long) buflen); + lkl_close_and_invalidate(ctx, fd); + + if (ret < 0) + return kbox_dispatch_errno((int) (-ret)); + + size_t n = (size_t) ret; + int wrc = guest_mem_write(ctx, pid, remote_buf, scratch, n); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + + return kbox_dispatch_value((int64_t) n); +} + +/* syslog(type, buf, len): forward to LKL so dmesg shows the LKL kernel's ring + * buffer, not the host's. + * + * Types that read into buf (2=READ, 3=READ_ALL, 4=READ_CLEAR): call LKL with + * a scratch buffer, then copy to tracee. + * Types that just return a value (0,1,5-10): forward type+len, return the + * result directly. + */ +#define SYSLOG_ACTION_READ 2 +#define SYSLOG_ACTION_READ_ALL 3 +#define SYSLOG_ACTION_READ_CLEAR 4 +#define SYSLOG_ACTION_SIZE_BUFFER 10 + +struct kbox_dispatch forward_syslog(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + long type = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + uint64_t remote_buf = kbox_syscall_request_arg(req, 1); + long len = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + + int needs_buf = + (type == SYSLOG_ACTION_READ || type == SYSLOG_ACTION_READ_ALL || + type == SYSLOG_ACTION_READ_CLEAR); + + if (!needs_buf) { + /* No buffer transfer: SIZE_BUFFER, CONSOLE_ON/OFF, etc. */ + long ret = lkl_syscall6(ctx->sysnrs->syslog, type, 0, len, 0, 0, 0); + return kbox_dispatch_from_lkl(ret); + } + + if (len <= 0) + return kbox_dispatch_errno(EINVAL); + if (remote_buf == 0) + return kbox_dispatch_errno(EFAULT); + + /* Static buffer; safe because the supervisor is single-threaded. 
+ * Clamp to the actual LKL ring buffer size so READ_CLEAR never discards + * data beyond what we can copy out. The ring buffer size is fixed at boot, + * so cache it after the first query. Hard-cap at 1MB (the static buffer + * size) as a safety ceiling. + */ + static uint8_t scratch[1024 * 1024]; + static long cached_ring_sz; + if (!cached_ring_sz) { + long sz = lkl_syscall6(ctx->sysnrs->syslog, SYSLOG_ACTION_SIZE_BUFFER, + 0, 0, 0, 0, 0); + cached_ring_sz = (sz > 0) ? sz : -1; + } + if (cached_ring_sz > 0 && len > cached_ring_sz) + len = cached_ring_sz; + if (len > (long) sizeof(scratch)) + len = (long) sizeof(scratch); + + long ret = + lkl_syscall6(ctx->sysnrs->syslog, type, (long) scratch, len, 0, 0, 0); + if (ret < 0) + return kbox_dispatch_errno((int) (-ret)); + + size_t n = (size_t) ret; + int wrc = guest_mem_write(ctx, pid, remote_buf, scratch, n); + + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + + return kbox_dispatch_value((int64_t) n); +} + +#ifndef PR_SET_NAME +#define PR_SET_NAME 15 +#endif +#ifndef PR_GET_NAME +#define PR_GET_NAME 16 +#endif +#ifndef PR_SET_DUMPABLE +#define PR_SET_DUMPABLE 4 +#endif +#ifndef PR_GET_DUMPABLE +#define PR_GET_DUMPABLE 3 +#endif + +struct kbox_dispatch forward_prctl(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long option = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + + /* Block PR_SET_DUMPABLE(0): clearing dumpability makes process_vm_readv + * fail, which would bypass clone3 namespace-flag sanitization (supervisor + * can't read clone_args.flags from a non-dumpable process). Return success + * without actually clearing; the tracee thinks it worked, but supervisor + * retains read access. + */ + if (option == PR_SET_DUMPABLE && + to_c_long_arg(kbox_syscall_request_arg(req, 1)) == 0) + return kbox_dispatch_value(0); + /* Match: report dumpable even if guest tried to clear it. 
*/ + if (option == PR_GET_DUMPABLE) + return kbox_dispatch_value(1); + + /* Only forward PR_SET_NAME and PR_GET_NAME to LKL. Everything else passes + * through to the host kernel. + * + * PR_SET_NAME/PR_GET_NAME use a 16-byte name buffer. The tracee passes a + * pointer in arg2 which is in the tracee's address space, not ours. We must + * copy through kbox_vm_read/kbox_vm_write. + */ + if (option != PR_SET_NAME && option != PR_GET_NAME) + return kbox_dispatch_continue(); + + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_name = kbox_syscall_request_arg(req, 1); + if (remote_name == 0) + return kbox_dispatch_errno(EFAULT); + + /* PR_SET_NAME: read 16-byte name from tracee, pass local copy to LKL. */ + if (option == PR_SET_NAME) { + char name[16]; + int rrc = guest_mem_read(ctx, pid, remote_name, name, sizeof(name)); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + name[15] = '\0'; /* ensure NUL termination */ + long ret = + lkl_syscall6(ctx->sysnrs->prctl, option, (long) name, 0, 0, 0, 0); + return kbox_dispatch_from_lkl(ret); + } + + /* PR_GET_NAME: get name from LKL into local buffer, write to tracee. 
*/ + char name[16] = {0}; + long ret = + lkl_syscall6(ctx->sysnrs->prctl, option, (long) name, 0, 0, 0, 0); + if (ret < 0) + return kbox_dispatch_from_lkl(ret); + int wrc = guest_mem_write(ctx, pid, remote_name, name, sizeof(name)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + return kbox_dispatch_value(0); +} + +struct kbox_dispatch forward_pwrite64(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + + if (lkl_fd < 0) + return kbox_dispatch_continue(); + if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) + return kbox_dispatch_continue(); + + invalidate_stat_cache_fd(ctx, lkl_fd); + + uint64_t remote_buf = kbox_syscall_request_arg(req, 1); + int64_t count_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + if (count_raw < 0) + return kbox_dispatch_errno(EINVAL); + size_t count = (size_t) count_raw; + long offset = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + + if (remote_buf == 0) + return kbox_dispatch_errno(EFAULT); + if (count == 0) + return kbox_dispatch_value(0); + + pid_t pid = kbox_syscall_request_pid(req); + size_t max_count = 1024 * 1024; + if (count > max_count) + count = max_count; + + size_t total = 0; + uint8_t *scratch = dispatch_scratch; + + while (total < count) { + size_t chunk_len = KBOX_IO_CHUNK_LEN; + if (chunk_len > count - total) + chunk_len = count - total; + + uint64_t remote = remote_buf + total; + int rrc = guest_mem_read(ctx, pid, remote, scratch, chunk_len); + if (rrc < 0) { + if (total > 0) + break; + return kbox_dispatch_errno(-rrc); + } + + long ret = kbox_lkl_pwrite64(ctx->sysnrs, lkl_fd, scratch, + (long) chunk_len, offset + (long) total); + if (ret < 0) { + if (total == 0) { + return kbox_dispatch_errno((int) (-ret)); + } + break; + } + + size_t n = (size_t) ret; + total += n; + if (n < 
chunk_len) + break; + } + + if (total > 0) + invalidate_path_shadow_cache(ctx); + return kbox_dispatch_value((int64_t) total); +} + +/* iovec layout matches the kernel's: { void *iov_base; size_t iov_len; } + * On 64-bit: 16 bytes per entry. + */ +#define IOV_ENTRY_SIZE 16 +/* Match the kernel's UIO_MAXIOV. The iov_buf is static (not stack-allocated) + * because in trap/rewrite mode dispatch runs in signal handler context where + * 16 KB on the stack risks overflow on threads with small stacks. Dispatcher is + * single-threaded (documented invariant), so a static buffer is safe. + */ +#define IOV_MAX_COUNT 1024 +static uint8_t iov_scratch[IOV_MAX_COUNT * IOV_ENTRY_SIZE]; + +/* Shared iov scatter/gather dispatcher for writev and readv. + * + * is_write selects the direction: 1 = guest->LKL (writev), 0 = LKL->guest + * (readv). The chunked loop, iov parsing, and error handling are identical in + * both directions. + */ +static struct kbox_dispatch dispatch_iov_transfer( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + int is_write) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + if (is_write) + invalidate_stat_cache_fd(ctx, lkl_fd); + + pid_t pid = kbox_syscall_request_pid(req); + uint64_t remote_iov = kbox_syscall_request_arg(req, 1); + int64_t iovcnt_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + + if (iovcnt_raw < 0 || iovcnt_raw > IOV_MAX_COUNT) + return kbox_dispatch_errno(EINVAL); + if (iovcnt_raw == 0) + return kbox_dispatch_value(0); + if (remote_iov == 0) + return kbox_dispatch_errno(EFAULT); + + int iovcnt = (int) iovcnt_raw; + size_t iov_bytes = (size_t) iovcnt * IOV_ENTRY_SIZE; + + int rrc = guest_mem_read(ctx, pid, remote_iov, iov_scratch, iov_bytes); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + + int mirror_host = + is_write ? 
kbox_fd_table_mirror_tty(ctx->fd_table, fd) : 0; + size_t total = 0; + uint8_t *scratch = dispatch_scratch; + int err = 0; + + int i; + for (i = 0; i < iovcnt; i++) { + uint64_t base; + uint64_t len; + memcpy(&base, &iov_scratch[i * IOV_ENTRY_SIZE], 8); + memcpy(&len, &iov_scratch[i * IOV_ENTRY_SIZE + 8], 8); + + if (base == 0 || len == 0) + continue; + + size_t seg_total = 0; + while (seg_total < len) { + size_t chunk = KBOX_IO_CHUNK_LEN; + if (chunk > len - seg_total) + chunk = len - seg_total; + + if (is_write) { + rrc = + guest_mem_read(ctx, pid, base + seg_total, scratch, chunk); + if (rrc < 0) { + err = -rrc; + goto done; + } + } + + long ret = + is_write + ? kbox_lkl_write(ctx->sysnrs, lkl_fd, scratch, (long) chunk) + : kbox_lkl_read(ctx->sysnrs, lkl_fd, scratch, (long) chunk); + if (ret < 0) { + err = (int) (-ret); + goto done; + } + + size_t n = (size_t) ret; + if (n == 0) + goto done; + + if (is_write) { + if (mirror_host) + (void) write(STDOUT_FILENO, scratch, n); + } else { + int wrc = + guest_mem_write(ctx, pid, base + seg_total, scratch, n); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + } + + seg_total += n; + total += n; + if (n < chunk) + goto done; + } + } + +done: + if (is_write && total > 0) + invalidate_path_shadow_cache(ctx); + if (total == 0 && err) + return kbox_dispatch_errno(err); + return kbox_dispatch_value((int64_t) total); +} + +struct kbox_dispatch forward_writev(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + return dispatch_iov_transfer(req, ctx, 1); +} + +struct kbox_dispatch forward_readv(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + return dispatch_iov_transfer(req, ctx, 0); +} + +struct kbox_dispatch forward_ftruncate(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = 
fd_table_entry(ctx->fd_table, fd); + + if (lkl_fd < 0) + return kbox_dispatch_continue(); + if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) + return kbox_dispatch_continue(); + + long length = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long ret = kbox_lkl_ftruncate(ctx->sysnrs, lkl_fd, length); + if (ret >= 0) { + invalidate_path_shadow_cache(ctx); + invalidate_stat_cache_fd(ctx, lkl_fd); + } + return kbox_dispatch_from_lkl(ret); +} + +struct kbox_dispatch forward_fallocate(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + + if (lkl_fd < 0) + return kbox_dispatch_continue(); + if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) + return kbox_dispatch_continue(); + + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long offset = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + long len = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + long ret = kbox_lkl_fallocate(ctx->sysnrs, lkl_fd, mode, offset, len); + if (ret == -ENOSYS) + return kbox_dispatch_errno(ENOSYS); + if (ret >= 0) { + invalidate_path_shadow_cache(ctx); + invalidate_stat_cache_fd(ctx, lkl_fd); + } + return kbox_dispatch_from_lkl(ret); +} + +struct kbox_dispatch forward_flock(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + long operation = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long ret = kbox_lkl_flock(ctx->sysnrs, lkl_fd, operation); + return kbox_dispatch_from_lkl(ret); +} + +struct kbox_dispatch forward_fsync(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = 
to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + + if (lkl_fd < 0) + return kbox_dispatch_continue(); + if (entry && entry->shadow_writeback) { + int rc = sync_shadow_writeback(ctx, entry); + if (rc < 0) + return kbox_dispatch_errno(-rc); + return kbox_dispatch_value(0); + } + + long ret = kbox_lkl_fsync(ctx->sysnrs, lkl_fd); + return kbox_dispatch_from_lkl(ret); +} + +struct kbox_dispatch forward_fdatasync(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); + + if (lkl_fd < 0) + return kbox_dispatch_continue(); + if (entry && entry->shadow_writeback) { + int rc = sync_shadow_writeback(ctx, entry); + if (rc < 0) + return kbox_dispatch_errno(-rc); + return kbox_dispatch_value(0); + } + + long ret = kbox_lkl_fdatasync(ctx->sysnrs, lkl_fd); + return kbox_dispatch_from_lkl(ret); +} + +struct kbox_dispatch forward_sync(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + (void) req; + long ret = kbox_lkl_sync(ctx->sysnrs); + return kbox_dispatch_from_lkl(ret); +} + +struct kbox_dispatch forward_symlinkat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + char targetbuf[KBOX_MAX_PATH]; + char linkpathbuf[KBOX_MAX_PATH]; + int rc; + + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 0), + targetbuf, sizeof(targetbuf)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + long newdirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 1)); + + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 2), + linkpathbuf, sizeof(linkpathbuf)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + char 
linktrans[KBOX_MAX_PATH]; + rc = kbox_translate_path_for_lkl(pid, linkpathbuf, ctx->host_root, + linktrans, sizeof(linktrans)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + long newdirfd = resolve_open_dirfd(linktrans, newdirfd_raw, ctx->fd_table); + if (newdirfd < 0 && newdirfd != AT_FDCWD_LINUX) + return kbox_dispatch_continue(); + + /* Target is stored as-is (not translated). */ + long ret = kbox_lkl_symlinkat(ctx->sysnrs, targetbuf, newdirfd, linktrans); + if (ret >= 0) + invalidate_path_shadow_cache(ctx); + return kbox_dispatch_from_lkl(ret); +} + +struct kbox_dispatch forward_linkat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + long olddirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 0)); + char oldpathbuf[KBOX_MAX_PATH]; + int rc; + + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 1), + oldpathbuf, sizeof(oldpathbuf)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + long newdirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 2)); + char newpathbuf[KBOX_MAX_PATH]; + + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 3), + newpathbuf, sizeof(newpathbuf)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 4)); + + char oldtrans[KBOX_MAX_PATH]; + rc = kbox_translate_path_for_lkl(pid, oldpathbuf, ctx->host_root, oldtrans, + sizeof(oldtrans)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + char newtrans[KBOX_MAX_PATH]; + rc = kbox_translate_path_for_lkl(pid, newpathbuf, ctx->host_root, newtrans, + sizeof(newtrans)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + long olddirfd = resolve_open_dirfd(oldtrans, olddirfd_raw, ctx->fd_table); + if (olddirfd < 0 && olddirfd != AT_FDCWD_LINUX) + return kbox_dispatch_continue(); + + long newdirfd = resolve_open_dirfd(newtrans, newdirfd_raw, ctx->fd_table); + if (newdirfd < 0 && newdirfd != AT_FDCWD_LINUX) + return 
kbox_dispatch_continue(); + + long ret = kbox_lkl_linkat(ctx->sysnrs, olddirfd, oldtrans, newdirfd, + newtrans, flags); + if (ret >= 0) + invalidate_path_shadow_cache(ctx); + return kbox_dispatch_from_lkl(ret); +} + +/* struct timespec is 16 bytes on 64-bit: tv_sec(8) + tv_nsec(8). */ +#define TIMESPEC_SIZE 16 + +struct kbox_dispatch forward_utimensat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + pid_t pid = kbox_syscall_request_pid(req); + long dirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 0)); + + /* pathname can be NULL for utimensat (operates on dirfd itself). In that + * case args[1] == 0. + */ + const char *translated_path = NULL; + char translated[KBOX_MAX_PATH]; + long lkl_dirfd; + int rc; + + if (kbox_syscall_request_arg(req, 1) != 0) { + char pathbuf[KBOX_MAX_PATH]; + rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 1), + pathbuf, sizeof(pathbuf)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + rc = kbox_translate_path_for_lkl(pid, pathbuf, ctx->host_root, + translated, sizeof(translated)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + + translated_path = translated; + lkl_dirfd = resolve_open_dirfd(translated, dirfd_raw, ctx->fd_table); + if (lkl_dirfd < 0 && lkl_dirfd != AT_FDCWD_LINUX) + return kbox_dispatch_continue(); + } else { + translated_path = NULL; + /* dirfd must be a virtual FD when path is NULL. */ + lkl_dirfd = kbox_fd_table_get_lkl(ctx->fd_table, dirfd_raw); + if (lkl_dirfd < 0) + return kbox_dispatch_continue(); + } + + /* Read the times array (2 x struct timespec) if provided. 
*/ + uint8_t times_buf[TIMESPEC_SIZE * 2]; + const void *times = NULL; + if (kbox_syscall_request_arg(req, 2) != 0) { + rc = guest_mem_read(ctx, pid, kbox_syscall_request_arg(req, 2), + times_buf, sizeof(times_buf)); + if (rc < 0) + return kbox_dispatch_errno(-rc); + times = times_buf; + } + + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + long ret = kbox_lkl_utimensat(ctx->sysnrs, lkl_dirfd, translated_path, + times, flags); + if (ret >= 0) + invalidate_path_shadow_cache(ctx); + return kbox_dispatch_from_lkl(ret); +} + +/* Terminal ioctl constants. */ +#ifndef TCGETS +#define TCGETS 0x5401 +#endif +#ifndef TCSETS +#define TCSETS 0x5402 +#endif +#ifndef TIOCGWINSZ +#define TIOCGWINSZ 0x5413 +#endif +#ifndef TIOCSWINSZ +#define TIOCSWINSZ 0x5414 +#endif +#ifndef TIOCGPGRP +#define TIOCGPGRP 0x540F +#endif +#ifndef TIOCSPGRP +#define TIOCSPGRP 0x5410 +#endif +#ifndef TIOCSCTTY +#define TIOCSCTTY 0x540E +#endif + +struct kbox_dispatch forward_ioctl(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long cmd = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + + if (lkl_fd < 0) { + /* Host FD (stdin/stdout/stderr or pipe). Most ioctls pass through to + * the host kernel. However, job-control ioctls (TIOCSPGRP/TIOCGPGRP) + * fail with EPERM under seccomp-unotify because the supervised child + * is not the session leader. Return ENOTTY so shells fall back to + * non-job-control mode instead of aborting. + */ + if (cmd == TIOCSPGRP || cmd == TIOCGPGRP || cmd == TIOCSCTTY) + return kbox_dispatch_errno(ENOTTY); + return kbox_dispatch_continue(); + } + + /* For virtual FDs backed by LKL, terminal ioctls yield -ENOTTY since LKL + * file-backed FDs are not terminals. Non-terminal ioctls also yield + * -ENOTTY, matching regular-file semantics. 
+ */
+    return kbox_dispatch_errno(ENOTTY);
+}
diff --git a/src/dispatch-net.c b/src/dispatch-net.c
new file mode 100644
index 0000000..64b9aaf
--- /dev/null
+++ b/src/dispatch-net.c
@@ -0,0 +1,598 @@
+/* SPDX-License-Identifier: MIT */
+
+/* Network syscall handlers for the seccomp dispatch engine. */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/socket.h>
+
+#include "dispatch-internal.h"
+#include "net.h"
+
+/* Shadow socket design:
+ * 1. Create an LKL network socket inside LKL's network stack
+ * 2. Create a host socketpair (sp[0]=supervisor, sp[1]=tracee)
+ * 3. Inject sp[1] into the tracee via ADDFD
+ * 4. Register sp[0]+lkl_fd with the SLIRP event loop
+ * 5. The event loop pumps data between sp[0] and the LKL socket
+ *
+ * The tracee sees a real host FD, so poll/epoll/read/write all work natively
+ * via the host kernel. Only control-plane ops (connect, getsockopt, etc.) need
+ * explicit forwarding.
+ *
+ * INET sockets with SLIRP active get a shadow socket bridge so data flows
+ * through the host kernel socketpair (bypassing BKL contention in blocking LKL
+ * recv/send calls). Non-INET sockets and INET sockets without SLIRP use the
+ * standard virtual FD path.
+ *
+ * Limitation: listen/accept on shadow sockets fail because AF_UNIX socketpair
+ * doesn't support inbound connections. Server sockets must be used without
+ * --net or via a future deferred-bridge approach.
+ */ +struct kbox_dispatch forward_socket(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long domain = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long type_raw = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long protocol = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + + int base_type = (int) type_raw & 0xFF; + + long ret = kbox_lkl_socket(ctx->sysnrs, domain, type_raw, protocol); + if (ret < 0) + return kbox_dispatch_errno((int) (-ret)); + + long lkl_fd = ret; + + /* Virtual FD path when shadow bridge is not applicable: + * - SLIRP not active (no --net) + * - Non-INET domain (AF_UNIX, AF_NETLINK, etc.) + * - Non-stream/datagram type (SOCK_RAW, etc.): socketpair(AF_UNIX) only + * supports SOCK_STREAM and SOCK_DGRAM + */ + if (!kbox_net_is_active() || + (domain != 2 /* AF_INET */ && domain != 10 /* AF_INET6 */) || + (base_type != SOCK_STREAM && base_type != SOCK_DGRAM)) { + long vfd = kbox_fd_table_insert(ctx->fd_table, lkl_fd, 0); + if (vfd < 0) { + lkl_close_and_invalidate(ctx, lkl_fd); + return kbox_dispatch_errno(EMFILE); + } + return kbox_dispatch_value((int64_t) vfd); + } + + /* Shadow socket bridge for INET with SLIRP. */ + int sp[2]; + if (socketpair(AF_UNIX, base_type | SOCK_CLOEXEC, 0, sp) < 0) { + lkl_close_and_invalidate(ctx, lkl_fd); + return kbox_dispatch_errno(errno); + } + fcntl(sp[0], F_SETFL, O_NONBLOCK); + if (type_raw & SOCK_NONBLOCK) + fcntl(sp[1], F_SETFL, O_NONBLOCK); + + long vfd = kbox_fd_table_insert(ctx->fd_table, lkl_fd, 0); + if (vfd < 0) { + close(sp[0]); + close(sp[1]); + lkl_close_and_invalidate(ctx, lkl_fd); + return kbox_dispatch_errno(EMFILE); + } + + if (kbox_net_register_socket((int) lkl_fd, sp[0], base_type) < 0) { + close(sp[0]); + close(sp[1]); + /* Fall back to virtual FD. 
*/ + return kbox_dispatch_value((int64_t) vfd); + } + + uint32_t addfd_flags = 0; + if (type_raw & SOCK_CLOEXEC) + addfd_flags = O_CLOEXEC; + int host_fd = request_addfd(ctx, req, sp[1], addfd_flags); + if (host_fd < 0) { + /* Deregister closes sp[0] and marks inactive. */ + kbox_net_deregister_socket((int) lkl_fd); + close(sp[1]); + kbox_fd_table_remove(ctx->fd_table, vfd); + lkl_close_and_invalidate(ctx, lkl_fd); + return kbox_dispatch_errno(-host_fd); + } + kbox_fd_table_set_host_fd(ctx->fd_table, vfd, host_fd); + + { + struct kbox_fd_entry *e = fd_table_entry(ctx->fd_table, vfd); + if (e) { + e->shadow_sp = sp[1]; + if (type_raw & SOCK_CLOEXEC) + e->cloexec = 1; + } + } + + return kbox_dispatch_value((int64_t) host_fd); +} + +static long resolve_lkl_socket(struct kbox_supervisor_ctx *ctx, long fd); + +struct kbox_dispatch forward_bind(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = resolve_lkl_socket(ctx, fd); + + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + pid_t pid = kbox_syscall_request_pid(req); + uint64_t addr_ptr = kbox_syscall_request_arg(req, 1); + int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + if (len_raw < 0) + return kbox_dispatch_errno(EINVAL); + size_t len = (size_t) len_raw; + + if (addr_ptr == 0) + return kbox_dispatch_errno(EFAULT); + + if (len > 4096) + return kbox_dispatch_errno(EINVAL); + + uint8_t buf[4096]; + int rrc = guest_mem_read(ctx, pid, addr_ptr, buf, len); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + + long ret = kbox_lkl_bind(ctx->sysnrs, lkl_fd, buf, (long) len); + return kbox_dispatch_from_lkl(ret); +} + +/* Resolve LKL FD from a tracee FD. The tracee may hold either a virtual FD + * (>= KBOX_FD_BASE) or a host FD from a shadow-socket bridge via ADDFD. + * Try both paths. 
+ */ +static long resolve_lkl_socket(struct kbox_supervisor_ctx *ctx, long fd) +{ + long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); + if (lkl_fd >= 0) + return lkl_fd; + + /* Shadow socket: tracee uses the host_fd directly. */ + long vfd = kbox_fd_table_find_by_host_fd(ctx->fd_table, fd); + if (vfd >= 0) + return kbox_fd_table_get_lkl(ctx->fd_table, vfd); + + return -1; +} + +struct kbox_dispatch forward_connect(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = resolve_lkl_socket(ctx, fd); + + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + pid_t pid = kbox_syscall_request_pid(req); + uint64_t addr_ptr = kbox_syscall_request_arg(req, 1); + int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + if (len_raw < 0) + return kbox_dispatch_errno(EINVAL); + size_t len = (size_t) len_raw; + + if (addr_ptr == 0) + return kbox_dispatch_errno(EFAULT); + + if (len > 4096) + return kbox_dispatch_errno(EINVAL); + + uint8_t buf[4096]; + int rrc = guest_mem_read(ctx, pid, addr_ptr, buf, len); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + + long ret = kbox_lkl_connect(ctx->sysnrs, lkl_fd, buf, (long) len); + + /* Propagate -EINPROGRESS directly for nonblocking sockets. The tracee's + * poll(POLLOUT) on the AF_UNIX socketpair returns immediately (spurious + * wakeup), but getsockopt(SO_ERROR) is forwarded to the LKL socket and + * returns the real handshake status. The tracee retries poll+getsockopt + * until SO_ERROR clears; standard nonblocking connect flow. 
+ */ + return kbox_dispatch_from_lkl(ret); +} + +struct kbox_dispatch forward_getsockopt(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = resolve_lkl_socket(ctx, fd); + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + pid_t pid = kbox_syscall_request_pid(req); + long level = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long optname = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + uint64_t optval_ptr = kbox_syscall_request_arg(req, 3); + uint64_t optlen_ptr = kbox_syscall_request_arg(req, 4); + + if (optval_ptr == 0 || optlen_ptr == 0) + return kbox_dispatch_errno(EFAULT); + + /* Read the optlen from tracee. */ + unsigned int optlen; + int rrc = guest_mem_read(ctx, pid, optlen_ptr, &optlen, sizeof(optlen)); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + + if (optlen > 4096) + return kbox_dispatch_errno(EINVAL); + + uint8_t optval[4096]; + unsigned int out_len = optlen; + + long ret = kbox_lkl_getsockopt(ctx->sysnrs, lkl_fd, level, optname, optval, + &out_len); + if (ret < 0) + return kbox_dispatch_from_lkl(ret); + + /* Write min(out_len, optlen) to avoid leaking stack data. */ + unsigned int write_len = out_len < optlen ? 
out_len : optlen; + int wrc = guest_mem_write(ctx, pid, optval_ptr, optval, write_len); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + wrc = guest_mem_write(ctx, pid, optlen_ptr, &out_len, sizeof(out_len)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + + return kbox_dispatch_value(0); +} + +struct kbox_dispatch forward_setsockopt(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = resolve_lkl_socket(ctx, fd); + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + pid_t pid = kbox_syscall_request_pid(req); + long level = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long optname = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + uint64_t optval_ptr = kbox_syscall_request_arg(req, 3); + long optlen = to_c_long_arg(kbox_syscall_request_arg(req, 4)); + + if (optlen < 0 || optlen > 4096) + return kbox_dispatch_errno(EINVAL); + + uint8_t optval[4096] = {0}; + if (optval_ptr != 0 && optlen > 0) { + int rrc = guest_mem_read(ctx, pid, optval_ptr, optval, (size_t) optlen); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + } + + long ret = kbox_lkl_setsockopt(ctx->sysnrs, lkl_fd, level, optname, + optval_ptr ? 
optval : NULL, optlen); + return kbox_dispatch_from_lkl(ret); +} + +typedef long (*sockaddr_query_fn)(const struct kbox_sysnrs *s, + long fd, + void *addr, + void *addrlen); + +static struct kbox_dispatch forward_sockaddr_query( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + sockaddr_query_fn query) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = resolve_lkl_socket(ctx, fd); + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + pid_t pid = kbox_syscall_request_pid(req); + uint64_t addr_ptr = kbox_syscall_request_arg(req, 1); + uint64_t len_ptr = kbox_syscall_request_arg(req, 2); + + if (addr_ptr == 0 || len_ptr == 0) + return kbox_dispatch_errno(EFAULT); + + unsigned int addrlen; + int rrc = guest_mem_read(ctx, pid, len_ptr, &addrlen, sizeof(addrlen)); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + + if (addrlen > 4096) + addrlen = 4096; + + uint8_t addr[4096]; + unsigned int out_len = addrlen; + + long ret = query(ctx->sysnrs, lkl_fd, addr, &out_len); + if (ret < 0) + return kbox_dispatch_from_lkl(ret); + + unsigned int write_len = out_len < addrlen ? 
out_len : addrlen; + int wrc = guest_mem_write(ctx, pid, addr_ptr, addr, write_len); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + wrc = guest_mem_write(ctx, pid, len_ptr, &out_len, sizeof(out_len)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + + return kbox_dispatch_value(0); +} + +struct kbox_dispatch forward_getsockname(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + return forward_sockaddr_query(req, ctx, kbox_lkl_getsockname); +} + +struct kbox_dispatch forward_getpeername(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + return forward_sockaddr_query(req, ctx, kbox_lkl_getpeername); +} + +struct kbox_dispatch forward_shutdown(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = resolve_lkl_socket(ctx, fd); + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + long how = to_c_long_arg(kbox_syscall_request_arg(req, 1)); + long ret = kbox_lkl_shutdown(ctx->sysnrs, lkl_fd, how); + return kbox_dispatch_from_lkl(ret); +} + +/* For shadow sockets with a destination address, forward the data + address + * directly to the LKL socket. This is needed for unconnected UDP (DNS resolver + * uses sendto with sockaddr_in without prior connect). + * + * sendto(fd, buf, len, flags, dest_addr, addrlen) + * args[0]=fd, args[1]=buf, args[2]=len, args[3]=flags, + * args[4]=dest_addr, args[5]=addrlen + */ +struct kbox_dispatch forward_sendto(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = resolve_lkl_socket(ctx, fd); + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + uint64_t dest_ptr = kbox_syscall_request_arg(req, 4); + if (dest_ptr == 0) + return kbox_dispatch_continue(); /* no dest addr: stream data path */ + + /* Has a destination address: forward via LKL sendto. 
*/ + pid_t pid = kbox_syscall_request_pid(req); + uint64_t buf_ptr = kbox_syscall_request_arg(req, 1); + int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + int64_t addrlen_raw = to_c_long_arg(kbox_syscall_request_arg(req, 5)); + + if (len_raw < 0 || addrlen_raw < 0) + return kbox_dispatch_errno(EINVAL); + size_t len = (size_t) len_raw; + size_t addrlen = (size_t) addrlen_raw; + + if (len > 65536) + len = 65536; + if (addrlen > 128) + return kbox_dispatch_errno(EINVAL); + + uint8_t buf[65536]; + uint8_t addr[128]; + + int rrc = guest_mem_read(ctx, pid, buf_ptr, buf, len); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + rrc = guest_mem_read(ctx, pid, dest_ptr, addr, addrlen); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + + long ret = kbox_lkl_sendto(ctx->sysnrs, lkl_fd, buf, (long) len, flags, + addr, (long) addrlen); + return kbox_dispatch_from_lkl(ret); +} + +/* For shadow sockets, receive data + source address from the LKL socket and + * write them back to the tracee. 
+ * + * recvfrom(fd, buf, len, flags, src_addr, addrlen) + * args[0]=fd, args[1]=buf, args[2]=len, args[3]=flags, + * args[4]=src_addr, args[5]=addrlen + */ +struct kbox_dispatch forward_recvfrom(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = resolve_lkl_socket(ctx, fd); + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + uint64_t src_ptr = kbox_syscall_request_arg(req, 4); + if (src_ptr == 0) + return kbox_dispatch_continue(); /* no addr buffer: stream path */ + + pid_t pid = kbox_syscall_request_pid(req); + uint64_t buf_ptr = kbox_syscall_request_arg(req, 1); + int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); + uint64_t addrlen_ptr = kbox_syscall_request_arg(req, 5); + + if (len_raw < 0) + return kbox_dispatch_errno(EINVAL); + size_t len = (size_t) len_raw; + if (len > 65536) + len = 65536; + + unsigned int addrlen = 0; + if (addrlen_ptr != 0) { + int rrc = + guest_mem_read(ctx, pid, addrlen_ptr, &addrlen, sizeof(addrlen)); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + } + if (addrlen > 128) + addrlen = 128; + + uint8_t buf[65536]; + uint8_t addr[128]; + unsigned int out_addrlen = addrlen; + + long ret = kbox_lkl_recvfrom(ctx->sysnrs, lkl_fd, buf, (long) len, flags, + addr, &out_addrlen); + if (ret < 0) + return kbox_dispatch_from_lkl(ret); + + int wrc = guest_mem_write(ctx, pid, buf_ptr, buf, (size_t) ret); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + + if (src_ptr != 0 && out_addrlen > 0) { + unsigned int write_len = out_addrlen < addrlen ? 
out_addrlen : addrlen; + wrc = guest_mem_write(ctx, pid, src_ptr, addr, write_len); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + } + if (addrlen_ptr != 0) { + wrc = guest_mem_write(ctx, pid, addrlen_ptr, &out_addrlen, + sizeof(out_addrlen)); + if (wrc < 0) + return kbox_dispatch_errno(-wrc); + } + + return kbox_dispatch_value(ret); +} + +/* Intercept for shadow sockets so that msg_name (source + * address) is populated from the LKL socket, not the AF_UNIX socketpair. + * + * recvmsg(fd, msg, flags) + * args[0]=fd, args[1]=msg_ptr, args[2]=flags + */ +struct kbox_dispatch forward_recvmsg(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); + long lkl_fd = resolve_lkl_socket(ctx, fd); + if (lkl_fd < 0) + return kbox_dispatch_continue(); + + pid_t pid = kbox_syscall_request_pid(req); + uint64_t msg_ptr = kbox_syscall_request_arg(req, 1); + long flags = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + + if (msg_ptr == 0) + return kbox_dispatch_errno(EFAULT); + + struct { + uint64_t msg_name; + uint32_t msg_namelen; + uint32_t __pad0; + uint64_t msg_iov; + uint64_t msg_iovlen; + uint64_t msg_control; + uint64_t msg_controllen; + int msg_flags; + } mh; + int rrc = guest_mem_read(ctx, pid, msg_ptr, &mh, sizeof(mh)); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + + /* No msg_name: for connected stream sockets, CONTINUE via socketpair. */ + if (mh.msg_name == 0 || mh.msg_namelen == 0) + return kbox_dispatch_continue(); + + /* Read all iovecs to determine total buffer capacity. 
*/ + if (mh.msg_iovlen == 0) + return kbox_dispatch_value(0); + + size_t niov = (size_t) mh.msg_iovlen; + if (niov > 64) + niov = 64; + + struct { + uint64_t iov_base; + uint64_t iov_len; + } iovs[64]; + rrc = guest_mem_read(ctx, pid, mh.msg_iov, iovs, niov * sizeof(iovs[0])); + if (rrc < 0) + return kbox_dispatch_errno(-rrc); + + size_t total_cap = 0; + for (size_t v = 0; v < niov; v++) + total_cap += (size_t) iovs[v].iov_len; + if (total_cap > 65536) + total_cap = 65536; + + uint8_t buf[65536]; + uint8_t addr[128]; + unsigned int addrlen = mh.msg_namelen < sizeof(addr) + ? mh.msg_namelen + : (unsigned int) sizeof(addr); + unsigned int out_addrlen = addrlen; + + long ret = kbox_lkl_recvfrom(ctx->sysnrs, lkl_fd, buf, (long) total_cap, + flags, addr, &out_addrlen); + if (ret < 0) + return kbox_dispatch_from_lkl(ret); + + /* Scatter received data across tracee iov buffers. */ + size_t written = 0; + for (size_t v = 0; v < niov && written < (size_t) ret; v++) { + size_t chunk = (size_t) ret - written; + if (chunk > (size_t) iovs[v].iov_len) + chunk = (size_t) iovs[v].iov_len; + if (chunk > 0 && iovs[v].iov_base != 0) { + int wrc2 = guest_mem_write(ctx, pid, iovs[v].iov_base, + buf + written, chunk); + if (wrc2 < 0) + return kbox_dispatch_errno(-wrc2); + written += chunk; + } + } + + /* Write source address to tracee msg_name. */ + if (out_addrlen > 0) { + unsigned int write_len = + out_addrlen < mh.msg_namelen ? out_addrlen : mh.msg_namelen; + int awrc = guest_mem_write(ctx, pid, mh.msg_name, addr, write_len); + if (awrc < 0) + return kbox_dispatch_errno(-awrc); + } + + /* Update msg_namelen in the msghdr. */ + int nwrc = + guest_mem_write(ctx, pid, msg_ptr + 8 /* offset of msg_namelen */, + &out_addrlen, sizeof(out_addrlen)); + if (nwrc < 0) + return kbox_dispatch_errno(-nwrc); + + /* Zero msg_controllen and msg_flags: the recvfrom path does not + * produce ancillary data. 
Without this, CMSG_FIRSTHDR() in the + * tracee would parse uninitialized memory from the msg_control + * buffer. + */ + uint64_t zero8 = 0; + int zero4 = 0; + int cwrc; + /* msg_controllen is at offset 40 (after msg_control at 32). */ + cwrc = guest_mem_write(ctx, pid, msg_ptr + 40, &zero8, sizeof(zero8)); + if (cwrc < 0) + return kbox_dispatch_errno(-cwrc); + /* msg_flags is at offset 48. */ + cwrc = guest_mem_write(ctx, pid, msg_ptr + 48, &zero4, sizeof(zero4)); + if (cwrc < 0) + return kbox_dispatch_errno(-cwrc); + + return kbox_dispatch_value(ret); +} diff --git a/src/seccomp-dispatch.c b/src/seccomp-dispatch.c index d29eb75..1447a22 100644 --- a/src/seccomp-dispatch.c +++ b/src/seccomp-dispatch.c @@ -25,6 +25,7 @@ #include #include +#include "dispatch-internal.h" #include "fd-table.h" #include "kbox/elf.h" #include "kbox/identity.h" @@ -40,92 +41,12 @@ #include "syscall-trap-signal.h" #include "syscall-trap.h" -#define KBOX_FD_HOST_SAME_FD_SHADOW (-2) -#define KBOX_FD_LOCAL_ONLY_SHADOW (-3) -#define KBOX_LKL_FD_SHADOW_ONLY (-2) - -/* Argument extraction helpers. */ - -static inline int64_t to_c_long_arg(uint64_t v) -{ - return (int64_t) v; -} - -/* Static scratch buffer for I/O dispatch. The dispatcher is single-threaded - * and non-reentrant: only one syscall is dispatched at a time. Using a static - * buffer instead of malloc avoids heap allocation from the SIGSYS handler in - * trap/rewrite mode, where the guest may hold glibc heap locks. 
- */ -static uint8_t dispatch_scratch[KBOX_IO_CHUNK_LEN]; - -static inline long to_dirfd_arg(uint64_t v) -{ - return (long) (int) (uint32_t) v; -} - -static int guest_mem_read(const struct kbox_supervisor_ctx *ctx, - pid_t pid, - uint64_t remote_addr, - void *out, - size_t len); -static int guest_mem_write(const struct kbox_supervisor_ctx *ctx, - pid_t pid, - uint64_t remote_addr, - const void *in, - size_t len); -static int try_cached_shadow_open_dispatch( - struct kbox_supervisor_ctx *ctx, - const struct kbox_syscall_request *req, - long flags, - const char *translated, - struct kbox_dispatch *out); -static int try_cached_shadow_stat_dispatch(struct kbox_supervisor_ctx *ctx, - const char *translated, - uint64_t remote_stat, - pid_t pid); -static void invalidate_path_shadow_cache(struct kbox_supervisor_ctx *ctx); -static void invalidate_translated_path_cache(struct kbox_supervisor_ctx *ctx); - -static inline void invalidate_stat_cache_fd(struct kbox_supervisor_ctx *ctx, - long lkl_fd) -{ -#if KBOX_STAT_CACHE_ENABLED - for (int i = 0; i < KBOX_STAT_CACHE_MAX; i++) - if (ctx->stat_cache[i].lkl_fd == lkl_fd) - ctx->stat_cache[i].lkl_fd = -1; -#else - (void) ctx; - (void) lkl_fd; -#endif -} - -/* Close an LKL FD and evict it from the stat cache. Every LKL close in - * the dispatch code should go through this wrapper to prevent stale fstat - * results when the LKL FD number is reused. +/* Static scratch buffer for I/O dispatch. Dispatcher is single-threaded and + * non-reentrant: only one syscall is dispatched at a time. Using static buffer + * instead of malloc avoids heap allocation from the SIGSYS handler in trap / + * rewrite mode, where the guest may hold glibc heap locks. 
*/ -static inline long lkl_close_and_invalidate(struct kbox_supervisor_ctx *ctx, - long lkl_fd) -{ - invalidate_stat_cache_fd(ctx, lkl_fd); - return kbox_lkl_close(ctx->sysnrs, lkl_fd); -} - -static int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx, - const struct kbox_syscall_request *req, - long lkl_fd, - long flags, - const char *translated, - struct kbox_dispatch *out); -static void note_shadow_writeback_open(struct kbox_supervisor_ctx *ctx, - struct kbox_fd_entry *entry); -static void note_shadow_writeback_close(struct kbox_supervisor_ctx *ctx, - struct kbox_fd_entry *entry); - -static int request_uses_trap_signals(const struct kbox_syscall_request *req) -{ - return req && (req->source == KBOX_SYSCALL_SOURCE_TRAP || - req->source == KBOX_SYSCALL_SOURCE_REWRITE); -} +uint8_t dispatch_scratch[KBOX_IO_CHUNK_LEN]; static int request_blocks_reserved_sigsys( const struct kbox_syscall_request *req, @@ -157,17 +78,6 @@ static int request_blocks_reserved_sigsys( return kbox_syscall_trap_sigset_blocks_reserved(mask, read_len) ? 1 : 0; } -static struct kbox_fd_entry *fd_table_entry(struct kbox_fd_table *t, long fd) -{ - if (!t) - return NULL; - if (fd >= KBOX_FD_BASE && fd < KBOX_FD_BASE + KBOX_FD_TABLE_MAX) - return &t->entries[fd - KBOX_FD_BASE]; - if (fd >= 0 && fd < KBOX_LOW_FD_MAX) - return &t->low_fds[fd]; - return NULL; -} - static struct kbox_dispatch emulate_trap_rt_sigprocmask( const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) @@ -186,10 +96,10 @@ static struct kbox_dispatch emulate_trap_rt_sigprocmask( return kbox_dispatch_errno(EINVAL); mask_len = sigset_size; - /* In TRAP mode the signal mask lives in the ucontext delivered by the - * kernel; modifying it there takes effect when the handler returns. - * In REWRITE mode there is no ucontext -- the rewrite dispatch runs - * as a normal function call, so fall back to sigprocmask(2) directly. 
+ /* In TRAP mode the signal mask lives in the ucontext delivered by kernel; + * modifying it there takes effect when the handler returns. + * In REWRITE mode there is no ucontext; the rewrite dispatch runs as normal + * function call, so fall back to sigprocmask(2) directly. */ if (kbox_syscall_trap_get_sigmask(current, sizeof(current)) < 0) { sigset_t tmp; @@ -370,10 +280,10 @@ static const struct kbox_fd_inject_ops local_fd_inject_ops = { .addfd_at = local_request_addfd_at, }; -static int request_addfd(const struct kbox_supervisor_ctx *ctx, - const struct kbox_syscall_request *req, - int srcfd, - uint32_t newfd_flags) +int request_addfd(const struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + int srcfd, + uint32_t newfd_flags) { if (!ctx || !ctx->fd_inject_ops || !ctx->fd_inject_ops->addfd || !req) return -EINVAL; @@ -381,11 +291,11 @@ static int request_addfd(const struct kbox_supervisor_ctx *ctx, srcfd, newfd_flags); } -static int request_addfd_at(const struct kbox_supervisor_ctx *ctx, - const struct kbox_syscall_request *req, - int srcfd, - int target_fd, - uint32_t newfd_flags) +int request_addfd_at(const struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + int srcfd, + int target_fd, + uint32_t newfd_flags) { if (!ctx || !ctx->fd_inject_ops || !ctx->fd_inject_ops->addfd_at || !req) return -EINVAL; @@ -415,53 +325,53 @@ void kbox_dispatch_prepare_request_ctx(struct kbox_supervisor_ctx *ctx, } } -static int guest_mem_read(const struct kbox_supervisor_ctx *ctx, - pid_t pid, - uint64_t remote_addr, - void *out, - size_t len) +int guest_mem_read(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + void *out, + size_t len) { (void) pid; return kbox_guest_mem_read(&ctx->active_guest_mem, remote_addr, out, len); } -static int guest_mem_write(const struct kbox_supervisor_ctx *ctx, - pid_t pid, - uint64_t remote_addr, - const void *in, - size_t len) +int guest_mem_write(const struct 
kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + const void *in, + size_t len) { (void) pid; return kbox_guest_mem_write(&ctx->active_guest_mem, remote_addr, in, len); } -static int guest_mem_write_force(const struct kbox_supervisor_ctx *ctx, - pid_t pid, - uint64_t remote_addr, - const void *in, - size_t len) +int guest_mem_write_force(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + const void *in, + size_t len) { (void) pid; return kbox_guest_mem_write_force(&ctx->active_guest_mem, remote_addr, in, len); } -static int guest_mem_read_string(const struct kbox_supervisor_ctx *ctx, - pid_t pid, - uint64_t remote_addr, - char *buf, - size_t max_len) +int guest_mem_read_string(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + char *buf, + size_t max_len) { (void) pid; return kbox_guest_mem_read_string(&ctx->active_guest_mem, remote_addr, buf, max_len); } -static int guest_mem_read_open_how(const struct kbox_supervisor_ctx *ctx, - pid_t pid, - uint64_t remote_addr, - uint64_t size, - struct kbox_open_how *out) +int guest_mem_read_open_how(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + uint64_t size, + struct kbox_open_how *out) { (void) pid; @@ -469,74 +379,6 @@ static int guest_mem_read_open_how(const struct kbox_supervisor_ctx *ctx, size, out); } -/* Open-flag ABI translation (aarch64 host <-> asm-generic LKL). */ - -/* aarch64 and asm-generic define four O_* flags differently: - * - * Flag aarch64 asm-generic (LKL) - * O_DIRECTORY 0x04000 0x10000 - * O_NOFOLLOW 0x08000 0x20000 - * O_DIRECT 0x10000 0x04000 - * O_LARGEFILE 0x20000 0x08000 - * - * x86_64 values already match asm-generic so no translation is needed there. 
- */ -#if defined(__aarch64__) - -#define HOST_O_DIRECTORY 0x04000 -#define HOST_O_NOFOLLOW 0x08000 -#define HOST_O_DIRECT 0x10000 -#define HOST_O_LARGEFILE 0x20000 - -#define LKL_O_DIRECTORY 0x10000 -#define LKL_O_NOFOLLOW 0x20000 -#define LKL_O_DIRECT 0x04000 -#define LKL_O_LARGEFILE 0x08000 - -static inline long host_to_lkl_open_flags(long flags) -{ - long out = flags & ~(HOST_O_DIRECTORY | HOST_O_NOFOLLOW | HOST_O_DIRECT | - HOST_O_LARGEFILE); - if (flags & HOST_O_DIRECTORY) - out |= LKL_O_DIRECTORY; - if (flags & HOST_O_NOFOLLOW) - out |= LKL_O_NOFOLLOW; - if (flags & HOST_O_DIRECT) - out |= LKL_O_DIRECT; - if (flags & HOST_O_LARGEFILE) - out |= LKL_O_LARGEFILE; - return out; -} - -static inline long lkl_to_host_open_flags(long flags) -{ - long out = flags & ~(LKL_O_DIRECTORY | LKL_O_NOFOLLOW | LKL_O_DIRECT | - LKL_O_LARGEFILE); - if (flags & LKL_O_DIRECTORY) - out |= HOST_O_DIRECTORY; - if (flags & LKL_O_NOFOLLOW) - out |= HOST_O_NOFOLLOW; - if (flags & LKL_O_DIRECT) - out |= HOST_O_DIRECT; - if (flags & LKL_O_LARGEFILE) - out |= HOST_O_LARGEFILE; - return out; -} - -#else /* x86_64: flags already match asm-generic */ - -static inline long host_to_lkl_open_flags(long flags) -{ - return flags; -} - -static inline long lkl_to_host_open_flags(long flags) -{ - return flags; -} - -#endif - /* Stat ABI conversion. */ /* Convert LKL's generic-arch stat layout to the host's struct stat. @@ -546,12 +388,10 @@ static inline long lkl_to_host_open_flags(long flags) * generic: st_mode (u32) at offset 16, st_nlink (u32) at offset 20 * x86_64: st_nlink (u64) at offset 16, st_mode (u32) at offset 24 * - * On aarch64 the kernel uses the generic layout, but the C library's struct - * stat may still have different padding, so convert explicitly on all - * architectures. + * On aarch64 the kernel uses the generic layout, but C library's struct stat + * may still have different padding, so convert explicitly on all arch. 
*/ -static void kbox_lkl_stat_to_host(const struct kbox_lkl_stat *src, - struct stat *dst) +void kbox_lkl_stat_to_host(const struct kbox_lkl_stat *src, struct stat *dst) { memset(dst, 0, sizeof(*dst)); dst->st_dev = (dev_t) src->st_dev; @@ -614,14 +454,14 @@ struct kbox_dispatch kbox_dispatch_from_lkl(long ret) /* Resolve dirfd for *at() syscalls. * - * If the path is absolute, AT_FDCWD is fine regardless of dirfd. If the - * dirfd is AT_FDCWD, pass it through. Otherwise look up the virtual FD in - * the table to get the LKL fd. Returns -1 if the fd is not in the table - * (caller should CONTINUE). + * If the path is absolute, AT_FDCWD is fine regardless of dirfd. If dirfd is + * AT_FDCWD, pass it through. Otherwise look up the virtual FD in the table to + * get the LKL fd. Returns -1 if the fd is not in the table (caller should + * CONTINUE). */ -static long resolve_open_dirfd(const char *path, - long dirfd, - const struct kbox_fd_table *table) +long resolve_open_dirfd(const char *path, + long dirfd, + const struct kbox_fd_table *table) { if (path[0] == '/') return AT_FDCWD_LINUX; @@ -630,11 +470,11 @@ static long resolve_open_dirfd(const char *path, return kbox_fd_table_get_lkl(table, dirfd); } -static int read_guest_string(const struct kbox_supervisor_ctx *ctx, - pid_t pid, - uint64_t addr, - char *buf, - size_t size) +int read_guest_string(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t addr, + char *buf, + size_t size) { return guest_mem_read_string(ctx, pid, addr, buf, size); } @@ -708,7 +548,7 @@ static struct kbox_literal_path_cache_entry *reserve_literal_path_cache( return &ctx->literal_path_cache[0]; } -static int guest_addr_is_writable(pid_t pid, uint64_t addr) +int guest_addr_is_writable(pid_t pid, uint64_t addr) { char maps_path[64]; FILE *fp; @@ -735,7 +575,7 @@ static int guest_addr_is_writable(pid_t pid, uint64_t addr) return 1; } -static void invalidate_translated_path_cache(struct kbox_supervisor_ctx *ctx) +void 
invalidate_translated_path_cache(struct kbox_supervisor_ctx *ctx) { size_t i; @@ -748,12 +588,12 @@ static void invalidate_translated_path_cache(struct kbox_supervisor_ctx *ctx) ctx->literal_path_cache[i].valid = 0; } -static int translate_guest_path(const struct kbox_supervisor_ctx *ctx, - pid_t pid, - uint64_t addr, - const char *host_root, - char *translated, - size_t size) +int translate_guest_path(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t addr, + const char *host_root, + char *translated, + size_t size) { struct kbox_supervisor_ctx *mutable_ctx = (struct kbox_supervisor_ctx *) ctx; @@ -814,25 +654,25 @@ static int translate_guest_path(const struct kbox_supervisor_ctx *ctx, return 0; } -static int translate_request_path(const struct kbox_syscall_request *req, - const struct kbox_supervisor_ctx *ctx, - size_t path_idx, - const char *host_root, - char *translated, - size_t size) +int translate_request_path(const struct kbox_syscall_request *req, + const struct kbox_supervisor_ctx *ctx, + size_t path_idx, + const char *host_root, + char *translated, + size_t size) { return translate_guest_path(ctx, kbox_syscall_request_pid(req), kbox_syscall_request_arg(req, path_idx), host_root, translated, size); } -static int translate_request_at_path(const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx, - size_t dirfd_idx, - size_t path_idx, - char *translated, - size_t size, - long *lkl_dirfd) +int translate_request_at_path(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + size_t dirfd_idx, + size_t path_idx, + char *translated, + size_t size, + long *lkl_dirfd) { int rc = translate_request_path(req, ctx, path_idx, ctx->host_root, translated, size); @@ -845,12 +685,12 @@ static int translate_request_at_path(const struct kbox_syscall_request *req, return 0; } -static int should_continue_for_dirfd(long lkl_dirfd) +int should_continue_for_dirfd(long lkl_dirfd) { return lkl_dirfd < 0 && lkl_dirfd != 
AT_FDCWD_LINUX; } -static int child_fd_is_open(const struct kbox_supervisor_ctx *ctx, long fd) +int child_fd_is_open(const struct kbox_supervisor_ctx *ctx, long fd) { char link_path[64]; char target[1]; @@ -864,7 +704,7 @@ static int child_fd_is_open(const struct kbox_supervisor_ctx *ctx, long fd) return errno != ENOENT; } -static long allocate_passthrough_hostonly_fd(struct kbox_supervisor_ctx *ctx) +long allocate_passthrough_hostonly_fd(struct kbox_supervisor_ctx *ctx) { long base_fd = KBOX_FD_HOSTONLY_BASE; long end_fd = KBOX_FD_BASE + KBOX_FD_TABLE_MAX; @@ -894,7 +734,7 @@ static long allocate_passthrough_hostonly_fd(struct kbox_supervisor_ctx *ctx) return -1; } -static long next_hostonly_fd_hint(const struct kbox_supervisor_ctx *ctx) +long next_hostonly_fd_hint(const struct kbox_supervisor_ctx *ctx) { long fd; long end_fd = KBOX_FD_BASE + KBOX_FD_TABLE_MAX; @@ -908,7 +748,7 @@ static long next_hostonly_fd_hint(const struct kbox_supervisor_ctx *ctx) return fd; } -static int ensure_proc_self_fd_dir(struct kbox_supervisor_ctx *ctx) +int ensure_proc_self_fd_dir(struct kbox_supervisor_ctx *ctx) { if (!ctx) return -1; @@ -920,7 +760,7 @@ static int ensure_proc_self_fd_dir(struct kbox_supervisor_ctx *ctx) return ctx->proc_self_fd_dirfd; } -static int ensure_proc_mem_fd(struct kbox_supervisor_ctx *ctx) +int ensure_proc_mem_fd(struct kbox_supervisor_ctx *ctx) { char path[64]; @@ -934,11 +774,11 @@ static int ensure_proc_mem_fd(struct kbox_supervisor_ctx *ctx) return ctx->proc_mem_fd; } -static int guest_mem_write_small_metadata(const struct kbox_supervisor_ctx *ctx, - pid_t pid, - uint64_t remote_addr, - const void *in, - size_t len) +int guest_mem_write_small_metadata(const struct kbox_supervisor_ctx *ctx, + pid_t pid, + uint64_t remote_addr, + const void *in, + size_t len) { struct kbox_supervisor_ctx *mutable_ctx = (struct kbox_supervisor_ctx *) ctx; @@ -967,9 +807,8 @@ static int guest_mem_write_small_metadata(const struct kbox_supervisor_ctx *ctx, return 0; } 
-static int reopen_cached_shadow_fd( - struct kbox_supervisor_ctx *ctx, - const struct kbox_path_shadow_cache_entry *entry) +int reopen_cached_shadow_fd(struct kbox_supervisor_ctx *ctx, + const struct kbox_path_shadow_cache_entry *entry) { char fd_name[32]; int dirfd; @@ -992,10 +831,10 @@ static int reopen_cached_shadow_fd( return openat(dirfd, fd_name, O_RDONLY | O_CLOEXEC); } -/* Promote a read-only regular LKL FD to a host-visible shadow at the same - * guest FD number on first eligible read-only access. This avoids paying the - * memfd copy cost at open time while still letting later read/lseek/fstat/mmap - * operations run on a real host FD. +/* Promote a read-only regular LKL FD to a host-visible shadow at the same guest + * FD number on first eligible read-only access. This avoids paying memfd copy + * cost at open time while still letting later read/lseek/fstat/mmap operations + * run on a real host FD. * * Returns: * 1 shadow is available (same-fd injected for seccomp, local-only for @@ -1003,10 +842,10 @@ static int reopen_cached_shadow_fd( * 0 shadow promotion not applicable * -1 promotion attempted but failed */ -static int ensure_same_fd_shadow(struct kbox_supervisor_ctx *ctx, - const struct kbox_syscall_request *req, - long fd, - long lkl_fd) +int ensure_same_fd_shadow(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long fd, + long lkl_fd) { struct kbox_fd_entry *entry; long flags; @@ -1042,17 +881,13 @@ static int ensure_same_fd_shadow(struct kbox_supervisor_ctx *ctx, return -1; } - if (req->source == KBOX_SYSCALL_SOURCE_SECCOMP) { - int injected = request_addfd_at(ctx, req, memfd, (int) fd, - entry->cloexec ? O_CLOEXEC : 0); - if (injected < 0) { - close(memfd); - return -1; - } - entry->host_fd = KBOX_FD_HOST_SAME_FD_SHADOW; - } else { - entry->host_fd = KBOX_FD_LOCAL_ONLY_SHADOW; - } + /* Keep lazy read shadows local to the supervisor in all modes. 
+ * Injecting them at the guest FD number lets read(2)/lseek(2) CONTINUE, + * but it reintroduces a close/open reuse race under concurrent seccomp + * notifications. The local-shadow handlers already cover read/lseek/fstat + * safely, so prefer them over same-fd injection here. + */ + entry->host_fd = KBOX_FD_LOCAL_ONLY_SHADOW; entry->shadow_sp = memfd; entry->shadow_writeback = 0; @@ -1210,9 +1045,9 @@ static struct kbox_dispatch finish_open_dispatch( return kbox_dispatch_value((int64_t) vfd); } -static void normalize_host_stat_if_needed(struct kbox_supervisor_ctx *ctx, - const char *path, - struct stat *host_stat) +void normalize_host_stat_if_needed(struct kbox_supervisor_ctx *ctx, + const char *path, + struct stat *host_stat) { if (!ctx->normalize) return; @@ -1226,9 +1061,9 @@ static void normalize_host_stat_if_needed(struct kbox_supervisor_ctx *ctx, host_stat->st_gid = n_gid; } -static void normalize_statx_if_needed(struct kbox_supervisor_ctx *ctx, - const char *path, - uint8_t *statx_buf) +void normalize_statx_if_needed(struct kbox_supervisor_ctx *ctx, + const char *path, + uint8_t *statx_buf) { if (!ctx->normalize) return; @@ -1243,7 +1078,7 @@ static void normalize_statx_if_needed(struct kbox_supervisor_ctx *ctx, memcpy(&statx_buf[STATX_GID_OFFSET], &n_gid, 4); } -static void invalidate_path_shadow_cache(struct kbox_supervisor_ctx *ctx) +void invalidate_path_shadow_cache(struct kbox_supervisor_ctx *ctx) { size_t i; @@ -1302,8 +1137,8 @@ static struct kbox_path_shadow_cache_entry *reserve_path_shadow_cache_slot( return entry; } -static int ensure_path_shadow_cache(struct kbox_supervisor_ctx *ctx, - const char *translated) +int ensure_path_shadow_cache(struct kbox_supervisor_ctx *ctx, + const char *translated) { struct kbox_path_shadow_cache_entry *entry; struct stat host_stat; @@ -1347,17 +1182,15 @@ static int ensure_path_shadow_cache(struct kbox_supervisor_ctx *ctx, return 1; } -static int try_cached_shadow_open_dispatch( - struct kbox_supervisor_ctx *ctx, 
- const struct kbox_syscall_request *req, - long flags, - const char *translated, - struct kbox_dispatch *out) +int try_cached_shadow_open_dispatch(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long flags, + const char *translated, + struct kbox_dispatch *out) { struct kbox_path_shadow_cache_entry *entry; int injected; int dup_fd; - long fast_fd; if (!ctx || !req || !translated || !out) return 0; @@ -1376,35 +1209,25 @@ static int try_cached_shadow_open_dispatch( if (dup_fd < 0) return 0; - fast_fd = next_hostonly_fd_hint(ctx); - if (fast_fd < 0) { - close(dup_fd); - return 0; - } - injected = request_addfd_at(ctx, req, dup_fd, (int) fast_fd, - (flags & O_CLOEXEC) ? O_CLOEXEC : 0); - if (injected < 0) { - fast_fd = allocate_passthrough_hostonly_fd(ctx); - if (fast_fd < 0) { - close(dup_fd); - return 0; - } - injected = request_addfd_at(ctx, req, dup_fd, (int) fast_fd, - (flags & O_CLOEXEC) ? O_CLOEXEC : 0); - } + /* Let the kernel pick a fresh host-visible FD. Reusing a fixed target FD + * races with concurrent close(2): the supervisor removes its bookkeeping + * before the kernel replays the close, so another thread can reuse that FD + * number and have the older close tear down the new file. + */ + injected = + request_addfd(ctx, req, dup_fd, (flags & O_CLOEXEC) ? 
O_CLOEXEC : 0); close(dup_fd); if (injected < 0) return 0; - ctx->fd_table->next_hostonly_fd = fast_fd; - *out = kbox_dispatch_value((int64_t) fast_fd); + *out = kbox_dispatch_value((int64_t) injected); return 1; } -static int try_cached_shadow_stat_dispatch(struct kbox_supervisor_ctx *ctx, - const char *translated, - uint64_t remote_stat, - pid_t pid) +int try_cached_shadow_stat_dispatch(struct kbox_supervisor_ctx *ctx, + const char *translated, + uint64_t remote_stat, + pid_t pid) { struct kbox_path_shadow_cache_entry *entry; @@ -1422,8 +1245,8 @@ static int try_cached_shadow_stat_dispatch(struct kbox_supervisor_ctx *ctx, sizeof(entry->host_stat)) == 0; } -static void note_shadow_writeback_open(struct kbox_supervisor_ctx *ctx, - struct kbox_fd_entry *entry) +void note_shadow_writeback_open(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry) { if (!ctx || !entry || entry->shadow_writeback) return; @@ -1432,8 +1255,8 @@ static void note_shadow_writeback_open(struct kbox_supervisor_ctx *ctx, invalidate_path_shadow_cache(ctx); } -static void note_shadow_writeback_close(struct kbox_supervisor_ctx *ctx, - struct kbox_fd_entry *entry) +void note_shadow_writeback_close(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry) { if (!ctx || !entry || !entry->shadow_writeback) return; @@ -1442,17 +1265,17 @@ static void note_shadow_writeback_close(struct kbox_supervisor_ctx *ctx, ctx->active_writeback_shadows--; } -static int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx, - const struct kbox_syscall_request *req, - long lkl_fd, - long flags, - const char *translated, - struct kbox_dispatch *out) +int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx, + const struct kbox_syscall_request *req, + long lkl_fd, + long flags, + const char *translated, + struct kbox_dispatch *out) { struct kbox_fd_entry *entry; int memfd; int injected; - long fast_fd; + long vfd; if (!ctx || !req || !out || lkl_fd < 0 || !translated) return 0; @@ -1470,23 
+1293,23 @@ static int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx, * sealed. */ - fast_fd = kbox_fd_table_insert_fast(ctx->fd_table, lkl_fd, 0); - if (fast_fd < 0) { + vfd = kbox_fd_table_insert_fast(ctx->fd_table, lkl_fd, 0); + if (vfd < 0) { close(memfd); return 0; } - injected = request_addfd_at(ctx, req, memfd, (int) fast_fd, + injected = request_addfd_at(ctx, req, memfd, (int) vfd, (flags & O_CLOEXEC) ? O_CLOEXEC : 0); if (injected < 0) { - kbox_fd_table_remove(ctx->fd_table, fast_fd); + kbox_fd_table_remove(ctx->fd_table, vfd); close(memfd); return 0; } - entry = fd_table_entry(ctx->fd_table, fast_fd); + entry = fd_table_entry(ctx->fd_table, vfd); if (!entry) { - kbox_fd_table_remove(ctx->fd_table, fast_fd); + kbox_fd_table_remove(ctx->fd_table, vfd); close(memfd); return 0; } @@ -1497,9 +1320,9 @@ static int try_writeback_shadow_open(struct kbox_supervisor_ctx *ctx, if (ctx->verbose) { fprintf(stderr, "kbox: writable shadow promote fd=%ld lkl_fd=%ld path=%s\n", - fast_fd, lkl_fd, translated); + vfd, lkl_fd, translated); } - *out = kbox_dispatch_value((int64_t) fast_fd); + *out = kbox_dispatch_value((int64_t) vfd); return 1; } @@ -1683,8 +1506,8 @@ static struct kbox_dispatch forward_open_legacy( return finish_open_dispatch(ctx, req, ret, flags, translated); } -static int sync_shadow_writeback(struct kbox_supervisor_ctx *ctx, - struct kbox_fd_entry *entry) +int sync_shadow_writeback(struct kbox_supervisor_ctx *ctx, + struct kbox_fd_entry *entry) { struct stat st; uint8_t *buf = NULL; @@ -2600,7 +2423,7 @@ static struct kbox_dispatch forward_fstat( if (lkl_fd < 0) return kbox_dispatch_continue(); /* If a shadow already exists (from a prior mmap), let the host handle - * fstat against the memfd. Do NOT create a shadow here -- fstat is a + * fstat against the memfd. Do NOT create a shadow here; fstat is a * metadata query that LKL answers directly without the expensive * memfd_create + pread loop. 
*/ @@ -2717,9 +2540,9 @@ static struct kbox_dispatch forward_newfstatat( * thread. Safe to call from the SIGSYS handler or rewrite trampoline context. * * Three tiers: - * 1. Pure emulation -- cached constant values (getpid, getppid, gettid). - * 2. Always-CONTINUE -- host kernel handles the syscall unmodified. - * 3. Conditional emulation -- e.g. arch_prctl(SET_FS) in trap/rewrite. + * 1. Pure emulation: cached constant values (getpid, getppid, gettid). + * 2. Always-CONTINUE: host kernel handles the syscall unmodified. + * 3. Conditional emulation: e.g. arch_prctl(SET_FS) in trap/rewrite. * * LKL-touching syscalls (stat, openat, read on LKL FDs, etc.) are NOT * handled here; they MUST go through the service thread. @@ -2745,7 +2568,7 @@ int kbox_dispatch_try_local_fast_path(const struct kbox_host_nrs *h, return 1; } - /* Tier 2: always-CONTINUE -- host kernel handles these directly. */ + /* Tier 2: always-CONTINUE; host kernel handles these directly. */ if (nr == h->brk || nr == h->futex || nr == h->rseq || nr == h->set_tid_address || nr == h->set_robust_list || nr == h->munmap || nr == h->mremap || nr == h->membarrier || @@ -2939,11 +2762,20 @@ static struct kbox_dispatch forward_getcwd( return kbox_dispatch_value((int64_t) n); } -/* forward_mkdirat. */ +/* Shared skeleton for simple *at() syscalls: translate path + resolve dirfd, + * then delegate to a callback for the actual LKL call. 
+ */ + +typedef long (*at_path_invoke_fn)(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + long lkl_dirfd, + const char *translated); -static struct kbox_dispatch forward_mkdirat( +static struct kbox_dispatch forward_at_path_call( const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) + struct kbox_supervisor_ctx *ctx, + at_path_invoke_fn invoke, + int invalidate_on_success) { char translated[KBOX_MAX_PATH]; long lkl_dirfd; @@ -2954,33 +2786,42 @@ static struct kbox_dispatch forward_mkdirat( if (should_continue_for_dirfd(lkl_dirfd)) return kbox_dispatch_continue(); - long mode = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - long ret = kbox_lkl_mkdirat(ctx->sysnrs, lkl_dirfd, translated, mode); - if (ret >= 0) + long ret = invoke(req, ctx, lkl_dirfd, translated); + if (invalidate_on_success && ret >= 0) invalidate_path_shadow_cache(ctx); return kbox_dispatch_from_lkl(ret); } -/* forward_unlinkat. */ +static long invoke_mkdirat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + long lkl_dirfd, + const char *translated) +{ + long mode = to_c_long_arg(kbox_syscall_request_arg(req, 2)); + return kbox_lkl_mkdirat(ctx->sysnrs, lkl_dirfd, translated, mode); +} -static struct kbox_dispatch forward_unlinkat( +static struct kbox_dispatch forward_mkdirat( const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - char translated[KBOX_MAX_PATH]; - long lkl_dirfd; - int rc = translate_request_at_path(req, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); - if (rc < 0) - return kbox_dispatch_errno(-rc); - if (should_continue_for_dirfd(lkl_dirfd)) - return kbox_dispatch_continue(); + return forward_at_path_call(req, ctx, invoke_mkdirat, 1); +} +static long invoke_unlinkat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + long lkl_dirfd, + const char *translated) +{ long flags = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - long ret = 
kbox_lkl_unlinkat(ctx->sysnrs, lkl_dirfd, translated, flags); - if (ret >= 0) - invalidate_path_shadow_cache(ctx); - return kbox_dispatch_from_lkl(ret); + return kbox_lkl_unlinkat(ctx->sysnrs, lkl_dirfd, translated, flags); +} + +static struct kbox_dispatch forward_unlinkat( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + return forward_at_path_call(req, ctx, invoke_unlinkat, 1); } /* forward_renameat / forward_renameat2. */ @@ -3027,49 +2868,40 @@ static struct kbox_dispatch forward_renameat2( to_c_long_arg(kbox_syscall_request_arg(req, 4))); } -/* forward_fchmodat. */ - -static struct kbox_dispatch forward_fchmodat( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) +static long invoke_fchmodat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + long lkl_dirfd, + const char *translated) { - char translated[KBOX_MAX_PATH]; - long lkl_dirfd; - int rc = translate_request_at_path(req, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); - if (rc < 0) - return kbox_dispatch_errno(-rc); - if (should_continue_for_dirfd(lkl_dirfd)) - return kbox_dispatch_continue(); - long mode = to_c_long_arg(kbox_syscall_request_arg(req, 2)); long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); - long ret = - kbox_lkl_fchmodat(ctx->sysnrs, lkl_dirfd, translated, mode, flags); - return kbox_dispatch_from_lkl(ret); + return kbox_lkl_fchmodat(ctx->sysnrs, lkl_dirfd, translated, mode, flags); } -/* forward_fchownat. 
*/ - -static struct kbox_dispatch forward_fchownat( +static struct kbox_dispatch forward_fchmodat( const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - char translated[KBOX_MAX_PATH]; - long lkl_dirfd; - int rc = translate_request_at_path(req, ctx, 0, 1, translated, - sizeof(translated), &lkl_dirfd); - if (rc < 0) - return kbox_dispatch_errno(-rc); - if (should_continue_for_dirfd(lkl_dirfd)) - return kbox_dispatch_continue(); + return forward_at_path_call(req, ctx, invoke_fchmodat, 0); +} +static long invoke_fchownat(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + long lkl_dirfd, + const char *translated) +{ long owner = to_c_long_arg(kbox_syscall_request_arg(req, 2)); long group = to_c_long_arg(kbox_syscall_request_arg(req, 3)); long flags = to_c_long_arg(kbox_syscall_request_arg(req, 4)); - long ret = kbox_lkl_fchownat(ctx->sysnrs, lkl_dirfd, translated, owner, - group, flags); - return kbox_dispatch_from_lkl(ret); + return kbox_lkl_fchownat(ctx->sysnrs, lkl_dirfd, translated, owner, group, + flags); +} + +static struct kbox_dispatch forward_fchownat( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + return forward_at_path_call(req, ctx, invoke_fchownat, 0); } /* forward_mount. */ @@ -3217,64 +3049,87 @@ static struct kbox_dispatch forward_stat_legacy( return kbox_dispatch_value(0); } -static struct kbox_dispatch forward_access_legacy( +/* Shared skeleton for legacy (non-*at) path syscalls: translate a single + * path from arg[0], then delegate to a callback for the LKL call. 
+ */ + +typedef long (*legacy_path_invoke_fn)(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + const char *translated); + +static struct kbox_dispatch forward_legacy_path_call( const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) + struct kbox_supervisor_ctx *ctx, + legacy_path_invoke_fn invoke) { char translated[KBOX_MAX_PATH]; int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, sizeof(translated)); if (rc < 0) return kbox_dispatch_errno(-rc); + return kbox_dispatch_from_lkl(invoke(req, ctx, translated)); +} +static long invoke_access_legacy(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + const char *translated) +{ long mode = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long ret = - kbox_lkl_faccessat2(ctx->sysnrs, AT_FDCWD_LINUX, translated, mode, 0); - return kbox_dispatch_from_lkl(ret); + return kbox_lkl_faccessat2(ctx->sysnrs, AT_FDCWD_LINUX, translated, mode, + 0); } -static struct kbox_dispatch forward_mkdir_legacy( +static struct kbox_dispatch forward_access_legacy( const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - char translated[KBOX_MAX_PATH]; - int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, - sizeof(translated)); - if (rc < 0) - return kbox_dispatch_errno(-rc); + return forward_legacy_path_call(req, ctx, invoke_access_legacy); +} +static long invoke_mkdir_legacy(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + const char *translated) +{ long mode = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long ret = kbox_lkl_mkdir(ctx->sysnrs, translated, (int) mode); - return kbox_dispatch_from_lkl(ret); + return kbox_lkl_mkdir(ctx->sysnrs, translated, (int) mode); +} + +static struct kbox_dispatch forward_mkdir_legacy( + const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx) +{ + return forward_legacy_path_call(req, ctx, invoke_mkdir_legacy); +} + 
+static long invoke_unlink_legacy(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + const char *translated) +{ + (void) req; + return kbox_lkl_unlinkat(ctx->sysnrs, AT_FDCWD_LINUX, translated, 0); } static struct kbox_dispatch forward_unlink_legacy( const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - char translated[KBOX_MAX_PATH]; - int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, - sizeof(translated)); - if (rc < 0) - return kbox_dispatch_errno(-rc); + return forward_legacy_path_call(req, ctx, invoke_unlink_legacy); +} - long ret = kbox_lkl_unlinkat(ctx->sysnrs, AT_FDCWD_LINUX, translated, 0); - return kbox_dispatch_from_lkl(ret); +static long invoke_rmdir_legacy(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + const char *translated) +{ + (void) req; + return kbox_lkl_unlinkat(ctx->sysnrs, AT_FDCWD_LINUX, translated, + AT_REMOVEDIR); } static struct kbox_dispatch forward_rmdir_legacy( const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - char translated[KBOX_MAX_PATH]; - int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, - sizeof(translated)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - long ret = kbox_lkl_unlinkat(ctx->sysnrs, AT_FDCWD_LINUX, translated, - AT_REMOVEDIR); - return kbox_dispatch_from_lkl(ret); + return forward_legacy_path_call(req, ctx, invoke_rmdir_legacy); } static struct kbox_dispatch forward_rename_legacy( @@ -3297,2981 +3152,164 @@ static struct kbox_dispatch forward_rename_legacy( return kbox_dispatch_from_lkl(ret); } -static struct kbox_dispatch forward_chmod_legacy( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) +static long invoke_chmod_legacy(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + const char *translated) { - char translated[KBOX_MAX_PATH]; - int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, - 
sizeof(translated)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - long mode = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long ret = - kbox_lkl_fchmodat(ctx->sysnrs, AT_FDCWD_LINUX, translated, mode, 0); - return kbox_dispatch_from_lkl(ret); + return kbox_lkl_fchmodat(ctx->sysnrs, AT_FDCWD_LINUX, translated, mode, 0); } -static struct kbox_dispatch forward_chown_legacy( +static struct kbox_dispatch forward_chmod_legacy( const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - char translated[KBOX_MAX_PATH]; - int rc = translate_request_path(req, ctx, 0, ctx->host_root, translated, - sizeof(translated)); - if (rc < 0) - return kbox_dispatch_errno(-rc); + return forward_legacy_path_call(req, ctx, invoke_chmod_legacy); +} +static long invoke_chown_legacy(const struct kbox_syscall_request *req, + struct kbox_supervisor_ctx *ctx, + const char *translated) +{ long owner = to_c_long_arg(kbox_syscall_request_arg(req, 1)); long group = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - long ret = kbox_lkl_fchownat(ctx->sysnrs, AT_FDCWD_LINUX, translated, owner, - group, 0); - return kbox_dispatch_from_lkl(ret); + return kbox_lkl_fchownat(ctx->sysnrs, AT_FDCWD_LINUX, translated, owner, + group, 0); } -/* Identity forwarders: getuid, geteuid, getresuid, etc. 
*/ - -static struct kbox_dispatch forward_getresuid( +static struct kbox_dispatch forward_chown_legacy( const struct kbox_syscall_request *req, struct kbox_supervisor_ctx *ctx) { - pid_t pid = kbox_syscall_request_pid(req); - uint64_t ruid_ptr = kbox_syscall_request_arg(req, 0); - uint64_t euid_ptr = kbox_syscall_request_arg(req, 1); - uint64_t suid_ptr = kbox_syscall_request_arg(req, 2); - - if (ruid_ptr != 0) { - long r = kbox_lkl_getuid(ctx->sysnrs); - if (r < 0) - return kbox_dispatch_errno((int) (-r)); - unsigned val = (unsigned) r; - int wrc = guest_mem_write(ctx, pid, ruid_ptr, &val, sizeof(val)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - if (euid_ptr != 0) { - long r = kbox_lkl_geteuid(ctx->sysnrs); - if (r < 0) - return kbox_dispatch_errno((int) (-r)); - unsigned val = (unsigned) r; - int wrc = guest_mem_write(ctx, pid, euid_ptr, &val, sizeof(val)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - if (suid_ptr != 0) { - /* saved-set-uid = effective uid (LKL has no separate saved). */ - long r = kbox_lkl_geteuid(ctx->sysnrs); - if (r < 0) - return kbox_dispatch_errno((int) (-r)); - unsigned val = (unsigned) r; - int wrc = guest_mem_write(ctx, pid, suid_ptr, &val, sizeof(val)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - return kbox_dispatch_value(0); + return forward_legacy_path_call(req, ctx, invoke_chown_legacy); } -static struct kbox_dispatch forward_getresuid_override( - const struct kbox_syscall_request *req, +/* Syscall dispatch tables. + * + * X-macros eliminate the repetitive if-chains for the two most common dispatch + * patterns. Each _(field, handler) entry expands to + * if (nr == h->field) return handler(req, ctx); + * + * Complex cases (extra args, guards, inline logic) remain as explicit if-blocks + * after the table expansions. + */ + +/* DISPATCH_FORWARD: forwarded to a handler that takes (req, ctx). */ +#define DISPATCH_FORWARD_TABLE(_) \ + /* Legacy x86_64 syscalls. 
*/ \ + _(access, forward_access_legacy) \ + _(mkdir, forward_mkdir_legacy) \ + _(rmdir, forward_rmdir_legacy) \ + _(unlink, forward_unlink_legacy) \ + _(rename, forward_rename_legacy) \ + _(chmod, forward_chmod_legacy) \ + _(chown, forward_chown_legacy) \ + _(open, forward_open_legacy) \ + /* File open/create. */ \ + _(openat, forward_openat) \ + _(openat2, forward_openat2) \ + /* Metadata. */ \ + _(fstat, forward_fstat) \ + _(newfstatat, forward_newfstatat) \ + _(statx, forward_statx) \ + _(faccessat2, forward_faccessat2) \ + /* Directories. */ \ + _(getdents64, forward_getdents64) \ + _(getdents, forward_getdents) \ + _(mkdirat, forward_mkdirat) \ + _(unlinkat, forward_unlinkat) \ + _(renameat2, forward_renameat2) \ + _(fchmodat, forward_fchmodat) \ + _(fchownat, forward_fchownat) \ + /* Navigation. */ \ + _(chdir, forward_chdir) \ + _(fchdir, forward_fchdir) \ + _(getcwd, forward_getcwd) \ + /* Mount. */ \ + _(mount, forward_mount) \ + _(umount2, forward_umount2) \ + /* FD operations. */ \ + _(close, forward_close) \ + _(fcntl, forward_fcntl) \ + _(dup, forward_dup) \ + _(dup2, forward_dup2) \ + _(dup3, forward_dup3) \ + /* I/O. */ \ + _(write, forward_write) \ + _(lseek, forward_lseek) \ + /* Networking. */ \ + _(socket, forward_socket) \ + _(bind, forward_bind) \ + _(connect, forward_connect) \ + _(sendto, forward_sendto) \ + _(recvfrom, forward_recvfrom) \ + _(recvmsg, forward_recvmsg) \ + _(getsockopt, forward_getsockopt) \ + _(setsockopt, forward_setsockopt) \ + _(getsockname, forward_getsockname) \ + _(getpeername, forward_getpeername) \ + _(shutdown, forward_shutdown) \ + /* I/O extended. */ \ + _(pwrite64, forward_pwrite64) \ + _(writev, forward_writev) \ + _(readv, forward_readv) \ + _(ftruncate, forward_ftruncate) \ + _(fallocate, forward_fallocate) \ + _(flock, forward_flock) \ + _(fsync, forward_fsync) \ + _(fdatasync, forward_fdatasync) \ + _(sync, forward_sync) \ + _(ioctl, forward_ioctl) \ + /* File operations. 
*/ \ + _(readlinkat, forward_readlinkat) \ + _(pipe2, forward_pipe2) \ + _(symlinkat, forward_symlinkat) \ + _(linkat, forward_linkat) \ + _(utimensat, forward_utimensat) \ + _(sendfile, forward_sendfile) \ + /* Time. */ \ + _(clock_gettime, forward_clock_gettime) \ + _(clock_getres, forward_clock_getres) \ + _(gettimeofday, forward_gettimeofday) \ + /* Process lifecycle. */ \ + _(umask, forward_umask) \ + _(uname, forward_uname) \ + _(getrandom, forward_getrandom) \ + _(syslog, forward_syslog) \ + _(prctl, forward_prctl) \ + /* Threading. */ \ + _(clone3, forward_clone3) + +/* DISPATCH_CONTINUE: host kernel handles directly, no LKL involvement. */ +/* clang-format off */ +#define DISPATCH_CONTINUE_TABLE(_) \ + _(setpgid) _(getpgid) _(getsid) _(setsid) _(brk) _(wait4) _(waitid) \ + _(exit) _(exit_group) _(rt_sigreturn) _(rt_sigaltstack) _(setitimer) \ + _(getitimer) _(set_tid_address) _(set_robust_list) _(futex) _(rseq) \ + _(fork) _(vfork) _(membarrier) _(madvise) _(getrlimit) _(getrusage) \ + /* Scheduling. */ \ + _(sched_yield) _(sched_setparam) _(sched_getparam) _(sched_setscheduler) \ + _(sched_getscheduler) _(sched_get_priority_max) _(sched_get_priority_min) \ + _(sched_setaffinity) _(sched_getaffinity) \ + /* I/O multiplexing. */ \ + _(epoll_create1) _(epoll_ctl) _(epoll_wait) _(epoll_pwait) _(ppoll) \ + _(pselect6) _(poll) \ + /* Sleep/timer. */ \ + _(nanosleep) _(clock_nanosleep) _(timerfd_create) _(timerfd_settime) \ + _(timerfd_gettime) _(eventfd) _(eventfd2) \ + /* Filesystem info. 
*/ \ + _(statfs) _(fstatfs) _(sysinfo) +/* clang-format on */ + +struct kbox_dispatch kbox_dispatch_request( struct kbox_supervisor_ctx *ctx, - uid_t uid) + const struct kbox_syscall_request *req) { - pid_t pid = kbox_syscall_request_pid(req); - unsigned val = (unsigned) uid; - int i; + const struct kbox_host_nrs *h = ctx->host_nrs; + int nr; - for (i = 0; i < 3; i++) { - uint64_t ptr = kbox_syscall_request_arg(req, i); - if (ptr != 0) { - int wrc = guest_mem_write(ctx, pid, ptr, &val, sizeof(val)); - if (wrc < 0) - return kbox_dispatch_errno(EIO); - } - } - return kbox_dispatch_value(0); -} - -static struct kbox_dispatch forward_getresgid( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - uint64_t rgid_ptr = kbox_syscall_request_arg(req, 0); - uint64_t egid_ptr = kbox_syscall_request_arg(req, 1); - uint64_t sgid_ptr = kbox_syscall_request_arg(req, 2); - - if (rgid_ptr != 0) { - long r = kbox_lkl_getgid(ctx->sysnrs); - if (r < 0) - return kbox_dispatch_errno((int) (-r)); - unsigned val = (unsigned) r; - int wrc = guest_mem_write(ctx, pid, rgid_ptr, &val, sizeof(val)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - if (egid_ptr != 0) { - long r = kbox_lkl_getegid(ctx->sysnrs); - if (r < 0) - return kbox_dispatch_errno((int) (-r)); - unsigned val = (unsigned) r; - int wrc = guest_mem_write(ctx, pid, egid_ptr, &val, sizeof(val)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - if (sgid_ptr != 0) { - long r = kbox_lkl_getegid(ctx->sysnrs); - if (r < 0) - return kbox_dispatch_errno((int) (-r)); - unsigned val = (unsigned) r; - int wrc = guest_mem_write(ctx, pid, sgid_ptr, &val, sizeof(val)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - return kbox_dispatch_value(0); -} - -static struct kbox_dispatch forward_getresgid_override( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx, - gid_t gid) -{ - pid_t pid = kbox_syscall_request_pid(req); - 
unsigned val = (unsigned) gid; - int i; - - for (i = 0; i < 3; i++) { - uint64_t ptr = kbox_syscall_request_arg(req, i); - if (ptr != 0) { - int wrc = guest_mem_write(ctx, pid, ptr, &val, sizeof(val)); - if (wrc < 0) - return kbox_dispatch_errno(EIO); - } - } - return kbox_dispatch_value(0); -} - -static struct kbox_dispatch forward_getgroups( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long size = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - uint64_t list = kbox_syscall_request_arg(req, 1); - - if (size < 0) - return kbox_dispatch_errno(EINVAL); - - /* Probe to get actual group count. */ - long count = kbox_lkl_getgroups(ctx->sysnrs, 0, NULL); - if (count < 0) - return kbox_dispatch_errno((int) (-count)); - - if (size == 0) - return kbox_dispatch_value((int64_t) count); - - /* Caller's buffer must be large enough. */ - if (size < count) - return kbox_dispatch_errno(EINVAL); - - size_t byte_len = (size_t) count * sizeof(unsigned); - if (byte_len > KBOX_IO_CHUNK_LEN) - return kbox_dispatch_errno(ENOMEM); - unsigned *buf = (unsigned *) dispatch_scratch; - - long ret = kbox_lkl_getgroups(ctx->sysnrs, count, buf); - if (ret < 0) - return kbox_dispatch_errno((int) (-ret)); - - if (list != 0 && ret > 0) { - size_t write_len = (size_t) ret * sizeof(unsigned); - pid_t pid = kbox_syscall_request_pid(req); - int wrc = guest_mem_write(ctx, pid, list, buf, write_len); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - - return kbox_dispatch_value((int64_t) ret); -} - -static struct kbox_dispatch forward_getgroups_override( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx, - gid_t gid) -{ - long size = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - if (size < 0) - return kbox_dispatch_errno(EINVAL); - if (size == 0) - return kbox_dispatch_value(1); - - uint64_t list = kbox_syscall_request_arg(req, 1); - if (list == 0) - return kbox_dispatch_errno(EFAULT); - - pid_t pid = 
kbox_syscall_request_pid(req); - unsigned val = (unsigned) gid; - int wrc = guest_mem_write(ctx, pid, list, &val, sizeof(val)); - if (wrc < 0) - return kbox_dispatch_errno(EIO); - - return kbox_dispatch_value(1); -} - -/* Identity set forwarders. */ - -static struct kbox_dispatch forward_setuid( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long uid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - return kbox_dispatch_from_lkl(kbox_lkl_setuid(ctx->sysnrs, uid)); -} - -static struct kbox_dispatch forward_setreuid( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long ruid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long euid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - return kbox_dispatch_from_lkl(kbox_lkl_setreuid(ctx->sysnrs, ruid, euid)); -} - -static struct kbox_dispatch forward_setresuid( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long ruid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long euid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long suid = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - return kbox_dispatch_from_lkl( - kbox_lkl_setresuid(ctx->sysnrs, ruid, euid, suid)); -} - -static struct kbox_dispatch forward_setgid( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long gid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - return kbox_dispatch_from_lkl(kbox_lkl_setgid(ctx->sysnrs, gid)); -} - -static struct kbox_dispatch forward_setregid( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long rgid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long egid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - return kbox_dispatch_from_lkl(kbox_lkl_setregid(ctx->sysnrs, rgid, egid)); -} - -static struct kbox_dispatch forward_setresgid( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long rgid = 
to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long egid = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long sgid = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - return kbox_dispatch_from_lkl( - kbox_lkl_setresgid(ctx->sysnrs, rgid, egid, sgid)); -} - -static struct kbox_dispatch forward_setgroups( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long size = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - uint64_t list = kbox_syscall_request_arg(req, 1); - - if (size < 0 || size > 65536) - return kbox_dispatch_errno(EINVAL); - - if (size == 0) - return kbox_dispatch_from_lkl(kbox_lkl_setgroups(ctx->sysnrs, 0, NULL)); - - size_t byte_len = (size_t) size * sizeof(unsigned); - if (byte_len > KBOX_IO_CHUNK_LEN) - return kbox_dispatch_errno(ENOMEM); - unsigned *buf = (unsigned *) dispatch_scratch; - - pid_t pid = kbox_syscall_request_pid(req); - int rrc = guest_mem_read(ctx, pid, list, buf, byte_len); - if (rrc < 0) - return kbox_dispatch_errno(-rrc); - - long ret = kbox_lkl_setgroups(ctx->sysnrs, size, buf); - return kbox_dispatch_from_lkl(ret); -} - -static struct kbox_dispatch forward_setfsgid( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long gid = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - return kbox_dispatch_from_lkl(kbox_lkl_setfsgid(ctx->sysnrs, gid)); -} - -/* forward_socket. */ - -/* Shadow socket design: - * 1. Create an LKL socket (lives inside LKL's network stack) - * 2. Create a host socketpair (sp[0]=supervisor, sp[1]=tracee) - * 3. Inject sp[1] into the tracee via ADDFD - * 4. Register sp[0]+lkl_fd with the SLIRP event loop - * 5. The event loop pumps data between sp[0] and the LKL socket - * - * The tracee sees a real host FD, so poll/epoll/read/write all work natively - * via the host kernel. Only control-plane ops (connect, getsockopt, etc.) need - * explicit forwarding. 
- * - * INET sockets with SLIRP active get a shadow socket bridge so data flows - * through the host kernel socketpair (bypassing BKL contention in blocking LKL - * recv/send calls). Non-INET sockets and INET sockets without SLIRP use the - * standard virtual FD path. - * - * Limitation: listen/accept on shadow sockets fail because the AF_UNIX - * socketpair doesn't support inbound connections. Server sockets must be used - * without --net or via a future deferred-bridge approach. - */ -static struct kbox_dispatch forward_socket( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long domain = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long type_raw = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long protocol = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - - int base_type = (int) type_raw & 0xFF; - - long ret = kbox_lkl_socket(ctx->sysnrs, domain, type_raw, protocol); - if (ret < 0) - return kbox_dispatch_errno((int) (-ret)); - - long lkl_fd = ret; - - /* Virtual FD path when shadow bridge is not applicable: - * - SLIRP not active (no --net) - * - Non-INET domain (AF_UNIX, AF_NETLINK, etc.) - * - Non-stream/datagram type (SOCK_RAW, etc.): socketpair(AF_UNIX) - * only supports SOCK_STREAM and SOCK_DGRAM - */ - if (!kbox_net_is_active() || - (domain != 2 /* AF_INET */ && domain != 10 /* AF_INET6 */) || - (base_type != SOCK_STREAM && base_type != SOCK_DGRAM)) { - long vfd = kbox_fd_table_insert(ctx->fd_table, lkl_fd, 0); - if (vfd < 0) { - lkl_close_and_invalidate(ctx, lkl_fd); - return kbox_dispatch_errno(EMFILE); - } - return kbox_dispatch_value((int64_t) vfd); - } - - /* Shadow socket bridge for INET with SLIRP. 
*/ - int sp[2]; - if (socketpair(AF_UNIX, base_type | SOCK_CLOEXEC, 0, sp) < 0) { - lkl_close_and_invalidate(ctx, lkl_fd); - return kbox_dispatch_errno(errno); - } - fcntl(sp[0], F_SETFL, O_NONBLOCK); - if (type_raw & SOCK_NONBLOCK) - fcntl(sp[1], F_SETFL, O_NONBLOCK); - - long vfd = kbox_fd_table_insert(ctx->fd_table, lkl_fd, 0); - if (vfd < 0) { - close(sp[0]); - close(sp[1]); - lkl_close_and_invalidate(ctx, lkl_fd); - return kbox_dispatch_errno(EMFILE); - } - - if (kbox_net_register_socket((int) lkl_fd, sp[0], base_type) < 0) { - close(sp[0]); - close(sp[1]); - /* Fall back to virtual FD. */ - return kbox_dispatch_value((int64_t) vfd); - } - - uint32_t addfd_flags = 0; - if (type_raw & SOCK_CLOEXEC) - addfd_flags = O_CLOEXEC; - int host_fd = request_addfd(ctx, req, sp[1], addfd_flags); - if (host_fd < 0) { - /* Deregister closes sp[0] and marks inactive. */ - kbox_net_deregister_socket((int) lkl_fd); - close(sp[1]); - kbox_fd_table_remove(ctx->fd_table, vfd); - lkl_close_and_invalidate(ctx, lkl_fd); - return kbox_dispatch_errno(-host_fd); - } - kbox_fd_table_set_host_fd(ctx->fd_table, vfd, host_fd); - - { - struct kbox_fd_entry *e = NULL; - if (vfd >= KBOX_FD_BASE) - e = &ctx->fd_table->entries[vfd - KBOX_FD_BASE]; - else if (vfd >= 0 && vfd < KBOX_LOW_FD_MAX) - e = &ctx->fd_table->low_fds[vfd]; - if (e) { - e->shadow_sp = sp[1]; - if (type_raw & SOCK_CLOEXEC) - e->cloexec = 1; - } - } - - return kbox_dispatch_value((int64_t) host_fd); -} - -/* forward_bind / forward_connect. 
*/ - -static long resolve_lkl_socket(struct kbox_supervisor_ctx *ctx, long fd); - -static struct kbox_dispatch forward_bind(const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = resolve_lkl_socket(ctx, fd); - - if (lkl_fd < 0) - return kbox_dispatch_continue(); - - pid_t pid = kbox_syscall_request_pid(req); - uint64_t addr_ptr = kbox_syscall_request_arg(req, 1); - int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - if (len_raw < 0) - return kbox_dispatch_errno(EINVAL); - size_t len = (size_t) len_raw; - - if (addr_ptr == 0) - return kbox_dispatch_errno(EFAULT); - - if (len > 4096) - return kbox_dispatch_errno(EINVAL); - - uint8_t buf[4096]; - int rrc = guest_mem_read(ctx, pid, addr_ptr, buf, len); - if (rrc < 0) - return kbox_dispatch_errno(-rrc); - - long ret = kbox_lkl_bind(ctx->sysnrs, lkl_fd, buf, (long) len); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_connect. */ - -/* Resolve LKL FD from a tracee FD. The tracee may hold either a virtual FD - * (>= KBOX_FD_BASE) or a host FD from a shadow socket (injected via ADDFD). - * Try both paths. - */ -static long resolve_lkl_socket(struct kbox_supervisor_ctx *ctx, long fd) -{ - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); - if (lkl_fd >= 0) - return lkl_fd; - - /* Shadow socket: tracee uses the host_fd directly. 
*/ - long vfd = kbox_fd_table_find_by_host_fd(ctx->fd_table, fd); - if (vfd >= 0) - return kbox_fd_table_get_lkl(ctx->fd_table, vfd); - - return -1; -} - -static struct kbox_dispatch forward_connect( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = resolve_lkl_socket(ctx, fd); - - if (lkl_fd < 0) - return kbox_dispatch_continue(); - - pid_t pid = kbox_syscall_request_pid(req); - uint64_t addr_ptr = kbox_syscall_request_arg(req, 1); - int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - if (len_raw < 0) - return kbox_dispatch_errno(EINVAL); - size_t len = (size_t) len_raw; - - if (addr_ptr == 0) - return kbox_dispatch_errno(EFAULT); - - if (len > 4096) - return kbox_dispatch_errno(EINVAL); - - uint8_t buf[4096]; - int rrc = guest_mem_read(ctx, pid, addr_ptr, buf, len); - if (rrc < 0) - return kbox_dispatch_errno(-rrc); - - long ret = kbox_lkl_connect(ctx->sysnrs, lkl_fd, buf, (long) len); - - /* Propagate -EINPROGRESS directly for nonblocking sockets. The tracee's - * poll(POLLOUT) on the AF_UNIX socketpair returns immediately (spurious - * wakeup), but getsockopt(SO_ERROR) is forwarded to the LKL socket and - * returns the real handshake status. The tracee retries poll+getsockopt - * until SO_ERROR clears; standard nonblocking connect flow. - */ - return kbox_dispatch_from_lkl(ret); -} - -/* forward_getsockopt. 
*/

static struct kbox_dispatch forward_getsockopt(
    const struct kbox_syscall_request *req,
    struct kbox_supervisor_ctx *ctx)
{
    long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0));
    long lkl_fd = resolve_lkl_socket(ctx, fd);
    if (lkl_fd < 0)
        return kbox_dispatch_continue();

    pid_t pid = kbox_syscall_request_pid(req);
    long level = to_c_long_arg(kbox_syscall_request_arg(req, 1));
    long optname = to_c_long_arg(kbox_syscall_request_arg(req, 2));
    uint64_t optval_ptr = kbox_syscall_request_arg(req, 3);
    uint64_t optlen_ptr = kbox_syscall_request_arg(req, 4);

    if (optval_ptr == 0 || optlen_ptr == 0)
        return kbox_dispatch_errno(EFAULT);

    /* Fetch the caller-supplied optlen from tracee memory. */
    unsigned int optlen;
    int rc = guest_mem_read(ctx, pid, optlen_ptr, &optlen, sizeof(optlen));
    if (rc < 0)
        return kbox_dispatch_errno(-rc);
    if (optlen > 4096)
        return kbox_dispatch_errno(EINVAL);

    uint8_t optval[4096];
    unsigned int out_len = optlen;

    long ret = kbox_lkl_getsockopt(ctx->sysnrs, lkl_fd, level, optname,
                                   optval, &out_len);
    if (ret < 0)
        return kbox_dispatch_from_lkl(ret);

    /* Copy back at most min(out_len, optlen) bytes so no supervisor stack
     * data beyond the caller's buffer leaks; the full out_len is still
     * reported through *optlen (standard getsockopt truncation semantics).
     */
    unsigned int copy_len = (out_len < optlen) ? out_len : optlen;
    int wrc = guest_mem_write(ctx, pid, optval_ptr, optval, copy_len);
    if (wrc < 0)
        return kbox_dispatch_errno(-wrc);
    wrc = guest_mem_write(ctx, pid, optlen_ptr, &out_len, sizeof(out_len));
    if (wrc < 0)
        return kbox_dispatch_errno(-wrc);

    return kbox_dispatch_value(0);
}

/* forward_setsockopt.
*/

static struct kbox_dispatch forward_setsockopt(
    const struct kbox_syscall_request *req,
    struct kbox_supervisor_ctx *ctx)
{
    long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0));
    long lkl_fd = resolve_lkl_socket(ctx, fd);
    if (lkl_fd < 0)
        return kbox_dispatch_continue();

    pid_t pid = kbox_syscall_request_pid(req);
    long level = to_c_long_arg(kbox_syscall_request_arg(req, 1));
    long optname = to_c_long_arg(kbox_syscall_request_arg(req, 2));
    uint64_t optval_ptr = kbox_syscall_request_arg(req, 3);
    long optlen = to_c_long_arg(kbox_syscall_request_arg(req, 4));

    if (optlen < 0 || optlen > 4096)
        return kbox_dispatch_errno(EINVAL);

    /* Copy the option payload (if any) out of the tracee. A NULL optval is
     * forwarded as NULL so LKL produces the authentic error for that
     * combination rather than us guessing.
     */
    uint8_t optval[4096] = {0};
    if (optval_ptr != 0 && optlen > 0) {
        int rc = guest_mem_read(ctx, pid, optval_ptr, optval, (size_t) optlen);
        if (rc < 0)
            return kbox_dispatch_errno(-rc);
    }

    long ret = kbox_lkl_setsockopt(ctx->sysnrs, lkl_fd, level, optname,
                                   optval_ptr ? optval : NULL, optlen);
    return kbox_dispatch_from_lkl(ret);
}

/* forward_getsockname / forward_getpeername.
*/ - -typedef long (*sockaddr_query_fn)(const struct kbox_sysnrs *s, - long fd, - void *addr, - void *addrlen); - -static struct kbox_dispatch forward_sockaddr_query( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx, - sockaddr_query_fn query) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = resolve_lkl_socket(ctx, fd); - if (lkl_fd < 0) - return kbox_dispatch_continue(); - - pid_t pid = kbox_syscall_request_pid(req); - uint64_t addr_ptr = kbox_syscall_request_arg(req, 1); - uint64_t len_ptr = kbox_syscall_request_arg(req, 2); - - if (addr_ptr == 0 || len_ptr == 0) - return kbox_dispatch_errno(EFAULT); - - unsigned int addrlen; - int rrc = guest_mem_read(ctx, pid, len_ptr, &addrlen, sizeof(addrlen)); - if (rrc < 0) - return kbox_dispatch_errno(-rrc); - - if (addrlen > 4096) - addrlen = 4096; - - uint8_t addr[4096]; - unsigned int out_len = addrlen; - - long ret = query(ctx->sysnrs, lkl_fd, addr, &out_len); - if (ret < 0) - return kbox_dispatch_from_lkl(ret); - - unsigned int write_len = out_len < addrlen ? out_len : addrlen; - int wrc = guest_mem_write(ctx, pid, addr_ptr, addr, write_len); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - wrc = guest_mem_write(ctx, pid, len_ptr, &out_len, sizeof(out_len)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - - return kbox_dispatch_value(0); -} - -static struct kbox_dispatch forward_getsockname( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - return forward_sockaddr_query(req, ctx, kbox_lkl_getsockname); -} - -static struct kbox_dispatch forward_getpeername( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - return forward_sockaddr_query(req, ctx, kbox_lkl_getpeername); -} - -/* forward_shutdown. 
*/ - -static struct kbox_dispatch forward_shutdown( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = resolve_lkl_socket(ctx, fd); - if (lkl_fd < 0) - return kbox_dispatch_continue(); - - long how = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long ret = kbox_lkl_shutdown(ctx->sysnrs, lkl_fd, how); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_sendto / forward_recvfrom / forward_recvmsg. */ - -/* forward_sendto: for shadow sockets with a destination address, forward the - * data + address directly to the LKL socket. This is needed for unconnected - * UDP (DNS resolver uses sendto with sockaddr_in without prior connect). - * - * sendto(fd, buf, len, flags, dest_addr, addrlen) - * args[0]=fd, args[1]=buf, args[2]=len, args[3]=flags, - * args[4]=dest_addr, args[5]=addrlen - */ -static struct kbox_dispatch forward_sendto( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = resolve_lkl_socket(ctx, fd); - if (lkl_fd < 0) - return kbox_dispatch_continue(); - - uint64_t dest_ptr = kbox_syscall_request_arg(req, 4); - if (dest_ptr == 0) - return kbox_dispatch_continue(); /* no dest addr: stream data path */ - - /* Has a destination address: forward via LKL sendto. 
*/ - pid_t pid = kbox_syscall_request_pid(req); - uint64_t buf_ptr = kbox_syscall_request_arg(req, 1); - int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); - int64_t addrlen_raw = to_c_long_arg(kbox_syscall_request_arg(req, 5)); - - if (len_raw < 0 || addrlen_raw < 0) - return kbox_dispatch_errno(EINVAL); - size_t len = (size_t) len_raw; - size_t addrlen = (size_t) addrlen_raw; - - if (len > 65536) - len = 65536; - if (addrlen > 128) - return kbox_dispatch_errno(EINVAL); - - uint8_t buf[65536]; - uint8_t addr[128]; - - int rrc = guest_mem_read(ctx, pid, buf_ptr, buf, len); - if (rrc < 0) - return kbox_dispatch_errno(-rrc); - rrc = guest_mem_read(ctx, pid, dest_ptr, addr, addrlen); - if (rrc < 0) - return kbox_dispatch_errno(-rrc); - - long ret = kbox_lkl_sendto(ctx->sysnrs, lkl_fd, buf, (long) len, flags, - addr, (long) addrlen); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_recvfrom: for shadow sockets, receive data + source address from - * the LKL socket and write them back to the tracee. 
- * - * recvfrom(fd, buf, len, flags, src_addr, addrlen) - * args[0]=fd, args[1]=buf, args[2]=len, args[3]=flags, - * args[4]=src_addr, args[5]=addrlen - */ -static struct kbox_dispatch forward_recvfrom( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = resolve_lkl_socket(ctx, fd); - if (lkl_fd < 0) - return kbox_dispatch_continue(); - - uint64_t src_ptr = kbox_syscall_request_arg(req, 4); - if (src_ptr == 0) - return kbox_dispatch_continue(); /* no addr buffer: stream path */ - - pid_t pid = kbox_syscall_request_pid(req); - uint64_t buf_ptr = kbox_syscall_request_arg(req, 1); - int64_t len_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); - uint64_t addrlen_ptr = kbox_syscall_request_arg(req, 5); - - if (len_raw < 0) - return kbox_dispatch_errno(EINVAL); - size_t len = (size_t) len_raw; - if (len > 65536) - len = 65536; - - unsigned int addrlen = 0; - if (addrlen_ptr != 0) { - int rrc = - guest_mem_read(ctx, pid, addrlen_ptr, &addrlen, sizeof(addrlen)); - if (rrc < 0) - return kbox_dispatch_errno(-rrc); - } - if (addrlen > 128) - addrlen = 128; - - uint8_t buf[65536]; - uint8_t addr[128]; - unsigned int out_addrlen = addrlen; - - long ret = kbox_lkl_recvfrom(ctx->sysnrs, lkl_fd, buf, (long) len, flags, - addr, &out_addrlen); - if (ret < 0) - return kbox_dispatch_from_lkl(ret); - - int wrc = guest_mem_write(ctx, pid, buf_ptr, buf, (size_t) ret); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - - if (src_ptr != 0 && out_addrlen > 0) { - unsigned int write_len = out_addrlen < addrlen ? 
out_addrlen : addrlen; - wrc = guest_mem_write(ctx, pid, src_ptr, addr, write_len); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - if (addrlen_ptr != 0) { - wrc = guest_mem_write(ctx, pid, addrlen_ptr, &out_addrlen, - sizeof(out_addrlen)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - - return kbox_dispatch_value(ret); -} - -/* forward_recvmsg: intercept for shadow sockets so that msg_name (source - * address) is populated from the LKL socket, not the AF_UNIX socketpair. - * - * recvmsg(fd, msg, flags) - * args[0]=fd, args[1]=msg_ptr, args[2]=flags - */ -static struct kbox_dispatch forward_recvmsg( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = resolve_lkl_socket(ctx, fd); - if (lkl_fd < 0) - return kbox_dispatch_continue(); - - pid_t pid = kbox_syscall_request_pid(req); - uint64_t msg_ptr = kbox_syscall_request_arg(req, 1); - long flags = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - - if (msg_ptr == 0) - return kbox_dispatch_errno(EFAULT); - - struct { - uint64_t msg_name; - uint32_t msg_namelen; - uint32_t __pad0; - uint64_t msg_iov; - uint64_t msg_iovlen; - uint64_t msg_control; - uint64_t msg_controllen; - int msg_flags; - } mh; - int rrc = guest_mem_read(ctx, pid, msg_ptr, &mh, sizeof(mh)); - if (rrc < 0) - return kbox_dispatch_errno(-rrc); - - /* No msg_name: for connected stream sockets, CONTINUE via socketpair. */ - if (mh.msg_name == 0 || mh.msg_namelen == 0) - return kbox_dispatch_continue(); - - /* Read all iovecs to determine total buffer capacity. 
*/ - if (mh.msg_iovlen == 0) - return kbox_dispatch_value(0); - - size_t niov = (size_t) mh.msg_iovlen; - if (niov > 64) - niov = 64; - - struct { - uint64_t iov_base; - uint64_t iov_len; - } iovs[64]; - rrc = guest_mem_read(ctx, pid, mh.msg_iov, iovs, niov * sizeof(iovs[0])); - if (rrc < 0) - return kbox_dispatch_errno(-rrc); - - size_t total_cap = 0; - for (size_t v = 0; v < niov; v++) - total_cap += (size_t) iovs[v].iov_len; - if (total_cap > 65536) - total_cap = 65536; - - uint8_t buf[65536]; - uint8_t addr[128]; - unsigned int addrlen = mh.msg_namelen < sizeof(addr) - ? mh.msg_namelen - : (unsigned int) sizeof(addr); - unsigned int out_addrlen = addrlen; - - long ret = kbox_lkl_recvfrom(ctx->sysnrs, lkl_fd, buf, (long) total_cap, - flags, addr, &out_addrlen); - if (ret < 0) - return kbox_dispatch_from_lkl(ret); - - /* Scatter received data across tracee iov buffers. */ - size_t written = 0; - for (size_t v = 0; v < niov && written < (size_t) ret; v++) { - size_t chunk = (size_t) ret - written; - if (chunk > (size_t) iovs[v].iov_len) - chunk = (size_t) iovs[v].iov_len; - if (chunk > 0 && iovs[v].iov_base != 0) { - int wrc2 = guest_mem_write(ctx, pid, iovs[v].iov_base, - buf + written, chunk); - if (wrc2 < 0) - return kbox_dispatch_errno(-wrc2); - written += chunk; - } - } - - /* Write source address to tracee msg_name. */ - if (out_addrlen > 0) { - unsigned int write_len = - out_addrlen < mh.msg_namelen ? out_addrlen : mh.msg_namelen; - int awrc = guest_mem_write(ctx, pid, mh.msg_name, addr, write_len); - if (awrc < 0) - return kbox_dispatch_errno(-awrc); - } - - /* Update msg_namelen in the msghdr. */ - int nwrc = - guest_mem_write(ctx, pid, msg_ptr + 8 /* offset of msg_namelen */, - &out_addrlen, sizeof(out_addrlen)); - if (nwrc < 0) - return kbox_dispatch_errno(-nwrc); - - return kbox_dispatch_value(ret); -} - -/* forward_clock_gettime. 
*/ - -static struct kbox_dispatch forward_clock_gettime( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - int clockid = (int) to_c_long_arg(kbox_syscall_request_arg(req, 0)); - uint64_t remote_ts = kbox_syscall_request_arg(req, 1); - - if (remote_ts == 0) - return kbox_dispatch_errno(EFAULT); - - struct timespec ts; - if (clock_gettime(clockid, &ts) < 0) - return kbox_dispatch_errno(errno); - - int wrc = guest_mem_write(ctx, pid, remote_ts, &ts, sizeof(ts)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - - return kbox_dispatch_value(0); -} - -/* forward_clock_getres. */ - -static struct kbox_dispatch forward_clock_getres( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - int clockid = (int) to_c_long_arg(kbox_syscall_request_arg(req, 0)); - uint64_t remote_ts = kbox_syscall_request_arg(req, 1); - - struct timespec ts; - if (clock_getres(clockid, remote_ts ? &ts : NULL) < 0) - return kbox_dispatch_errno(errno); - - if (remote_ts != 0) { - int wrc = guest_mem_write(ctx, pid, remote_ts, &ts, sizeof(ts)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - - return kbox_dispatch_value(0); -} - -/* forward_gettimeofday. */ - -static struct kbox_dispatch forward_gettimeofday( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - uint64_t remote_tv = kbox_syscall_request_arg(req, 0); - uint64_t remote_tz = kbox_syscall_request_arg(req, 1); - - /* Use clock_gettime(CLOCK_REALTIME) as the underlying source, which - * works on both x86_64 and aarch64. 
- */ - if (remote_tv != 0) { - struct timespec ts; - if (clock_gettime(CLOCK_REALTIME, &ts) < 0) - return kbox_dispatch_errno(errno); - - struct { - long tv_sec; - long tv_usec; - } tv; - tv.tv_sec = ts.tv_sec; - tv.tv_usec = ts.tv_nsec / 1000; - - int wrc = guest_mem_write(ctx, pid, remote_tv, &tv, sizeof(tv)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - - if (remote_tz != 0) { - /* Return zeroed timezone (UTC). */ - struct { - int tz_minuteswest; - int tz_dsttime; - } tz = {0, 0}; - - int wrc = guest_mem_write(ctx, pid, remote_tz, &tz, sizeof(tz)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - } - - return kbox_dispatch_value(0); -} - -/* forward_readlinkat. */ - -static struct kbox_dispatch forward_readlinkat( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - long dirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 0)); - char pathbuf[KBOX_MAX_PATH]; - int rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 1), - pathbuf, sizeof(pathbuf)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - uint64_t remote_buf = kbox_syscall_request_arg(req, 2); - int64_t bufsiz_raw = to_c_long_arg(kbox_syscall_request_arg(req, 3)); - if (bufsiz_raw < 0) - return kbox_dispatch_errno(EINVAL); - size_t bufsiz = (size_t) bufsiz_raw; - - if (remote_buf == 0) - return kbox_dispatch_errno(EFAULT); - - char translated[KBOX_MAX_PATH]; - rc = kbox_translate_path_for_lkl(pid, pathbuf, ctx->host_root, translated, - sizeof(translated)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - long lkl_dirfd = resolve_open_dirfd(translated, dirfd_raw, ctx->fd_table); - if (lkl_dirfd < 0 && lkl_dirfd != AT_FDCWD_LINUX) - return kbox_dispatch_continue(); - - if (bufsiz > KBOX_MAX_PATH) - bufsiz = KBOX_MAX_PATH; - - char linkbuf[KBOX_MAX_PATH]; - long ret = kbox_lkl_readlinkat(ctx->sysnrs, lkl_dirfd, translated, linkbuf, - (long) bufsiz); - if (ret < 0) - return 
kbox_dispatch_errno((int) (-ret)); - - size_t n = (size_t) ret; - int wrc = guest_mem_write(ctx, pid, remote_buf, linkbuf, n); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - - return kbox_dispatch_value((int64_t) n); -} - -/* forward_pipe2. */ - -static struct kbox_dispatch forward_pipe2( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - uint64_t remote_pipefd = kbox_syscall_request_arg(req, 0); - long flags = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - - if (remote_pipefd == 0) - return kbox_dispatch_errno(EFAULT); - - /* Create a real host pipe and inject both ends into the tracee via - * SECCOMP_IOCTL_NOTIF_ADDFD. This makes pipes fully native: - * - * - dup2/close/read/write on pipe FDs -> CONTINUE (host kernel) - * - Proper fork semantics: both parent and child share the real - * pipe, no virtual FD table conflicts. - * - No LKL overhead for IPC data transfer. - */ - int host_pipefd[2]; - if (pipe2(host_pipefd, (int) flags) < 0) - return kbox_dispatch_errno(errno); - - uint32_t cloexec_flag = (flags & O_CLOEXEC) ? O_CLOEXEC : 0; - - int tracee_fd0 = request_addfd(ctx, req, host_pipefd[0], cloexec_flag); - if (tracee_fd0 < 0) { - close(host_pipefd[0]); - close(host_pipefd[1]); - return kbox_dispatch_errno(-tracee_fd0); - } - - int tracee_fd1 = request_addfd(ctx, req, host_pipefd[1], cloexec_flag); - if (tracee_fd1 < 0) { - close(host_pipefd[0]); - close(host_pipefd[1]); - return kbox_dispatch_errno(-tracee_fd1); - } - - /* Supervisor copies no longer needed; tracee owns its own copies. */ - close(host_pipefd[0]); - close(host_pipefd[1]); - - int guest_fds[2] = {tracee_fd0, tracee_fd1}; - int wrc = - guest_mem_write(ctx, pid, remote_pipefd, guest_fds, sizeof(guest_fds)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - - return kbox_dispatch_value(0); -} - -/* forward_uname. 
*/ - -static struct kbox_dispatch forward_uname( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - uint64_t remote_buf = kbox_syscall_request_arg(req, 0); - - if (remote_buf == 0) - return kbox_dispatch_errno(EFAULT); - - struct utsname uts; - memset(&uts, 0, sizeof(uts)); - snprintf(uts.sysname, sizeof(uts.sysname), "Linux"); - snprintf(uts.nodename, sizeof(uts.nodename), "kbox"); - snprintf(uts.release, sizeof(uts.release), "6.8.0-kbox"); - snprintf(uts.version, sizeof(uts.version), "#1 SMP"); -#if defined(__x86_64__) - snprintf(uts.machine, sizeof(uts.machine), "x86_64"); -#elif defined(__aarch64__) - snprintf(uts.machine, sizeof(uts.machine), "aarch64"); -#else - snprintf(uts.machine, sizeof(uts.machine), "unknown"); -#endif - - int wrc = guest_mem_write(ctx, pid, remote_buf, &uts, sizeof(uts)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - - return kbox_dispatch_value(0); -} - -/* forward_getrandom. */ - -static struct kbox_dispatch forward_getrandom( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - uint64_t remote_buf = kbox_syscall_request_arg(req, 0); - int64_t buflen_raw = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - - if (buflen_raw < 0) - return kbox_dispatch_errno(EINVAL); - size_t buflen = (size_t) buflen_raw; - - if (remote_buf == 0) - return kbox_dispatch_errno(EFAULT); - if (buflen == 0) - return kbox_dispatch_value(0); - - /* Read from /dev/urandom via LKL. Fall back to host if LKL does not - * have the device available. - */ - size_t max_chunk = 256; - if (buflen > max_chunk) - buflen = max_chunk; - - uint8_t scratch[256]; - long fd = kbox_lkl_openat(ctx->sysnrs, AT_FDCWD_LINUX, "/dev/urandom", - O_RDONLY, 0); - if (fd < 0) { - /* Fallback: let host kernel handle it. 
*/ - return kbox_dispatch_continue(); - } - - long ret = kbox_lkl_read(ctx->sysnrs, fd, scratch, (long) buflen); - lkl_close_and_invalidate(ctx, fd); - - if (ret < 0) - return kbox_dispatch_errno((int) (-ret)); - - size_t n = (size_t) ret; - int wrc = guest_mem_write(ctx, pid, remote_buf, scratch, n); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - - return kbox_dispatch_value((int64_t) n); -} - -/* forward_syslog (klogctl). */ - -/* syslog(type, buf, len): forward to LKL so dmesg shows the LKL kernel's - * ring buffer, not the host's. - * - * Types that read into buf (2=READ, 3=READ_ALL, 4=READ_CLEAR): call LKL - * with a scratch buffer, then copy to tracee. - * Types that just return a value (0,1,5-10): forward type+len, return the - * result directly. - */ -#define SYSLOG_ACTION_READ 2 -#define SYSLOG_ACTION_READ_ALL 3 -#define SYSLOG_ACTION_READ_CLEAR 4 -#define SYSLOG_ACTION_SIZE_BUFFER 10 - -static struct kbox_dispatch forward_syslog( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - long type = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - uint64_t remote_buf = kbox_syscall_request_arg(req, 1); - long len = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - - int needs_buf = - (type == SYSLOG_ACTION_READ || type == SYSLOG_ACTION_READ_ALL || - type == SYSLOG_ACTION_READ_CLEAR); - - if (!needs_buf) { - /* No buffer transfer: SIZE_BUFFER, CONSOLE_ON/OFF, etc. */ - long ret = lkl_syscall6(ctx->sysnrs->syslog, type, 0, len, 0, 0, 0); - return kbox_dispatch_from_lkl(ret); - } - - if (len <= 0) - return kbox_dispatch_errno(EINVAL); - if (remote_buf == 0) - return kbox_dispatch_errno(EFAULT); - - /* Static buffer; safe because the supervisor is single-threaded. - * Clamp to the actual LKL ring buffer size so READ_CLEAR never - * discards data beyond what we can copy out. The ring buffer size is - * fixed at boot, so cache it after the first query. 
Hard-cap at 1MB - * (the static buffer size) as a safety ceiling. - */ - static uint8_t scratch[1024 * 1024]; - static long cached_ring_sz; - if (!cached_ring_sz) { - long sz = lkl_syscall6(ctx->sysnrs->syslog, SYSLOG_ACTION_SIZE_BUFFER, - 0, 0, 0, 0, 0); - cached_ring_sz = (sz > 0) ? sz : -1; - } - if (cached_ring_sz > 0 && len > cached_ring_sz) - len = cached_ring_sz; - if (len > (long) sizeof(scratch)) - len = (long) sizeof(scratch); - - long ret = - lkl_syscall6(ctx->sysnrs->syslog, type, (long) scratch, len, 0, 0, 0); - if (ret < 0) - return kbox_dispatch_errno((int) (-ret)); - - size_t n = (size_t) ret; - int wrc = guest_mem_write(ctx, pid, remote_buf, scratch, n); - - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - - return kbox_dispatch_value((int64_t) n); -} - -/* forward_prctl. */ - -#ifndef PR_SET_NAME -#define PR_SET_NAME 15 -#endif -#ifndef PR_GET_NAME -#define PR_GET_NAME 16 -#endif -#ifndef PR_SET_DUMPABLE -#define PR_SET_DUMPABLE 4 -#endif -#ifndef PR_GET_DUMPABLE -#define PR_GET_DUMPABLE 3 -#endif - -static struct kbox_dispatch forward_prctl( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long option = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - - /* Block PR_SET_DUMPABLE(0): clearing dumpability makes process_vm_readv - * fail, which would bypass clone3 namespace-flag sanitization (the - * supervisor can't read clone_args.flags from a non-dumpable process). - * Return success without actually clearing; the tracee thinks it - * worked, but the supervisor retains read access. - */ - if (option == PR_SET_DUMPABLE && - to_c_long_arg(kbox_syscall_request_arg(req, 1)) == 0) - return kbox_dispatch_value(0); - /* Match: report dumpable even if guest tried to clear it. */ - if (option == PR_GET_DUMPABLE) - return kbox_dispatch_value(1); - - /* Only forward PR_SET_NAME and PR_GET_NAME to LKL. Everything else - * passes through to the host kernel. - * - * PR_SET_NAME/PR_GET_NAME use a 16-byte name buffer. 
The tracee - * passes a pointer in arg2 which is in the tracee's address space, - * not ours. We must copy through kbox_vm_read/kbox_vm_write. - */ - if (option != PR_SET_NAME && option != PR_GET_NAME) - return kbox_dispatch_continue(); - - pid_t pid = kbox_syscall_request_pid(req); - uint64_t remote_name = kbox_syscall_request_arg(req, 1); - if (remote_name == 0) - return kbox_dispatch_errno(EFAULT); - - /* PR_SET_NAME: read 16-byte name from tracee, pass local copy to LKL. */ - if (option == PR_SET_NAME) { - char name[16]; - int rrc = guest_mem_read(ctx, pid, remote_name, name, sizeof(name)); - if (rrc < 0) - return kbox_dispatch_errno(-rrc); - name[15] = '\0'; /* ensure NUL termination */ - long ret = - lkl_syscall6(ctx->sysnrs->prctl, option, (long) name, 0, 0, 0, 0); - return kbox_dispatch_from_lkl(ret); - } - - /* PR_GET_NAME: get name from LKL into local buffer, write to tracee. */ - char name[16] = {0}; - long ret = - lkl_syscall6(ctx->sysnrs->prctl, option, (long) name, 0, 0, 0, 0); - if (ret < 0) - return kbox_dispatch_from_lkl(ret); - int wrc = guest_mem_write(ctx, pid, remote_name, name, sizeof(name)); - if (wrc < 0) - return kbox_dispatch_errno(-wrc); - return kbox_dispatch_value(0); -} - -/* forward_umask. */ - -static struct kbox_dispatch forward_umask( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long mask = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long ret = kbox_lkl_umask(ctx->sysnrs, mask); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_pwrite64. 
*/

static struct kbox_dispatch forward_pwrite64(
    const struct kbox_syscall_request *req,
    struct kbox_supervisor_ctx *ctx)
{
    long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0));
    long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd);
    struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd);

    if (lkl_fd < 0)
        return kbox_dispatch_continue();
    if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW)
        return kbox_dispatch_continue();

    invalidate_stat_cache_fd(ctx, lkl_fd);

    uint64_t remote_buf = kbox_syscall_request_arg(req, 1);
    int64_t count_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2));
    if (count_raw < 0)
        return kbox_dispatch_errno(EINVAL);
    size_t count = (size_t) count_raw;
    long offset = to_c_long_arg(kbox_syscall_request_arg(req, 3));

    if (remote_buf == 0)
        return kbox_dispatch_errno(EFAULT);
    if (count == 0)
        return kbox_dispatch_value(0);

    pid_t pid = kbox_syscall_request_pid(req);
    size_t max_count = 1024 * 1024;
    if (count > max_count)
        count = max_count;

    /* Copy through the shared scratch buffer in KBOX_IO_CHUNK_LEN pieces.
     * A failure after some bytes were written reports the partial count,
     * matching write(2) short-write semantics.
     */
    size_t done = 0;
    uint8_t *scratch = dispatch_scratch;

    while (done < count) {
        size_t chunk = KBOX_IO_CHUNK_LEN;
        if (chunk > count - done)
            chunk = count - done;

        int rc = guest_mem_read(ctx, pid, remote_buf + done, scratch, chunk);
        if (rc < 0) {
            if (done > 0)
                break;
            return kbox_dispatch_errno(-rc);
        }

        long ret = kbox_lkl_pwrite64(ctx->sysnrs, lkl_fd, scratch,
                                     (long) chunk, offset + (long) done);
        if (ret < 0) {
            if (done == 0)
                return kbox_dispatch_errno((int) (-ret));
            break;
        }

        size_t n = (size_t) ret;
        done += n;
        if (n < chunk)
            break;
    }

    if (done > 0)
        invalidate_path_shadow_cache(ctx);
    return kbox_dispatch_value((int64_t) done);
}

/* forward_writev. */

/* iovec layout matches the kernel's: { void *iov_base; size_t iov_len; }
 * On 64-bit: 16 bytes per entry.
- */ -#define IOV_ENTRY_SIZE 16 -/* Match the kernel's UIO_MAXIOV. The iov_buf is static (not stack-allocated) - * because in trap/rewrite mode dispatch runs in signal handler context where - * 16 KB on the stack risks overflow on threads with small stacks. The - * dispatcher is single-threaded (documented invariant), so a static buffer - * is safe. - */ -#define IOV_MAX_COUNT 1024 -static uint8_t iov_scratch[IOV_MAX_COUNT * IOV_ENTRY_SIZE]; - -static struct kbox_dispatch forward_writev( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); - if (lkl_fd < 0) - return kbox_dispatch_continue(); - - invalidate_stat_cache_fd(ctx, lkl_fd); - - pid_t pid = kbox_syscall_request_pid(req); - uint64_t remote_iov = kbox_syscall_request_arg(req, 1); - int64_t iovcnt_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - - if (iovcnt_raw <= 0 || iovcnt_raw > IOV_MAX_COUNT) - return kbox_dispatch_errno(EINVAL); - if (remote_iov == 0) - return kbox_dispatch_errno(EFAULT); - - int iovcnt = (int) iovcnt_raw; - size_t iov_bytes = (size_t) iovcnt * IOV_ENTRY_SIZE; - - int rrc = guest_mem_read(ctx, pid, remote_iov, iov_scratch, iov_bytes); - if (rrc < 0) { - return kbox_dispatch_errno(-rrc); - } - - int mirror_host = kbox_fd_table_mirror_tty(ctx->fd_table, fd); - size_t total = 0; - uint8_t *scratch = dispatch_scratch; - - int err = 0; - int i; - for (i = 0; i < iovcnt; i++) { - uint64_t base; - uint64_t len; - memcpy(&base, &iov_scratch[i * IOV_ENTRY_SIZE], 8); - memcpy(&len, &iov_scratch[i * IOV_ENTRY_SIZE + 8], 8); - - if (base == 0 || len == 0) - continue; - - size_t seg_total = 0; - while (seg_total < len) { - size_t chunk = KBOX_IO_CHUNK_LEN; - if (chunk > len - seg_total) - chunk = len - seg_total; - - rrc = guest_mem_read(ctx, pid, base + seg_total, scratch, chunk); - if (rrc < 0) { - err = -rrc; - goto done; - } - - long ret = - 
kbox_lkl_write(ctx->sysnrs, lkl_fd, scratch, (long) chunk); - if (ret < 0) { - err = (int) (-ret); - goto done; - } - - size_t n = (size_t) ret; - if (mirror_host && n > 0) - (void) write(STDOUT_FILENO, scratch, n); - - seg_total += n; - total += n; - if (n < chunk) - goto done; - } - } - -done: - if (total > 0) - invalidate_path_shadow_cache(ctx); - if (total == 0 && err) - return kbox_dispatch_errno(err); - return kbox_dispatch_value((int64_t) total); -} - -/* forward_readv. */ - -static struct kbox_dispatch forward_readv( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); - if (lkl_fd < 0) - return kbox_dispatch_continue(); - - pid_t pid = kbox_syscall_request_pid(req); - uint64_t remote_iov = kbox_syscall_request_arg(req, 1); - int64_t iovcnt_raw = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - - if (iovcnt_raw <= 0 || iovcnt_raw > IOV_MAX_COUNT) - return kbox_dispatch_errno(EINVAL); - if (remote_iov == 0) - return kbox_dispatch_errno(EFAULT); - - int iovcnt = (int) iovcnt_raw; - size_t iov_bytes = (size_t) iovcnt * IOV_ENTRY_SIZE; - - int rrc = guest_mem_read(ctx, pid, remote_iov, iov_scratch, iov_bytes); - if (rrc < 0) { - return kbox_dispatch_errno(-rrc); - } - - size_t total = 0; - uint8_t *scratch = dispatch_scratch; - - int i; - for (i = 0; i < iovcnt; i++) { - uint64_t base; - uint64_t len; - memcpy(&base, &iov_scratch[i * IOV_ENTRY_SIZE], 8); - memcpy(&len, &iov_scratch[i * IOV_ENTRY_SIZE + 8], 8); - - if (base == 0 || len == 0) - continue; - - size_t seg_total = 0; - while (seg_total < len) { - size_t chunk = KBOX_IO_CHUNK_LEN; - if (chunk > len - seg_total) - chunk = len - seg_total; - - long ret = - kbox_lkl_read(ctx->sysnrs, lkl_fd, scratch, (long) chunk); - if (ret < 0) { - if (total == 0) { - return kbox_dispatch_errno((int) (-ret)); - } - goto done_readv; - } - - size_t n = (size_t) ret; - if (n 
== 0) - goto done_readv; - - int wrc = guest_mem_write(ctx, pid, base + seg_total, scratch, n); - if (wrc < 0) { - return kbox_dispatch_errno(-wrc); - } - - seg_total += n; - total += n; - if (n < chunk) - goto done_readv; - } - } - -done_readv: - return kbox_dispatch_value((int64_t) total); -} - -/* forward_ftruncate. */ - -static struct kbox_dispatch forward_ftruncate( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); - struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); - - if (lkl_fd < 0) - return kbox_dispatch_continue(); - if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) - return kbox_dispatch_continue(); - - long length = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long ret = kbox_lkl_ftruncate(ctx->sysnrs, lkl_fd, length); - if (ret >= 0) { - invalidate_path_shadow_cache(ctx); - invalidate_stat_cache_fd(ctx, lkl_fd); - } - return kbox_dispatch_from_lkl(ret); -} - -/* forward_fallocate. 
*/ - -static struct kbox_dispatch forward_fallocate( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); - struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); - - if (lkl_fd < 0) - return kbox_dispatch_continue(); - if (entry && entry->host_fd == KBOX_FD_HOST_SAME_FD_SHADOW) - return kbox_dispatch_continue(); - - long mode = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long offset = to_c_long_arg(kbox_syscall_request_arg(req, 2)); - long len = to_c_long_arg(kbox_syscall_request_arg(req, 3)); - long ret = kbox_lkl_fallocate(ctx->sysnrs, lkl_fd, mode, offset, len); - if (ret == -ENOSYS) - return kbox_dispatch_errno(ENOSYS); - if (ret >= 0) { - invalidate_path_shadow_cache(ctx); - invalidate_stat_cache_fd(ctx, lkl_fd); - } - return kbox_dispatch_from_lkl(ret); -} - -/* forward_flock. */ - -static struct kbox_dispatch forward_flock( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); - - if (lkl_fd < 0) - return kbox_dispatch_continue(); - - long operation = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long ret = kbox_lkl_flock(ctx->sysnrs, lkl_fd, operation); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_fsync. 
*/ - -static struct kbox_dispatch forward_fsync( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); - struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); - - if (lkl_fd < 0) - return kbox_dispatch_continue(); - if (entry && entry->shadow_writeback) { - int rc = sync_shadow_writeback(ctx, entry); - if (rc < 0) - return kbox_dispatch_errno(-rc); - return kbox_dispatch_value(0); - } - - long ret = kbox_lkl_fsync(ctx->sysnrs, lkl_fd); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_fdatasync. */ - -static struct kbox_dispatch forward_fdatasync( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); - struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); - - if (lkl_fd < 0) - return kbox_dispatch_continue(); - if (entry && entry->shadow_writeback) { - int rc = sync_shadow_writeback(ctx, entry); - if (rc < 0) - return kbox_dispatch_errno(-rc); - return kbox_dispatch_value(0); - } - - long ret = kbox_lkl_fdatasync(ctx->sysnrs, lkl_fd); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_sync. */ - -static struct kbox_dispatch forward_sync(const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - (void) req; - long ret = kbox_lkl_sync(ctx->sysnrs); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_symlinkat. 
*/ - -static struct kbox_dispatch forward_symlinkat( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - char targetbuf[KBOX_MAX_PATH]; - char linkpathbuf[KBOX_MAX_PATH]; - int rc; - - rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 0), - targetbuf, sizeof(targetbuf)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - long newdirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 1)); - - rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 2), - linkpathbuf, sizeof(linkpathbuf)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - char linktrans[KBOX_MAX_PATH]; - rc = kbox_translate_path_for_lkl(pid, linkpathbuf, ctx->host_root, - linktrans, sizeof(linktrans)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - long newdirfd = resolve_open_dirfd(linktrans, newdirfd_raw, ctx->fd_table); - if (newdirfd < 0 && newdirfd != AT_FDCWD_LINUX) - return kbox_dispatch_continue(); - - /* Target is stored as-is (not translated). */ - long ret = kbox_lkl_symlinkat(ctx->sysnrs, targetbuf, newdirfd, linktrans); - if (ret >= 0) - invalidate_path_shadow_cache(ctx); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_linkat. 
*/ - -static struct kbox_dispatch forward_linkat( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - long olddirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 0)); - char oldpathbuf[KBOX_MAX_PATH]; - int rc; - - rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 1), - oldpathbuf, sizeof(oldpathbuf)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - long newdirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 2)); - char newpathbuf[KBOX_MAX_PATH]; - - rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 3), - newpathbuf, sizeof(newpathbuf)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - long flags = to_c_long_arg(kbox_syscall_request_arg(req, 4)); - - char oldtrans[KBOX_MAX_PATH]; - rc = kbox_translate_path_for_lkl(pid, oldpathbuf, ctx->host_root, oldtrans, - sizeof(oldtrans)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - char newtrans[KBOX_MAX_PATH]; - rc = kbox_translate_path_for_lkl(pid, newpathbuf, ctx->host_root, newtrans, - sizeof(newtrans)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - long olddirfd = resolve_open_dirfd(oldtrans, olddirfd_raw, ctx->fd_table); - if (olddirfd < 0 && olddirfd != AT_FDCWD_LINUX) - return kbox_dispatch_continue(); - - long newdirfd = resolve_open_dirfd(newtrans, newdirfd_raw, ctx->fd_table); - if (newdirfd < 0 && newdirfd != AT_FDCWD_LINUX) - return kbox_dispatch_continue(); - - long ret = kbox_lkl_linkat(ctx->sysnrs, olddirfd, oldtrans, newdirfd, - newtrans, flags); - if (ret >= 0) - invalidate_path_shadow_cache(ctx); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_utimensat. */ - -/* struct timespec is 16 bytes on 64-bit: tv_sec(8) + tv_nsec(8). 
*/ -#define TIMESPEC_SIZE 16 - -static struct kbox_dispatch forward_utimensat( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - pid_t pid = kbox_syscall_request_pid(req); - long dirfd_raw = to_dirfd_arg(kbox_syscall_request_arg(req, 0)); - - /* pathname can be NULL for utimensat (operates on dirfd itself). In - * that case args[1] == 0. - */ - const char *translated_path = NULL; - char translated[KBOX_MAX_PATH]; - long lkl_dirfd; - int rc; - - if (kbox_syscall_request_arg(req, 1) != 0) { - char pathbuf[KBOX_MAX_PATH]; - rc = guest_mem_read_string(ctx, pid, kbox_syscall_request_arg(req, 1), - pathbuf, sizeof(pathbuf)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - rc = kbox_translate_path_for_lkl(pid, pathbuf, ctx->host_root, - translated, sizeof(translated)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - translated_path = translated; - lkl_dirfd = resolve_open_dirfd(translated, dirfd_raw, ctx->fd_table); - if (lkl_dirfd < 0 && lkl_dirfd != AT_FDCWD_LINUX) - return kbox_dispatch_continue(); - } else { - translated_path = NULL; - /* dirfd must be a virtual FD when path is NULL. */ - lkl_dirfd = kbox_fd_table_get_lkl(ctx->fd_table, dirfd_raw); - if (lkl_dirfd < 0) - return kbox_dispatch_continue(); - } - - /* Read the times array (2 x struct timespec) if provided. */ - uint8_t times_buf[TIMESPEC_SIZE * 2]; - const void *times = NULL; - if (kbox_syscall_request_arg(req, 2) != 0) { - rc = guest_mem_read(ctx, pid, kbox_syscall_request_arg(req, 2), - times_buf, sizeof(times_buf)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - times = times_buf; - } - - long flags = to_c_long_arg(kbox_syscall_request_arg(req, 3)); - long ret = kbox_lkl_utimensat(ctx->sysnrs, lkl_dirfd, translated_path, - times, flags); - if (ret >= 0) - invalidate_path_shadow_cache(ctx); - return kbox_dispatch_from_lkl(ret); -} - -/* forward_ioctl. */ - -/* Terminal ioctl constants. 
*/ -#ifndef TCGETS -#define TCGETS 0x5401 -#endif -#ifndef TCSETS -#define TCSETS 0x5402 -#endif -#ifndef TIOCGWINSZ -#define TIOCGWINSZ 0x5413 -#endif -#ifndef TIOCSWINSZ -#define TIOCSWINSZ 0x5414 -#endif -#ifndef TIOCGPGRP -#define TIOCGPGRP 0x540F -#endif -#ifndef TIOCSPGRP -#define TIOCSPGRP 0x5410 -#endif -#ifndef TIOCSCTTY -#define TIOCSCTTY 0x540E -#endif - -static struct kbox_dispatch forward_ioctl( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - long fd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); - long cmd = to_c_long_arg(kbox_syscall_request_arg(req, 1)); - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); - - if (lkl_fd < 0) { - /* Host FD (stdin/stdout/stderr or pipe). Most ioctls pass through - * to the host kernel. However, job-control ioctls (TIOCSPGRP/ - * TIOCGPGRP) fail with EPERM under seccomp-unotify because the - * supervised child is not the session leader. Return ENOTTY so - * shells fall back to non-job-control mode instead of aborting. - */ - if (cmd == TIOCSPGRP || cmd == TIOCGPGRP || cmd == TIOCSCTTY) - return kbox_dispatch_errno(ENOTTY); - return kbox_dispatch_continue(); - } - - (void) lkl_fd; - - /* For virtual FDs backed by LKL, terminal ioctls return ENOTTY since - * LKL file-backed FDs are not terminals. Non-terminal ioctls also - * return ENOTTY, matching regular-file semantics. - */ - return kbox_dispatch_errno(ENOTTY); -} - -/* forward_mmap. */ - -/* mmap dispatch: if the FD is a virtual FD with no host shadow, create - * the shadow on demand (lazy shadow) and inject it into the tracee at - * the same FD number, then CONTINUE so the host kernel mmaps the real fd. - * - * Lazy shadow creation avoids the memfd_create + file-copy cost at every - * open. The shadow is only materialized when the guest actually mmaps. 
- */ -static struct kbox_dispatch forward_mmap(const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - /* W^X enforcement for mmap in trap/rewrite mode. */ - if (request_uses_trap_signals(req)) { - int prot = (int) kbox_syscall_request_arg(req, 2); - if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC)) { - if (ctx->verbose) - fprintf(stderr, - "kbox: mmap denied: W^X violation " - "(prot=0x%x, pid=%u)\n", - prot, kbox_syscall_request_pid(req)); - return kbox_dispatch_errno(EACCES); - } - } - - long fd = to_dirfd_arg(kbox_syscall_request_arg(req, 4)); - - if (fd == -1) - return kbox_dispatch_continue(); - - long lkl_fd = kbox_fd_table_get_lkl(ctx->fd_table, fd); - if (lkl_fd >= 0) { - long host = kbox_fd_table_get_host_fd(ctx->fd_table, fd); - if (host == -1) { - /* Only create lazy shadows for read-only/private mappings. - * Writable MAP_SHARED mappings on LKL files cannot be - * supported via memfd (writes would go to the copy, not LKL). - */ - int mmap_flags = (int) kbox_syscall_request_arg(req, 3); - int mmap_prot = (int) kbox_syscall_request_arg(req, 2); - if ((mmap_flags & MAP_SHARED) && (mmap_prot & PROT_WRITE)) - return kbox_dispatch_errno(ENODEV); - - int memfd = kbox_shadow_create(ctx->sysnrs, lkl_fd); - if (memfd < 0) - return kbox_dispatch_errno(ENODEV); - kbox_shadow_seal(memfd); - int injected = request_addfd_at(ctx, req, memfd, (int) fd, 0); - if (injected < 0) { - close(memfd); - return kbox_dispatch_errno(ENODEV); - } - /* Mark that a shadow was injected so repeated mmaps don't - * re-create it. Use -2 as a sentinel: host_fd >= 0 means - * "supervisor-owned shadow fd" (closed on remove). host_fd - * == -2 means "tracee-owned shadow, don't close in supervisor." - * fd_table_remove only closes host_fd when host_fd >= 0 AND - * shadow_sp < 0, so -2 is safe. 
- */ - kbox_fd_table_set_host_fd(ctx->fd_table, fd, - KBOX_FD_HOST_SAME_FD_SHADOW); - { - struct kbox_fd_entry *entry = fd_table_entry(ctx->fd_table, fd); - if (entry) - entry->shadow_sp = memfd; - } - } - } - - return kbox_dispatch_continue(); -} - -/* Identity dispatch helpers */ -/* */ -/* In host+root_identity mode, get* returns 0 and set* returns 0. */ -/* In host+override mode, get* returns the override value. */ -/* In host+neither mode, CONTINUE to host kernel. */ -/* In image mode, forward to LKL. */ - -static struct kbox_dispatch dispatch_get_uid( - long (*lkl_func)(const struct kbox_sysnrs *), - struct kbox_supervisor_ctx *ctx) -{ - if (ctx->host_root) { - if (ctx->root_identity) - return kbox_dispatch_value(0); - if (ctx->override_uid != (uid_t) -1) - return kbox_dispatch_value((int64_t) ctx->override_uid); - return kbox_dispatch_continue(); - } - return kbox_dispatch_from_lkl(lkl_func(ctx->sysnrs)); -} - -static struct kbox_dispatch dispatch_get_gid( - long (*lkl_func)(const struct kbox_sysnrs *), - struct kbox_supervisor_ctx *ctx) -{ - if (ctx->host_root) { - if (ctx->root_identity) - return kbox_dispatch_value(0); - if (ctx->override_gid != (gid_t) -1) - return kbox_dispatch_value((int64_t) ctx->override_gid); - return kbox_dispatch_continue(); - } - return kbox_dispatch_from_lkl(lkl_func(ctx->sysnrs)); -} - -static struct kbox_dispatch dispatch_set_id( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx, - struct kbox_dispatch (*lkl_forward)(const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx)) -{ - if (ctx->host_root) { - if (ctx->root_identity) - return kbox_dispatch_value(0); - return kbox_dispatch_continue(); - } - return lkl_forward(req, ctx); -} - -/* forward_execve. */ - -/* AT_EMPTY_PATH flag for execveat: indicates fexecve() usage. Defined - * here to avoid pulling in the full linux/fcntl.h. - */ -#define KBOX_AT_EMPTY_PATH 0x1000 - -/* Load biases for the userspace ELF loader. 
Must match image.c - * prepare_userspace_launch. The loader places main and interpreter - * ELFs at these fixed virtual addresses, and the stack just below - * stack_top. - */ -#define KBOX_EXEC_MAIN_LOAD_BIAS 0x600000000000ULL -#define KBOX_EXEC_INTERP_LOAD_BIAS 0x610000000000ULL -#define KBOX_EXEC_STACK_TOP 0x700000010000ULL - -/* Alternate stack region for userspace re-exec. During re-exec the - * SIGSYS handler is running on the old guest stack, so we cannot - * unmap it until after transferring to the new binary. Place the - * new stack at a different address; the old stack region is reclaimed - * by the subsequent munmap in teardown_old_guest_mappings during the - * NEXT re-exec. - */ -#define KBOX_EXEC_REEXEC_STACK_TOP 0x6F0000010000ULL - -/* Maximum entries in argv or envp for userspace exec. */ -#define KBOX_EXEC_MAX_ARGS 4096 - -/* Track which stack region is in use by the current guest. The - * initial launch uses KBOX_EXEC_STACK_TOP; re-exec alternates - * between the two addresses. The signal handler runs on the - * current guest's stack, so we must not unmap it during re-exec. - */ -static uint64_t reexec_current_stack_top; - -/* Safely count a null-terminated pointer array in guest address space. - * Uses process_vm_readv to avoid SIGSEGV on bad guest pointers. - * Returns the count (not including the final NULL), or -EFAULT on bad memory. - */ -static long count_user_ptrs_safe(uint64_t arr_addr, size_t max_count) -{ - size_t n = 0; - uint64_t ptr; - - if (arr_addr == 0) - return -EFAULT; - - while (n < max_count) { - uint64_t offset, probe_addr; - int rc; - if (__builtin_mul_overflow((uint64_t) n, sizeof(uint64_t), &offset) || - __builtin_add_overflow(arr_addr, offset, &probe_addr)) - return -EFAULT; - rc = kbox_current_read(probe_addr, &ptr, sizeof(ptr)); - if (rc < 0) - return -EFAULT; - if (ptr == 0) - return (long) n; - n++; - } - - return -E2BIG; -} - -/* Safely measure the length of a guest string. 
- * Returns the length (not including NUL), or -EFAULT on bad memory. - */ -static long strlen_user_safe(uint64_t str_addr) -{ - char buf[256]; - size_t total = 0; - - if (str_addr == 0) - return -EFAULT; - - for (;;) { - int rc = kbox_current_read(str_addr + total, buf, sizeof(buf)); - if (rc < 0) - return -EFAULT; - for (size_t i = 0; i < sizeof(buf); i++) { - if (buf[i] == '\0') - return (long) (total + i); - } - total += sizeof(buf); - if (total > (size_t) (256 * 1024)) - return -ENAMETOOLONG; - } -} - -/* Safely read a single guest pointer (8 bytes). */ -static int read_user_ptr(uint64_t addr, uint64_t *out) -{ - return kbox_current_read(addr, out, sizeof(*out)); -} - -/* Safely copy a guest string into a destination buffer. - * Returns the string length (not including NUL), or -EFAULT. - */ -static long copy_user_string(uint64_t str_addr, char *dst, size_t dst_size) -{ - return kbox_current_read_string(str_addr, dst, dst_size); -} - -/* Tear down old guest code/data mappings and the stale stack at the - * new stack address. The current guest stack (which the SIGSYS - * handler is running on) is at the OTHER address and left alone. - * It leaks one stack-sized region until the next re-exec cycle. - */ -static void teardown_old_guest_mappings(uint64_t new_stack_top) -{ - /* Main binary region: up to 256 MB from the load bias. */ - munmap((void *) (uintptr_t) KBOX_EXEC_MAIN_LOAD_BIAS, 256UL * 1024 * 1024); - /* Interpreter region: up to 256 MB from the load bias. */ - munmap((void *) (uintptr_t) KBOX_EXEC_INTERP_LOAD_BIAS, - 256UL * 1024 * 1024); - /* Unmap any stale stack at the new stack address. On the first - * re-exec (new = REEXEC), this is a no-op (nothing mapped there). - * On the second re-exec (new = STACK_TOP), this unmaps the - * initial launch stack. Subsequent cycles alternate and reclaim. - */ - munmap((void *) (uintptr_t) (new_stack_top - 16UL * 1024 * 1024), - 16UL * 1024 * 1024 + 0x10000UL); -} - -/* Perform userspace exec for trap mode. 
Called from inside the SIGSYS - * handler when the guest calls execve/execveat. This replaces the - * current process image without a real exec syscall, preserving the - * SIGSYS handler and seccomp filter chain. - * - * The function is noreturn on success: it transfers control to the new - * binary's entry point. On failure, it returns a dispatch with errno. - */ -static struct kbox_dispatch trap_userspace_exec( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx, - int exec_memfd, - const char *pathname, - int is_execveat) -{ - unsigned char *elf_buf = NULL; - size_t elf_buf_len = 0; - char interp_path[256]; - int interp_memfd = -1; - int ilen = 0; - struct kbox_loader_launch_spec spec; - struct kbox_loader_launch launch = {0}; - struct kbox_syscall_trap_ip_range ranges[KBOX_LOADER_MAX_MAPPINGS]; - struct kbox_loader_exec_range exec_ranges[KBOX_LOADER_MAX_MAPPINGS]; - size_t exec_count = 0; - size_t range_count = 0; - unsigned char random_bytes[KBOX_LOADER_RANDOM_SIZE]; - - /* execve(path, argv, envp): argv=args[1], envp=args[2] - * execveat(dirfd, path, argv, envp, flags): argv=args[2], envp=args[3] - * - * In trap mode these are guest pointers in our address space, but still - * guest-controlled. All accesses must use safe reads (process_vm_readv) - * to return EFAULT on bad pointers instead of crashing the SIGSYS handler. - */ - uint64_t argv_addr = kbox_syscall_request_arg(req, is_execveat ? 2 : 1); - uint64_t envp_addr = kbox_syscall_request_arg(req, is_execveat ? 3 : 2); - long argc_long = count_user_ptrs_safe(argv_addr, KBOX_EXEC_MAX_ARGS); - long envc_long = count_user_ptrs_safe(envp_addr, KBOX_EXEC_MAX_ARGS); - size_t argc, envc; - - if (argc_long < 0) { - close(exec_memfd); - return kbox_dispatch_errno(argc_long == -E2BIG ? EINVAL : EFAULT); - } - if (envc_long < 0) { - close(exec_memfd); - return kbox_dispatch_errno(envc_long == -E2BIG ? 
EINVAL : EFAULT); - } - argc = (size_t) argc_long; - envc = (size_t) envc_long; - if (argc == 0) { - close(exec_memfd); - return kbox_dispatch_errno(EINVAL); - } - - /* Deep-copy argv and envp into a single mmap'd arena. Using mmap - * instead of malloc/strdup because we are inside the SIGSYS handler - * and glibc's allocator is not async-signal-safe. - * - * Two passes: first measure total size (via safe string length reads), - * then copy. All guest pointer reads use process_vm_readv. - */ - size_t arena_size = (argc + envc) * sizeof(char *); - for (size_t i = 0; i < argc; i++) { - uint64_t str_addr; - long slen; - if (read_user_ptr(argv_addr + i * sizeof(uint64_t), &str_addr) < 0) { - close(exec_memfd); - return kbox_dispatch_errno(EFAULT); - } - slen = strlen_user_safe(str_addr); - if (slen < 0) { - close(exec_memfd); - return kbox_dispatch_errno(EFAULT); - } - arena_size += (size_t) slen + 1; - } - for (size_t i = 0; i < envc; i++) { - uint64_t str_addr; - long slen; - if (read_user_ptr(envp_addr + i * sizeof(uint64_t), &str_addr) < 0) { - close(exec_memfd); - return kbox_dispatch_errno(EFAULT); - } - slen = strlen_user_safe(str_addr); - if (slen < 0) { - close(exec_memfd); - return kbox_dispatch_errno(EFAULT); - } - arena_size += (size_t) slen + 1; - } - arena_size = (arena_size + 4095) & ~(size_t) 4095; - - char *arena = mmap(NULL, arena_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (arena == MAP_FAILED) { - close(exec_memfd); - return kbox_dispatch_errno(ENOMEM); - } - size_t arena_used = 0; - char **argv_copy = (char **) (arena + arena_used); - arena_used += argc * sizeof(char *); - char **envp_copy = (char **) (arena + arena_used); - arena_used += envc * sizeof(char *); - for (size_t i = 0; i < argc; i++) { - uint64_t str_addr; - long slen; - if (read_user_ptr(argv_addr + i * sizeof(uint64_t), &str_addr) < 0) - goto fail_arena; - slen = copy_user_string(str_addr, arena + arena_used, - arena_size - arena_used); - if (slen < 0) - 
goto fail_arena; - argv_copy[i] = arena + arena_used; - arena_used += (size_t) slen + 1; - } - for (size_t i = 0; i < envc; i++) { - uint64_t str_addr; - long slen; - if (read_user_ptr(envp_addr + i * sizeof(uint64_t), &str_addr) < 0) - goto fail_arena; - slen = copy_user_string(str_addr, arena + arena_used, - arena_size - arena_used); - if (slen < 0) - goto fail_arena; - envp_copy[i] = arena + arena_used; - arena_used += (size_t) slen + 1; - } - - /* Check for PT_INTERP (dynamic binary needing an interpreter). */ - if (kbox_read_elf_header_window_fd(exec_memfd, &elf_buf, &elf_buf_len) == - 0) { - uint64_t pt_offset, pt_filesz; - - ilen = kbox_find_elf_interp_loc(elf_buf, elf_buf_len, interp_path, - sizeof(interp_path), &pt_offset, - &pt_filesz); - munmap(elf_buf, elf_buf_len); - elf_buf = NULL; - - if (ilen < 0) { - ilen = -ENOEXEC; - goto fail_early; - } - - if (ilen > 0) { - long interp_lkl = kbox_lkl_openat(ctx->sysnrs, AT_FDCWD_LINUX, - interp_path, O_RDONLY, 0); - if (interp_lkl < 0) { - if (ctx->verbose) - fprintf(stderr, - "kbox: trap exec %s: cannot open " - "interpreter %s: %s\n", - pathname, interp_path, kbox_err_text(interp_lkl)); - ilen = (int) interp_lkl; - goto fail_early; - } - - interp_memfd = kbox_shadow_create(ctx->sysnrs, interp_lkl); - lkl_close_and_invalidate(ctx, interp_lkl); - - if (interp_memfd < 0) { - ilen = interp_memfd; - goto fail_early; - } - } - } - /* else: kbox_read_elf_header_window_fd failed, elf_buf is still NULL. - * Nothing to unmap. Treat as static binary (no interpreter). - */ - - /* Generate random bytes for AT_RANDOM auxv entry. Use the raw - * syscall to avoid depending on sys/random.h availability. 
- */ - memset(random_bytes, 0x42, sizeof(random_bytes)); -#ifdef __NR_getrandom - { - long gr = - syscall(__NR_getrandom, random_bytes, sizeof(random_bytes), 0); - (void) gr; - } -#endif - - /* Pick a stack address that does not collide with the old guest - * stack (which we are currently running on from inside the SIGSYS - * handler). Alternate between two stack tops so the old one - * survives until the next re-exec reclaims it. - */ - uint64_t new_stack_top = - (reexec_current_stack_top == KBOX_EXEC_REEXEC_STACK_TOP) - ? KBOX_EXEC_STACK_TOP - : KBOX_EXEC_REEXEC_STACK_TOP; - - /* Build the loader launch spec. Use the same load biases as the - * initial launch so the address space layout is consistent. - */ - memset(&spec, 0, sizeof(spec)); - spec.exec_fd = exec_memfd; - spec.interp_fd = interp_memfd; - spec.argv = (const char *const *) argv_copy; - spec.argc = argc; - spec.envp = (const char *const *) envp_copy; - spec.envc = envc; - spec.execfn = pathname; - spec.random_bytes = random_bytes; - spec.page_size = (uint64_t) sysconf(_SC_PAGESIZE); - spec.stack_top = new_stack_top; - spec.main_load_bias = KBOX_EXEC_MAIN_LOAD_BIAS; - spec.interp_load_bias = KBOX_EXEC_INTERP_LOAD_BIAS; - spec.uid = ctx->root_identity ? 0 : (uint32_t) getuid(); - spec.euid = ctx->root_identity ? 0 : (uint32_t) getuid(); - spec.gid = ctx->root_identity ? 0 : (uint32_t) getgid(); - spec.egid = ctx->root_identity ? 0 : (uint32_t) getgid(); - spec.secure = 0; - - /* Tear down old guest code/data mappings BEFORE materializing new - * ones (MAP_FIXED_NOREPLACE requires the addresses to be free). - * But do NOT teardown before reading the memfds; the reads use - * pread which doesn't depend on the old mappings. 
- */ - teardown_old_guest_mappings(new_stack_top); - - { - int launch_rc = kbox_loader_prepare_launch(&spec, &launch); - if (launch_rc < 0) { - const char msg[] = "kbox: trap exec: loader prepare failed\n"; - (void) write(STDERR_FILENO, msg, sizeof(msg) - 1); - _exit(127); - } - } - - /* The memfds have been read into launch buffers; close them. */ - close(exec_memfd); - if (interp_memfd >= 0) - close(interp_memfd); - - /* Collect executable ranges from the new layout for the BPF - * filter. The new filter is appended to the filter chain; the - * old filter is harmless (matches unmapped addresses). - */ - if (kbox_loader_collect_exec_ranges( - &launch, exec_ranges, KBOX_LOADER_MAX_MAPPINGS, &exec_count) < 0) { - if (ctx->verbose) - fprintf(stderr, "kbox: trap exec %s: cannot collect exec ranges\n", - pathname); - kbox_loader_launch_reset(&launch); - _exit(127); - } - for (size_t i = 0; i < exec_count; i++) { - ranges[i].start = (uintptr_t) exec_ranges[i].start; - ranges[i].end = (uintptr_t) exec_ranges[i].end; - } - range_count = exec_count; - - /* Install a new BPF RET_TRAP filter covering the new binary's - * executable ranges. seccomp filters form a chain; calling - * seccomp(SET_MODE_FILTER) adds to it rather than replacing. - */ - if (kbox_install_seccomp_trap_ranges(ctx->host_nrs, ranges, range_count) < - 0) { - if (ctx->verbose) - fprintf(stderr, - "kbox: trap exec %s: cannot install new BPF filter\n", - pathname); - kbox_loader_launch_reset(&launch); - _exit(127); - } - - /* Clean up CLOEXEC entries from the FD table, matching what a - * real exec would do. - */ - kbox_fd_table_close_cloexec(ctx->fd_table, ctx->sysnrs); - - /* If the original launch used rewrite mode, re-apply binary rewriting - * to the new binary. This patches syscall instructions in the newly - * loaded executable segments and sets up trampoline regions, promoting - * the new binary from Tier 1 (SIGSYS ~3us) to Tier 2 (~41ns) for - * rewritten sites. 
- * - * If rewrite installation fails (e.g., trampoline allocation), the - * binary still works correctly via the SIGSYS handler (Tier 1). - */ - if (req->source == KBOX_SYSCALL_SOURCE_REWRITE) { - /* Static: the runtime is stored globally via - * store_active_rewrite_runtime and must survive past the noreturn - * transfer_to_guest. Single-threaded trap mode guarantees no concurrent - * re-exec. - */ - static struct kbox_rewrite_runtime rewrite_rt; - kbox_rewrite_runtime_reset(&rewrite_rt); - if (kbox_rewrite_runtime_install(&rewrite_rt, ctx, &launch) == 0) { - if (ctx->verbose) - fprintf(stderr, - "kbox: trap exec %s: rewrite installed " - "(%zu trampoline regions)\n", - pathname, rewrite_rt.trampoline_region_count); - } else { - if (ctx->verbose) - fprintf(stderr, - "kbox: trap exec %s: rewrite failed, " - "falling back to SIGSYS\n", - pathname); - } - } - -#if defined(__x86_64__) - /* Reset the guest FS base to the host (kbox) FS base. We are - * inside the SIGSYS handler where FS already points to kbox's - * TLS. The new binary starts with no TLS set up; it will call - * arch_prctl(ARCH_SET_FS) during libc init to establish its own. - * Until then, SIGSYS handler entry should see FS == host FS and - * the save/restore becomes a no-op, which is correct. - */ - { - uint64_t host_fs = 0; - - kbox_syscall_trap_host_arch_prctl_get_fs(&host_fs); - kbox_syscall_trap_set_guest_fs(host_fs); - } -#endif - - if (ctx->verbose) - fprintf(stderr, - "kbox: trap exec %s: transferring to new image " - "pc=0x%llx sp=0x%llx\n", - pathname, (unsigned long long) launch.transfer.pc, - (unsigned long long) launch.transfer.sp); - - /* Record which stack the new guest is using. The next re-exec - * will pick the other address and reclaim this one. - */ - reexec_current_stack_top = new_stack_top; - - /* Free staging buffers before transferring. The image regions - * (mmap'd guest code/data/stack) must survive. 
- */ - munmap(arena, arena_size); - if (launch.main_elf && launch.main_elf_len > 0) - munmap(launch.main_elf, launch.main_elf_len); - launch.main_elf = NULL; - if (launch.interp_elf && launch.interp_elf_len > 0) - munmap(launch.interp_elf, launch.interp_elf_len); - launch.interp_elf = NULL; - kbox_loader_stack_image_reset(&launch.layout.stack); - - /* Unblock SIGSYS before transferring. We are inside the SIGSYS - * handler, which runs with SIGSYS blocked (SA_SIGINFO default). - * Since we jump to the new entry point instead of returning from - * the handler, the kernel never restores the pre-handler signal - * mask. The new binary needs SIGSYS unblocked so the BPF RET_TRAP - * filter can deliver it. - */ - { - uint64_t mask[2] = {0, 0}; - unsigned int signo = SIGSYS - 1; - mask[signo / 64] = 1ULL << (signo % 64); - kbox_syscall_trap_host_rt_sigprocmask_unblock(mask, - 8 /* kernel sigset_t */); - } - - /* Transfer control to the new binary. This is noreturn. */ - kbox_loader_transfer_to_guest(&launch.transfer); - -fail_arena: - munmap(arena, arena_size); - close(exec_memfd); - return kbox_dispatch_errno(EFAULT); - -fail_early: - munmap(arena, arena_size); - close(exec_memfd); - if (interp_memfd >= 0) - close(interp_memfd); - return kbox_dispatch_errno((int) (-ilen)); -} - -/* Handle execve/execveat from inside the image. - * - * For fexecve (execveat with AT_EMPTY_PATH on a host memfd): CONTINUE, - * the host kernel handles it directly. This is the initial exec path - * from image.c. - * - * For in-image exec (e.g. shell runs /bin/ls): - * 1. Read the pathname from tracee memory - * 2. Open the binary from LKL, create a memfd - * 3. Check for PT_INTERP; if dynamic, extract interpreter into a second - * memfd and patch PT_INTERP to /proc/self/fd/N - * 4. Inject memfds into the tracee via ADDFD - * 5. Overwrite the pathname in tracee memory with /proc/self/fd/N - * 6. 
CONTINUE: kernel re-reads the rewritten path and execs - * - * The seccomp-unotify guarantees the tracee is blocked during steps 1-5, - * and the kernel has not yet copied the pathname (getname happens after - * the seccomp check), so the overwrite is race-free. - */ -static struct kbox_dispatch forward_execve( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx, - int is_execveat) -{ - pid_t pid = kbox_syscall_request_pid(req); - - /* Detect fexecve: execveat(fd, "", argv, envp, AT_EMPTY_PATH). This - * is the initial exec from image.c on the host memfd. Let the kernel - * handle it directly. - */ - if (is_execveat) { - long flags = to_c_long_arg(kbox_syscall_request_arg(req, 4)); - if (flags & KBOX_AT_EMPTY_PATH) - return kbox_dispatch_continue(); - } - - /* Read pathname from tracee memory. */ - uint64_t path_addr = is_execveat ? kbox_syscall_request_arg(req, 1) - : kbox_syscall_request_arg(req, 0); - char pathbuf[KBOX_MAX_PATH]; - int rc = - guest_mem_read_string(ctx, pid, path_addr, pathbuf, sizeof(pathbuf)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - /* Translate path for LKL. */ - char translated[KBOX_MAX_PATH]; - rc = kbox_translate_path_for_lkl(pid, pathbuf, ctx->host_root, translated, - sizeof(translated)); - if (rc < 0) - return kbox_dispatch_errno(-rc); - - /* Virtual paths (/proc, /sys, /dev): let the host handle them. */ - if (kbox_is_lkl_virtual_path(translated)) - return kbox_dispatch_continue(); - - /* Open the binary from LKL. */ - long lkl_fd = - kbox_lkl_openat(ctx->sysnrs, AT_FDCWD_LINUX, translated, O_RDONLY, 0); - if (lkl_fd < 0) - return kbox_dispatch_errno((int) (-lkl_fd)); - - /* Create a memfd with the binary contents. */ - int exec_memfd = kbox_shadow_create(ctx->sysnrs, lkl_fd); - lkl_close_and_invalidate(ctx, lkl_fd); - - if (exec_memfd < 0) - return kbox_dispatch_errno(-exec_memfd); - - /* Trap mode: the SIGSYS handler and BPF filter do not survive a - * real exec, so perform a userspace exec instead. 
This replaces - * the process image in-place (unmap old, map new, jump to entry) - * without invoking the kernel's execve. On success the function - * does not return. - */ - if (request_uses_trap_signals(req)) - return trap_userspace_exec(req, ctx, exec_memfd, pathbuf, is_execveat); - - /* Check for PT_INTERP (dynamic binary). */ - { - unsigned char *elf_buf = NULL; - size_t elf_buf_len = 0; - - if (kbox_read_elf_header_window_fd(exec_memfd, &elf_buf, - &elf_buf_len) == 0) { - char interp_path[256]; - uint64_t pt_offset, pt_filesz; - int ilen = kbox_find_elf_interp_loc( - elf_buf, elf_buf_len, interp_path, sizeof(interp_path), - &pt_offset, &pt_filesz); - - munmap(elf_buf, elf_buf_len); - - if (ilen < 0) { - close(exec_memfd); - return kbox_dispatch_errno(ENOEXEC); - } - - if (ilen > 0) { - /* Dynamic binary. Extract the interpreter from LKL and - * inject it into the tracee. - */ - long interp_lkl = kbox_lkl_openat(ctx->sysnrs, AT_FDCWD_LINUX, - interp_path, O_RDONLY, 0); - if (interp_lkl < 0) { - if (ctx->verbose) - fprintf(stderr, - "kbox: exec %s: cannot open " - "interpreter %s: %s\n", - pathbuf, interp_path, - kbox_err_text(interp_lkl)); - close(exec_memfd); - return kbox_dispatch_errno((int) (-interp_lkl)); - } - - int interp_memfd = kbox_shadow_create(ctx->sysnrs, interp_lkl); - lkl_close_and_invalidate(ctx, interp_lkl); - - if (interp_memfd < 0) { - close(exec_memfd); - return kbox_dispatch_errno(-interp_memfd); - } - - /* Inject the interpreter memfd first so we know its FD - * number in the tracee for the PT_INTERP patch. O_CLOEXEC - * is safe: the kernel resolves /proc/self/fd/N via - * open_exec() before begin_new_exec() closes CLOEXEC - * descriptors. 
- */ - int tracee_interp_fd = - request_addfd(ctx, req, interp_memfd, O_CLOEXEC); - close(interp_memfd); - - if (tracee_interp_fd < 0) { - close(exec_memfd); - return kbox_dispatch_errno(-tracee_interp_fd); - } - - /* Patch PT_INTERP in the exec memfd to point at the - * injected interpreter: /proc/self/fd/. - */ - char new_interp[64]; - int new_len = snprintf(new_interp, sizeof(new_interp), - "/proc/self/fd/%d", tracee_interp_fd); - - if ((uint64_t) (new_len + 1) > pt_filesz) { - close(exec_memfd); - return kbox_dispatch_errno(ENOMEM); - } - - char patch[256]; - size_t patch_len = (size_t) pt_filesz; - if (patch_len > sizeof(patch)) - patch_len = sizeof(patch); - memset(patch, 0, patch_len); - memcpy(patch, new_interp, (size_t) new_len); - - if (pwrite(exec_memfd, patch, patch_len, (off_t) pt_offset) != - (ssize_t) patch_len) { - close(exec_memfd); - return kbox_dispatch_errno(EIO); - } - - if (ctx->verbose) - fprintf(stderr, - "kbox: exec %s: interpreter %s " - "-> /proc/self/fd/%d\n", - pathbuf, interp_path, tracee_interp_fd); - } - } else { - munmap(elf_buf, elf_buf_len); - } - } - - /* Inject the exec memfd into the tracee. O_CLOEXEC keeps the tracee's - * FD table clean after exec succeeds. - */ - int tracee_exec_fd = request_addfd(ctx, req, exec_memfd, O_CLOEXEC); - close(exec_memfd); - - if (tracee_exec_fd < 0) - return kbox_dispatch_errno(-tracee_exec_fd); - - /* Overwrite the pathname in the tracee's memory with /proc/self/fd/. - * The kernel has not yet copied the pathname (getname happens after - * the seccomp check), so when we CONTINUE, it reads our rewritten - * path. - * - * argv[0] aliasing: some shells pass the same pointer for pathname - * and argv[0]. If we overwrite the pathname, we corrupt argv[0]. - * Detect this and fix it by writing the original path right after - * the new path in the same buffer, then updating the argv[0] pointer - * in the argv array. - * - * Try process_vm_writev first (fast path). If that fails (e.g. 
- * pathname is in .rodata), fall back to /proc/pid/mem which can - * write through page protections. - */ - char new_path[64]; - int new_path_len = snprintf(new_path, sizeof(new_path), "/proc/self/fd/%d", - tracee_exec_fd); - - /* Check if argv[0] is aliased with the pathname. argv pointer is args[1] - * for execve, args[2] for execveat. - */ - uint64_t argv_addr = is_execveat ? kbox_syscall_request_arg(req, 2) - : kbox_syscall_request_arg(req, 1); - uint64_t argv0_ptr = 0; - int argv0_aliased = 0; - - if (argv_addr != 0) { - rc = guest_mem_read(ctx, pid, argv_addr, &argv0_ptr, sizeof(argv0_ptr)); - if (rc == 0 && argv0_ptr == path_addr) - argv0_aliased = 1; - } - - /* Build the write buffer: new_path + NUL + original_path + NUL. Original - * path goes right after the new path so we can point argv[0] at it. - */ - size_t orig_len = strlen(pathbuf); - size_t total_write = (size_t) (new_path_len + 1); - - if (argv0_aliased) - total_write += orig_len + 1; - - char write_buf[KBOX_MAX_PATH + 64]; - if (total_write > sizeof(write_buf)) - return kbox_dispatch_errno(ENAMETOOLONG); - - memcpy(write_buf, new_path, (size_t) (new_path_len + 1)); - if (argv0_aliased) - memcpy(write_buf + new_path_len + 1, pathbuf, orig_len + 1); - - rc = guest_mem_write(ctx, pid, path_addr, write_buf, total_write); - if (rc < 0) { - rc = guest_mem_write_force(ctx, pid, path_addr, write_buf, total_write); - if (rc < 0) { - if (ctx->verbose) - fprintf(stderr, - "kbox: exec %s: cannot rewrite " - "pathname: %s\n", - pathbuf, strerror(-rc)); - return kbox_dispatch_errno(ENOEXEC); - } - } - - /* If argv[0] was aliased, update the argv[0] pointer to point at original - * path copy (right after the new path). 
- */ - if (argv0_aliased) { - uint64_t new_argv0 = path_addr + (uint64_t) (new_path_len + 1); - rc = - guest_mem_write(ctx, pid, argv_addr, &new_argv0, sizeof(new_argv0)); - if (rc < 0) - guest_mem_write_force(ctx, pid, argv_addr, &new_argv0, - sizeof(new_argv0)); - } - - if (ctx->verbose) - fprintf(stderr, "kbox: exec %s -> /proc/self/fd/%d\n", pathbuf, - tracee_exec_fd); - - /* Clean up CLOEXEC entries from the FD table, matching what a - * successful exec will do in the kernel. - * - * This is still conservative: if exec later fails, the tracee resumes - * after we have already purged those mappings. That rollback problem is - * preferable to keeping stale mappings alive across a successful exec, - * which misroutes future FD operations in the new image. - */ - kbox_fd_table_close_cloexec(ctx->fd_table, ctx->sysnrs); - - /* Invalidate the cached /proc/pid/mem FD. After exec, the kernel - * may revoke access to the old FD even though the PID is the same - * (credential check against the new binary). Forcing a reopen on - * the next write ensures we have valid access. - */ - if (ctx->proc_mem_fd >= 0) { - close(ctx->proc_mem_fd); - ctx->proc_mem_fd = -1; - } - - return kbox_dispatch_continue(); -} - -/* clone3 namespace-flag sanitization. */ - -/* CLONE_NEW* flags that clone3 can smuggle in via clone_args.flags. The BPF - * deny-list blocks unshare/setns, but clone3 bypasses it unless we check here. 
- */ -#ifndef CLONE_NEWNS -#define CLONE_NEWNS 0x00020000ULL -#endif -#ifndef CLONE_NEWTIME -#define CLONE_NEWTIME 0x00000080ULL -#endif -#ifndef CLONE_NEWCGROUP -#define CLONE_NEWCGROUP 0x02000000ULL -#endif -#ifndef CLONE_NEWUTS -#define CLONE_NEWUTS 0x04000000ULL -#endif -#ifndef CLONE_NEWIPC -#define CLONE_NEWIPC 0x08000000ULL -#endif -#ifndef CLONE_NEWUSER -#define CLONE_NEWUSER 0x10000000ULL -#endif -#ifndef CLONE_NEWPID -#define CLONE_NEWPID 0x20000000ULL -#endif -#ifndef CLONE_NEWNET -#define CLONE_NEWNET 0x40000000ULL -#endif -#ifndef CLONE_THREAD -#define CLONE_THREAD 0x00010000ULL -#endif - -#define CLONE_NEW_MASK \ - (CLONE_NEWNS | CLONE_NEWTIME | CLONE_NEWCGROUP | CLONE_NEWUTS | \ - CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET) - -/* W^X enforcement for mprotect in trap/rewrite mode. - * - * Reject simultaneous PROT_WRITE|PROT_EXEC to prevent JIT spray attacks. - * On none->X transitions, scan the page for syscall/sysenter/SVC instructions - * and add them to the origin map for rewrite-mode caller validation. - * - * In seccomp mode, this is a no-op: CONTINUE lets the host kernel handle it. - */ -static struct kbox_dispatch forward_mprotect( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - uint64_t addr = kbox_syscall_request_arg(req, 0); - uint64_t len = kbox_syscall_request_arg(req, 1); - int prot = (int) kbox_syscall_request_arg(req, 2); - - /* In seccomp mode (supervisor), just pass through. */ - if (!request_uses_trap_signals(req)) - return kbox_dispatch_continue(); - - /* W^X enforcement: reject PROT_WRITE | PROT_EXEC. */ - if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC)) { - if (ctx->verbose) - fprintf(stderr, - "kbox: mprotect denied: W^X violation at 0x%llx len=%llu " - "(pid=%u)\n", - (unsigned long long) addr, (unsigned long long) len, - kbox_syscall_request_pid(req)); - return kbox_dispatch_errno(EACCES); - } - - /* Allow the mprotect to proceed via host kernel. 
If the page transitions - * to PROT_EXEC, JIT code on it will take the Tier 1 (RET_TRAP) slow path - * because it won't be in the BPF allow ranges. This is safe: un-rewritten - * syscall instructions in JIT pages are caught by the SIGSYS handler. - * - * Full scan-on-X (rewriting JIT pages at mprotect time) is a future - * optimization: it would promote JIT pages from Tier 1 (~3us) to Tier 2 - * (~41ns) but requires synchronous instruction scanning while the page - * is still writable, which adds latency to the mprotect call. - */ - return kbox_dispatch_continue(); -} - -static struct kbox_dispatch forward_clone3( - const struct kbox_syscall_request *req, - struct kbox_supervisor_ctx *ctx) -{ - uint64_t flags; - int rc; - - /* clone3(struct clone_args *args, size_t size). flags is the first uint64_t - * field in clone_args. We only need to read the first 8 bytes. - */ - rc = - guest_mem_read(ctx, kbox_syscall_request_pid(req), - kbox_syscall_request_arg(req, 0), &flags, sizeof(flags)); - if (rc < 0) { - /* Can't read tracee memory; fail closed with EPERM. - * - * CONTINUE is unsafe here: a tracee can clear dumpability via - * prctl(PR_SET_DUMPABLE, 0), causing process_vm_readv to fail with - * EPERM. If we CONTINUE, clone3 reaches host kernel with unchecked - * namespace flags: a sandbox escape. Returning EPERM is the only safe - * option. - */ - if (ctx->verbose) - fprintf(stderr, - "kbox: clone3 denied: cannot read clone_args " - "(pid=%u, rc=%d)\n", - kbox_syscall_request_pid(req), rc); - return kbox_dispatch_errno(EPERM); - } - - if (flags & CLONE_NEW_MASK) { - if (ctx->verbose) - fprintf(stderr, - "kbox: clone3 denied: namespace flags 0x%llx " - "(pid=%u)\n", - (unsigned long long) (flags & CLONE_NEW_MASK), - kbox_syscall_request_pid(req)); - return kbox_dispatch_errno(EPERM); - } - - /* In trap/rewrite mode, block thread creation (CLONE_THREAD). - * Multi-threaded guests require --syscall-mode=seccomp. 
- */ - if ((flags & CLONE_THREAD) && request_uses_trap_signals(req)) { - if (ctx->verbose) - fprintf(stderr, - "kbox: clone3 denied: CLONE_THREAD in trap/rewrite mode " - "(pid=%u, use --syscall-mode=seccomp)\n", - kbox_syscall_request_pid(req)); - return kbox_dispatch_errno(EPERM); - } - - return kbox_dispatch_continue(); -} - -/* Main dispatch function. */ - -struct kbox_dispatch kbox_dispatch_request( - struct kbox_supervisor_ctx *ctx, - const struct kbox_syscall_request *req) -{ - const struct kbox_host_nrs *h = ctx->host_nrs; - int nr; - - if (!ctx || !req) - return kbox_dispatch_errno(EINVAL); + if (!ctx || !req) + return kbox_dispatch_errno(EINVAL); kbox_dispatch_prepare_request_ctx(ctx, req); nr = req->nr; @@ -6284,76 +3322,51 @@ struct kbox_dispatch kbox_dispatch_request( kbox_syscall_request_pid(req), nr, name ? name : "unknown"); } - /* Legacy x86_64 syscalls. */ + /* Table-driven dispatch: forward to handler. */ + +#define _(field, handler) \ + if (nr == h->field) \ + return handler(req, ctx); + DISPATCH_FORWARD_TABLE(_) +#undef _ + + /* Table-driven dispatch: CONTINUE to host kernel. */ + +#define _(field) \ + if (nr == h->field) \ + return kbox_dispatch_continue(); + DISPATCH_CONTINUE_TABLE(_) +#undef _ + + /* Entries with extra arguments or guards. */ if (nr == h->stat) return forward_stat_legacy(req, ctx, 0); if (nr == h->lstat) return forward_stat_legacy(req, ctx, 1); - if (nr == h->access) - return forward_access_legacy(req, ctx); - if (nr == h->mkdir) - return forward_mkdir_legacy(req, ctx); - if (nr == h->rmdir) - return forward_rmdir_legacy(req, ctx); - if (nr == h->unlink) - return forward_unlink_legacy(req, ctx); - if (nr == h->rename) - return forward_rename_legacy(req, ctx); - if (nr == h->chmod) - return forward_chmod_legacy(req, ctx); - if (nr == h->chown) - return forward_chown_legacy(req, ctx); - if (nr == h->open) - return forward_open_legacy(req, ctx); - - /* File open/create. 
*/ - - if (nr == h->openat) - return forward_openat(req, ctx); - if (nr == h->openat2) - return forward_openat2(req, ctx); - - /* Metadata. */ - - if (nr == h->fstat) - return forward_fstat(req, ctx); - if (nr == h->newfstatat) - return forward_newfstatat(req, ctx); - if (nr == h->statx) - return forward_statx(req, ctx); + if (nr == h->read) + return forward_read_like(req, ctx, 0); + if (nr == h->pread64) + return forward_read_like(req, ctx, 1); if (nr == h->faccessat && h->faccessat > 0) return forward_faccessat(req, ctx); - if (nr == h->faccessat2) - return forward_faccessat2(req, ctx); - - /* Directories. */ - - if (nr == h->getdents64) - return forward_getdents64(req, ctx); - if (nr == h->getdents) - return forward_getdents(req, ctx); - if (nr == h->mkdirat) - return forward_mkdirat(req, ctx); - if (nr == h->unlinkat) - return forward_unlinkat(req, ctx); if (nr == h->renameat && h->renameat > 0) return forward_renameat(req, ctx); - if (nr == h->renameat2) - return forward_renameat2(req, ctx); - if (nr == h->fchmodat) - return forward_fchmodat(req, ctx); - if (nr == h->fchownat) - return forward_fchownat(req, ctx); - - /* Navigation. */ - - if (nr == h->chdir) - return forward_chdir(req, ctx); - if (nr == h->fchdir) - return forward_fchdir(req, ctx); - if (nr == h->getcwd) - return forward_getcwd(req, ctx); + if (nr == h->copy_file_range) + return kbox_dispatch_errno(ENOSYS); + if (nr == h->execve) + return forward_execve(req, ctx, 0); + if (nr == h->execveat) + return forward_execve(req, ctx, 1); + + /* Process info: constant return values. */ + + if (nr == h->getpid) + return kbox_dispatch_value(1); + if (nr == h->getppid) + return kbox_dispatch_value(0); + if (nr == h->gettid) + return kbox_dispatch_value(1); /* Identity: UID. */ @@ -6421,98 +3434,9 @@ struct kbox_dispatch kbox_dispatch_request( if (nr == h->setfsgid) return dispatch_set_id(req, ctx, forward_setfsgid); - /* Mount. 
*/ - - if (nr == h->mount) - return forward_mount(req, ctx); - if (nr == h->umount2) - return forward_umount2(req, ctx); + /* Legacy pipe syscall: one arg, create host pipe2 and inject via ADDFD. */ - /* FD operations. */ - - if (nr == h->close) - return forward_close(req, ctx); - if (nr == h->fcntl) - return forward_fcntl(req, ctx); - if (nr == h->dup) - return forward_dup(req, ctx); - if (nr == h->dup2) - return forward_dup2(req, ctx); - if (nr == h->dup3) - return forward_dup3(req, ctx); - - /* I/O. */ - - if (nr == h->read) - return forward_read_like(req, ctx, 0); - if (nr == h->pread64) - return forward_read_like(req, ctx, 1); - if (nr == h->write) - return forward_write(req, ctx); - if (nr == h->lseek) - return forward_lseek(req, ctx); - - /* Networking. */ - - if (nr == h->socket) - return forward_socket(req, ctx); - if (nr == h->bind) - return forward_bind(req, ctx); - if (nr == h->connect) - return forward_connect(req, ctx); - if (nr == h->sendto) - return forward_sendto(req, ctx); - if (nr == h->recvfrom) - return forward_recvfrom(req, ctx); - /* sendmsg: BPF allow-listed (SCM_RIGHTS), never reaches here. - * Shadow socket callers should use sendto for addressed datagrams. - */ - if (nr == h->recvmsg) - return forward_recvmsg(req, ctx); - if (nr == h->getsockopt) - return forward_getsockopt(req, ctx); - if (nr == h->setsockopt) - return forward_setsockopt(req, ctx); - if (nr == h->getsockname) - return forward_getsockname(req, ctx); - if (nr == h->getpeername) - return forward_getpeername(req, ctx); - if (nr == h->shutdown) - return forward_shutdown(req, ctx); - - /* I/O extended. 
*/ - - if (nr == h->pwrite64) - return forward_pwrite64(req, ctx); - if (nr == h->writev) - return forward_writev(req, ctx); - if (nr == h->readv) - return forward_readv(req, ctx); - if (nr == h->ftruncate) - return forward_ftruncate(req, ctx); - if (nr == h->fallocate) - return forward_fallocate(req, ctx); - if (nr == h->flock) - return forward_flock(req, ctx); - if (nr == h->fsync) - return forward_fsync(req, ctx); - if (nr == h->fdatasync) - return forward_fdatasync(req, ctx); - if (nr == h->sync) - return forward_sync(req, ctx); - if (nr == h->ioctl) - return forward_ioctl(req, ctx); - - /* File operations. */ - - if (nr == h->readlinkat) - return forward_readlinkat(req, ctx); - if (nr == h->pipe2) - return forward_pipe2(req, ctx); if (nr == h->pipe) { - /* Legacy pipe(2) has only one arg: pipefd. Create host pipe and inject - * via ADDFD, same as the pipe2 path. - */ pid_t ppid = kbox_syscall_request_pid(req); uint64_t remote_pfd = kbox_syscall_request_arg(req, 0); if (remote_pfd == 0) @@ -6543,68 +3467,8 @@ struct kbox_dispatch kbox_dispatch_request( return kbox_dispatch_errno(-pwrc); return kbox_dispatch_value(0); } - if (nr == h->symlinkat) - return forward_symlinkat(req, ctx); - if (nr == h->linkat) - return forward_linkat(req, ctx); - if (nr == h->utimensat) - return forward_utimensat(req, ctx); - if (nr == h->sendfile) - return forward_sendfile(req, ctx); - if (nr == h->copy_file_range) - return kbox_dispatch_errno(ENOSYS); - - /* Process info. */ - - if (nr == h->getpid) - return kbox_dispatch_value(1); - if (nr == h->getppid) - return kbox_dispatch_value(0); - if (nr == h->gettid) - return kbox_dispatch_value(1); - if (nr == h->setpgid) - return kbox_dispatch_continue(); - if (nr == h->getpgid) - return kbox_dispatch_continue(); - if (nr == h->getsid) - return kbox_dispatch_continue(); - if (nr == h->setsid) - return kbox_dispatch_continue(); - - /* Time. 
*/ - if (nr == h->clock_gettime) - return forward_clock_gettime(req, ctx); - if (nr == h->clock_getres) - return forward_clock_getres(req, ctx); - if (nr == h->gettimeofday) - return forward_gettimeofday(req, ctx); - - /* Process lifecycle. */ - - if (nr == h->umask) - return forward_umask(req, ctx); - if (nr == h->uname) - return forward_uname(req, ctx); - if (nr == h->brk) - return kbox_dispatch_continue(); - if (nr == h->getrandom) - return forward_getrandom(req, ctx); - if (nr == h->syslog) - return forward_syslog(req, ctx); - if (nr == h->prctl) - return forward_prctl(req, ctx); - if (nr == h->wait4) - return kbox_dispatch_continue(); - if (nr == h->waitid) - return kbox_dispatch_continue(); - if (nr == h->exit) - return kbox_dispatch_continue(); - if (nr == h->exit_group) - return kbox_dispatch_continue(); - - /* Signals (CONTINUE). */ - /* Signal disposition and masking are per-process host kernel state. */ + /* Signals. */ if (nr == h->rt_sigaction) { if (request_uses_trap_signals(req) && @@ -6623,7 +3487,7 @@ struct kbox_dispatch kbox_dispatch_request( if (signo == 11 /* SIGSEGV */ || signo == 7 /* SIGBUS */) kbox_procmem_signal_changed(); } - return kbox_dispatch_continue(); /* signal handler registration */ + return kbox_dispatch_continue(); } if (nr == h->rt_sigprocmask) { if (request_uses_trap_signals(req)) { @@ -6643,136 +3507,90 @@ struct kbox_dispatch kbox_dispatch_request( } return emulate_trap_rt_sigprocmask(req, ctx); } - return kbox_dispatch_continue(); /* signal mask manipulation */ + return kbox_dispatch_continue(); } - if (nr == h->rt_sigreturn) - return kbox_dispatch_continue(); /* return from signal handler */ if (nr == h->rt_sigpending) { if (request_uses_trap_signals(req)) return emulate_trap_rt_sigpending(req, ctx); - return kbox_dispatch_continue(); /* pending signal query */ + return kbox_dispatch_continue(); } - if (nr == h->rt_sigaltstack) - return kbox_dispatch_continue(); /* alternate signal stack */ - if (nr == h->setitimer) - 
return kbox_dispatch_continue(); /* interval timer */ - if (nr == h->getitimer) - return kbox_dispatch_continue(); /* query interval timer */ if (h->alarm >= 0 && nr == h->alarm) - return kbox_dispatch_continue(); /* alarm (not on aarch64) */ + return kbox_dispatch_continue(); - /* Signal delivery (dispatch: PID validation). */ - /* kill/tgkill/tkill must go through dispatch (not BPF deny) because ash - * needs them for job control. We validate the target PID belongs to the - * guest process tree. PID is in register args (no TOCTOU). + /* Signal delivery: PID validation + virtual-to-real translation. + * + * kill/tgkill/tkill share PID validation (guest process tree only), virtual + * PID 1 -> real PID translation, and trap-mode pending signal bookkeeping. + * The helper below covers the common tail. */ - /* Accept the guest's virtual PID (1) as equivalent to the real host - * PID. getpid/gettid return 1, so raise() calls tgkill(1, 1, sig) which - * must reach the host kernel with the real PID. Also accept notif->pid - * (the tracee's actual host PID from the seccomp notification). - */ #define IS_GUEST_PID(p) \ ((p) == ctx->child_pid || (p) == kbox_syscall_request_pid(req) || (p) == 1) +#define DENY_NON_GUEST(pid_val, name) \ + do { \ + if (!IS_GUEST_PID(pid_val)) { \ + if (ctx->verbose) \ + fprintf(stderr, \ + "kbox: " name \ + "(%d) " \ + "denied: not guest PID\n", \ + (int) (pid_val)); \ + return kbox_dispatch_errno(EPERM); \ + } \ + } while (0) + if (nr == h->kill) { pid_t target = (pid_t) kbox_syscall_request_arg(req, 0); int sig = (int) kbox_syscall_request_arg(req, 1); - if (!IS_GUEST_PID(target) && target != 0) { - if (ctx->verbose) - fprintf(stderr, "kbox: kill(%d) denied: not guest PID\n", - target); - return kbox_dispatch_errno(EPERM); - } - /* Translate virtual PID to real PID. In both seccomp and trap - * mode, the guest sees itself as PID 1. Route kill(1, sig) and - * kill(0, sig) to the real child PID. 
- */ - { - pid_t real_target = ctx->child_pid; - long ret = syscall(SYS_kill, real_target, sig); - if (ret < 0) - return kbox_dispatch_errno(errno); - if (request_uses_trap_signals(req) && - real_target == ctx->child_pid && - trap_sigmask_contains_signal(sig)) - (void) kbox_syscall_trap_add_pending_signal(sig); - return kbox_dispatch_value(0); - } + if (!IS_GUEST_PID(target) && target != 0) + DENY_NON_GUEST(target, "kill"); + pid_t real = ctx->child_pid; + long ret = syscall(SYS_kill, real, sig); + if (ret < 0) + return kbox_dispatch_errno(errno); + if (request_uses_trap_signals(req) && trap_sigmask_contains_signal(sig)) + (void) kbox_syscall_trap_add_pending_signal(sig); + return kbox_dispatch_value(0); } if (nr == h->tgkill) { pid_t tgid = (pid_t) kbox_syscall_request_arg(req, 0); pid_t tid = (pid_t) kbox_syscall_request_arg(req, 1); int sig = (int) kbox_syscall_request_arg(req, 2); - if (!IS_GUEST_PID(tgid)) { - if (ctx->verbose) - fprintf(stderr, "kbox: tgkill(%d) denied: not guest PID\n", - tgid); - return kbox_dispatch_errno(EPERM); - } - /* Translate virtual PID/TID to real. Both seccomp and trap modes - * must emulate tgkill because the guest uses virtual PID 1. - */ - { - pid_t real_tgid = ctx->child_pid; - pid_t real_tid = (tid == 1) ? kbox_syscall_request_pid(req) : tid; - long ret = syscall(SYS_tgkill, real_tgid, real_tid, sig); - if (ret < 0) - return kbox_dispatch_errno(errno); - if (request_uses_trap_signals(req) && real_tgid == ctx->child_pid && - real_tid == kbox_syscall_request_pid(req) && - trap_sigmask_contains_signal(sig)) - (void) kbox_syscall_trap_add_pending_signal(sig); - return kbox_dispatch_value(0); - } + DENY_NON_GUEST(tgid, "tgkill"); + pid_t real_tgid = ctx->child_pid; + pid_t real_tid = (tid == 1) ? 
kbox_syscall_request_pid(req) : tid; + long ret = syscall(SYS_tgkill, real_tgid, real_tid, sig); + if (ret < 0) + return kbox_dispatch_errno(errno); + if (request_uses_trap_signals(req) && + real_tid == kbox_syscall_request_pid(req) && + trap_sigmask_contains_signal(sig)) + (void) kbox_syscall_trap_add_pending_signal(sig); + return kbox_dispatch_value(0); } if (nr == h->tkill) { pid_t target = (pid_t) kbox_syscall_request_arg(req, 0); int sig = (int) kbox_syscall_request_arg(req, 1); - if (!IS_GUEST_PID(target)) { - if (ctx->verbose) - fprintf(stderr, "kbox: tkill(%d) denied: not guest PID\n", - target); - return kbox_dispatch_errno(EPERM); - } - { - pid_t real_tid = - (target == 1) ? kbox_syscall_request_pid(req) : target; - long ret = syscall(SYS_tkill, real_tid, sig); - if (ret < 0) - return kbox_dispatch_errno(errno); - if (request_uses_trap_signals(req) && - real_tid == kbox_syscall_request_pid(req) && - trap_sigmask_contains_signal(sig)) - (void) kbox_syscall_trap_add_pending_signal(sig); - return kbox_dispatch_value(0); - } + DENY_NON_GUEST(target, "tkill"); + pid_t real_tid = (target == 1) ? kbox_syscall_request_pid(req) : target; + long ret = syscall(SYS_tkill, real_tid, sig); + if (ret < 0) + return kbox_dispatch_errno(errno); + if (request_uses_trap_signals(req) && + real_tid == kbox_syscall_request_pid(req) && + trap_sigmask_contains_signal(sig)) + (void) kbox_syscall_trap_add_pending_signal(sig); + return kbox_dispatch_value(0); } +#undef DENY_NON_GUEST #undef IS_GUEST_PID - if (nr == h->pidfd_send_signal) { - /* pidfd_send_signal is rare; deny by default for now. */ + if (nr == h->pidfd_send_signal) return kbox_dispatch_errno(EPERM); - } - /* Threading (CONTINUE). */ - /* Thread management is host kernel state; LKL is not involved. 
*/ - - if (nr == h->set_tid_address) - return kbox_dispatch_continue(); /* set clear_child_tid pointer */ - if (nr == h->set_robust_list) - return kbox_dispatch_continue(); /* robust futex list */ - if (nr == h->futex) - return kbox_dispatch_continue(); /* fast userspace mutex */ - if (nr == h->clone3) - return forward_clone3(req, ctx); /* sanitize namespace flags */ + /* arch_prctl: intercept SET_FS/GET_FS in trap/rewrite mode. */ + if (nr == h->arch_prctl) { - /* In trap/rewrite mode, arch_prctl(SET_FS) must be intercepted - * to avoid overwriting kbox's TLS. The SIGSYS handler swaps - * FS on entry/exit; SET_FS updates the guest's saved FS base - * so it takes effect when the handler returns. GET_FS returns - * the guest's saved FS base. In seccomp mode, CONTINUE is fine - * because the supervisor runs in a separate process. - */ if (request_uses_trap_signals(req)) { long subcmd = to_c_long_arg(kbox_syscall_request_arg(req, 0)); if (subcmd == 0x1002 /* ARCH_SET_FS */) { @@ -6792,12 +3610,12 @@ struct kbox_dispatch kbox_dispatch_request( return kbox_dispatch_value(0); } } - return kbox_dispatch_continue(); /* GS or seccomp mode */ + return kbox_dispatch_continue(); } - if (nr == h->rseq) - return kbox_dispatch_continue(); /* restartable sequences */ + + /* clone: validate namespace and thread flags. */ + if (nr == h->clone) { - /* Legacy clone: flags are in args[0] directly (not a struct). */ uint64_t cflags = kbox_syscall_request_arg(req, 0); if (cflags & CLONE_NEW_MASK) { if (ctx->verbose) @@ -6808,10 +3626,6 @@ struct kbox_dispatch kbox_dispatch_request( kbox_syscall_request_pid(req)); return kbox_dispatch_errno(EPERM); } - /* In trap/rewrite mode, block thread creation (CLONE_THREAD). - * The SIGSYS handler and shared LKL state are not thread-safe; - * multi-threaded guests must use --syscall-mode=seccomp. 
- */ if ((cflags & CLONE_THREAD) && request_uses_trap_signals(req)) { if (ctx->verbose) fprintf(stderr, @@ -6822,12 +3636,8 @@ struct kbox_dispatch kbox_dispatch_request( } return kbox_dispatch_continue(); } - if (nr == h->fork) - return kbox_dispatch_continue(); /* legacy fork */ - if (nr == h->vfork) - return kbox_dispatch_continue(); /* legacy vfork */ - /* Memory mapping. */ + /* Memory mapping: invalidate path cache, then forward/CONTINUE. */ if (nr == h->mmap) { invalidate_translated_path_cache(ctx); @@ -6835,54 +3645,24 @@ struct kbox_dispatch kbox_dispatch_request( } if (nr == h->munmap) { invalidate_translated_path_cache(ctx); - return kbox_dispatch_continue(); /* unmap pages */ + return kbox_dispatch_continue(); } if (nr == h->mprotect) { invalidate_translated_path_cache(ctx); - return forward_mprotect(req, ctx); /* W^X enforcement + CONTINUE */ + return forward_mprotect(req, ctx); } if (nr == h->mremap) { invalidate_translated_path_cache(ctx); - return kbox_dispatch_continue(); /* remap pages */ - } - if (nr == h->membarrier) - return kbox_dispatch_continue(); /* memory barrier (musl threads) */ - - /* Scheduling (CONTINUE). */ - /* Scheduler ops are safe; RLIMIT_RTPRIO=0 prevents RT starvation. */ - - if (nr == h->sched_yield) - return kbox_dispatch_continue(); - if (nr == h->sched_setparam) - return kbox_dispatch_continue(); - if (nr == h->sched_getparam) - return kbox_dispatch_continue(); - if (nr == h->sched_setscheduler) - return kbox_dispatch_continue(); - if (nr == h->sched_getscheduler) - return kbox_dispatch_continue(); - if (nr == h->sched_get_priority_max) - return kbox_dispatch_continue(); - if (nr == h->sched_get_priority_min) - return kbox_dispatch_continue(); - if (nr == h->sched_setaffinity) - return kbox_dispatch_continue(); - if (nr == h->sched_getaffinity) return kbox_dispatch_continue(); + } - /* Resource management. */ + /* prlimit64: GET is safe, SET is restricted. */ - /* prlimit64: GET ops are safe (read-only). 
SET ops on dangerous resources - * (RLIMIT_NPROC, RLIMIT_NOFILE, RLIMIT_RTPRIO) are blocked to prevent the - * guest from escaping resource limits. - */ if (nr == h->prlimit64) { uint64_t new_limit_ptr = kbox_syscall_request_arg(req, 2); if (new_limit_ptr == 0) - return kbox_dispatch_continue(); /* GET only */ - /* SET operation: check which resource. */ + return kbox_dispatch_continue(); int resource = (int) kbox_syscall_request_arg(req, 1); - /* Allow safe resources: RLIMIT_CORE(4), RLIMIT_AS(9), etc. */ if (resource == 4 /* RLIMIT_CORE */ || resource == 9 /* RLIMIT_AS */) return kbox_dispatch_continue(); if (ctx->verbose) @@ -6890,61 +3670,9 @@ struct kbox_dispatch kbox_dispatch_request( resource); return kbox_dispatch_errno(EPERM); } - if (nr == h->madvise) - return kbox_dispatch_continue(); /* memory advice */ - if (nr == h->getrlimit) - return kbox_dispatch_continue(); /* read resource limits */ - if (nr == h->getrusage) - return kbox_dispatch_continue(); /* read resource usage */ - - /* I/O multiplexing (CONTINUE). */ - /* All polling/select variants are pure host kernel operations. */ - - if (nr == h->epoll_create1) - return kbox_dispatch_continue(); - if (nr == h->epoll_ctl) - return kbox_dispatch_continue(); - if (nr == h->epoll_wait) - return kbox_dispatch_continue(); - if (nr == h->epoll_pwait) - return kbox_dispatch_continue(); - if (nr == h->ppoll) - return kbox_dispatch_continue(); - if (nr == h->pselect6) - return kbox_dispatch_continue(); - if (nr == h->poll) - return kbox_dispatch_continue(); /* legacy poll (musl/busybox) */ - - /* Sleep/timer (CONTINUE). */ - /* Time waiting is pure host kernel; no LKL involvement. 
*/ - if (nr == h->nanosleep) - return kbox_dispatch_continue(); - if (nr == h->clock_nanosleep) - return kbox_dispatch_continue(); - if (nr == h->timerfd_create) - return kbox_dispatch_continue(); - if (nr == h->timerfd_settime) - return kbox_dispatch_continue(); - if (nr == h->timerfd_gettime) - return kbox_dispatch_continue(); - if (nr == h->eventfd) - return kbox_dispatch_continue(); - if (nr == h->eventfd2) - return kbox_dispatch_continue(); - - /* Filesystem info (CONTINUE/dispatch). */ + /* readlink: TOCTOU risk, forward to LKL via readlinkat. */ - if (nr == h->statfs) - return kbox_dispatch_continue(); /* filesystem stats */ - if (nr == h->fstatfs) - return kbox_dispatch_continue(); /* filesystem stats by fd */ - if (nr == h->sysinfo) - return kbox_dispatch_continue(); /* system info (busybox free) */ - - /* readlink: takes path pointer (TOCTOU risk). Forward to LKL via readlinkat - * instead of CONTINUE. - */ if (nr == h->readlink) { char path[4096]; int ret = guest_mem_read_string(ctx, kbox_syscall_request_pid(req), @@ -6968,13 +3696,6 @@ struct kbox_dispatch kbox_dispatch_request( return kbox_dispatch_value(lret); } - /* Exec (in-image binary extraction + pathname rewrite). */ - - if (nr == h->execve) - return forward_execve(req, ctx, 0); - if (nr == h->execveat) - return forward_execve(req, ctx, 1); - /* Default: deny unknown syscalls. */ if (ctx->verbose) fprintf(stderr, "kbox: DENY unknown syscall nr=%d (pid=%u)\n", nr,