From 7a98c581222ebf89fc9b36dc84bd62920cf30f48 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 31 May 2026 20:56:24 +0800 Subject: [PATCH] Route sendmsg/recvmsg via shared host_iov helper sys_sendmsg and sys_recvmsg resolved each guest iov entry with guest_ptr_bound() and clamped the entry's length to the contiguous prefix when the guest VA crossed a host-side discontinuity (primary slab to guest_mapping / guest_overflow region, or between two distinct mappings). The loop then continued populating host_iov[i+1..] at full requested length, so the host sendmsg(2) placed bytes from iov[i+1] on the wire where the truncated tail of iov[i] belonged, and recvmsg(2) scattered received bytes into the wrong guest buffer past a truncated entry. The case only fires when a guest iov entry straddles two distinct host mappings; the typical single-region iov masked it in tests. Export the already-correct build_host_iov-based helper from src/syscall/io.c through src/syscall/internal.h as host_iov_prepare / host_iov_free (the definitions stay in io.c; only the static qualifier is dropped and prototypes are added to the header), add the wrapper host_iov_prepare_msg, and move host_iov_buf_t and the SYSCALL_IOV_{MAX, STACK_MAX} constants into the header. host_iov_prepare keeps the strict readv/writev contract (rejects iovcnt <= 0); host_iov_prepare_msg short-circuits iovcnt == 0 for ancillary-only sendmsg/recvmsg payloads. Replace the duplicated inline iov build loops in src/syscall/net-msg.c with the shared call. Side effects: lifts the per-call iovcnt cap from 64 to SYSCALL_IOV_MAX (1024) and drops roughly 80 lines of duplicated logic. The shared helper truncates at the first non-contiguous iov entry and zeros every subsequent entry, so the host call returns a POSIX-compliant short I/O instead of mis-aligning the wire data. Add a pwritev2 RWF_APPEND regression test to tests/test-readv-writev.c that pins the strict contract: iovcnt == 0 is rejected with -EINVAL and leaves the file offset unchanged. 
--- src/syscall/internal.h | 46 ++++++++++++++++ src/syscall/io.c | 33 +++++++----- src/syscall/net-msg.c | 107 +++++++++++++++----------------------- tests/test-readv-writev.c | 31 +++++++++++ 4 files changed, 138 insertions(+), 79 deletions(-) diff --git a/src/syscall/internal.h b/src/syscall/internal.h index 8534e6a..4e3fa56 100644 --- a/src/syscall/internal.h +++ b/src/syscall/internal.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "syscall/abi.h" @@ -312,6 +313,51 @@ static inline int64_t host_fd_ref_open_io(guest_fd_t guest_fd, return 0; } +/* iov limits shared between readv/writev/preadv/pwritev and sendmsg/recvmsg. + * SYSCALL_IOV_MAX matches the Linux UIO_MAXIOV cap; SYSCALL_IOV_STACK_MAX + * keeps the typical case on the call-site stack. + */ +#define SYSCALL_IOV_MAX 1024 +#define SYSCALL_IOV_STACK_MAX 64 + +/* Resolved host iov vector backed by an inline stack buffer with a heap + * fallback for large iovcnt. Pair host_iov_prepare with host_iov_free. + */ +typedef struct { + struct iovec stack[SYSCALL_IOV_STACK_MAX]; + struct iovec *iov; +} host_iov_buf_t; + +/* Translate a guest iovec array at iov_gva (iovcnt entries) into the host + * iovec layout in buf->iov, resolving each guest_base to a contiguous host + * pointer with the requested permissions. On a non-contiguous iov entry the + * helper truncates that entry to the contiguous prefix and zeros every + * subsequent entry; the host readv/writev/sendmsg/recvmsg then returns a + * POSIX-compliant short I/O instead of silently packing bytes from the next + * guest buffer into the truncated tail. + * + * iovcnt <= 0 or > SYSCALL_IOV_MAX returns -LINUX_EINVAL. + * + * Returns 0 on success or a negative Linux errno on failure. The caller must + * pair every successful prepare with host_iov_free to release any heap + * spillover. 
+ */ +int64_t host_iov_prepare(guest_t *g, + uint64_t iov_gva, + int iovcnt, + int required_perms, + host_iov_buf_t *buf); + +/* sendmsg/recvmsg variant: iovcnt == 0 is legal for ancillary-only messages. */ +int64_t host_iov_prepare_msg(guest_t *g, + uint64_t iov_gva, + int iovcnt, + int required_perms, + host_iov_buf_t *buf); + +/* Release any heap spillover backing a host_iov_buf_t. Idempotent. */ +void host_iov_free(host_iov_buf_t *buf); + /* Read a guest path string with small-buffer optimization. * * Tries the stack-allocated short_buf first; falls back to long_buf for diff --git a/src/syscall/io.c b/src/syscall/io.c index ef04d56..c3d26d2 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -43,8 +43,6 @@ #include "syscall/proc.h" #include "syscall/signal.h" -#define SYSCALL_IOV_MAX 1024 -#define SYSCALL_IOV_STACK_MAX 64 #define URANDOM_CACHE_SIZE 4096 /* Linux terminal struct types. */ @@ -1098,21 +1096,17 @@ static int64_t build_host_iov(guest_t *g, return 0; } -typedef struct { - struct iovec stack[SYSCALL_IOV_STACK_MAX]; - struct iovec *iov; -} host_iov_buf_t; - -static int64_t host_iov_prepare(guest_t *g, - uint64_t iov_gva, - int iovcnt, - int required_perms, - host_iov_buf_t *buf) +int64_t host_iov_prepare(guest_t *g, + uint64_t iov_gva, + int iovcnt, + int required_perms, + host_iov_buf_t *buf) { if (iovcnt <= 0 || iovcnt > SYSCALL_IOV_MAX) return -LINUX_EINVAL; buf->iov = buf->stack; + if (iovcnt > SYSCALL_IOV_STACK_MAX) { buf->iov = malloc((size_t) iovcnt * sizeof(*buf->iov)); if (!buf->iov) @@ -1130,7 +1124,20 @@ static int64_t host_iov_prepare(guest_t *g, return 0; } -static void host_iov_free(host_iov_buf_t *buf) +int64_t host_iov_prepare_msg(guest_t *g, + uint64_t iov_gva, + int iovcnt, + int required_perms, + host_iov_buf_t *buf) +{ + if (iovcnt == 0) { + buf->iov = buf->stack; + return 0; + } + return host_iov_prepare(g, iov_gva, iovcnt, required_perms, buf); +} + +void host_iov_free(host_iov_buf_t *buf) { if (buf->iov != buf->stack) 
free(buf->iov); diff --git a/src/syscall/net-msg.c b/src/syscall/net-msg.c index 96221ff..d388969 100644 --- a/src/syscall/net-msg.c +++ b/src/syscall/net-msg.c @@ -174,41 +174,22 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags) dest_len = (socklen_t) ml; } - if (lmsg.msg_iovlen > 64) { + /* msg_iovlen is uint64_t on Linux; bound it against SYSCALL_IOV_MAX + * before the int narrowing below so a 64-bit value whose low 32 bits + * fall inside [0, SYSCALL_IOV_MAX] cannot slip past the cap. + */ + if (lmsg.msg_iovlen > SYSCALL_IOV_MAX) { host_fd_ref_close(&host_ref); return -LINUX_EINVAL; } + int send_iovcnt = (int) lmsg.msg_iovlen; - struct { - uint64_t iov_base, iov_len; - } guest_iov[64]; - - if (lmsg.msg_iovlen > 0) { - if (guest_read(g, lmsg.msg_iov, guest_iov, lmsg.msg_iovlen * 16) < 0) { - host_fd_ref_close(&host_ref); - return -LINUX_EFAULT; - } - } - - struct iovec host_iov[64]; - for (uint64_t i = 0; i < lmsg.msg_iovlen; i++) { - if (guest_iov[i].iov_len == 0) { - host_iov[i].iov_base = NULL; - host_iov[i].iov_len = 0; - continue; - } - uint64_t avail = 0; - void *base = guest_ptr_bound(g, guest_iov[i].iov_base, &avail, - MEM_PERM_R, guest_iov[i].iov_len); - if (!base) { - host_fd_ref_close(&host_ref); - return -LINUX_EFAULT; - } - uint64_t len = guest_iov[i].iov_len; - if (len > avail) - len = avail; - host_iov[i].iov_base = base; - host_iov[i].iov_len = len; + host_iov_buf_t host_iov; + int64_t iov_err = host_iov_prepare_msg(g, lmsg.msg_iov, send_iovcnt, + MEM_PERM_R, &host_iov); + if (iov_err < 0) { + host_fd_ref_close(&host_ref); + return iov_err; } uint8_t linux_ctrl_stack[512], mac_ctrl_stack[512]; @@ -223,6 +204,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags) if (lmsg.msg_control && lmsg.msg_controllen > 0) { size_t clen = lmsg.msg_controllen; if (clen > 65536) { + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_EINVAL; } @@ -232,6 +214,7 @@ int64_t sys_sendmsg(guest_t 
*g, int fd, uint64_t msg_gva, int linux_flags) if (!linux_ctrl_heap || !mac_ctrl_heap) { free(linux_ctrl_heap); free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_ENOMEM; } @@ -242,6 +225,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags) if (guest_read(g, lmsg.msg_control, linux_ctrl, clen) < 0) { free(linux_ctrl_heap); free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_EFAULT; } @@ -300,6 +284,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags) if (rc < 0) { free(linux_ctrl_heap); free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return rc; } @@ -318,8 +303,8 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags) struct msghdr msg = { .msg_name = dest_sa, .msg_namelen = dest_len, - .msg_iov = host_iov, - .msg_iovlen = (int) lmsg.msg_iovlen, + .msg_iov = host_iov.iov, + .msg_iovlen = send_iovcnt, .msg_control = ctrl_ptr, .msg_controllen = ctrl_len, .msg_flags = 0, @@ -328,6 +313,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags) ssize_t ret = sendmsg(host_ref.fd, &msg, mac_flags); free(linux_ctrl_heap); free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); if (ret < 0) { if (errno == EPIPE && !suppress_sigpipe) @@ -410,41 +396,19 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) return ret; } - if (lmsg.msg_iovlen > 64) { + /* See sys_sendmsg above: bound msg_iovlen before the int narrowing. 
*/ + if (lmsg.msg_iovlen > SYSCALL_IOV_MAX) { host_fd_ref_close(&host_ref); return -LINUX_EINVAL; } + int recv_iovcnt = (int) lmsg.msg_iovlen; - struct { - uint64_t iov_base, iov_len; - } guest_iov[64]; - - if (lmsg.msg_iovlen > 0) { - if (guest_read(g, lmsg.msg_iov, guest_iov, lmsg.msg_iovlen * 16) < 0) { - host_fd_ref_close(&host_ref); - return -LINUX_EFAULT; - } - } - - struct iovec host_iov[64]; - for (uint64_t i = 0; i < lmsg.msg_iovlen; i++) { - if (guest_iov[i].iov_len == 0) { - host_iov[i].iov_base = NULL; - host_iov[i].iov_len = 0; - continue; - } - uint64_t avail = 0; - void *base = guest_ptr_bound(g, guest_iov[i].iov_base, &avail, - MEM_PERM_W, guest_iov[i].iov_len); - if (!base) { - host_fd_ref_close(&host_ref); - return -LINUX_EFAULT; - } - uint64_t len = guest_iov[i].iov_len; - if (len > avail) - len = avail; - host_iov[i].iov_base = base; - host_iov[i].iov_len = len; + host_iov_buf_t host_iov; + int64_t iov_err = host_iov_prepare_msg(g, lmsg.msg_iov, recv_iovcnt, + MEM_PERM_W, &host_iov); + if (iov_err < 0) { + host_fd_ref_close(&host_ref); + return iov_err; } struct sockaddr_storage mac_sa; @@ -477,8 +441,8 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) struct msghdr msg = { .msg_name = lmsg.msg_name ? &mac_sa : NULL, .msg_namelen = lmsg.msg_name ? sa_len : 0, - .msg_iov = host_iov, - .msg_iovlen = (int) lmsg.msg_iovlen, + .msg_iov = host_iov.iov, + .msg_iovlen = recv_iovcnt, .msg_control = ctrl_alloc > 0 ? 
mac_ctrl : NULL, .msg_controllen = ctrl_alloc, .msg_flags = 0, @@ -491,6 +455,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) ssize_t ret = recvmsg(host_ref.fd, &msg, mac_flags); if (ret < 0) { free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return linux_errno(); } @@ -508,6 +473,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) if (guest_write_small(g, lmsg.msg_name, linux_sa, write_len) < 0) { free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_EFAULT; } @@ -518,6 +484,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) msg_gva + offsetof(linux_msghdr_t, msg_namelen), &nl, sizeof(nl)) < 0) { free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_EFAULT; } @@ -534,6 +501,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) lctrl_heap = malloc(lctrl_size); if (!lctrl_heap) { free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_ENOMEM; } @@ -581,6 +549,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) recvmsg_cleanup_scm_rights(scm_gfds, scm_hfds, scm_nfds); free(lctrl_heap); free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_EINVAL; } @@ -654,6 +623,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) recvmsg_cleanup_scm_rights(scm_gfds, scm_hfds, scm_nfds); free(lctrl_heap); free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_EFAULT; } @@ -664,6 +634,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) recvmsg_cleanup_scm_rights(scm_gfds, scm_hfds, scm_nfds); free(lctrl_heap); free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_EFAULT; } @@ -674,6 +645,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) &zero64, 
sizeof(zero64)) < 0) { free(lctrl_heap); free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_EFAULT; } @@ -722,6 +694,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) g, msg_gva + offsetof(linux_msghdr_t, msg_controllen), &zero64, sizeof(zero64)) < 0) { free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_EFAULT; } @@ -734,11 +707,13 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags) &mflags, sizeof(mflags)) < 0) { recvmsg_cleanup_scm_rights(scm_gfds, scm_hfds, scm_nfds); free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return -LINUX_EFAULT; } free(mac_ctrl_heap); + host_iov_free(&host_iov); host_fd_ref_close(&host_ref); return ret; } diff --git a/tests/test-readv-writev.c b/tests/test-readv-writev.c index c7b1855..1148e96 100644 --- a/tests/test-readv-writev.c +++ b/tests/test-readv-writev.c @@ -248,6 +248,36 @@ static void test_pwritev2_append(void) EXPECT_TRUE(nr == 6 && !memcmp(buf, "baseXY", 6), "append data mismatch"); } +/* Test 7: zero iovcnt must not move the append file offset */ + +static void test_pwritev2_append_zero_iovcnt(void) +{ + TEST("pwritev2 RWF_APPEND zero iovcnt"); + + const char *path = "/tmp/elfuse-test-pwritev2-append-zero.txt"; + int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + FAIL("open"); + return; + } + + if (write(fd, "base", 4) != 4 || lseek(fd, 1, SEEK_SET) != 1) { + FAIL("setup"); + close(fd); + unlink(path); + return; + } + + struct iovec wv = {.iov_base = (void *) "X", .iov_len = 1}; + long ret = + raw_syscall6(__NR_pwritev2, fd, (long) &wv, 0, -1, 0, RWF_APPEND); + off_t pos = lseek(fd, 0, SEEK_CUR); + close(fd); + unlink(path); + + EXPECT_TRUE(ret == -EINVAL && pos == 1, "zero iovcnt changed offset"); +} + /* Main */ int main(void) @@ -260,6 +290,7 @@ int main(void) test_zero_length_iovec(); test_many_iovecs(); test_pwritev2_append(); + 
test_pwritev2_append_zero_iovcnt(); SUMMARY("test-readv-writev"); return fails > 0 ? 1 : 0;