Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions src/syscall/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <fcntl.h>
#include <pthread.h>
#include <stdbool.h>
#include <sys/uio.h>
#include <unistd.h>

#include "syscall/abi.h"
Expand Down Expand Up @@ -312,6 +313,51 @@ static inline int64_t host_fd_ref_open_io(guest_fd_t guest_fd,
return 0;
}

/* iov limits shared between readv/writev/preadv/pwritev and sendmsg/recvmsg.
* SYSCALL_IOV_MAX matches the Linux UIO_MAXIOV cap; SYSCALL_IOV_STACK_MAX
* keeps the typical case on the call-site stack.
*/
#define SYSCALL_IOV_MAX 1024
#define SYSCALL_IOV_STACK_MAX 64

/* Resolved host iov vector backed by an inline stack buffer with a heap
* fallback for large iovcnt. Pair host_iov_prepare with host_iov_free.
*/
typedef struct {
/* Inline storage: host_iov_prepare points iov here first and only switches
* to a malloc'd array when iovcnt exceeds SYSCALL_IOV_STACK_MAX.
*/
struct iovec stack[SYSCALL_IOV_STACK_MAX];
/* Vector handed to the host call. Either aliases stack or refers to a
* malloc'd array; host_iov_free calls free() on it only when it differs
* from stack.
*/
struct iovec *iov;
} host_iov_buf_t;

/* Translate a guest iovec array at iov_gva (iovcnt entries) into the host
* iovec layout in buf->iov, resolving each guest_base to a contiguous host
* pointer with the requested permissions. On a non-contiguous iov entry the
* helper truncates that entry to the contiguous prefix and zeros every
* subsequent entry; the host readv/writev/sendmsg/recvmsg then returns a
* POSIX-compliant short I/O instead of silently packing bytes from the next
* guest buffer into the truncated tail.
*
* iovcnt <= 0 or > SYSCALL_IOV_MAX returns -LINUX_EINVAL.
*
* Returns 0 on success or a negative Linux errno on failure. The caller must
* pair every successful prepare with host_iov_free to release any heap
* spillover.
*/
int64_t host_iov_prepare(guest_t *g,
uint64_t iov_gva,
int iovcnt,
int required_perms,
host_iov_buf_t *buf);

/* sendmsg/recvmsg variant: iovcnt == 0 is legal for ancillary-only messages.
* For iovcnt == 0 the helper sets buf->iov to the inline stack array and
* returns 0 without reading iov_gva. Any other iovcnt is forwarded to
* host_iov_prepare and that call's result is returned unchanged.
*/
int64_t host_iov_prepare_msg(guest_t *g,
uint64_t iov_gva,
int iovcnt,
int required_perms,
host_iov_buf_t *buf);

/* Release any heap spillover backing a host_iov_buf_t. Idempotent. */
void host_iov_free(host_iov_buf_t *buf);

/* Read a guest path string with small-buffer optimization.
*
* Tries the stack-allocated short_buf first; falls back to long_buf for
Expand Down
33 changes: 20 additions & 13 deletions src/syscall/io.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@
#include "syscall/proc.h"
#include "syscall/signal.h"

#define SYSCALL_IOV_MAX 1024
#define SYSCALL_IOV_STACK_MAX 64
#define URANDOM_CACHE_SIZE 4096

/* Linux terminal struct types. */
Expand Down Expand Up @@ -1098,21 +1096,17 @@ static int64_t build_host_iov(guest_t *g,
return 0;
}

typedef struct {
struct iovec stack[SYSCALL_IOV_STACK_MAX];
struct iovec *iov;
} host_iov_buf_t;

static int64_t host_iov_prepare(guest_t *g,
uint64_t iov_gva,
int iovcnt,
int required_perms,
host_iov_buf_t *buf)
int64_t host_iov_prepare(guest_t *g,
uint64_t iov_gva,
int iovcnt,
int required_perms,
host_iov_buf_t *buf)
{
if (iovcnt <= 0 || iovcnt > SYSCALL_IOV_MAX)
return -LINUX_EINVAL;

buf->iov = buf->stack;

if (iovcnt > SYSCALL_IOV_STACK_MAX) {
buf->iov = malloc((size_t) iovcnt * sizeof(*buf->iov));
if (!buf->iov)
Expand All @@ -1130,7 +1124,20 @@ static int64_t host_iov_prepare(guest_t *g,
return 0;
}

static void host_iov_free(host_iov_buf_t *buf)
/* Message-oriented front end for host_iov_prepare.
 *
 * A zero iovcnt is accepted here: buf->iov is aimed at the inline stack array
 * and 0 is returned without touching iov_gva. Every non-zero iovcnt is handed
 * to host_iov_prepare as-is, and that call's return value is passed back to
 * the caller.
 */
int64_t host_iov_prepare_msg(guest_t *g,
                             uint64_t iov_gva,
                             int iovcnt,
                             int required_perms,
                             host_iov_buf_t *buf)
{
    if (iovcnt != 0)
        return host_iov_prepare(g, iov_gva, iovcnt, required_perms, buf);

    /* Empty vector: leave iov on the inline array so the buf->iov != buf->stack
     * test in host_iov_free finds nothing to release.
     */
    buf->iov = buf->stack;
    return 0;
}

void host_iov_free(host_iov_buf_t *buf)
{
if (buf->iov != buf->stack)
free(buf->iov);
Expand Down
107 changes: 41 additions & 66 deletions src/syscall/net-msg.c
Original file line number Diff line number Diff line change
Expand Up @@ -174,41 +174,22 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
dest_len = (socklen_t) ml;
}

if (lmsg.msg_iovlen > 64) {
/* msg_iovlen is uint64_t on Linux; bound it against SYSCALL_IOV_MAX
* before the int narrowing below so a 64-bit value whose low 32 bits
* fall inside [0, SYSCALL_IOV_MAX] cannot slip past the cap.
*/
if (lmsg.msg_iovlen > SYSCALL_IOV_MAX) {
host_fd_ref_close(&host_ref);
return -LINUX_EINVAL;
}
int send_iovcnt = (int) lmsg.msg_iovlen;

struct {
uint64_t iov_base, iov_len;
} guest_iov[64];

if (lmsg.msg_iovlen > 0) {
if (guest_read(g, lmsg.msg_iov, guest_iov, lmsg.msg_iovlen * 16) < 0) {
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
}

struct iovec host_iov[64];
for (uint64_t i = 0; i < lmsg.msg_iovlen; i++) {
if (guest_iov[i].iov_len == 0) {
host_iov[i].iov_base = NULL;
host_iov[i].iov_len = 0;
continue;
}
uint64_t avail = 0;
void *base = guest_ptr_bound(g, guest_iov[i].iov_base, &avail,
MEM_PERM_R, guest_iov[i].iov_len);
if (!base) {
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
uint64_t len = guest_iov[i].iov_len;
if (len > avail)
len = avail;
host_iov[i].iov_base = base;
host_iov[i].iov_len = len;
host_iov_buf_t host_iov;
int64_t iov_err = host_iov_prepare_msg(g, lmsg.msg_iov, send_iovcnt,
MEM_PERM_R, &host_iov);
if (iov_err < 0) {
host_fd_ref_close(&host_ref);
return iov_err;
}

uint8_t linux_ctrl_stack[512], mac_ctrl_stack[512];
Expand All @@ -223,6 +204,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
if (lmsg.msg_control && lmsg.msg_controllen > 0) {
size_t clen = lmsg.msg_controllen;
if (clen > 65536) {
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_EINVAL;
}
Expand All @@ -232,6 +214,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
if (!linux_ctrl_heap || !mac_ctrl_heap) {
free(linux_ctrl_heap);
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_ENOMEM;
}
Expand All @@ -242,6 +225,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
if (guest_read(g, lmsg.msg_control, linux_ctrl, clen) < 0) {
free(linux_ctrl_heap);
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
Expand Down Expand Up @@ -300,6 +284,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
if (rc < 0) {
free(linux_ctrl_heap);
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return rc;
}
Expand All @@ -318,8 +303,8 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
struct msghdr msg = {
.msg_name = dest_sa,
.msg_namelen = dest_len,
.msg_iov = host_iov,
.msg_iovlen = (int) lmsg.msg_iovlen,
.msg_iov = host_iov.iov,
.msg_iovlen = send_iovcnt,
.msg_control = ctrl_ptr,
.msg_controllen = ctrl_len,
.msg_flags = 0,
Expand All @@ -328,6 +313,7 @@ int64_t sys_sendmsg(guest_t *g, int fd, uint64_t msg_gva, int linux_flags)
ssize_t ret = sendmsg(host_ref.fd, &msg, mac_flags);
free(linux_ctrl_heap);
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
if (ret < 0) {
if (errno == EPIPE && !suppress_sigpipe)
Expand Down Expand Up @@ -410,41 +396,19 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
return ret;
}

if (lmsg.msg_iovlen > 64) {
/* See sys_sendmsg above: bound msg_iovlen before the int narrowing. */
if (lmsg.msg_iovlen > SYSCALL_IOV_MAX) {
host_fd_ref_close(&host_ref);
return -LINUX_EINVAL;
}
int recv_iovcnt = (int) lmsg.msg_iovlen;

struct {
uint64_t iov_base, iov_len;
} guest_iov[64];

if (lmsg.msg_iovlen > 0) {
if (guest_read(g, lmsg.msg_iov, guest_iov, lmsg.msg_iovlen * 16) < 0) {
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
}

struct iovec host_iov[64];
for (uint64_t i = 0; i < lmsg.msg_iovlen; i++) {
if (guest_iov[i].iov_len == 0) {
host_iov[i].iov_base = NULL;
host_iov[i].iov_len = 0;
continue;
}
uint64_t avail = 0;
void *base = guest_ptr_bound(g, guest_iov[i].iov_base, &avail,
MEM_PERM_W, guest_iov[i].iov_len);
if (!base) {
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
uint64_t len = guest_iov[i].iov_len;
if (len > avail)
len = avail;
host_iov[i].iov_base = base;
host_iov[i].iov_len = len;
host_iov_buf_t host_iov;
int64_t iov_err = host_iov_prepare_msg(g, lmsg.msg_iov, recv_iovcnt,
MEM_PERM_W, &host_iov);
if (iov_err < 0) {
host_fd_ref_close(&host_ref);
return iov_err;
}

struct sockaddr_storage mac_sa;
Expand Down Expand Up @@ -477,8 +441,8 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
struct msghdr msg = {
.msg_name = lmsg.msg_name ? &mac_sa : NULL,
.msg_namelen = lmsg.msg_name ? sa_len : 0,
.msg_iov = host_iov,
.msg_iovlen = (int) lmsg.msg_iovlen,
.msg_iov = host_iov.iov,
.msg_iovlen = recv_iovcnt,
.msg_control = ctrl_alloc > 0 ? mac_ctrl : NULL,
.msg_controllen = ctrl_alloc,
.msg_flags = 0,
Expand All @@ -491,6 +455,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
ssize_t ret = recvmsg(host_ref.fd, &msg, mac_flags);
if (ret < 0) {
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return linux_errno();
}
Expand All @@ -508,6 +473,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
if (guest_write_small(g, lmsg.msg_name, linux_sa, write_len) <
0) {
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
Expand All @@ -518,6 +484,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
msg_gva + offsetof(linux_msghdr_t, msg_namelen),
&nl, sizeof(nl)) < 0) {
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
Expand All @@ -534,6 +501,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
lctrl_heap = malloc(lctrl_size);
if (!lctrl_heap) {
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_ENOMEM;
}
Expand Down Expand Up @@ -581,6 +549,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
recvmsg_cleanup_scm_rights(scm_gfds, scm_hfds, scm_nfds);
free(lctrl_heap);
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_EINVAL;
}
Expand Down Expand Up @@ -654,6 +623,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
recvmsg_cleanup_scm_rights(scm_gfds, scm_hfds, scm_nfds);
free(lctrl_heap);
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
Expand All @@ -664,6 +634,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
recvmsg_cleanup_scm_rights(scm_gfds, scm_hfds, scm_nfds);
free(lctrl_heap);
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
Expand All @@ -674,6 +645,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
&zero64, sizeof(zero64)) < 0) {
free(lctrl_heap);
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
Expand Down Expand Up @@ -722,6 +694,7 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
g, msg_gva + offsetof(linux_msghdr_t, msg_controllen), &zero64,
sizeof(zero64)) < 0) {
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}
Expand All @@ -734,11 +707,13 @@ int64_t sys_recvmsg(guest_t *g, int fd, uint64_t msg_gva, int flags)
&mflags, sizeof(mflags)) < 0) {
recvmsg_cleanup_scm_rights(scm_gfds, scm_hfds, scm_nfds);
free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return -LINUX_EFAULT;
}

free(mac_ctrl_heap);
host_iov_free(&host_iov);
host_fd_ref_close(&host_ref);
return ret;
}
Expand Down
Loading
Loading