Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
250 changes: 215 additions & 35 deletions src/syscall/inotify.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <dirent.h>
#include <stdlib.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/uio.h>
Expand Down Expand Up @@ -83,6 +85,11 @@ typedef struct {
bool is_dir; /* true if watching a directory */
dev_t dev; /* Device ID (for re-add lookup by inode) */
ino_t ino; /* Inode number (for re-add lookup by inode) */
/* Dir watches only: path + entry-name snapshot, diffed on change to
* recover the child name kqueue omits. NULL/0 for file watches. */
char *path;
char **entries;
int n_entries;
} inotify_watch_t;

typedef struct {
Expand Down Expand Up @@ -296,8 +303,161 @@ static void pipe_drain(inotify_instance_t *inst)
;
}

static void free_dir_snapshot(char **entries, int n)
{
if (!entries)
return;
for (int i = 0; i < n; i++)
free(entries[i]);
free(entries);
}

/* List a directory's child names, excluding "." and "..", into the out array
* (free with free_dir_snapshot). Returns false on any failure, leaving the
* result empty -- which the caller must treat as distinct from a true return
* with zero entries, since a failure mistaken for "empty" would diff every
* known child as deleted.
*/
static bool dir_snapshot(const char *path, char ***out, int *n_out)
{
*out = NULL;
*n_out = 0;

DIR *d = opendir(path);
if (!d)
return false;

char **names = NULL;
int n = 0, cap = 0;
bool ok = true;
for (;;) {
/* readdir returns NULL both at end-of-stream and on error; reset
* errno immediately before each call so a non-zero errno afterwards
* unambiguously signals a read error rather than EOF.
*/
errno = 0;
struct dirent *de = readdir(d);
if (!de) {
if (errno != 0)
ok = false;
break;
}
if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."))
continue;
if (n == cap) {
int ncap = cap ? cap * 2 : 16;
char **tmp = realloc(names, (size_t) ncap * sizeof(char *));
if (!tmp) {
ok = false;
break;
}
names = tmp;
cap = ncap;
}
names[n] = strdup(de->d_name);
if (!names[n]) {
ok = false;
break;
}
n++;
}
closedir(d);

if (!ok) {
free_dir_snapshot(names, n);
return false;
}

*out = names;
*n_out = n;
return true;
}

static bool snapshot_contains(char *const *entries, int n, const char *name)
{
for (int i = 0; i < n; i++)
if (!strcmp(entries[i], name))
return true;
return false;
}

/* Collect events from kqueue. */

/* Translate one EVFILT_VNODE notification into queued inotify events for the
* watch on host_fd. Returns the number queued, or -1 on buffer overflow (an
* IN_Q_OVERFLOW marker is queued). Caller holds inotify_lock.
*/
static int process_vnode_event(inotify_instance_t *inst,
int host_fd,
uint32_t fflags)
{
int widx = watch_find_by_hostfd(inst, host_fd);
if (widx < 0)
return 0;

inotify_watch_t *w = &inst->watches[widx];
int queued = 0;
bool overflow = false;

if (w->is_dir && (fflags & NOTE_WRITE) && w->path) {
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Filesystem I/O (opendir/readdir) under global inotify_lock causes potential lock contention and latency regression

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At src/syscall/inotify.c, line 379:

<comment>Filesystem I/O (opendir/readdir) under global inotify_lock causes potential lock contention and latency regression</comment>

<file context>
@@ -296,8 +303,133 @@ static void pipe_drain(inotify_instance_t *inst)
+    int queued = 0;
+    bool overflow = false;
+
+    if (w->is_dir && (fflags & NOTE_WRITE) && w->path) {
+        char **now = NULL;
+        int now_n = 0;
</file context>

char **now = NULL;
int now_n = 0;
/* Only diff against -- and advance to -- a snapshot that succeeded.
* On failure keep the previous baseline; the next successful snapshot
* reconciles whatever changed in between.
*/
if (dir_snapshot(w->path, &now, &now_n)) {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stale w->path reads the wrong directory after rename

The snapshot/diff design rests on w->path (captured by strdup(path) at inotify_add_watch time, src/syscall/inotify.c:594) being a valid handle to the directory whose entries the diff should compare. The watch's identity, however, is w->host_fd — an O_EVTONLY fd that follows the inode across renames. Once the two diverge, dir_snapshot(w->path, ...) at src/syscall/inotify.c:409 no longer reads the directory the watch is registered for.

I reproduced both failure modes with two small static aarch64 binaries cross-built and run against build/elfuse at this PR's tip (7fd5187).

Scenario A — old path no longer exists

mkdtemp /tmp/A
inotify_add_watch(fd, /tmp/A, IN_CREATE | IN_DELETE)
rename /tmp/A -> /tmp/A-renamed    /* watched inode moves with the fd */
touch /tmp/A-renamed/child         /* in the watched dir */

mv raises NOTE_RENAME on the watched fd (delivered as IN_MOVE_SELF, correct). The subsequent touch raises NOTE_WRITE on the same fd; process_vnode_event calls dir_snapshot("/tmp/A", ...), opendir returns ENOENT, the function returns false, the keep-baseline branch fires — and no IN_CREATE for child is ever emitted.

Actual output:

watching /tmp/elfuse-renA-KRUJg3 (wd=1)
renamed /tmp/elfuse-renA-KRUJg3 -> /tmp/elfuse-renA-KRUJg3-renamed; old path is now nonexistent
created /tmp/elfuse-renA-KRUJg3-renamed/child in the renamed (watched) dir

IN_CREATE name="child" count: 0 (expected 1 on Linux)
verdict: BUG REPRODUCED (named event lost after rename)

From the caller's perspective the watch silently stops naming children after a rename — exactly the failure mode #55 was opened to fix.

Scenario B — old path is taken by a different inode

mkdtemp /tmp/A
inotify_add_watch(fd, /tmp/A, IN_CREATE | IN_DELETE)
rename /tmp/A -> /tmp/A-renamed    /* watched inode moves with the fd */
mkdir /tmp/A                        /* fresh dir, different inode, unwatched */
touch /tmp/A/decoy                  /* in the new unwatched dir */
touch /tmp/A-renamed/child          /* in the watched dir */

Now opendir("/tmp/A") succeeds, but it reads the new /tmp/A, not the directory the watch tracks. The diff compares the new directory's entries (["decoy"]) to the baseline ([]) and synthesises an IN_CREATE for the decoy. The real child event in the renamed directory never surfaces because the watched dir is never sampled.

Actual output:

step 1: watching /tmp/elfuse-rename-A-nydrPl (wd=1)
step 2: rename /tmp/elfuse-rename-A-nydrPl -> /tmp/elfuse-rename-A-nydrPl-renamed (watched inode moves with the fd)
step 3: mkdir /tmp/elfuse-rename-A-nydrPl again (NEW inode at old path -- unwatched)
step 4a: created /tmp/elfuse-rename-A-nydrPl/decoy in the NEW (unwatched) dir
step 4b: created /tmp/elfuse-rename-A-nydrPl-renamed/child in the WATCHED (renamed) dir
step 5: draining events for ~1s
  event: wd=1 mask=0x00000100 (IN_CREATE) len=8 name=decoy

summary:
  total IN_CREATE: 1
  IN_CREATE name="decoy" (BUG if >0): 1
  IN_CREATE name="child" (expected: 1): 0
verdict: BUG REPRODUCED

The emitted event is objectively wrong on both axes: it names a file in a directory the caller never watched, and it suppresses the named event for the actual child of the watched directory. This isn't theoretical — build systems (yarn/pnpm/Cargo) and test runners routinely mv build → build.old; mkdir build; … for atomic swap; the watch on the old build/ will start fabricating events for the unrelated replacement directory after the swap.

Reproducers

Both binaries are small static aarch64 ELFs cross-built with aarch64-linux-gnu-gcc -static -O2 -D_GNU_SOURCE. Sources attached as Scenario A / Scenario B above; they exit non-zero on the buggy path. A correct fix should make both exit zero (Scenario A: emit IN_CREATE name="child"; Scenario B: emit IN_CREATE name="child" and not name="decoy").

for (int j = 0; j < now_n && !overflow; j++) {
if ((w->mask & IN_CREATE) &&
!snapshot_contains(w->entries, w->n_entries, now[j])) {
if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0)
overflow = true;
else
queued++;
}
}
for (int j = 0; j < w->n_entries && !overflow; j++) {
if ((w->mask & IN_DELETE) &&
!snapshot_contains(now, now_n, w->entries[j])) {
if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) <
0)
overflow = true;
else
queued++;
}
}

/* Advance the snapshot: the directory state has moved on, and any
* names dropped under overflow are covered by IN_Q_OVERFLOW.
*/
free_dir_snapshot(w->entries, w->n_entries);
w->entries = now;
w->n_entries = now_n;
}
}

if (!overflow) {
uint32_t in_mask = notes_to_in_mask(fflags, w->mask, w->is_dir);
/* The per-child create/delete is emitted by the diff above; only emit
* the bare-mask event for file watches or non-create/delete changes.
*/
if (in_mask != 0 &&
!(w->is_dir && (in_mask & (IN_CREATE | IN_DELETE)))) {
if (queue_event(inst, w->wd, in_mask, 0, NULL) < 0)
overflow = true;
else
queued++;
}
}

if (overflow) {
/* IN_Q_OVERFLOW (0x4000) uses wd=-1 per Linux semantics. */
queue_event(inst, -1, 0x4000, 0, NULL);
return -1;
}
return queued;
}

/* Poll the kqueue for pending vnode events and translate them into
* inotify events in the instance buffer. Returns the number of
* events collected.
Expand All @@ -312,35 +472,19 @@ static int collect_events(inotify_instance_t *inst)
return 0;

int collected = 0;
bool overflow = false;
for (int i = 0; i < nev; i++) {
int host_fd = (int) kevs[i].ident;
int widx = watch_find_by_hostfd(inst, host_fd);
if (widx < 0)
continue;

inotify_watch_t *w = &inst->watches[widx];
uint32_t in_mask =
notes_to_in_mask((uint32_t) kevs[i].fflags, w->mask, w->is_dir);
if (in_mask == 0)
continue;

/* Queue event without a filename for file watches. For directory
* watches, inotify emulation also omits the filename since kqueue
* EVFILT_VNODE does not report which child changed.
*/
if (queue_event(inst, w->wd, in_mask, 0, NULL) == 0) {
collected++;
} else {
/* Fixed inotify queue is full; queue IN_Q_OVERFLOW and stop.
* IN_Q_OVERFLOW (0x4000) uses wd=-1 per Linux semantics.
*/
queue_event(inst, -1, 0x4000, 0, NULL);
int r = process_vnode_event(inst, (int) kevs[i].ident,
(uint32_t) kevs[i].fflags);
if (r < 0) {
overflow = true;
break;
}
collected += r;
}

/* Signal the self-pipe so poll/epoll sees readability */
if (collected > 0)
if (collected > 0 || overflow)
pipe_signal(inst);

return collected;
Expand Down Expand Up @@ -438,12 +582,30 @@ int64_t sys_inotify_add_watch(guest_t *g,
/* Strip IN_MASK_ADD control flag before storing */
uint32_t event_mask = mask & ~(uint32_t) IN_MASK_ADD;

/* For directory watches, snapshot the path + current entries up-front
* (outside the lock) so collect_events can diff on each change to emit
* named IN_CREATE/IN_DELETE. Ownership moves to the watch slot on success;
* every early-exit path below frees these.
*/
char *wpath = NULL;
char **wentries = NULL;
int wn = 0;
if (is_dir) {
wpath = strdup(path);
/* Best-effort: a failed listing starts the watch with an empty
* baseline, which is the only state worth recording at add time.
*/
(void) dir_snapshot(path, &wentries, &wn);
}

pthread_mutex_lock(&inotify_lock);

int slot = inotify_find(inotify_fd);
if (slot < 0) {
pthread_mutex_unlock(&inotify_lock);
close(host_fd);
free_dir_snapshot(wentries, wn);
free(wpath);
return -LINUX_EBADF;
}

Expand All @@ -466,8 +628,12 @@ int64_t sys_inotify_add_watch(guest_t *g,
uint32_t snapshot_mask = w->mask; /* Snapshot before unlock */
pthread_mutex_unlock(&inotify_lock);

/* Close the duplicate fd; inotify emulation keeps the original */
/* Close the duplicate fd; inotify emulation keeps the original.
* The existing watch keeps its snapshot; drop this call's copy.
*/
close(host_fd);
free_dir_snapshot(wentries, wn);
free(wpath);

/* Update kevent filter with the new mask (use snapshot --
* w->mask may be modified by another thread after unlock)
Expand All @@ -486,6 +652,8 @@ int64_t sys_inotify_add_watch(guest_t *g,
if (widx < 0) {
pthread_mutex_unlock(&inotify_lock);
close(host_fd);
free_dir_snapshot(wentries, wn);
free(wpath);
return -LINUX_ENOSPC;
}

Expand All @@ -501,6 +669,9 @@ int64_t sys_inotify_add_watch(guest_t *g,
w->is_dir = is_dir;
w->dev = st.st_dev;
w->ino = st.st_ino;
w->path = wpath;
w->entries = wentries;
w->n_entries = wn;

/* Capture kq_fd while under lock */
int kq_fd = inst->kq_fd;
Expand All @@ -521,6 +692,11 @@ int64_t sys_inotify_add_watch(guest_t *g,
pthread_mutex_lock(&inotify_lock);
w->wd = 0;
w->host_fd = 0;
free_dir_snapshot(w->entries, w->n_entries);
w->entries = NULL;
w->n_entries = 0;
free(w->path);
w->path = NULL;
pthread_mutex_unlock(&inotify_lock);
close(host_fd);
errno = saved;
Expand Down Expand Up @@ -555,6 +731,11 @@ int64_t sys_inotify_rm_watch(int inotify_fd, int wd)
w->host_fd = 0;
w->mask = 0;
w->is_dir = 0;
free_dir_snapshot(w->entries, w->n_entries);
w->entries = NULL;
w->n_entries = 0;
free(w->path);
w->path = NULL;
pthread_mutex_unlock(&inotify_lock);

/* Remove from kqueue and close outside lock */
Expand Down Expand Up @@ -619,18 +800,12 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
}
inst = &inotify_state[slot];

/* Process the received event */
/* Process the received event (same named-directory diff as the
* non-blocking collect path).
*/
int host_fd = (int) kev.ident;
int widx = watch_find_by_hostfd(inst, host_fd);
if (widx >= 0) {
inotify_watch_t *w = &inst->watches[widx];
uint32_t in_mask =
notes_to_in_mask((uint32_t) kev.fflags, w->mask, w->is_dir);
if (in_mask != 0) {
queue_event(inst, w->wd, in_mask, 0, NULL);
pipe_signal(inst);
}
}
if (process_vnode_event(inst, host_fd, (uint32_t) kev.fflags) != 0)
pipe_signal(inst);
}
}

Expand Down Expand Up @@ -711,6 +886,11 @@ static void inotify_close(int guest_fd)
watch_fds[nfds++] = inst->watches[i].host_fd;
inst->watches[i].wd = 0;
}
free_dir_snapshot(inst->watches[i].entries, inst->watches[i].n_entries);
inst->watches[i].entries = NULL;
inst->watches[i].n_entries = 0;
free(inst->watches[i].path);
inst->watches[i].path = NULL;
}

inst->guest_fd = -1;
Expand Down
Loading
Loading