From b6e2122ff39be2b3deb51afa7c9519a2627ca720 Mon Sep 17 00:00:00 2001 From: "Hung-Han (Henry) Chen" Date: Fri, 29 May 2026 12:34:13 +0300 Subject: [PATCH 1/2] Emit named IN_CREATE/IN_DELETE for inotify directory watches EVFILT_VNODE reports that a watched directory changed but not which child, so directory events were queued with no name. fsnotify-based consumers (notably the k0s manifest applier, which re-applies only when a *.yaml entry appears) filter on the entry name and silently drop nameless events, so manifests written after the watch was established were never picked up. Keep a per-watch snapshot of the directory's entry names; on each NOTE_WRITE re-list the directory and diff against the snapshot to emit a named IN_CREATE per added child and IN_DELETE per removed one, matching real inotify semantics. The blocking-read and non-blocking collect paths share one process_vnode_event() helper. The snapshot is allocated on add_watch and freed on rm_watch and inotify_close. Add a regression test (test-inotify Test 6) that watches a fresh directory, creates a child, and asserts a named IN_CREATE for it is delivered; this fails before the fix (the event arrives without a name). Validated with make check on Apple Silicon. (cherry picked from commit 2a5fa28b5257ceb5ed116c231aecc7c4a2ef54ab) --- src/syscall/inotify.c | 219 +++++++++++++++++++++++++++++++++++------- tests/test-inotify.c | 70 ++++++++++++++ 2 files changed, 254 insertions(+), 35 deletions(-) diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c index d9b54dd..87f5b60 100644 --- a/src/syscall/inotify.c +++ b/src/syscall/inotify.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include #include @@ -83,6 +85,11 @@ typedef struct { bool is_dir; /* true if watching a directory */ dev_t dev; /* Device ID (for re-add lookup by inode) */ ino_t ino; /* Inode number (for re-add lookup by inode) */ + /* Dir watches only: path + entry-name snapshot, diffed on change to + * recover the child name kqueue omits. NULL/0 for file watches. */ + char *path; + char **entries; + int n_entries; } inotify_watch_t; typedef struct { @@ -296,8 +303,133 @@ static void pipe_drain(inotify_instance_t *inst) ; } +/* Snapshot the entry names of a directory (excluding "." and ".."). On return + * *out is a malloc'd array of malloc'd strings with *n_out entries (free with + * free_dir_snapshot). On any failure the snapshot is left empty. + */ +static void dir_snapshot(const char *path, char ***out, int *n_out) +{ + *out = NULL; + *n_out = 0; + + DIR *d = opendir(path); + if (!d) + return; + + char **names = NULL; + int n = 0, cap = 0; + struct dirent *de; + while ((de = readdir(d)) != NULL) { + if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, "..")) + continue; + if (n == cap) { + int ncap = cap ? cap * 2 : 16; + char **tmp = realloc(names, (size_t) ncap * sizeof(char *)); + if (!tmp) + break; + names = tmp; + cap = ncap; + } + names[n] = strdup(de->d_name); + if (!names[n]) + break; + n++; + } + closedir(d); + + *out = names; + *n_out = n; +} + +static void free_dir_snapshot(char **entries, int n) +{ + if (!entries) + return; + for (int i = 0; i < n; i++) + free(entries[i]); + free(entries); +} + +static bool snapshot_contains(char *const *entries, int n, const char *name) +{ + for (int i = 0; i < n; i++) + if (!strcmp(entries[i], name)) + return true; + return false; +} + /* Collect events from kqueue. */ +/* Translate one EVFILT_VNODE notification into queued inotify events for the + * watch on host_fd. Returns the number queued, or -1 on buffer overflow (an + * IN_Q_OVERFLOW marker is queued). Caller holds inotify_lock. + */ +static int process_vnode_event(inotify_instance_t *inst, + int host_fd, + uint32_t fflags) +{ + int widx = watch_find_by_hostfd(inst, host_fd); + if (widx < 0) + return 0; + + inotify_watch_t *w = &inst->watches[widx]; + int queued = 0; + bool overflow = false; + + if (w->is_dir && (fflags & NOTE_WRITE) && w->path) { + char **now = NULL; + int now_n = 0; + dir_snapshot(w->path, &now, &now_n); + + for (int j = 0; j < now_n && !overflow; j++) { + if ((w->mask & IN_CREATE) && + !snapshot_contains(w->entries, w->n_entries, now[j])) { + if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0) + overflow = true; + else + queued++; + } + } + for (int j = 0; j < w->n_entries && !overflow; j++) { + if ((w->mask & IN_DELETE) && + !snapshot_contains(now, now_n, w->entries[j])) { + if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < 0) + overflow = true; + else + queued++; + } + } + + /* Advance the snapshot regardless: the directory state has moved on, + * and any names dropped under overflow are covered by IN_Q_OVERFLOW. + */ + free_dir_snapshot(w->entries, w->n_entries); + w->entries = now; + w->n_entries = now_n; + } + + if (!overflow) { + uint32_t in_mask = notes_to_in_mask(fflags, w->mask, w->is_dir); + /* The per-child create/delete is emitted by the diff above; only emit + * the bare-mask event for file watches or non-create/delete changes. + */ + if (in_mask != 0 && + !(w->is_dir && (in_mask & (IN_CREATE | IN_DELETE)))) { + if (queue_event(inst, w->wd, in_mask, 0, NULL) < 0) + overflow = true; + else + queued++; + } + } + + if (overflow) { + /* IN_Q_OVERFLOW (0x4000) uses wd=-1 per Linux semantics. */ + queue_event(inst, -1, 0x4000, 0, NULL); + return -1; + } + return queued; +} + /* Poll the kqueue for pending vnode events and translate them into * inotify events in the instance buffer. Returns the number of * events collected. @@ -312,35 +444,19 @@ static int collect_events(inotify_instance_t *inst) return 0; int collected = 0; + bool overflow = false; for (int i = 0; i < nev; i++) { - int host_fd = (int) kevs[i].ident; - int widx = watch_find_by_hostfd(inst, host_fd); - if (widx < 0) - continue; - - inotify_watch_t *w = &inst->watches[widx]; - uint32_t in_mask = - notes_to_in_mask((uint32_t) kevs[i].fflags, w->mask, w->is_dir); - if (in_mask == 0) - continue; - - /* Queue event without a filename for file watches. For directory - * watches, inotify emulation also omits the filename since kqueue - * EVFILT_VNODE does not report which child changed. - */ - if (queue_event(inst, w->wd, in_mask, 0, NULL) == 0) { - collected++; - } else { - /* Fixed inotify queue is full; queue IN_Q_OVERFLOW and stop. - * IN_Q_OVERFLOW (0x4000) uses wd=-1 per Linux semantics. - */ - queue_event(inst, -1, 0x4000, 0, NULL); + int r = process_vnode_event(inst, (int) kevs[i].ident, + (uint32_t) kevs[i].fflags); + if (r < 0) { + overflow = true; break; } + collected += r; } /* Signal the self-pipe so poll/epoll sees readability */ - if (collected > 0) + if (collected > 0 || overflow) pipe_signal(inst); return collected; @@ -438,12 +554,27 @@ int64_t sys_inotify_add_watch(guest_t *g, /* Strip IN_MASK_ADD control flag before storing */ uint32_t event_mask = mask & ~(uint32_t) IN_MASK_ADD; + /* For directory watches, snapshot the path + current entries up-front + * (outside the lock) so collect_events can diff on each change to emit + * named IN_CREATE/IN_DELETE. Ownership moves to the watch slot on success; + * every early-exit path below frees these. + */ + char *wpath = NULL; + char **wentries = NULL; + int wn = 0; + if (is_dir) { + wpath = strdup(path); + dir_snapshot(path, &wentries, &wn); + } + pthread_mutex_lock(&inotify_lock); int slot = inotify_find(inotify_fd); if (slot < 0) { pthread_mutex_unlock(&inotify_lock); close(host_fd); + free_dir_snapshot(wentries, wn); + free(wpath); return -LINUX_EBADF; } @@ -466,8 +597,12 @@ int64_t sys_inotify_add_watch(guest_t *g, uint32_t snapshot_mask = w->mask; /* Snapshot before unlock */ pthread_mutex_unlock(&inotify_lock); - /* Close the duplicate fd; inotify emulation keeps the original */ + /* Close the duplicate fd; inotify emulation keeps the original. + * The existing watch keeps its snapshot; drop this call's copy. + */ close(host_fd); + free_dir_snapshot(wentries, wn); + free(wpath); /* Update kevent filter with the new mask (use snapshot -- * w->mask may be modified by another thread after unlock) @@ -486,6 +621,8 @@ int64_t sys_inotify_add_watch(guest_t *g, if (widx < 0) { pthread_mutex_unlock(&inotify_lock); close(host_fd); + free_dir_snapshot(wentries, wn); + free(wpath); return -LINUX_ENOSPC; } @@ -501,6 +638,9 @@ int64_t sys_inotify_add_watch(guest_t *g, w->is_dir = is_dir; w->dev = st.st_dev; w->ino = st.st_ino; + w->path = wpath; + w->entries = wentries; + w->n_entries = wn; /* Capture kq_fd while under lock */ int kq_fd = inst->kq_fd; @@ -521,6 +661,11 @@ int64_t sys_inotify_add_watch(guest_t *g, pthread_mutex_lock(&inotify_lock); w->wd = 0; w->host_fd = 0; + free_dir_snapshot(w->entries, w->n_entries); + w->entries = NULL; + w->n_entries = 0; + free(w->path); + w->path = NULL; pthread_mutex_unlock(&inotify_lock); close(host_fd); errno = saved; @@ -555,6 +700,11 @@ int64_t sys_inotify_rm_watch(int inotify_fd, int wd) w->host_fd = 0; w->mask = 0; w->is_dir = 0; + free_dir_snapshot(w->entries, w->n_entries); + w->entries = NULL; + w->n_entries = 0; + free(w->path); + w->path = NULL; pthread_mutex_unlock(&inotify_lock); /* Remove from kqueue and close outside lock */ @@ -619,18 +769,12 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) } inst = &inotify_state[slot]; - /* Process the received event */ + /* Process the received event (same named-directory diff as the + * non-blocking collect path). + */ int host_fd = (int) kev.ident; - int widx = watch_find_by_hostfd(inst, host_fd); - if (widx >= 0) { - inotify_watch_t *w = &inst->watches[widx]; - uint32_t in_mask = - notes_to_in_mask((uint32_t) kev.fflags, w->mask, w->is_dir); - if (in_mask != 0) { - queue_event(inst, w->wd, in_mask, 0, NULL); - pipe_signal(inst); - } - } + if (process_vnode_event(inst, host_fd, (uint32_t) kev.fflags) != 0) + pipe_signal(inst); } } @@ -711,6 +855,11 @@ static void inotify_close(int guest_fd) watch_fds[nfds++] = inst->watches[i].host_fd; inst->watches[i].wd = 0; } + free_dir_snapshot(inst->watches[i].entries, inst->watches[i].n_entries); + inst->watches[i].entries = NULL; + inst->watches[i].n_entries = 0; + free(inst->watches[i].path); + inst->watches[i].path = NULL; } inst->guest_fd = -1; diff --git a/tests/test-inotify.c b/tests/test-inotify.c index 02deee0..36c392e 100644 --- a/tests/test-inotify.c +++ b/tests/test-inotify.c @@ -15,7 +15,10 @@ #include #include +#include #include +#include +#include #include #include #include @@ -188,6 +191,72 @@ static void test_dir_create(void) close(fd); } +/* Test 6: a directory watch must deliver a NAMED IN_CREATE for the new child, + * not just a nameless event. + */ +static void test_dir_create_named(void) +{ + TEST("watch dir delivers named IN_CREATE"); + + char dir[] = "/tmp/elfuse-inotify-dir-XXXXXX"; + if (!mkdtemp(dir)) { + FAIL("mkdtemp"); + return; + } + + int fd = inotify_init1(IN_NONBLOCK); + if (fd < 0) { + FAIL("inotify_init1"); + rmdir(dir); + return; + } + + int wd = inotify_add_watch(fd, dir, IN_CREATE); + if (wd < 0) { + FAIL("inotify_add_watch"); + close(fd); + rmdir(dir); + return; + } + + const char *child = "manifest.yaml"; + char path[300]; + snprintf(path, sizeof(path), "%s/%s", dir, child); + int tfd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644); + if (tfd < 0) { + FAIL("create child in watched dir"); + inotify_rm_watch(fd, wd); + close(fd); + rmdir(dir); + return; + } + close(tfd); + + /* read() drives the diff; retry until the vnode notification is queued. */ + bool found = false; + for (int attempt = 0; attempt < 50 && !found; attempt++) { + char buf[1024]; + ssize_t n = read(fd, buf, sizeof(buf)); + for (ssize_t off = 0; + off + (ssize_t) sizeof(struct inotify_event) <= n;) { + struct inotify_event *ev = (struct inotify_event *) (buf + off); + if ((ev->mask & IN_CREATE) && ev->len > 0 && + strcmp(ev->name, child) == 0) + found = true; + off += (ssize_t) sizeof(struct inotify_event) + ev->len; + } + if (!found) + usleep(20000); /* 20ms */ + } + + EXPECT_TRUE(found, "named IN_CREATE for new child not delivered"); + + unlink(path); + inotify_rm_watch(fd, wd); + close(fd); + rmdir(dir); +} + /* Main */ int main(void) @@ -199,6 +268,7 @@ int main(void) test_modify_event(); test_nonblock(); test_dir_create(); + test_dir_create_named(); SUMMARY("test-inotify"); return fails > 0 ? 1 : 0; From 7fd5187ca2e3c6e041dcc33bacb5ff1db9acb3f7 Mon Sep 17 00:00:00 2001 From: "Hung-Han (Henry) Chen" Date: Sat, 30 May 2026 13:54:12 +0300 Subject: [PATCH 2/2] Keep inotify directory baseline when a snapshot fails A directory watch snapshots the child-name set on each change and diffs it against the previous baseline to recover the name kqueue omits. dir_snapshot returned an empty list on any failure (opendir error, a mid-read readdir error, or an allocation failure), which the diff could not tell apart from a genuinely empty directory. On a transient failure the IN_DELETE pass then saw every known child as missing and queued a spurious IN_DELETE for each, and the baseline was overwritten with the empty list, so every later change re-reported the whole directory as IN_CREATE. The corruption was permanent. Make dir_snapshot return bool: on failure it frees any partial result and reports false, and readdir read errors are now detected by clearing errno before each call. process_vnode_event only diffs against and advances to a snapshot that succeeded; on failure it keeps the previous baseline so the next successful snapshot reconciles whatever changed in between. The watch-add path documents that a failed initial snapshot is a best-effort empty baseline. Validated with make check on Apple Silicon; test-inotify still passes 6/6. --- src/syscall/inotify.c | 117 ++++++++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 43 deletions(-) diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c index 87f5b60..94b94f0 100644 --- a/src/syscall/inotify.c +++ b/src/syscall/inotify.c @@ -303,51 +303,74 @@ static void pipe_drain(inotify_instance_t *inst) ; } -/* Snapshot the entry names of a directory (excluding "." and ".."). On return - * *out is a malloc'd array of malloc'd strings with *n_out entries (free with - * free_dir_snapshot). On any failure the snapshot is left empty. +static void free_dir_snapshot(char **entries, int n) +{ + if (!entries) + return; + for (int i = 0; i < n; i++) + free(entries[i]); + free(entries); +} + +/* List a directory's child names, excluding "." and "..", into the out array + * (free with free_dir_snapshot). Returns false on any failure, leaving the + * result empty -- which the caller must treat as distinct from a true return + * with zero entries, since a failure mistaken for "empty" would diff every + * known child as deleted. */ -static void dir_snapshot(const char *path, char ***out, int *n_out) +static bool dir_snapshot(const char *path, char ***out, int *n_out) { *out = NULL; *n_out = 0; DIR *d = opendir(path); if (!d) - return; + return false; char **names = NULL; int n = 0, cap = 0; - struct dirent *de; - while ((de = readdir(d)) != NULL) { + bool ok = true; + for (;;) { + /* readdir returns NULL both at end-of-stream and on error; reset + * errno immediately before each call so a non-zero errno afterwards + * unambiguously signals a read error rather than EOF. + */ + errno = 0; + struct dirent *de = readdir(d); + if (!de) { + if (errno != 0) + ok = false; + break; + } if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, "..")) continue; if (n == cap) { int ncap = cap ? cap * 2 : 16; char **tmp = realloc(names, (size_t) ncap * sizeof(char *)); - if (!tmp) + if (!tmp) { + ok = false; break; + } names = tmp; cap = ncap; } names[n] = strdup(de->d_name); - if (!names[n]) + if (!names[n]) { + ok = false; break; + } n++; } closedir(d); + if (!ok) { + free_dir_snapshot(names, n); + return false; + } + *out = names; *n_out = n; -} - -static void free_dir_snapshot(char **entries, int n) -{ - if (!entries) - return; - for (int i = 0; i < n; i++) - free(entries[i]); - free(entries); + return true; } static bool snapshot_contains(char *const *entries, int n, const char *name) @@ -379,33 +402,38 @@ static int process_vnode_event(inotify_instance_t *inst, if (w->is_dir && (fflags & NOTE_WRITE) && w->path) { char **now = NULL; int now_n = 0; - dir_snapshot(w->path, &now, &now_n); - - for (int j = 0; j < now_n && !overflow; j++) { - if ((w->mask & IN_CREATE) && - !snapshot_contains(w->entries, w->n_entries, now[j])) { - if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0) - overflow = true; - else - queued++; + /* Only diff against -- and advance to -- a snapshot that succeeded. + * On failure keep the previous baseline; the next successful snapshot + * reconciles whatever changed in between. + */ + if (dir_snapshot(w->path, &now, &now_n)) { + for (int j = 0; j < now_n && !overflow; j++) { + if ((w->mask & IN_CREATE) && + !snapshot_contains(w->entries, w->n_entries, now[j])) { + if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0) + overflow = true; + else + queued++; + } } - } - for (int j = 0; j < w->n_entries && !overflow; j++) { - if ((w->mask & IN_DELETE) && - !snapshot_contains(now, now_n, w->entries[j])) { - if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < 0) - overflow = true; - else - queued++; + for (int j = 0; j < w->n_entries && !overflow; j++) { + if ((w->mask & IN_DELETE) && + !snapshot_contains(now, now_n, w->entries[j])) { + if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < + 0) + overflow = true; + else + queued++; + } } - } - /* Advance the snapshot regardless: the directory state has moved on, - * and any names dropped under overflow are covered by IN_Q_OVERFLOW. - */ - free_dir_snapshot(w->entries, w->n_entries); - w->entries = now; - w->n_entries = now_n; + /* Advance the snapshot: the directory state has moved on, and any + * names dropped under overflow are covered by IN_Q_OVERFLOW. + */ + free_dir_snapshot(w->entries, w->n_entries); + w->entries = now; + w->n_entries = now_n; + } } if (!overflow) { @@ -564,7 +592,10 @@ int64_t sys_inotify_add_watch(guest_t *g, int wn = 0; if (is_dir) { wpath = strdup(path); - dir_snapshot(path, &wentries, &wn); + /* Best-effort: a failed listing starts the watch with an empty + * baseline, which is the only state worth recording at add time. + */ + (void) dir_snapshot(path, &wentries, &wn); } pthread_mutex_lock(&inotify_lock);