From d0b0c38dcc9fe0a6ed5e17ea2d361b9a31f0b5e5 Mon Sep 17 00:00:00 2001
From: Horst Birthelmer <hbirthelmer@ddn.com>
Date: Mon, 22 Jun 2026 08:54:19 +0200
Subject: [PATCH 01/12] fuse: drop BDI_CAP_STRICTLIMIT from fuse bdi setup

Signed-off-by: Horst Birthelmer <hbirthelmer@ddn.com>
---
 fs/fuse/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6a94de9528210f..de104cb029c6d3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1834,7 +1834,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 
 	/* fuse does it's own writeback accounting */
 	sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
-	sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+	sb->s_bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
 
 	/*
 	 * For a single fuse filesystem use max 1% of dirty +

From f218446ea97e8bb935f1b3892422485ef36d544f Mon Sep 17 00:00:00 2001
From: Horst Birthelmer <hbirthelmer@ddn.com>
Date: Mon, 22 Jun 2026 08:54:40 +0200
Subject: [PATCH 02/12] fuse: use writethrough for writes matching the server
 alignment

Writes that already match the alignment advertised via
FUSE_ALIGN_PG_ORDER gain nothing from the writeback cache and can
degrade into page-sized WRITE requests under dirty throttling.  Send
them through fuse_perform_write() instead, which packs requests up to
max_write and keeps them stripe-aligned for the backend.  They create
no dirty pages, so no DLM write lock needs to be cached for them.
Unaligned writes keep using the writeback cache.

Also clarify in the uapi header that align_page_order is the log2 of
the alignment in bytes, not in pages.

Ported from the redfs-ubuntu-noble-writethrough-split branch and
adapted to the iomap-based writeback path: the decision gates the
writeback bool in fuse_cache_write_iter() (and the DLM write-lock
acquisition) instead of branching to a writethrough label.

Signed-off-by: Horst Birthelmer <hbirthelmer@ddn.com>
---
 fs/fuse/file.c            | 40 +++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/fuse.h |  7 ++++---
 2 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 467093acadbb89..2aa0040fa22398 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1666,6 +1666,41 @@ static ssize_t fuse_writeback_write_iter(struct kiocb *iocb,
 	return written < 0 ? written : total_written;
 }
 
+/*
+ * With writeback caching the request size seen by the server depends on
+ * how many contiguous dirty pages the flusher finds, which is bounded by
+ * dirty throttling: with BDI_CAP_STRICTLIMIT the dirty window can degrade
+ * to a single page under streaming writes, turning large application
+ * writes into page-sized requests.
+ *
+ * Writes that already match the server's preferred alignment gain
+ * nothing from accumulating in the page cache, so send them through
+ * fuse_perform_write() instead, which packs requests up to max_write.
+ * They create no dirty pages, hence no DLM write lock needs to be cached
+ * for them.  Unaligned writes keep using the writeback cache, where they
+ * can merge with neighbouring data.
+ */
+static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
+				     struct iov_iter *from)
+{
+	size_t count = iov_iter_count(from);
+	u64 align;
+	bool ret;
+
+	if (!fc->big_writes)
+		return true;
+
+	/* these rely on the semantics of their current paths */
+	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND | IOCB_NOWAIT))
+		return true;
+
+	align = fc->alignment_pages ?
+		(u64)fc->alignment_pages << PAGE_SHIFT : PAGE_SIZE;
+
+	ret = !IS_ALIGNED(iocb->ki_pos, align) || !IS_ALIGNED((u64)count, align);
+	return ret;
+}
+
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
@@ -1684,8 +1719,9 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		if (err)
 			return err;
 
-		if (!fc->handle_killpriv_v2 ||
-		    !setattr_should_drop_suidgid(idmap, file_inode(file))) {
+		if ((!fc->handle_killpriv_v2 ||
+		     !setattr_should_drop_suidgid(idmap, file_inode(file))) &&
+		    fuse_use_writeback_cache(fc, iocb, from)) {
 			writeback = true;
 
 			/*
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 23374de18e2fd9..0493b39af4adbb 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -445,8 +445,8 @@ struct fuse_file_lock {
  *			 init_out.request_timeout contains the timeout (in secs)
  * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation
  * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation
- * FUSE_ALIGN_PG_ORDER: page order (power of 2 exponent for number of pages) for
- *			optimal io-size alignment
+ * FUSE_ALIGN_PG_ORDER: alignment order (power of 2 exponent of the IO size
+ *			in bytes) for optimal io-size alignment
  * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free
  *			 to register between 1 and nr-core io-uring queues
  */
@@ -947,7 +947,8 @@ struct fuse_init_in {
 #define FUSE_COMPAT_22_INIT_OUT_SIZE 24
 
 /*
- * align_page_order: Number of pages for optimal IO, or a multiple of that
+ * align_page_order: log2 of the optimal IO size in bytes; IO is optimal
+ * when sized and aligned to (1 << align_page_order) or a multiple of it
  */
 struct fuse_init_out {
 	uint32_t	major;

From 37ce0b03b2137c6e6d1896a4787b129f5ead7534 Mon Sep 17 00:00:00 2001
From: Horst Birthelmer <hbirthelmer@ddn.com>
Date: Mon, 22 Jun 2026 08:54:58 +0200
Subject: [PATCH 03/12] fuse: add writethrough_threshold knob to bypass the
 writeback cache

Add a per-connection size threshold, settable via fusectl as
writethrough_threshold, that sends buffered writes >= threshold
through fuse_perform_write() regardless of alignment.

The knob is off by default (0 == disabled) and leaves the existing
alignment-based decision in place for writes below the threshold.

Ported from the redfs-ubuntu-noble-writethrough-split branch; the
fusectl dentry uses this branch's fuse_ctl_add_dentry() signature and
the ops struct omits the now-removed no_llseek.

Signed-off-by: Horst Birthelmer <hbirthelmer@ddn.com>
---
 fs/fuse/control.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/fuse/file.c    |  8 ++++++
 fs/fuse/fuse_i.h  |  5 +++-
 3 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 5247df896c5d01..0ce4d791763289 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -203,6 +203,67 @@ static const struct file_operations fuse_conn_congestion_threshold_ops = {
 	.write = fuse_conn_congestion_threshold_write,
 };
 
+static ssize_t fuse_conn_writethrough_threshold_read(struct file *file,
+						     char __user *buf,
+						     size_t len, loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	unsigned val;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return 0;
+
+	val = READ_ONCE(fc->writethrough_threshold);
+	fuse_conn_put(fc);
+
+	return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_writethrough_threshold_write(struct file *file,
+						      const char __user *buf,
+						      size_t count, loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	char kbuf[32];
+	unsigned long long val;
+	char *end;
+
+	if (*ppos)
+		return -EINVAL;
+	if (count == 0 || count >= sizeof(kbuf))
+		return -EINVAL;
+	if (copy_from_user(kbuf, buf, count))
+		return -EFAULT;
+	kbuf[count] = '\0';
+
+	/* memparse accepts a bare suffix without a digit; require a digit */
+	if (kbuf[0] < '0' || kbuf[0] > '9')
+		return -EINVAL;
+
+	val = memparse(kbuf, &end);
+	end = skip_spaces(end);
+	if (*end)
+		return -EINVAL;
+	if (val > UINT_MAX)
+		return -EINVAL;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return -ENOENT;
+
+	WRITE_ONCE(fc->writethrough_threshold, (unsigned int)val);
+	fuse_conn_put(fc);
+
+	return count;
+}
+
+static const struct file_operations fuse_conn_writethrough_threshold_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_writethrough_threshold_read,
+	.write = fuse_conn_writethrough_threshold_write,
+};
+
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 					  struct fuse_conn *fc,
 					  const char *name, int mode,
@@ -269,7 +330,10 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 				 NULL, &fuse_conn_max_background_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
 				 S_IFREG | 0600, NULL,
-				 &fuse_conn_congestion_threshold_ops))
+				 &fuse_conn_congestion_threshold_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "writethrough_threshold",
+				 S_IFREG | 0600, NULL,
+				 &fuse_conn_writethrough_threshold_ops))
 		goto err;
 
 	return 0;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2aa0040fa22398..9b7f30bb015f08 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1679,11 +1679,15 @@ static ssize_t fuse_writeback_write_iter(struct kiocb *iocb,
  * They create no dirty pages, hence no DLM write lock needs to be cached
  * for them.  Unaligned writes keep using the writeback cache, where they
  * can merge with neighbouring data.
+ *
+ * A non-zero writethrough_threshold additionally forces any write at or
+ * above that size through fuse_perform_write() regardless of alignment.
  */
 static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
 				     struct iov_iter *from)
 {
 	size_t count = iov_iter_count(from);
+	unsigned int wt;
 	u64 align;
 	bool ret;
 
@@ -1694,6 +1698,10 @@ static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
 	if (iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND | IOCB_NOWAIT))
 		return true;
 
+	wt = READ_ONCE(fc->writethrough_threshold);
+	if (wt && count >= wt)
+		return false;
+
 	align = fc->alignment_pages ?
 		(u64)fc->alignment_pages << PAGE_SHIFT : PAGE_SIZE;
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 365c28cb282146..0c7196270538d2 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -47,7 +47,7 @@
 #define FUSE_NAME_MAX (PATH_MAX - 1)
 
 /** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 5
+#define FUSE_CTL_NUM_DENTRIES 6
 
 /* Frequency (in seconds) of request timeout checks, if opted into */
 #define FUSE_TIMEOUT_TIMER_FREQ 15
@@ -1045,6 +1045,9 @@ struct fuse_conn {
 	/* The foffset alignment in PAGE */
 	unsigned int alignment_pages;
 
+	/* Buffered writes >= this size bypass the writeback cache (0 = off) */
+	unsigned int writethrough_threshold;
+
 	/**
 	 * XArray tracking tasks that need DLM retry.
 	 * Maps task pointer -> struct fuse_dlm_retry.

From cf708adbc495c33aa9888bd9649f44606ab87c37 Mon Sep 17 00:00:00 2001
From: Horst Birthelmer <hbirthelmer@ddn.com>
Date: Mon, 22 Jun 2026 15:14:46 +0200
Subject: [PATCH 04/12] fuse: bound folio order to the per-request page limit

fuse_readahead() batches whole folios into a single request, capped at
min(fc->max_pages, fc->max_read/PAGE_SIZE) pages, but fuse_init_file_inode()
let the page cache build folios up to MAX_PAGECACHE_ORDER. A large
sequential read could thus produce a folio bigger than one request can
carry: the first loop iteration took the folio_pages > cur_pages path,
fired WARN_ON(!pages), and broke with ap->num_folios == 0.
fuse_send_readpages() was still called and dereferenced a NULL
ap->folios[0] via folio_pos(), oopsing at CR2=0x20 (folio->index).

Cap the folio order to the per-request page limit so the page cache can
never build an unserviceable folio.

Signed-off-by: Horst Birthelmer <hbirthelmer@ddn.com>
---
 fs/fuse/file.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9b7f30bb015f08..f37038a6401748 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3550,6 +3550,20 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
 	if (IS_ENABLED(CONFIG_FUSE_DAX))
 		fuse_dax_inode_init(inode, flags);
 
-	if (enable_large_folios)
-		mapping_set_large_folios(inode->i_mapping);
+	if (enable_large_folios) {
+		/*
+		 * Readahead and writeback batch whole folios into a single
+		 * request, capped at min(fc->max_pages, fc->max_read/PAGE_SIZE)
+		 * pages.  The page cache must therefore never build a folio
+		 * larger than that, or fuse_readahead() trips WARN_ON(!pages)
+		 * and then dereferences a NULL ap->folios[0] in
+		 * fuse_send_readpages().  Bound the folio order to the request
+		 * limit instead of MAX_PAGECACHE_ORDER.
+		 */
+		unsigned int max_pages = min(fc->max_pages,
+					     fc->max_read >> PAGE_SHIFT);
+
+		mapping_set_folio_order_range(inode->i_mapping, 0,
+					      ilog2(max_pages ?: 1));
+	}
 }

From a29cc8338d4257ca5e9a075442419e4e9bf6b241 Mon Sep 17 00:00:00 2001
From: Jim Harris <jim.harris@nvidia.com>
Date: Mon, 22 Jun 2026 17:42:27 -0700
Subject: [PATCH 05/12] fuse: allow larger read requests by setting
 bdi->io_pages

A FUSE server that advertises a large max_pages and max_write (e.g.
max_pages=256, max_write=1MB) cannot currently obtain matching
FUSE_READ request sizes from the kernel.  Buffered sequential writes
arrive at the server at the negotiated max_write size, but a large
buffered read() is split into several smaller FUSE_READ requests.

For a buffered read, filemap_get_pages() -> page_cache_sync_ra() sizes
the read against ractl_max_pages():

	max_pages = ractl->ra->ra_pages;
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

fuse leaves bdi->io_pages at the default VM_READAHEAD_PAGES (128KB), so
a 1MB read() (req_size = 256 pages) is clamped to the readahead window
(128KB, or 256KB for POSIX_FADV_SEQUENTIAL), producing four 256KB
FUSE_READ round-trips instead of one.

Set bdi->io_pages to fc->max_pages after feature negotiation.  As the
code above shows, io_pages only raises the limit when the request size
already exceeds the readahead window, so it enlarges explicitly
requested reads without enlarging the speculative readahead window.
This avoids increasing speculative page-cache readahead on behalf of
an unprivileged server.  NFS does the same, setting io_pages from
rpages while leaving ra_pages at the default.

fc->max_pages is already bounded by fc->max_pages_limit (and, for
virtio-fs, by the virtqueue descriptor count), so io_pages inherits
the same bound.

Suggested-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Jim Harris <jim.harris@nvidia.com>
Assisted-by: Cursor:claude-opus-4.8
---
 fs/fuse/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index de104cb029c6d3..70cf89c9fd6bd9 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1704,6 +1704,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 		else
 			fm->sb->s_bdi->ra_pages =
 				min(fm->sb->s_bdi->ra_pages, ra_pages);
+		fm->sb->s_bdi->io_pages = fc->max_pages;
 		fc->minor = arg->minor;
 		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);

From 48345b51f9c3e0203d545a0a279a9048e7507931 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Mon, 7 Jul 2025 16:46:05 -0700
Subject: [PATCH 06/12] fuse: use default writeback accounting

commit 0c58a97f919c ("fuse: remove tmp folio for writebacks and internal
rb tree") removed temp folios for dirty page writeback. Consequently,
fuse can now use the default writeback accounting.

With switching fuse to use default writeback accounting, there are some
added benefits. This updates wb->writeback_inodes tracking as well now
and updates writeback throughput estimates after writeback completion.

This commit also removes inc_wb_stat() and dec_wb_stat(). These have no
callers anymore now that fuse does not call them.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Bernd Schubert <bschubert@ddn.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
(cherry picked from commit 494d2f508883a6e5c4530e5c6b3c8b2bbfb7318d)
---
 fs/fuse/file.c              |  9 +--------
 fs/fuse/inode.c             |  8 ++++++--
 include/linux/backing-dev.h | 10 ----------
 3 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f37038a6401748..6586a87105b010 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2158,19 +2158,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
 	struct fuse_args_pages *ap = &wpa->ia.ap;
 	struct inode *inode = wpa->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	int i;
 
-	for (i = 0; i < ap->num_folios; i++) {
+	for (i = 0; i < ap->num_folios; i++)
 		/*
 		 * Benchmarks showed that ending writeback within the
 		 * scope of the fi->lock alleviates xarray lock
 		 * contention and noticeably improves performance.
 		 */
 		iomap_finish_folio_write(inode, ap->folios[i], 1);
-		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-		wb_writeout_inc(&bdi->wb);
-	}
 
 	wake_up(&fi->page_waitq);
 }
@@ -2345,14 +2341,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
 static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
 					  uint32_t folio_index, loff_t offset, unsigned len)
 {
-	struct inode *inode = folio->mapping->host;
 	struct fuse_args_pages *ap = &wpa->ia.ap;
 
 	ap->folios[folio_index] = folio;
 	ap->descs[folio_index].offset = offset;
 	ap->descs[folio_index].length = len;
-
-	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 }
 
 static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 70cf89c9fd6bd9..66baa32d60c29e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1833,8 +1833,12 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 	if (err)
 		return err;
 
-	/* fuse does it's own writeback accounting */
-	sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
+	/*
+	 * fuse uses the default (core) writeback accounting now that the manual
+	 * WB_WRITEBACK updates are gone, so leave BDI_CAP_WRITEBACK_ACCT set:
+	 * clearing it would zero the per-bdi writeback stats and writeout
+	 * fraction that balance_dirty_pages() throttles on.
+	 */
 	sb->s_bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
 
 	/*
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e721148c95d07d..9a1e895dd5df1b 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -66,16 +66,6 @@ static inline void wb_stat_mod(struct bdi_writeback *wb,
 	percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
 }
 
-static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
-{
-	wb_stat_mod(wb, item, 1);
-}
-
-static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
-{
-	wb_stat_mod(wb, item, -1);
-}
-
 static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	return percpu_counter_read_positive(&wb->stat[item]);

From 7c1c4923e40e06fad5e9aee7dcf26e401594d6fb Mon Sep 17 00:00:00 2001
From: Horst Birthelmer <hbirthelmer@ddn.com>
Date: Mon, 29 Jun 2026 12:11:30 +0200
Subject: [PATCH 07/12] fuse: switch multi-writer inodes to direct IO on lock
 contention

When several entities open the same file for writing, the cached write
path (fuse_cache_write_iter()) serializes them on the exclusive inode
lock, which is held across synchronous server round-trips.

Add an opt-in per-connection knob (force_dio_on_contention, off by
default, exposed via the fuse control filesystem).  When enabled, an
inode opened by a second writer is latched into direct IO: reads and
writes use the parallel shared-lock dio path, the page cache is flushed
and dropped at open (reusing the O_TRUNC scaffolding), and new opens
skip caching mode.  The latch is cleared when the last writer closes and
reverted to caching mode on mmap.

Detection uses fi->write_files so only writers count: a reader opening
alongside a single writer does not trigger the switch.

Signed-off-by: Horst Birthelmer <hbirthelmer@ddn.com>
---
 fs/fuse/control.c |  63 ++++++++++++++++++++++++-
 fs/fuse/file.c    | 118 +++++++++++++++++++++++++++++++++++++++++-----
 fs/fuse/fuse_i.h  |  48 +++++++++++++++++++
 fs/fuse/inode.c   |   1 +
 fs/fuse/iomode.c  |  10 ++++
 5 files changed, 226 insertions(+), 14 deletions(-)

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 0ce4d791763289..10d6a08e6a7b02 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -264,6 +264,64 @@ static const struct file_operations fuse_conn_writethrough_threshold_ops = {
 	.write = fuse_conn_writethrough_threshold_write,
 };
 
+static ssize_t fuse_conn_force_dio_on_contention_read(struct file *file,
+						      char __user *buf,
+						      size_t len, loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	char tmp[32];
+	size_t size;
+	int val;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return 0;
+
+	val = READ_ONCE(fc->force_dio_on_contention);
+	fuse_conn_put(fc);
+
+	size = sprintf(tmp, "%d\n", val);
+	return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_force_dio_on_contention_write(struct file *file,
+						       const char __user *buf,
+						       size_t count, loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	int val;
+	int err;
+
+	if (*ppos)
+		return -EINVAL;
+
+	/*
+	 * -1 disables; otherwise the number of other writers that must already
+	 * hold the inode before it is latched into direct IO (0 = first writer,
+	 * 1 = second writer, ...).
+	 */
+	err = kstrtoint_from_user(buf, count, 0, &val);
+	if (err)
+		return err;
+	if (val < -1)
+		return -EINVAL;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return -ENOENT;
+
+	WRITE_ONCE(fc->force_dio_on_contention, val);
+	fuse_conn_put(fc);
+
+	return count;
+}
+
+static const struct file_operations fuse_conn_force_dio_on_contention_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_force_dio_on_contention_read,
+	.write = fuse_conn_force_dio_on_contention_write,
+};
+
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 					  struct fuse_conn *fc,
 					  const char *name, int mode,
@@ -333,7 +391,10 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 				 &fuse_conn_congestion_threshold_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "writethrough_threshold",
 				 S_IFREG | 0600, NULL,
-				 &fuse_conn_writethrough_threshold_ops))
+				 &fuse_conn_writethrough_threshold_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "force_dio_on_contention",
+				 S_IFREG | 0600, NULL,
+				 &fuse_conn_force_dio_on_contention_ops))
 		goto err;
 
 	return 0;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6586a87105b010..0e55afb0b0bf6b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -350,6 +350,8 @@ static int fuse_open(struct inode *inode, struct file *file)
 	bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
 	bool is_wb_truncate = is_truncate && fc->writeback_cache;
 	bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);
+	bool force_dio = false;
+	int dio_threshold = READ_ONCE(fc->force_dio_on_contention);
 
 	if (fuse_is_bad(inode))
 		return -EIO;
@@ -358,7 +360,26 @@ static int fuse_open(struct inode *inode, struct file *file)
 	if (err)
 		return err;
 
-	if (is_wb_truncate || dax_truncate)
+	/*
+	 * If this file is opened for writing while another writer already has
+	 * the inode open, the cached write path (fuse_cache_write_iter())
+	 * serializes them on the exclusive inode lock.  Latch the inode into
+	 * direct IO so writes use the shared-lock parallel dio path instead.
+	 * The page cache is flushed and dropped below (fuse_launder_folio()
+	 * writes back dirty folios during the invalidate), making the switch
+	 * coherent.  Disabled for DAX, passthrough/backing and mmapped inodes,
+	 * which cannot bypass the page cache.
+	 */
+	if (dio_threshold >= 0 &&
+	    (file->f_mode & FMODE_WRITE) && fc->writeback_cache &&
+	    !FUSE_IS_DAX(inode) && !fuse_inode_backing(fi) &&
+	    !mapping_mapped(inode->i_mapping)) {
+		spin_lock(&fi->lock);
+		force_dio = fuse_writers_contended(fi, dio_threshold);
+		spin_unlock(&fi->lock);
+	}
+
+	if (is_wb_truncate || dax_truncate || force_dio)
 		inode_lock(inode);
 
 	if (dax_truncate) {
@@ -368,7 +389,7 @@ static int fuse_open(struct inode *inode, struct file *file)
 			goto out_inode_unlock;
 	}
 
-	if (is_wb_truncate || dax_truncate)
+	if (is_wb_truncate || dax_truncate || force_dio)
 		fuse_set_nowrite(inode);
 
 	err = fuse_do_open(fm, get_node_id(inode), file, false);
@@ -381,18 +402,28 @@ static int fuse_open(struct inode *inode, struct file *file)
 			fuse_truncate_update_attr(inode, file);
 	}
 
-	if (is_wb_truncate || dax_truncate)
+	if (is_wb_truncate || dax_truncate || force_dio)
 		fuse_release_nowrite(inode);
 	if (!err) {
 		if (is_truncate)
 			truncate_pagecache(inode, 0);
-		else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+		else if (force_dio && !fuse_inode_backing(fi) &&
+			 !mapping_mapped(inode->i_mapping)) {
+			/*
+			 * Latch direct IO and drop the page cache regardless of
+			 * FOPEN_KEEP_CACHE.  Still under inode_lock, so any other
+			 * writer is blocked in fuse_cache_write_iter() and will
+			 * observe the latch when it proceeds.
+			 */
+			set_bit(FUSE_I_FORCE_DIO, &fi->state);
+			invalidate_inode_pages2(inode->i_mapping);
+		} else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
 			invalidate_inode_pages2(inode->i_mapping);
 	}
 	if (dax_truncate)
 		filemap_invalidate_unlock(inode->i_mapping);
 out_inode_unlock:
-	if (is_wb_truncate || dax_truncate)
+	if (is_wb_truncate || dax_truncate || force_dio)
 		inode_unlock(inode);
 
 	return err;
@@ -411,6 +442,17 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
 	if (likely(fi)) {
 		spin_lock(&fi->lock);
 		list_del(&ff->write_entry);
+		/*
+		 * Leave forced direct IO mode once the last writer is gone;
+		 * a lone (or no) writer no longer contends on the inode lock.
+		 * Restore FUSE_I_CACHE_IO_MODE for any frozen cached opens.
+		 */
+		if (test_bit(FUSE_I_FORCE_DIO, &fi->state) &&
+		    list_empty(&fi->write_files)) {
+			clear_bit(FUSE_I_FORCE_DIO, &fi->state);
+			if (fi->iocachectr > 0)
+				set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+		}
 		spin_unlock(&fi->lock);
 	}
 	spin_lock(&fc->lock);
@@ -1513,9 +1555,15 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from
 	struct fuse_file *ff = file->private_data;
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool force_dio = test_bit(FUSE_I_FORCE_DIO, &fi->state);
 
-	/* Server side has to advise that it supports parallel dio writes. */
-	if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
+	/*
+	 * Server side has to advise that it supports parallel dio writes.
+	 * When the inode is latched into forced direct IO due to multi-opener
+	 * contention, parallel writes are used unconditionally: the page cache
+	 * has been flushed and is bypassed for this inode.
+	 */
+	if (!force_dio && !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
 		return true;
 
 	/*
@@ -1526,7 +1574,7 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from
 		return true;
 
 	/* shared locks are not allowed with parallel page cache IO */
-	if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
+	if (!force_dio && test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
 		return true;
 
 	/* Parallel dio beyond EOF is not supported, at least for now. */
@@ -1553,9 +1601,16 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
 		 * should be performed only after taking shared inode lock.
 		 * Previous past eof check was without inode lock and might
 		 * have raced, so check it again.
+		 *
+		 * Under the forced-dio latch the cached/uncached accounting is
+		 * bypassed (the latch already guarantees the cache is flushed
+		 * and not repopulated), so just take the shared lock, only
+		 * re-checking the past-eof condition.  The latch is stable
+		 * while the shared lock is held, keeping start/end balanced.
 		 */
 		if (fuse_io_past_eof(iocb, from) ||
-		    fuse_inode_uncached_io_start(fi, NULL) != 0) {
+		    (!test_bit(FUSE_I_FORCE_DIO, &fi->state) &&
+		     fuse_inode_uncached_io_start(fi, NULL) != 0)) {
 			inode_unlock_shared(inode);
 			inode_lock(inode);
 			*exclusive = true;
@@ -1571,8 +1626,13 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
 	if (exclusive) {
 		inode_unlock(inode);
 	} else {
-		/* Allow opens in caching mode after last parallel dio end */
-		fuse_inode_uncached_io_end(fi);
+		/*
+		 * Allow opens in caching mode after last parallel dio end.
+		 * Skipped under the forced-dio latch, which never took an
+		 * uncached_io reference in fuse_dio_lock().
+		 */
+		if (!test_bit(FUSE_I_FORCE_DIO, &fi->state))
+			fuse_inode_uncached_io_end(fi);
 		inode_unlock_shared(inode);
 	}
 }
@@ -1709,6 +1769,8 @@ static bool fuse_use_writeback_cache(struct fuse_conn *fc, struct kiocb *iocb,
 	return ret;
 }
 
+static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from);
+
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
@@ -1720,6 +1782,15 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	bool writeback = false;
 
+	/*
+	 * The inode may have been latched into forced direct IO after this
+	 * write was routed here but before it acquired any lock.  Re-route to
+	 * the direct path (before taking a DLM lock) so we do not repopulate
+	 * the page cache that the latch just dropped.
+	 */
+	if (fuse_inode_force_dio(inode))
+		return fuse_direct_write_iter(iocb, from);
+
 	if (fc->writeback_cache) {
 		/* Update size (EOF optimization) and mode (SUID clearing) */
 		err = fuse_update_attributes(mapping->host, file,
@@ -2086,7 +2157,7 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		return fuse_dax_read_iter(iocb, to);
 
 	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
-	if (ff->open_flags & FOPEN_DIRECT_IO)
+	if ((ff->open_flags & FOPEN_DIRECT_IO) || fuse_inode_force_dio(inode))
 		return fuse_direct_read_iter(iocb, to);
 	else if (fuse_file_passthrough(ff))
 		return fuse_passthrough_read_iter(iocb, to);
@@ -2107,7 +2178,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		return fuse_dax_write_iter(iocb, from);
 
 	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
-	if (ff->open_flags & FOPEN_DIRECT_IO)
+	if ((ff->open_flags & FOPEN_DIRECT_IO) || fuse_inode_force_dio(inode))
 		return fuse_direct_write_iter(iocb, from);
 	else if (fuse_file_passthrough(ff))
 		return fuse_passthrough_write_iter(iocb, from);
@@ -2728,6 +2799,27 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 	else if (fuse_inode_backing(get_fuse_inode(inode)))
 		return -ENODEV;
 
+	/*
+	 * If the inode was latched into forced direct IO due to multi-opener
+	 * contention, a mapping needs the page cache, so revert to caching
+	 * mode.  The exclusive inode lock drains in-flight parallel (shared
+	 * lock) dio writes before the page cache is reused.  Cached opens that
+	 * were frozen while latched are still counted in iocachectr, so restore
+	 * FUSE_I_CACHE_IO_MODE for them.
+	 */
+	if (fuse_inode_force_dio(inode)) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+
+		inode_lock(inode);
+		spin_lock(&fi->lock);
+		clear_bit(FUSE_I_FORCE_DIO, &fi->state);
+		if (fi->iocachectr > 0)
+			set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+		spin_unlock(&fi->lock);
+		invalidate_inode_pages2(file->f_mapping);
+		inode_unlock(inode);
+	}
+
 	/*
 	 * FOPEN_DIRECT_IO handling is special compared to O_DIRECT,
 	 * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP.
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0c7196270538d2..d16316e59f647c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -257,6 +257,13 @@ enum {
 	FUSE_I_BTIME,
 	/* Wants or already has page cache IO */
 	FUSE_I_CACHE_IO_MODE,
+	/*
+	 * Latched into direct IO because several entities have the file open
+	 * and were contending on the exclusive inode lock of the cached write
+	 * path.  Reads and writes are routed direct (shared-lock parallel dio)
+	 * until the last writer closes.  See fuse_open()/fuse_file_io_open().
+	 */
+	FUSE_I_FORCE_DIO,
 };
 
 struct fuse_conn;
@@ -1048,6 +1055,18 @@ struct fuse_conn {
 	/* Buffered writes >= this size bypass the writeback cache (0 = off) */
 	unsigned int writethrough_threshold;
 
+	/*
+	 * Latch an inode into direct IO once it is open for writing by enough
+	 * entities to contend on the exclusive inode lock in the cached write
+	 * path.  The value is the number of *other* writers that must already
+	 * hold the inode before the latch trips: 0 forces DIO on the first
+	 * writer, 1 on the second (i.e. as soon as a second writer appears --
+	 * the historic on/off behaviour), N on the (N+1)th.  -1 disables the
+	 * feature.  Requires the server to tolerate concurrent direct writes
+	 * (as cluster/parallel filesystems do).
+	 */
+	int force_dio_on_contention;
+
 	/**
 	 * XArray tracking tasks that need DLM retry.
 	 * Maps task pointer -> struct fuse_dlm_retry.
@@ -1603,6 +1622,35 @@ void fuse_inode_uncached_io_end(struct fuse_inode *fi);
 int fuse_file_io_open(struct file *file, struct inode *inode);
 void fuse_file_io_release(struct fuse_file *ff, struct inode *inode);
 
+/* Inode latched into forced direct IO due to multi-opener lock contention */
+static inline bool fuse_inode_force_dio(struct inode *inode)
+{
+	return test_bit(FUSE_I_FORCE_DIO, &get_fuse_inode(inode)->state);
+}
+
+/*
+ * True when the inode is contended enough to force direct IO: at least
+ * @threshold writers are already linked on write_files, besides the opener or
+ * remote modifier that prompted the check.  @threshold is fc->force_dio_on_-
+ * contention: 0 trips on the first writer, 1 on the second, N on the (N+1)th,
+ * and a negative value disables the feature.  Must be called with fi->lock held.
+ */
+static inline bool fuse_writers_contended(struct fuse_inode *fi, int threshold)
+{
+	struct fuse_file *ff;
+	int count = 0;
+
+	if (threshold < 0)
+		return false;
+
+	list_for_each_entry(ff, &fi->write_files, write_entry) {
+		if (count >= threshold)
+			return true;
+		count++;
+	}
+	return count >= threshold;
+}
+
 /* file.c */
 struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 								struct inode *inode,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 66baa32d60c29e..e13f1b45ebe4fd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1203,6 +1203,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	atomic_set(&fc->num_waiting, 0);
 	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
 	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
+	fc->force_dio_on_contention = -1;	/* disabled by default */
 	atomic64_set(&fc->khctr, 0);
 	fc->polled_files = RB_ROOT;
 	fc->blocked = 0;
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index c99e285f3183ef..66ee36145a60bc 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -233,6 +233,16 @@ int fuse_file_io_open(struct file *file, struct inode *inode)
 	    !(ff->open_flags & FOPEN_PASSTHROUGH))
 		return 0;
 
+	/*
+	 * The inode was latched into direct IO because several entities have
+	 * it open and were contending on the exclusive inode lock of the
+	 * cached write path.  Open this file uncached as well so that its IO
+	 * is routed direct and it does not re-enter caching mode.
+	 */
+	if (test_bit(FUSE_I_FORCE_DIO, &fi->state) &&
+	    !(ff->open_flags & FOPEN_PASSTHROUGH))
+		return 0;
+
 	if (ff->open_flags & FOPEN_PASSTHROUGH)
 		err = fuse_file_passthrough_open(inode, file);
 	else

From 5889fa23e69cdba0f975b43f23cdb4b7384e82a2 Mon Sep 17 00:00:00 2001
From: Horst Birthelmer <hbirthelmer@ddn.com>
Date: Mon, 29 Jun 2026 12:41:46 +0200
Subject: [PATCH 08/12] fuse: switch to direct IO on inode invalidation with a
 local writer

A FUSE_NOTIFY_INVAL_INODE data invalidation means another (remote)
entity is modifying the file. When it is also open for writing here,
this is the multi-writer contention already handled at open() time, so
latch the inode into direct IO (gated by force_dio_on_contention).
Reverted on last writer close / mmap, as for the open-time latch.

Latching to direct IO is only coherent if no buffered write can deposit
dirty folios into the page cache after it has been dropped: once latched
the inode serves reads and writes direct (from the server).

Add a dedicated per-inode rw_semaphore, wb_inval_rwsem, to serialize the
buffered-write page-cache dirtying against the latch transitions. The
writeback path holds it for read around the dirtying and re-checks the
latch under it; fuse_reverse_inval_inode(), the mmap revert and the
last-writer-close revert hold it for write around their invalidate +
latch update.  The close revert also gains an invalidate (in
fuse_file_release(), a sleepable context) so a clean folio repopulated by
a read racing the latch is not served stale once caching resumes; only
clean folios exist while latched, so it is server-free.

The writer's read-side section must stay free of server round-trips or
the down_write() could wait on the server;

Signed-off-by: Horst Birthelmer <hbirthelmer@ddn.com>
---
 fs/fuse/file.c   | 92 +++++++++++++++++++++++++++++++++++++++++-------
 fs/fuse/fuse_i.h | 17 +++++++++
 fs/fuse/inode.c  | 46 ++++++++++++++++++++++--
 3 files changed, 140 insertions(+), 15 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0e55afb0b0bf6b..c7f1080c4749cd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -405,19 +405,24 @@ static int fuse_open(struct inode *inode, struct file *file)
 	if (is_wb_truncate || dax_truncate || force_dio)
 		fuse_release_nowrite(inode);
 	if (!err) {
+		/*
+		 * Latch into forced direct IO whenever a second writer is
+		 * contending, independently of the O_TRUNC page-cache handling
+		 * below -- an O_TRUNC multi-writer open must switch to direct IO
+		 * too.  Drop the page cache regardless of FOPEN_KEEP_CACHE.
+		 * Still under inode_lock, so a writer contending in
+		 * fuse_cache_write_iter() re-checks the latch after it acquires
+		 * the inode lock and re-routes to direct IO instead of
+		 * repopulating the cache dropped here.
+		 */
+		bool latch = force_dio && !fuse_inode_backing(fi) &&
+			     !mapping_mapped(inode->i_mapping);
+
+		if (latch)
+			set_bit(FUSE_I_FORCE_DIO, &fi->state);
 		if (is_truncate)
 			truncate_pagecache(inode, 0);
-		else if (force_dio && !fuse_inode_backing(fi) &&
-			 !mapping_mapped(inode->i_mapping)) {
-			/*
-			 * Latch direct IO and drop the page cache regardless of
-			 * FOPEN_KEEP_CACHE.  Still under inode_lock, so any other
-			 * writer is blocked in fuse_cache_write_iter() and will
-			 * observe the latch when it proceeds.
-			 */
-			set_bit(FUSE_I_FORCE_DIO, &fi->state);
-			invalidate_inode_pages2(inode->i_mapping);
-		} else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+		else if (latch || !(ff->open_flags & FOPEN_KEEP_CACHE))
 			invalidate_inode_pages2(inode->i_mapping);
 	}
 	if (dax_truncate)
@@ -491,9 +496,25 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_release_args *ra = &ff->args->release_args;
 	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
+	bool was_force_dio = test_bit(FUSE_I_FORCE_DIO, &fi->state);
 
 	fuse_prepare_release(fi, ff, open_flags, opcode, false);
 
+	/*
+	 * If this release dropped the last writer, fuse_prepare_release()
+	 * cleared the forced-direct-IO latch (under fi->lock).  Drop any clean
+	 * folios a read racing the latch may have repopulated, so they cannot
+	 * be served stale once caching mode resumes.  No inode lock or
+	 * wb_inval_rwsem: release may run on the fuse server thread (async
+	 * fput from aio completion), where blocking on a contended inode lock
+	 * could stall the connection.  Writes were routed direct while
+	 * latched, so only clean folios exist and this invalidate is
+	 * server-free; the last writer is gone, so no forced-dio writer can
+	 * race the drop.
+	 */
+	if (was_force_dio && !test_bit(FUSE_I_FORCE_DIO, &fi->state))
+		invalidate_inode_pages2(inode->i_mapping);
+
 	if (ra && ff->flock) {
 		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
 		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
@@ -1778,9 +1799,11 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct address_space *mapping = file->f_mapping;
 	ssize_t written = 0;
 	struct inode *inode = mapping->host;
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	ssize_t err, count;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	bool writeback = false;
+	bool wb_guard = false;
 
 	/*
 	 * The inode may have been latched into forced direct IO after this
@@ -1830,6 +1853,39 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	inode_lock(inode);
 
+	/*
+	 * The forced-direct-IO latch may have been set by a second writer's
+	 * open() while we blocked acquiring the inode lock (fuse_open() holds
+	 * it across the latch + page-cache invalidate).  Re-check now that we
+	 * hold the lock and re-route to the direct path rather than repopulate
+	 * the just-dropped cache.  The DLM write-lock notification issued above
+	 * is harmless: the direct path performs its own server coordination.
+	 */
+	if (fuse_inode_force_dio(inode)) {
+		inode_unlock(inode);
+		return fuse_direct_write_iter(iocb, from);
+	}
+
+	/*
+	 * When force_dio_on_contention is enabled, hold wb_inval_rwsem for
+	 * read across the page-cache dirtying so a concurrent
+	 * NOTIFY_INVAL_INODE -- which latches the inode into direct IO under
+	 * the write side of this lock (via trylock) -- cannot strand the
+	 * folios we are about to write.  Re-check the latch under it and
+	 * re-route to the direct path if set.  Gated on the knob so writeback
+	 * mounts without the feature never touch the rwsem, and taken before
+	 * task_io_account_write() so a re-route is not double-counted.
+	 */
+	wb_guard = READ_ONCE(fc->force_dio_on_contention) >= 0;
+	if (wb_guard) {
+		down_read(&fi->wb_inval_rwsem);
+		if (fuse_inode_force_dio(inode)) {
+			up_read(&fi->wb_inval_rwsem);
+			inode_unlock(inode);
+			return fuse_direct_write_iter(iocb, from);
+		}
+	}
+
 	err = count = generic_write_checks(iocb, from);
 	if (err <= 0)
 		goto out;
@@ -1856,6 +1912,8 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		written = fuse_perform_write(iocb, from);
 	}
 out:
+	if (wb_guard)
+		up_read(&fi->wb_inval_rwsem);
 	inode_unlock(inode);
 	if (written > 0)
 		written = generic_write_sync(iocb, written);
@@ -2810,14 +2868,21 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (fuse_inode_force_dio(inode)) {
 		struct fuse_inode *fi = get_fuse_inode(inode);
 
-		inode_lock(inode);
+		/*
+		 * Revert without the inode lock or wb_inval_rwsem: ->mmap runs
+		 * under mmap_lock, and the buffered write path holds both the
+		 * inode lock and wb_inval_rwsem across a fault on the user buffer
+		 * (which takes mmap_lock), so taking either here would invert
+		 * lock order (ABBA).  Clearing the latch and dropping the cache
+		 * is sufficient -- writers re-check the latch and route to cached
+		 * IO once it is clear, and in-flight parallel dio drains itself.
+		 */
 		spin_lock(&fi->lock);
 		clear_bit(FUSE_I_FORCE_DIO, &fi->state);
 		if (fi->iocachectr > 0)
 			set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
 		spin_unlock(&fi->lock);
 		invalidate_inode_pages2(file->f_mapping);
-		inode_unlock(inode);
 	}
 
 	/*
@@ -3631,6 +3696,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
 	fi->iocachectr = 0;
 	init_waitqueue_head(&fi->page_waitq);
 	init_waitqueue_head(&fi->direct_io_waitq);
+	init_rwsem(&fi->wb_inval_rwsem);
 
 	if (IS_ENABLED(CONFIG_FUSE_DAX))
 		fuse_dax_inode_init(inode, flags);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index d16316e59f647c..cc85a21fe081be 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -186,6 +186,23 @@ struct fuse_inode {
 
 			/* dlm locked areas we have sent lock requests for */
 			struct fuse_dlm_cache dlm_locked_areas;
+
+			/*
+			 * Serializes buffered-write page-cache dirtying against
+			 * the forced-direct-IO latch transitions that cannot
+			 * take the inode lock -- most importantly
+			 * NOTIFY_INVAL_INODE (fuse_reverse_inval_inode()), which
+			 * may be delivered by the same server thread that still
+			 * owes a reply to an in-flight write holding the inode
+			 * lock.  The buffered writer holds this for read around
+			 * the dirtying and re-checks the latch under it; the
+			 * latch set/clear sites hold it for write around their
+			 * page-cache invalidate + latch update.  The writer's
+			 * read-side section must stay free of server round-trips
+			 * (under fc->dlm the partial-write RMW read is skipped),
+			 * or the down_write() could wait on the server.
+			 */
+			struct rw_semaphore wb_inval_rwsem;
 		};
 
 		/* readdir cache (directory only) */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e13f1b45ebe4fd..037f68c4fb4a0f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -764,6 +764,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
 	struct inode *inode;
 	pgoff_t pg_start;
 	pgoff_t pg_end;
+	int dio_threshold;
 
 	inode = fuse_ilookup(fc, nodeid, NULL);
 	if (!inode)
@@ -806,8 +807,49 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
 								pg_end == -1 ? 0 :
 								(offset + len - 1));
 
-		invalidate_inode_pages2_range(inode->i_mapping,
-					      pg_start, pg_end);
+		/*
+		 * A data invalidation means another (remote) entity is modifying
+		 * the file.  If it is also open for writing here, latch the inode
+		 * into direct IO (when enabled), as for the open-time multi-writer
+		 * case.  Take wb_inval_rwsem for write so the buffered write path
+		 * -- which holds it for read across its dirtying and re-checks the
+		 * latch under it -- cannot strand dirty folios after the cache is
+		 * dropped.  Use a trylock and never block: this may run on the
+		 * server thread that still owes an in-flight write (holding the
+		 * inode lock) its reply, so blocking on the rwsem (or the inode
+		 * lock) would deadlock.  If a writer is active, skip the latch
+		 * this round (best effort); the invalidate still runs.  Only
+		 * regular files initialise the rwsem (it shares storage with the
+		 * readdir-cache union arm), so gate on S_ISREG.  The latch is
+		 * whole-inode, so when set drop the whole mapping rather than just
+		 * the notified range, or dirty folios outside it would be invisible
+		 * to the forced direct reads (stale read / lost write).
+		 */
+		dio_threshold = READ_ONCE(fc->force_dio_on_contention);
+
+		if (S_ISREG(inode->i_mode) && dio_threshold >= 0 &&
+		    fc->writeback_cache && !FUSE_IS_DAX(inode) &&
+		    !fuse_inode_backing(fi) && !mapping_mapped(inode->i_mapping) &&
+		    down_write_trylock(&fi->wb_inval_rwsem)) {
+			bool latched = false;
+
+			spin_lock(&fi->lock);
+			if (fuse_writers_contended(fi, dio_threshold)) {
+				set_bit(FUSE_I_FORCE_DIO, &fi->state);
+				latched = true;
+			}
+			spin_unlock(&fi->lock);
+
+			if (latched)
+				invalidate_inode_pages2(inode->i_mapping);
+			else
+				invalidate_inode_pages2_range(inode->i_mapping,
+							      pg_start, pg_end);
+			up_write(&fi->wb_inval_rwsem);
+		} else {
+			invalidate_inode_pages2_range(inode->i_mapping,
+						      pg_start, pg_end);
+		}
 	}
 	iput(inode);
 	return 0;

From 9449bfeeb734dc897a8a06e77b4e6a5574ee2adf Mon Sep 17 00:00:00 2001
From: Bernd Schubert <bernd@bsbernd.com>
Date: Tue, 30 Jun 2026 13:47:52 +0200
Subject: [PATCH 09/12] fuse: allow parallel direct writes past EOF

Extending FOPEN_PARALLEL_DIRECT_WRITES writes were forced onto the
exclusive inode lock, re-serializing the parallel phase. The exclusive
lock only bundled "write + advance i_size + undo-on-failure" into one
unit. But i_size is committed by fuse_write_update_attr() under
fi->lock, only on a successful growing write and independent of the
inode rwsem -- so shared-lock writers commit size correctly and have
nothing to undo. Drop the past-EOF exclusive triggers and gate the
whole-file fuse_do_truncate() rollback on holding the exclusive lock.

Lock mode is passed to __fuse_direct_IO(); i_size is committed at the
same point in every path, only the failure rollback differs:

  non-exclusive (relaxed, parallel):
    fuse_direct_write_iter
      fuse_dio_lock -> inode_lock_shared       (exclusive=false)
      __fuse_direct_IO(.., false)
        fuse_direct_io()          write to server
        fuse_write_update_attr()  commit i_size (on success)
        no rollback

  exclusive (append / caching / !parallel):
    fuse_direct_write_iter
      fuse_dio_lock -> inode_lock              (exclusive=true)
      __fuse_direct_IO(.., true)
        fuse_direct_io()          write to server
        fuse_write_update_attr()  commit i_size (on success)
        ret<0 & extend -> fuse_do_truncate()  rollback

  exclusive (caching-mode O_DIRECT):
    fuse_cache_write_iter -> inode_lock (exclusive)
      generic_file_direct_write -> fuse_direct_IO
        __fuse_direct_IO(.., true)
          fuse_direct_io()          write to server
          fuse_write_update_attr()  commit i_size (on success)
          ret<0 & extend -> fuse_do_truncate()  rollback

Signed-off-by: Bernd Schubert <bschubert@ddn.com>
---
 fs/fuse/file.c | 55 +++++++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c7f1080c4749cd..2bc0b9806e5206 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1560,13 +1560,6 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
 	return res;
 }
 
-static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct inode *inode = file_inode(iocb->ki_filp);
-
-	return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
-}
-
 /*
  * @return true if an exclusive lock for direct IO writes is needed
  */
@@ -1598,10 +1591,6 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from
 	if (!force_dio && test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
 		return true;
 
-	/* Parallel dio beyond EOF is not supported, at least for now. */
-	if (fuse_io_past_eof(iocb, from))
-		return true;
-
 	return false;
 }
 
@@ -1620,18 +1609,15 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
 		 * New parallal dio allowed only if inode is not in caching
 		 * mode and denies new opens in caching mode. This check
 		 * should be performed only after taking shared inode lock.
-		 * Previous past eof check was without inode lock and might
-		 * have raced, so check it again.
 		 *
-		 * Under the forced-dio latch the cached/uncached accounting is
-		 * bypassed (the latch already guarantees the cache is flushed
-		 * and not repopulated), so just take the shared lock, only
-		 * re-checking the past-eof condition.  The latch is stable
-		 * while the shared lock is held, keeping start/end balanced.
+		 * Under the forced-dio latch the uncached-io accounting is
+		 * bypassed entirely -- fuse_dio_unlock() likewise skips
+		 * fuse_inode_uncached_io_end() -- so do not take a reference
+		 * here.  An unbalanced start would drive fi->iocachectr
+		 * permanently negative and hang the next caching-mode open.
 		 */
-		if (fuse_io_past_eof(iocb, from) ||
-		    (!test_bit(FUSE_I_FORCE_DIO, &fi->state) &&
-		     fuse_inode_uncached_io_start(fi, NULL) != 0)) {
+		if (!test_bit(FUSE_I_FORCE_DIO, &fi->state) &&
+		    fuse_inode_uncached_io_start(fi, NULL) != 0) {
 			inode_unlock_shared(inode);
 			inode_lock(inode);
 			*exclusive = true;
@@ -2148,14 +2134,16 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
 	return res;
 }
 
-static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
+static ssize_t __fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+				bool exclusive);
 
 static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	ssize_t res;
 
 	if (!is_sync_kiocb(iocb)) {
-		res = fuse_direct_IO(iocb, to);
+		/* exclusive is unused on reads; rollback is write-only */
+		res = __fuse_direct_IO(iocb, to, true);
 	} else {
 		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
 
@@ -2178,7 +2166,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (res > 0) {
 		task_io_account_write(res);
 		if (!is_sync_kiocb(iocb)) {
-			res = fuse_direct_IO(iocb, from);
+			res = __fuse_direct_IO(iocb, from, exclusive);
 		} else {
 			struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
 
@@ -3313,7 +3301,7 @@ static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
 }
 
 static ssize_t
-fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+__fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bool exclusive)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	ssize_t ret = 0;
@@ -3407,14 +3395,27 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 
 	if (iov_iter_rw(iter) == WRITE) {
 		fuse_write_update_attr(inode, pos, ret);
-		/* For extending writes we already hold exclusive lock */
-		if (ret < 0 && offset + count > i_size)
+		/*
+		 * Whole-file rollback is only safe under an exclusive lock.
+		 * Parallel writers commit i_size only on success (nothing to
+		 * undo); the server owns failed-extend cleanup.
+		 */
+		if (exclusive && ret < 0 && offset + count > i_size)
 			fuse_do_truncate(file);
 	}
 
 	return ret;
 }
 
+static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+	/*
+	 * Only reached via generic_file_direct_write() (caching-mode
+	 * O_DIRECT), which holds the inode lock exclusively.
+	 */
+	return __fuse_direct_IO(iocb, iter, true);
+}
+
 static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
 {
 	int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);

From c1fc28043d4296f554e9037c59bed45eb6c7d863 Mon Sep 17 00:00:00 2001
From: Bernd Schubert <bschubert@ddn.com>
Date: Tue, 10 Jun 2025 16:23:28 +0200
Subject: [PATCH 10/12] fuse: {io-uring}: Use a bitmap which queues are
 available

Signed-off-by: Bernd Schubert <bschubert@ddn.com>
(cherry picked from commit c7fc2de393418fbf27bd9a6be02aecce227c11f9)
---
 fs/fuse/dev_uring.c   | 94 +++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/dev_uring_i.h | 15 +++++++
 2 files changed, 109 insertions(+)

diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index f235417dad466b..1b3bdf2ac26c6e 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -43,6 +43,9 @@ static inline void io_uring_cmd_private_sz_check(size_t cmd_sz)
 )
 #endif
 
+/* Number of queued fuse requests until a queue is considered full */
+#define FUSE_URING_QUEUE_THRESHOLD 5
+
 bool fuse_uring_enabled(void)
 {
 	return enable_uring;
@@ -244,6 +247,23 @@ static void fuse_uring_destruct_q_masks(struct fuse_ring *ring)
 	}
 }
 
+static void fuse_ring_destruct_q_masks(struct fuse_ring *ring)
+{
+	free_cpumask_var(ring->avail_q_mask);
+	if (ring->per_numa_avail_q_mask) {
+		for (int node = 0; node < ring->nr_numa_nodes; node++)
+			free_cpumask_var(ring->per_numa_avail_q_mask[node]);
+		kfree(ring->per_numa_avail_q_mask);
+	}
+
+	free_cpumask_var(ring->registered_q_mask);
+	if (ring->numa_registered_q_mask) {
+		for (int node = 0; node < ring->nr_numa_nodes; node++)
+			free_cpumask_var(ring->numa_registered_q_mask[node]);
+		kfree(ring->numa_registered_q_mask);
+	}
+}
+
 void fuse_uring_destruct(struct fuse_conn *fc)
 {
 	struct fuse_ring *ring = fc->ring;
@@ -279,6 +299,7 @@ void fuse_uring_destruct(struct fuse_conn *fc)
 	}
 
 	fuse_uring_destruct_q_masks(ring);
+	fuse_ring_destruct_q_masks(ring);
 	kfree(ring->queues);
 	kfree(ring);
 	fc->ring = NULL;
@@ -319,6 +340,38 @@ static int fuse_uring_create_q_masks(struct fuse_ring *ring, size_t nr_queues)
 	return 0;
 }
 
+static int fuse_ring_create_q_masks(struct fuse_ring *ring, int nr_queues)
+{
+	if (!zalloc_cpumask_var(&ring->avail_q_mask, GFP_KERNEL_ACCOUNT))
+		return -ENOMEM;
+
+	if (!zalloc_cpumask_var(&ring->registered_q_mask, GFP_KERNEL_ACCOUNT))
+		return -ENOMEM;
+
+	ring->per_numa_avail_q_mask = kmalloc_array(ring->nr_numa_nodes,
+						    sizeof(struct cpumask *),
+						    GFP_KERNEL_ACCOUNT);
+	if (!ring->per_numa_avail_q_mask)
+		return -ENOMEM;
+	for (int node = 0; node < ring->nr_numa_nodes; node++)
+		if (!zalloc_cpumask_var(&ring->per_numa_avail_q_mask[node],
+					GFP_KERNEL_ACCOUNT))
+			return -ENOMEM;
+
+	ring->numa_registered_q_mask = kmalloc_array(ring->nr_numa_nodes,
+						     sizeof(struct cpumask *),
+						     GFP_KERNEL_ACCOUNT);
+	if (!ring->numa_registered_q_mask)
+		return -ENOMEM;
+	for (int node = 0; node < ring->nr_numa_nodes; node++) {
+		if (!zalloc_cpumask_var(&ring->numa_registered_q_mask[node],
+					GFP_KERNEL_ACCOUNT))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
 /*
  * Basic ring setup for this connection based on the provided configuration
  */
@@ -348,6 +401,10 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
 	if (err)
 		goto out_err;
 
+	err = fuse_ring_create_q_masks(ring, nr_queues);
+	if (err)
+		goto out_err;
+
 	spin_lock(&fc->lock);
 	if (fc->ring) {
 		/* race, another thread created the ring in the meantime */
@@ -368,6 +425,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
 
 out_err:
 	fuse_uring_destruct_q_masks(ring);
+	fuse_ring_destruct_q_masks(ring);
 	kfree(ring->queues);
 	kfree(ring);
 	return res;
@@ -415,6 +473,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
 
 	queue->qid = qid;
 	queue->ring = ring;
+	queue->numa_node = cpu_to_node(qid);
 	spin_lock_init(&queue->lock);
 
 	INIT_LIST_HEAD(&queue->ent_avail_queue);
@@ -625,6 +684,16 @@ void fuse_uring_stop_queues(struct fuse_ring *ring)
 
 		fuse_uring_abort_end_queue_requests(queue);
 		fuse_uring_teardown_entries(queue);
+
+		cpumask_clear_cpu(qid, ring->registered_q_mask);
+		cpumask_clear_cpu(qid, ring->avail_q_mask);
+		for (node = 0; node < ring->nr_numa_nodes; node++) {
+			/* Clear the queue from all masks */
+			cpumask_clear_cpu(qid,
+					  ring->numa_registered_q_mask[node]);
+			cpumask_clear_cpu(qid,
+					  ring->per_numa_avail_q_mask[node]);
+		}
 	}
 
 	/* Reset all queue masks, we won't process any more IO */
@@ -973,9 +1042,18 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
 static void fuse_uring_ent_avail(struct fuse_ring_ent *ent,
 				 struct fuse_ring_queue *queue)
 {
+	struct fuse_ring *ring = queue->ring;
+	int node = queue->numa_node;
+
 	WARN_ON_ONCE(!ent->cmd);
 	list_move(&ent->list, &queue->ent_avail_queue);
 	ent->state = FRRS_AVAILABLE;
+
+	if (list_is_singular(&queue->ent_avail_queue) &&
+	    queue->nr_reqs <= FUSE_URING_QUEUE_THRESHOLD) {
+		cpumask_set_cpu(queue->qid, ring->avail_q_mask);
+		cpumask_set_cpu(queue->qid, ring->per_numa_avail_q_mask[node]);
+	}
 }
 
 /* Used to find the request on SQE commit */
@@ -998,6 +1076,8 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
 					   struct fuse_req *req)
 {
 	struct fuse_ring_queue *queue = ent->queue;
+	struct fuse_ring *ring = queue->ring;
+	int node = queue->numa_node;
 
 	lockdep_assert_held(&queue->lock);
 
@@ -1012,6 +1092,16 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
 	ent->state = FRRS_FUSE_REQ;
 	list_move_tail(&ent->list, &queue->ent_w_req_queue);
 	fuse_uring_add_to_pq(ent, req);
+
+	/*
+	 * If there are no more available entries, mark the queue as unavailable
+	 * in both global and per-NUMA node masks
+	 */
+	if (list_empty(&queue->ent_avail_queue)) {
+		cpumask_clear_cpu(queue->qid, ring->avail_q_mask);
+		cpumask_clear_cpu(queue->qid,
+				  ring->per_numa_avail_q_mask[node]);
+	}
 }
 
 /* Fetch the next fuse request if available */
@@ -1401,6 +1491,10 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
 	/* Marks the ring entry as ready */
 	fuse_uring_next_fuse_req(ent, queue, issue_flags);
 
+	cpumask_set_cpu(queue->qid, ring->registered_q_mask);
+	cpumask_set_cpu(queue->qid,
+			ring->numa_registered_q_mask[queue->numa_node]);
+
 	return 0;
 }
 
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 667f668ef3277f..476dd6ce6d477f 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -70,6 +70,9 @@ struct fuse_ring_queue {
 	/* queue id, corresponds to the cpu core */
 	unsigned int qid;
 
+	/* NUMA node this queue belongs to */
+	int numa_node;
+
 	/*
 	 * queue lock, taken when any value in the queue changes _and_ also
 	 * a ring entry state changes.
@@ -149,6 +152,18 @@ struct fuse_ring {
 	/* all queue tracking */
 	struct fuse_queue_map q_map;
 
+	/* Tracks which queues are available (empty) globally */
+	cpumask_var_t avail_q_mask;
+
+	/* Tracks which queues are available per NUMA node */
+	cpumask_var_t *per_numa_avail_q_mask;
+
+	/* Tracks which queues are registered */
+	cpumask_var_t registered_q_mask;
+
+	/* Tracks which queues are registered per NUMA node */
+	cpumask_var_t *numa_registered_q_mask;
+
 	wait_queue_head_t stop_waitq;
 
 	/* async tear down */

From ce153d25fafa99abab2876d494f5f0b048b5edbe Mon Sep 17 00:00:00 2001
From: Bernd Schubert <bschubert@ddn.com>
Date: Sat, 14 Jun 2025 23:23:21 +0200
Subject: [PATCH 11/12] fuse: {io-uring} Distribute load among queues

So far queue selection was statically - the a request on core X
was always handled by the queue corresponding to core X.
A previous commit introduced bitmaps that track which queues
are available - queue selection can make use of these bitmaps
and try to use the ideal queue.

Rules are

- Tries the queue of the current core first, if available
and if that queue does not have too many entries queued

- Then tries the first available queue on the current
numa node. It does not use random distribution here,
because light queue usage is probably ok, so that
kernel/userspace switches can be avoided

- Then tries the first available queue on the
current numa node

If no queue is free, it tries again the queue of the current
core, but if that queue does not exist it falls back
to a random queue on the current numa node - that also might
not exist - it then uses a random available queue.

Signed-off-by: Bernd Schubert <bschubert@ddn.com>
(cherry picked from commit a5957769d838c2be211fd896081a474747b1a1b3)
---
 fs/fuse/dev_uring.c | 199 +++++++++++++++++++++++++++++---------------
 1 file changed, 130 insertions(+), 69 deletions(-)

diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 1b3bdf2ac26c6e..ad916b4277bbf3 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -43,8 +43,15 @@ static inline void io_uring_cmd_private_sz_check(size_t cmd_sz)
 )
 #endif
 
-/* Number of queued fuse requests until a queue is considered full */
-#define FUSE_URING_QUEUE_THRESHOLD 5
+/* Number of queued fuse requests until a queue is considered full
+ * Basically no entries, as synchronization is with bitmaps and lockless. I.e.
+ * no accuracy - queues always get a bit more requests that way. Lightly
+ * loaded queues is wanted to reduced kernel/userspace switches.
+ */
+#define FUSE_URING_QUEUE_THRESHOLD 0
+
+static unsigned int fuse_uring_get_random_qid(struct fuse_ring *ring,
+					      const struct cpumask *mask);
 
 bool fuse_uring_enabled(void)
 {
@@ -1049,8 +1056,7 @@ static void fuse_uring_ent_avail(struct fuse_ring_ent *ent,
 	list_move(&ent->list, &queue->ent_avail_queue);
 	ent->state = FRRS_AVAILABLE;
 
-	if (list_is_singular(&queue->ent_avail_queue) &&
-	    queue->nr_reqs <= FUSE_URING_QUEUE_THRESHOLD) {
+	if (queue->nr_reqs <= FUSE_URING_QUEUE_THRESHOLD) {
 		cpumask_set_cpu(queue->qid, ring->avail_q_mask);
 		cpumask_set_cpu(queue->qid, ring->per_numa_avail_q_mask[node]);
 	}
@@ -1594,80 +1600,109 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
 	fuse_uring_send(ent, cmd, err, issue_flags);
 }
 
-static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring,
-						       bool background)
+static struct fuse_ring_queue *
+fuse_uring_get_first_queue(struct fuse_ring *ring, const struct cpumask *mask)
 {
-	unsigned int qid;
-	int node, tries = 0;
-	unsigned int nr_queues;
-	unsigned int cpu = task_cpu(current);
-	struct fuse_ring_queue *queue, *primary_queue = NULL;
+	int qid;
 
-	/*
-	 *  Background requests result in better performance on a different
-	 *  CPU, unless CPUs are already busy.
-	 */
-	if (background)
-		cpu++;
+	/* Find the first available CPU in this mask */
+	qid = cpumask_first(mask);
 
-retry:
-	cpu = cpu % ring->max_nr_queues;
-
-	/* numa local registered queue bitmap */
-	node = cpu_to_node(cpu);
-	if (WARN_ONCE(node >= ring->nr_numa_nodes,
-		      "Node number (%d) exceeds nr nodes (%d)\n",
-		      node, ring->nr_numa_nodes)) {
-		node = 0;
-	}
-
-	nr_queues = READ_ONCE(ring->numa_q_map[node].nr_queues);
-	if (nr_queues) {
-		/* prefer the queue that corresponds to the current cpu */
-		queue = READ_ONCE(ring->queues[cpu]);
-		if (queue) {
-			if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD)
-				return queue;
-			primary_queue = queue;
-		}
+	/* Check if we found a valid CPU */
+	if (qid >= ring->max_nr_queues)
+		return NULL; /* No available queues */
+
+	/* This is the global mask, cpu is already the global qid */
+	return ring->queues[qid];
+}
 
-		qid = ring->numa_q_map[node].cpu_to_qid[cpu];
-		if (WARN_ON_ONCE(qid >= ring->max_nr_queues))
-			return NULL;
-		if (qid != cpu) {
-			queue = READ_ONCE(ring->queues[qid]);
+/*
+ * Return a random queue from the registered queues mask
+ *
+ * Uses a deterministic but well-distributed algorithm to select
+ * a random queue from the provided CPU mask.
+ */
+static unsigned int fuse_uring_get_random_qid(struct fuse_ring *ring,
+					      const struct cpumask *mask)
+{
+	unsigned int nr_bits = cpumask_weight(mask);
+	unsigned int nth, cpu;
 
-			/* Might happen on teardown */
-			if (unlikely(!queue))
-				return NULL;
+	if (nr_bits == 0)
+		return UINT_MAX;
 
-			if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD)
-				return queue;
-		}
+	/* Fast path for single CPU */
+	if (nr_bits == 1)
+		return cpumask_first(mask);
 
-		/* Retries help for load balancing */
-		if (tries < FUSE_URING_Q_TRIES && tries + 1 < nr_queues) {
-			if (!primary_queue)
-				primary_queue = queue;
+	/*
+	 * Use current jiffies and task PID to create a pseudo-random
+	 * but well-distributed selection that varies across calls
+	 */
+	nth = (get_random_u32() ^ (jiffies & 0xFFFF) ^
+	       (current->pid & 0xFFFF)) %
+	      nr_bits;
 
-			/* Increase cpu, assuming it will map to a different qid*/
-			cpu++;
-			tries++;
-			goto retry;
-		}
+	/* Find the CPU at that position */
+	for_each_cpu(cpu, mask) {
+		if (nth-- == 0)
+			return cpu;
 	}
 
-	/* Retries exceeded, take the primary target queue */
-	if (primary_queue)
-		return primary_queue;
+	return UINT_MAX;
+}
 
-	/* global registered queue bitmap */
-	qid = ring->q_map.cpu_to_qid[cpu];
-	if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) {
-		/* Might happen on teardown */
-		return NULL;
-	}
-	return READ_ONCE(ring->queues[qid]);
+/*
+ * Get the best queue for the current CPU
+ */
+static struct fuse_ring_queue *fuse_uring_get_queue(struct fuse_ring *ring)
+{
+	unsigned int qid;
+	struct fuse_ring_queue *queue, *local_queue = NULL;
+	int local_node;
+	struct cpumask *mask;
+	struct fuse_conn *fc = ring->fc;
+
+	qid = task_cpu(current);
+	local_node = cpu_to_node(qid);
+	if (WARN_ON_ONCE(local_node >= ring->nr_numa_nodes || local_node < 0))
+		local_node = 0;
+
+	/* First check if current CPU's queue is available */
+	if (qid < ring->max_nr_queues) {
+		local_queue = queue = ring->queues[qid];
+		if (queue && queue->nr_reqs <= FUSE_URING_QUEUE_THRESHOLD)
+			return queue;
+	}
+
+	/* Second check if there are any available queues on the local node */
+	mask = ring->per_numa_avail_q_mask[local_node];
+	queue = fuse_uring_get_first_queue(ring, mask);
+	if (queue)
+		return queue;
+
+	/* Third check if there are any available queues on any node */
+	queue = fuse_uring_get_first_queue(ring, ring->avail_q_mask);
+	if (queue)
+		return queue;
+
+	/* No free queue, use the local queue if it exists */
+	if (local_queue)
+		return local_queue;
+
+	/* Try to use a random queue from the local NUMA node, if there is one */
+	mask = ring->numa_registered_q_mask[local_node];
+	qid = fuse_uring_get_random_qid(ring, mask);
+	if (qid < ring->max_nr_queues)
+		return ring->queues[qid];
+
+	/* Finally, use a random queue among all queues that are registered */
+	qid = fuse_uring_get_random_qid(ring, ring->registered_q_mask);
+	if (qid < ring->max_nr_queues)
+		return ring->queues[qid];
+
+	WARN_ON_ONCE(fc->connected);
+	return NULL;
 }
 
 static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent, bool bg)
@@ -1707,7 +1742,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
 	int err;
 
 	err = -EINVAL;
-	queue = fuse_uring_select_queue(ring, false);
+	queue = fuse_uring_get_queue(ring);
 	if (!queue)
 		goto err;
 
@@ -1722,6 +1757,19 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
 				       struct fuse_ring_ent, list);
 	queue->nr_reqs++;
 
+	/*
+	 * Update queue availability based on number of requests
+	 * A queue is considered busy if it has more than
+	 * FUSE_URING_QUEUE_THRESHOLD requests
+	 */
+	if (queue->nr_reqs == FUSE_URING_QUEUE_THRESHOLD + 1) {
+		/* Queue just became busy */
+		cpumask_clear_cpu(queue->qid, ring->avail_q_mask);
+		cpumask_clear_cpu(
+			queue->qid,
+			ring->per_numa_avail_q_mask[queue->numa_node]);
+	}
+
 	if (ent)
 		fuse_uring_add_req_to_ring_ent(ent, req);
 	else
@@ -1749,7 +1797,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
 	struct fuse_ring_queue *queue;
 	struct fuse_ring_ent *ent = NULL;
 
-	queue = fuse_uring_select_queue(ring, true);
+	queue = fuse_uring_get_queue(ring);
 	if (!queue)
 		return false;
 
@@ -1795,12 +1843,25 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
 bool fuse_uring_remove_pending_req(struct fuse_req *req)
 {
 	struct fuse_ring_queue *queue = req->ring_queue;
+	struct fuse_ring *ring = queue->ring;
+	int node = queue->numa_node;
 	bool removed = fuse_remove_pending_req(req, &queue->lock);
 
 	if (removed) {
 		/* Update counters after successful removal */
 		spin_lock(&queue->lock);
 		queue->nr_reqs--;
+
+		/*
+		 * Update queue availability based on number of requests
+		 * A queue is considered available if it has FUSE_URING_QUEUE_THRESHOLD or fewer requests
+		 */
+		if (queue->nr_reqs == FUSE_URING_QUEUE_THRESHOLD) {
+			/* Queue just became available */
+			cpumask_set_cpu(queue->qid, ring->avail_q_mask);
+			cpumask_set_cpu(queue->qid,
+					ring->per_numa_avail_q_mask[node]);
+		}
 		spin_unlock(&queue->lock);
 	}
 

From 016862e4200cdde7c544bf1e28984427d60594bc Mon Sep 17 00:00:00 2001
From: Horst Birthelmer <hbirthelmer@ddn.com>
Date: Fri, 3 Jul 2026 16:35:51 +0200
Subject: [PATCH 12/12] fuse: {io-uring} drop static queue mapping in favour of
 bitmaps

The two preceding cherry-picks
("Use a bitmap which queues are available" and "Distribute load among queues")
switch queue selection to the per-NUMA availability/registration cpumasks.
This tree also carried an alternative static cpu->qid mapping
(struct fuse_queue_map, ring->numa_q_map/q_map, fuse_uring_cpu_qid_mapping())
from a different backport lineage, which the bitmap selector no longer reads.

Remove the now-dead static mapping
(allocation, per-queue population, teardown and the
FUSE_URING_Q_THRESHOLD/FUSE_URING_Q_TRIES knobs) so only the bitmap mechanism remains.

Signed-off-by: Horst Birthelmer <hbirthelmer@ddn.com>
---
 fs/fuse/dev_uring.c   | 124 ++----------------------------------------
 fs/fuse/dev_uring_i.h |  17 ------
 2 files changed, 6 insertions(+), 135 deletions(-)

diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index ad916b4277bbf3..4a25601ce93d29 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -22,12 +22,6 @@ MODULE_PARM_DESC(enable_uring,
 #define FUSE_RING_HEADER_PG 0
 #define FUSE_RING_PAYLOAD_PG 1
 
-/* Threshold that determines if a better queue should be searched for */
-#define FUSE_URING_Q_THRESHOLD 2
-
-/* Number of (re)tries to find a better queue */
-#define FUSE_URING_Q_TRIES 3
-
 /* redfs only to allow patch backports */
 #define IO_URING_F_TASK_DEAD (1 << 13)
 
@@ -235,25 +229,6 @@ static void io_pages_free(struct page ***pages, int npages)
 	*pages = NULL;
 }
 
-static void fuse_ring_destruct_q_map(struct fuse_queue_map *q_map)
-{
-	free_cpumask_var(q_map->registered_q_mask);
-	kfree(q_map->cpu_to_qid);
-}
-
-static void fuse_uring_destruct_q_masks(struct fuse_ring *ring)
-{
-	int node;
-
-	fuse_ring_destruct_q_map(&ring->q_map);
-
-	if (ring->numa_q_map) {
-		for (node = 0; node < ring->nr_numa_nodes; node++)
-			fuse_ring_destruct_q_map(&ring->numa_q_map[node]);
-		kfree(ring->numa_q_map);
-	}
-}
-
 static void fuse_ring_destruct_q_masks(struct fuse_ring *ring)
 {
 	free_cpumask_var(ring->avail_q_mask);
@@ -305,48 +280,12 @@ void fuse_uring_destruct(struct fuse_conn *fc)
 		ring->queues[qid] = NULL;
 	}
 
-	fuse_uring_destruct_q_masks(ring);
 	fuse_ring_destruct_q_masks(ring);
 	kfree(ring->queues);
 	kfree(ring);
 	fc->ring = NULL;
 }
 
-static int fuse_uring_init_q_map(struct fuse_queue_map *q_map, size_t nr_cpu)
-{
-	if (!zalloc_cpumask_var(&q_map->registered_q_mask, GFP_KERNEL_ACCOUNT))
-		return -ENOMEM;
-
-	q_map->cpu_to_qid = kcalloc(nr_cpu, sizeof(*q_map->cpu_to_qid),
-				    GFP_KERNEL_ACCOUNT);
-	if (!q_map->cpu_to_qid)
-		return -ENOMEM;
-
-	return 0;
-}
-
-static int fuse_uring_create_q_masks(struct fuse_ring *ring, size_t nr_queues)
-{
-	int err, node;
-
-	err = fuse_uring_init_q_map(&ring->q_map, nr_queues);
-	if (err)
-		return err;
-
-	ring->numa_q_map = kcalloc(ring->nr_numa_nodes,
-				   sizeof(*ring->numa_q_map),
-				   GFP_KERNEL_ACCOUNT);
-	if (!ring->numa_q_map)
-		return -ENOMEM;
-	for (node = 0; node < ring->nr_numa_nodes; node++) {
-		err = fuse_uring_init_q_map(&ring->numa_q_map[node],
-					    nr_queues);
-		if (err)
-			return err;
-	}
-	return 0;
-}
-
 static int fuse_ring_create_q_masks(struct fuse_ring *ring, int nr_queues)
 {
 	if (!zalloc_cpumask_var(&ring->avail_q_mask, GFP_KERNEL_ACCOUNT))
@@ -355,9 +294,9 @@ static int fuse_ring_create_q_masks(struct fuse_ring *ring, int nr_queues)
 	if (!zalloc_cpumask_var(&ring->registered_q_mask, GFP_KERNEL_ACCOUNT))
 		return -ENOMEM;
 
-	ring->per_numa_avail_q_mask = kmalloc_array(ring->nr_numa_nodes,
-						    sizeof(struct cpumask *),
-						    GFP_KERNEL_ACCOUNT);
+	ring->per_numa_avail_q_mask = kcalloc(ring->nr_numa_nodes,
+					      sizeof(*ring->per_numa_avail_q_mask),
+					      GFP_KERNEL_ACCOUNT);
 	if (!ring->per_numa_avail_q_mask)
 		return -ENOMEM;
 	for (int node = 0; node < ring->nr_numa_nodes; node++)
@@ -365,9 +304,9 @@ static int fuse_ring_create_q_masks(struct fuse_ring *ring, int nr_queues)
 					GFP_KERNEL_ACCOUNT))
 			return -ENOMEM;
 
-	ring->numa_registered_q_mask = kmalloc_array(ring->nr_numa_nodes,
-						     sizeof(struct cpumask *),
-						     GFP_KERNEL_ACCOUNT);
+	ring->numa_registered_q_mask = kcalloc(ring->nr_numa_nodes,
+					       sizeof(*ring->numa_registered_q_mask),
+					       GFP_KERNEL_ACCOUNT);
 	if (!ring->numa_registered_q_mask)
 		return -ENOMEM;
 	for (int node = 0; node < ring->nr_numa_nodes; node++) {
@@ -404,10 +343,6 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
 	max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write);
 	max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE);
 
-	err = fuse_uring_create_q_masks(ring, nr_queues);
-	if (err)
-		goto out_err;
-
 	err = fuse_ring_create_q_masks(ring, nr_queues);
 	if (err)
 		goto out_err;
@@ -431,43 +366,18 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
 	return ring;
 
 out_err:
-	fuse_uring_destruct_q_masks(ring);
 	fuse_ring_destruct_q_masks(ring);
 	kfree(ring->queues);
 	kfree(ring);
 	return res;
 }
 
-static void fuse_uring_cpu_qid_mapping(struct fuse_ring *ring, int qid,
-				       struct fuse_queue_map *q_map,
-				       int node)
-{
-	int cpu, qid_idx, mapping_count = 0;
-	size_t nr_queues;
-
-	cpumask_set_cpu(qid, q_map->registered_q_mask);
-	nr_queues = cpumask_weight(q_map->registered_q_mask);
-	for (cpu = 0; cpu < ring->max_nr_queues; cpu++) {
-		if (node != -1 && cpu_to_node(cpu) != node)
-			continue;
-
-		qid_idx = mapping_count % nr_queues;
-		q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx,
-						     q_map->registered_q_mask);
-		mapping_count++;
-		pr_debug("%s node=%d qid=%d qid_idx=%d nr_queues=%zu %d->%d\n",
-			 __func__, node, qid, qid_idx, nr_queues, cpu,
-			 q_map->cpu_to_qid[cpu]);
-	}
-}
-
 static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
 						       int qid)
 {
 	struct fuse_conn *fc = ring->fc;
 	struct fuse_ring_queue *queue;
 	struct list_head *pq;
-	int node;
 
 	queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT);
 	if (!queue)
@@ -507,21 +417,6 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
 	 */
 	WRITE_ONCE(ring->queues[qid], queue);
 
-	/* Static mapping from cpu to per numa queues */
-	node = cpu_to_node(qid);
-	fuse_uring_cpu_qid_mapping(ring, qid, &ring->numa_q_map[node], node);
-
-	/*
-	 * smp_store_release, as the variable is read without fc->lock and
-	 * we need to avoid compiler re-ordering of updating the nr_queues
-	 * and setting ring->numa_queues[node].cpu_to_qid above
-	 */
-	smp_store_release (&ring->numa_q_map[node].nr_queues,
-			   ring->numa_q_map[node].nr_queues + 1);
-
-	/* global mapping */
-	fuse_uring_cpu_qid_mapping(ring, qid, &ring->q_map, -1);
-
 	spin_unlock(&fc->lock);
 
 	return queue;
@@ -703,13 +598,6 @@ void fuse_uring_stop_queues(struct fuse_ring *ring)
 		}
 	}
 
-	/* Reset all queue masks, we won't process any more IO */
-	cpumask_clear(ring->q_map.registered_q_mask);
-	for (node = 0; node < ring->nr_numa_nodes; node++) {
-		if (ring->numa_q_map)
-			cpumask_clear(ring->numa_q_map[node].registered_q_mask);
-	}
-
 	if (atomic_read(&ring->queue_refs) > 0) {
 		ring->teardown_time = jiffies;
 		INIT_DELAYED_WORK(&ring->async_teardown_work,
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 476dd6ce6d477f..d0b6dd1e85ff49 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -111,17 +111,6 @@ struct fuse_ring_queue {
 	bool stopped;
 };
 
-struct fuse_queue_map {
-	/* Tracks which queues are registered */
-	cpumask_var_t registered_q_mask;
-
-	/* number of registered queues */
-	size_t nr_queues;
-
-	/* cpu to qid mapping */
-	int *cpu_to_qid;
-};
-
 /**
  * Describes if uring is for communication and holds alls the data needed
  * for uring communication
@@ -146,12 +135,6 @@ struct fuse_ring {
 	 */
 	unsigned int stop_debug_log : 1;
 
-	/* per numa node queue tracking */
-	struct fuse_queue_map *numa_q_map;
-
-	/* all queue tracking */
-	struct fuse_queue_map q_map;
-
 	/* Tracks which queues are available (empty) globally */
 	cpumask_var_t avail_q_mask;