From bee179db3ab3c3df2137945ce6fd1c9130af8b1c Mon Sep 17 00:00:00 2001 From: iximeow Date: Wed, 28 Jan 2026 21:29:06 +0000 Subject: [PATCH 1/2] nvme: CQ db_buf EventIdx should slightly lag CQ tail Doorbell buffers want the CQ event index to roughly track with the tail, because most CQ doorbells are not interesting. If there are ten CQ entries, space for 54 more, and the guest has acknowledged one, we don't really care anything happened. If there are 64 CQ entries, 0 available, and one SQ entry that we're notified for, we'll try to reserve space in a full CQ, see it's at capacity, and the SQ will idle with the CQ aware that that SQ is corked. Without doorbell buffers, at some point the guest will process a CQE, ring a CQ doorbell, uncork the CQ, and we'll nudge the parked SQ worker thread into operation again. With doorbell buffers, if the event index matches the CQ tail we'll never get a doorbell when the CQ has been drained. If the guest never submits another I/O on the corked SQ, the pending I/O may never get processed. A more careful treatment would only have the event index lag if we are at risk of having a worker thread defer an I/O because this CQ would be full. This would look something like not letting EventIdx advance so far forward that there is less than one entry available for each SQ pointed at this CQ. Eating the unnecessary doorbell cost at least gets doorbell buffers correct in this case and we can turn it back on. The more careful treatment of EventIdx can come later. --- lib/propolis/src/hw/nvme/queue.rs | 38 +++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/lib/propolis/src/hw/nvme/queue.rs b/lib/propolis/src/hw/nvme/queue.rs index a9b61da16..ebe5b80c3 100644 --- a/lib/propolis/src/hw/nvme/queue.rs +++ b/lib/propolis/src/hw/nvme/queue.rs @@ -343,18 +343,36 @@ impl QueueGuard<'_, CompQueueState> { /// Write update to the EventIdx in Doorbell Buffer page, if possible fn db_buf_write(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { - probes::nvme_cq_dbbuf_write!(|| (devq_id, self.state.tail)); - // Keep EventIdx populated with the position of the CQ tail. We are - // not especially concerned with receiving timely (doorbell) updates - // from the guest about where the head pointer sits. We keep our - // own tally of how many entries are in the CQ are available for - // completions to land. + // Update EventIdx as far as we're willing to forego doorbell + // updates about the CQ head. We are not especially concerned with + // receiving timely doorbell updates from the guest about the head. + // We keep our own tally of how many entries in the CQ are available + // for completions to land. In the typical case, we will read the + // shadow doorbell from db_buf JIT to update the available CQ space. // - // When checking for available space before issuing a Permit, we can - // perform our own JIT read from the db_buf to stay updated on the - // true space available. + // However, simply keeping EventIdx matching the queue tail means we + // opt out of *any* notification that a completion is posted. Then, + // if an I/O was submitted, not processed immediately due to + // insufficient CQ space, but the guest submits no further I/Os on + // that queue, we would never notice that there is space in the CQ. + // The I/O would go unfulfilled forever. + // + // For now, leave EventIdx one before the actual CQ tail, so that + // making the CQ empty requires a doorbell notify. This is + // excessive; there are many cases where we don't care if the CQ is + // to be emptied. + if self.avail_occupied() <= 1 { + // The queue was empty and just became non-empty. So `tail` has + // not advanced far enough that we can actually advance + // EventIdx. + return; + } + + let next_evtidx = self.idx_sub(self.state.tail, 1); + + probes::nvme_cq_dbbuf_write!(|| (devq_id, next_evtidx)); fence(Ordering::Release); - mem.write(db_buf.eventidx, &self.state.tail); + mem.write(db_buf.eventidx, &next_evtidx); } } From 9484c10d53df4456dd87d683c47aa68e8cd5edba Mon Sep 17 00:00:00 2001 From: iximeow Date: Wed, 28 Jan 2026 21:34:22 +0000 Subject: [PATCH 2/2] Revert "do not adverstise Doorbell Buffer support for now (#1023)" This reverts commit d6cd9573a8134d6d9b1aa26e9adc10a823642275. --- lib/propolis/src/hw/nvme/admin.rs | 1 - lib/propolis/src/hw/nvme/mod.rs | 17 ++++------------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/lib/propolis/src/hw/nvme/admin.rs b/lib/propolis/src/hw/nvme/admin.rs index ed0d454a7..df8746a62 100644 --- a/lib/propolis/src/hw/nvme/admin.rs +++ b/lib/propolis/src/hw/nvme/admin.rs @@ -463,7 +463,6 @@ impl NvmeCtrl { } } - #[allow(dead_code)] pub(super) fn acmd_doorbell_buf_cfg( &mut self, cmd: &cmds::DoorbellBufCfgCmd, diff --git a/lib/propolis/src/hw/nvme/mod.rs b/lib/propolis/src/hw/nvme/mod.rs index 2a10c6f7d..78d0b1ffa 100644 --- a/lib/propolis/src/hw/nvme/mod.rs +++ b/lib/propolis/src/hw/nvme/mod.rs @@ -844,9 +844,8 @@ impl PciNvme { nn: 1, // bit 0 indicates volatile write cache is present vwc: 1, - // bit 8 indicates Doorbell Buffer support. Theoretically supported, - // but disabled for Propolis issue #1008. - oacs: (0 << 8), + // bit 8 indicates Doorbell Buffer support + oacs: (1 << 8), ..Default::default() }; @@ -1340,16 +1339,8 @@ impl PciNvme { // this can detect it and stop posting async events. cmds::Completion::generic_err(bits::STS_INVAL_OPC).dnr() } - AdminCmd::DoorbellBufCfg(_cmd) => { - // XXX: issue #1008 suggests that Doorbell Buffer support - // can end up with guest disks in a state that *looks like* - // we've failed to notify after writing a completion. While - // we're debugging this, we hide Doorbell Buffer support - // from OACS. Instead, treat this the same as an - // `AdminCmd::Unknown`. - - // state.acmd_doorbell_buf_cfg(&cmd) - cmds::Completion::generic_err(bits::STS_INTERNAL_ERR) + AdminCmd::DoorbellBufCfg(cmd) => { + state.acmd_doorbell_buf_cfg(&cmd) } AdminCmd::Unknown(_) => { cmds::Completion::generic_err(bits::STS_INTERNAL_ERR)