diff --git a/lib/propolis/src/hw/nvme/admin.rs b/lib/propolis/src/hw/nvme/admin.rs index ed0d454a7..df8746a62 100644 --- a/lib/propolis/src/hw/nvme/admin.rs +++ b/lib/propolis/src/hw/nvme/admin.rs @@ -463,7 +463,6 @@ impl NvmeCtrl { } } - #[allow(dead_code)] pub(super) fn acmd_doorbell_buf_cfg( &mut self, cmd: &cmds::DoorbellBufCfgCmd, diff --git a/lib/propolis/src/hw/nvme/mod.rs b/lib/propolis/src/hw/nvme/mod.rs index 2a10c6f7d..78d0b1ffa 100644 --- a/lib/propolis/src/hw/nvme/mod.rs +++ b/lib/propolis/src/hw/nvme/mod.rs @@ -844,9 +844,8 @@ impl PciNvme { nn: 1, // bit 0 indicates volatile write cache is present vwc: 1, - // bit 8 indicates Doorbell Buffer support. Theoretically supported, - // but disabled for Propolis issue #1008. - oacs: (0 << 8), + // bit 8 indicates Doorbell Buffer support + oacs: (1 << 8), ..Default::default() }; @@ -1340,16 +1339,8 @@ impl PciNvme { // this can detect it and stop posting async events. cmds::Completion::generic_err(bits::STS_INVAL_OPC).dnr() } - AdminCmd::DoorbellBufCfg(_cmd) => { - // XXX: issue #1008 suggests that Doorbell Buffer support - // can end up with guest disks in a state that *looks like* - // we've failed to notify after writing a completion. While - // we're debugging this, we hide Doorbell Buffer support - // from OACS. Instead, treat this the same as an - // `AdminCmd::Unknown`. - - // state.acmd_doorbell_buf_cfg(&cmd) - cmds::Completion::generic_err(bits::STS_INTERNAL_ERR) + AdminCmd::DoorbellBufCfg(cmd) => { + state.acmd_doorbell_buf_cfg(&cmd) } AdminCmd::Unknown(_) => { cmds::Completion::generic_err(bits::STS_INTERNAL_ERR) diff --git a/lib/propolis/src/hw/nvme/queue.rs b/lib/propolis/src/hw/nvme/queue.rs index a9b61da16..ebe5b80c3 100644 --- a/lib/propolis/src/hw/nvme/queue.rs +++ b/lib/propolis/src/hw/nvme/queue.rs @@ -343,18 +343,36 @@ impl QueueGuard<'_, CompQueueState> { /// Write update to the EventIdx in Doorbell Buffer page, if possible fn db_buf_write(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { - probes::nvme_cq_dbbuf_write!(|| (devq_id, self.state.tail)); - // Keep EventIdx populated with the position of the CQ tail. We are - // not especially concerned with receiving timely (doorbell) updates - // from the guest about where the head pointer sits. We keep our - // own tally of how many entries are in the CQ are available for - // completions to land. + // Update EventIdx as far as we're willing to forego doorbell + // updates about the CQ head. We are not especially concerned with + // receiving timely doorbell updates from the guest about the head. + // We keep our own tally of how many entries in the CQ are available + // for completions to land. In the typical case, we will read the + // shadow doorbell from db_buf JIT to update the available CQ space. // - // When checking for available space before issuing a Permit, we can - // perform our own JIT read from the db_buf to stay updated on the - // true space available. + // However, simply keeping EventIdx matching the queue tail means we + // opt out of *any* notification that a completion is posted. Then, + // if an I/O was submitted, not processed immediately due to + // insufficient CQ space, but the guest submits no further I/Os on + // that queue, we would never notice that there is space in the CQ. + // The I/O would go unfulfilled forever. + // + // For now, leave EventIdx one before the actual CQ tail, so that + // making the CQ empty requires a doorbell notify. This is + // excessive; there are many cases where we don't care if the CQ is + // to be emptied. + if self.avail_occupied() <= 1 { + // The queue was empty and just became non-empty. So `tail` has + // not advanced far enough that we can actually advance + // EventIdx. + return; + } + + let next_evtidx = self.idx_sub(self.state.tail, 1); + + probes::nvme_cq_dbbuf_write!(|| (devq_id, next_evtidx)); fence(Ordering::Release); - mem.write(db_buf.eventidx, &self.state.tail); + mem.write(db_buf.eventidx, &next_evtidx); } }