* [PATCH v2 0/3] hw/nvme: add irqfd support
@ 2022-08-25  7:47 Jinhao Fan
  2022-08-25  7:47 ` [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd Jinhao Fan
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Jinhao Fan @ 2022-08-25  7:47 UTC (permalink / raw)
  To: qemu-devel; +Cc: its, kbusch, stefanha, Jinhao Fan

This patch series changes qemu-nvme's interrupt emulation to use event
notifiers, which ensures thread-safe interrupt delivery when an iothread
is used. In the first patch, I convert qemu-nvme's IO emulation logic to
send irqs via eventfd, so that the actual assertion and deassertion is
always done in the main loop thread with the BQL held. In the second
patch, support is added to send irqs via KVM irqfd, bypassing QEMU's
MSI-X emulation. In the last patch, I add MSI-X mask handlers for when
irqfd is enabled, so that qemu-nvme knows which vector is masked even
though QEMU's MSI-X emulation is bypassed.
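
As a rough, standalone illustration of the deferral pattern (a
hypothetical sketch built on the raw eventfd(2) syscall that QEMU's
EventNotifier wraps, not code from this series): a worker thread
"asserts" by writing to an eventfd, and the main thread reads the
counter back, which also clears it, before doing the real work. Writes
that pile up before the read are coalesced into a single wakeup.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/eventfd.h>

static int irq_efd;

static void *io_thread(void *arg)
{
    uint64_t one = 1;

    /* Equivalent of event_notifier_set(): request an irq assertion. */
    if (write(irq_efd, &one, sizeof(one)) != sizeof(one)) {
        perror("write");
    }
    return NULL;
}

int main(void)
{
    pthread_t tid;
    uint64_t cnt;

    irq_efd = eventfd(0, EFD_CLOEXEC);
    if (irq_efd < 0) {
        perror("eventfd");
        return EXIT_FAILURE;
    }

    pthread_create(&tid, NULL, io_thread, NULL);
    pthread_join(tid, NULL);

    /* Main-loop side: the read clears the counter, then assert for real. */
    if (read(irq_efd, &cnt, sizeof(cnt)) == sizeof(cnt)) {
        printf("asserting irq (%llu signal(s) coalesced)\n",
               (unsigned long long)cnt);
    }

    close(irq_efd);
    return 0;
}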

Changes since v1:
 - Made nvme_irq_(de)assert wrappers around eventfd call and actual irq
   assertion
 - Dropped the previous first patch to avoid duplicate checks for 
   irq_enabled and msix_enabled

Jinhao Fan (3):
  hw/nvme: support irq(de)assertion with eventfd
  hw/nvme: use KVM irqfd when available
  hw/nvme: add MSI-x mask handlers for irqfd

 hw/nvme/ctrl.c       | 264 ++++++++++++++++++++++++++++++++++++++++---
 hw/nvme/nvme.h       |   7 ++
 hw/nvme/trace-events |   3 +
 3 files changed, 257 insertions(+), 17 deletions(-)

-- 
2.25.1




* [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd
  2022-08-25  7:47 [PATCH v2 0/3] hw/nvme: add irqfd support Jinhao Fan
@ 2022-08-25  7:47 ` Jinhao Fan
  2022-08-25  9:33   ` Klaus Jensen
  2022-08-25  7:47 ` [PATCH v2 2/3] hw/nvme: use KVM irqfd when available Jinhao Fan
  2022-08-25  7:47 ` [PATCH v2 3/3] hw/nvme: add MSI-x mask handlers for irqfd Jinhao Fan
  2 siblings, 1 reply; 13+ messages in thread
From: Jinhao Fan @ 2022-08-25  7:47 UTC (permalink / raw)
  To: qemu-devel; +Cc: its, kbusch, stefanha, Jinhao Fan, open list:nvme

When the new option 'irq-eventfd' is turned on, the IO emulation code
signals an eventfd when it wants to (de)assert an irq. The main loop
eventfd handler does the actual irq (de)assertion.  This paves the way
for iothread support since QEMU's interrupt emulation is not thread
safe.

Asserting and deasserting irqs with eventfd has some performance
implications. For small queue depths it increases request latency, but
for large queue depths it effectively coalesces irqs.

Comparison (KIOPS):

QD            1   4  16  64
QEMU         38 123 210 329
irq-eventfd  32 106 240 364
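
For reference, enabling the new option together with ioeventfd on the
controller would look something like the following (the drive id and
serial are placeholder values):

  -drive file=nvm.img,if=none,id=nvm0,format=raw
  -device nvme,drive=nvm0,serial=deadbeef,ioeventfd=on,irq-eventfd=on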

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
 hw/nvme/ctrl.c | 136 ++++++++++++++++++++++++++++++++++++++++++-------
 hw/nvme/nvme.h |   4 ++
 2 files changed, 123 insertions(+), 17 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 87aeba0564..6ecf6fafd9 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -526,34 +526,57 @@ static void nvme_irq_check(NvmeCtrl *n)
     }
 }
 
+static void nvme_irq_do_assert(NvmeCtrl *n, NvmeCQueue *cq)
+{
+    if (msix_enabled(&(n->parent_obj))) {
+        trace_pci_nvme_irq_msix(cq->vector);
+        msix_notify(&(n->parent_obj), cq->vector);
+    } else {
+        trace_pci_nvme_irq_pin();
+        assert(cq->vector < 32);
+        n->irq_status |= 1 << cq->vector;
+        nvme_irq_check(n);
+    }
+}
+
 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
 {
     if (cq->irq_enabled) {
-        if (msix_enabled(&(n->parent_obj))) {
-            trace_pci_nvme_irq_msix(cq->vector);
-            msix_notify(&(n->parent_obj), cq->vector);
+        if (cq->assert_notifier.initialized) {
+            event_notifier_set(&cq->assert_notifier);
         } else {
-            trace_pci_nvme_irq_pin();
-            assert(cq->vector < 32);
-            n->irq_status |= 1 << cq->vector;
-            nvme_irq_check(n);
+            nvme_irq_do_assert(n, cq);
         }
     } else {
         trace_pci_nvme_irq_masked();
     }
 }
 
+static void nvme_irq_do_deassert(NvmeCtrl *n, NvmeCQueue *cq)
+{
+    if (msix_enabled(&(n->parent_obj))) {
+        return;
+    } else {
+        assert(cq->vector < 32);
+        if (!n->cq_pending) {
+            n->irq_status &= ~(1 << cq->vector);
+        }
+        nvme_irq_check(n);
+    }
+}
+
 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
 {
     if (cq->irq_enabled) {
-        if (msix_enabled(&(n->parent_obj))) {
-            return;
+        if (cq->deassert_notifier.initialized) {
+            /* 
+             * The deassert notifier will only be initialized when MSI-X is
+             * NOT in use, therefore no need to worry about an extra eventfd
+             * syscall for pin-based interrupts.
+             */
+            event_notifier_set(&cq->deassert_notifier);
         } else {
-            assert(cq->vector < 32);
-            if (!n->cq_pending) {
-                n->irq_status &= ~(1 << cq->vector);
-            }
-            nvme_irq_check(n);
+            nvme_irq_do_deassert(n, cq);
         }
     }
 }
@@ -1338,6 +1361,54 @@ static void nvme_update_cq_head(NvmeCQueue *cq)
     trace_pci_nvme_shadow_doorbell_cq(cq->cqid, cq->head);
 }
 
+static void nvme_assert_notifier_read(EventNotifier *e)
+{
+    NvmeCQueue *cq = container_of(e, NvmeCQueue, assert_notifier);
+    if (event_notifier_test_and_clear(e)) {
+        nvme_irq_do_assert(cq->ctrl, cq);
+    }
+}
+
+static void nvme_deassert_notifier_read(EventNotifier *e)
+{
+    NvmeCQueue *cq = container_of(e, NvmeCQueue, deassert_notifier);
+    if (event_notifier_test_and_clear(e)) {
+        nvme_irq_do_deassert(cq->ctrl, cq);
+    }
+}
+
+static void nvme_init_irq_notifier(NvmeCtrl *n, NvmeCQueue *cq)
+{
+    int ret;
+
+    ret = event_notifier_init(&cq->assert_notifier, 0);
+    if (ret < 0) {
+        goto fail_assert_handler;
+    }
+
+    event_notifier_set_handler(&cq->assert_notifier,
+                                nvme_assert_notifier_read);
+
+    if (!msix_enabled(&n->parent_obj)) {
+        ret = event_notifier_init(&cq->deassert_notifier, 0);
+        if (ret < 0) {
+            goto fail_deassert_handler;
+        }
+
+        event_notifier_set_handler(&cq->deassert_notifier,
+                                   nvme_deassert_notifier_read);
+    }
+
+    return;
+
+fail_deassert_handler:
+    event_notifier_set_handler(&cq->deassert_notifier, NULL);
+    event_notifier_cleanup(&cq->deassert_notifier);
+fail_assert_handler:
+    event_notifier_set_handler(&cq->assert_notifier, NULL);
+    event_notifier_cleanup(&cq->assert_notifier);
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -1377,8 +1448,25 @@ static void nvme_post_cqes(void *opaque)
         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
     }
     if (cq->tail != cq->head) {
-        if (cq->irq_enabled && !pending) {
-            n->cq_pending++;
+        if (cq->irq_enabled) {
+            if (!pending) {
+                n->cq_pending++;
+            }
+
+            if (unlikely(cq->first_io_cqe)) {
+                /*
+                 * Initialize event notifier when first cqe is posted. For irqfd
+                 * support we need to register the MSI message in KVM. We
+                 * cannot do this registration at CQ creation time because
+                 * Linux's NVMe driver changes the MSI message after CQ creation.
+                 */
+                cq->first_io_cqe = false;
+
+                if (n->params.irq_eventfd) {
+                    nvme_init_irq_notifier(n, cq);
+                }
+            }
+
         }
 
         nvme_irq_assert(n, cq);
@@ -4705,6 +4793,14 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
         event_notifier_set_handler(&cq->notifier, NULL);
         event_notifier_cleanup(&cq->notifier);
     }
+    if (cq->assert_notifier.initialized) {
+        event_notifier_set_handler(&cq->assert_notifier, NULL);
+        event_notifier_cleanup(&cq->assert_notifier);
+    }
+    if (cq->deassert_notifier.initialized) {
+        event_notifier_set_handler(&cq->deassert_notifier, NULL);
+        event_notifier_cleanup(&cq->deassert_notifier);
+    }
     if (msix_enabled(&n->parent_obj)) {
         msix_vector_unuse(&n->parent_obj, cq->vector);
     }
@@ -4734,7 +4830,7 @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
         n->cq_pending--;
     }
 
-    nvme_irq_deassert(n, cq);
+    nvme_irq_do_deassert(n, cq);
     trace_pci_nvme_del_cq(qid);
     nvme_free_cq(cq, n);
     return NVME_SUCCESS;
@@ -4772,6 +4868,11 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     }
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
+    /* 
+     * Only enable irqfd for IO queues since we always emulate admin queue 
+     * in main loop thread 
+     */
+    cq->first_io_cqe = cqid != 0;
 }
 
 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
@@ -7671,6 +7772,7 @@ static Property nvme_props[] = {
     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
     DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
+    DEFINE_PROP_BOOL("irq-eventfd", NvmeCtrl, params.irq_eventfd, false),
     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                      params.auto_transition_zones, true),
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 79f5c281c2..759d0ecd7c 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -398,6 +398,9 @@ typedef struct NvmeCQueue {
     uint64_t    ei_addr;
     QEMUTimer   *timer;
     EventNotifier notifier;
+    EventNotifier assert_notifier;
+    EventNotifier deassert_notifier;
+    bool        first_io_cqe;
     bool        ioeventfd_enabled;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
     QTAILQ_HEAD(, NvmeRequest) req_list;
@@ -422,6 +425,7 @@ typedef struct NvmeParams {
     bool     auto_transition_zones;
     bool     legacy_cmb;
     bool     ioeventfd;
+    bool     irq_eventfd;
     uint8_t  sriov_max_vfs;
     uint16_t sriov_vq_flexible;
     uint16_t sriov_vi_flexible;
-- 
2.25.1




* [PATCH v2 2/3] hw/nvme: use KVM irqfd when available
  2022-08-25  7:47 [PATCH v2 0/3] hw/nvme: add irqfd support Jinhao Fan
  2022-08-25  7:47 ` [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd Jinhao Fan
@ 2022-08-25  7:47 ` Jinhao Fan
  2022-08-25  7:47 ` [PATCH v2 3/3] hw/nvme: add MSI-x mask handlers for irqfd Jinhao Fan
  2 siblings, 0 replies; 13+ messages in thread
From: Jinhao Fan @ 2022-08-25  7:47 UTC (permalink / raw)
  To: qemu-devel; +Cc: its, kbusch, stefanha, Jinhao Fan, open list:nvme

Use KVM's irqfd to send interrupts when possible. This approach is
thread-safe. Moreover, it does not have the inter-thread communication
overhead of plain event notifiers, since the handler callback is invoked
in the same system call as the irqfd write.

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
 hw/nvme/ctrl.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
 hw/nvme/nvme.h |  1 +
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 6ecf6fafd9..74075f782f 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -192,6 +192,7 @@
 #include "qapi/error.h"
 #include "qapi/visitor.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/hostmem.h"
 #include "hw/pci/msix.h"
@@ -1377,8 +1378,26 @@ static void nvme_deassert_notifier_read(EventNotifier *e)
     }
 }
 
+static int nvme_kvm_msix_vector_use(NvmeCtrl *n,
+                                    NvmeCQueue *cq,
+                                    uint32_t vector)
+{
+    int ret;
+
+    KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
+    ret = kvm_irqchip_add_msi_route(&c, vector, &n->parent_obj);
+    if (ret < 0) {
+        return ret;
+    }
+    kvm_irqchip_commit_route_changes(&c);
+    cq->virq = ret;
+    return 0;
+}
+
 static void nvme_init_irq_notifier(NvmeCtrl *n, NvmeCQueue *cq)
 {
+    bool with_irqfd = msix_enabled(&n->parent_obj) &&
+                      kvm_msi_via_irqfd_enabled();
     int ret;
 
     ret = event_notifier_init(&cq->assert_notifier, 0);
@@ -1386,8 +1405,21 @@ static void nvme_init_irq_notifier(NvmeCtrl *n, NvmeCQueue *cq)
         goto fail_assert_handler;
     }
 
-    event_notifier_set_handler(&cq->assert_notifier,
-                                nvme_assert_notifier_read);
+    if (with_irqfd) {
+        ret = nvme_kvm_msix_vector_use(n, cq, cq->vector);
+        if (ret < 0) {
+            goto fail_assert_handler;
+        }
+        ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
+                                                 &cq->assert_notifier, NULL, 
+                                                 cq->virq);
+        if (ret < 0) {
+            goto fail_kvm;
+         }
+    } else {
+        event_notifier_set_handler(&cq->assert_notifier,
+                                   nvme_assert_notifier_read);
+    }
 
     if (!msix_enabled(&n->parent_obj)) {
         ret = event_notifier_init(&cq->deassert_notifier, 0);
@@ -1404,6 +1436,12 @@ static void nvme_init_irq_notifier(NvmeCtrl *n, NvmeCQueue *cq)
 fail_deassert_handler:
     event_notifier_set_handler(&cq->deassert_notifier, NULL);
     event_notifier_cleanup(&cq->deassert_notifier);
+    if (with_irqfd) {
+        kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &cq->assert_notifier,
+                                              cq->virq);
+fail_kvm:
+        kvm_irqchip_release_virq(kvm_state, cq->virq);
+    }
 fail_assert_handler:
     event_notifier_set_handler(&cq->assert_notifier, NULL);
     event_notifier_cleanup(&cq->assert_notifier);
@@ -4783,6 +4821,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 
 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
 {
+    bool with_irqfd = msix_enabled(&n->parent_obj) &&
+                      kvm_msi_via_irqfd_enabled();
     uint16_t offset = (cq->cqid << 3) + (1 << 2);
 
     n->cq[cq->cqid] = NULL;
@@ -4794,6 +4834,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
         event_notifier_cleanup(&cq->notifier);
     }
     if (cq->assert_notifier.initialized) {
+        if (with_irqfd) {
+            kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state,
+                                                  &cq->assert_notifier, 
+                                                  cq->virq);
+            kvm_irqchip_release_virq(kvm_state, cq->virq);
+        }
         event_notifier_set_handler(&cq->assert_notifier, NULL);
         event_notifier_cleanup(&cq->assert_notifier);
     }
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 759d0ecd7c..85fd9cd0e2 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -396,6 +396,7 @@ typedef struct NvmeCQueue {
     uint64_t    dma_addr;
     uint64_t    db_addr;
     uint64_t    ei_addr;
+    int         virq;
     QEMUTimer   *timer;
     EventNotifier notifier;
     EventNotifier assert_notifier;
-- 
2.25.1




* [PATCH v2 3/3] hw/nvme: add MSI-x mask handlers for irqfd
  2022-08-25  7:47 [PATCH v2 0/3] hw/nvme: add irqfd support Jinhao Fan
  2022-08-25  7:47 ` [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd Jinhao Fan
  2022-08-25  7:47 ` [PATCH v2 2/3] hw/nvme: use KVM irqfd when available Jinhao Fan
@ 2022-08-25  7:47 ` Jinhao Fan
  2 siblings, 0 replies; 13+ messages in thread
From: Jinhao Fan @ 2022-08-25  7:47 UTC (permalink / raw)
  To: qemu-devel; +Cc: its, kbusch, stefanha, Jinhao Fan, open list:nvme

When irqfd is enabled, we bypass QEMU's irq emulation and let KVM
directly assert the irq. However, KVM is not aware of the device's MSI-X
masking status. Add MSI-X mask bookkeeping to the NVMe emulation and
detach the corresponding irqfd when the corresponding vector is masked.

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
 hw/nvme/ctrl.c       | 82 ++++++++++++++++++++++++++++++++++++++++++++
 hw/nvme/nvme.h       |  2 ++
 hw/nvme/trace-events |  3 ++
 3 files changed, 87 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 74075f782f..30bbda7bb5 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -7493,10 +7493,84 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
 
     return 0;
 }
+static int nvme_vector_unmask(PCIDevice *pci_dev, unsigned vector,
+                               MSIMessage msg)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+    int ret;
+
+    trace_pci_nvme_irq_unmask(vector, msg.address, msg.data);
+    
+    for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) {
+        NvmeCQueue *cq = n->cq[i];
+        /* 
+         * If this function is called, then irqfd must be available. Therefore,
+         * irqfd must be in use if cq->assert_notifier.initialized is true.
+         */
+        if (cq && cq->vector == vector && cq->assert_notifier.initialized) {
+            if (cq->msg.data != msg.data || cq->msg.address != msg.address) {
+                ret = kvm_irqchip_update_msi_route(kvm_state, cq->virq, msg,
+                                                   pci_dev);
+                if (ret < 0) {
+                    return ret;
+                }
+                kvm_irqchip_commit_routes(kvm_state);
+                cq->msg = msg;
+            }
+
+            ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
+                                                     &cq->assert_notifier,
+                                                     NULL, cq->virq);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+    }
+
+    return 0;
+}
+
+static void nvme_vector_mask(PCIDevice *pci_dev, unsigned vector)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+
+    trace_pci_nvme_irq_mask(vector);
+    
+    for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) {
+        NvmeCQueue *cq = n->cq[i];
+        if (cq && cq->vector == vector && cq->assert_notifier.initialized) {
+            kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state,
+                                                  &cq->assert_notifier,
+                                                  cq->virq);
+        }
+    }
+}
+
+static void nvme_vector_poll(PCIDevice *pci_dev,
+                             unsigned int vector_start,
+                             unsigned int vector_end)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+
+    trace_pci_nvme_irq_poll(vector_start, vector_end);
+
+    for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) {
+        NvmeCQueue *cq = n->cq[i];
+        if (cq && cq->vector >= vector_start && cq->vector <= vector_end 
+            && msix_is_masked(pci_dev, cq->vector) 
+            && cq->assert_notifier.initialized) {
+            if (event_notifier_test_and_clear(&cq->assert_notifier)) {
+                msix_set_pending(pci_dev, i);
+            }
+        }
+    }
+}
 
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
     uint8_t *pci_conf = pci_dev->config;
+    bool with_irqfd = msix_enabled(&n->parent_obj) &&
+                      kvm_msi_via_irqfd_enabled();
     uint64_t bar_size;
     unsigned msix_table_offset, msix_pba_offset;
     int ret;
@@ -7549,6 +7623,13 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
         }
     }
 
+    if (with_irqfd) {
+        msix_set_vector_notifiers(pci_dev,
+                                  nvme_vector_unmask,
+                                  nvme_vector_mask,
+                                  nvme_vector_poll);
+    }
+
     nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
 
     if (n->params.cmb_size_mb) {
@@ -7796,6 +7877,7 @@ static void nvme_exit(PCIDevice *pci_dev)
         pcie_sriov_pf_exit(pci_dev);
     }
 
+    msix_unset_vector_notifiers(pci_dev);
     msix_uninit(pci_dev, &n->bar0, &n->bar0);
     memory_region_del_subregion(&n->bar0, &n->iomem);
 }
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 85fd9cd0e2..707a55ebfc 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -20,6 +20,7 @@
 
 #include "qemu/uuid.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/msi.h"
 #include "hw/block/block.h"
 
 #include "block/nvme.h"
@@ -401,6 +402,7 @@ typedef struct NvmeCQueue {
     EventNotifier notifier;
     EventNotifier assert_notifier;
     EventNotifier deassert_notifier;
+    MSIMessage  msg;
     bool        first_io_cqe;
     bool        ioeventfd_enabled;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index fccb79f489..b11fcf4a65 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -2,6 +2,9 @@
 pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
 pci_nvme_irq_pin(void) "pulsing IRQ pin"
 pci_nvme_irq_masked(void) "IRQ is masked"
+pci_nvme_irq_mask(uint32_t vector) "IRQ %u gets masked"
+pci_nvme_irq_unmask(uint32_t vector, uint64_t addr, uint32_t data) "IRQ %u gets unmasked, addr=0x%"PRIx64" data=0x%"PRIx32""
+pci_nvme_irq_poll(uint32_t vector_start, uint32_t vector_end) "IRQ poll, start=%"PRIu32" end=%"PRIu32""
 pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
 pci_nvme_dbbuf_config(uint64_t dbs_addr, uint64_t eis_addr) "dbs_addr=0x%"PRIx64" eis_addr=0x%"PRIx64""
 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
-- 
2.25.1




* Re: [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd
  2022-08-25  7:47 ` [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd Jinhao Fan
@ 2022-08-25  9:33   ` Klaus Jensen
  2022-08-25 11:16     ` Jinhao Fan
  0 siblings, 1 reply; 13+ messages in thread
From: Klaus Jensen @ 2022-08-25  9:33 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, kbusch, stefanha, open list:nvme


On Aug 25 15:47, Jinhao Fan wrote:
> When the new option 'irq-eventfd' is turned on, the IO emulation code
> signals an eventfd when it want to (de)assert an irq. The main loop
> eventfd handler does the actual irq (de)assertion.  This paves the way
> for iothread support since QEMU's interrupt emulation is not thread
> safe.
> 
> Asserting and deasseting irq with eventfd has some performance
> implications. For small queue depth it increases request latency but
> for large queue depth it effectively coalesces irqs.
> 
> Comparision (KIOPS):
> 
> QD            1   4  16  64
> QEMU         38 123 210 329
> irq-eventfd  32 106 240 364
> 
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
>  hw/nvme/ctrl.c | 136 ++++++++++++++++++++++++++++++++++++++++++-------
>  hw/nvme/nvme.h |   4 ++
>  2 files changed, 123 insertions(+), 17 deletions(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 87aeba0564..6ecf6fafd9 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -1377,8 +1448,25 @@ static void nvme_post_cqes(void *opaque)
>          QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
>      }
>      if (cq->tail != cq->head) {
> -        if (cq->irq_enabled && !pending) {
> -            n->cq_pending++;
> +        if (cq->irq_enabled) {
> +            if (!pending) {
> +                n->cq_pending++;
> +            }
> +
> +            if (unlikely(cq->first_io_cqe)) {
> +                /*
> +                 * Initilize event notifier when first cqe is posted. For irqfd 
> +                 * support we need to register the MSI message in KVM. We
> +                 * can not do this registration at CQ creation time because
> +                 * Linux's NVMe driver changes the MSI message after CQ creation.
> +                 */
> +                cq->first_io_cqe = false;
> +
> +                if (n->params.irq_eventfd) {
> +                    nvme_init_irq_notifier(n, cq);
> +                }
> +            }

I'm still a bit perplexed by this issue, so I just tried moving
nvme_init_irq_notifier() to the end of nvme_init_cq() and removing this
first_io_cqe thing. I did not observe any particular issues?

What bad behavior did you encounter? It seems to work fine to me.


* Re: [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd
  2022-08-25  9:33   ` Klaus Jensen
@ 2022-08-25 11:16     ` Jinhao Fan
  2022-08-25 11:56       ` Klaus Jensen
  0 siblings, 1 reply; 13+ messages in thread
From: Jinhao Fan @ 2022-08-25 11:16 UTC (permalink / raw)
  To: Klaus Jensen; +Cc: qemu-devel, kbusch, stefanha, open list:nvme

On 8/25/2022 5:33 PM, Klaus Jensen wrote:
> I'm still a bit perplexed by this issue, so I just tried moving
> nvme_init_irq_notifier() to the end of nvme_init_cq() and removing this
> first_io_cqe thing. I did not observe any particular issues?
> 
> What bad behavior did you encounter, it seems to work fine to me

The kernel boots up and gets stuck waiting for interrupts. Then the
request times out and gets retried three times. Finally the driver seems
to decide that the drive is down and continues to boot.

I added some prints during debugging and found that the MSI-X message 
which got registered in KVM via kvm_irqchip_add_msi_route() is not the 
same as the one actually used in msix_notify().

Are you sure you are using KVM's irqfd?

Here is a previous discussion with Keith [1].

[1] 
https://lore.kernel.org/qemu-devel/YvKJk2dYiwomexFv@kbusch-mbp.dhcp.thefacebook.com/#t
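
For context, a minimal sketch of how the KVM route could be refreshed
once the guest rewrites the MSI-X table entry; this is essentially what
the mask/unmask handlers in patch 3 end up doing. It assumes the
cq->virq and cq->msg fields added in patches 2 and 3, QEMU's
msix_get_message() helper, and the hypothetical function name is only
for illustration:

static int nvme_sync_msi_route(NvmeCtrl *n, NvmeCQueue *cq)
{
    /* Re-read the message the guest has currently programmed. */
    MSIMessage msg = msix_get_message(&n->parent_obj, cq->vector);
    int ret;

    if (msg.address == cq->msg.address && msg.data == cq->msg.data) {
        return 0;   /* KVM route already matches the MSI-X table entry */
    }

    /* Point the existing virq at the new address/data pair. */
    ret = kvm_irqchip_update_msi_route(kvm_state, cq->virq, msg,
                                       &n->parent_obj);
    if (ret < 0) {
        return ret;
    }

    kvm_irqchip_commit_routes(kvm_state);
    cq->msg = msg;

    return 0;
}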




* Re: [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd
  2022-08-25 11:16     ` Jinhao Fan
@ 2022-08-25 11:56       ` Klaus Jensen
  2022-08-25 12:38         ` Klaus Jensen
  2022-08-25 14:05         ` Jinhao Fan
  0 siblings, 2 replies; 13+ messages in thread
From: Klaus Jensen @ 2022-08-25 11:56 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, kbusch, stefanha, open list:nvme


On Aug 25 19:16, Jinhao Fan wrote:
> On 8/25/2022 5:33 PM, Klaus Jensen wrote:
> > I'm still a bit perplexed by this issue, so I just tried moving
> > nvme_init_irq_notifier() to the end of nvme_init_cq() and removing this
> > first_io_cqe thing. I did not observe any particular issues?
> > 
> > What bad behavior did you encounter, it seems to work fine to me
> 
> The kernel boots up and got stuck, waiting for interrupts. Then the request
> times out and got retried three times. Finally the driver seems to decide
> that the drive is down and continues to boot.
> 
> I added some prints during debugging and found that the MSI-X message which
> got registered in KVM via kvm_irqchip_add_msi_route() is not the same as the
> one actually used in msix_notify().
> 
> Are you sure you are using KVM's irqfd?
> 

Pretty sure? Using "ioeventfd=on,irq-eventfd=on" on the controller.

And the following patch.


diff --git i/hw/nvme/ctrl.c w/hw/nvme/ctrl.c
index 30bbda7bb5ae..b2e41d3bd745 100644
--- i/hw/nvme/ctrl.c
+++ w/hw/nvme/ctrl.c
@@ -1490,21 +1490,6 @@ static void nvme_post_cqes(void *opaque)
             if (!pending) {
                 n->cq_pending++;
             }
-
-            if (unlikely(cq->first_io_cqe)) {
-                /*
-                 * Initilize event notifier when first cqe is posted. For irqfd 
-                 * support we need to register the MSI message in KVM. We
-                 * can not do this registration at CQ creation time because
-                 * Linux's NVMe driver changes the MSI message after CQ creation.
-                 */
-                cq->first_io_cqe = false;
-
-                if (n->params.irq_eventfd) {
-                    nvme_init_irq_notifier(n, cq);
-                }
-            }
-
         }
 
         nvme_irq_assert(n, cq);
@@ -4914,11 +4899,14 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     }
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
+
     /* 
      * Only enable irqfd for IO queues since we always emulate admin queue 
      * in main loop thread 
      */
-    cq->first_io_cqe = cqid != 0;
+    if (cqid && n->params.irq_eventfd) {
+        nvme_init_irq_notifier(n, cq);
+    }
 }




* Re: [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd
  2022-08-25 11:56       ` Klaus Jensen
@ 2022-08-25 12:38         ` Klaus Jensen
  2022-08-25 13:09           ` Jinhao Fan
  2022-08-25 14:05         ` Jinhao Fan
  1 sibling, 1 reply; 13+ messages in thread
From: Klaus Jensen @ 2022-08-25 12:38 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, kbusch, stefanha, open list:nvme


On Aug 25 13:56, Klaus Jensen wrote:
> On Aug 25 19:16, Jinhao Fan wrote:
> > On 8/25/2022 5:33 PM, Klaus Jensen wrote:
> > > I'm still a bit perplexed by this issue, so I just tried moving
> > > nvme_init_irq_notifier() to the end of nvme_init_cq() and removing this
> > > first_io_cqe thing. I did not observe any particular issues?
> > > 
> > > What bad behavior did you encounter, it seems to work fine to me
> > 
> > The kernel boots up and got stuck, waiting for interrupts. Then the request
> > times out and got retried three times. Finally the driver seems to decide
> > that the drive is down and continues to boot.
> > 
> > I added some prints during debugging and found that the MSI-X message which
> > got registered in KVM via kvm_irqchip_add_msi_route() is not the same as the
> > one actually used in msix_notify().
> > 
> > Are you sure you are using KVM's irqfd?
> > 
> 
> Pretty sure? Using "ioeventfd=on,irq-eventfd=on" on the controller.
> 
> And the following patch.
> 
> 
> diff --git i/hw/nvme/ctrl.c w/hw/nvme/ctrl.c
> index 30bbda7bb5ae..b2e41d3bd745 100644
> --- i/hw/nvme/ctrl.c
> +++ w/hw/nvme/ctrl.c
> @@ -1490,21 +1490,6 @@ static void nvme_post_cqes(void *opaque)
>              if (!pending) {
>                  n->cq_pending++;
>              }
> -
> -            if (unlikely(cq->first_io_cqe)) {
> -                /*
> -                 * Initilize event notifier when first cqe is posted. For irqfd 
> -                 * support we need to register the MSI message in KVM. We
> -                 * can not do this registration at CQ creation time because
> -                 * Linux's NVMe driver changes the MSI message after CQ creation.
> -                 */
> -                cq->first_io_cqe = false;
> -
> -                if (n->params.irq_eventfd) {
> -                    nvme_init_irq_notifier(n, cq);
> -                }
> -            }
> -
>          }
>  
>          nvme_irq_assert(n, cq);
> @@ -4914,11 +4899,14 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>      }
>      n->cq[cqid] = cq;
>      cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
> +
>      /* 
>       * Only enable irqfd for IO queues since we always emulate admin queue 
>       * in main loop thread 
>       */
> -    cq->first_io_cqe = cqid != 0;
> +    if (cqid && n->params.irq_eventfd) {
> +        nvme_init_irq_notifier(n, cq);
> +    }
>  }
> 
> 

From a trace, this is what I observe:

First, the queue is created and a virq (0) is assigned.

  msix_table_mmio_write dev nvme hwaddr 0xc val 0x0 size 4
  pci_nvme_mmio_write addr 0x1000 data 0x7 size 4
  pci_nvme_mmio_doorbell_sq sqid 0 new_tail 7
  pci_nvme_admin_cmd cid 4117 sqid 0 opc 0x5 opname 'NVME_ADM_CMD_CREATE_CQ'
  pci_nvme_create_cq create completion queue, addr=0x104318000, cqid=1, vector=1, qsize=1023, qflags=3, ien=1
  kvm_irqchip_add_msi_route dev nvme vector 1 virq 0
  kvm_irqchip_commit_routes
  pci_nvme_enqueue_req_completion cid 4117 cqid 0 dw0 0x0 dw1 0x0 status 0x0
  pci_nvme_irq_msix raising MSI-X IRQ vector 0
  pci_nvme_mmio_write addr 0x1004 data 0x7 size 4
  pci_nvme_mmio_doorbell_cq cqid 0 new_head 7

We go on and the SQ is created as well.

  pci_nvme_mmio_write addr 0x1000 data 0x8 size 4
  pci_nvme_mmio_doorbell_sq sqid 0 new_tail 8
  pci_nvme_admin_cmd cid 4118 sqid 0 opc 0x1 opname 'NVME_ADM_CMD_CREATE_SQ'
  pci_nvme_create_sq create submission queue, addr=0x1049a0000, sqid=1, cqid=1, qsize=1023, qflags=1
  pci_nvme_enqueue_req_completion cid 4118 cqid 0 dw0 0x0 dw1 0x0 status 0x0
  pci_nvme_irq_msix raising MSI-X IRQ vector 0
  pci_nvme_mmio_write addr 0x1004 data 0x8 size 4
  pci_nvme_mmio_doorbell_cq cqid 0 new_head 8


Then I get a bunch of update_msi_route traces, but the virqs are not
related to the nvme device.

However, I then assume we hit queue_request_irq() in the kernel and we
see the MSI-X table updated:

  msix_table_mmio_write dev nvme hwaddr 0x1c val 0x1 size 4
  msix_table_mmio_write dev nvme hwaddr 0x10 val 0xfee003f8 size 4
  msix_table_mmio_write dev nvme hwaddr 0x14 val 0x0 size 4
  msix_table_mmio_write dev nvme hwaddr 0x18 val 0x0 size 4
  msix_table_mmio_write dev nvme hwaddr 0x1c val 0x0 size 4
  kvm_irqchip_update_msi_route Updating MSI route virq=0
  ... other virq updates
  kvm_irqchip_commit_routes

Notice the last trace line. The route for virq 0 is updated.

It looks to me like the virq route is implicitly updated with the new
message, no?


* Re: [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd
  2022-08-25 12:38         ` Klaus Jensen
@ 2022-08-25 13:09           ` Jinhao Fan
  2022-08-25 13:59             ` Klaus Jensen
  0 siblings, 1 reply; 13+ messages in thread
From: Jinhao Fan @ 2022-08-25 13:09 UTC (permalink / raw)
  To: Klaus Jensen; +Cc: qemu-devel, kbusch, stefanha, open list:nvme





> On Aug 25, 2022, at 20:39, Klaus Jensen <its@irrelevant.dk> wrote:
> 
> On Aug 25 13:56, Klaus Jensen wrote:
>>> On Aug 25 19:16, Jinhao Fan wrote:
>>> On 8/25/2022 5:33 PM, Klaus Jensen wrote:
>>>> I'm still a bit perplexed by this issue, so I just tried moving
>>>> nvme_init_irq_notifier() to the end of nvme_init_cq() and removing this
>>>> first_io_cqe thing. I did not observe any particular issues?
>>>> 
>>>> What bad behavior did you encounter, it seems to work fine to me
>>> 
>>> The kernel boots up and got stuck, waiting for interrupts. Then the request
>>> times out and got retried three times. Finally the driver seems to decide
>>> that the drive is down and continues to boot.
>>> 
>>> I added some prints during debugging and found that the MSI-X message which
>>> got registered in KVM via kvm_irqchip_add_msi_route() is not the same as the
>>> one actually used in msix_notify().
>>> 
>>> Are you sure you are using KVM's irqfd?
>>> 
>> 
>> Pretty sure? Using "ioeventfd=on,irq-eventfd=on" on the controller.
>> 
>> And the following patch.
>> 
>> 
>> diff --git i/hw/nvme/ctrl.c w/hw/nvme/ctrl.c
>> index 30bbda7bb5ae..b2e41d3bd745 100644
>> --- i/hw/nvme/ctrl.c
>> +++ w/hw/nvme/ctrl.c
>> @@ -1490,21 +1490,6 @@ static void nvme_post_cqes(void *opaque)
>>             if (!pending) {
>>                 n->cq_pending++;
>>             }
>> -
>> -            if (unlikely(cq->first_io_cqe)) {
>> -                /*
>> -                 * Initilize event notifier when first cqe is posted. For irqfd 
>> -                 * support we need to register the MSI message in KVM. We
>> -                 * can not do this registration at CQ creation time because
>> -                 * Linux's NVMe driver changes the MSI message after CQ creation.
>> -                 */
>> -                cq->first_io_cqe = false;
>> -
>> -                if (n->params.irq_eventfd) {
>> -                    nvme_init_irq_notifier(n, cq);
>> -                }
>> -            }
>> -
>>         }
>> 
>>         nvme_irq_assert(n, cq);
>> @@ -4914,11 +4899,14 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>>     }
>>     n->cq[cqid] = cq;
>>     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
>> +
>>     /* 
>>      * Only enable irqfd for IO queues since we always emulate admin queue 
>>      * in main loop thread 
>>      */
>> -    cq->first_io_cqe = cqid != 0;
>> +    if (cqid && n->params.irq_eventfd) {
>> +        nvme_init_irq_notifier(n, cq);
>> +    }
>> }
>> 
>> 
> 
> From a trace, this is what I observe:
> 
> First, the queue is created and a virq (0) is assigned.
> 
>  msix_table_mmio_write dev nvme hwaddr 0xc val 0x0 size 4
>  pci_nvme_mmio_write addr 0x1000 data 0x7 size 4
>  pci_nvme_mmio_doorbell_sq sqid 0 new_tail 7
>  pci_nvme_admin_cmd cid 4117 sqid 0 opc 0x5 opname 'NVME_ADM_CMD_CREATE_CQ'
>  pci_nvme_create_cq create completion queue, addr=0x104318000, cqid=1, vector=1, qsize=1023, qflags=3, ien=1
>  kvm_irqchip_add_msi_route dev nvme vector 1 virq 0
>  kvm_irqchip_commit_routes
>  pci_nvme_enqueue_req_completion cid 4117 cqid 0 dw0 0x0 dw1 0x0 status 0x0
>  pci_nvme_irq_msix raising MSI-X IRQ vector 0
>  pci_nvme_mmio_write addr 0x1004 data 0x7 size 4
>  pci_nvme_mmio_doorbell_cq cqid 0 new_head 7
> 
> We go on and the SQ is created as well.
> 
>  pci_nvme_mmio_write addr 0x1000 data 0x8 size 4
>  pci_nvme_mmio_doorbell_sq sqid 0 new_tail 8
>  pci_nvme_admin_cmd cid 4118 sqid 0 opc 0x1 opname 'NVME_ADM_CMD_CREATE_SQ'
>  pci_nvme_create_sq create submission queue, addr=0x1049a0000, sqid=1, cqid=1, qsize=1023, qflags=1
>  pci_nvme_enqueue_req_completion cid 4118 cqid 0 dw0 0x0 dw1 0x0 status 0x0
>  pci_nvme_irq_msix raising MSI-X IRQ vector 0
>  pci_nvme_mmio_write addr 0x1004 data 0x8 size 4
>  pci_nvme_mmio_doorbell_cq cqid 0 new_head 8
> 
> 
> Then i get a bunch of update_msi_routes, but the virq's are not related
> to the nvme device.
> 
> However, I then assume we hit queue_request_irq() in the kernel and we
> see the MSI-X table updated:
> 
>  msix_table_mmio_write dev nvme hwaddr 0x1c val 0x1 size 4
>  msix_table_mmio_write dev nvme hwaddr 0x10 val 0xfee003f8 size 4
>  msix_table_mmio_write dev nvme hwaddr 0x14 val 0x0 size 4
>  msix_table_mmio_write dev nvme hwaddr 0x18 val 0x0 size 4
>  msix_table_mmio_write dev nvme hwaddr 0x1c val 0x0 size 4
>  kvm_irqchip_update_msi_route Updating MSI route virq=0
>  ... other virq updates
>  kvm_irqchip_commit_routes
> 
> Notice the last trace line. The route for virq 0 is updated.
> 
> Looks to me that the virq route is implicitly updated with the new
> message, no?

Could you try without the msix masking patch? I suspect our unmask function actually did the “implicit” update here.

> 


* Re: [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd
  2022-08-25 13:09           ` Jinhao Fan
@ 2022-08-25 13:59             ` Klaus Jensen
  2022-08-25 14:11               ` Jinhao Fan
  0 siblings, 1 reply; 13+ messages in thread
From: Klaus Jensen @ 2022-08-25 13:59 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, kbusch, stefanha, open list:nvme


On Aug 25 21:09, Jinhao Fan wrote:
> 
> 
> 
> > On Aug 25, 2022, at 20:39, Klaus Jensen <its@irrelevant.dk> wrote:
> > 
> > On Aug 25 13:56, Klaus Jensen wrote:
> >>> On Aug 25 19:16, Jinhao Fan wrote:
> >>> On 8/25/2022 5:33 PM, Klaus Jensen wrote:
> >>>> I'm still a bit perplexed by this issue, so I just tried moving
> >>>> nvme_init_irq_notifier() to the end of nvme_init_cq() and removing this
> >>>> first_io_cqe thing. I did not observe any particular issues?
> >>>> 
> >>>> What bad behavior did you encounter, it seems to work fine to me
> >>> 
> >>> The kernel boots up and got stuck, waiting for interrupts. Then the request
> >>> times out and got retried three times. Finally the driver seems to decide
> >>> that the drive is down and continues to boot.
> >>> 
> >>> I added some prints during debugging and found that the MSI-X message which
> >>> got registered in KVM via kvm_irqchip_add_msi_route() is not the same as the
> >>> one actually used in msix_notify().
> >>> 
> >>> Are you sure you are using KVM's irqfd?
> >>> 
> >> 
> >> Pretty sure? Using "ioeventfd=on,irq-eventfd=on" on the controller.
> >> 
> >> And the following patch.
> >> 
> >> 
> >> diff --git i/hw/nvme/ctrl.c w/hw/nvme/ctrl.c
> >> index 30bbda7bb5ae..b2e41d3bd745 100644
> >> --- i/hw/nvme/ctrl.c
> >> +++ w/hw/nvme/ctrl.c
> >> @@ -1490,21 +1490,6 @@ static void nvme_post_cqes(void *opaque)
> >>             if (!pending) {
> >>                 n->cq_pending++;
> >>             }
> >> -
> >> -            if (unlikely(cq->first_io_cqe)) {
> >> -                /*
> >> -                 * Initilize event notifier when first cqe is posted. For irqfd 
> >> -                 * support we need to register the MSI message in KVM. We
> >> -                 * can not do this registration at CQ creation time because
> >> -                 * Linux's NVMe driver changes the MSI message after CQ creation.
> >> -                 */
> >> -                cq->first_io_cqe = false;
> >> -
> >> -                if (n->params.irq_eventfd) {
> >> -                    nvme_init_irq_notifier(n, cq);
> >> -                }
> >> -            }
> >> -
> >>         }
> >> 
> >>         nvme_irq_assert(n, cq);
> >> @@ -4914,11 +4899,14 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
> >>     }
> >>     n->cq[cqid] = cq;
> >>     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
> >> +
> >>     /* 
> >>      * Only enable irqfd for IO queues since we always emulate admin queue 
> >>      * in main loop thread 
> >>      */
> >> -    cq->first_io_cqe = cqid != 0;
> >> +    if (cqid && n->params.irq_eventfd) {
> >> +        nvme_init_irq_notifier(n, cq);
> >> +    }
> >> }
> >> 
> >> 
> > 
> > From a trace, this is what I observe:
> > 
> > First, the queue is created and a virq (0) is assigned.
> > 
> >  msix_table_mmio_write dev nvme hwaddr 0xc val 0x0 size 4
> >  pci_nvme_mmio_write addr 0x1000 data 0x7 size 4
> >  pci_nvme_mmio_doorbell_sq sqid 0 new_tail 7
> >  pci_nvme_admin_cmd cid 4117 sqid 0 opc 0x5 opname 'NVME_ADM_CMD_CREATE_CQ'
> >  pci_nvme_create_cq create completion queue, addr=0x104318000, cqid=1, vector=1, qsize=1023, qflags=3, ien=1
> >  kvm_irqchip_add_msi_route dev nvme vector 1 virq 0
> >  kvm_irqchip_commit_routes
> >  pci_nvme_enqueue_req_completion cid 4117 cqid 0 dw0 0x0 dw1 0x0 status 0x0
> >  pci_nvme_irq_msix raising MSI-X IRQ vector 0
> >  pci_nvme_mmio_write addr 0x1004 data 0x7 size 4
> >  pci_nvme_mmio_doorbell_cq cqid 0 new_head 7
> > 
> > We go on and the SQ is created as well.
> > 
> >  pci_nvme_mmio_write addr 0x1000 data 0x8 size 4
> >  pci_nvme_mmio_doorbell_sq sqid 0 new_tail 8
> >  pci_nvme_admin_cmd cid 4118 sqid 0 opc 0x1 opname 'NVME_ADM_CMD_CREATE_SQ'
> >  pci_nvme_create_sq create submission queue, addr=0x1049a0000, sqid=1, cqid=1, qsize=1023, qflags=1
> >  pci_nvme_enqueue_req_completion cid 4118 cqid 0 dw0 0x0 dw1 0x0 status 0x0
> >  pci_nvme_irq_msix raising MSI-X IRQ vector 0
> >  pci_nvme_mmio_write addr 0x1004 data 0x8 size 4
> >  pci_nvme_mmio_doorbell_cq cqid 0 new_head 8
> > 
> > 
> > Then i get a bunch of update_msi_routes, but the virq's are not related
> > to the nvme device.
> > 
> > However, I then assume we hit queue_request_irq() in the kernel and we
> > see the MSI-X table updated:
> > 
> >  msix_table_mmio_write dev nvme hwaddr 0x1c val 0x1 size 4
> >  msix_table_mmio_write dev nvme hwaddr 0x10 val 0xfee003f8 size 4
> >  msix_table_mmio_write dev nvme hwaddr 0x14 val 0x0 size 4
> >  msix_table_mmio_write dev nvme hwaddr 0x18 val 0x0 size 4
> >  msix_table_mmio_write dev nvme hwaddr 0x1c val 0x0 size 4
> >  kvm_irqchip_update_msi_route Updating MSI route virq=0
> >  ... other virq updates
> >  kvm_irqchip_commit_routes
> > 
> > Notice the last trace line. The route for virq 0 is updated.
> > 
> > Looks to me that the virq route is implicitly updated with the new
> > message, no?
> 
> Could you try without the msix masking patch? I suspect our unmask function actually did the “implicit” update here.
> 
> > 

RIGHT.

target/i386/kvm/kvm.c:

5353     if (!notify_list_inited) {
5354         /* For the first time we do add route, add ourselves into
5355          * IOMMU's IEC notify list if needed. */
5356         X86IOMMUState *iommu = x86_iommu_get_default();
5357         if (iommu) {
5358             x86_iommu_iec_register_notifier(iommu,
5359                                             kvm_update_msi_routes_all,
5360                                             NULL);
5361         }
5362         notify_list_inited = true;
5363     }

If we have an IOMMU, then it all just works. I always run with a viommu
configured, so that is why I was not seeing the issue. The masking has
nothing to do with it.

I wonder if this can be made to work without the iommu as well...
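
(For reference, a vIOMMU setup like the one described above would
typically be something along the lines of "-machine
q35,accel=kvm,kernel-irqchip=split -device intel-iommu,intremap=on";
interrupt remapping in QEMU's intel-iommu requires the split irqchip,
and exact options may vary.)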


* Re: [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd
  2022-08-25 11:56       ` Klaus Jensen
  2022-08-25 12:38         ` Klaus Jensen
@ 2022-08-25 14:05         ` Jinhao Fan
  1 sibling, 0 replies; 13+ messages in thread
From: Jinhao Fan @ 2022-08-25 14:05 UTC (permalink / raw)
  To: Klaus Jensen; +Cc: qemu-devel, kbusch, stefanha, open list:nvme

On 8/25/2022 7:56 PM, Klaus Jensen wrote:
> On Aug 25 19:16, Jinhao Fan wrote:
>> On 8/25/2022 5:33 PM, Klaus Jensen wrote:
>>> I'm still a bit perplexed by this issue, so I just tried moving
>>> nvme_init_irq_notifier() to the end of nvme_init_cq() and removing this
>>> first_io_cqe thing. I did not observe any particular issues?
>>>
>>> What bad behavior did you encounter, it seems to work fine to me
>>
>> The kernel boots up and got stuck, waiting for interrupts. Then the request
>> times out and got retried three times. Finally the driver seems to decide
>> that the drive is down and continues to boot.
>>
>> I added some prints during debugging and found that the MSI-X message which
>> got registered in KVM via kvm_irqchip_add_msi_route() is not the same as the
>> one actually used in msix_notify().
>>
>> Are you sure you are using KVM's irqfd?
>>
> 
> Pretty sure? Using "ioeventfd=on,irq-eventfd=on" on the controller.
> 
> And the following patch.
> 
> 
> diff --git i/hw/nvme/ctrl.c w/hw/nvme/ctrl.c
> index 30bbda7bb5ae..b2e41d3bd745 100644
> --- i/hw/nvme/ctrl.c
> +++ w/hw/nvme/ctrl.c
> @@ -1490,21 +1490,6 @@ static void nvme_post_cqes(void *opaque)
>               if (!pending) {
>                   n->cq_pending++;
>               }
> -
> -            if (unlikely(cq->first_io_cqe)) {
> -                /*
> -                 * Initilize event notifier when first cqe is posted. For irqfd
> -                 * support we need to register the MSI message in KVM. We
> -                 * can not do this registration at CQ creation time because
> -                 * Linux's NVMe driver changes the MSI message after CQ creation.
> -                 */
> -                cq->first_io_cqe = false;
> -
> -                if (n->params.irq_eventfd) {
> -                    nvme_init_irq_notifier(n, cq);
> -                }
> -            }
> -
>           }
>   
>           nvme_irq_assert(n, cq);
> @@ -4914,11 +4899,14 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>       }
>       n->cq[cqid] = cq;
>       cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
> +
>       /*
>        * Only enable irqfd for IO queues since we always emulate admin queue
>        * in main loop thread
>        */
> -    cq->first_io_cqe = cqid != 0;
> +    if (cqid && n->params.irq_eventfd) {
> +        nvme_init_irq_notifier(n, cq);
> +    }
>   }
> 
> 

Interesting. This patch does not work well for me. Here is the kernel 
boot log related to NVMe:

[    0.625096] nvme nvme0: 4/0/0 default/read/poll queues
[    0.627386] IPI shorthand broadcast: enabled
[    0.627534] nvme nvme0: Ignoring bogus Namespace Identifiers
[    0.628167] sched_clock: Marking stable (528813377, 
99314943)->(636909773, -8781453)
[    0.629204] Spurious interrupt (vector 0xef) on CPU#0. Acked
....
[   30.901423] nvme nvme0: I/O 320 (Read) QID 2 timeout, aborting
[   30.902696] nvme nvme0: Abort status: 0x0

Notice the "spurious interrupt" line. I believe this means the MSI 
message is incorrect.




* Re: [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd
  2022-08-25 13:59             ` Klaus Jensen
@ 2022-08-25 14:11               ` Jinhao Fan
  0 siblings, 0 replies; 13+ messages in thread
From: Jinhao Fan @ 2022-08-25 14:11 UTC (permalink / raw)
  To: Klaus Jensen; +Cc: qemu-devel, kbusch, stefanha, open list:nvme

On 8/25/2022 9:59 PM, Klaus Jensen wrote:
> On Aug 25 21:09, Jinhao Fan wrote:
>>
>>
>>
>>> On Aug 25, 2022, at 20:39, Klaus Jensen <its@irrelevant.dk> wrote:
>>>
>>> On Aug 25 13:56, Klaus Jensen wrote:
>>>>> On Aug 25 19:16, Jinhao Fan wrote:
>>>>> On 8/25/2022 5:33 PM, Klaus Jensen wrote:
>>>>>> I'm still a bit perplexed by this issue, so I just tried moving
>>>>>> nvme_init_irq_notifier() to the end of nvme_init_cq() and removing this
>>>>>> first_io_cqe thing. I did not observe any particular issues?
>>>>>>
>>>>>> What bad behavior did you encounter, it seems to work fine to me
>>>>>
>>>>> The kernel boots up and got stuck, waiting for interrupts. Then the request
>>>>> times out and got retried three times. Finally the driver seems to decide
>>>>> that the drive is down and continues to boot.
>>>>>
>>>>> I added some prints during debugging and found that the MSI-X message which
>>>>> got registered in KVM via kvm_irqchip_add_msi_route() is not the same as the
>>>>> one actually used in msix_notify().
>>>>>
>>>>> Are you sure you are using KVM's irqfd?
>>>>>
>>>>
>>>> Pretty sure? Using "ioeventfd=on,irq-eventfd=on" on the controller.
>>>>
>>>> And the following patch.
>>>>
>>>>
>>>> diff --git i/hw/nvme/ctrl.c w/hw/nvme/ctrl.c
>>>> index 30bbda7bb5ae..b2e41d3bd745 100644
>>>> --- i/hw/nvme/ctrl.c
>>>> +++ w/hw/nvme/ctrl.c
>>>> @@ -1490,21 +1490,6 @@ static void nvme_post_cqes(void *opaque)
>>>>              if (!pending) {
>>>>                  n->cq_pending++;
>>>>              }
>>>> -
>>>> -            if (unlikely(cq->first_io_cqe)) {
>>>> -                /*
>>>> -                 * Initilize event notifier when first cqe is posted. For irqfd
>>>> -                 * support we need to register the MSI message in KVM. We
>>>> -                 * can not do this registration at CQ creation time because
>>>> -                 * Linux's NVMe driver changes the MSI message after CQ creation.
>>>> -                 */
>>>> -                cq->first_io_cqe = false;
>>>> -
>>>> -                if (n->params.irq_eventfd) {
>>>> -                    nvme_init_irq_notifier(n, cq);
>>>> -                }
>>>> -            }
>>>> -
>>>>          }
>>>>
>>>>          nvme_irq_assert(n, cq);
>>>> @@ -4914,11 +4899,14 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>>>>      }
>>>>      n->cq[cqid] = cq;
>>>>      cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
>>>> +
>>>>      /*
>>>>       * Only enable irqfd for IO queues since we always emulate admin queue
>>>>       * in main loop thread
>>>>       */
>>>> -    cq->first_io_cqe = cqid != 0;
>>>> +    if (cqid && n->params.irq_eventfd) {
>>>> +        nvme_init_irq_notifier(n, cq);
>>>> +    }
>>>> }
>>>>
>>>>
>>>
>>>  From a trace, this is what I observe:
>>>
>>> First, the queue is created and a virq (0) is assigned.
>>>
>>>   msix_table_mmio_write dev nvme hwaddr 0xc val 0x0 size 4
>>>   pci_nvme_mmio_write addr 0x1000 data 0x7 size 4
>>>   pci_nvme_mmio_doorbell_sq sqid 0 new_tail 7
>>>   pci_nvme_admin_cmd cid 4117 sqid 0 opc 0x5 opname 'NVME_ADM_CMD_CREATE_CQ'
>>>   pci_nvme_create_cq create completion queue, addr=0x104318000, cqid=1, vector=1, qsize=1023, qflags=3, ien=1
>>>   kvm_irqchip_add_msi_route dev nvme vector 1 virq 0
>>>   kvm_irqchip_commit_routes
>>>   pci_nvme_enqueue_req_completion cid 4117 cqid 0 dw0 0x0 dw1 0x0 status 0x0
>>>   pci_nvme_irq_msix raising MSI-X IRQ vector 0
>>>   pci_nvme_mmio_write addr 0x1004 data 0x7 size 4
>>>   pci_nvme_mmio_doorbell_cq cqid 0 new_head 7
>>>
>>> We go on and the SQ is created as well.
>>>
>>>   pci_nvme_mmio_write addr 0x1000 data 0x8 size 4
>>>   pci_nvme_mmio_doorbell_sq sqid 0 new_tail 8
>>>   pci_nvme_admin_cmd cid 4118 sqid 0 opc 0x1 opname 'NVME_ADM_CMD_CREATE_SQ'
>>>   pci_nvme_create_sq create submission queue, addr=0x1049a0000, sqid=1, cqid=1, qsize=1023, qflags=1
>>>   pci_nvme_enqueue_req_completion cid 4118 cqid 0 dw0 0x0 dw1 0x0 status 0x0
>>>   pci_nvme_irq_msix raising MSI-X IRQ vector 0
>>>   pci_nvme_mmio_write addr 0x1004 data 0x8 size 4
>>>   pci_nvme_mmio_doorbell_cq cqid 0 new_head 8
>>>
>>>
>>> Then i get a bunch of update_msi_routes, but the virq's are not related
>>> to the nvme device.
>>>
>>> However, I then assume we hit queue_request_irq() in the kernel and we
>>> see the MSI-X table updated:
>>>
>>>   msix_table_mmio_write dev nvme hwaddr 0x1c val 0x1 size 4
>>>   msix_table_mmio_write dev nvme hwaddr 0x10 val 0xfee003f8 size 4
>>>   msix_table_mmio_write dev nvme hwaddr 0x14 val 0x0 size 4
>>>   msix_table_mmio_write dev nvme hwaddr 0x18 val 0x0 size 4
>>>   msix_table_mmio_write dev nvme hwaddr 0x1c val 0x0 size 4
>>>   kvm_irqchip_update_msi_route Updating MSI route virq=0
>>>   ... other virq updates
>>>   kvm_irqchip_commit_routes
>>>
>>> Notice the last trace line. The route for virq 0 is updated.
>>>
>>> Looks to me that the virq route is implicitly updated with the new
>>> message, no?
>>
>> Could you try without the msix masking patch? I suspect our unmask function actually did the “implicit” update here.
>>
>>>
> 
> RIGHT.
> 
> target/i386/kvm/kvm.c:
> 
> 5353     if (!notify_list_inited) {
> 5354         /* For the first time we do add route, add ourselves into
> 5355          * IOMMU's IEC notify list if needed. */
> 5356         X86IOMMUState *iommu = x86_iommu_get_default();
> 5357         if (iommu) {
> 5358             x86_iommu_iec_register_notifier(iommu,
> 5359                                             kvm_update_msi_routes_all,
> 5360                                             NULL);
> 5361         }
> 5362         notify_list_inited = true;
> 5363     }
> 
> If we have an IOMMU, then it all just works. I always run with a viommu
> configured, so that is why I was not seeing the issue. The masking has
> nothing to do with it.
> 
> I wonder if this can be made to work without the iommu as well...

Yes, I didn't use IOMMU in my tests. Is it possible that there is some 
unstated requirement that irqfd only works with IOMMU enabled? :)




* [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd
  2022-08-26 15:12 [PATCH v2 0/3] iothread and irqfd support Jinhao Fan
@ 2022-08-26 15:12 ` Jinhao Fan
  0 siblings, 0 replies; 13+ messages in thread
From: Jinhao Fan @ 2022-08-26 15:12 UTC (permalink / raw)
  To: qemu-devel
  Cc: its, kbusch, stefanha, Jinhao Fan, Klaus Jensen, open list:nvme

When the new option 'irq-eventfd' is turned on, the IO emulation code
signals an eventfd when it wants to (de)assert an irq. The main loop
eventfd handler does the actual irq (de)assertion.  This paves the way
for iothread support since QEMU's interrupt emulation is not thread
safe.

Asserting and deasserting irqs with eventfd has some performance
implications. For small queue depths it increases request latency, but
for large queue depths it effectively coalesces irqs.

Comparison (KIOPS):

QD            1   4  16  64
QEMU         38 123 210 329
irq-eventfd  32 106 240 364
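
For reference, a minimal sketch of how the new knob could be enabled on
the command line; the image path, drive id and serial below are just
placeholders, and with irq-eventfd left at its default (off) the
behaviour is unchanged:

  qemu-system-x86_64 \
      -drive file=nvm.img,if=none,id=nvm \
      -device nvme,serial=deadbeef,drive=nvm,irq-eventfd=on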

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c | 120 ++++++++++++++++++++++++++++++++++++++++++-------
 hw/nvme/nvme.h |   3 ++
 2 files changed, 106 insertions(+), 17 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 87aeba0564..51792f3955 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -526,34 +526,57 @@ static void nvme_irq_check(NvmeCtrl *n)
     }
 }
 
+static void nvme_irq_do_assert(NvmeCtrl *n, NvmeCQueue *cq)
+{
+    if (msix_enabled(&(n->parent_obj))) {
+        trace_pci_nvme_irq_msix(cq->vector);
+        msix_notify(&(n->parent_obj), cq->vector);
+    } else {
+        trace_pci_nvme_irq_pin();
+        assert(cq->vector < 32);
+        n->irq_status |= 1 << cq->vector;
+        nvme_irq_check(n);
+    }
+}
+
 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
 {
     if (cq->irq_enabled) {
-        if (msix_enabled(&(n->parent_obj))) {
-            trace_pci_nvme_irq_msix(cq->vector);
-            msix_notify(&(n->parent_obj), cq->vector);
+        if (cq->assert_notifier.initialized) {
+            event_notifier_set(&cq->assert_notifier);
         } else {
-            trace_pci_nvme_irq_pin();
-            assert(cq->vector < 32);
-            n->irq_status |= 1 << cq->vector;
-            nvme_irq_check(n);
+            nvme_irq_do_assert(n, cq);
         }
     } else {
         trace_pci_nvme_irq_masked();
     }
 }
 
+static void nvme_irq_do_deassert(NvmeCtrl *n, NvmeCQueue *cq)
+{
+    if (msix_enabled(&(n->parent_obj))) {
+        return;
+    } else {
+        assert(cq->vector < 32);
+        if (!n->cq_pending) {
+            n->irq_status &= ~(1 << cq->vector);
+        }
+        nvme_irq_check(n);
+    }
+}
+
 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
 {
     if (cq->irq_enabled) {
-        if (msix_enabled(&(n->parent_obj))) {
-            return;
+        if (cq->deassert_notifier.initialized) {
+            /*
+             * The deassert notifier will only be initialized when MSI-X is NOT
+             * in use. Therefore no need to worry about extra eventfd syscall
+             * for pin-based interrupts.
+             */
+            event_notifier_set(&cq->deassert_notifier);
         } else {
-            assert(cq->vector < 32);
-            if (!n->cq_pending) {
-                n->irq_status &= ~(1 << cq->vector);
-            }
-            nvme_irq_check(n);
+            nvme_irq_do_deassert(n, cq);
         }
     }
 }
@@ -1338,6 +1361,50 @@ static void nvme_update_cq_head(NvmeCQueue *cq)
     trace_pci_nvme_shadow_doorbell_cq(cq->cqid, cq->head);
 }
 
+static void nvme_assert_notifier_read(EventNotifier *e)
+{
+    NvmeCQueue *cq = container_of(e, NvmeCQueue, assert_notifier);
+    if (event_notifier_test_and_clear(e)) {
+        nvme_irq_do_assert(cq->ctrl, cq);
+    }
+}
+
+static void nvme_deassert_notifier_read(EventNotifier *e)
+{
+    NvmeCQueue *cq = container_of(e, NvmeCQueue, deassert_notifier);
+    if (event_notifier_test_and_clear(e)) {
+        nvme_irq_do_deassert(cq->ctrl, cq);
+    }
+}
+
+static void nvme_init_irq_notifier(NvmeCtrl *n, NvmeCQueue *cq)
+{
+    int ret;
+
+    ret = event_notifier_init(&cq->assert_notifier, 0);
+    if (ret < 0) {
+        return;
+    }
+
+    event_notifier_set_handler(&cq->assert_notifier,
+                                nvme_assert_notifier_read);
+
+    if (!msix_enabled(&n->parent_obj)) {
+        ret = event_notifier_init(&cq->deassert_notifier, 0);
+        if (ret < 0) {
+            event_notifier_set_handler(&cq->assert_notifier, NULL);
+            event_notifier_cleanup(&cq->assert_notifier);
+
+            return;
+        }
+
+        event_notifier_set_handler(&cq->deassert_notifier,
+                                   nvme_deassert_notifier_read);
+    }
+
+    return;
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -1377,8 +1444,10 @@ static void nvme_post_cqes(void *opaque)
         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
     }
     if (cq->tail != cq->head) {
-        if (cq->irq_enabled && !pending) {
-            n->cq_pending++;
+        if (cq->irq_enabled) {
+            if (!pending) {
+                n->cq_pending++;
+            }
         }
 
         nvme_irq_assert(n, cq);
@@ -4705,6 +4774,14 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
         event_notifier_set_handler(&cq->notifier, NULL);
         event_notifier_cleanup(&cq->notifier);
     }
+    if (cq->assert_notifier.initialized) {
+        event_notifier_set_handler(&cq->assert_notifier, NULL);
+        event_notifier_cleanup(&cq->assert_notifier);
+    }
+    if (cq->deassert_notifier.initialized) {
+        event_notifier_set_handler(&cq->deassert_notifier, NULL);
+        event_notifier_cleanup(&cq->deassert_notifier);
+    }
     if (msix_enabled(&n->parent_obj)) {
         msix_vector_unuse(&n->parent_obj, cq->vector);
     }
@@ -4734,7 +4811,7 @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
         n->cq_pending--;
     }
 
-    nvme_irq_deassert(n, cq);
+    nvme_irq_do_deassert(n, cq);
     trace_pci_nvme_del_cq(qid);
     nvme_free_cq(cq, n);
     return NVME_SUCCESS;
@@ -4772,6 +4849,14 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     }
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
+
+    /*
+     * Only enable irq eventfd for IO queues since we always emulate admin
+     * queue in main loop thread.
+     */
+    if (cqid && n->params.irq_eventfd) {
+        nvme_init_irq_notifier(n, cq);
+    }
 }
 
 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
@@ -7671,6 +7756,7 @@ static Property nvme_props[] = {
     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
     DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
+    DEFINE_PROP_BOOL("irq-eventfd", NvmeCtrl, params.irq_eventfd, false),
     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                      params.auto_transition_zones, true),
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 79f5c281c2..4850d3e965 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -398,6 +398,8 @@ typedef struct NvmeCQueue {
     uint64_t    ei_addr;
     QEMUTimer   *timer;
     EventNotifier notifier;
+    EventNotifier assert_notifier;
+    EventNotifier deassert_notifier;
     bool        ioeventfd_enabled;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
     QTAILQ_HEAD(, NvmeRequest) req_list;
@@ -422,6 +424,7 @@ typedef struct NvmeParams {
     bool     auto_transition_zones;
     bool     legacy_cmb;
     bool     ioeventfd;
+    bool     irq_eventfd;
     uint8_t  sriov_max_vfs;
     uint16_t sriov_vq_flexible;
     uint16_t sriov_vi_flexible;
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-08-26 15:21 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-25  7:47 [PATCH v2 0/3] hw/nvme: add irqfd support Jinhao Fan
2022-08-25  7:47 ` [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd Jinhao Fan
2022-08-25  9:33   ` Klaus Jensen
2022-08-25 11:16     ` Jinhao Fan
2022-08-25 11:56       ` Klaus Jensen
2022-08-25 12:38         ` Klaus Jensen
2022-08-25 13:09           ` Jinhao Fan
2022-08-25 13:59             ` Klaus Jensen
2022-08-25 14:11               ` Jinhao Fan
2022-08-25 14:05         ` Jinhao Fan
2022-08-25  7:47 ` [PATCH v2 2/3] hw/nvme: use KVM irqfd when available Jinhao Fan
2022-08-25  7:47 ` [PATCH v2 3/3] hw/nvme: add MSI-x mask handlers for irqfd Jinhao Fan
2022-08-26 15:12 [PATCH v2 0/3] iothread and irqfd support Jinhao Fan
2022-08-26 15:12 ` [PATCH v2 1/3] hw/nvme: support irq(de)assertion with eventfd Jinhao Fan
