All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates
@ 2022-06-27 10:48 Jinhao Fan
  2022-06-29  9:04 ` Jinhao Fan
  2022-06-29 20:13 ` Klaus Jensen
  0 siblings, 2 replies; 6+ messages in thread
From: Jinhao Fan @ 2022-06-27 10:48 UTC (permalink / raw)
  To: qemu-devel; +Cc: its, kbusch, Jinhao Fan

Add property "ioeventfd" which is enabled by default. When this is
enabled, updates on the doorbell registers will cause KVM to signal
an event to the QEMU main loop to handle the doorbell updates.
Therefore, instead of letting the vcpu thread run both guest VM and
IO emulation, we now use the main loop thread to do IO emulation and
thus the vcpu thread has more cycles for the guest VM.

Since ioeventfd does not tell us the exact value that is written, it is
only useful when shadow doorbell buffer is enabled, where we check
for the value in the shadow doorbell buffer when we get the doorbell
update event.

IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)

qd           1   4  16  64
qemu        35 121 176 153
ioeventfd   41 133 258 313

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
 hw/nvme/ctrl.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/nvme/nvme.h |  5 +++
 2 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index c952c34f94..787b89f7d3 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
 
     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
-    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+
+    if (req->sq->ioeventfd_enabled) {
+        /* Post CQE directly since we are in main loop thread */
+        nvme_post_cqes(cq);
+    } else {
+        /* Schedule the timer to post CQE later since we are in vcpu thread */
+        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+    }
 }
 
 static void nvme_process_aers(void *opaque)
@@ -4195,10 +4202,74 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     return NVME_INVALID_OPCODE | NVME_DNR;
 }
 
+static void nvme_cq_notifier(EventNotifier *e)
+{
+    NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
+    NvmeCtrl *n = cq->ctrl;
+
+    event_notifier_test_and_clear(&cq->notifier);
+
+    nvme_update_cq_head(cq);
+
+    if (cq->tail == cq->head) {
+        if (cq->irq_enabled) {
+            n->cq_pending--;
+        }
+
+        nvme_irq_deassert(n, cq);
+    }
+
+    nvme_post_cqes(cq);
+}
+
+static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
+{
+    NvmeCtrl *n = cq->ctrl;
+    uint16_t offset = (cq->cqid << 3) + (1 << 2);
+    int ret;
+
+    if ((ret = event_notifier_init(&cq->notifier, 0))) {
+        return ret;
+    }
+
+    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+    memory_region_add_eventfd(&n->iomem,
+                              0x1000 + offset, 4, false, 0, &cq->notifier);
+    
+    return 0;
+}
+
+static void nvme_sq_notifier(EventNotifier *e)
+{
+    NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
+
+    event_notifier_test_and_clear(&sq->notifier);
+
+    nvme_process_sq(sq);
+}
+
+static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
+{
+    NvmeCtrl *n = sq->ctrl;
+    uint16_t offset = sq->sqid << 3;
+    int ret;
+
+    if ((ret = event_notifier_init(&sq->notifier, 0))) {
+        return ret;
+    }
+
+    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+    memory_region_add_eventfd(&n->iomem,
+                              0x1000 + offset, 4, false, 0, &sq->notifier);
+
+    return 0;
+}
+
 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
 {
     n->sq[sq->sqid] = NULL;
     timer_free(sq->timer);
+    event_notifier_cleanup(&sq->notifier);
     g_free(sq->io_req);
     if (sq->sqid) {
         g_free(sq);
@@ -4250,6 +4321,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
                          uint16_t sqid, uint16_t cqid, uint16_t size)
 {
     int i;
+    int ret;
     NvmeCQueue *cq;
 
     sq->ctrl = n;
@@ -4271,6 +4343,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
     if (n->dbbuf_enabled) {
         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
         sq->ei_addr = n->dbbuf_eis + (sqid << 3);
+            
+        if (n->params.ioeventfd && sq->sqid != 0) {
+            ret = nvme_init_sq_ioeventfd(sq);
+            sq->ioeventfd_enabled = ret == 0;
+        }
     }
 
     assert(n->cq[cqid]);
@@ -4577,6 +4654,7 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
 {
     n->cq[cq->cqid] = NULL;
     timer_free(cq->timer);
+    event_notifier_cleanup(&cq->notifier);
     if (msix_enabled(&n->parent_obj)) {
         msix_vector_unuse(&n->parent_obj, cq->vector);
     }
@@ -4635,6 +4713,11 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     if (n->dbbuf_enabled) {
         cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
         cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
+
+        if (n->params.ioeventfd && cqid != 0) {
+            ret = nvme_init_cq_ioeventfd(cq);
+            cq->ioeventfd_enabled = ret == 0;
+        }
     }
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -5793,6 +5876,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
     uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
     uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
     int i;
+    int ret;
 
     /* Address should be page aligned */
     if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
@@ -5818,6 +5902,11 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
             sq->ei_addr = eis_addr + (i << 3);
             pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
                     sizeof(sq->tail));
+            
+            if (n->params.ioeventfd && sq->sqid != 0) {
+                ret = nvme_init_sq_ioeventfd(sq);
+                sq->ioeventfd_enabled = ret == 0;
+            }
         }
 
         if (cq) {
@@ -5826,6 +5915,11 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
             cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
             pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
                     sizeof(cq->head));
+            
+            if (n->params.ioeventfd && cq->cqid != 0) {
+                ret = nvme_init_cq_ioeventfd(cq);
+                cq->ioeventfd_enabled = ret == 0;
+            }
         }
     }
 
@@ -7040,6 +7134,7 @@ static Property nvme_props[] = {
     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                      params.auto_transition_zones, true),
+    DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 4452e4b1bf..2a9beea0c8 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -369,6 +369,8 @@ typedef struct NvmeSQueue {
     uint64_t    db_addr;
     uint64_t    ei_addr;
     QEMUTimer   *timer;
+    EventNotifier notifier;
+    bool        ioeventfd_enabled;
     NvmeRequest *io_req;
     QTAILQ_HEAD(, NvmeRequest) req_list;
     QTAILQ_HEAD(, NvmeRequest) out_req_list;
@@ -388,6 +390,8 @@ typedef struct NvmeCQueue {
     uint64_t    db_addr;
     uint64_t    ei_addr;
     QEMUTimer   *timer;
+    EventNotifier notifier;
+    bool        ioeventfd_enabled;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
     QTAILQ_HEAD(, NvmeRequest) req_list;
 } NvmeCQueue;
@@ -410,6 +414,7 @@ typedef struct NvmeParams {
     uint8_t  zasl;
     bool     auto_transition_zones;
     bool     legacy_cmb;
+    bool     ioeventfd;
 } NvmeParams;
 
 typedef struct NvmeCtrl {
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates
  2022-06-27 10:48 [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates Jinhao Fan
@ 2022-06-29  9:04 ` Jinhao Fan
  2022-06-29 15:58   ` Keith Busch
  2022-06-29 20:13 ` Klaus Jensen
  1 sibling, 1 reply; 6+ messages in thread
From: Jinhao Fan @ 2022-06-29  9:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: its, kbusch

Ping~

> @@ -4271,6 +4343,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>     if (n->dbbuf_enabled) {
>         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
>         sq->ei_addr = n->dbbuf_eis + (sqid << 3);
> +            
> +        if (n->params.ioeventfd && sq->sqid != 0) {
> +            ret = nvme_init_sq_ioeventfd(sq);
> +            sq->ioeventfd_enabled = ret == 0;
> +        }
>     }
> 
>     assert(n->cq[cqid]);

Is this “ret == 0” a correct way to handle errors?

I’ve also been wondering whether using irqfd for sending interrupts can
bring some benefits. I’m not familiar with how QEMU emulates interrupts.
What do you think of irqfd’s?


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates
  2022-06-29  9:04 ` Jinhao Fan
@ 2022-06-29 15:58   ` Keith Busch
  2022-06-30  2:04     ` Jinhao Fan
  0 siblings, 1 reply; 6+ messages in thread
From: Keith Busch @ 2022-06-29 15:58 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, its

On Wed, Jun 29, 2022 at 05:04:25PM +0800, Jinhao Fan wrote:
> Ping~
> 
> > @@ -4271,6 +4343,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
> >     if (n->dbbuf_enabled) {
> >         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
> >         sq->ei_addr = n->dbbuf_eis + (sqid << 3);
> > +            
> > +        if (n->params.ioeventfd && sq->sqid != 0) {
> > +            ret = nvme_init_sq_ioeventfd(sq);
> > +            sq->ioeventfd_enabled = ret == 0;
> > +        }
> >     }
> > 
> >     assert(n->cq[cqid]);
> 
> Is this “ret == 0” a correct way to handle errors?

That looks correct: since the ioeventfd is an optional optimization, we don't need strict error handling here.

I would just suggest making this easier to read. For example, in
nvme_init_sq_ioeventfd(), instead of assigning within a conditional:

    if ((ret = event_notifier_init(&cq->notifier, 0))) {

Do each part separately:

    ret = event_notifier_init(&cq->notifier, 0);
    if (ret) {
 
> I’ve also been wondering whether using irqfd for sending interrupts can
> bring some benefits. I’m not familiar with how QEMU emulates interrupts.
> What do you think of irqfd’s?

Not sure about this mechanism, I'll need to look into it.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates
  2022-06-27 10:48 [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates Jinhao Fan
  2022-06-29  9:04 ` Jinhao Fan
@ 2022-06-29 20:13 ` Klaus Jensen
  2022-06-30  3:06   ` Jinhao Fan
  1 sibling, 1 reply; 6+ messages in thread
From: Klaus Jensen @ 2022-06-29 20:13 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, kbusch

[-- Attachment #1: Type: text/plain, Size: 8707 bytes --]

On Jun 27 18:48, Jinhao Fan wrote:
> Add property "ioeventfd" which is enabled by default. When this is
> enabled, updates on the doorbell registers will cause KVM to signal
> an event to the QEMU main loop to handle the doorbell updates.
> Therefore, instead of letting the vcpu thread run both guest VM and
> IO emulation, we now use the main loop thread to do IO emulation and
> thus the vcpu thread has more cycles for the guest VM.
> 
> Since ioeventfd does not tell us the exact value that is written, it is
> only useful when shadow doorbell buffer is enabled, where we check
> for the value in the shadow doorbell buffer when we get the doorbell
> update event.
> 
> IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)
> 
> qd           1   4  16  64
> qemu        35 121 176 153
> ioeventfd   41 133 258 313
> 
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
>  hw/nvme/ctrl.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/nvme/nvme.h |  5 +++
>  2 files changed, 101 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index c952c34f94..787b89f7d3 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
>  
>      QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
>      QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
> -    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +
> +    if (req->sq->ioeventfd_enabled) {
> +        /* Post CQE directly since we are in main loop thread */
> +        nvme_post_cqes(cq);
> +    } else {
> +        /* Schedule the timer to post CQE later since we are in vcpu thread */
> +        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +    }
>  }
>  
>  static void nvme_process_aers(void *opaque)
> @@ -4195,10 +4202,74 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>      return NVME_INVALID_OPCODE | NVME_DNR;
>  }
>  
> +static void nvme_cq_notifier(EventNotifier *e)
> +{
> +    NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
> +    NvmeCtrl *n = cq->ctrl;
> +
> +    event_notifier_test_and_clear(&cq->notifier);
> +
> +    nvme_update_cq_head(cq);
> +
> +    if (cq->tail == cq->head) {
> +        if (cq->irq_enabled) {
> +            n->cq_pending--;
> +        }
> +
> +        nvme_irq_deassert(n, cq);
> +    }
> +
> +    nvme_post_cqes(cq);
> +}
> +
> +static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
> +{
> +    NvmeCtrl *n = cq->ctrl;
> +    uint16_t offset = (cq->cqid << 3) + (1 << 2);
> +    int ret;
> +
> +    if ((ret = event_notifier_init(&cq->notifier, 0))) {
> +        return ret;
> +    }

Dont assign in conditionals and rely on the implicit value. It's too
error prone. Split into

  ret = event_notifier_init(&cq->notifier, 0);
  if (ret < 0) {
    return ret;
  }

> +
> +    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> +    memory_region_add_eventfd(&n->iomem,
> +                              0x1000 + offset, 4, false, 0, &cq->notifier);
> +    
> +    return 0;
> +}
> +
> +static void nvme_sq_notifier(EventNotifier *e)
> +{
> +    NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
> +
> +    event_notifier_test_and_clear(&sq->notifier);
> +
> +    nvme_process_sq(sq);
> +}
> +
> +static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> +{
> +    NvmeCtrl *n = sq->ctrl;
> +    uint16_t offset = sq->sqid << 3;
> +    int ret;
> +
> +    if ((ret = event_notifier_init(&sq->notifier, 0))) {
> +        return ret;
> +    }

Same as above.

> +
> +    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> +    memory_region_add_eventfd(&n->iomem,
> +                              0x1000 + offset, 4, false, 0, &sq->notifier);
> +
> +    return 0;
> +}
> +
>  static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
>  {
>      n->sq[sq->sqid] = NULL;
>      timer_free(sq->timer);
> +    event_notifier_cleanup(&sq->notifier);
>      g_free(sq->io_req);
>      if (sq->sqid) {
>          g_free(sq);
> @@ -4250,6 +4321,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>                           uint16_t sqid, uint16_t cqid, uint16_t size)
>  {
>      int i;
> +    int ret;
>      NvmeCQueue *cq;
>  
>      sq->ctrl = n;
> @@ -4271,6 +4343,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>      if (n->dbbuf_enabled) {
>          sq->db_addr = n->dbbuf_dbs + (sqid << 3);
>          sq->ei_addr = n->dbbuf_eis + (sqid << 3);
> +            
> +        if (n->params.ioeventfd && sq->sqid != 0) {
> +            ret = nvme_init_sq_ioeventfd(sq);
> +            sq->ioeventfd_enabled = ret == 0;
> +        }

Not using ret for anything really, so

  if (!nvme_init_sq_ioeventfd(sq)) {
    sq->ioeventfd_enabled = true;
  }

should do.

>      }
>  
>      assert(n->cq[cqid]);
> @@ -4577,6 +4654,7 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
>  {
>      n->cq[cq->cqid] = NULL;
>      timer_free(cq->timer);
> +    event_notifier_cleanup(&cq->notifier);
>      if (msix_enabled(&n->parent_obj)) {
>          msix_vector_unuse(&n->parent_obj, cq->vector);
>      }
> @@ -4635,6 +4713,11 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>      if (n->dbbuf_enabled) {
>          cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
>          cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
> +
> +        if (n->params.ioeventfd && cqid != 0) {
> +            ret = nvme_init_cq_ioeventfd(cq);
> +            cq->ioeventfd_enabled = ret == 0;
> +        }
>      }
>      n->cq[cqid] = cq;
>      cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
> @@ -5793,6 +5876,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>      uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
>      uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
>      int i;
> +    int ret;
>  
>      /* Address should be page aligned */
>      if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
> @@ -5818,6 +5902,11 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>              sq->ei_addr = eis_addr + (i << 3);
>              pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
>                      sizeof(sq->tail));
> +            
> +            if (n->params.ioeventfd && sq->sqid != 0) {
> +                ret = nvme_init_sq_ioeventfd(sq);
> +                sq->ioeventfd_enabled = ret == 0;
> +            }

Same as above.

>          }
>  
>          if (cq) {
> @@ -5826,6 +5915,11 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>              cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
>              pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
>                      sizeof(cq->head));
> +            
> +            if (n->params.ioeventfd && cq->cqid != 0) {
> +                ret = nvme_init_cq_ioeventfd(cq);
> +                cq->ioeventfd_enabled = ret == 0;
> +            }
>          }
>      }
>  
> @@ -7040,6 +7134,7 @@ static Property nvme_props[] = {
>      DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
>      DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
>                       params.auto_transition_zones, true),
> +    DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 4452e4b1bf..2a9beea0c8 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -369,6 +369,8 @@ typedef struct NvmeSQueue {
>      uint64_t    db_addr;
>      uint64_t    ei_addr;
>      QEMUTimer   *timer;
> +    EventNotifier notifier;
> +    bool        ioeventfd_enabled;
>      NvmeRequest *io_req;
>      QTAILQ_HEAD(, NvmeRequest) req_list;
>      QTAILQ_HEAD(, NvmeRequest) out_req_list;
> @@ -388,6 +390,8 @@ typedef struct NvmeCQueue {
>      uint64_t    db_addr;
>      uint64_t    ei_addr;
>      QEMUTimer   *timer;
> +    EventNotifier notifier;
> +    bool        ioeventfd_enabled;
>      QTAILQ_HEAD(, NvmeSQueue) sq_list;
>      QTAILQ_HEAD(, NvmeRequest) req_list;
>  } NvmeCQueue;
> @@ -410,6 +414,7 @@ typedef struct NvmeParams {
>      uint8_t  zasl;
>      bool     auto_transition_zones;
>      bool     legacy_cmb;
> +    bool     ioeventfd;
>  } NvmeParams;
>  
>  typedef struct NvmeCtrl {
> -- 
> 2.25.1
> 

Otherwise, looks good!

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates
  2022-06-29 15:58   ` Keith Busch
@ 2022-06-30  2:04     ` Jinhao Fan
  0 siblings, 0 replies; 6+ messages in thread
From: Jinhao Fan @ 2022-06-30  2:04 UTC (permalink / raw)
  To: Keith Busch; +Cc: qemu-devel, Klaus Jensen


> That looks correct: since the ioeventfd is an optional optimization, we don't need strict error handling here.
> 
> I would just suggest making this easier to read. For example, in
> nvme_init_sq_ioeventfd(), instead of assigning within a conditional:
> 
>    if ((ret = event_notifier_init(&cq->notifier, 0))) {
> 
> Do each part separately:
> 
>    ret = event_notifier_init(&cq->notifier, 0);
>    if (ret) {

Thanks for the comment! Will change in the next version.

> 
>> I’ve also been wondering whether using irqfd for sending interrupts can
>> bring some benefits. I’m not familiar with how QEMU emulates interrupts.
>> What do you think of irqfd’s?
> 
> Not sure about this mechanism, I'll need to look into it.

Since irqfd is internally a counter, I guess it may be able to “coalesce”
interrupts so that performance can be improved. I will try it tomorrow.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates
  2022-06-29 20:13 ` Klaus Jensen
@ 2022-06-30  3:06   ` Jinhao Fan
  0 siblings, 0 replies; 6+ messages in thread
From: Jinhao Fan @ 2022-06-30  3:06 UTC (permalink / raw)
  To: Klaus Jensen; +Cc: qemu-devel, kbusch

at 4:13 AM, Klaus Jensen <its@irrelevant.dk> wrote:

> On Jun 27 18:48, Jinhao Fan wrote:
>> Add property "ioeventfd" which is enabled by default. When this is
>> enabled, updates on the doorbell registers will cause KVM to signal
>> an event to the QEMU main loop to handle the doorbell updates.
>> Therefore, instead of letting the vcpu thread run both guest VM and
>> IO emulation, we now use the main loop thread to do IO emulation and
>> thus the vcpu thread has more cycles for the guest VM.
>> 
>> Since ioeventfd does not tell us the exact value that is written, it is
>> only useful when shadow doorbell buffer is enabled, where we check
>> for the value in the shadow doorbell buffer when we get the doorbell
>> update event.
>> 
>> IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)
>> 
>> qd           1   4  16  64
>> qemu        35 121 176 153
>> ioeventfd   41 133 258 313
>> 
>> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
>> ---
>> hw/nvme/ctrl.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++-
>> hw/nvme/nvme.h |  5 +++
>> 2 files changed, 101 insertions(+), 1 deletion(-)
>> 
>> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
>> index c952c34f94..787b89f7d3 100644
>> --- a/hw/nvme/ctrl.c
>> +++ b/hw/nvme/ctrl.c
>> @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
>> 
>>     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
>>     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
>> -    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>> +
>> +    if (req->sq->ioeventfd_enabled) {
>> +        /* Post CQE directly since we are in main loop thread */
>> +        nvme_post_cqes(cq);
>> +    } else {
>> +        /* Schedule the timer to post CQE later since we are in vcpu thread */
>> +        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>> +    }
>> }
>> 
>> static void nvme_process_aers(void *opaque)
>> @@ -4195,10 +4202,74 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>>     return NVME_INVALID_OPCODE | NVME_DNR;
>> }
>> 
>> +static void nvme_cq_notifier(EventNotifier *e)
>> +{
>> +    NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
>> +    NvmeCtrl *n = cq->ctrl;
>> +
>> +    event_notifier_test_and_clear(&cq->notifier);
>> +
>> +    nvme_update_cq_head(cq);
>> +
>> +    if (cq->tail == cq->head) {
>> +        if (cq->irq_enabled) {
>> +            n->cq_pending--;
>> +        }
>> +
>> +        nvme_irq_deassert(n, cq);
>> +    }
>> +
>> +    nvme_post_cqes(cq);
>> +}
>> +
>> +static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
>> +{
>> +    NvmeCtrl *n = cq->ctrl;
>> +    uint16_t offset = (cq->cqid << 3) + (1 << 2);
>> +    int ret;
>> +
>> +    if ((ret = event_notifier_init(&cq->notifier, 0))) {
>> +        return ret;
>> +    }
> 
> Dont assign in conditionals and rely on the implicit value. It's too
> error prone. Split into
> 
>  ret = event_notifier_init(&cq->notifier, 0);
>  if (ret < 0) {
>    return ret;
>  }
> 
>> +
>> +    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
>> +    memory_region_add_eventfd(&n->iomem,
>> +                              0x1000 + offset, 4, false, 0, &cq->notifier);
>> +    
>> +    return 0;
>> +}
>> +
>> +static void nvme_sq_notifier(EventNotifier *e)
>> +{
>> +    NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
>> +
>> +    event_notifier_test_and_clear(&sq->notifier);
>> +
>> +    nvme_process_sq(sq);
>> +}
>> +
>> +static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
>> +{
>> +    NvmeCtrl *n = sq->ctrl;
>> +    uint16_t offset = sq->sqid << 3;
>> +    int ret;
>> +
>> +    if ((ret = event_notifier_init(&sq->notifier, 0))) {
>> +        return ret;
>> +    }
> 
> Same as above.
> 
>> +
>> +    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
>> +    memory_region_add_eventfd(&n->iomem,
>> +                              0x1000 + offset, 4, false, 0, &sq->notifier);
>> +
>> +    return 0;
>> +}
>> +
>> static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
>> {
>>     n->sq[sq->sqid] = NULL;
>>     timer_free(sq->timer);
>> +    event_notifier_cleanup(&sq->notifier);
>>     g_free(sq->io_req);
>>     if (sq->sqid) {
>>         g_free(sq);
>> @@ -4250,6 +4321,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>>                          uint16_t sqid, uint16_t cqid, uint16_t size)
>> {
>>     int i;
>> +    int ret;
>>     NvmeCQueue *cq;
>> 
>>     sq->ctrl = n;
>> @@ -4271,6 +4343,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>>     if (n->dbbuf_enabled) {
>>         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
>>         sq->ei_addr = n->dbbuf_eis + (sqid << 3);
>> +            
>> +        if (n->params.ioeventfd && sq->sqid != 0) {
>> +            ret = nvme_init_sq_ioeventfd(sq);
>> +            sq->ioeventfd_enabled = ret == 0;
>> +        }
> 
> Not using ret for anything really, so
> 
>  if (!nvme_init_sq_ioeventfd(sq)) {
>    sq->ioeventfd_enabled = true;
>  }
> 
> should do.
> 
>>     }
>> 
>>     assert(n->cq[cqid]);
>> @@ -4577,6 +4654,7 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
>> {
>>     n->cq[cq->cqid] = NULL;
>>     timer_free(cq->timer);
>> +    event_notifier_cleanup(&cq->notifier);
>>     if (msix_enabled(&n->parent_obj)) {
>>         msix_vector_unuse(&n->parent_obj, cq->vector);
>>     }
>> @@ -4635,6 +4713,11 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>>     if (n->dbbuf_enabled) {
>>         cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
>>         cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
>> +
>> +        if (n->params.ioeventfd && cqid != 0) {
>> +            ret = nvme_init_cq_ioeventfd(cq);
>> +            cq->ioeventfd_enabled = ret == 0;
>> +        }
>>     }
>>     n->cq[cqid] = cq;
>>     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
>> @@ -5793,6 +5876,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>>     uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
>>     uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
>>     int i;
>> +    int ret;
>> 
>>     /* Address should be page aligned */
>>     if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
>> @@ -5818,6 +5902,11 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>>             sq->ei_addr = eis_addr + (i << 3);
>>             pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
>>                     sizeof(sq->tail));
>> +            
>> +            if (n->params.ioeventfd && sq->sqid != 0) {
>> +                ret = nvme_init_sq_ioeventfd(sq);
>> +                sq->ioeventfd_enabled = ret == 0;
>> +            }
> 
> Same as above.
> 
>>         }
>> 
>>         if (cq) {
>> @@ -5826,6 +5915,11 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>>             cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
>>             pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
>>                     sizeof(cq->head));
>> +            
>> +            if (n->params.ioeventfd && cq->cqid != 0) {
>> +                ret = nvme_init_cq_ioeventfd(cq);
>> +                cq->ioeventfd_enabled = ret == 0;
>> +            }
>>         }
>>     }
>> 
>> @@ -7040,6 +7134,7 @@ static Property nvme_props[] = {
>>     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
>>     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
>>                      params.auto_transition_zones, true),
>> +    DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
>>     DEFINE_PROP_END_OF_LIST(),
>> };
>> 
>> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
>> index 4452e4b1bf..2a9beea0c8 100644
>> --- a/hw/nvme/nvme.h
>> +++ b/hw/nvme/nvme.h
>> @@ -369,6 +369,8 @@ typedef struct NvmeSQueue {
>>     uint64_t    db_addr;
>>     uint64_t    ei_addr;
>>     QEMUTimer   *timer;
>> +    EventNotifier notifier;
>> +    bool        ioeventfd_enabled;
>>     NvmeRequest *io_req;
>>     QTAILQ_HEAD(, NvmeRequest) req_list;
>>     QTAILQ_HEAD(, NvmeRequest) out_req_list;
>> @@ -388,6 +390,8 @@ typedef struct NvmeCQueue {
>>     uint64_t    db_addr;
>>     uint64_t    ei_addr;
>>     QEMUTimer   *timer;
>> +    EventNotifier notifier;
>> +    bool        ioeventfd_enabled;
>>     QTAILQ_HEAD(, NvmeSQueue) sq_list;
>>     QTAILQ_HEAD(, NvmeRequest) req_list;
>> } NvmeCQueue;
>> @@ -410,6 +414,7 @@ typedef struct NvmeParams {
>>     uint8_t  zasl;
>>     bool     auto_transition_zones;
>>     bool     legacy_cmb;
>> +    bool     ioeventfd;
>> } NvmeParams;
>> 
>> typedef struct NvmeCtrl {
>> -- 
>> 2.25.1
> 
> Otherwise, looks good!

Hi Klaus,

Thanks for the comments. I will post v2 soon.


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-06-30  3:07 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-27 10:48 [PATCH] hw/nvme: Use ioeventfd to handle doorbell updates Jinhao Fan
2022-06-29  9:04 ` Jinhao Fan
2022-06-29 15:58   ` Keith Busch
2022-06-30  2:04     ` Jinhao Fan
2022-06-29 20:13 ` Klaus Jensen
2022-06-30  3:06   ` Jinhao Fan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.