All of lore.kernel.org
 help / color / mirror / Atom feed
From: Stefan Hajnoczi <stefanha@gmail.com>
To: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Cc: qemu-devel <qemu-devel@nongnu.org>,
	Klaus Birkelund Jensen <its@irrelevant.dk>,
	 Keith Busch <kbusch@kernel.org>
Subject: Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
Date: Tue, 26 Jul 2022 08:35:29 -0400	[thread overview]
Message-ID: <CAJSP0QWAK-x4inYNPWBrd+KJDKPszjkCsadH+Ov5=DcDF6=eFA@mail.gmail.com> (raw)
In-Reply-To: <20220705142403.101539-1-fanjinhao21s@ict.ac.cn>

On Tue, 5 Jul 2022 at 10:25, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:
>
> Add property "ioeventfd" which is enabled by default. When this is
> enabled, updates on the doorbell registers will cause KVM to signal
> an event to the QEMU main loop to handle the doorbell updates.
> Therefore, instead of letting the vcpu thread run both guest VM and
> IO emulation, we now use the main loop thread to do IO emulation and
> thus the vcpu thread has more cycles for the guest VM.
>
> Since ioeventfd does not tell us the exact value that is written, it is
> only useful when shadow doorbell buffer is enabled, where we check
> for the value in the shadow doorbell buffer when we get the doorbell
> update event.
>
> IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)
>
> qd           1   4  16  64
> qemu        35 121 176 153
> ioeventfd   41 133 258 313

Nice

>
> Changes since v3:
>  - Do not deregister ioeventfd when it was not enabled on a SQ/CQ
>
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
>  hw/nvme/ctrl.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/nvme/nvme.h |   5 +++
>  2 files changed, 118 insertions(+), 1 deletion(-)
>
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index c952c34f94..4b75c5f549 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
>
>      QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
>      QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
> -    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +
> +    if (req->sq->ioeventfd_enabled) {
> +        /* Post CQE directly since we are in main loop thread */
> +        nvme_post_cqes(cq);
> +    } else {
> +        /* Schedule the timer to post CQE later since we are in vcpu thread */
> +        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +    }

This change is somewhat independent of ioeventfd. I wonder how much of
the performance improvement is due to the sq ioeventfd and how much is
due to bypassing the completion timer.

In the qd=1 case I expect the completion timer to increase latency and
hurt performance. In the qd=64 case the timer increases batching,
reduces CPU consumption, and probably improves performance.

It would be nice to measure this in isolation, independently of
ioeventfd/IOThreads.

>  }
>
>  static void nvme_process_aers(void *opaque)
> @@ -4195,10 +4202,82 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>      return NVME_INVALID_OPCODE | NVME_DNR;
>  }
>
> +static void nvme_cq_notifier(EventNotifier *e)
> +{
> +    NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
> +    NvmeCtrl *n = cq->ctrl;
> +
> +    event_notifier_test_and_clear(&cq->notifier);
> +
> +    nvme_update_cq_head(cq);
> +
> +    if (cq->tail == cq->head) {
> +        if (cq->irq_enabled) {
> +            n->cq_pending--;
> +        }
> +
> +        nvme_irq_deassert(n, cq);
> +    }
> +
> +    nvme_post_cqes(cq);
> +}
> +
> +static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
> +{
> +    NvmeCtrl *n = cq->ctrl;
> +    uint16_t offset = (cq->cqid << 3) + (1 << 2);

Too many...

> +    int ret;
> +
> +    ret = event_notifier_init(&cq->notifier, 0);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> +    memory_region_add_eventfd(&n->iomem,
> +                              0x1000 + offset, 4, false, 0, &cq->notifier);

...magic numbers. Please use constants so it's clear what << 3, (1 <<
2), 0x1000, etc mean.

> +
> +    return 0;
> +}
> +
> +static void nvme_sq_notifier(EventNotifier *e)
> +{
> +    NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
> +
> +    event_notifier_test_and_clear(&sq->notifier);
> +
> +    nvme_process_sq(sq);
> +}
> +
> +static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> +{
> +    NvmeCtrl *n = sq->ctrl;
> +    uint16_t offset = sq->sqid << 3;
> +    int ret;
> +
> +    ret = event_notifier_init(&sq->notifier, 0);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> +    memory_region_add_eventfd(&n->iomem,
> +                              0x1000 + offset, 4, false, 0, &sq->notifier);
> +
> +    return 0;
> +}
> +
>  static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
>  {
> +    uint16_t offset = sq->sqid << 3;
> +
>      n->sq[sq->sqid] = NULL;
>      timer_free(sq->timer);
> +    if (sq->ioeventfd_enabled) {
> +        memory_region_del_eventfd(&n->iomem,
> +                                  0x1000 + offset, 4, false, 0, &sq->notifier);
> +        event_notifier_cleanup(&sq->notifier);
> +    }
>      g_free(sq->io_req);
>      if (sq->sqid) {
>          g_free(sq);
> @@ -4271,6 +4350,12 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>      if (n->dbbuf_enabled) {
>          sq->db_addr = n->dbbuf_dbs + (sqid << 3);
>          sq->ei_addr = n->dbbuf_eis + (sqid << 3);
> +
> +        if (n->params.ioeventfd && sq->sqid != 0) {
> +            if (!nvme_init_sq_ioeventfd(sq)) {
> +                sq->ioeventfd_enabled = true;
> +            }
> +        }
>      }
>
>      assert(n->cq[cqid]);
> @@ -4575,8 +4660,15 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
>
>  static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
>  {
> +    uint16_t offset = (cq->cqid << 3) + (1 << 2);
> +
>      n->cq[cq->cqid] = NULL;
>      timer_free(cq->timer);
> +    if (cq->ioeventfd_enabled) {
> +        memory_region_del_eventfd(&n->iomem,
> +                                  0x1000 + offset, 4, false, 0, &cq->notifier);
> +        event_notifier_cleanup(&cq->notifier);
> +    }
>      if (msix_enabled(&n->parent_obj)) {
>          msix_vector_unuse(&n->parent_obj, cq->vector);
>      }
> @@ -4635,6 +4727,12 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>      if (n->dbbuf_enabled) {
>          cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
>          cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
> +
> +        if (n->params.ioeventfd && cqid != 0) {
> +            if (!nvme_init_cq_ioeventfd(cq)) {
> +                cq->ioeventfd_enabled = true;
> +            }
> +        }
>      }
>      n->cq[cqid] = cq;
>      cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
> @@ -5793,6 +5891,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>      uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
>      uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
>      int i;
> +    int ret;
>
>      /* Address should be page aligned */
>      if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
> @@ -5818,6 +5917,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>              sq->ei_addr = eis_addr + (i << 3);
>              pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
>                      sizeof(sq->tail));
> +
> +            if (n->params.ioeventfd && sq->sqid != 0) {
> +                if (!nvme_init_sq_ioeventfd(sq)) {
> +                    sq->ioeventfd_enabled = true;
> +                }
> +            }
>          }
>
>          if (cq) {
> @@ -5826,6 +5931,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>              cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
>              pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
>                      sizeof(cq->head));
> +
> +            if (n->params.ioeventfd && cq->cqid != 0) {
> +                if (!nvme_init_cq_ioeventfd(cq)) {
> +                    cq->ioeventfd_enabled = true;
> +                }
> +            }
>          }
>      }
>
> @@ -7040,6 +7151,7 @@ static Property nvme_props[] = {
>      DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
>      DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
>                       params.auto_transition_zones, true),
> +    DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 4452e4b1bf..2a9beea0c8 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -369,6 +369,8 @@ typedef struct NvmeSQueue {
>      uint64_t    db_addr;
>      uint64_t    ei_addr;
>      QEMUTimer   *timer;
> +    EventNotifier notifier;
> +    bool        ioeventfd_enabled;
>      NvmeRequest *io_req;
>      QTAILQ_HEAD(, NvmeRequest) req_list;
>      QTAILQ_HEAD(, NvmeRequest) out_req_list;
> @@ -388,6 +390,8 @@ typedef struct NvmeCQueue {
>      uint64_t    db_addr;
>      uint64_t    ei_addr;
>      QEMUTimer   *timer;
> +    EventNotifier notifier;
> +    bool        ioeventfd_enabled;
>      QTAILQ_HEAD(, NvmeSQueue) sq_list;
>      QTAILQ_HEAD(, NvmeRequest) req_list;
>  } NvmeCQueue;
> @@ -410,6 +414,7 @@ typedef struct NvmeParams {
>      uint8_t  zasl;
>      bool     auto_transition_zones;
>      bool     legacy_cmb;
> +    bool     ioeventfd;
>  } NvmeParams;
>
>  typedef struct NvmeCtrl {
> --
> 2.25.1
>
>


      parent reply	other threads:[~2022-07-26 12:41 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-05 14:24 [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates Jinhao Fan
2022-07-05 17:11 ` Klaus Jensen
2022-07-05 18:43   ` Keith Busch
2022-07-06 11:34     ` Jinhao Fan
2022-07-07  5:51       ` Klaus Jensen
2022-07-07  8:50         ` Jinhao Fan
2022-07-06 10:57   ` Jinhao Fan
2022-07-09  3:06 ` Jinhao Fan
2022-07-12 12:23   ` Klaus Jensen
2022-07-14  5:35     ` Klaus Jensen
2022-07-25 20:55 ` Klaus Jensen
2022-07-26  7:35   ` Jinhao Fan
2022-07-26  7:41     ` Klaus Jensen
2022-07-26  7:55       ` Jinhao Fan
2022-07-26  9:19         ` Klaus Jensen
2022-07-26 10:09           ` Klaus Jensen
2022-07-26 11:24             ` Klaus Jensen
2022-07-26 11:32               ` Klaus Jensen
2022-07-26 12:08                 ` Klaus Jensen
2022-07-27  7:06                   ` Klaus Jensen
2022-07-27  8:16                     ` Jinhao Fan
2022-07-27  8:21                       ` Klaus Jensen
2022-07-27 15:09                         ` Jinhao Fan
2022-07-26 12:35 ` Stefan Hajnoczi [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAJSP0QWAK-x4inYNPWBrd+KJDKPszjkCsadH+Ov5=DcDF6=eFA@mail.gmail.com' \
    --to=stefanha@gmail.com \
    --cc=fanjinhao21s@ict.ac.cn \
    --cc=its@irrelevant.dk \
    --cc=kbusch@kernel.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.