* [RFC] hw/nvme: Use irqfd to send interrupts
@ 2022-07-09  4:35 Jinhao Fan
  2022-07-12 12:26 ` Klaus Jensen
                   ` (3 more replies)
  0 siblings, 4 replies; 23+ messages in thread
From: Jinhao Fan @ 2022-07-09  4:35 UTC (permalink / raw)
  To: qemu-devel; +Cc: its, kbusch, Jinhao Fan

Use irqfd to directly notify KVM to inject interrupts. This is done by
registering a virtual IRQ (virq) in KVM and associating the virq with an
irqfd, so that KVM can inject the interrupt directly when it receives a
notification on the irqfd. This approach is expected to improve
performance because it bypasses QEMU's MSI interrupt emulation logic.

However, I did not see an obvious improvement in the emulated KIOPS:

QD      1   4  16  64 
QEMU   38 123 210 329
irqfd  40 129 219 328

I found this problem quite hard to diagnose since irqfd's workflow
involves both QEMU and the in-kernel KVM. 

Could you help me figure out the following questions:

1. How much performance improvement can I expect from using irqfd?
2. How can I debug this kind of cross-QEMU/KVM problem?

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
 hw/nvme/ctrl.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/nvme/nvme.h |  3 +++
 2 files changed, 68 insertions(+), 1 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 4b75c5f549..59768c4586 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -159,6 +159,7 @@
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/hostmem.h"
+#include "sysemu/kvm.h"
 #include "hw/pci/msix.h"
 #include "migration/vmstate.h"
 
@@ -484,12 +485,70 @@ static void nvme_irq_check(NvmeCtrl *n)
     }
 }
 
+static int nvme_kvm_msix_vector_use(NvmeCtrl *n,
+                                    NvmeCQueue *cq,
+                                    int vector)
+{
+    int ret;
+
+    KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
+    ret = kvm_irqchip_add_msi_route(&c, vector, &n->parent_obj);
+    if (ret < 0) {
+        return ret;
+    }
+    kvm_irqchip_commit_route_changes(&c);
+    cq->virq = ret;
+    return 0;
+}
+
+static int nvme_init_cq_irqfd(NvmeCQueue *cq)
+{
+    int ret;
+    
+    ret = nvme_kvm_msix_vector_use(cq->ctrl, cq, (int)cq->vector);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = event_notifier_init(&cq->irq_notifier, 0);
+    if (ret < 0) {
+        goto fail_notifier;
+    }
+
+    ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier,
+                                              NULL, cq->virq);
+    if (ret < 0) {
+        goto fail_kvm;
+    }
+
+    return 0;
+                        
+fail_kvm:
+    event_notifier_cleanup(&cq->irq_notifier);
+fail_notifier:
+    kvm_irqchip_release_virq(kvm_state, cq->virq);
+fail:
+    return ret;
+}
+
 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
 {
     if (cq->irq_enabled) {
         if (msix_enabled(&(n->parent_obj))) {
+            /* Initialize CQ irqfd */
+            if (!cq->irqfd_enabled && n->params.ioeventfd && cq->cqid != 0) {
+                int ret = nvme_init_cq_irqfd(cq);
+                if (ret == 0) {
+                    cq->irqfd_enabled = true;
+                }
+            }
+
             trace_pci_nvme_irq_msix(cq->vector);
-            msix_notify(&(n->parent_obj), cq->vector);
+            if (cq->irqfd_enabled) {
+                event_notifier_set(&cq->irq_notifier);
+            } else {
+                msix_notify(&(n->parent_obj), cq->vector);
+            }
         } else {
             trace_pci_nvme_irq_pin();
             assert(cq->vector < 32);
@@ -4670,6 +4729,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
         event_notifier_cleanup(&cq->notifier);
     }
     if (msix_enabled(&n->parent_obj)) {
+        if (cq->irqfd_enabled) {
+            kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier,
+                                                  cq->virq);
+            kvm_irqchip_release_virq(kvm_state, cq->virq);
+            event_notifier_cleanup(&cq->irq_notifier);
+        }
         msix_vector_unuse(&n->parent_obj, cq->vector);
     }
     if (cq->cqid) {
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 2a9beea0c8..84e5b00fe3 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -391,7 +391,10 @@ typedef struct NvmeCQueue {
     uint64_t    ei_addr;
     QEMUTimer   *timer;
     EventNotifier notifier;
+    EventNotifier irq_notifier;
+    int         virq;
     bool        ioeventfd_enabled;
+    bool        irqfd_enabled;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
     QTAILQ_HEAD(, NvmeRequest) req_list;
 } NvmeCQueue;
-- 
2.25.1




* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-09  4:35 [RFC] hw/nvme: Use irqfd to send interrupts Jinhao Fan
@ 2022-07-12 12:26 ` Klaus Jensen
  2022-07-14  4:18   ` Klaus Jensen
  2022-07-20  9:19 ` Jinhao Fan
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 23+ messages in thread
From: Klaus Jensen @ 2022-07-12 12:26 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, kbusch


On Jul  9 12:35, Jinhao Fan wrote:
> Use irqfd to directly notify KVM to inject interrupts. This is done by
> registering a virtual IRQ(virq) in KVM and associate the virq with an
> irqfd, so that KVM can directly inject the interrupt when it receives
> notification from the irqfd. This approach is supposed to improve 
> performance because it bypasses QEMU's MSI interrupt emulation logic.
> 
> However, I did not see an obvious improvement of the emulation KIOPS:
> 
> QD      1   4  16  64 
> QEMU   38 123 210 329
> irqfd  40 129 219 328
> 
> I found this problem quite hard to diagnose since irqfd's workflow
> involves both QEMU and the in-kernel KVM. 
> 
> Could you help me figure out the following questions:
> 
> 1. How much performance improvement can I expect from using irqfd?

This is a level of QEMU/KVM that I am by no means an expert on and I
would have to let the broader QEMU community comment on this.

> 2. How can I debug this kind of cross QEMU-KVM problems?

Not sure how to directly "debug" it, but there is `perf kvm` to get
information about what is happening in the kvm subsystem.


* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-12 12:26 ` Klaus Jensen
@ 2022-07-14  4:18   ` Klaus Jensen
  2022-07-14 14:31     ` Jinhao Fan
  0 siblings, 1 reply; 23+ messages in thread
From: Klaus Jensen @ 2022-07-14  4:18 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, kbusch


On Jul 12 14:26, Klaus Jensen wrote:
> On Jul  9 12:35, Jinhao Fan wrote:
> > Use irqfd to directly notify KVM to inject interrupts. This is done by
> > registering a virtual IRQ(virq) in KVM and associate the virq with an
> > irqfd, so that KVM can directly inject the interrupt when it receives
> > notification from the irqfd. This approach is supposed to improve 
> > performance because it bypasses QEMU's MSI interrupt emulation logic.
> > 
> > However, I did not see an obvious improvement of the emulation KIOPS:
> > 
> > QD      1   4  16  64 
> > QEMU   38 123 210 329
> > irqfd  40 129 219 328
> > 
> > I found this problem quite hard to diagnose since irqfd's workflow
> > involves both QEMU and the in-kernel KVM. 
> > 
> > Could you help me figure out the following questions:
> > 
> > 1. How much performance improvement can I expect from using irqfd?
> 
> This is a level of QEMU/KVM that I am by no means an expert on and I
> would have to let the broader QEMU community comment on this.
> 

In any case, I'm wary of adding this level of KVM dependence in the
device. This won't work on non-KVM platforms anymore.

I think you should put irqfd on hold and focus on iothreads :)


* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-14  4:18   ` Klaus Jensen
@ 2022-07-14 14:31     ` Jinhao Fan
  0 siblings, 0 replies; 23+ messages in thread
From: Jinhao Fan @ 2022-07-14 14:31 UTC (permalink / raw)
  To: Klaus Jensen; +Cc: qemu-devel, Keith Busch

at 12:18 PM, Klaus Jensen <its@irrelevant.dk> wrote:

> On Jul 12 14:26, Klaus Jensen wrote:
>> On Jul  9 12:35, Jinhao Fan wrote:
>>> Use irqfd to directly notify KVM to inject interrupts. This is done by
>>> registering a virtual IRQ(virq) in KVM and associate the virq with an
>>> irqfd, so that KVM can directly inject the interrupt when it receives
>>> notification from the irqfd. This approach is supposed to improve 
>>> performance because it bypasses QEMU's MSI interrupt emulation logic.
>>> 
>>> However, I did not see an obvious improvement of the emulation KIOPS:
>>> 
>>> QD      1   4  16  64 
>>> QEMU   38 123 210 329
>>> irqfd  40 129 219 328
>>> 
>>> I found this problem quite hard to diagnose since irqfd's workflow
>>> involves both QEMU and the in-kernel KVM. 
>>> 
>>> Could you help me figure out the following questions:
>>> 
>>> 1. How much performance improvement can I expect from using irqfd?
>> 
>> This is a level of QEMU/KVM that I am by no means an expert on and I
>> would have to let the broader QEMU community comment on this.
> 
> In any case, I'm wary about adding this level of kvm-dependence in the
> device. This wont work on non-kvm platforms any more.

Yes, irqfd seems useful only on KVM-based systems. Maybe it is more suitable
for vhost- or VFIO-based solutions, which need irqfd to deliver interrupts.

> I think you should put irqfd on hold and focus on iothreads :)

I'm working on iothread support currently, but I also observed a performance
regression with iothread enabled. I found that ftrace, which is supported by
both QEMU and KVM, seems well suited to analyzing performance issues, and I'm
currently exploring it.




* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-09  4:35 [RFC] hw/nvme: Use irqfd to send interrupts Jinhao Fan
  2022-07-12 12:26 ` Klaus Jensen
@ 2022-07-20  9:19 ` Jinhao Fan
  2022-07-20 10:21 ` Stefan Hajnoczi
  2022-08-08  2:23 ` Jinhao Fan
  3 siblings, 0 replies; 23+ messages in thread
From: Jinhao Fan @ 2022-07-20  9:19 UTC (permalink / raw)
  To: qemu-devel, stefanha; +Cc: Klaus Jensen, Keith Busch

at 12:35 PM, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:

> Use irqfd to directly notify KVM to inject interrupts. This is done by
> registering a virtual IRQ(virq) in KVM and associate the virq with an
> irqfd, so that KVM can directly inject the interrupt when it receives
> notification from the irqfd. This approach is supposed to improve 
> performance because it bypasses QEMU's MSI interrupt emulation logic.
> 
> However, I did not see an obvious improvement of the emulation KIOPS:
> 
> QD      1   4  16  64 
> QEMU   38 123 210 329
> irqfd  40 129 219 328
> 
> I found this problem quite hard to diagnose since irqfd's workflow
> involves both QEMU and the in-kernel KVM. 
> 
> Could you help me figure out the following questions:
> 
> 1. How much performance improvement can I expect from using irqfd?
> 2. How can I debug this kind of cross QEMU-KVM problems?
> 
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
> hw/nvme/ctrl.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-
> hw/nvme/nvme.h |  3 +++
> 2 files changed, 68 insertions(+), 1 deletions(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 4b75c5f549..59768c4586 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -159,6 +159,7 @@
> #include "sysemu/sysemu.h"
> #include "sysemu/block-backend.h"
> #include "sysemu/hostmem.h"
> +#include "sysemu/kvm.h"
> #include "hw/pci/msix.h"
> #include "migration/vmstate.h"
> 
> @@ -484,12 +485,70 @@ static void nvme_irq_check(NvmeCtrl *n)
>     }
> }
> 
> +static int nvme_kvm_msix_vector_use(NvmeCtrl *n,
> +                                    NvmeCQueue *cq,
> +                                    int vector)
> +{
> +    int ret;
> +
> +    KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
> +    ret = kvm_irqchip_add_msi_route(&c, vector, &n->parent_obj);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    kvm_irqchip_commit_route_changes(&c);
> +    cq->virq = ret;
> +    return 0;
> +}
> +
> +static int nvme_init_cq_irqfd(NvmeCQueue *cq)
> +{
> +    int ret;
> +    
> +    ret = nvme_kvm_msix_vector_use(cq->ctrl, cq, (int)cq->vector);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    ret = event_notifier_init(&cq->irq_notifier, 0);
> +    if (ret < 0) {
> +        goto fail_notifier;
> +    }
> +
> +    ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier,
> +                                              NULL, cq->virq);
> +    if (ret < 0) {
> +        goto fail_kvm;
> +    }
> +
> +    return 0;
> +                        
> +fail_kvm:
> +    event_notifier_cleanup(&cq->irq_notifier);
> +fail_notifier:
> +    kvm_irqchip_release_virq(kvm_state, cq->virq);
> +fail:
> +    return ret;
> +}
> +
> static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
> {
>     if (cq->irq_enabled) {
>         if (msix_enabled(&(n->parent_obj))) {
> +            /* Initialize CQ irqfd */
> +            if (!cq->irqfd_enabled && n->params.ioeventfd && cq->cqid != 0) {
> +                int ret = nvme_init_cq_irqfd(cq);
> +                if (ret == 0) {
> +                    cq->irqfd_enabled = true;
> +                }
> +            }
> +
>             trace_pci_nvme_irq_msix(cq->vector);
> -            msix_notify(&(n->parent_obj), cq->vector);
> +            if (cq->irqfd_enabled) {
> +                event_notifier_set(&cq->irq_notifier);
> +            } else {
> +                msix_notify(&(n->parent_obj), cq->vector);
> +            }
>         } else {
>             trace_pci_nvme_irq_pin();
>             assert(cq->vector < 32);
> @@ -4670,6 +4729,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
>         event_notifier_cleanup(&cq->notifier);
>     }
>     if (msix_enabled(&n->parent_obj)) {
> +        if (cq->irqfd_enabled) {
> +            kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier,
> +                                                  cq->virq);
> +            kvm_irqchip_release_virq(kvm_state, cq->virq);
> +            event_notifier_cleanup(&cq->irq_notifier);
> +        }
>         msix_vector_unuse(&n->parent_obj, cq->vector);
>     }
>     if (cq->cqid) {
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 2a9beea0c8..84e5b00fe3 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -391,7 +391,10 @@ typedef struct NvmeCQueue {
>     uint64_t    ei_addr;
>     QEMUTimer   *timer;
>     EventNotifier notifier;
> +    EventNotifier irq_notifier;
> +    int         virq;
>     bool        ioeventfd_enabled;
> +    bool        irqfd_enabled;
>     QTAILQ_HEAD(, NvmeSQueue) sq_list;
>     QTAILQ_HEAD(, NvmeRequest) req_list;
> } NvmeCQueue;
> -- 
> 2.25.1

Hi Stefan,

It seems you originally introduced irqfd to virtio-blk to solve thread
safety problems [1]. Could you help explain the benefits of irqfd?

Thanks!
Jinhao Fan



* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-09  4:35 [RFC] hw/nvme: Use irqfd to send interrupts Jinhao Fan
  2022-07-12 12:26 ` Klaus Jensen
  2022-07-20  9:19 ` Jinhao Fan
@ 2022-07-20 10:21 ` Stefan Hajnoczi
  2022-07-21  2:36   ` Jinhao Fan
  2022-08-02  4:03   ` Jinhao Fan
  2022-08-08  2:23 ` Jinhao Fan
  3 siblings, 2 replies; 23+ messages in thread
From: Stefan Hajnoczi @ 2022-07-20 10:21 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, Klaus Birkelund Jensen, Keith Busch


On Sat, Jul 9, 2022, 00:36 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:

> Use irqfd to directly notify KVM to inject interrupts. This is done by
> registering a virtual IRQ(virq) in KVM and associate the virq with an
> irqfd, so that KVM can directly inject the interrupt when it receives
> notification from the irqfd. This approach is supposed to improve
> performance because it bypasses QEMU's MSI interrupt emulation logic.
>
> However, I did not see an obvious improvement of the emulation KIOPS:
>
> QD      1   4  16  64
> QEMU   38 123 210 329
> irqfd  40 129 219 328
>
> I found this problem quite hard to diagnose since irqfd's workflow
> involves both QEMU and the in-kernel KVM.
>
> Could you help me figure out the following questions:
>
> 1. How much performance improvement can I expect from using irqfd?
>

Hi Jinhao,
Thanks for working on this!

irqfd is not necessarily faster than KVM ioctl interrupt injection.

There are at least two non performance reasons for irqfd:
1. It avoids QEMU emulation code, which historically was not thread safe
and needed the Big QEMU Lock. IOThreads don't hold the BQL and therefore
cannot safely call the regular interrupt emulation code in QEMU. I think
this is still true today although parts of the code may now be less reliant
on the BQL.
2. The eventfd interface decouples interrupt injection from the KVM ioctl
interface. Vhost kernel and vhost-user device emulation code has no
dependency on KVM thanks to irqfd. They work with any eventfd, including
irqfd.
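A minimal sketch of what point 2 means from the device side (an illustration,
not part of the patch; it assumes QEMU's EventNotifier API and the
cq->irq_notifier field this RFC adds):

/*
 * Sketch only: the signalling side has no KVM dependency. Whether the read
 * side of the eventfd is a KVM irqfd, a vhost call fd, or a plain eventfd
 * polled by QEMU's main loop is decided elsewhere, at setup time.
 */
static void nvme_cq_signal_irq(NvmeCQueue *cq)
{
    event_notifier_set(&cq->irq_notifier);
}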

2. How can I debug this kind of cross QEMU-KVM problems?
>

perf(1) is good at observing both kernel and userspace activity together.
What is it that you want to debug?


> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
>  hw/nvme/ctrl.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/nvme/nvme.h |  3 +++
>  2 files changed, 68 insertions(+), 1 deletions(-)
>
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 4b75c5f549..59768c4586 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -159,6 +159,7 @@
>  #include "sysemu/sysemu.h"
>  #include "sysemu/block-backend.h"
>  #include "sysemu/hostmem.h"
> +#include "sysemu/kvm.h"
>  #include "hw/pci/msix.h"
>  #include "migration/vmstate.h"
>
> @@ -484,12 +485,70 @@ static void nvme_irq_check(NvmeCtrl *n)
>      }
>  }
>
> +static int nvme_kvm_msix_vector_use(NvmeCtrl *n,
> +                                    NvmeCQueue *cq,
> +                                    int vector)
> +{
> +    int ret;
> +
> +    KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
> +    ret = kvm_irqchip_add_msi_route(&c, vector, &n->parent_obj);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    kvm_irqchip_commit_route_changes(&c);
> +    cq->virq = ret;
> +    return 0;
> +}
> +
> +static int nvme_init_cq_irqfd(NvmeCQueue *cq)
> +{
> +    int ret;
> +
> +    ret = nvme_kvm_msix_vector_use(cq->ctrl, cq, (int)cq->vector);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    ret = event_notifier_init(&cq->irq_notifier, 0);
> +    if (ret < 0) {
> +        goto fail_notifier;
> +    }
> +
> +    ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier,
> +                                              NULL, cq->virq);
> +    if (ret < 0) {
> +        goto fail_kvm;
> +    }
> +
> +    return 0;
> +
> +fail_kvm:
> +    event_notifier_cleanup(&cq->irq_notifier);
> +fail_notifier:
> +    kvm_irqchip_release_virq(kvm_state, cq->virq);
> +fail:
> +    return ret;
> +}
> +
>  static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
>  {
>      if (cq->irq_enabled) {
>          if (msix_enabled(&(n->parent_obj))) {
> +            /* Initialize CQ irqfd */
> +            if (!cq->irqfd_enabled && n->params.ioeventfd && cq->cqid !=
> 0) {
> +                int ret = nvme_init_cq_irqfd(cq);
> +                if (ret == 0) {
> +                    cq->irqfd_enabled = true;
> +                }
> +            }
> +
>              trace_pci_nvme_irq_msix(cq->vector);
> -            msix_notify(&(n->parent_obj), cq->vector);
> +            if (cq->irqfd_enabled) {
> +                event_notifier_set(&cq->irq_notifier);
>

What happens when the MSI-X vector is masked?

I remember the VIRTIO code having masking support. I'm on my phone and
can't check now, but I think it registers a temporary eventfd and buffers
irqs while the vector is masked.

This makes me wonder if the VIRTIO and NVMe IOThread irqfd code can be
unified. Maybe IOThread support can be built into the core device emulation
code (e.g. irq APIs) so that it's not necessary to duplicate it.

+            } else {
> +                msix_notify(&(n->parent_obj), cq->vector);
> +            }
>          } else {
>              trace_pci_nvme_irq_pin();
>              assert(cq->vector < 32);
> @@ -4670,6 +4729,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl
> *n)
>          event_notifier_cleanup(&cq->notifier);
>      }
>      if (msix_enabled(&n->parent_obj)) {
> +        if (cq->irqfd_enabled) {
> +            kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state,
> &cq->irq_notifier,
> +                                                  cq->virq);
> +            kvm_irqchip_release_virq(kvm_state, cq->virq);
> +            event_notifier_cleanup(&cq->irq_notifier);
> +        }
>          msix_vector_unuse(&n->parent_obj, cq->vector);
>      }
>      if (cq->cqid) {
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 2a9beea0c8..84e5b00fe3 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -391,7 +391,10 @@ typedef struct NvmeCQueue {
>      uint64_t    ei_addr;
>      QEMUTimer   *timer;
>      EventNotifier notifier;
> +    EventNotifier irq_notifier;
> +    int         virq;
>      bool        ioeventfd_enabled;
> +    bool        irqfd_enabled;
>      QTAILQ_HEAD(, NvmeSQueue) sq_list;
>      QTAILQ_HEAD(, NvmeRequest) req_list;
>  } NvmeCQueue;
> --
> 2.25.1
>
>
>


* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-20 10:21 ` Stefan Hajnoczi
@ 2022-07-21  2:36   ` Jinhao Fan
  2022-07-21 13:29     ` Stefan Hajnoczi
  2022-08-02  4:03   ` Jinhao Fan
  1 sibling, 1 reply; 23+ messages in thread
From: Jinhao Fan @ 2022-07-21  2:36 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: qemu-devel, Klaus Birkelund Jensen, Keith Busch

Hi Stefan,

Thanks for the detailed explanation! 

at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:

> Hi Jinhao,
> Thanks for working on this!
> 
> irqfd is not necessarily faster than KVM ioctl interrupt injection.
> 
> There are at least two non performance reasons for irqfd:
> 1. It avoids QEMU emulation code, which historically was not thread safe and needed the Big QEMU Lock. IOThreads don't hold the BQL and therefore cannot safely call the regular interrupt emulation code in QEMU. I think this is still true today although parts of the code may now be less reliant on the BQL.

This probably means we need to move to irqfd when iothread support is added
in qemu-nvme.

> 2. The eventfd interface decouples interrupt injection from the KVM ioctl interface. Vhost kernel and vhost-user device emulation code has no dependency on KVM thanks to irqfd. They work with any eventfd, including irqfd.

This is contrary to our original belief. Klaus once pointed out that irqfd
is KVM-specific. I agreed with him since I found that the irqfd implementation
is in virt/kvm/eventfd.c. But irqfd indeed avoids the KVM ioctl call. Could you
elaborate on what “no dependency on KVM” means?

> 2. How can I debug this kind of cross QEMU-KVM problems?
> 
> perf(1) is good at observing both kernel and userspace activity together. What is it that you want to debug.
> 

I’ll look into perf(1). What I was trying to do is a breakdown analysis of
which part causes the latency: for example, what is the root cause of the
performance improvements or regressions when irqfd is turned on.

> What happens when the MSI-X vector is masked?
> 
> I remember the VIRTIO code having masking support. I'm on my phone and can't check now, but I think it registers a temporary eventfd and buffers irqs while the vector is masked.

Yes, this RFC ignored interrupt masking support. 

> 
> This makes me wonder if the VIRTIO and NVMe IOThread irqfd code can be unified. Maybe IOThread support can be built into the core device emulation code (e.g. irq APIs) so that it's not necessary to duplicate it.
> 

Agreed. Recently, when working on ioeventfd, iothread and polling support, my
typical workflow has been to look at how virtio does it and adapt that code
for nvme. I think unifying their IOThread code would be beneficial, since
VIRTIO has incorporated many optimizations over the years that nvme cannot
directly benefit from. But I fear that subtle differences between the two
protocols may make the unification challenging.

Again, thanks for your help :)



* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-21  2:36   ` Jinhao Fan
@ 2022-07-21 13:29     ` Stefan Hajnoczi
  2022-07-24 15:20       ` Jinhao Fan
  2022-07-27  7:18       ` Klaus Jensen
  0 siblings, 2 replies; 23+ messages in thread
From: Stefan Hajnoczi @ 2022-07-21 13:29 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, Klaus Birkelund Jensen, Keith Busch


On Wed, Jul 20, 2022, 22:36 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:

> Hi Stefan,
>
> Thanks for the detailed explanation!
>
> at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>
> > Hi Jinhao,
> > Thanks for working on this!
> >
> > irqfd is not necessarily faster than KVM ioctl interrupt injection.
> >
> > There are at least two non performance reasons for irqfd:
> > 1. It avoids QEMU emulation code, which historically was not thread safe
> and needed the Big QEMU Lock. IOThreads don't hold the BQL and therefore
> cannot safely call the regular interrupt emulation code in QEMU. I think
> this is still true today although parts of the code may now be less reliant
> on the BQL.
>
> This probably means we need to move to irqfd when iothread support is added
> in qemu-nvme.
>

Yes. You can audit the interrupt code but I'm pretty sure there is shared
state that needs to be protected by the BQL. So the NVMe emulation code
probably needs to use irqfd to avoid the interrupt emulation code.


> > 2. The eventfd interface decouples interrupt injection from the KVM
> ioctl interface. Vhost kernel and vhost-user device emulation code has no
> dependency on KVM thanks to irqfd. They work with any eventfd, including
> irqfd.
>
> This is contrary to our original belief. Klaus once pointed out that irqfd
> is KVM specific. I agreed with him since I found irqfd implementation is in
> virt/kvm/eventfd.c. But irqfd indeed avoids the KVM ioctl call. Could you
> elaborate on what “no dependency on KVM” means?
>

"They work with any eventfd, including irqfd"

If you look at the vhost kernel or vhost-user code, you'll see they just
signal the eventfd. It doesn't have to be an irqfd.

An irqfd is a specific type of eventfd that the KVM kernel module
implements to inject interrupts when the eventfd is signaled.

By the way, this not only decouples vhost from the KVM kernel module, but
also allows QEMU to emulate MSI-X masking via buffering the interrupt in
userspace.
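For reference, a rough sketch of what that looks like at the KVM ioctl level
(assuming the KVM_IRQFD ioctl and struct kvm_irqfd from <linux/kvm.h>; QEMU
wraps this in kvm_irqchip_add_irqfd_notifier_gsi(), so the patch never issues
the ioctl itself):

#include <linux/kvm.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* vm_fd: an existing KVM VM fd; gsi: a GSI that already has an MSI route */
static int attach_irqfd(int vm_fd, int gsi)
{
    int efd = eventfd(0, EFD_CLOEXEC);
    if (efd < 0) {
        return -1;
    }

    struct kvm_irqfd irqfd = { .fd = efd, .gsi = gsi };
    if (ioctl(vm_fd, KVM_IRQFD, &irqfd) < 0) {
        close(efd);
        return -1;
    }

    /* from now on, any writer injects the interrupt without further ioctls:
     *     uint64_t v = 1;
     *     write(efd, &v, sizeof(v));
     */
    return efd;
}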


> > 2. How can I debug this kind of cross QEMU-KVM problems?
> >
> > perf(1) is good at observing both kernel and userspace activity
> together. What is it that you want to debug.
> >
>
> I’ll look into perf(1). I think what I was trying to do is like a breakdown
> analysis on which part caused the latency. For example, what is the root
> cause of the performance improvements or regressions when irqfd is turned
> on.
>

Nice, perf(1) is good for that. You can enable trace events and add
kprobes/uprobes to record timestamps when specific functions are entered.

>
> > What happens when the MSI-X vector is masked?
> >
> > I remember the VIRTIO code having masking support. I'm on my phone and
> can't check now, but I think it registers a temporary eventfd and buffers
> irqs while the vector is masked.
>
> Yes, this RFC ignored interrupt masking support.
>
> >
> > This makes me wonder if the VIRTIO and NVMe IOThread irqfd code can be
> unified. Maybe IOThread support can be built into the core device emulation
> code (e.g. irq APIs) so that it's not necessary to duplicate it.
> >
>
> Agreed. Recently when working on ioeventfd, iothread and polling support,
> my
> typical workflow is to look at how virtio does that and adjust that code
> into nvme. I think unifying their IOThread code can be beneficial since
> VIRTIO has incorporated many optimizations over the years that can not be
> directly enjoyed by nvme. But I fear that subtle differences in the two
> protocols may cause challenges for the unification.
>
> Again, thanks for your help :)
>


* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-21 13:29     ` Stefan Hajnoczi
@ 2022-07-24 15:20       ` Jinhao Fan
  2022-07-24 19:36         ` Stefan Hajnoczi
  2022-07-27  7:18       ` Klaus Jensen
  1 sibling, 1 reply; 23+ messages in thread
From: Jinhao Fan @ 2022-07-24 15:20 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: qemu-devel, Klaus Birkelund Jensen, Keith Busch

at 9:29 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:

> 
> Nice, perf(1) is good for that. You can enable trace events and add
> kprobes/uprobes to record timestamps when specific functions are entered.
> 

Thanks Stefan,

One last question: currently we can achieve hundreds of KIOPS, which means
perf can easily capture millions of trace events per second. I found that
perf has quite high overhead at such an event rate. Do you have any advice
on tracing high-IOPS workloads?




* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-24 15:20       ` Jinhao Fan
@ 2022-07-24 19:36         ` Stefan Hajnoczi
  2022-07-25  2:48           ` Jinhao Fan
  0 siblings, 1 reply; 23+ messages in thread
From: Stefan Hajnoczi @ 2022-07-24 19:36 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, Klaus Birkelund Jensen, Keith Busch


On Sun, Jul 24, 2022, 11:21 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:

> at 9:29 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>
> >
> > Nice, perf(1) is good for that. You can enable trace events and add
> > kprobes/uprobes to record timestamps when specific functions are entered.
> >
>
> Thanks Stefan,
>
> One last question: Currently we can achieve hundreds of KIOPS. That means
> perf can easily capture millions of trace events per second. I found perf
> has quite high overhead at such a rate of trace events. Do you have any
> advices on tracing high IOPS tasks?


I don't. BTW uprobes are expensive but kprobes are cheaper.

Stefan

>


* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-24 19:36         ` Stefan Hajnoczi
@ 2022-07-25  2:48           ` Jinhao Fan
  0 siblings, 0 replies; 23+ messages in thread
From: Jinhao Fan @ 2022-07-25  2:48 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: qemu-devel, Klaus Birkelund Jensen, Keith Busch

at 3:36 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:

> 
> 
> On Sun, Jul 24, 2022, 11:21 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:
> at 9:29 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> 
> > 
> > Nice, perf(1) is good for that. You can enable trace events and add
> > kprobes/uprobes to record timestamps when specific functions are entered.
> > 
> 
> Thanks Stefan,
> 
> One last question: Currently we can achieve hundreds of KIOPS. That means
> perf can easily capture millions of trace events per second. I found perf
> has quite high overhead at such a rate of trace events. Do you have any
> advices on tracing high IOPS tasks?
> 
> I don't. BTW uprobes are expensive but kprobes are cheaper.
> 
> Stefan

Gotcha. Thanks!

Jinhao Fan




* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-21 13:29     ` Stefan Hajnoczi
  2022-07-24 15:20       ` Jinhao Fan
@ 2022-07-27  7:18       ` Klaus Jensen
  2022-07-27 15:18         ` Stefan Hajnoczi
  1 sibling, 1 reply; 23+ messages in thread
From: Klaus Jensen @ 2022-07-27  7:18 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Jinhao Fan, qemu-devel, Keith Busch


On Jul 21 09:29, Stefan Hajnoczi wrote:
> On Wed, Jul 20, 2022, 22:36 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:
> 
> > Hi Stefan,
> >
> > Thanks for the detailed explanation!
> >
> > at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> >
> > > Hi Jinhao,
> > > Thanks for working on this!
> > >
> > > irqfd is not necessarily faster than KVM ioctl interrupt injection.
> > >
> > > There are at least two non performance reasons for irqfd:
> > > 1. It avoids QEMU emulation code, which historically was not thread safe
> > and needed the Big QEMU Lock. IOThreads don't hold the BQL and therefore
> > cannot safely call the regular interrupt emulation code in QEMU. I think
> > this is still true today although parts of the code may now be less reliant
> > on the BQL.
> >
> > This probably means we need to move to irqfd when iothread support is added
> > in qemu-nvme.
> >
> 
> Yes. You can audit the interrupt code but I'm pretty sure there is shared
> state that needs to be protected by the BQL. So the NVMe emulation code
> probably needs to use irqfd to avoid the interrupt emulation code.
> 
> 
> > > 2. The eventfd interface decouples interrupt injection from the KVM
> > ioctl interface. Vhost kernel and vhost-user device emulation code has no
> > dependency on KVM thanks to irqfd. They work with any eventfd, including
> > irqfd.
> >
> > This is contrary to our original belief. Klaus once pointed out that irqfd
> > is KVM specific. I agreed with him since I found irqfd implementation is in
> > virt/kvm/eventfd.c. But irqfd indeed avoids the KVM ioctl call. Could you
> > elaborate on what “no dependency on KVM” means?
> >
> 
> "They work with any eventfd, including irqfd"
> 
> If you look at the vhost kernel or vhost-user code, you'll see they just
> signal the eventfd. It doesn't have to be an irqfd.
> 
> An irqfd is a specific type of eventfd that the KVM kernel module
> implements to inject interrupts when the eventfd is signaled.
> 
> By the way, this not only decouples vhost from the KVM kernel module, but
> also allows QEMU to emulate MSI-X masking via buffering the interrupt in
> userspace.
> 
> 

The virtio dataplane (iothread support) only works with KVM if I am not
mistaken, so I guess this is similar to what we want to do here. If we
don't have KVM, we won't use an iothread and we won't use the KVM
irqchip/irqfd.

Am I understanding this correctly?


* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-27  7:18       ` Klaus Jensen
@ 2022-07-27 15:18         ` Stefan Hajnoczi
  2022-07-28 15:34           ` Jinhao Fan
  0 siblings, 1 reply; 23+ messages in thread
From: Stefan Hajnoczi @ 2022-07-27 15:18 UTC (permalink / raw)
  To: Klaus Jensen; +Cc: Jinhao Fan, qemu-devel, Keith Busch

On Wed, 27 Jul 2022 at 03:18, Klaus Jensen <its@irrelevant.dk> wrote:
>
> On Jul 21 09:29, Stefan Hajnoczi wrote:
> > On Wed, Jul 20, 2022, 22:36 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:
> >
> > > Hi Stefan,
> > >
> > > Thanks for the detailed explanation!
> > >
> > > at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > >
> > > > Hi Jinhao,
> > > > Thanks for working on this!
> > > >
> > > > irqfd is not necessarily faster than KVM ioctl interrupt injection.
> > > >
> > > > There are at least two non performance reasons for irqfd:
> > > > 1. It avoids QEMU emulation code, which historically was not thread safe
> > > and needed the Big QEMU Lock. IOThreads don't hold the BQL and therefore
> > > cannot safely call the regular interrupt emulation code in QEMU. I think
> > > this is still true today although parts of the code may now be less reliant
> > > on the BQL.
> > >
> > > This probably means we need to move to irqfd when iothread support is added
> > > in qemu-nvme.
> > >
> >
> > Yes. You can audit the interrupt code but I'm pretty sure there is shared
> > state that needs to be protected by the BQL. So the NVMe emulation code
> > probably needs to use irqfd to avoid the interrupt emulation code.
> >
> >
> > > > 2. The eventfd interface decouples interrupt injection from the KVM
> > > ioctl interface. Vhost kernel and vhost-user device emulation code has no
> > > dependency on KVM thanks to irqfd. They work with any eventfd, including
> > > irqfd.
> > >
> > > This is contrary to our original belief. Klaus once pointed out that irqfd
> > > is KVM specific. I agreed with him since I found irqfd implementation is in
> > > virt/kvm/eventfd.c. But irqfd indeed avoids the KVM ioctl call. Could you
> > > elaborate on what “no dependency on KVM” means?
> > >
> >
> > "They work with any eventfd, including irqfd"
> >
> > If you look at the vhost kernel or vhost-user code, you'll see they just
> > signal the eventfd. It doesn't have to be an irqfd.
> >
> > An irqfd is a specific type of eventfd that the KVM kernel module
> > implements to inject interrupts when the eventfd is signaled.
> >
> > By the way, this not only decouples vhost from the KVM kernel module, but
> > also allows QEMU to emulate MSI-X masking via buffering the interrupt in
> > userspace.
> >
> >
>
> The virtio dataplane (iothread support) only works with kvm if I am not
> mistaken, so I guess this is similar to what we want to do here. If we
> dont have KVM, we wont use iothread and we wont use the kvm
> irqchip/irqfd.
>
> Am I understanding this correctly?

I think that is incorrect. QEMU has guest notifier emulation for the
non-KVM (and non-MSI-X PCI) cases. When there is no irqfd support
available, QEMU sets up a regular eventfd and calls
virtio_queue_guest_notifier_read() when it becomes readable.

Stefan
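A sketch of what that fallback could look like if hw/nvme adopted the same
scheme (assuming QEMU's event_notifier_set_handler() main-loop API; the names
mirror this RFC, the MSI route setup for cq->virq is as in the patch and
omitted here, and nvme_cq_notifier_read()/nvme_init_cq_irq_notifier() are
illustrative):

/* read side of the eventfd, runs in the main loop thread with the BQL held */
static void nvme_cq_notifier_read(EventNotifier *e)
{
    NvmeCQueue *cq = container_of(e, NvmeCQueue, irq_notifier);

    if (event_notifier_test_and_clear(e)) {
        msix_notify(&cq->ctrl->parent_obj, cq->vector);
    }
}

static int nvme_init_cq_irq_notifier(NvmeCQueue *cq)
{
    int ret = event_notifier_init(&cq->irq_notifier, 0);
    if (ret < 0) {
        return ret;
    }

    if (kvm_irqchip_in_kernel()) {
        /* fast path: KVM injects the MSI when the eventfd is signalled */
        return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier,
                                                  NULL, cq->virq);
    }

    /* fallback: bounce through the main loop, like the virtio guest notifier */
    event_notifier_set_handler(&cq->irq_notifier, nvme_cq_notifier_read);
    return 0;
}

Either way, nvme_irq_assert() would just call
event_notifier_set(&cq->irq_notifier).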



* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-27 15:18         ` Stefan Hajnoczi
@ 2022-07-28 15:34           ` Jinhao Fan
  2022-07-28 15:38             ` Stefan Hajnoczi
  0 siblings, 1 reply; 23+ messages in thread
From: Jinhao Fan @ 2022-07-28 15:34 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Klaus Jensen, qemu-devel, Keith Busch

at 11:18 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:

> I think that is incorrect. QEMU has guest notifier emulation for the
> non-KVM (and non-MSI-X PCI) cases. When there is no irqfd support
> available, QEMU sets up a regular eventfd and calls
> virtio_queue_guest_notifier_read() when it becomes readable.

Thanks Stefan. I finally understand why there is a `with_irqfd` parameter
for virtio_queue_set_guest_notifier_fd_handler.

But if `with_irqfd` is false, it seems OK to directly call virtio_irq(). Why
still bother using an eventfd? Is it for interrupt batching?




* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-28 15:34           ` Jinhao Fan
@ 2022-07-28 15:38             ` Stefan Hajnoczi
  0 siblings, 0 replies; 23+ messages in thread
From: Stefan Hajnoczi @ 2022-07-28 15:38 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: Klaus Jensen, qemu-devel, Keith Busch


On Thu, Jul 28, 2022, 11:34 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:

> at 11:18 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>
> > I think that is incorrect. QEMU has guest notifier emulation for the
> > non-KVM (and non-MSI-X PCI) cases. When there is no irqfd support
> > available, QEMU sets up a regular eventfd and calls
> > virtio_queue_guest_notifier_read() when it becomes readable.
>
> Thanks Stefan. I finally understand why there is a `with_irqfd` parameter
> for virtio_queue_set_guest_notifier_fd_handler.
>
> But if `with_irqfd` is false, it seems OK to directly call virtio_irq().
> Why
> still bother using an eventfd? Is it for interrupt batching?
>

virtio_irq() is not thread safe so it cannot be called directly from the
IOThread. Bouncing through the eventfd ensures that the virtio_irq() call
happens in the QEMU main loop thread with the BQL held.

It may be cheaper to use a BH instead of an eventfd when irqfd is not
available, but this is a slow path anyway. We might as well reuse the
eventfd code that's already there.

Stefan
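A minimal sketch of the BH alternative, for comparison (assuming QEMU's
qemu_bh_new()/qemu_bh_schedule(); cq->irq_bh would be a new, hypothetical
QEMUBH * field):

/* the bottom half runs in the main loop thread with the BQL held */
static void nvme_irq_bh(void *opaque)
{
    NvmeCQueue *cq = opaque;

    msix_notify(&cq->ctrl->parent_obj, cq->vector);
}

static void nvme_cq_init_irq_bh(NvmeCQueue *cq)
{
    cq->irq_bh = qemu_bh_new(nvme_irq_bh, cq);
}

static void nvme_cq_assert_irq(NvmeCQueue *cq)
{
    /* safe to call from the IOThread; no eventfd involved */
    qemu_bh_schedule(cq->irq_bh);
}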

>


* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-20 10:21 ` Stefan Hajnoczi
  2022-07-21  2:36   ` Jinhao Fan
@ 2022-08-02  4:03   ` Jinhao Fan
  2022-08-02  5:29     ` Klaus Jensen
  1 sibling, 1 reply; 23+ messages in thread
From: Jinhao Fan @ 2022-08-02  4:03 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: qemu-devel, Klaus Birkelund Jensen, Keith Busch

at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:

> What happens when the MSI-X vector is masked?
> 
> I remember the VIRTIO code having masking support. I'm on my phone and can't check now, but I think it registers a temporary eventfd and buffers irqs while the vector is masked.

Hi Stefan,

While implementing interrupt masking support, I found it hard to test this
feature on the host. Keith told me that no NVMe drivers are currently using
this feature. Do you remember how you tested interrupt masking?

Regards,
Jinhao Fan



* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-08-02  4:03   ` Jinhao Fan
@ 2022-08-02  5:29     ` Klaus Jensen
  0 siblings, 0 replies; 23+ messages in thread
From: Klaus Jensen @ 2022-08-02  5:29 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: Stefan Hajnoczi, qemu-devel, Keith Busch


On Aug  2 12:03, Jinhao Fan wrote:
> at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> 
> > What happens when the MSI-X vector is masked?
> > 
> > I remember the VIRTIO code having masking support. I'm on my phone and can't check now, but I think it registers a temporary eventfd and buffers irqs while the vector is masked.
> 
> Hi Stefan,
> 
> While implementing interrupt masking support, I found it hard to test this
> feature on the host. Keith told me that no NVMe drivers are currently using
> this feature. Do you remember how you tested interrupt masking?
> 

You can probably do this with qtest. I don't see a helper for masking
the vectors, but qpci_msix_masked() should be usable as a base; just
change that readl to a writel and mask the vector out.

This should allow you to do a relatively simple test case where you

  1. bootstrap the device as simply as possible (forget about I/O
     queues) - I *think* you just need to use guest_alloc for the admin
     queue memory, use qpci_msix_enable() etc.
  2. set up a simple admin command in the queue
  3. mask the interrupt
  4. ring the doorbell (a writel)
  5. check that the vector remains in the pending state
     (qpci_msix_pending()).


This *could* be a potential way to do this.
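A very rough sketch of steps 3-5 (assuming libqos's qpci_* helpers and the
standard MSI-X table layout, where each 16-byte entry has its Vector Control
dword at offset 12 with bit 0 as the mask bit; controller bring-up, the bar0
mapping and the admin command from steps 1-2 are elided):

/* mask one MSI-X vector by setting bit 0 of its Vector Control dword */
static void msix_mask_vector(QPCIDevice *dev, uint16_t entry)
{
    uint64_t off = dev->msix_table_off + entry * 16 + 12;
    uint32_t ctrl = qpci_io_readl(dev, dev->msix_table_bar, off);

    qpci_io_writel(dev, dev->msix_table_bar, off, ctrl | 1);
}

static void test_masked_vector_stays_pending(QPCIDevice *dev, QPCIBar bar0)
{
    /* steps 1-2: admin queue set up and a command placed in SQ0 (elided) */

    msix_mask_vector(dev, 0);                /* step 3 */
    qpci_io_writel(dev, bar0, 0x1000, 1);    /* step 4: ring the SQ0 tail doorbell */
    g_assert(qpci_msix_pending(dev, 0));     /* step 5 */
}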


* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-07-09  4:35 [RFC] hw/nvme: Use irqfd to send interrupts Jinhao Fan
                   ` (2 preceding siblings ...)
  2022-07-20 10:21 ` Stefan Hajnoczi
@ 2022-08-08  2:23 ` Jinhao Fan
  2022-08-09 15:31   ` 樊金昊
  2022-08-09 16:21   ` Keith Busch
  3 siblings, 2 replies; 23+ messages in thread
From: Jinhao Fan @ 2022-08-08  2:23 UTC (permalink / raw)
  To: qemu-devel; +Cc: its, kbusch

at 12:35 PM, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:

> static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
> {
>     if (cq->irq_enabled) {
>         if (msix_enabled(&(n->parent_obj))) {
> +            /* Initialize CQ irqfd */
> +            if (!cq->irqfd_enabled && n->params.ioeventfd && cq->cqid != 0) {
> +                int ret = nvme_init_cq_irqfd(cq);
> +                if (ret == 0) {
> +                    cq->irqfd_enabled = true;
> +                }
> +            }
> +

Another question:

In this version I left irqfd initialization to the first assertion of an
irq. But I think it is better to initialize the irqfd at CQ creation time so
we don't have to check it at each irq assertion. However, if I put this code
in nvme_init_cq(), irqfd does not work properly. After adding some
tracepoints I found that the MSI messages in the MSI-X table changed after
nvme_init_cq(). Specifically, the `data` field does not seem correct at the
time nvme_init_cq() is called.

Keith, you must be familiar with how the nvme driver initializes CQs. Could
you give some information on when I can safely use the contents of the MSI-X
table?



* Re: Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-08-08  2:23 ` Jinhao Fan
@ 2022-08-09 15:31   ` 樊金昊
  2022-08-09 16:21   ` Keith Busch
  1 sibling, 0 replies; 23+ messages in thread
From: 樊金昊 @ 2022-08-09 15:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: its, kbusch

> In this version I left irqfd initialization to the first assertion of an
> irq. But I think it is better to initialize the irqfd at CQ creation time so
> we don't have to check it at each irq assertion. However, if I put this code
> in nvme_init_cq(), irqfd does not work properly. After adding some
> tracepoints I found that the MSI messages in the MSI-X table changed after
> nvme_init_cq(). Specifically, the `data` field does not seem correct at the
> time nvme_init_cq() is called.

I found that in the Linux NVMe driver, in nvme_create_cq()
(drivers/nvme/host/pci.c), queue_request_irq() is called after
nvme_init_queue(). Could this be what causes the incorrect MSI messages at CQ
creation time?



* Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-08-08  2:23 ` Jinhao Fan
  2022-08-09 15:31   ` 樊金昊
@ 2022-08-09 16:21   ` Keith Busch
  2022-08-09 16:40     ` 樊金昊
  2022-08-09 16:48     ` 樊金昊
  1 sibling, 2 replies; 23+ messages in thread
From: Keith Busch @ 2022-08-09 16:21 UTC (permalink / raw)
  To: Jinhao Fan; +Cc: qemu-devel, its

On Mon, Aug 08, 2022 at 10:23:03AM +0800, Jinhao Fan wrote:
> at 12:35 PM, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:
> 
> > static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
> > {
> >     if (cq->irq_enabled) {
> >         if (msix_enabled(&(n->parent_obj))) {
> > +            /* Initialize CQ irqfd */
> > +            if (!cq->irqfd_enabled && n->params.ioeventfd && cq->cqid != 0) {
> > +                int ret = nvme_init_cq_irqfd(cq);
> > +                if (ret == 0) {
> > +                    cq->irqfd_enabled = true;
> > +                }
> > +            }
> > +
> 
> Another question:
> 
> In this version I left irqfd initialization to the first assertion of an
> irq. But I think it is better to initialize irqfd at cq creation time so we
> won’t bother checking it at each irq assertion. However if I put these code
> in nvme_init_cq(), irqfd does not work properly. After adding some
> tracepoints I found the MSI messages in MSI-X table changed after
> nvme_init_cq(). Specifically, the `data` field does not seem correct at the
> time when nvme_init_cq() is called.
> 
> Keith, you must be familiar with how the nvme driver initializes CQs. Could
> you give some information on when I can safely use the contents in the MSI-X
> table?

The driver will create the cq with an allocated vector, but it's not activated
until after the driver wires it up to a handler. I think that's what you're
observing with the incomplete MSI-X table entry on creation.



* Re: Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-08-09 16:21   ` Keith Busch
@ 2022-08-09 16:40     ` 樊金昊
  2022-08-09 16:48     ` 樊金昊
  1 sibling, 0 replies; 23+ messages in thread
From: 樊金昊 @ 2022-08-09 16:40 UTC (permalink / raw)
  To: Keith Busch; +Cc: qemu-devel, its

> The driver will create the cq with an allocated vector, but it's not activated
> until after the driver wires it up to a handler. I think that's what you're
> observing with the incomplete MSI-X table entry on creation.

Agreed. I dug through pci_request_irq()'s call chain and found that
pci_write_msi_msg() is called in the end.

Now, to implement irqfd support, we need to register the (complete) MSI message
in KVM so that KVM can directly send the interrupt when we signal the irqfd.
My prior implementation delayed each CQ's MSI message registration to its first
nvme_post_cqes(). I'm not sure whether this is a good choice. What do you think
about this approach?

BTW, since we skip QEMU's MSI-X emulation with irqfd, we need to record the
mask status of each interrupt vector. QEMU provides msix_set_vector_notifiers()
to help us call handlers on each mask and unmask event. But this function works
on a per-device basis. I guess it is best to call msix_set_vector_notifiers()
after all CQs are created, but I think qemu-nvme can't tell when the host has
finished CQ creation. Where do you think is the best place to register the
mask/unmask callbacks? Is it OK to do it at, say, the first nvme_post_cqes()
of the whole device?

Thanks,
Jinhao Fan
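For what it is worth, a minimal sketch of such per-vector notifiers, modelled
on virtio-pci's vector_unmask/vector_mask handlers (assumptions: QEMU's
msix_set_vector_notifiers(), kvm_irqchip_update_msi_route() and
kvm_irqchip_commit_routes() APIs; nvme_cq_from_vector() is a hypothetical
lookup helper, and masking support is not part of this RFC):

static int nvme_vector_unmask(PCIDevice *dev, unsigned vector, MSIMessage msg)
{
    NvmeCtrl *n = NVME(dev);
    NvmeCQueue *cq = nvme_cq_from_vector(n, vector);   /* hypothetical helper */
    int ret;

    if (!cq || !cq->irqfd_enabled) {
        return 0;
    }

    /* the guest may have rewritten the MSI message while the vector was masked */
    ret = kvm_irqchip_update_msi_route(kvm_state, cq->virq, msg, dev);
    if (ret < 0) {
        return ret;
    }
    kvm_irqchip_commit_routes(kvm_state);

    return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier,
                                              NULL, cq->virq);
}

static void nvme_vector_mask(PCIDevice *dev, unsigned vector)
{
    NvmeCtrl *n = NVME(dev);
    NvmeCQueue *cq = nvme_cq_from_vector(n, vector);

    if (cq && cq->irqfd_enabled) {
        /* detach the irqfd while the vector is masked */
        kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier,
                                              cq->virq);
    }
}

These would be registered once per device with
msix_set_vector_notifiers(&n->parent_obj, nvme_vector_unmask,
nvme_vector_mask, NULL), e.g. right after msix_init_exclusive_bar(), which
sidesteps the question of when CQ creation is finished. A complete
implementation would also have to buffer interrupts raised while the vector
is masked (e.g. by falling back to msix_notify(), which sets the pending
bit), as virtio-pci does.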



* Re: Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-08-09 16:21   ` Keith Busch
  2022-08-09 16:40     ` 樊金昊
@ 2022-08-09 16:48     ` 樊金昊
  2022-08-09 17:01       ` Keith Busch
  1 sibling, 1 reply; 23+ messages in thread
From: 樊金昊 @ 2022-08-09 16:48 UTC (permalink / raw)
  To: Keith Busch; +Cc: qemu-devel, its

> The driver will create the cq with an allocated vector, but it's not activated
> until after the driver wires it up to a handler. I think that's what you're
> observing with the incomplete MSI-X table entry on creation.

Also, I'm wondering if this is inconsistent with the NVMe spec. In Section 7.6.1
of the 1.4 spec, Step 8 says "After determining the number of I/O Queues, the
MSI and/or MSI-X registers should be configured", and CQ creation happens in
Step 9. Since the driver changes MSI-X registers after CQ creation, is that a
violation of the spec?


* Re: Re: [RFC] hw/nvme: Use irqfd to send interrupts
  2022-08-09 16:48     ` 樊金昊
@ 2022-08-09 17:01       ` Keith Busch
  0 siblings, 0 replies; 23+ messages in thread
From: Keith Busch @ 2022-08-09 17:01 UTC (permalink / raw)
  To: 樊金昊; +Cc: qemu-devel, its

On Wed, Aug 10, 2022 at 12:48:53AM +0800, 樊金昊 wrote:
>> The driver will create the cq with an allocated vector, but it's not activated
>> until after the driver wires it up to a handler. I think that's what you're
>> observing with the incomplete MSIx table entry on creation.
> 
> Also, I'm wondering if this is inconsistent with the NVMe spec. In Section 7.6.1
> of the 1.4 spec, it says "After determining the number of I/O Queues, the MSI
> and/or MSI-X registers should be configured;" in Step 8, and CQ creation happens
> in Step 9. Now the driver changes MSI-X registers after CQ creation, is it a
> violation of the spec?

I don't think it's a problem. This is really a more "informative" section of
the spec and doesn't specify any hard requirements. You should be able to rely
on the entry's data being stable after the first queue doorbell ring, though.


