* [PATCH] vfio: avoid SET_ACTION_TRIGGER ioctls
@ 2020-02-28 12:08 Roman Kapl
  2020-03-05 22:37 ` Alex Williamson
  0 siblings, 1 reply; 4+ messages in thread
From: Roman Kapl @ 2020-02-28 12:08 UTC (permalink / raw)
  Cc: Alex Williamson, qemu-devel, Roman Kapl

For MSI-X interrupts, remember what the last used eventfd was (KVM
bypass vs QEMU) and only call vfio_set_irq_signaling if it has changed.

This not only helps with performance, but it seems that interrupts can
be lost during VFIO_IRQ_SET_ACTION_TRIGGER. With the 'x-no-kvm-msix'
switch and this patch, SET_ACTION_TRIGGER is not called during
mask/unmask. This really only affects guests that actively use MSI-X masking.

Signed-off-by: Roman Kapl <rka@sysgo.com>
---

This patch scratches my particular itch. I am able to get our guest (which masks
MSI on each interrupt) running without getting randomly stuck waiting for an
interrupt. However, the solution is far from perfect (x-no-kvm-msix is required)
and pretty slow. I would be interested in hearing any ideas on how to improve this.
Some ideas:

1) Fix the kernel so that SET_ACTION_TRIGGER does not lose interrupts (I think
the problem is there, but I am not 100% sure). I've tested on 5.3.0-40-generic
#32~18.04.1-Ubuntu SMP.

2) Add support for MASK/UNMASK for MSI-X in the kernel and use that. But I don't
know how to handle the PBA in that case. Another ioctl? We could look at the real
PBA array, if mapping is supported, but that seems hacky.

3) Twiddle the bits behind the kernel's back, if it can be mapped?

Still, I think this patch does not hurt anything and could be applied if no one
can think of a better way.

---

 hw/vfio/pci.c | 32 ++++++++++++++++++++++----------
 hw/vfio/pci.h |  2 ++
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index e6569a7968..5f7ce91519 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -390,12 +390,16 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
          * MSI-X mask and pending bits are emulated, so we want to use the
          * KVM signaling path only when configured and unmasked.
          */
-        if (vdev->msi_vectors[i].use) {
-            if (vdev->msi_vectors[i].virq < 0 ||
-                (msix && msix_is_masked(&vdev->pdev, i))) {
-                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
+        VFIOMSIVector *vector = &vdev->msi_vectors[i];
+        if (vector->use) {
+            if (vector->virq < 0 ||
+                (msix && msix_is_masked(&vdev->pdev, i)))
+            {
+                vector->kvm_path_active = false;
+                fd = event_notifier_get_fd(&vector->interrupt);
             } else {
-                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
+                vector->kvm_path_active = true;
+                fd = event_notifier_get_fd(&vector->kvm_interrupt);
             }
         }
 
@@ -509,17 +513,23 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
     } else {
         Error *err = NULL;
         int32_t fd;
+        bool kvm_path;
 
         if (vector->virq >= 0) {
             fd = event_notifier_get_fd(&vector->kvm_interrupt);
+            kvm_path = true;
         } else {
             fd = event_notifier_get_fd(&vector->interrupt);
+            kvm_path = false;
         }
 
-        if (vfio_set_irq_signaling(&vdev->vbasedev,
-                                     VFIO_PCI_MSIX_IRQ_INDEX, nr,
-                                     VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
-            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+        if (vector->kvm_path_active != kvm_path) {
+            if (vfio_set_irq_signaling(&vdev->vbasedev,
+                                       VFIO_PCI_MSIX_IRQ_INDEX, nr,
+                                       VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
+                error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+            }
+            vector->kvm_path_active = kvm_path;
         }
     }
 
@@ -555,13 +565,15 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
      * core will mask the interrupt and set pending bits, allowing it to
      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
      */
-    if (vector->virq >= 0) {
+    if (vector->virq >= 0 && vector->kvm_path_active) {
         int32_t fd = event_notifier_get_fd(&vector->interrupt);
         Error *err = NULL;
 
         if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
                                    VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
             error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+        } else {
+            vector->kvm_path_active = false;
         }
     }
 }
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index b329d50338..b01d2676cf 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -91,6 +91,8 @@ typedef struct VFIOMSIVector {
      */
     EventNotifier interrupt;
     EventNotifier kvm_interrupt;
+    /* Set when the trigger action is set to the KVM bypass FD */
+    bool kvm_path_active;
     struct VFIOPCIDevice *vdev; /* back pointer to device */
     int virq;
     bool use;
-- 
2.22.0




* Re: [PATCH] vfio: avoid SET_ACTION_TRIGGER ioctls
  2020-02-28 12:08 [PATCH] vfio: avoid SET_ACTION_TRIGGER ioctls Roman Kapl
@ 2020-03-05 22:37 ` Alex Williamson
  2020-03-09 11:43   ` Roman Kapl
  0 siblings, 1 reply; 4+ messages in thread
From: Alex Williamson @ 2020-03-05 22:37 UTC (permalink / raw)
  To: Roman Kapl; +Cc: qemu-devel

On Fri, 28 Feb 2020 13:08:00 +0100
Roman Kapl <rka@sysgo.com> wrote:

> For MSI-X interrupts, remember what the last used eventfd was (KVM
> bypass vs QEMU) and only call vfio_set_irq_signaling if it has changed.
> 
> This not only helps with performance, but it seems that interrupts can
> be lost during VFIO_IRQ_SET_ACTION_TRIGGER. With the 'x-no-kvm-msix'
> switch and this patch, SET_ACTION_TRIGGER is not called during
> mask/unmask. This really only affects guests that actively use MSI-X masking.
> 
> Signed-off-by: Roman Kapl <rka@sysgo.com>
> ---
> 
> This patch scratches my particular itch. I am able to get our guest (which masks
> MSI on each interrupt) running, without getting randomly stuck on waiting for
> interrupt. However, the solution is far from perfect (x-no-kvm-msix is required)
> and pretty slow. I would be interested in hearing any ideas how to improve this.
> Some ideas:
> 
> 1) Fix the kernel so that SET_ACTION_TRIGGER does not lose interrupts (I think
> the problem is there, but not 100% sure). I've tested on 5.3.0-40-generic
> #32~18.04.1-Ubuntu SMP.

I'd be curious if this (yet unmerged) series resolves this:

https://lore.kernel.org/lkml/cover.1567394624.git.luoben@linux.alibaba.com/
 
> 2) Add support for MASK/UNMASK for MSI-X in kernel and use that. But I don't
> know how to do PBA in that case. Another IOCTL? We could look at the real PBA
> array, if mapping is supported, but that seems hacky.

That lack of a masking API in the host kernel is part of the reason we
take the hacky approach of emulating the PBA in QEMU.  We could have
the PBA MemoryRegion do a pread() from the device, but if we're doing
some ioctls on every un/mask, we're probably already digging ourselves
out of a hole.
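
For illustration, something along these lines is the kind of pread() backing
I mean (a sketch only, not existing QEMU code; the helper name and the
pba_fd_offset parameter are made up, and the offset would come from
VFIO_DEVICE_GET_REGION_INFO):

#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Sketch: serve a PBA read from the live device through the vfio device
 * fd instead of the emulated copy.  pba_fd_offset is assumed to be the
 * file offset of the PBA within the vfio BAR region.
 */
static uint64_t pba_device_read(int device_fd, off_t pba_fd_offset,
                                uint64_t addr, unsigned size)
{
    uint64_t data = 0;

    /* The PBA is little-endian per the PCI spec; byte swapping elided. */
    if (pread(device_fd, &data, size, pba_fd_offset + addr) != (ssize_t)size) {
        return 0;   /* on error, report no pending interrupts */
    }

    return data;
}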

It would be interesting to see if the series above prevents dropping
interrupts, and how it compares with the reduced ioctls + QEMU handling you
have here.

> 3) Twiddle the bits behind the kernel's back, if it can be mapped?

I'm not sure what you're thinking here. Do you mean masking vectors
directly on the device w/o a host kernel masking API and then reading
the PBA directly from the device, mapping it directly to the guest if
possible?  The MSI-X MMIO space can be mmapped by QEMU; we rely on
interrupt remappers to protect us from malicious users.  QEMU certainly
shouldn't touch the vector address or data fields, but masking might be
reasonably safe, and then we could leave the KVM route in place.  I'm
afraid it might be difficult to integrate with QEMU MSI-X support
though.
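
To make the masking idea concrete, the bit twiddling itself would look
roughly like this (sketch only; offsets follow the PCI spec layout,
'table' is assumed to point at an mmap of the MSI-X table, and byte
swapping for big-endian hosts is elided):

#include <stdbool.h>
#include <stdint.h>

/* PCI spec layout: 16-byte MSI-X table entries, Vector Control dword at
 * offset 12, mask bit is bit 0 of Vector Control. */
#define MSIX_ENTRY_SIZE        16
#define MSIX_ENTRY_VECTOR_CTRL 12
#define MSIX_ENTRY_MASKBIT     0x1u

/* Toggle only the per-vector mask bit, leaving the address/data fields
 * untouched so the KVM route would stay valid. */
static void msix_hw_mask_vector(volatile uint32_t *table, unsigned nr,
                                bool mask)
{
    volatile uint32_t *ctrl =
        &table[(nr * MSIX_ENTRY_SIZE + MSIX_ENTRY_VECTOR_CTRL) /
               sizeof(uint32_t)];

    if (mask) {
        *ctrl |= MSIX_ENTRY_MASKBIT;
    } else {
        *ctrl &= ~MSIX_ENTRY_MASKBIT;
    }
}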

> Still, I think this patch does not hurt anything and could be applied if no-one
> can think of a better way.
> 
> ---
> 
>  hw/vfio/pci.c | 32 ++++++++++++++++++++++----------
>  hw/vfio/pci.h |  2 ++
>  2 files changed, 24 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index e6569a7968..5f7ce91519 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -390,12 +390,16 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
>           * MSI-X mask and pending bits are emulated, so we want to use the
>           * KVM signaling path only when configured and unmasked.
>           */
> -        if (vdev->msi_vectors[i].use) {
> -            if (vdev->msi_vectors[i].virq < 0 ||
> -                (msix && msix_is_masked(&vdev->pdev, i))) {
> -                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
> +        VFIOMSIVector *vector = &vdev->msi_vectors[i];
> +        if (vector->use) {
> +            if (vector->virq < 0 ||
> +                (msix && msix_is_masked(&vdev->pdev, i)))
> +            {
> +                vector->kvm_path_active = false;
> +                fd = event_notifier_get_fd(&vector->interrupt);
>              } else {
> -                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
> +                vector->kvm_path_active = true;
> +                fd = event_notifier_get_fd(&vector->kvm_interrupt);
>              }
>          }
>  
> @@ -509,17 +513,23 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>      } else {
>          Error *err = NULL;
>          int32_t fd;
> +        bool kvm_path;
>  
>          if (vector->virq >= 0) {
>              fd = event_notifier_get_fd(&vector->kvm_interrupt);
> +            kvm_path = true;
>          } else {
>              fd = event_notifier_get_fd(&vector->interrupt);
> +            kvm_path = false;
>          }
>  
> -        if (vfio_set_irq_signaling(&vdev->vbasedev,
> -                                     VFIO_PCI_MSIX_IRQ_INDEX, nr,
> -                                     VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
> -            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
> +        if (vector->kvm_path_active != kvm_path) {
> +            if (vfio_set_irq_signaling(&vdev->vbasedev,
> +                                       VFIO_PCI_MSIX_IRQ_INDEX, nr,
> +                                       VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
> +                error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
> +            }
> +            vector->kvm_path_active = kvm_path;


Wouldn't this be more intuitive if we just cached the current fd on the
VFIOMSIVector object and created a vfio_set_irq_signaling() wrapper for
vectors that only calls through when the fd changes, updating the fd on
successful return otherwise?  AIUI, you're only trying to prevent
gratuitous calls to vfio_set_irq_signaling() when the eventfd remains
unchanged, which is the common case for your configuration of running
in QEMU interrupt mode.  Thanks,

Alex

>          }
>      }
>  
> @@ -555,13 +565,15 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>       * core will mask the interrupt and set pending bits, allowing it to
>       * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
>       */
> -    if (vector->virq >= 0) {
> +    if (vector->virq >= 0 && vector->kvm_path_active) {
>          int32_t fd = event_notifier_get_fd(&vector->interrupt);
>          Error *err = NULL;
>  
>          if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
>                                     VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
>              error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
> +        } else {
> +            vector->kvm_path_active = false;
>          }
>      }
>  }
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index b329d50338..b01d2676cf 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -91,6 +91,8 @@ typedef struct VFIOMSIVector {
>       */
>      EventNotifier interrupt;
>      EventNotifier kvm_interrupt;
> +    /* Set when the trigger action is set to the KVM bypass FD */
> +    bool kvm_path_active;
>      struct VFIOPCIDevice *vdev; /* back pointer to device */
>      int virq;
>      bool use;




* Re: [PATCH] vfio: avoid SET_ACTION_TRIGGER ioctls
  2020-03-05 22:37 ` Alex Williamson
@ 2020-03-09 11:43   ` Roman Kapl
  2020-03-09 15:37     ` Roman Kapl
  0 siblings, 1 reply; 4+ messages in thread
From: Roman Kapl @ 2020-03-09 11:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: Alex Williamson

On 3/5/20 11:37 PM, Alex Williamson wrote:
> On Fri, 28 Feb 2020 13:08:00 +0100
> Roman Kapl <rka@sysgo.com> wrote:
> 
>> For MSI-X interrupts, remember what the last used eventfd was (KVM
>> bypass vs QEMU) and only call vfio_set_irq_signaling if it has changed.
>>
>> This not only helps with performance, but it seems that interrupts can
>> be lost during VFIO_IRQ_SET_ACTION_TRIGGER. With the 'x-no-kvm-msix'
>> switch and this patch, SET_ACTION_TRIGGER is not called during
>> mask/unmask. This really only affects guests that actively use MSI-X masking.
>>
>> Signed-off-by: Roman Kapl <rka@sysgo.com>
>> ---
>>
>> This patch scratches my particular itch. I am able to get our guest (which masks
>> MSI on each interrupt) running, without getting randomly stuck on waiting for
>> interrupt. However, the solution is far from perfect (x-no-kvm-msix is required)
>> and pretty slow. I would be interested in hearing any ideas how to improve this.
>> Some ideas:
>>
>> 1) Fix the kernel so that SET_ACTION_TRIGGER does not lose interrupts (I think
>> the problem is there, but not 100% sure). I've tested on 5.3.0-40-generic
>> #32~18.04.1-Ubuntu SMP.
> 
> I'd be curious if this (yet unmerged) series resolves this:
> 
> https://lore.kernel.org/lkml/cover.1567394624.git.luoben@linux.alibaba.com/

Indeed it does. Thanks for pointing out this patch. This seems to nicely 
fix the underlying issue, and QEMU now works both with and without 
KVM bypass.

>   
>> 2) Add support for MASK/UNMASK for MSI-X in kernel and use that. But I don't
>> know how to do PBA in that case. Another IOCTL? We could look at the real PBA
>> array, if mapping is supported, but that seems hacky.
> 
> That lack of a masking API in the host kernel is part of the reason we
> take the hacky approach of emulating the PBA in QEMU.  We could have
> the PBA MemoryRegion do a pread() from the device, but if we're doing
> some ioctls on every un/mask, we're probably already digging ourselves
> out of a hole.
> 
> It would be interesting to see if the series above prevents dropping
> interrupts, how it compares with the reduced ioctls + QEMU handling you
> have here.

Unfortunately I was not able to get any reasonable performance data 
here, because the interrupt latency seems to be pretty bad (approx. 
1.5 ms - 2 ms) in all three options I've tried (no KVM bypass, KVM 
bypass, and no KVM bypass + my patch). So saving one ioctl does not 
really make a dent in that.

If I am able to find out why the latency is so bad, or manage to reduce 
it, I will get back.

> 
>> 3) Twiddle the bits behind the kernel's back, if it can be mapped?
> 
> I'm not sure what you're thinking here, you mean mask vectors directly
> on the device w/o a host kernel masking API and then read the PBA
> directly from the device, mapping it directly to the guest if possible?
> The MSI-X MMIO space can be mmapped by QEMU, we rely on interrupt
> remappers to protect us from malicious users.  QEMU certainly
> shouldn't touch the vector or data fields, but masking might be
> reasonably safe, then we could leave the KVM route in place.  I'm
> afraid it might be difficult to integrate with QEMU MSI-X support
> though.

Yes, that's what I was thinking. But that's just an idea.

> 
>> Still, I think this patch does not hurt anything and could be applied if no-one
>> can think of a better way.
>>
>> ---
>>
>>   hw/vfio/pci.c | 32 ++++++++++++++++++++++----------
>>   hw/vfio/pci.h |  2 ++
>>   2 files changed, 24 insertions(+), 10 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index e6569a7968..5f7ce91519 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -390,12 +390,16 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
>>            * MSI-X mask and pending bits are emulated, so we want to use the
>>            * KVM signaling path only when configured and unmasked.
>>            */
>> -        if (vdev->msi_vectors[i].use) {
>> -            if (vdev->msi_vectors[i].virq < 0 ||
>> -                (msix && msix_is_masked(&vdev->pdev, i))) {
>> -                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
>> +        VFIOMSIVector *vector = &vdev->msi_vectors[i];
>> +        if (vector->use) {
>> +            if (vector->virq < 0 ||
>> +                (msix && msix_is_masked(&vdev->pdev, i)))
>> +            {
>> +                vector->kvm_path_active = false;
>> +                fd = event_notifier_get_fd(&vector->interrupt);
>>               } else {
>> -                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
>> +                vector->kvm_path_active = true;
>> +                fd = event_notifier_get_fd(&vector->kvm_interrupt);
>>               }
>>           }
>>   
>> @@ -509,17 +513,23 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>       } else {
>>           Error *err = NULL;
>>           int32_t fd;
>> +        bool kvm_path;
>>   
>>           if (vector->virq >= 0) {
>>               fd = event_notifier_get_fd(&vector->kvm_interrupt);
>> +            kvm_path = true;
>>           } else {
>>               fd = event_notifier_get_fd(&vector->interrupt);
>> +            kvm_path = false;
>>           }
>>   
>> -        if (vfio_set_irq_signaling(&vdev->vbasedev,
>> -                                     VFIO_PCI_MSIX_IRQ_INDEX, nr,
>> -                                     VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
>> -            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
>> +        if (vector->kvm_path_active != kvm_path) {
>> +            if (vfio_set_irq_signaling(&vdev->vbasedev,
>> +                                       VFIO_PCI_MSIX_IRQ_INDEX, nr,
>> +                                       VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
>> +                error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
>> +            }
>> +            vector->kvm_path_active = kvm_path;
> 
> 
> Wouldn't this be more intuitive if we just cached the current fd on the
> VFIOMSIVector object and created a vfio_set_irq_signaling() wrapper for
> vectors that only calls through when the fd changes, updating the fd on
> successful return otherwise?  AIUI, you're only trying to prevent
> gratuitous calls to vfio_set_irq_signaling() when the eventfd remains
> unchanged, which is the common case for your configuration of running
> in QEMU interrupt mode.  Thanks,

I am not against that. If we decide to apply the patch, I can replace 
the two call sites with the wrapper. It's not applicable in 
vfio_enable_vectors.
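
Something like this placeholder sketch is what I have in mind (the 
wrapper name and the cached_fd member on VFIOMSIVector are made up; 
cached_fd would start out as -1):

static int vfio_msix_vector_set_trigger(VFIOPCIDevice *vdev,
                                        VFIOMSIVector *vector,
                                        unsigned int nr, int32_t fd,
                                        Error **errp)
{
    int ret;

    if (vector->cached_fd == fd) {
        return 0;                   /* trigger already points at this fd */
    }

    ret = vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
                                 VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp);
    if (!ret) {
        vector->cached_fd = fd;     /* cache the fd only on success */
    }
    return ret;
}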

> 
> Alex

Thanks for the comments, Roman Kapl

> 
>>           }
>>       }
>>   
>> @@ -555,13 +565,15 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>>        * core will mask the interrupt and set pending bits, allowing it to
>>        * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
>>        */
>> -    if (vector->virq >= 0) {
>> +    if (vector->virq >= 0 && vector->kvm_path_active) {
>>           int32_t fd = event_notifier_get_fd(&vector->interrupt);
>>           Error *err = NULL;
>>   
>>           if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
>>                                      VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
>>               error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
>> +        } else {
>> +            vector->kvm_path_active = false;
>>           }
>>       }
>>   }
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index b329d50338..b01d2676cf 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -91,6 +91,8 @@ typedef struct VFIOMSIVector {
>>        */
>>       EventNotifier interrupt;
>>       EventNotifier kvm_interrupt;
>> +    /* Set when the trigger action is set to the KVM bypass FD */
>> +    bool kvm_path_active;
>>       struct VFIOPCIDevice *vdev; /* back pointer to device */
>>       int virq;
>>       bool use;
> 
> 



* Re: [PATCH] vfio: avoid SET_ACTION_TRIGGER ioctls
  2020-03-09 11:43   ` Roman Kapl
@ 2020-03-09 15:37     ` Roman Kapl
  0 siblings, 0 replies; 4+ messages in thread
From: Roman Kapl @ 2020-03-09 15:37 UTC (permalink / raw)
  To: qemu-devel; +Cc: Alex Williamson


On 3/9/20 12:43 PM, Roman Kapl wrote:
> On 3/5/20 11:37 PM, Alex Williamson wrote:
>> On Fri, 28 Feb 2020 13:08:00 +0100
>> Roman Kapl <rka@sysgo.com> wrote:
>>
>>> For MSI-X interrupts, remember what the last used eventfd was (KVM
>>> bypass vs QEMU) and only call vfio_set_irq_signaling if it has changed.
>>>
>>> This not only helps with performance, but it seems that interrupts can
>>> be lost during VFIO_IRQ_SET_ACTION_TRIGGER. With the 'x-no-kvm-msix'
>>> switch and this patch, SET_ACTION_TRIGGER is not called during
>>> mask/unmask. This really only affects guests that actively use MSI-X 
>>> masking.
>>>
>>> Signed-off-by: Roman Kapl <rka@sysgo.com>
>>> ---
>>>
>>> This patch scratches my particular itch. I am able to get our guest 
>>> (which masks
>>> MSI on each interrupt) running, without getting randomly stuck on 
>>> waiting for
>>> interrupt. However, the solution is far from perfect (x-no-kvm-msix 
>>> is required)
>>> and pretty slow. I would be interested in hearing any ideas how to 
>>> improve this.
>>> Some ideas:
>>>
>>> 1) Fix the kernel so that SET_ACTION_TRIGGER does not lose 
>>> interrupts (I think
>>> the problem is there, but not 100% sure). I've tested on 
>>> 5.3.0-40-generic
>>> #32~18.04.1-Ubuntu SMP.
>>
>> I'd be curious if this (yet unmerged) series resolves this:
>>
>> https://lore.kernel.org/lkml/cover.1567394624.git.luoben@linux.alibaba.com/ 
>>
> 
> Indeed it does. Thanks for pointing out this patch. This seems to nicely 
> fix the underlying issue and thus QEMU now work both with and without 
> KVM bypass.
> 
>>> 2) Add support for MASK/UNMASK for MSI-X in kernel and use that. But 
>>> I don't
>>> know how to do PBA in that case. Another IOCTL? We could look at the 
>>> real PBA
>>> array, if mapping is supported, but that seems hacky.
>>
>> That lack of a masking API in the host kernel is part of the reason we
>> take the hacky approach of emulating the PBA in QEMU.  We could have
>> the PBA MemoryRegion do a pread() from the device, but if we're doing
>> some ioctls on every un/mask, we're probably already digging ourselves
>> out of a hole.
>>
>> It would be interesting to see if the series above prevents dropping
>> interrupts, how it compares with the reduced ioctls + QEMU handling you
>> have here.
> 
> Unfortunately I was not able to get any reasonable performance data 
> here, because in the interrupt latency seems to be pretty bad (cca 1.5ms 
> - 2ms) in all three options I've tried: no-KVM-bypass, KVM-bypass, 
> no-KVM-bypass + my patch). So saving one IOCTL does not really make a 
> dent in that.
> 
> If I will be able to find out why the latency is so bad, or reduce it, I 
> will get back.

OK, just to recapitulate: with the kernel patch linked above, the only 
issue now is performance, not correctness.

Using perf, I've found that the bad performance was caused by 
memory_region_set_enabled(msix_pba_mmio). Simply leaving it enabled had 
a huge effect (maybe 5x) on my workload.

Second in line is adding x-no-kvm-msix, which has a modest impact. I've 
also tried this proposed patch and also removed the unnecessary 
qemu_set_fd_handler. Both have a small impact in comparison.

So the priority now would be to avoid the calls to 
memory_region_set_enabled. I can think of two options:

1) Add a flag, e.g. 'x-pba-always-on' and possibly also 'x-pba-disable'. 
This is probably the simplest thing to do.

2) Add auto-detection. If the guest tries to switch the PBA on/off too 
many times, just permanently enable it (see the rough sketch after this 
list). The same goes for the KVM bypass. An 'x-pba-disable' flag would 
still probably be useful for guests where the PBA emulation causes 
performance degradation but the guest does not actually use it.
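
For the auto-detection variant, the heuristic I have in mind would be 
roughly this (a sketch only; the pba_toggles counter on VFIOPCIDevice 
and the threshold value are made up):

/* Made-up threshold: after this many disables, stop toggling the PBA. */
#define PBA_TOGGLE_THRESHOLD 16

static void vfio_msix_pba_set_enabled(VFIOPCIDevice *vdev, bool enable)
{
    if (!enable && ++vdev->pba_toggles > PBA_TOGGLE_THRESHOLD) {
        /* The guest un/masks too often: leave the emulated PBA enabled. */
        return;
    }
    memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, enable);
}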

Any thoughts? Or is there some faster way to toggle the memory region 
emulation?

Thank you, Roman Kapl

> 
>>
>>> 3) Twiddle the bits behind the kernel's back, if it can be mapped?
>>
>> I'm not sure what you're thinking here, you mean mask vectors directly
>> on the device w/o a host kernel masking API and then read the PBA
>> directly from the device, mapping it directly to the guest if possible?
>> The MSI-X MMIO space can be mmapped by QEMU, we rely on interrupt
>> remappers to protect us from malicious users.  QEMU certainly
>> shouldn't touch the vector or data fields, but masking might be
>> reasonably safe, then we could leave the KVM route in place.  I'm
>> afraid it might be difficult to integrate with QEMU MSI-X support
>> though.
> 
> Yes, that's what I was thinking. But that's just an idea.
> 
>>
>>> Still, I think this patch does not hurt anything and could be applied 
>>> if no-one
>>> can think of a better way.
>>>
>>> ---
>>>
>>>   hw/vfio/pci.c | 32 ++++++++++++++++++++++----------
>>>   hw/vfio/pci.h |  2 ++
>>>   2 files changed, 24 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index e6569a7968..5f7ce91519 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -390,12 +390,16 @@ static int vfio_enable_vectors(VFIOPCIDevice 
>>> *vdev, bool msix)
>>>            * MSI-X mask and pending bits are emulated, so we want to 
>>> use the
>>>            * KVM signaling path only when configured and unmasked.
>>>            */
>>> -        if (vdev->msi_vectors[i].use) {
>>> -            if (vdev->msi_vectors[i].virq < 0 ||
>>> -                (msix && msix_is_masked(&vdev->pdev, i))) {
>>> -                fd = 
>>> event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
>>> +        VFIOMSIVector *vector = &vdev->msi_vectors[i];
>>> +        if (vector->use) {
>>> +            if (vector->virq < 0 ||
>>> +                (msix && msix_is_masked(&vdev->pdev, i)))
>>> +            {
>>> +                vector->kvm_path_active = false;
>>> +                fd = event_notifier_get_fd(&vector->interrupt);
>>>               } else {
>>> -                fd = 
>>> event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
>>> +                vector->kvm_path_active = true;
>>> +                fd = event_notifier_get_fd(&vector->kvm_interrupt);
>>>               }
>>>           }
>>> @@ -509,17 +513,23 @@ static int vfio_msix_vector_do_use(PCIDevice 
>>> *pdev, unsigned int nr,
>>>       } else {
>>>           Error *err = NULL;
>>>           int32_t fd;
>>> +        bool kvm_path;
>>>           if (vector->virq >= 0) {
>>>               fd = event_notifier_get_fd(&vector->kvm_interrupt);
>>> +            kvm_path = true;
>>>           } else {
>>>               fd = event_notifier_get_fd(&vector->interrupt);
>>> +            kvm_path = false;
>>>           }
>>> -        if (vfio_set_irq_signaling(&vdev->vbasedev,
>>> -                                     VFIO_PCI_MSIX_IRQ_INDEX, nr,
>>> -                                     VFIO_IRQ_SET_ACTION_TRIGGER, 
>>> fd, &err)) {
>>> -            error_reportf_err(err, VFIO_MSG_PREFIX, 
>>> vdev->vbasedev.name);
>>> +        if (vector->kvm_path_active != kvm_path) {
>>> +            if (vfio_set_irq_signaling(&vdev->vbasedev,
>>> +                                       VFIO_PCI_MSIX_IRQ_INDEX, nr,
>>> +                                       VFIO_IRQ_SET_ACTION_TRIGGER, 
>>> fd, &err)) {
>>> +                error_reportf_err(err, VFIO_MSG_PREFIX, 
>>> vdev->vbasedev.name);
>>> +            }
>>> +            vector->kvm_path_active = kvm_path;
>>
>>
>> Wouldn't this be more intuitive if we just cached the current fd on the
>> VFIOMSIVector object and created a vfio_set_irq_signaling() wrapper for
>> vectors that only calls through when the fd changes, updating the fd on
>> successful return otherwise?  AIUI, you're only trying to prevent
>> gratuitous calls to vfio_set_irq_signaling() when the eventfd remains
>> unchanged, which is the common case for your configuration of running
>> in QEMU interrupt mode.  Thanks,
> 
> I am not against that. If we decide to apply the patch, I can replace 
> the two call sites with the wrapper. It's not applicable in 
> vfio_enable_vectors.
> 
>>
>> Alex
> 
> Thanks for comments, Roman Kapl
> 
>>
>>>           }
>>>       }
>>> @@ -555,13 +565,15 @@ static void vfio_msix_vector_release(PCIDevice 
>>> *pdev, unsigned int nr)
>>>        * core will mask the interrupt and set pending bits, allowing 
>>> it to
>>>        * be re-asserted on unmask.  Nothing to do if already using 
>>> QEMU mode.
>>>        */
>>> -    if (vector->virq >= 0) {
>>> +    if (vector->virq >= 0 && vector->kvm_path_active) {
>>>           int32_t fd = event_notifier_get_fd(&vector->interrupt);
>>>           Error *err = NULL;
>>>           if (vfio_set_irq_signaling(&vdev->vbasedev, 
>>> VFIO_PCI_MSIX_IRQ_INDEX, nr,
>>>                                      VFIO_IRQ_SET_ACTION_TRIGGER, fd, 
>>> &err)) {
>>>               error_reportf_err(err, VFIO_MSG_PREFIX, 
>>> vdev->vbasedev.name);
>>> +        } else {
>>> +            vector->kvm_path_active = false;
>>>           }
>>>       }
>>>   }
>>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>>> index b329d50338..b01d2676cf 100644
>>> --- a/hw/vfio/pci.h
>>> +++ b/hw/vfio/pci.h
>>> @@ -91,6 +91,8 @@ typedef struct VFIOMSIVector {
>>>        */
>>>       EventNotifier interrupt;
>>>       EventNotifier kvm_interrupt;
>>> +    /* Set when the trigger action is set to the KVM bypass FD */
>>> +    bool kvm_path_active;
>>>       struct VFIOPCIDevice *vdev; /* back pointer to device */
>>>       int virq;
>>>       bool use;
>>
>>
> 


