* [PATCH 0/5] optimize the downtime for vfio migration
@ 2021-08-25  7:56 Longpeng(Mike)
  2021-08-25  7:56 ` [PATCH 1/5] vfio: use helper to simplfy the failure path in vfio_msi_enable Longpeng(Mike)
                   ` (6 more replies)
  0 siblings, 7 replies; 22+ messages in thread
From: Longpeng(Mike) @ 2021-08-25  7:56 UTC (permalink / raw)
  To: alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: Longpeng(Mike), arei.gonglei, huangzhichao, qemu-devel

In the vfio migration resume phase, the cost increases with the
number of unmasked vectors the vfio device has. We try to optimize
it in this series.

Patches 1 & 2 are simple code cleanups.
Patch 3 defers setting the irqs to the vfio core.
Patches 4 & 5 defer committing the routes to the KVM core.
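
At a high level, the resume path changes like this (a simplified
sketch; see patch 3/4/5 for the real code):

    Before: for each unmasked vector:
                kvm_irqchip_add_msi_route()
                kvm_irqchip_commit_routes()    <-- once per vector
                VFIO_DEVICE_SET_IRQS           <-- once per vector

    After:  for each unmasked vector:
                kvm_irqchip_add_msi_route()    <-- route cached only
            kvm_irqchip_commit_routes()        <-- once in total
            VFIO_DEVICE_SET_IRQS               <-- once, all vectors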

The test VM has 128 vcpus and 8 VFs (with 65 vectors enabled);
we measure the cost of vfio_msix_enable for each one, and
we can see the total cost can be significantly reduced.

        Origin         Apply Patch 3     Apply Patch 3/4/5   
1st     8              4                 2
2nd     15             11                2
3rd     22             18                2
4th     24             25                3
5th     36             33                2
6th     44             40                3
7th     51             47                3
8th     58             54                4
Total   258ms          232ms             21ms


Longpeng (Mike) (5):
  vfio: use helper to simplfy the failure path in vfio_msi_enable
  msix: simplfy the conditional in msix_set/unset_vector_notifiers
  vfio: defer to enable msix in migration resume phase
  kvm: irqchip: support defer to commit the route
  vfio: defer to commit kvm route in migration resume phase

 accel/kvm/kvm-all.c    | 10 +++--
 accel/stubs/kvm-stub.c |  3 +-
 hw/misc/ivshmem.c      |  2 +-
 hw/pci/msix.c          |  7 ++--
 hw/vfio/pci.c          | 99 ++++++++++++++++++++++++++++++++++++++------------
 hw/vfio/pci.h          |  1 +
 hw/virtio/virtio-pci.c |  2 +-
 include/sysemu/kvm.h   |  4 +-
 target/i386/kvm/kvm.c  |  2 +-
 9 files changed, 95 insertions(+), 35 deletions(-)

-- 
1.8.3.1




* [PATCH 1/5] vfio: use helper to simplfy the failure path in vfio_msi_enable
  2021-08-25  7:56 [PATCH 0/5] optimize the downtime for vfio migration Longpeng(Mike)
@ 2021-08-25  7:56 ` Longpeng(Mike)
  2021-09-03 21:55   ` Alex Williamson
  2021-08-25  7:56 ` [PATCH 2/5] msix: simplfy the conditional in msix_set/unset_vector_notifiers Longpeng(Mike)
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 22+ messages in thread
From: Longpeng(Mike) @ 2021-08-25  7:56 UTC (permalink / raw)
  To: alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: Longpeng(Mike), arei.gonglei, huangzhichao, qemu-devel

The main difference of the failure path in vfio_msi_enable and
vfio_msi_disable_common is enable INTX or not.

Extend the vfio_msi_disable_common to provide a arg to decide
whether need to fallback, and then we can use this helper to
instead the redundant code in vfio_msi_enable.

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
---
 hw/vfio/pci.c | 34 ++++++++++++----------------------
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index e1ea1d8..7cc43fe 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -47,6 +47,7 @@
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static void vfio_msi_disable_common(VFIOPCIDevice *vdev, bool enable_intx);
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -650,29 +651,17 @@ retry:
     if (ret) {
         if (ret < 0) {
             error_report("vfio: Error: Failed to setup MSI fds: %m");
-        } else if (ret != vdev->nr_vectors) {
+        } else {
             error_report("vfio: Error: Failed to enable %d "
                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
         }
 
-        for (i = 0; i < vdev->nr_vectors; i++) {
-            VFIOMSIVector *vector = &vdev->msi_vectors[i];
-            if (vector->virq >= 0) {
-                vfio_remove_kvm_msi_virq(vector);
-            }
-            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
-                                NULL, NULL, NULL);
-            event_notifier_cleanup(&vector->interrupt);
-        }
-
-        g_free(vdev->msi_vectors);
-        vdev->msi_vectors = NULL;
+        vfio_msi_disable_common(vdev, false);
 
-        if (ret > 0 && ret != vdev->nr_vectors) {
+        if (ret > 0) {
             vdev->nr_vectors = ret;
             goto retry;
         }
-        vdev->nr_vectors = 0;
 
         /*
          * Failing to setup MSI doesn't really fall within any specification.
@@ -680,7 +669,6 @@ retry:
          * out to fall back to INTx for this device.
          */
         error_report("vfio: Error: Failed to enable MSI");
-        vdev->interrupt = VFIO_INT_NONE;
 
         return;
     }
@@ -688,7 +676,7 @@ retry:
     trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
 }
 
-static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
+static void vfio_msi_disable_common(VFIOPCIDevice *vdev, bool enable_intx)
 {
     Error *err = NULL;
     int i;
@@ -710,9 +698,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
     vdev->nr_vectors = 0;
     vdev->interrupt = VFIO_INT_NONE;
 
-    vfio_intx_enable(vdev, &err);
-    if (err) {
-        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+    if (enable_intx) {
+        vfio_intx_enable(vdev, &err);
+        if (err) {
+            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+        }
     }
 }
 
@@ -737,7 +727,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
         vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
     }
 
-    vfio_msi_disable_common(vdev);
+    vfio_msi_disable_common(vdev, true);
 
     memset(vdev->msix->pending, 0,
            BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
@@ -748,7 +738,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
 static void vfio_msi_disable(VFIOPCIDevice *vdev)
 {
     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
-    vfio_msi_disable_common(vdev);
+    vfio_msi_disable_common(vdev, true);
 
     trace_vfio_msi_disable(vdev->vbasedev.name);
 }
-- 
1.8.3.1




* [PATCH 2/5] msix: simplfy the conditional in msix_set/unset_vector_notifiers
  2021-08-25  7:56 [PATCH 0/5] optimize the downtime for vfio migration Longpeng(Mike)
  2021-08-25  7:56 ` [PATCH 1/5] vfio: use helper to simplfy the failure path in vfio_msi_enable Longpeng(Mike)
@ 2021-08-25  7:56 ` Longpeng(Mike)
  2021-08-25  9:52   ` Philippe Mathieu-Daudé
  2021-09-03 21:55   ` Alex Williamson
  2021-08-25  7:56 ` [PATCH 3/5] vfio: defer to enable msix in migration resume phase Longpeng(Mike)
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 22+ messages in thread
From: Longpeng(Mike) @ 2021-08-25  7:56 UTC (permalink / raw)
  To: alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: Longpeng(Mike), arei.gonglei, huangzhichao, qemu-devel

'msix_function_masked' is kept pace with the device's config,
we can use it to replace the complex conditional in
msix_set/unset_vector_notifiers.

poll_notifier should be reset to NULL in the error path in
msix_set_vector_notifiers, fix it incidentally.

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
---
 hw/pci/msix.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index ae9331c..8057709 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -592,8 +592,7 @@ int msix_set_vector_notifiers(PCIDevice *dev,
     dev->msix_vector_release_notifier = release_notifier;
     dev->msix_vector_poll_notifier = poll_notifier;
 
-    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
-        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
+    if (!dev->msix_function_masked) {
         for (vector = 0; vector < dev->msix_entries_nr; vector++) {
             ret = msix_set_notifier_for_vector(dev, vector);
             if (ret < 0) {
@@ -612,6 +611,7 @@ undo:
     }
     dev->msix_vector_use_notifier = NULL;
     dev->msix_vector_release_notifier = NULL;
+    dev->msix_vector_poll_notifier = NULL;
     return ret;
 }
 
@@ -622,8 +622,7 @@ void msix_unset_vector_notifiers(PCIDevice *dev)
     assert(dev->msix_vector_use_notifier &&
            dev->msix_vector_release_notifier);
 
-    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
-        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
+    if (!dev->msix_function_masked) {
         for (vector = 0; vector < dev->msix_entries_nr; vector++) {
             msix_unset_notifier_for_vector(dev, vector);
         }
-- 
1.8.3.1




* [PATCH 3/5] vfio: defer to enable msix in migration resume phase
  2021-08-25  7:56 [PATCH 0/5] optimize the downtime for vfio migration Longpeng(Mike)
  2021-08-25  7:56 ` [PATCH 1/5] vfio: use helper to simplfy the failure path in vfio_msi_enable Longpeng(Mike)
  2021-08-25  7:56 ` [PATCH 2/5] msix: simplfy the conditional in msix_set/unset_vector_notifiers Longpeng(Mike)
@ 2021-08-25  7:56 ` Longpeng(Mike)
  2021-08-25  9:57   ` Philippe Mathieu-Daudé
  2021-09-03 21:56   ` Alex Williamson
  2021-08-25  7:56 ` [PATCH 4/5] kvm: irqchip: support defer to commit the route Longpeng(Mike)
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 22+ messages in thread
From: Longpeng(Mike) @ 2021-08-25  7:56 UTC (permalink / raw)
  To: alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: Longpeng(Mike), arei.gonglei, huangzhichao, qemu-devel

The vf's unmasked msix vectors will be enable one by one in
migraiton resume phase, VFIO_DEVICE_SET_IRQS will be called
for each vector, it's a bit expensive if the vf has more
vectors.

We can call VFIO_DEVICE_SET_IRQS once outside the loop of set
vector notifiers to reduce the cost.
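
The batched call carries all the eventfds in a single ioctl; roughly
(a simplified sketch of the existing vfio_enable_vectors, error
handling omitted):

    int argsz = sizeof(struct vfio_irq_set) + nr_vectors * sizeof(int32_t);
    struct vfio_irq_set *irq_set = g_malloc0(argsz);
    int32_t *fds = (int32_t *)&irq_set->data;

    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = nr_vectors;    /* all vectors in one ioctl */
    /* fds[i] = the eventfd of vector i, or -1 to leave it disabled */
    ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);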

The test VM has 128 vcpus and 8 VF (with 65 vectors enabled),
we mesure the cost of the vfio_msix_enable for each one, and
we can see 10% costs can be reduced.

        Origin          Apply this patch
1st     8               4
2nd     15              11
3rd     22              18
4th     24              25
5th     36              33
6th     44              40
7th     51              47
8th     58              54
Total   258ms           232ms

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
---
 hw/vfio/pci.c | 22 ++++++++++++++++++++++
 hw/vfio/pci.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 7cc43fe..ca37fb7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -372,6 +372,10 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
     int ret = 0, i, argsz;
     int32_t *fds;
 
+    if (!vdev->nr_vectors) {
+        return 0;
+    }
+
     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 
     irq_set = g_malloc0(argsz);
@@ -495,6 +499,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
         }
     }
 
+    if (vdev->defer_add_virq) {
+        vdev->nr_vectors = MAX(vdev->nr_vectors, nr + 1);
+        goto clear_pending;
+    }
+
     /*
      * We don't want to have the host allocate all possible MSI vectors
      * for a device if they're not in use, so we shutdown and incrementally
@@ -524,6 +533,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
         }
     }
 
+clear_pending:
     /* Disable PBA emulation when nothing more is pending. */
     clear_bit(nr, vdev->msix->pending);
     if (find_first_bit(vdev->msix->pending,
@@ -608,6 +618,16 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
     if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
                                   vfio_msix_vector_release, NULL)) {
         error_report("vfio: msix_set_vector_notifiers failed");
+        return;
+    }
+
+    if (!pdev->msix_function_masked && vdev->defer_add_virq) {
+        int ret;
+        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
+        ret = vfio_enable_vectors(vdev, true);
+        if (ret) {
+            error_report("vfio: failed to enable vectors, %d", ret);
+        }
     }
 
     trace_vfio_msix_enable(vdev->vbasedev.name);
@@ -2456,7 +2476,9 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
     if (msi_enabled(pdev)) {
         vfio_msi_enable(vdev);
     } else if (msix_enabled(pdev)) {
+        vdev->defer_add_virq = true;
         vfio_msix_enable(vdev);
+        vdev->defer_add_virq = false;
     }
 
     return ret;
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 6477751..4235c83 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -171,6 +171,7 @@ struct VFIOPCIDevice {
     bool no_kvm_ioeventfd;
     bool no_vfio_ioeventfd;
     bool enable_ramfb;
+    bool defer_add_virq;
     VFIODisplay *dpy;
     Notifier irqchip_change_notifier;
 };
-- 
1.8.3.1




* [PATCH 4/5] kvm: irqchip: support defer to commit the route
  2021-08-25  7:56 [PATCH 0/5] optimize the downtime for vfio migration Longpeng(Mike)
                   ` (2 preceding siblings ...)
  2021-08-25  7:56 ` [PATCH 3/5] vfio: defer to enable msix in migration resume phase Longpeng(Mike)
@ 2021-08-25  7:56 ` Longpeng(Mike)
  2021-09-03 21:57   ` Alex Williamson
  2021-08-25  7:56 ` [PATCH 5/5] vfio: defer to commit kvm route in migration resume phase Longpeng(Mike)
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 22+ messages in thread
From: Longpeng(Mike) @ 2021-08-25  7:56 UTC (permalink / raw)
  To: alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: Longpeng(Mike), arei.gonglei, huangzhichao, qemu-devel

The kvm_irqchip_commit_routes() is relatively expensive, so
give users a choice of whether to commit the route immediately
when they add an msi/msix route.
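
For example, a caller that adds several routes can defer and then
commit them all at once (sketch):

    for (i = 0; i < nr_vectors; i++) {
        virq[i] = kvm_irqchip_add_msi_route(kvm_state, i, pdev,
                                            true /* defer_commit */);
    }
    kvm_irqchip_commit_routes(kvm_state);    /* one commit for all */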

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
---
 accel/kvm/kvm-all.c    | 10 +++++++---
 accel/stubs/kvm-stub.c |  3 ++-
 hw/misc/ivshmem.c      |  2 +-
 hw/vfio/pci.c          |  2 +-
 hw/virtio/virtio-pci.c |  2 +-
 include/sysemu/kvm.h   |  4 +++-
 target/i386/kvm/kvm.c  |  2 +-
 7 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 0125c17..1f788a2 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1950,7 +1950,8 @@ int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
     return kvm_set_irq(s, route->kroute.gsi, 1);
 }
 
-int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
+int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
+                              bool defer_commit)
 {
     struct kvm_irq_routing_entry kroute = {};
     int virq;
@@ -1993,7 +1994,9 @@ int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
 
     kvm_add_routing_entry(s, &kroute);
     kvm_arch_add_msi_route_post(&kroute, vector, dev);
-    kvm_irqchip_commit_routes(s);
+    if (!defer_commit) {
+        kvm_irqchip_commit_routes(s);
+    }
 
     return virq;
 }
@@ -2151,7 +2154,8 @@ int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
     abort();
 }
 
-int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
+int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
+                              bool defer_commit)
 {
     return -ENOSYS;
 }
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 5b1d00a..d5caaca 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -81,7 +81,8 @@ int kvm_on_sigbus(int code, void *addr)
 }
 
 #ifndef CONFIG_USER_ONLY
-int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
+int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
+                              bool defer_commit)
 {
     return -ENOSYS;
 }
diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
index 1ba4a98..98b14cc 100644
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
@@ -429,7 +429,7 @@ static void ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector,
     IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);
     assert(!s->msi_vectors[vector].pdev);
 
-    ret = kvm_irqchip_add_msi_route(kvm_state, vector, pdev);
+    ret = kvm_irqchip_add_msi_route(kvm_state, vector, pdev, false);
     if (ret < 0) {
         error_setg(errp, "kvm_irqchip_add_msi_route failed");
         return;
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ca37fb7..3ab67d6 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -427,7 +427,7 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
         return;
     }
 
-    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev);
+    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev, false);
     if (virq < 0) {
         event_notifier_cleanup(&vector->kvm_interrupt);
         return;
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 433060a..7e2d021 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -684,7 +684,7 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
     int ret;
 
     if (irqfd->users == 0) {
-        ret = kvm_irqchip_add_msi_route(kvm_state, vector, &proxy->pci_dev);
+        ret = kvm_irqchip_add_msi_route(kvm_state, vector, &proxy->pci_dev, false);
         if (ret < 0) {
             return ret;
         }
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index a1ab1ee..1932dc0 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -473,9 +473,11 @@ void kvm_init_cpu_signals(CPUState *cpu);
  *          message.
  * @dev:    Owner PCI device to add the route. If @dev is specified
  *          as @NULL, an empty MSI message will be inited.
+ * @defer_commit:   Defer to commit new route to the KVM core.
  * @return: virq (>=0) when success, errno (<0) when failed.
  */
-int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev);
+int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
+                              bool defer_commit);
 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                  PCIDevice *dev);
 void kvm_irqchip_commit_routes(KVMState *s);
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index e69abe4..896406b 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -4724,7 +4724,7 @@ void kvm_arch_init_irq_routing(KVMState *s)
         /* If the ioapic is in QEMU and the lapics are in KVM, reserve
            MSI routes for signaling interrupts to the local apics. */
         for (i = 0; i < IOAPIC_NUM_PINS; i++) {
-            if (kvm_irqchip_add_msi_route(s, 0, NULL) < 0) {
+            if (kvm_irqchip_add_msi_route(s, 0, NULL, false) < 0) {
                 error_report("Could not enable split IRQ mode.");
                 exit(1);
             }
-- 
1.8.3.1




* [PATCH 5/5] vfio: defer to commit kvm route in migration resume phase
  2021-08-25  7:56 [PATCH 0/5] optimize the downtime for vfio migration Longpeng(Mike)
                   ` (3 preceding siblings ...)
  2021-08-25  7:56 ` [PATCH 4/5] kvm: irqchip: support defer to commit the route Longpeng(Mike)
@ 2021-08-25  7:56 ` Longpeng(Mike)
  2021-09-03 21:57   ` Alex Williamson
  2021-08-25 10:05 ` [PATCH 0/5] optimize the downtime for vfio migration Philippe Mathieu-Daudé
  2021-09-02  9:43 ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  6 siblings, 1 reply; 22+ messages in thread
From: Longpeng(Mike) @ 2021-08-25  7:56 UTC (permalink / raw)
  To: alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: Longpeng(Mike), arei.gonglei, huangzhichao, qemu-devel

In the migration resume phase, all unmasked msix vectors need to
be set up when loading the VF state. However, the setup operation
takes longer if the VF has more unmasked vectors.

In our case, the VF has 65 vectors and each one spends at most
0.8ms on the setup operation, so the total cost of a VF is about
8-58ms. For a VM that has 8 VFs of this type, the total cost is
more than 250ms.

vfio_pci_load_config
  vfio_msix_enable
    msix_set_vector_notifiers
      for (vector = 0; vector < dev->msix_entries_nr; vector++) {
        vfio_msix_vector_do_use
          vfio_add_kvm_msi_virq
            kvm_irqchip_commit_routes <-- expensive
      }

We can reduce the cost by committing only once outside the loop.
The routes are cached in kvm_state; we commit them first and then
bind an irqfd for each vector.
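
With this patch, the flow becomes:

vfio_pci_load_config
  vfio_msix_enable
    msix_set_vector_notifiers
      for (vector = 0; vector < dev->msix_entries_nr; vector++) {
        vfio_msix_vector_do_use
          vfio_add_kvm_msi_virq      <-- route cached, no commit
      }
    vfio_commit_kvm_msi_virq         <-- commit once, then bind irqfds
    vfio_enable_vectors              <-- one VFIO_DEVICE_SET_IRQS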

The test VM has 128 vcpus and 8 VFs (with 65 vectors enabled);
we measure the cost of vfio_msix_enable for each one, and
we can see 90+% of the cost can be reduced.

        Origin          Apply this patch
                        and vfio enable optimization
1st     8               2
2nd     15              2
3rd     22              2
4th     24              3
5th     36              2
6th     44              3
7th     51              3
8th     58              4
Total   258ms           21ms

The optimization can also be applied to the msi type.

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
---
 hw/vfio/pci.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 3ab67d6..50e7ec7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -427,12 +427,17 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
         return;
     }
 
-    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev, false);
+    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev,
+                                     vdev->defer_add_virq);
     if (virq < 0) {
         event_notifier_cleanup(&vector->kvm_interrupt);
         return;
     }
 
+    if (vdev->defer_add_virq) {
+        goto out;
+    }
+
     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
                                        NULL, virq) < 0) {
         kvm_irqchip_release_virq(kvm_state, virq);
@@ -440,6 +445,7 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
         return;
     }
 
+out:
     vector->virq = virq;
 }
 
@@ -577,6 +583,36 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
     }
 }
 
+static void vfio_commit_kvm_msi_virq(VFIOPCIDevice *vdev)
+{
+    int i;
+    VFIOMSIVector *vector;
+    bool commited = false;
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        vector = &vdev->msi_vectors[i];
+
+        if (vector->virq < 0) {
+            continue;
+        }
+
+        /* Commit cached route entries to KVM core first if not yet */
+        if (!commited) {
+            kvm_irqchip_commit_routes(kvm_state);
+            commited = true;
+        }
+
+        if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
+                                               &vector->kvm_interrupt,
+                                               NULL, vector->virq) < 0) {
+            kvm_irqchip_release_virq(kvm_state, vector->virq);
+            event_notifier_cleanup(&vector->kvm_interrupt);
+            vector->virq = -1;
+            return;
+        }
+    }
+}
+
 static void vfio_msix_enable(VFIOPCIDevice *vdev)
 {
     PCIDevice *pdev = &vdev->pdev;
@@ -624,6 +660,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
     if (!pdev->msix_function_masked && vdev->defer_add_virq) {
         int ret;
         vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
+        vfio_commit_kvm_msi_virq(vdev);
         ret = vfio_enable_vectors(vdev, true);
         if (ret) {
             error_report("vfio: failed to enable vectors, %d", ret);
@@ -664,6 +701,10 @@ retry:
         vfio_add_kvm_msi_virq(vdev, vector, i, false);
     }
 
+    if (vdev->defer_add_virq){
+        vfio_commit_kvm_msi_virq(vdev);
+    }
+
     /* Set interrupt type prior to possible interrupts */
     vdev->interrupt = VFIO_INT_MSI;
 
@@ -2473,13 +2514,13 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
     vfio_pci_write_config(pdev, PCI_COMMAND,
                           pci_get_word(pdev->config + PCI_COMMAND), 2);
 
+    vdev->defer_add_virq = true;
     if (msi_enabled(pdev)) {
         vfio_msi_enable(vdev);
     } else if (msix_enabled(pdev)) {
-        vdev->defer_add_virq = true;
         vfio_msix_enable(vdev);
-        vdev->defer_add_virq = false;
     }
+    vdev->defer_add_virq = false;
 
     return ret;
 }
-- 
1.8.3.1




* Re: [PATCH 2/5] msix: simplfy the conditional in msix_set/unset_vector_notifiers
  2021-08-25  7:56 ` [PATCH 2/5] msix: simplfy the conditional in msix_set/unset_vector_notifiers Longpeng(Mike)
@ 2021-08-25  9:52   ` Philippe Mathieu-Daudé
  2021-08-25  9:56     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  2021-09-03 21:55   ` Alex Williamson
  1 sibling, 1 reply; 22+ messages in thread
From: Philippe Mathieu-Daudé @ 2021-08-25  9:52 UTC (permalink / raw)
  To: Longpeng(Mike), alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: arei.gonglei, huangzhichao, qemu-devel

On 8/25/21 9:56 AM, Longpeng(Mike) wrote:
> 'msix_function_masked' is kept pace with the device's config,
> we can use it to replace the complex conditional in
> msix_set/unset_vector_notifiers.

Typo 'simplfy' -> 'simplify' in this/previous patch subject.

> poll_notifier should be reset to NULL in the error path in
> msix_set_vector_notifiers, fix it incidentally.

I'd rather see this fix in a different patch, being
unrelated to the msix_function_masked optimization.

> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
> ---
>  hw/pci/msix.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/pci/msix.c b/hw/pci/msix.c
> index ae9331c..8057709 100644
> --- a/hw/pci/msix.c
> +++ b/hw/pci/msix.c
> @@ -592,8 +592,7 @@ int msix_set_vector_notifiers(PCIDevice *dev,
>      dev->msix_vector_release_notifier = release_notifier;
>      dev->msix_vector_poll_notifier = poll_notifier;
>  
> -    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
> -        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
> +    if (!dev->msix_function_masked) {
>          for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>              ret = msix_set_notifier_for_vector(dev, vector);
>              if (ret < 0) {
> @@ -612,6 +611,7 @@ undo:
>      }
>      dev->msix_vector_use_notifier = NULL;
>      dev->msix_vector_release_notifier = NULL;
> +    dev->msix_vector_poll_notifier = NULL;
>      return ret;
>  }
>  
> @@ -622,8 +622,7 @@ void msix_unset_vector_notifiers(PCIDevice *dev)
>      assert(dev->msix_vector_use_notifier &&
>             dev->msix_vector_release_notifier);
>  
> -    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
> -        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
> +    if (!dev->msix_function_masked) {
>          for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>              msix_unset_notifier_for_vector(dev, vector);
>          }
> 




* Re: [PATCH 2/5] msix: simplfy the conditional in msix_set/unset_vector_notifiers
  2021-08-25  9:52   ` Philippe Mathieu-Daudé
@ 2021-08-25  9:56     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  0 siblings, 0 replies; 22+ messages in thread
From: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) @ 2021-08-25  9:56 UTC (permalink / raw)
  To: Philippe Mathieu-Daudé,
	alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: arei.gonglei, huangzhichao, qemu-devel



On 2021/8/25 17:52, Philippe Mathieu-Daudé wrote:
> On 8/25/21 9:56 AM, Longpeng(Mike) wrote:
>> 'msix_function_masked' is kept pace with the device's config,
>> we can use it to replace the complex conditional in
>> msix_set/unset_vector_notifiers.
> 
> Typo 'simplfy' -> 'simplify' in this/previous patch subject.

Ok.

>> poll_notifier should be reset to NULL in the error path in
>> msix_set_vector_notifiers, fix it incidentally.
> 
> I'd rather see this fix in a different patch, being
> unrelated to the msix_function_masked optimization.
>
Ok, will split in next version. Thanks.

>> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
>> ---
>>  hw/pci/msix.c | 7 +++----
>>  1 file changed, 3 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/pci/msix.c b/hw/pci/msix.c
>> index ae9331c..8057709 100644
>> --- a/hw/pci/msix.c
>> +++ b/hw/pci/msix.c
>> @@ -592,8 +592,7 @@ int msix_set_vector_notifiers(PCIDevice *dev,
>>      dev->msix_vector_release_notifier = release_notifier;
>>      dev->msix_vector_poll_notifier = poll_notifier;
>>  
>> -    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
>> -        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
>> +    if (!dev->msix_function_masked) {
>>          for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>>              ret = msix_set_notifier_for_vector(dev, vector);
>>              if (ret < 0) {
>> @@ -612,6 +611,7 @@ undo:
>>      }
>>      dev->msix_vector_use_notifier = NULL;
>>      dev->msix_vector_release_notifier = NULL;
>> +    dev->msix_vector_poll_notifier = NULL;
>>      return ret;
>>  }
>>  
>> @@ -622,8 +622,7 @@ void msix_unset_vector_notifiers(PCIDevice *dev)
>>      assert(dev->msix_vector_use_notifier &&
>>             dev->msix_vector_release_notifier);
>>  
>> -    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
>> -        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
>> +    if (!dev->msix_function_masked) {
>>          for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>>              msix_unset_notifier_for_vector(dev, vector);
>>          }
>>
> 
> .
> 



* Re: [PATCH 3/5] vfio: defer to enable msix in migration resume phase
  2021-08-25  7:56 ` [PATCH 3/5] vfio: defer to enable msix in migration resume phase Longpeng(Mike)
@ 2021-08-25  9:57   ` Philippe Mathieu-Daudé
  2021-08-25 10:06     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  2021-09-03 21:56   ` Alex Williamson
  1 sibling, 1 reply; 22+ messages in thread
From: Philippe Mathieu-Daudé @ 2021-08-25  9:57 UTC (permalink / raw)
  To: Longpeng(Mike), alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: arei.gonglei, huangzhichao, qemu-devel

On 8/25/21 9:56 AM, Longpeng(Mike) wrote:
> The vf's unmasked msix vectors will be enable one by one in
> migraiton resume phase, VFIO_DEVICE_SET_IRQS will be called

Typo "migration"

> for each vector, it's a bit expensive if the vf has more
> vectors.
> 
> We can call VFIO_DEVICE_SET_IRQS once outside the loop of set
> vector notifiers to reduce the cost.
> 
> The test VM has 128 vcpus and 8 VF (with 65 vectors enabled),
> we mesure the cost of the vfio_msix_enable for each one, and

Typo "measure"

> we can see 10% costs can be reduced.
> 
>         Origin          Apply this patch
> 1st     8               4
> 2nd     15              11
> 3rd     22              18
> 4th     24              25
> 5th     36              33
> 6th     44              40
> 7th     51              47
> 8th     58              54
> Total   258ms           232ms
> 
> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
> ---
>  hw/vfio/pci.c | 22 ++++++++++++++++++++++
>  hw/vfio/pci.h |  1 +
>  2 files changed, 23 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 7cc43fe..ca37fb7 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -372,6 +372,10 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
>      int ret = 0, i, argsz;
>      int32_t *fds;
>  
> +    if (!vdev->nr_vectors) {
> +        return 0;
> +    }
> +
>      argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
>  
>      irq_set = g_malloc0(argsz);
> @@ -495,6 +499,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>          }
>      }
>  
> +    if (vdev->defer_add_virq) {
> +        vdev->nr_vectors = MAX(vdev->nr_vectors, nr + 1);
> +        goto clear_pending;
> +    }
> +
>      /*
>       * We don't want to have the host allocate all possible MSI vectors
>       * for a device if they're not in use, so we shutdown and incrementally
> @@ -524,6 +533,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>          }
>      }
>  
> +clear_pending:
>      /* Disable PBA emulation when nothing more is pending. */
>      clear_bit(nr, vdev->msix->pending);
>      if (find_first_bit(vdev->msix->pending,
> @@ -608,6 +618,16 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>      if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
>                                    vfio_msix_vector_release, NULL)) {
>          error_report("vfio: msix_set_vector_notifiers failed");
> +        return;
> +    }
> +
> +    if (!pdev->msix_function_masked && vdev->defer_add_virq) {
> +        int ret;
> +        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
> +        ret = vfio_enable_vectors(vdev, true);
> +        if (ret) {
> +            error_report("vfio: failed to enable vectors, %d", ret);
> +        }
>      }
>  
>      trace_vfio_msix_enable(vdev->vbasedev.name);
> @@ -2456,7 +2476,9 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>      if (msi_enabled(pdev)) {
>          vfio_msi_enable(vdev);
>      } else if (msix_enabled(pdev)) {
> +        vdev->defer_add_virq = true;
>          vfio_msix_enable(vdev);

What about passing defer_add_virq as a boolean argument
to vfio_msix_enable()?

> +        vdev->defer_add_virq = false;
>      }
>  
>      return ret;
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 6477751..4235c83 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -171,6 +171,7 @@ struct VFIOPCIDevice {
>      bool no_kvm_ioeventfd;
>      bool no_vfio_ioeventfd;
>      bool enable_ramfb;
> +    bool defer_add_virq;
>      VFIODisplay *dpy;
>      Notifier irqchip_change_notifier;
>  };
> 




* Re: [PATCH 0/5] optimize the downtime for vfio migration
  2021-08-25  7:56 [PATCH 0/5] optimize the downtime for vfio migration Longpeng(Mike)
                   ` (4 preceding siblings ...)
  2021-08-25  7:56 ` [PATCH 5/5] vfio: defer to commit kvm route in migration resume phase Longpeng(Mike)
@ 2021-08-25 10:05 ` Philippe Mathieu-Daudé
  2021-08-25 10:09   ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  2021-09-02  9:43 ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  6 siblings, 1 reply; 22+ messages in thread
From: Philippe Mathieu-Daudé @ 2021-08-25 10:05 UTC (permalink / raw)
  To: Longpeng(Mike), alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: Juan Quintela, arei.gonglei, huangzhichao, qemu-devel,
	Dr. David Alan Gilbert

Cc'ing David/Juan for migration big picture (just in case).

On 8/25/21 9:56 AM, Longpeng(Mike) wrote:
> In the vfio migration resume phase, the cost increases with the
> number of unmasked vectors the vfio device has. We try to optimize
> it in this series.
> 
> Patches 1 & 2 are simple code cleanups.
> Patch 3 defers setting the irqs to the vfio core.
> Patches 4 & 5 defer committing the routes to the KVM core.
> 
> The test VM has 128 vcpus and 8 VFs (with 65 vectors enabled);
> we measure the cost of vfio_msix_enable for each one, and
> we can see the total cost can be significantly reduced.
> 
>         Origin         Apply Patch 3     Apply Patch 3/4/5   
> 1st     8              4                 2
> 2nd     15             11                2
> 3rd     22             18                2
> 4th     24             25                3
> 5th     36             33                2
> 6th     44             40                3
> 7th     51             47                3
> 8th     58             54                4
> Total   258ms          232ms             21ms
> 
> 
> Longpeng (Mike) (5):
>   vfio: use helper to simplfy the failure path in vfio_msi_enable
>   msix: simplfy the conditional in msix_set/unset_vector_notifiers
>   vfio: defer to enable msix in migration resume phase
>   kvm: irqchip: support defer to commit the route
>   vfio: defer to commit kvm route in migration resume phase

Overall makes sense and LGTM but migration/KVM are not my area :/




* Re: [PATCH 3/5] vfio: defer to enable msix in migration resume phase
  2021-08-25  9:57   ` Philippe Mathieu-Daudé
@ 2021-08-25 10:06     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  0 siblings, 0 replies; 22+ messages in thread
From: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) @ 2021-08-25 10:06 UTC (permalink / raw)
  To: Philippe Mathieu-Daudé,
	alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: arei.gonglei, huangzhichao, qemu-devel



On 2021/8/25 17:57, Philippe Mathieu-Daudé wrote:
> On 8/25/21 9:56 AM, Longpeng(Mike) wrote:
>> The vf's unmasked msix vectors will be enable one by one in
>> migraiton resume phase, VFIO_DEVICE_SET_IRQS will be called
> 
> Typo "migration"
> 
Ok.

>> for each vector, it's a bit expensive if the vf has more
>> vectors.
>>
>> We can call VFIO_DEVICE_SET_IRQS once outside the loop of set
>> vector notifiers to reduce the cost.
>>
>> The test VM has 128 vcpus and 8 VF (with 65 vectors enabled),
>> we mesure the cost of the vfio_msix_enable for each one, and
> 
> Typo "measure"
> 
Ok.

>> we can see 10% costs can be reduced.
>>
>>         Origin          Apply this patch
>> 1st     8               4
>> 2nd     15              11
>> 3rd     22              18
>> 4th     24              25
>> 5th     36              33
>> 6th     44              40
>> 7th     51              47
>> 8th     58              54
>> Total   258ms           232ms
>>
>> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
>> ---
>>  hw/vfio/pci.c | 22 ++++++++++++++++++++++
>>  hw/vfio/pci.h |  1 +
>>  2 files changed, 23 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 7cc43fe..ca37fb7 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -372,6 +372,10 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
>>      int ret = 0, i, argsz;
>>      int32_t *fds;
>>  
>> +    if (!vdev->nr_vectors) {
>> +        return 0;
>> +    }
>> +
>>      argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
>>  
>>      irq_set = g_malloc0(argsz);
>> @@ -495,6 +499,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>          }
>>      }
>>  
>> +    if (vdev->defer_add_virq) {
>> +        vdev->nr_vectors = MAX(vdev->nr_vectors, nr + 1);
>> +        goto clear_pending;
>> +    }
>> +
>>      /*
>>       * We don't want to have the host allocate all possible MSI vectors
>>       * for a device if they're not in use, so we shutdown and incrementally
>> @@ -524,6 +533,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>          }
>>      }
>>  
>> +clear_pending:
>>      /* Disable PBA emulation when nothing more is pending. */
>>      clear_bit(nr, vdev->msix->pending);
>>      if (find_first_bit(vdev->msix->pending,
>> @@ -608,6 +618,16 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>>      if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
>>                                    vfio_msix_vector_release, NULL)) {
>>          error_report("vfio: msix_set_vector_notifiers failed");
>> +        return;
>> +    }
>> +
>> +    if (!pdev->msix_function_masked && vdev->defer_add_virq) {
>> +        int ret;
>> +        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
>> +        ret = vfio_enable_vectors(vdev, true);
>> +        if (ret) {
>> +            error_report("vfio: failed to enable vectors, %d", ret);
>> +        }
>>      }
>>  
>>      trace_vfio_msix_enable(vdev->vbasedev.name);
>> @@ -2456,7 +2476,9 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>>      if (msi_enabled(pdev)) {
>>          vfio_msi_enable(vdev);
>>      } else if (msix_enabled(pdev)) {
>> +        vdev->defer_add_virq = true;
>>          vfio_msix_enable(vdev);
> 
> What about passing defer_add_virq as a boolean argument
> to vfio_msix_enable()?
> 
We'll use defer_add_virq deep in the calltrace; passing it as a
parameter that way would require changing more functions.

>> +        vdev->defer_add_virq = false;
>>      }
>>  
>>      return ret;
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index 6477751..4235c83 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -171,6 +171,7 @@ struct VFIOPCIDevice {
>>      bool no_kvm_ioeventfd;
>>      bool no_vfio_ioeventfd;
>>      bool enable_ramfb;
>> +    bool defer_add_virq;
>>      VFIODisplay *dpy;
>>      Notifier irqchip_change_notifier;
>>  };
>>
> 
> .
> 



* Re: [PATCH 0/5] optimize the downtime for vfio migration
  2021-08-25 10:05 ` [PATCH 0/5] optimize the downtime for vfio migration Philippe Mathieu-Daudé
@ 2021-08-25 10:09   ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  0 siblings, 0 replies; 22+ messages in thread
From: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) @ 2021-08-25 10:09 UTC (permalink / raw)
  To: Philippe Mathieu-Daudé,
	alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: Juan Quintela, arei.gonglei, huangzhichao, qemu-devel,
	Dr. David Alan Gilbert



On 2021/8/25 18:05, Philippe Mathieu-Daudé wrote:
> Cc'ing David/Juan for migration big picture (just in case).
> 
> On 8/25/21 9:56 AM, Longpeng(Mike) wrote:
>> In the vfio migration resume phase, the cost increases with the
>> number of unmasked vectors the vfio device has. We try to optimize
>> it in this series.
>>
>> Patches 1 & 2 are simple code cleanups.
>> Patch 3 defers setting the irqs to the vfio core.
>> Patches 4 & 5 defer committing the routes to the KVM core.
>>
>> The test VM has 128 vcpus and 8 VFs (with 65 vectors enabled);
>> we measure the cost of vfio_msix_enable for each one, and
>> we can see the total cost can be significantly reduced.
>>
>>         Origin         Apply Patch 3     Apply Patch 3/4/5   
>> 1st     8              4                 2
>> 2nd     15             11                2
>> 3rd     22             18                2
>> 4th     24             25                3
>> 5th     36             33                2
>> 6th     44             40                3
>> 7th     51             47                3
>> 8th     58             54                4
>> Total   258ms          232ms             21ms
>>
>>
>> Longpeng (Mike) (5):
>>   vfio: use helper to simplfy the failure path in vfio_msi_enable
>>   msix: simplfy the conditional in msix_set/unset_vector_notifiers
>>   vfio: defer to enable msix in migration resume phase
>>   kvm: irqchip: support defer to commit the route
>>   vfio: defer to commit kvm route in migration resume phase
> 
> Overall makes sense and LGTM but migration/KVM are not my area :/
>
Thanks all the same :)

> .
> 



* Re: [PATCH 0/5] optimize the downtime for vfio migration
  2021-08-25  7:56 [PATCH 0/5] optimize the downtime for vfio migration Longpeng(Mike)
                   ` (5 preceding siblings ...)
  2021-08-25 10:05 ` [PATCH 0/5] optimize the downtime for vfio migration Philippe Mathieu-Daudé
@ 2021-09-02  9:43 ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  6 siblings, 0 replies; 22+ messages in thread
From: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) @ 2021-09-02  9:43 UTC (permalink / raw)
  To: alex.williamson, mst, marcel.apfelbaum, pbonzini
  Cc: arei.gonglei, huangzhichao, qemu-devel

Hi maintainers,

Ping...

On 2021/8/25 15:56, Longpeng(Mike) wrote:
> In the vfio migration resume phase, the cost increases with the
> number of unmasked vectors the vfio device has. We try to optimize
> it in this series.
> 
> Patches 1 & 2 are simple code cleanups.
> Patch 3 defers setting the irqs to the vfio core.
> Patches 4 & 5 defer committing the routes to the KVM core.
> 
> The test VM has 128 vcpus and 8 VFs (with 65 vectors enabled);
> we measure the cost of vfio_msix_enable for each one, and
> we can see the total cost can be significantly reduced.
> 
>         Origin         Apply Patch 3     Apply Patch 3/4/5   
> 1st     8              4                 2
> 2nd     15             11                2
> 3rd     22             18                2
> 4th     24             25                3
> 5th     36             33                2
> 6th     44             40                3
> 7th     51             47                3
> 8th     58             54                4
> Total   258ms          232ms             21ms
> 
> 
> Longpeng (Mike) (5):
>   vfio: use helper to simplfy the failure path in vfio_msi_enable
>   msix: simplfy the conditional in msix_set/unset_vector_notifiers
>   vfio: defer to enable msix in migration resume phase
>   kvm: irqchip: support defer to commit the route
>   vfio: defer to commit kvm route in migration resume phase
> 
>  accel/kvm/kvm-all.c    | 10 +++--
>  accel/stubs/kvm-stub.c |  3 +-
>  hw/misc/ivshmem.c      |  2 +-
>  hw/pci/msix.c          |  7 ++--
>  hw/vfio/pci.c          | 99 ++++++++++++++++++++++++++++++++++++++------------
>  hw/vfio/pci.h          |  1 +
>  hw/virtio/virtio-pci.c |  2 +-
>  include/sysemu/kvm.h   |  4 +-
>  target/i386/kvm/kvm.c  |  2 +-
>  9 files changed, 95 insertions(+), 35 deletions(-)
> 



* Re: [PATCH 1/5] vfio: use helper to simplfy the failure path in vfio_msi_enable
  2021-08-25  7:56 ` [PATCH 1/5] vfio: use helper to simplfy the failure path in vfio_msi_enable Longpeng(Mike)
@ 2021-09-03 21:55   ` Alex Williamson
  2021-09-07  2:11     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  0 siblings, 1 reply; 22+ messages in thread
From: Alex Williamson @ 2021-09-03 21:55 UTC (permalink / raw)
  To: Longpeng(Mike); +Cc: mst, qemu-devel, arei.gonglei, huangzhichao, pbonzini

On Wed, 25 Aug 2021 15:56:16 +0800
"Longpeng(Mike)" <longpeng2@huawei.com> wrote:

> The main difference of the failure path in vfio_msi_enable and
> vfio_msi_disable_common is enable INTX or not.
> 
> Extend the vfio_msi_disable_common to provide a arg to decide

"an arg"

> whether need to fallback, and then we can use this helper to
> instead the redundant code in vfio_msi_enable.

Do you mean s/instead/replace/?

> 
> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
> ---
>  hw/vfio/pci.c | 34 ++++++++++++----------------------
>  1 file changed, 12 insertions(+), 22 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index e1ea1d8..7cc43fe 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -47,6 +47,7 @@
>  
>  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> +static void vfio_msi_disable_common(VFIOPCIDevice *vdev, bool enable_intx);
>  
>  /*
>   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> @@ -650,29 +651,17 @@ retry:
>      if (ret) {
>          if (ret < 0) {
>              error_report("vfio: Error: Failed to setup MSI fds: %m");
> -        } else if (ret != vdev->nr_vectors) {

This part of the change is subtle and not mentioned in the commit log.
It does seem unnecessary to test against this specific return value
since any positive return is an error indicating the number of vectors
we should retry with, but this change should be described in a separate
patch.

> +        } else {
>              error_report("vfio: Error: Failed to enable %d "
>                           "MSI vectors, retry with %d", vdev->nr_vectors, ret);
>          }
>  
> -        for (i = 0; i < vdev->nr_vectors; i++) {
> -            VFIOMSIVector *vector = &vdev->msi_vectors[i];
> -            if (vector->virq >= 0) {
> -                vfio_remove_kvm_msi_virq(vector);
> -            }
> -            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
> -                                NULL, NULL, NULL);
> -            event_notifier_cleanup(&vector->interrupt);
> -        }
> -
> -        g_free(vdev->msi_vectors);
> -        vdev->msi_vectors = NULL;
> +        vfio_msi_disable_common(vdev, false);
>  
> -        if (ret > 0 && ret != vdev->nr_vectors) {
> +        if (ret > 0) {
>              vdev->nr_vectors = ret;
>              goto retry;
>          }
> -        vdev->nr_vectors = 0;
>  
>          /*
>           * Failing to setup MSI doesn't really fall within any specification.
> @@ -680,7 +669,6 @@ retry:
>           * out to fall back to INTx for this device.
>           */
>          error_report("vfio: Error: Failed to enable MSI");
> -        vdev->interrupt = VFIO_INT_NONE;
>  
>          return;
>      }
> @@ -688,7 +676,7 @@ retry:
>      trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
>  }
>  
> -static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
> +static void vfio_msi_disable_common(VFIOPCIDevice *vdev, bool enable_intx)

I'd rather avoid this sort of modal option to functions where we can.
Maybe this suggests instead that re-enabling INTx should be removed
from the common helper and callers needing to do so should do it
outside of the common helper, e.g. (an illustrative sketch only):
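
    vfio_msi_disable_common(vdev);

    /* fall back to INTx at the call site, not inside the helper */
    vfio_intx_enable(vdev, &err);
    if (err) {
        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
    }

Thanks,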

Alex


>  {
>      Error *err = NULL;
>      int i;
> @@ -710,9 +698,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
>      vdev->nr_vectors = 0;
>      vdev->interrupt = VFIO_INT_NONE;
>  
> -    vfio_intx_enable(vdev, &err);
> -    if (err) {
> -        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
> +    if (enable_intx) {
> +        vfio_intx_enable(vdev, &err);
> +        if (err) {
> +            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
> +        }
>      }
>  }
>  
> @@ -737,7 +727,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
>          vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
>      }
>  
> -    vfio_msi_disable_common(vdev);
> +    vfio_msi_disable_common(vdev, true);
>  
>      memset(vdev->msix->pending, 0,
>             BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
> @@ -748,7 +738,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
>  static void vfio_msi_disable(VFIOPCIDevice *vdev)
>  {
>      vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
> -    vfio_msi_disable_common(vdev);
> +    vfio_msi_disable_common(vdev, true);
>  
>      trace_vfio_msi_disable(vdev->vbasedev.name);
>  }




* Re: [PATCH 2/5] msix: simplfy the conditional in msix_set/unset_vector_notifiers
  2021-08-25  7:56 ` [PATCH 2/5] msix: simplfy the conditional in msix_set/unset_vector_notifiers Longpeng(Mike)
  2021-08-25  9:52   ` Philippe Mathieu-Daudé
@ 2021-09-03 21:55   ` Alex Williamson
  1 sibling, 0 replies; 22+ messages in thread
From: Alex Williamson @ 2021-09-03 21:55 UTC (permalink / raw)
  To: Longpeng(Mike); +Cc: mst, qemu-devel, arei.gonglei, huangzhichao, pbonzini

On Wed, 25 Aug 2021 15:56:17 +0800
"Longpeng(Mike)" <longpeng2@huawei.com> wrote:

> 'msix_function_masked' is kept pace with the device's config,

s/pace/synchronized/?

> we can use it to replace the complex conditional in
> msix_set/unset_vector_notifiers.
> 
> poll_notifier should be reset to NULL in the error path in
> msix_set_vector_notifiers, fix it incidentally.

Agree with Philippe, separate patch.
 
> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
> ---
>  hw/pci/msix.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/pci/msix.c b/hw/pci/msix.c
> index ae9331c..8057709 100644
> --- a/hw/pci/msix.c
> +++ b/hw/pci/msix.c
> @@ -592,8 +592,7 @@ int msix_set_vector_notifiers(PCIDevice *dev,
>      dev->msix_vector_release_notifier = release_notifier;
>      dev->msix_vector_poll_notifier = poll_notifier;
>  
> -    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
> -        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
> +    if (!dev->msix_function_masked) {
>          for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>              ret = msix_set_notifier_for_vector(dev, vector);
>              if (ret < 0) {
> @@ -612,6 +611,7 @@ undo:
>      }
>      dev->msix_vector_use_notifier = NULL;
>      dev->msix_vector_release_notifier = NULL;
> +    dev->msix_vector_poll_notifier = NULL;
>      return ret;
>  }
>  
> @@ -622,8 +622,7 @@ void msix_unset_vector_notifiers(PCIDevice *dev)
>      assert(dev->msix_vector_use_notifier &&
>             dev->msix_vector_release_notifier);
>  
> -    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
> -        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
> +    if (!dev->msix_function_masked) {
>          for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>              msix_unset_notifier_for_vector(dev, vector);
>          }




* Re: [PATCH 3/5] vfio: defer to enable msix in migration resume phase
  2021-08-25  7:56 ` [PATCH 3/5] vfio: defer to enable msix in migration resume phase Longpeng(Mike)
  2021-08-25  9:57   ` Philippe Mathieu-Daudé
@ 2021-09-03 21:56   ` Alex Williamson
  2021-09-07  2:12     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  1 sibling, 1 reply; 22+ messages in thread
From: Alex Williamson @ 2021-09-03 21:56 UTC (permalink / raw)
  To: Longpeng(Mike); +Cc: mst, qemu-devel, arei.gonglei, huangzhichao, pbonzini

On Wed, 25 Aug 2021 15:56:18 +0800
"Longpeng(Mike)" <longpeng2@huawei.com> wrote:

> The vf's unmasked msix vectors will be enable one by one in
> migraiton resume phase, VFIO_DEVICE_SET_IRQS will be called
> for each vector, it's a bit expensive if the vf has more
> vectors.
> 
> We can call VFIO_DEVICE_SET_IRQS once outside the loop of set
> vector notifiers to reduce the cost.
> 
> The test VM has 128 vcpus and 8 VF (with 65 vectors enabled),
> we mesure the cost of the vfio_msix_enable for each one, and
> we can see 10% costs can be reduced.
> 
>         Origin          Apply this patch

Original?

> 1st     8               4
> 2nd     15              11
> 3rd     22              18
> 4th     24              25
> 5th     36              33
> 6th     44              40
> 7th     51              47
> 8th     58              54
> Total   258ms           232ms

If the values here are ms for execution of vfio_msix_enable() per VF,
why are the values increasing per VF?  Do we have 65 vectors per VF or
do we have 65 vectors total, weighted towards the higher VFs?
This doesn't make sense without the data from the last patch in the
series.

> 
> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
> ---
>  hw/vfio/pci.c | 22 ++++++++++++++++++++++
>  hw/vfio/pci.h |  1 +
>  2 files changed, 23 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 7cc43fe..ca37fb7 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -372,6 +372,10 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
>      int ret = 0, i, argsz;
>      int32_t *fds;
>  
> +    if (!vdev->nr_vectors) {
> +        return 0;
> +    }

How would this occur?  Via the new call below?  But then we'd leave
vfio_msix_enabled() with MSI-X DISABLED???

> +
>      argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
>  
>      irq_set = g_malloc0(argsz);
> @@ -495,6 +499,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>          }
>      }
>  
> +    if (vdev->defer_add_virq) {
> +        vdev->nr_vectors = MAX(vdev->nr_vectors, nr + 1);
> +        goto clear_pending;
> +    }

This is a really ugly use of 'goto' to simply jump around code you'd
like to skip rather than reformat the function with branches to
conditionalize that code.  Gotos for consolidated error paths, retries,
hard to break loops are ok, not this.
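
For instance (an illustrative sketch only):

    if (vdev->defer_add_virq) {
        vdev->nr_vectors = MAX(vdev->nr_vectors, nr + 1);
    } else {
        /* existing shutdown-and-incrementally-re-enable code */
    }
    /* fall through to the PBA handling unconditionally */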


> +
>      /*
>       * We don't want to have the host allocate all possible MSI vectors
>       * for a device if they're not in use, so we shutdown and incrementally
> @@ -524,6 +533,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>          }
>      }
>  
> +clear_pending:
>      /* Disable PBA emulation when nothing more is pending. */
>      clear_bit(nr, vdev->msix->pending);
>      if (find_first_bit(vdev->msix->pending,
> @@ -608,6 +618,16 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>      if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
>                                    vfio_msix_vector_release, NULL)) {
>          error_report("vfio: msix_set_vector_notifiers failed");
> +        return;
> +    }
> +
> +    if (!pdev->msix_function_masked && vdev->defer_add_virq) {
> +        int ret;
> +        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
> +        ret = vfio_enable_vectors(vdev, true);
> +        if (ret) {
> +            error_report("vfio: failed to enable vectors, %d", ret);
> +        }
>      }
>  
>      trace_vfio_msix_enable(vdev->vbasedev.name);
> @@ -2456,7 +2476,9 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>      if (msi_enabled(pdev)) {
>          vfio_msi_enable(vdev);
>      } else if (msix_enabled(pdev)) {
> +        vdev->defer_add_virq = true;
>          vfio_msix_enable(vdev);
> +        vdev->defer_add_virq = false;


Ick.  Why is this a special case for vfio_msix_enable()?  Wouldn't we
prefer to always batch vector-use work while we're in the process of
enabling MSI-X?  
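
I.e. manage the flag inside vfio_msix_enable() itself so every caller
batches; a hypothetical sketch (the elided lines are the existing
code, and it assumes the deferred paths from this series):

    static void vfio_msix_enable(VFIOPCIDevice *vdev)
    {
        ...
        vdev->defer_add_virq = true;
        if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
                                      vfio_msix_vector_release, NULL)) {
            error_report("vfio: msix_set_vector_notifiers failed");
        }
        vdev->defer_add_virq = false;

        /* commit the batched routes and set all IRQs in one go here */
        ...
    }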

>      }
>  
>      return ret;
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 6477751..4235c83 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -171,6 +171,7 @@ struct VFIOPCIDevice {
>      bool no_kvm_ioeventfd;
>      bool no_vfio_ioeventfd;
>      bool enable_ramfb;
> +    bool defer_add_virq;
>      VFIODisplay *dpy;
>      Notifier irqchip_change_notifier;
>  };



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/5] kvm: irqchip: support defer to commit the route
  2021-08-25  7:56 ` [PATCH 4/5] kvm: irqchip: support defer to commit the route Longpeng(Mike)
@ 2021-09-03 21:57   ` Alex Williamson
  2021-09-07  2:13     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  0 siblings, 1 reply; 22+ messages in thread
From: Alex Williamson @ 2021-09-03 21:57 UTC (permalink / raw)
  To: Longpeng(Mike); +Cc: mst, qemu-devel, arei.gonglei, huangzhichao, pbonzini

On Wed, 25 Aug 2021 15:56:19 +0800
"Longpeng(Mike)" <longpeng2@huawei.com> wrote:

> kvm_irqchip_commit_routes() is relatively expensive, so give the
> callers a choice whether to commit the route immediately or not
> when they add an MSI/MSI-X route.
> 
> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
> ---
>  accel/kvm/kvm-all.c    | 10 +++++++---
>  accel/stubs/kvm-stub.c |  3 ++-
>  hw/misc/ivshmem.c      |  2 +-
>  hw/vfio/pci.c          |  2 +-
>  hw/virtio/virtio-pci.c |  2 +-
>  include/sysemu/kvm.h   |  4 +++-
>  target/i386/kvm/kvm.c  |  2 +-
>  7 files changed, 16 insertions(+), 9 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 0125c17..1f788a2 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1950,7 +1950,8 @@ int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
>      return kvm_set_irq(s, route->kroute.gsi, 1);
>  }
>  
> -int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
> +int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
> +                              bool defer_commit)
>  {
>      struct kvm_irq_routing_entry kroute = {};
>      int virq;
> @@ -1993,7 +1994,9 @@ int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
>  
>      kvm_add_routing_entry(s, &kroute);
>      kvm_arch_add_msi_route_post(&kroute, vector, dev);
> -    kvm_irqchip_commit_routes(s);
> +    if (!defer_commit) {
> +        kvm_irqchip_commit_routes(s);
> +    }


Personally I'd rather rename the function to
kvm_irqchip_add_deferred_msi_route() and have kvm_irqchip_add_msi_route()
become a wrapper that appends kvm_irqchip_commit_routes() to that.
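
Something like this (sketch only):

    int kvm_irqchip_add_deferred_msi_route(KVMState *s, int vector,
                                           PCIDevice *dev)
    {
        /* the current kvm_irqchip_add_msi_route() body, minus the commit */
    }

    int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
    {
        int virq = kvm_irqchip_add_deferred_msi_route(s, vector, dev);

        if (virq >= 0) {
            kvm_irqchip_commit_routes(s);
        }

        return virq;
    }
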
Thanks,

Alex

>  
>      return virq;
>  }
> @@ -2151,7 +2154,8 @@ int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
>      abort();
>  }
>  
> -int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
> +int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
> +                              bool defer_commit)
>  {
>      return -ENOSYS;
>  }
> diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
> index 5b1d00a..d5caaca 100644
> --- a/accel/stubs/kvm-stub.c
> +++ b/accel/stubs/kvm-stub.c
> @@ -81,7 +81,8 @@ int kvm_on_sigbus(int code, void *addr)
>  }
>  
>  #ifndef CONFIG_USER_ONLY
> -int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
> +int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
> +                              bool defer_commit)
>  {
>      return -ENOSYS;
>  }
> diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
> index 1ba4a98..98b14cc 100644
> --- a/hw/misc/ivshmem.c
> +++ b/hw/misc/ivshmem.c
> @@ -429,7 +429,7 @@ static void ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector,
>      IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);
>      assert(!s->msi_vectors[vector].pdev);
>  
> -    ret = kvm_irqchip_add_msi_route(kvm_state, vector, pdev);
> +    ret = kvm_irqchip_add_msi_route(kvm_state, vector, pdev, false);
>      if (ret < 0) {
>          error_setg(errp, "kvm_irqchip_add_msi_route failed");
>          return;
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index ca37fb7..3ab67d6 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -427,7 +427,7 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
>          return;
>      }
>  
> -    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev);
> +    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev, false);
>      if (virq < 0) {
>          event_notifier_cleanup(&vector->kvm_interrupt);
>          return;
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index 433060a..7e2d021 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -684,7 +684,7 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>      int ret;
>  
>      if (irqfd->users == 0) {
> -        ret = kvm_irqchip_add_msi_route(kvm_state, vector, &proxy->pci_dev);
> +        ret = kvm_irqchip_add_msi_route(kvm_state, vector, &proxy->pci_dev, false);
>          if (ret < 0) {
>              return ret;
>          }
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index a1ab1ee..1932dc0 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -473,9 +473,11 @@ void kvm_init_cpu_signals(CPUState *cpu);
>   *          message.
>   * @dev:    Owner PCI device to add the route. If @dev is specified
>   *          as @NULL, an empty MSI message will be inited.
> + * @defer_commit:   Defer to commit new route to the KVM core.
>   * @return: virq (>=0) when success, errno (<0) when failed.
>   */
> -int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev);
> +int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
> +                              bool defer_commit);
>  int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
>                                   PCIDevice *dev);
>  void kvm_irqchip_commit_routes(KVMState *s);
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index e69abe4..896406b 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -4724,7 +4724,7 @@ void kvm_arch_init_irq_routing(KVMState *s)
>          /* If the ioapic is in QEMU and the lapics are in KVM, reserve
>             MSI routes for signaling interrupts to the local apics. */
>          for (i = 0; i < IOAPIC_NUM_PINS; i++) {
> -            if (kvm_irqchip_add_msi_route(s, 0, NULL) < 0) {
> +            if (kvm_irqchip_add_msi_route(s, 0, NULL, false) < 0) {
>                  error_report("Could not enable split IRQ mode.");
>                  exit(1);
>              }



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5/5] vfio: defer to commit kvm route in migraiton resume phase
  2021-08-25  7:56 ` [PATCH 5/5] vfio: defer to commit kvm route in migraiton resume phase Longpeng(Mike)
@ 2021-09-03 21:57   ` Alex Williamson
  2021-09-07  2:14     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  0 siblings, 1 reply; 22+ messages in thread
From: Alex Williamson @ 2021-09-03 21:57 UTC (permalink / raw)
  To: Longpeng(Mike); +Cc: mst, qemu-devel, arei.gonglei, huangzhichao, pbonzini

On Wed, 25 Aug 2021 15:56:20 +0800
"Longpeng(Mike)" <longpeng2@huawei.com> wrote:

> In the migration resume phase, all unmasked MSI-X vectors need to be
> set up when loading the VF state. However, the setup takes longer
> if the VF has more unmasked vectors.
> 
> In our case, the VF has 65 vectors and each one spends at most 0.8ms
> on the setup operation, so the total cost per VF is about 8-58ms. For a
> VM that has 8 VFs of this type, the total cost is more than 250ms.
> 
> vfio_pci_load_config
>   vfio_msix_enable
>     msix_set_vector_notifiers
>       for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>         vfio_msix_vector_do_use
>           vfio_add_kvm_msi_virq
>             kvm_irqchip_commit_routes <-- expensive
>       }
> 
> We can reduce the cost by committing only once, outside the loop. The
> routes are cached in kvm_state; we commit them first and then bind
> an irqfd for each vector.
> 
> The test VM has 128 vcpus and 8 VFs (with 65 vectors enabled);
> we measure the cost of vfio_msix_enable for each one, and
> we can see 90+% of the cost can be reduced.
> 
>         Origin          Apply this patch
>                         and vfio enable optimization
> 1st     8               2
> 2nd     15              2
> 3rd     22              2
> 4th     24              3
> 5th     36              2
> 6th     44              3
> 7th     51              3
> 8th     58              4
> Total   258ms           21ms

Almost seems like we should have started here rather than with the
much more subtle improvements from patch 3.

 
> The optimization can also be applied to the MSI type.
> 
> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
> ---
>  hw/vfio/pci.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 44 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 3ab67d6..50e7ec7 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -427,12 +427,17 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
>          return;
>      }
>  
> -    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev, false);
> +    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev,
> +                                     vdev->defer_add_virq);

See comment on previous patch about these bool function args.

>      if (virq < 0) {
>          event_notifier_cleanup(&vector->kvm_interrupt);
>          return;
>      }
>  
> +    if (vdev->defer_add_virq) {
> +        goto out;
> +    }

See comment on previous patch about this goto flow.

> +
>      if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
>                                         NULL, virq) < 0) {
>          kvm_irqchip_release_virq(kvm_state, virq);
> @@ -440,6 +445,7 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
>          return;
>      }
>  
> +out:
>      vector->virq = virq;
>  }
>  
> @@ -577,6 +583,36 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>      }
>  }
>  
> +static void vfio_commit_kvm_msi_virq(VFIOPCIDevice *vdev)
> +{
> +    int i;
> +    VFIOMSIVector *vector;
> +    bool commited = false;
> +
> +    for (i = 0; i < vdev->nr_vectors; i++) {
> +        vector = &vdev->msi_vectors[i];
> +
> +        if (vector->virq < 0) {
> +            continue;
> +        }
> +
> +        /* Commit cached route entries to KVM core first if not yet */
> +        if (!commited) {
> +            kvm_irqchip_commit_routes(kvm_state);
> +            commited = true;
> +        }

Why is this in the loop? Shouldn't we just start with:

    if (!vdev->nr_vectors) {
        return;
    }

    kvm_irqchip_commit_routes(kvm_state);

    for (...

> +
> +        if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
> +                                               &vector->kvm_interrupt,
> +                                               NULL, vector->virq) < 0) {
> +            kvm_irqchip_release_virq(kvm_state, vector->virq);
> +            event_notifier_cleanup(&vector->kvm_interrupt);
> +            vector->virq = -1;
> +            return;
> +        }

And all the other vectors we've allocated?  Error logging?

> +    }
> +}
> +
>  static void vfio_msix_enable(VFIOPCIDevice *vdev)
>  {
>      PCIDevice *pdev = &vdev->pdev;
> @@ -624,6 +660,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>      if (!pdev->msix_function_masked && vdev->defer_add_virq) {
>          int ret;
>          vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
> +        vfio_commit_kvm_msi_virq(vdev);
>          ret = vfio_enable_vectors(vdev, true);
>          if (ret) {
>              error_report("vfio: failed to enable vectors, %d", ret);
> @@ -664,6 +701,10 @@ retry:
>          vfio_add_kvm_msi_virq(vdev, vector, i, false);
>      }
>  
> +    if (vdev->defer_add_virq){
> +        vfio_commit_kvm_msi_virq(vdev);
> +    }

Again, why is the load_config path unique? Shouldn't we always batch on
setup?

> +
>      /* Set interrupt type prior to possible interrupts */
>      vdev->interrupt = VFIO_INT_MSI;
>  
> @@ -2473,13 +2514,13 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>      vfio_pci_write_config(pdev, PCI_COMMAND,
>                            pci_get_word(pdev->config + PCI_COMMAND), 2);
>  
> +    vdev->defer_add_virq = true;
>      if (msi_enabled(pdev)) {
>          vfio_msi_enable(vdev);
>      } else if (msix_enabled(pdev)) {
> -        vdev->defer_add_virq = true;
>          vfio_msix_enable(vdev);
> -        vdev->defer_add_virq = false;
>      }
> +    vdev->defer_add_virq = false;
>  
>      return ret;
>  }



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/5] vfio: use helper to simplfy the failure path in vfio_msi_enable
  2021-09-03 21:55   ` Alex Williamson
@ 2021-09-07  2:11     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  0 siblings, 0 replies; 22+ messages in thread
From: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) @ 2021-09-07  2:11 UTC (permalink / raw)
  To: Alex Williamson; +Cc: mst, qemu-devel, arei.gonglei, huangzhichao, pbonzini



On 2021/9/4 5:55, Alex Williamson wrote:
> On Wed, 25 Aug 2021 15:56:16 +0800
> "Longpeng(Mike)" <longpeng2@huawei.com> wrote:
> 
>> The main difference of the failure path in vfio_msi_enable and
>> vfio_msi_disable_common is enable INTX or not.
>>
>> Extend the vfio_msi_disable_common to provide a arg to decide
> 
> "an arg"
> 

Thanks a lot, I'll fix all of the grammatical errors and typos together in
the next version.

>> whether need to fallback, and then we can use this helper to
>> instead the redundant code in vfio_msi_enable.
> 
> Do you mean s/instead/replace/?
> 
>>
>> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
>> ---
>>  hw/vfio/pci.c | 34 ++++++++++++----------------------
>>  1 file changed, 12 insertions(+), 22 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index e1ea1d8..7cc43fe 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -47,6 +47,7 @@
>>  
>>  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>> +static void vfio_msi_disable_common(VFIOPCIDevice *vdev, bool enable_intx);
>>  
>>  /*
>>   * Disabling BAR mmaping can be slow, but toggling it around INTx can
>> @@ -650,29 +651,17 @@ retry:
>>      if (ret) {
>>          if (ret < 0) {
>>              error_report("vfio: Error: Failed to setup MSI fds: %m");
>> -        } else if (ret != vdev->nr_vectors) {
> 
> This part of the change is subtle and not mentioned in the commit log.
> It does seem unnecessary to test against this specific return value
> since any positive return is an error indicating the number of vectors
> we should retry with, but this change should be described in a separate
> patch.
> 
Ok, thanks, I'll split this into a separate patch in the next version.

>> +        } else {
>>              error_report("vfio: Error: Failed to enable %d "
>>                           "MSI vectors, retry with %d", vdev->nr_vectors, ret);
>>          }
>>  
>> -        for (i = 0; i < vdev->nr_vectors; i++) {
>> -            VFIOMSIVector *vector = &vdev->msi_vectors[i];
>> -            if (vector->virq >= 0) {
>> -                vfio_remove_kvm_msi_virq(vector);
>> -            }
>> -            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
>> -                                NULL, NULL, NULL);
>> -            event_notifier_cleanup(&vector->interrupt);
>> -        }
>> -
>> -        g_free(vdev->msi_vectors);
>> -        vdev->msi_vectors = NULL;
>> +        vfio_msi_disable_common(vdev, false);
>>  
>> -        if (ret > 0 && ret != vdev->nr_vectors) {
>> +        if (ret > 0) {
>>              vdev->nr_vectors = ret;
>>              goto retry;
>>          }
>> -        vdev->nr_vectors = 0;
>>  
>>          /*
>>           * Failing to setup MSI doesn't really fall within any specification.
>> @@ -680,7 +669,6 @@ retry:
>>           * out to fall back to INTx for this device.
>>           */
>>          error_report("vfio: Error: Failed to enable MSI");
>> -        vdev->interrupt = VFIO_INT_NONE;
>>  
>>          return;
>>      }
>> @@ -688,7 +676,7 @@ retry:
>>      trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
>>  }
>>  
>> -static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
>> +static void vfio_msi_disable_common(VFIOPCIDevice *vdev, bool enable_intx)
> 
> I'd rather avoid these sort of modal options to functions where we can.
> Maybe this suggests instead that re-enabling INTx should be removed
> from the common helper and callers needing to do so should do it
> outside of the common helper.  Thanks,
> 
Ok, thanks.
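
I.e. drop the INTx fallback from the common helper and let the disable
paths do it explicitly; a rough sketch of what I have in mind:

    static void vfio_msi_disable(VFIOPCIDevice *vdev)
    {
        Error *err = NULL;

        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
        vfio_msi_disable_common(vdev);
        vfio_intx_enable(vdev, &err);
        if (err) {
            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
        }

        trace_vfio_msi_disable(vdev->vbasedev.name);
    }

and the same pattern in vfio_msix_disable().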

> Alex
> 
> 
>>  {
>>      Error *err = NULL;
>>      int i;
>> @@ -710,9 +698,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
>>      vdev->nr_vectors = 0;
>>      vdev->interrupt = VFIO_INT_NONE;
>>  
>> -    vfio_intx_enable(vdev, &err);
>> -    if (err) {
>> -        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
>> +    if (enable_intx) {
>> +        vfio_intx_enable(vdev, &err);
>> +        if (err) {
>> +            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
>> +        }
>>      }
>>  }
>>  
>> @@ -737,7 +727,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
>>          vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
>>      }
>>  
>> -    vfio_msi_disable_common(vdev);
>> +    vfio_msi_disable_common(vdev, true);
>>  
>>      memset(vdev->msix->pending, 0,
>>             BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
>> @@ -748,7 +738,7 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
>>  static void vfio_msi_disable(VFIOPCIDevice *vdev)
>>  {
>>      vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
>> -    vfio_msi_disable_common(vdev);
>> +    vfio_msi_disable_common(vdev, true);
>>  
>>      trace_vfio_msi_disable(vdev->vbasedev.name);
>>  }
> 
> .
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/5] vfio: defer to enable msix in migration resume phase
  2021-09-03 21:56   ` Alex Williamson
@ 2021-09-07  2:12     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  0 siblings, 0 replies; 22+ messages in thread
From: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) @ 2021-09-07  2:12 UTC (permalink / raw)
  To: Alex Williamson; +Cc: mst, qemu-devel, arei.gonglei, huangzhichao, pbonzini



On 2021/9/4 5:56, Alex Williamson wrote:
> On Wed, 25 Aug 2021 15:56:18 +0800
> "Longpeng(Mike)" <longpeng2@huawei.com> wrote:
> 
>> The VF's unmasked MSI-X vectors will be enabled one by one in the
>> migration resume phase; VFIO_DEVICE_SET_IRQS will be called for
>> each vector, which is a bit expensive if the VF has more vectors.
>>
>> We can call VFIO_DEVICE_SET_IRQS once, outside the loop that sets
>> the vector notifiers, to reduce the cost.
>>
>> The test VM has 128 vcpus and 8 VFs (with 65 vectors enabled);
>> we measure the cost of vfio_msix_enable for each one, and we can
>> see 10% of the cost can be reduced.
>>
>>         Origin          Apply this patch
> 
> Original?
> 
>> 1st     8               4
>> 2nd     15              11
>> 3rd     22              18
>> 4th     24              25
>> 5th     36              33
>> 6th     44              40
>> 7th     51              47
>> 8th     58              54
>> Total   258ms           232ms
> 
> If the values here are ms for execution of vfio_msix_enable() per VF,

Yes.

> why are the values increasing per VF?  Do we have 65 vectors per VF or
> do we have 65 vectors total, weighted toward the higher VFs?

We have 65 vectors per VF.

KVM_SET_GSI_ROUTING scans and updates all of the assigned irqfds
unconditionally, so it spends more time when there are more irqfds.

We have 65 irqfds when processing the 1st VF, 130 irqfds when
processing the 2nd VF, 195 irqfds when processing the 3rd VF ... so
the values increase as a result.
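
In other words, when the k-th VF is enabled, each of its 65
KVM_SET_GSI_ROUTING calls walks roughly 65 * (k - 1) irqfds that the
earlier VFs already registered (65, 130, ..., 520 irqfds by the 8th
VF), which matches the roughly linear growth in the table.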

> This doesn't make sense without the data from the last patch in the
> series.
> 
>>
>> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
>> ---
>>  hw/vfio/pci.c | 22 ++++++++++++++++++++++
>>  hw/vfio/pci.h |  1 +
>>  2 files changed, 23 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 7cc43fe..ca37fb7 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -372,6 +372,10 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
>>      int ret = 0, i, argsz;
>>      int32_t *fds;
>>  
>> +    if (!vdev->nr_vectors) {
>> +        return 0;
>> +    }
> 
> How would this occur?  Via the new call below?  But then we'd leave
> vfio_msix_enable() with MSI-X DISABLED???
> 
>> +
>>      argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
>>  
>>      irq_set = g_malloc0(argsz);
>> @@ -495,6 +499,11 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>          }
>>      }
>>  
>> +    if (vdev->defer_add_virq) {
>> +        vdev->nr_vectors = MAX(vdev->nr_vectors, nr + 1);
>> +        goto clear_pending;
>> +    }
> 
> This is a really ugly use of 'goto' to simply jump around code you'd
> like to skip rather than reformat the function with branches to
> conditionalize that code.  Gotos for consolidated error paths, retries,
> hard to break loops are ok, not this.
> 

Got it, thanks.

> 
>> +
>>      /*
>>       * We don't want to have the host allocate all possible MSI vectors
>>       * for a device if they're not in use, so we shutdown and incrementally
>> @@ -524,6 +533,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>          }
>>      }
>>  
>> +clear_pending:
>>      /* Disable PBA emulation when nothing more is pending. */
>>      clear_bit(nr, vdev->msix->pending);
>>      if (find_first_bit(vdev->msix->pending,
>> @@ -608,6 +618,16 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>>      if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
>>                                    vfio_msix_vector_release, NULL)) {
>>          error_report("vfio: msix_set_vector_notifiers failed");
>> +        return;
>> +    }
>> +
>> +    if (!pdev->msix_function_masked && vdev->defer_add_virq) {
>> +        int ret;
>> +        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
>> +        ret = vfio_enable_vectors(vdev, true);
>> +        if (ret) {
>> +            error_report("vfio: failed to enable vectors, %d", ret);
>> +        }
>>      }
>>  
>>      trace_vfio_msix_enable(vdev->vbasedev.name);
>> @@ -2456,7 +2476,9 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>>      if (msi_enabled(pdev)) {
>>          vfio_msi_enable(vdev);
>>      } else if (msix_enabled(pdev)) {
>> +        vdev->defer_add_virq = true;
>>          vfio_msix_enable(vdev);
>> +        vdev->defer_add_virq = false;
> 
> 
> Ick.  Why is this a special case for vfio_msix_enable()?  Wouldn't we
> prefer to always batch vector-use work while we're in the process of
> enabling MSI-X?  
> 

Ok, will do in the next version.

In addition, I'll rename the field to 'defer_kvm_irq_routing' as you
suggested in an earlier thread.

    '''
    > -            vfio_add_kvm_msi_virq(vdev, vector, nr, true);
    > +            if (unlikely(vdev->defer_set_virq)) {

    Likewise this could be "vdev->defer_kvm_irq_routing" and we could apply
    it to all IRQ types.

    > +                vector->need_switch = true;
    > +            } else {
    '''
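
So vfio_pci_load_config() would then simply wrap both interrupt types
with the renamed flag; a sketch of the next version:

    vdev->defer_kvm_irq_routing = true;
    if (msi_enabled(pdev)) {
        vfio_msi_enable(vdev);
    } else if (msix_enabled(pdev)) {
        vfio_msix_enable(vdev);
    }
    vdev->defer_kvm_irq_routing = false;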

>>      }
>>  
>>      return ret;
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index 6477751..4235c83 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -171,6 +171,7 @@ struct VFIOPCIDevice {
>>      bool no_kvm_ioeventfd;
>>      bool no_vfio_ioeventfd;
>>      bool enable_ramfb;
>> +    bool defer_add_virq;
>>      VFIODisplay *dpy;
>>      Notifier irqchip_change_notifier;
>>  };
> 
> .
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/5] kvm: irqchip: support defer to commit the route
  2021-09-03 21:57   ` Alex Williamson
@ 2021-09-07  2:13     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  0 siblings, 0 replies; 22+ messages in thread
From: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) @ 2021-09-07  2:13 UTC (permalink / raw)
  To: Alex Williamson; +Cc: mst, qemu-devel, arei.gonglei, huangzhichao, pbonzini



On 2021/9/4 5:57, Alex Williamson wrote:
> On Wed, 25 Aug 2021 15:56:19 +0800
> "Longpeng(Mike)" <longpeng2@huawei.com> wrote:
> 
>> kvm_irqchip_commit_routes() is relatively expensive, so give the
>> callers a choice whether to commit the route immediately or not
>> when they add an MSI/MSI-X route.
>>
>> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
>> ---
>>  accel/kvm/kvm-all.c    | 10 +++++++---
>>  accel/stubs/kvm-stub.c |  3 ++-
>>  hw/misc/ivshmem.c      |  2 +-
>>  hw/vfio/pci.c          |  2 +-
>>  hw/virtio/virtio-pci.c |  2 +-
>>  include/sysemu/kvm.h   |  4 +++-
>>  target/i386/kvm/kvm.c  |  2 +-
>>  7 files changed, 16 insertions(+), 9 deletions(-)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index 0125c17..1f788a2 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -1950,7 +1950,8 @@ int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
>>      return kvm_set_irq(s, route->kroute.gsi, 1);
>>  }
>>  
>> -int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
>> +int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
>> +                              bool defer_commit)
>>  {
>>      struct kvm_irq_routing_entry kroute = {};
>>      int virq;
>> @@ -1993,7 +1994,9 @@ int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
>>  
>>      kvm_add_routing_entry(s, &kroute);
>>      kvm_arch_add_msi_route_post(&kroute, vector, dev);
>> -    kvm_irqchip_commit_routes(s);
>> +    if (!defer_commit) {
>> +        kvm_irqchip_commit_routes(s);
>> +    }
> 
> 
> Personally I'd rather rename the function to
> kvm_irqchip_add_deferred_msi_route() and have kvm_irqchip_add_msi_route()
> become a wrapper that appends kvm_irqchip_commit_routes() to that.
> Thanks,
> 

Ok, will do in the next version.

> Alex
> 
>>  
>>      return virq;
>>  }
>> @@ -2151,7 +2154,8 @@ int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
>>      abort();
>>  }
>>  
>> -int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
>> +int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
>> +                              bool defer_commit)
>>  {
>>      return -ENOSYS;
>>  }
>> diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
>> index 5b1d00a..d5caaca 100644
>> --- a/accel/stubs/kvm-stub.c
>> +++ b/accel/stubs/kvm-stub.c
>> @@ -81,7 +81,8 @@ int kvm_on_sigbus(int code, void *addr)
>>  }
>>  
>>  #ifndef CONFIG_USER_ONLY
>> -int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
>> +int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
>> +                              bool defer_commit)
>>  {
>>      return -ENOSYS;
>>  }
>> diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
>> index 1ba4a98..98b14cc 100644
>> --- a/hw/misc/ivshmem.c
>> +++ b/hw/misc/ivshmem.c
>> @@ -429,7 +429,7 @@ static void ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector,
>>      IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);
>>      assert(!s->msi_vectors[vector].pdev);
>>  
>> -    ret = kvm_irqchip_add_msi_route(kvm_state, vector, pdev);
>> +    ret = kvm_irqchip_add_msi_route(kvm_state, vector, pdev, false);
>>      if (ret < 0) {
>>          error_setg(errp, "kvm_irqchip_add_msi_route failed");
>>          return;
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index ca37fb7..3ab67d6 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -427,7 +427,7 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
>>          return;
>>      }
>>  
>> -    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev);
>> +    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev, false);
>>      if (virq < 0) {
>>          event_notifier_cleanup(&vector->kvm_interrupt);
>>          return;
>> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
>> index 433060a..7e2d021 100644
>> --- a/hw/virtio/virtio-pci.c
>> +++ b/hw/virtio/virtio-pci.c
>> @@ -684,7 +684,7 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
>>      int ret;
>>  
>>      if (irqfd->users == 0) {
>> -        ret = kvm_irqchip_add_msi_route(kvm_state, vector, &proxy->pci_dev);
>> +        ret = kvm_irqchip_add_msi_route(kvm_state, vector, &proxy->pci_dev, false);
>>          if (ret < 0) {
>>              return ret;
>>          }
>> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
>> index a1ab1ee..1932dc0 100644
>> --- a/include/sysemu/kvm.h
>> +++ b/include/sysemu/kvm.h
>> @@ -473,9 +473,11 @@ void kvm_init_cpu_signals(CPUState *cpu);
>>   *          message.
>>   * @dev:    Owner PCI device to add the route. If @dev is specified
>>   *          as @NULL, an empty MSI message will be inited.
>> + * @defer_commit:   Defer to commit new route to the KVM core.
>>   * @return: virq (>=0) when success, errno (<0) when failed.
>>   */
>> -int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev);
>> +int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev,
>> +                              bool defer_commit);
>>  int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
>>                                   PCIDevice *dev);
>>  void kvm_irqchip_commit_routes(KVMState *s);
>> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
>> index e69abe4..896406b 100644
>> --- a/target/i386/kvm/kvm.c
>> +++ b/target/i386/kvm/kvm.c
>> @@ -4724,7 +4724,7 @@ void kvm_arch_init_irq_routing(KVMState *s)
>>          /* If the ioapic is in QEMU and the lapics are in KVM, reserve
>>             MSI routes for signaling interrupts to the local apics. */
>>          for (i = 0; i < IOAPIC_NUM_PINS; i++) {
>> -            if (kvm_irqchip_add_msi_route(s, 0, NULL) < 0) {
>> +            if (kvm_irqchip_add_msi_route(s, 0, NULL, false) < 0) {
>>                  error_report("Could not enable split IRQ mode.");
>>                  exit(1);
>>              }
> 
> .
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5/5] vfio: defer to commit kvm route in migraiton resume phase
  2021-09-03 21:57   ` Alex Williamson
@ 2021-09-07  2:14     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
  0 siblings, 0 replies; 22+ messages in thread
From: Longpeng (Mike, Cloud Infrastructure Service Product Dept.) @ 2021-09-07  2:14 UTC (permalink / raw)
  To: Alex Williamson; +Cc: mst, qemu-devel, arei.gonglei, huangzhichao, pbonzini



On 2021/9/4 5:57, Alex Williamson wrote:
> On Wed, 25 Aug 2021 15:56:20 +0800
> "Longpeng(Mike)" <longpeng2@huawei.com> wrote:
> 
>> In the migration resume phase, all unmasked MSI-X vectors need to be
>> set up when loading the VF state. However, the setup takes longer
>> if the VF has more unmasked vectors.
>>
>> In our case, the VF has 65 vectors and each one spends at most 0.8ms
>> on the setup operation, so the total cost per VF is about 8-58ms. For a
>> VM that has 8 VFs of this type, the total cost is more than 250ms.
>>
>> vfio_pci_load_config
>>   vfio_msix_enable
>>     msix_set_vector_notifiers
>>       for (vector = 0; vector < dev->msix_entries_nr; vector++) {
>>         vfio_msix_vector_do_use
>>           vfio_add_kvm_msi_virq
>>             kvm_irqchip_commit_routes <-- expensive
>>       }
>>
>> We can reduce the cost by committing only once, outside the loop. The
>> routes are cached in kvm_state; we commit them first and then bind
>> an irqfd for each vector.
>>
>> The test VM has 128 vcpus and 8 VFs (with 65 vectors enabled);
>> we measure the cost of vfio_msix_enable for each one, and
>> we can see 90+% of the cost can be reduced.
>>
>>         Origin          Apply this patch
>>                         and vfio enable optimization
>> 1st     8               2
>> 2nd     15              2
>> 3rd     22              2
>> 4th     24              3
>> 5th     36              2
>> 6th     44              3
>> 7th     51              3
>> 8th     58              4
>> Total   258ms           21ms
> 
> Almost seems like we should have started here rather than with the
> much more subtle improvements from patch 3.
> 
>  
>> The optimization can also be applied to the MSI type.
>>
>> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
>> ---
>>  hw/vfio/pci.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 44 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 3ab67d6..50e7ec7 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -427,12 +427,17 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
>>          return;
>>      }
>>  
>> -    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev, false);
>> +    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev,
>> +                                     vdev->defer_add_virq);
> 
> See comment on previous patch about these bool function args.
> 
>>      if (virq < 0) {
>>          event_notifier_cleanup(&vector->kvm_interrupt);
>>          return;
>>      }
>>  
>> +    if (vdev->defer_add_virq) {
>> +        goto out;
>> +    }
> 
> See comment on previous patch about this goto flow.
> 
>> +
>>      if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
>>                                         NULL, virq) < 0) {
>>          kvm_irqchip_release_virq(kvm_state, virq);
>> @@ -440,6 +445,7 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
>>          return;
>>      }
>>  
>> +out:
>>      vector->virq = virq;
>>  }
>>  
>> @@ -577,6 +583,36 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>>      }
>>  }
>>  
>> +static void vfio_commit_kvm_msi_virq(VFIOPCIDevice *vdev)
>> +{
>> +    int i;
>> +    VFIOMSIVector *vector;
>> +    bool commited = false;
>> +
>> +    for (i = 0; i < vdev->nr_vectors; i++) {
>> +        vector = &vdev->msi_vectors[i];
>> +
>> +        if (vector->virq < 0) {
>> +            continue;
>> +        }
>> +
>> +        /* Commit cached route entries to KVM core first if not yet */
>> +        if (!commited) {
>> +            kvm_irqchip_commit_routes(kvm_state);
>> +            commited = true;
>> +        }
> 
> Why is this in the loop? Shouldn't we just start with:
> 

kvm_irqchip_commit_routes() won't be called if all of the vector->virq are
-1 originally, so I just wanted to preserve that behavior here.

But it seems there's no side effect if we call it unconditionally, so I'll
take your advice in the next version, thanks.

>     if (!vdev->nr_vectors) {
>         return;
>     }
> 
>     kvm_irqchip_commit_routes(kvm_state);
> 
>     for (...
> 
>> +
>> +        if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
>> +                                               &vector->kvm_interrupt,
>> +                                               NULL, vector->virq) < 0) {
>> +            kvm_irqchip_release_virq(kvm_state, vector->virq);
>> +            event_notifier_cleanup(&vector->kvm_interrupt);
>> +            vector->virq = -1;
>> +            return;
>> +        }
> 
> And all the other vectors we've allocated?  Error logging?
> 

Oh, it's a bug, will fix.
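
The next version will commit first, then log and unwind each vector
that fails to bind instead of returning early; a sketch:

    static void vfio_commit_kvm_msi_virq(VFIOPCIDevice *vdev)
    {
        int i;

        if (!vdev->nr_vectors) {
            return;
        }

        kvm_irqchip_commit_routes(kvm_state);

        for (i = 0; i < vdev->nr_vectors; i++) {
            VFIOMSIVector *vector = &vdev->msi_vectors[i];

            if (vector->virq < 0) {
                continue;
            }

            if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
                                                   &vector->kvm_interrupt,
                                                   NULL, vector->virq) < 0) {
                error_report("vfio: failed to bind irqfd for vector %d", i);
                kvm_irqchip_release_virq(kvm_state, vector->virq);
                event_notifier_cleanup(&vector->kvm_interrupt);
                vector->virq = -1;
            }
        }
    }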

>> +    }
>> +}
>> +
>>  static void vfio_msix_enable(VFIOPCIDevice *vdev)
>>  {
>>      PCIDevice *pdev = &vdev->pdev;
>> @@ -624,6 +660,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>>      if (!pdev->msix_function_masked && vdev->defer_add_virq) {
>>          int ret;
>>          vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
>> +        vfio_commit_kvm_msi_virq(vdev);
>>          ret = vfio_enable_vectors(vdev, true);
>>          if (ret) {
>>              error_report("vfio: failed to enable vectors, %d", ret);
>> @@ -664,6 +701,10 @@ retry:
>>          vfio_add_kvm_msi_virq(vdev, vector, i, false);
>>      }
>>  
>> +    if (vdev->defer_add_virq){
>> +        vfio_commit_kvm_msi_virq(vdev);
>> +    }
> 
> Again, why is the load_config path unique? Shouldn't we always batch on
> setup?
> 
>> +
>>      /* Set interrupt type prior to possible interrupts */
>>      vdev->interrupt = VFIO_INT_MSI;
>>  
>> @@ -2473,13 +2514,13 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>>      vfio_pci_write_config(pdev, PCI_COMMAND,
>>                            pci_get_word(pdev->config + PCI_COMMAND), 2);
>>  
>> +    vdev->defer_add_virq = true;
>>      if (msi_enabled(pdev)) {
>>          vfio_msi_enable(vdev);
>>      } else if (msix_enabled(pdev)) {
>> -        vdev->defer_add_virq = true;
>>          vfio_msix_enable(vdev);
>> -        vdev->defer_add_virq = false;
>>      }
>> +    vdev->defer_add_virq = false;
>>  
>>      return ret;
>>  }
> 
> .
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2021-09-07  2:24 UTC | newest]

Thread overview: 22+ messages
2021-08-25  7:56 [PATCH 0/5] optimize the downtime for vfio migration Longpeng(Mike)
2021-08-25  7:56 ` [PATCH 1/5] vfio: use helper to simplfy the failure path in vfio_msi_enable Longpeng(Mike)
2021-09-03 21:55   ` Alex Williamson
2021-09-07  2:11     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
2021-08-25  7:56 ` [PATCH 2/5] msix: simplfy the conditional in msix_set/unset_vector_notifiers Longpeng(Mike)
2021-08-25  9:52   ` Philippe Mathieu-Daudé
2021-08-25  9:56     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
2021-09-03 21:55   ` Alex Williamson
2021-08-25  7:56 ` [PATCH 3/5] vfio: defer to enable msix in migration resume phase Longpeng(Mike)
2021-08-25  9:57   ` Philippe Mathieu-Daudé
2021-08-25 10:06     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
2021-09-03 21:56   ` Alex Williamson
2021-09-07  2:12     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
2021-08-25  7:56 ` [PATCH 4/5] kvm: irqchip: support defer to commit the route Longpeng(Mike)
2021-09-03 21:57   ` Alex Williamson
2021-09-07  2:13     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
2021-08-25  7:56 ` [PATCH 5/5] vfio: defer to commit kvm route in migraiton resume phase Longpeng(Mike)
2021-09-03 21:57   ` Alex Williamson
2021-09-07  2:14     ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
2021-08-25 10:05 ` [PATCH 0/5] optimize the downtime for vfio migration Philippe Mathieu-Daudé
2021-08-25 10:09   ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
2021-09-02  9:43 ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
