All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alex Williamson <alex.williamson@redhat.com>
To: qemu-devel@nongnu.org
Subject: [Qemu-devel] [PULL 4/6] vfio/quirks: ioeventfd quirk acceleration
Date: Tue, 05 Jun 2018 14:06:41 -0600	[thread overview]
Message-ID: <20180605200641.24323.10169.stgit@gimli.home> (raw)
In-Reply-To: <20180605200425.24323.56023.stgit@gimli.home>

The NVIDIA BAR0 quirks virtualize the PCI config space mirrors found
in device MMIO space.  Normally PCI config space is considered a slow
path and further optimization is unnecessary, however NVIDIA uses a
register here to enable the MSI interrupt to re-trigger.  Exiting to
QEMU for this MSI-ACK handling can therefore rate limit our interrupt
handling.  Fortunately the MSI-ACK write is easily detected since the
quirk MemoryRegion otherwise has very few accesses, so simply looking
for consecutive writes with the same data is sufficient, in this case
10 consecutive writes with the same data and size is arbitrarily
chosen.  We configure the KVM ioeventfd with data match, so there's
no risk of triggering for the wrong data or size, but we do risk that
pathological driver behavior might consume all of QEMU's file
descriptors, so we cap ourselves to 10 ioeventfds for this purpose.

In support of the above, generic ioeventfd infrastructure is added
for vfio quirks.  This automatically initializes an ioeventfd list
per quirk, disables and frees ioeventfds on exit, and allows
ioeventfds marked as dynamic to be dropped on device reset.  The
rationale for this latter feature is that useful ioeventfds may
depend on specific driver behavior and since we necessarily place a
cap on our use of ioeventfds, a machine reset is a reasonable point
at which to assume a new driver and re-profile.

Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/vfio/pci-quirks.c |  166 +++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/vfio/pci.c        |    2 +
 hw/vfio/pci.h        |   14 ++++
 hw/vfio/trace-events |    3 +
 4 files changed, 183 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index f0947cbf152f..f7886487744e 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -12,6 +12,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/error-report.h"
+#include "qemu/main-loop.h"
 #include "qemu/range.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
@@ -202,6 +203,7 @@ typedef struct VFIOConfigMirrorQuirk {
     uint32_t offset;
     uint8_t bar;
     MemoryRegion *mem;
+    uint8_t data[];
 } VFIOConfigMirrorQuirk;
 
 static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
@@ -278,12 +280,95 @@ static const MemoryRegionOps vfio_ati_3c3_quirk = {
 static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
 {
     VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
+    QLIST_INIT(&quirk->ioeventfds);
     quirk->mem = g_new0(MemoryRegion, nr_mem);
     quirk->nr_mem = nr_mem;
 
     return quirk;
 }
 
+static void vfio_ioeventfd_exit(VFIOIOEventFD *ioeventfd)
+{
+    QLIST_REMOVE(ioeventfd, next);
+    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
+                              true, ioeventfd->data, &ioeventfd->e);
+    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e), NULL, NULL, NULL);
+    event_notifier_cleanup(&ioeventfd->e);
+    trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
+                              (uint64_t)ioeventfd->addr, ioeventfd->size,
+                              ioeventfd->data);
+    g_free(ioeventfd);
+}
+
+static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
+{
+    VFIOIOEventFD *ioeventfd, *tmp;
+
+    QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
+        if (ioeventfd->dynamic) {
+            vfio_ioeventfd_exit(ioeventfd);
+        }
+    }
+}
+
+static void vfio_ioeventfd_handler(void *opaque)
+{
+    VFIOIOEventFD *ioeventfd = opaque;
+
+    if (event_notifier_test_and_clear(&ioeventfd->e)) {
+        vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
+                          ioeventfd->data, ioeventfd->size);
+        trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
+                                     (uint64_t)ioeventfd->addr, ioeventfd->size,
+                                     ioeventfd->data);
+    }
+}
+
+static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
+                                          MemoryRegion *mr, hwaddr addr,
+                                          unsigned size, uint64_t data,
+                                          VFIORegion *region,
+                                          hwaddr region_addr, bool dynamic)
+{
+    VFIOIOEventFD *ioeventfd;
+
+    if (vdev->no_kvm_ioeventfd) {
+        return NULL;
+    }
+
+    ioeventfd = g_malloc0(sizeof(*ioeventfd));
+
+    if (event_notifier_init(&ioeventfd->e, 0)) {
+        g_free(ioeventfd);
+        return NULL;
+    }
+
+    /*
+     * MemoryRegion and relative offset, plus additional ioeventfd setup
+     * parameters for configuring and later tearing down KVM ioeventfd.
+     */
+    ioeventfd->mr = mr;
+    ioeventfd->addr = addr;
+    ioeventfd->size = size;
+    ioeventfd->data = data;
+    ioeventfd->dynamic = dynamic;
+    /*
+     * VFIORegion and relative offset for implementing the userspace
+     * handler.  data & size fields shared for both uses.
+     */
+    ioeventfd->region = region;
+    ioeventfd->region_addr = region_addr;
+
+    qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
+                        vfio_ioeventfd_handler, NULL, ioeventfd);
+    memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
+                              true, ioeventfd->data, &ioeventfd->e);
+    trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
+                              size, data);
+
+    return ioeventfd;
+}
+
 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
 {
     VFIOQuirk *quirk;
@@ -719,6 +804,18 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
     trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
 }
 
+typedef struct LastDataSet {
+    VFIOQuirk *quirk;
+    hwaddr addr;
+    uint64_t data;
+    unsigned size;
+    int hits;
+    int added;
+} LastDataSet;
+
+#define MAX_DYN_IOEVENTFD 10
+#define HITS_FOR_IOEVENTFD 10
+
 /*
  * Finally, BAR0 itself.  We want to redirect any accesses to either
  * 0x1800 or 0x88000 through the PCI config space access functions.
@@ -729,6 +826,7 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
     VFIOConfigMirrorQuirk *mirror = opaque;
     VFIOPCIDevice *vdev = mirror->vdev;
     PCIDevice *pdev = &vdev->pdev;
+    LastDataSet *last = (LastDataSet *)&mirror->data;
 
     vfio_generic_quirk_mirror_write(opaque, addr, data, size);
 
@@ -743,6 +841,49 @@ static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
                           addr + mirror->offset, data, size);
         trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
     }
+
+    /*
+     * Automatically add an ioeventfd to handle any repeated write with the
+     * same data and size above the standard PCI config space header.  This is
+     * primarily expected to accelerate the MSI-ACK behavior, such as noted
+     * above.  Current hardware/drivers should trigger an ioeventfd at config
+     * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
+     *
+     * The criteria of 10 successive hits is arbitrary but reliably adds the
+     * MSI-ACK region.  Note that as some writes are bypassed via the ioeventfd,
+     * the remaining ones have a greater chance of being seen successively.
+     * To avoid the pathological case of burning up all of QEMU's open file
+     * handles, arbitrarily limit this algorithm from adding no more than 10
+     * ioeventfds, print an error if we would have added an 11th, and then
+     * stop counting.
+     */
+    if (!vdev->no_kvm_ioeventfd &&
+        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
+        if (addr != last->addr || data != last->data || size != last->size) {
+            last->addr = addr;
+            last->data = data;
+            last->size = size;
+            last->hits = 1;
+        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
+            if (last->added < MAX_DYN_IOEVENTFD) {
+                VFIOIOEventFD *ioeventfd;
+                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
+                                        data, &vdev->bars[mirror->bar].region,
+                                        mirror->offset + addr, true);
+                if (ioeventfd) {
+                    VFIOQuirk *quirk = last->quirk;
+
+                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
+                    last->added++;
+                }
+            } else {
+                last->added++;
+                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
+                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
+                            "size %u", vdev->vbasedev.name, addr, data, size);
+            }
+        }
+    }
 }
 
 static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
@@ -751,10 +892,21 @@ static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
     .endianness = DEVICE_LITTLE_ENDIAN,
 };
 
+static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
+{
+    VFIOConfigMirrorQuirk *mirror = quirk->data;
+    LastDataSet *last = (LastDataSet *)&mirror->data;
+
+    last->addr = last->data = last->size = last->hits = last->added = 0;
+
+    vfio_drop_dynamic_eventfds(vdev, quirk);
+}
+
 static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
 {
     VFIOQuirk *quirk;
     VFIOConfigMirrorQuirk *mirror;
+    LastDataSet *last;
 
     if (vdev->no_geforce_quirks ||
         !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
@@ -763,11 +915,14 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
     }
 
     quirk = vfio_quirk_alloc(1);
-    mirror = quirk->data = g_malloc0(sizeof(*mirror));
+    quirk->reset = vfio_nvidia_bar0_quirk_reset;
+    mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
     mirror->mem = quirk->mem;
     mirror->vdev = vdev;
     mirror->offset = 0x88000;
     mirror->bar = nr;
+    last = (LastDataSet *)&mirror->data;
+    last->quirk = quirk;
 
     memory_region_init_io(mirror->mem, OBJECT(vdev),
                           &vfio_nvidia_mirror_quirk, mirror,
@@ -781,11 +936,14 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
     /* The 0x1800 offset mirror only seems to get used by legacy VGA */
     if (vdev->vga) {
         quirk = vfio_quirk_alloc(1);
-        mirror = quirk->data = g_malloc0(sizeof(*mirror));
+        quirk->reset = vfio_nvidia_bar0_quirk_reset;
+        mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
         mirror->mem = quirk->mem;
         mirror->vdev = vdev;
         mirror->offset = 0x1800;
         mirror->bar = nr;
+        last = (LastDataSet *)&mirror->data;
+        last->quirk = quirk;
 
         memory_region_init_io(mirror->mem, OBJECT(vdev),
                               &vfio_nvidia_mirror_quirk, mirror,
@@ -1668,6 +1826,10 @@ void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
     int i;
 
     QLIST_FOREACH(quirk, &bar->quirks, next) {
+        while (!QLIST_EMPTY(&quirk->ioeventfds)) {
+            vfio_ioeventfd_exit(QLIST_FIRST(&quirk->ioeventfds));
+        }
+
         for (i = 0; i < quirk->nr_mem; i++) {
             memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
         }
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 65446fb42845..ba1239551115 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3175,6 +3175,8 @@ static Property vfio_pci_dev_properties[] = {
     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
     DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
                      no_geforce_quirks, false),
+    DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
+                     false),
     DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
     DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
     DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 594a5bd00593..a4ac583fbd6e 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -24,9 +24,22 @@
 
 struct VFIOPCIDevice;
 
+typedef struct VFIOIOEventFD {
+    QLIST_ENTRY(VFIOIOEventFD) next;
+    MemoryRegion *mr;
+    hwaddr addr;
+    unsigned size;
+    uint64_t data;
+    EventNotifier e;
+    VFIORegion *region;
+    hwaddr region_addr;
+    bool dynamic; /* Added runtime, removed on device reset */
+} VFIOIOEventFD;
+
 typedef struct VFIOQuirk {
     QLIST_ENTRY(VFIOQuirk) next;
     void *data;
+    QLIST_HEAD(, VFIOIOEventFD) ioeventfds;
     int nr_mem;
     MemoryRegion *mem;
     void (*reset)(struct VFIOPCIDevice *vdev, struct VFIOQuirk *quirk);
@@ -149,6 +162,7 @@ typedef struct VFIOPCIDevice {
     bool no_kvm_msi;
     bool no_kvm_msix;
     bool no_geforce_quirks;
+    bool no_kvm_ioeventfd;
     VFIODisplay *dpy;
 } VFIOPCIDevice;
 
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 20109cb7581f..f8f97d1ff90c 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -77,6 +77,9 @@ vfio_quirk_ati_bonaire_reset_no_smc(const char *name) "%s"
 vfio_quirk_ati_bonaire_reset_timeout(const char *name) "%s"
 vfio_quirk_ati_bonaire_reset_done(const char *name) "%s"
 vfio_quirk_ati_bonaire_reset(const char *name) "%s"
+vfio_ioeventfd_exit(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
+vfio_ioeventfd_handler(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d] -> 0x%"PRIx64
+vfio_ioeventfd_init(const char *name, uint64_t addr, unsigned size, uint64_t data) "%s+0x%"PRIx64"[%d]:0x%"PRIx64
 vfio_pci_igd_bar4_write(const char *name, uint32_t index, uint32_t data, uint32_t base) "%s [0x%03x] 0x%08x -> 0x%08x"
 vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB"
 vfio_pci_igd_opregion_enabled(const char *name) "%s"

  parent reply	other threads:[~2018-06-05 20:06 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-06-05 20:06 [Qemu-devel] [PULL 0/6] VFIO updates 2018-06-05 Alex Williamson
2018-06-05 20:06 ` [Qemu-devel] [PULL 1/6] vfio: remove DPRINTF() definition from vfio-common.h Alex Williamson
2018-06-05 20:06 ` [Qemu-devel] [PULL 2/6] vfio/quirks: Add common quirk alloc helper Alex Williamson
2018-06-05 20:06 ` [Qemu-devel] [PULL 3/6] vfio/quirks: Add quirk reset callback Alex Williamson
2018-06-05 20:06 ` Alex Williamson [this message]
2018-06-05 20:06 ` [Qemu-devel] [PULL 5/6] vfio/quirks: Enable ioeventfd quirks to be handled by vfio directly Alex Williamson
2018-06-05 20:07 ` [Qemu-devel] [PULL 6/6] vfio/pci: Default display option to "off" Alex Williamson
2018-06-06 10:29 ` [Qemu-devel] [PULL 0/6] VFIO updates 2018-06-05 no-reply
2018-06-07  9:24   ` Fam Zheng
2018-06-07  8:55 ` Peter Maydell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180605200641.24323.10169.stgit@gimli.home \
    --to=alex.williamson@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.