* [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation
@ 2018-01-10 19:01 Alex Williamson
  2018-01-10 19:01 ` [Qemu-devel] [PATCH v2 1/5] vfio/pci: Fixup VFIOMSIXInfo comment Alex Williamson
                   ` (6 more replies)
  0 siblings, 7 replies; 14+ messages in thread
From: Alex Williamson @ 2018-01-10 19:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: Eric Blake, Markus Armbruster, eric.auger, aik

v1: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03350.html

See patch 5/5 for a thorough description.  v2 changes the 'auto'
behavior as we've determined that there's no algorithm which has even
a likely chance of success.  Instead, auto is now a placeholder for
a device/platform lookup for known good combinations (though if I'm
pessimistic, even that might depend on guest and driver versions).
Thanks,

Alex

---

Alex Williamson (5):
      vfio/pci: Fixup VFIOMSIXInfo comment
      vfio/pci: Add base BAR MemoryRegion
      vfio/pci: Emulate BARs
      qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR
      vfio/pci: Allow relocating MSI-X MMIO


 hw/core/qdev-properties.c    |   11 +++
 hw/vfio/pci.c                |  175 ++++++++++++++++++++++++++++++++++++++----
 hw/vfio/pci.h                |    6 +
 hw/vfio/trace-events         |    2 
 include/hw/qdev-properties.h |    4 +
 qapi/common.json             |   26 ++++++
 6 files changed, 206 insertions(+), 18 deletions(-)


* [Qemu-devel] [PATCH v2 1/5] vfio/pci: Fixup VFIOMSIXInfo comment
  2018-01-10 19:01 [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alex Williamson
@ 2018-01-10 19:01 ` Alex Williamson
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 2/5] vfio/pci: Add base BAR MemoryRegion Alex Williamson
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Alex Williamson @ 2018-01-10 19:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: eric.auger, aik

The fields were removed in the referenced commit, but the comment
still mentions them.

Fixes: 2fb9636ebf24 ("vfio-pci: Remove unused fields from VFIOMSIXInfo")
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/vfio/pci.h |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index a8fb3b34222c..3d753222ca4c 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -86,7 +86,7 @@ enum {
     VFIO_INT_MSIX = 3,
 };
 
-/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
+/* Cache of MSI-X setup */
 typedef struct VFIOMSIXInfo {
     uint8_t table_bar;
     uint8_t pba_bar;


* [Qemu-devel] [PATCH v2 2/5] vfio/pci: Add base BAR MemoryRegion
  2018-01-10 19:01 [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alex Williamson
  2018-01-10 19:01 ` [Qemu-devel] [PATCH v2 1/5] vfio/pci: Fixup VFIOMSIXInfo comment Alex Williamson
@ 2018-01-10 19:02 ` Alex Williamson
  2018-02-02  2:19   ` [Qemu-devel] [PATCH v3 " Alex Williamson
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 3/5] vfio/pci: Emulate BARs Alex Williamson
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 14+ messages in thread
From: Alex Williamson @ 2018-01-10 19:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: eric.auger, aik

Add one more layer to our stack of MemoryRegions.  This base region
allows us to register BARs independently of the vfio region, or to
extend the size of BARs which do map to a region.  This will be
useful when we want hypervisor-defined BARs or sections of BARs,
for purposes such as relocating MSI-X emulation.  We therefore call
msix_init() with this new base MemoryRegion, while the quirks,
which only modify regions, still operate on those sub-MemoryRegions.
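
As a rough sketch of the resulting layering (simplified from
vfio_bar_register() in the diff below):

    /* The base BAR is a pure container: no ops of its own, accesses
     * only hit whatever subregions are mapped into it. */
    bar->mr = g_new0(MemoryRegion, 1);
    memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);

    /* The vfio region backing the physical BAR sits at offset 0... */
    memory_region_add_subregion(bar->mr, 0, bar->region.mem);

    /* ...and the PCI core sees only the container, leaving room to map
     * additional space (e.g. relocated MSI-X) above the device region. */
    pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);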

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/vfio/pci.c |   74 ++++++++++++++++++++++++++++++++++++++++++++-------------
 hw/vfio/pci.h |    3 ++
 2 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index c977ee327f94..46f1e7ed9933 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1440,9 +1440,9 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
     vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
                                     sizeof(unsigned long));
     ret = msix_init(&vdev->pdev, vdev->msix->entries,
-                    vdev->bars[vdev->msix->table_bar].region.mem,
+                    vdev->bars[vdev->msix->table_bar].mr,
                     vdev->msix->table_bar, vdev->msix->table_offset,
-                    vdev->bars[vdev->msix->pba_bar].region.mem,
+                    vdev->bars[vdev->msix->pba_bar].mr,
                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
                     &err);
     if (ret < 0) {
@@ -1482,8 +1482,8 @@ static void vfio_teardown_msi(VFIOPCIDevice *vdev)
 
     if (vdev->msix) {
         msix_uninit(&vdev->pdev,
-                    vdev->bars[vdev->msix->table_bar].region.mem,
-                    vdev->bars[vdev->msix->pba_bar].region.mem);
+                    vdev->bars[vdev->msix->table_bar].mr,
+                    vdev->bars[vdev->msix->pba_bar].mr);
         g_free(vdev->msix->pending);
     }
 }
@@ -1500,12 +1500,11 @@ static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
     }
 }
 
-static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr)
+static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
 {
     VFIOBAR *bar = &vdev->bars[nr];
 
     uint32_t pci_bar;
-    uint8_t type;
     int ret;
 
     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
@@ -1524,23 +1523,52 @@ static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr)
     pci_bar = le32_to_cpu(pci_bar);
     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
-    type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
-                                    ~PCI_BASE_ADDRESS_MEM_MASK);
+    bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
+                                         ~PCI_BASE_ADDRESS_MEM_MASK);
+    bar->size = bar->region.size;
+}
 
-    if (vfio_region_mmap(&bar->region)) {
-        error_report("Failed to mmap %s BAR %d. Performance may be slow",
-                     vdev->vbasedev.name, nr);
+static void vfio_bars_prepare(VFIOPCIDevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        vfio_bar_prepare(vdev, i);
     }
+}
 
-    pci_register_bar(&vdev->pdev, nr, type, bar->region.mem);
+static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
+{
+    VFIOBAR *bar = &vdev->bars[nr];
+    char *name;
+
+    if (!bar->size) {
+        return;
+    }
+
+    bar->mr = g_new0(MemoryRegion, 1);
+    name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
+    memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
+    g_free(name);
+
+    if (bar->region.size) {
+        memory_region_add_subregion(bar->mr, 0, bar->region.mem);
+
+        if (vfio_region_mmap(&bar->region)) {
+            error_report("Failed to mmap %s BAR %d. Performance may be slow",
+                         vdev->vbasedev.name, nr);
+        }
+    }
+
+    pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
 }
 
-static void vfio_bars_setup(VFIOPCIDevice *vdev)
+static void vfio_bars_register(VFIOPCIDevice *vdev)
 {
     int i;
 
     for (i = 0; i < PCI_ROM_SLOT; i++) {
-        vfio_bar_setup(vdev, i);
+        vfio_bar_register(vdev, i);
     }
 }
 
@@ -1549,8 +1577,13 @@ static void vfio_bars_exit(VFIOPCIDevice *vdev)
     int i;
 
     for (i = 0; i < PCI_ROM_SLOT; i++) {
+        VFIOBAR *bar = &vdev->bars[i];
+
         vfio_bar_quirk_exit(vdev, i);
-        vfio_region_exit(&vdev->bars[i].region);
+        vfio_region_exit(&bar->region);
+        if (bar->region.size) {
+            memory_region_del_subregion(bar->mr, bar->region.mem);
+        }
     }
 
     if (vdev->vga) {
@@ -1564,8 +1597,14 @@ static void vfio_bars_finalize(VFIOPCIDevice *vdev)
     int i;
 
     for (i = 0; i < PCI_ROM_SLOT; i++) {
+        VFIOBAR *bar = &vdev->bars[i];
+
         vfio_bar_quirk_finalize(vdev, i);
-        vfio_region_finalize(&vdev->bars[i].region);
+        vfio_region_finalize(&bar->region);
+        if (bar->size) {
+            object_unparent(OBJECT(bar->mr));
+            g_free(bar->mr);
+        }
     }
 
     if (vdev->vga) {
@@ -2810,7 +2849,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
         goto error;
     }
 
-    vfio_bars_setup(vdev);
+    vfio_bars_prepare(vdev);
+    vfio_bars_register(vdev);
 
     ret = vfio_add_capabilities(vdev, errp);
     if (ret) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 3d753222ca4c..dcdb1a806769 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -33,6 +33,9 @@ typedef struct VFIOQuirk {
 
 typedef struct VFIOBAR {
     VFIORegion region;
+    MemoryRegion *mr;
+    size_t size;
+    uint8_t type;
     bool ioport;
     bool mem64;
     QLIST_HEAD(, VFIOQuirk) quirks;


* [Qemu-devel] [PATCH v2 3/5] vfio/pci: Emulate BARs
  2018-01-10 19:01 [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alex Williamson
  2018-01-10 19:01 ` [Qemu-devel] [PATCH v2 1/5] vfio/pci: Fixup VFIOMSIXInfo comment Alex Williamson
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 2/5] vfio/pci: Add base BAR MemoryRegion Alex Williamson
@ 2018-01-10 19:02 ` Alex Williamson
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 4/5] qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR Alex Williamson
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Alex Williamson @ 2018-01-10 19:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: eric.auger, aik

The kernel provides emulation of PCI BAR register access similar to
QEMU's, so up until now we've relied on it for things like BAR sizing
and storing the BAR address.  However, if we intend to resize BARs or
add BARs that don't exist on the physical device, we need to switch to
the pure QEMU emulation of the BAR.
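
For context, a sketch of how this mask is consumed on the config read
path (paraphrasing QEMU's vfio_pci_read_config(); a set bit selects
QEMU's emulated value, a clear bit the physical device's):

    uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;

    memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
    emu_bits = le32_to_cpu(emu_bits);

    if (emu_bits) {
        emu_val = pci_default_read_config(pdev, addr, len); /* QEMU's copy */
    }
    if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
        pread(vdev->vbasedev.fd, &phys_val, len,            /* real device */
              vdev->config_offset + addr);
        phys_val = le32_to_cpu(phys_val);
    }
    val = (emu_bits & emu_val) | (~emu_bits & phys_val);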

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/vfio/pci.c |    2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 46f1e7ed9933..20252ea7aeb7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2773,6 +2773,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
     /* QEMU can choose to expose the ROM or not */
     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
+    /* QEMU can also add or extend BARs */
+    memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
 
     /*
      * The PCI spec reserves vendor ID 0xffff as an invalid value.  The


* [Qemu-devel] [PATCH v2 4/5] qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR
  2018-01-10 19:01 [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alex Williamson
                   ` (2 preceding siblings ...)
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 3/5] vfio/pci: Emulate BARs Alex Williamson
@ 2018-01-10 19:02 ` Alex Williamson
  2018-01-24 15:49   ` Alex Williamson
  2018-01-31 13:24   ` Markus Armbruster
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 5/5] vfio/pci: Allow relocating MSI-X MMIO Alex Williamson
                   ` (2 subsequent siblings)
  6 siblings, 2 replies; 14+ messages in thread
From: Alex Williamson @ 2018-01-10 19:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: Markus Armbruster, Eric Blake, eric.auger, aik

Add a qdev property type which allows the user to specify a PCI BAR
number, including 'off' and 'auto' selections.
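
For reference, patch 5/5 consumes the new property like this (other
properties omitted):

    static Property vfio_pci_dev_properties[] = {
        DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice,
                                    msix_relo, OFF_AUTOPCIBAR_OFF),
        DEFINE_PROP_END_OF_LIST(),
    };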

Cc: Markus Armbruster <armbru@redhat.com>
Cc: Eric Blake <eblake@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/core/qdev-properties.c    |   11 +++++++++++
 include/hw/qdev-properties.h |    4 ++++
 qapi/common.json             |   26 ++++++++++++++++++++++++++
 3 files changed, 41 insertions(+)

diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c
index 1dc80fcea2af..e33184e5a342 100644
--- a/hw/core/qdev-properties.c
+++ b/hw/core/qdev-properties.c
@@ -1256,3 +1256,14 @@ const PropertyInfo qdev_prop_link = {
     .name = "link",
     .create = create_link_property,
 };
+
+/* --- OffAutoPCIBAR off/auto/bar0/bar1/bar2/bar3/bar4/bar5 --- */
+
+const PropertyInfo qdev_prop_off_auto_pcibar = {
+    .name = "OffAutoPCIBAR",
+    .description = "off/auto/bar0/bar1/bar2/bar3/bar4/bar5",
+    .enum_table = &OffAutoPCIBAR_lookup,
+    .get = get_enum,
+    .set = set_enum,
+    .set_default_value = set_default_value_enum,
+};
diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h
index 60b42ac561af..e2643f5126c4 100644
--- a/include/hw/qdev-properties.h
+++ b/include/hw/qdev-properties.h
@@ -33,6 +33,7 @@ extern const PropertyInfo qdev_prop_blocksize;
 extern const PropertyInfo qdev_prop_pci_host_devaddr;
 extern const PropertyInfo qdev_prop_arraylen;
 extern const PropertyInfo qdev_prop_link;
+extern const PropertyInfo qdev_prop_off_auto_pcibar;
 
 #define DEFINE_PROP(_name, _state, _field, _prop, _type) { \
         .name      = (_name),                                    \
@@ -213,6 +214,9 @@ extern const PropertyInfo qdev_prop_link;
     DEFINE_PROP(_n, _s, _f, qdev_prop_pci_host_devaddr, PCIHostDeviceAddress)
 #define DEFINE_PROP_MEMORY_REGION(_n, _s, _f)             \
     DEFINE_PROP(_n, _s, _f, qdev_prop_ptr, MemoryRegion *)
+#define DEFINE_PROP_OFF_AUTO_PCIBAR(_n, _s, _f, _d) \
+    DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_off_auto_pcibar, \
+                        OffAutoPCIBAR)
 
 #define DEFINE_PROP_END_OF_LIST()               \
     {}
diff --git a/qapi/common.json b/qapi/common.json
index 6eb01821ef59..d9b14dd429f3 100644
--- a/qapi/common.json
+++ b/qapi/common.json
@@ -100,3 +100,29 @@
 { 'alternate': 'StrOrNull',
   'data': { 's': 'str',
             'n': 'null' } }
+
+##
+# @OffAutoPCIBAR:
+#
+# An enumeration of options for specifying a PCI BAR
+#
+# @off: The specified feature is disabled
+#
+# @auto: The PCI BAR for the feature is automatically selected
+#
+# @bar0: PCI BAR0 is used for the feature
+#
+# @bar1: PCI BAR1 is used for the feature
+#
+# @bar2: PCI BAR2 is used for the feature
+#
+# @bar3: PCI BAR3 is used for the feature
+#
+# @bar4: PCI BAR4 is used for the feature
+#
+# @bar5: PCI BAR5 is used for the feature
+#
+# Since: 2.12
+##
+{ 'enum': 'OffAutoPCIBAR',
+  'data': [ 'off', 'auto', 'bar0', 'bar1', 'bar2', 'bar3', 'bar4', 'bar5' ] }


* [Qemu-devel] [PATCH v2 5/5] vfio/pci: Allow relocating MSI-X MMIO
  2018-01-10 19:01 [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alex Williamson
                   ` (3 preceding siblings ...)
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 4/5] qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR Alex Williamson
@ 2018-01-10 19:02 ` Alex Williamson
  2018-01-19  9:50   ` Auger Eric
  2018-01-15  3:39 ` [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alexey Kardashevskiy
  2018-01-19 10:26 ` Auger Eric
  6 siblings, 1 reply; 14+ messages in thread
From: Alex Williamson @ 2018-01-10 19:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: eric.auger, aik

Recently proposed vfio-pci kernel changes (v4.16) remove the
restriction preventing userspace from mmap'ing PCI BARs in areas
overlapping the MSI-X vector table.  This change is primarily intended
to benefit host platforms which make use of system page sizes larger
than the PCI spec recommendation for alignment of MSI-X data
structures (i.e. not x86_64).  In the case of POWER systems, the SPAPR
spec requires the VM to program MSI-X using hypercalls, rendering the
MSI-X vector table unused in the VM view of the device.  However,
ARM64 platforms also support 64KB pages and rely on QEMU emulation of
MSI-X.  Regardless of the kernel driver allowing mmaps overlapping
the MSI-X vector table, emulation of the MSI-X vector table also
prevents direct mapping of device MMIO spaces overlapping this page.
Thanks to the fact that PCI devices have a standard self-discovery
mechanism, we can try to resolve this by relocating the MSI-X data
structures, either by creating a new PCI BAR or extending an existing
BAR and updating the MSI-X capability for the new location.  There's
even a very slim chance that this could benefit devices which do not
adhere to the PCI spec alignment guidelines on x86_64 systems.

This new x-msix-relocation option accepts the following choices:

  off: Disable MSI-X relocation, use native device config (default)
  auto: Use a known good combination for the platform/device (none yet)
  bar0..bar5: Specify the target BAR for MSI-X data structures

If compatible, the target BAR will either be created or extended and
the new portion will be used for MSI-X emulation.
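
For example (hypothetical host device address), relocating MSI-X to
BAR 5 from the QEMU command line:

    -device vfio-pci,host=02:00.0,x-msix-relocation=bar5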

The first obvious user question with this option is how to determine
whether a given platform and device might benefit from it.
In most cases, the answer is that it won't, especially on x86_64.
Devices often dedicate an entire BAR to MSI-X and therefore no
performance sensitive registers overlap the MSI-X area.  Take for
example:

# lspci -vvvs 0a:00.0
0a:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network Connection
	...
	Region 0: Memory at db680000 (32-bit, non-prefetchable) [size=512K]
	Region 3: Memory at db7f8000 (32-bit, non-prefetchable) [size=16K]
	...
	Capabilities: [70] MSI-X: Enable+ Count=10 Masked-
		Vector table: BAR=3 offset=00000000
		PBA: BAR=3 offset=00002000

This device uses the 16K bar3 for MSI-X with the vector table at
offset zero and the pending bits array at offset 8K, fully honoring
the PCI spec alignment guidance.  The data sheet specifically refers
to this as an MSI-X BAR.  This device would not see a benefit from
MSI-X relocation regardless of the platform, regardless of the page
size.

However, here's another example:

# lspci -vvvs 02:00.0
02:00.0 Serial Attached SCSI controller: xxxxxxxx
	...
	Region 0: I/O ports at c000 [size=256]
	Region 1: Memory at ef640000 (64-bit, non-prefetchable) [size=64K]
	Region 3: Memory at ef600000 (64-bit, non-prefetchable) [size=256K]
	...
	Capabilities: [c0] MSI-X: Enable+ Count=16 Masked-
		Vector table: BAR=1 offset=0000e000
		PBA: BAR=1 offset=0000f000

Here the MSI-X data structures are placed on separate 4K pages at the
end of a 64KB BAR.  If our host page size is 4K, we're likely fine,
but at 64KB page size, MSI-X emulation at that location prevents the
entire BAR from being directly mapped into the VM address space.
Overlapping performance sensitive registers then starts to be a very
likely scenario on such a platform.  At this point, the user could
enable tracing on vfio_region_read and vfio_region_write to determine
more conclusively if device accesses are being trapped through QEMU.
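
For example, assuming a QEMU build with tracing enabled, something
like the following logs each trapped access:

    qemu-system-aarch64 ... \
        -trace enable=vfio_region_read -trace enable=vfio_region_write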

Upon finding a device and platform in need of MSI-X relocation, the
next problem is how to choose a target PCI BAR to host the MSI-X data
structures.  A few key rules to keep in mind for this selection
include:

 * There are only 6 BAR slots, bar0..bar5
 * 64-bit BARs occupy two BAR slots; 'lspci -vvv' lists the first slot
 * PCI BARs are always a power of 2 in size, extending == doubling
 * The maximum size of a 32-bit BAR is 2GB
 * MSI-X data structures must reside in an MMIO BAR

Using these rules, we can evaluate each BAR of the second example
device above as follows:

 bar0: I/O port BAR, incompatible with MSI-X tables
 bar1: BAR could be extended, incurring another 64KB of MMIO
 bar2: Unavailable, bar1 is 64-bit, this register is used by bar1
 bar3: BAR could be extended, incurring another 256KB of MMIO
 bar4: Unavailable, bar3 is 64-bit, this register is used by bar3
 bar5: Available, empty BAR, minimum additional MMIO
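
To make the sizing concrete, here is this patch's arithmetic applied
to the example device above (Count=16) on a 64KB-page host:

    msix_sz = 16 * PCI_MSIX_ENTRY_SIZE                /* table: 256 bytes     */
            + QEMU_ALIGN_UP(16, 64) / 8               /* PBA:     8 bytes     */
    msix_sz = REAL_HOST_PAGE_ALIGN(264)               /* -> 64KB host page    */
    msix_sz = pow2ceil(64KB)                          /* already a power of 2 */

    /* Extending the existing 64KB bar1: */
    size = MAX(64KB * 2, 64KB * 2) = 128KB, table_offset = 128KB / 2 = 64KB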

A secondary optimization we might wish to make in relocating MSI-X
is to minimize the additional MMIO required for the device; therefore
we might test the available choices in order of preference as bar5,
bar1, and finally bar3.  The original proposal for this feature
included an 'auto' option which would choose bar5 in this case, but
various drivers have been found that make assumptions about the
properties of the "first" BAR or the size of BARs such that there
appears to be no foolproof automatic selection available, requiring
known good combinations to be sourced from users.  This patch is
pre-enabled for an 'auto' selection making use of a validated lookup
table, but no entries are yet identified.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/vfio/pci.c        |  101 ++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio/pci.h        |    1 
 hw/vfio/trace-events |    2 +
 3 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 20252ea7aeb7..7171ba18213c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1352,6 +1352,100 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
     }
 }
 
+static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
+{
+    int target_bar = -1;
+    size_t msix_sz;
+
+    if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
+        return;
+    }
+
+    /* The actual minimum size of MSI-X structures */
+    msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
+              (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
+    /* Round up to host pages, we don't want to share a page */
+    msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
+    /* PCI BARs must be a power of 2 */
+    msix_sz = pow2ceil(msix_sz);
+
+    if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
+        /*
+         * TODO: Lookup table for known devices.
+         *
+         * Logically we might use an algorithm here to select the BAR adding
+         * the least additional MMIO space, but we cannot programatically
+         * predict the driver dependency on BAR ordering or sizing, therefore
+         * 'auto' becomes a lookup for combinations reported to work.
+         */
+        if (target_bar < 0) {
+            error_setg_errno(errp, EINVAL, "No automatic MSI-X relocation "
+                             "available for device %04x:%04x",
+                             vdev->vendor_id, vdev->device_id);
+            return;
+        }
+    } else {
+        target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
+    }
+
+    /* I/O port BARs cannot host MSI-X structures */
+    if (vdev->bars[target_bar].ioport) {
+        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
+                         "I/O port BAR", target_bar);
+        return;
+    }
+
+    /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
+    if (!vdev->bars[target_bar].size &&
+         target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
+        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
+                         "consumed by 64-bit BAR %d", target_bar,
+                         target_bar - 1);
+        return;
+    }
+
+    /* 2GB max size for 32-bit BARs */
+    if (vdev->bars[target_bar].size > (1 * 1024 * 1024 * 1024) &&
+        !vdev->bars[target_bar].mem64) {
+        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
+                         "no space to extend 32-bit BAR", target_bar);
+        return;
+    }
+
+    /*
+     * If adding a new BAR, test if we can make it 64bit.  We make it
+     * prefetchable since QEMU MSI-X emulation has no read side effects
+     * and doing so makes mapping more flexible.
+     */
+    if (!vdev->bars[target_bar].size) {
+        if (target_bar < (PCI_ROM_SLOT - 1) &&
+            !vdev->bars[target_bar + 1].size) {
+            vdev->bars[target_bar].mem64 = true;
+            vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
+        }
+        vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
+        vdev->bars[target_bar].size = msix_sz;
+        vdev->msix->table_offset = 0;
+    } else {
+        vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
+                                          msix_sz * 2);
+        /*
+         * Due to above size calc, MSI-X always starts halfway into the BAR,
+         * which will always be a separate host page.
+         */
+        vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
+    }
+
+    vdev->msix->table_bar = target_bar;
+    vdev->msix->pba_bar = target_bar;
+    /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
+    vdev->msix->pba_offset = vdev->msix->table_offset +
+                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
+
+    trace_vfio_msix_relo(vdev->vbasedev.name,
+                         vdev->msix->table_bar, vdev->msix->table_offset);
+}
+
 /*
  * We don't have any control over how pci_add_capability() inserts
  * capabilities into the chain.  In order to setup MSI-X we need a
@@ -1430,6 +1524,8 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
     vdev->msix = msix;
 
     vfio_pci_fixup_msix_region(vdev);
+
+    vfio_pci_relocate_msix(vdev, errp);
 }
 
 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
@@ -2845,13 +2941,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
     vfio_pci_size_rom(vdev);
 
+    vfio_bars_prepare(vdev);
+
     vfio_msix_early_setup(vdev, &err);
     if (err) {
         error_propagate(errp, err);
         goto error;
     }
 
-    vfio_bars_prepare(vdev);
     vfio_bars_register(vdev);
 
     ret = vfio_add_capabilities(vdev, errp);
@@ -3041,6 +3138,8 @@ static Property vfio_pci_dev_properties[] = {
     DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
                                    nv_gpudirect_clique,
                                    qdev_prop_nv_gpudirect_clique, uint8_t),
+    DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
+                                OFF_AUTOPCIBAR_OFF),
     /*
      * TODO - support passed fds... is this necessary?
      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index dcdb1a806769..588381f201b4 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -135,6 +135,7 @@ typedef struct VFIOPCIDevice {
                                 (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT)
     int32_t bootindex;
     uint32_t igd_gms;
+    OffAutoPCIBAR msix_relo;
     uint8_t pm_cap;
     uint8_t nv_gpudirect_clique;
     bool pci_aer;
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index fae096c0724f..437ccdd29053 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -16,6 +16,8 @@ vfio_msix_pba_disable(const char *name) " (%s)"
 vfio_msix_pba_enable(const char *name) " (%s)"
 vfio_msix_disable(const char *name) " (%s)"
 vfio_msix_fixup(const char *name, int bar, uint64_t start, uint64_t end) " (%s) MSI-X region %d mmap fixup [0x%"PRIx64" - 0x%"PRIx64"]"
+vfio_msix_relo_cost(const char *name, int bar, uint64_t cost) " (%s) BAR %d cost 0x%"PRIx64""
+vfio_msix_relo(const char *name, int bar, uint64_t offset) " (%s) BAR %d offset 0x%"PRIx64""
 vfio_msi_enable(const char *name, int nr_vectors) " (%s) Enabled %d MSI vectors"
 vfio_msi_disable(const char *name) " (%s)"
 vfio_pci_load_rom(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s ROM:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"


* Re: [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation
  2018-01-10 19:01 [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alex Williamson
                   ` (4 preceding siblings ...)
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 5/5] vfio/pci: Allow relocating MSI-X MMIO Alex Williamson
@ 2018-01-15  3:39 ` Alexey Kardashevskiy
  2018-01-19 10:26 ` Auger Eric
  6 siblings, 0 replies; 14+ messages in thread
From: Alexey Kardashevskiy @ 2018-01-15  3:39 UTC (permalink / raw)
  To: Alex Williamson, qemu-devel; +Cc: Eric Blake, Markus Armbruster, eric.auger

On 11/01/18 06:01, Alex Williamson wrote:
> v1: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03350.html
> 
> See patch 5/5 for a thorough description.  v2 changes the 'auto'
> behavior as we've determined that there's no algorithm which has even
> a likely chance of success.  Instead, auto is now a placeholder for
> a device/platform lookup for known good combinations (though if I'm
> pessimistic, even that might depend on guest and driver versions).
> Thanks,



Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>




> 
> Alex
> 
> ---
> 
> Alex Williamson (5):
>       vfio/pci: Fixup VFIOMSIXInfo comment
>       vfio/pci: Add base BAR MemoryRegion
>       vfio/pci: Emulate BARs
>       qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR
>       vfio/pci: Allow relocating MSI-X MMIO
> 
> 
>  hw/core/qdev-properties.c    |   11 +++
>  hw/vfio/pci.c                |  175 ++++++++++++++++++++++++++++++++++++++----
>  hw/vfio/pci.h                |    6 +
>  hw/vfio/trace-events         |    2 
>  include/hw/qdev-properties.h |    4 +
>  qapi/common.json             |   26 ++++++
>  6 files changed, 206 insertions(+), 18 deletions(-)
> 


-- 
Alexey


* Re: [Qemu-devel] [PATCH v2 5/5] vfio/pci: Allow relocating MSI-X MMIO
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 5/5] vfio/pci: Allow relocating MSI-X MMIO Alex Williamson
@ 2018-01-19  9:50   ` Auger Eric
  2018-01-19 15:23     ` Alex Williamson
  0 siblings, 1 reply; 14+ messages in thread
From: Auger Eric @ 2018-01-19  9:50 UTC (permalink / raw)
  To: Alex Williamson, qemu-devel; +Cc: aik

Hi Alex,

On 10/01/18 20:02, Alex Williamson wrote:
> Recently proposed vfio-pci kernel changes (v4.16) remove the
> restriction preventing userspace from mmap'ing PCI BARs in areas
> overlapping the MSI-X vector table.  This change is primarily intended
> to benefit host platforms which make use of system page sizes larger
> than the PCI spec recommendation for alignment of MSI-X data
> structures (ie. not x86_64).  In the case of POWER systems, the SPAPR
> spec requires the VM to program MSI-X using hypercalls, rendering the
> MSI-X vector table unused in the VM view of the device.  However,
> ARM64 platforms also support 64KB pages and rely on QEMU emulation of
> MSI-X.  Regardless of the kernel driver allowing mmaps overlapping
> the MSI-X vector table, emulation of the MSI-X vector table also
> prevents direct mapping of device MMIO spaces overlapping this page.
> Thanks to the fact that PCI devices have a standard self discovery
> mechanism, we can try to resolve this by relocating the MSI-X data
> structures, either by creating a new PCI BAR or extending an existing
> BAR and updating the MSI-X capability for the new location.  There's
> even a very slim chance that this could benefit devices which do not
> adhere to the PCI spec alignment guidelines on x86_64 systems.
> 
> This new x-msix-relocation option accepts the following choices:
> 
>   off: Disable MSI-X relocation, use native device config (default)
>   auto: Use a known good combination for the platform/device (none yet)
>   bar0..bar5: Specify the target BAR for MSI-X data structures
> 
> If compatible, the target BAR will either be created or extended and
> the new portion will be used for MSI-X emulation.
> 
> The first obvious user question with this option is how to determine
> whether a given platform and device might benefit from this option.
> In most cases, the answer is that it won't, especially on x86_64.
> Devices often dedicate an entire BAR to MSI-X and therefore no
> performance sensitive registers overlap the MSI-X area.  Take for
> example:
> 
> # lspci -vvvs 0a:00.0
> 0a:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network Connection
> 	...
> 	Region 0: Memory at db680000 (32-bit, non-prefetchable) [size=512K]
> 	Region 3: Memory at db7f8000 (32-bit, non-prefetchable) [size=16K]
> 	...
> 	Capabilities: [70] MSI-X: Enable+ Count=10 Masked-
> 		Vector table: BAR=3 offset=00000000
> 		PBA: BAR=3 offset=00002000
> 
> This device uses the 16K bar3 for MSI-X with the vector table at
> offset zero and the pending bits arrary at offset 8K, fully honoring
array
> the PCI spec alignment guidance.  The data sheet specifically refers
> to this as an MSI-X BAR.  This device would not see a benefit from
> MSI-X relocation regardless of the platform, regardless of the page
> size.
> 
> However, here's another example:
> 
> # lspci -vvvs 02:00.0
> 02:00.0 Serial Attached SCSI controller: xxxxxxxx
> 	...
> 	Region 0: I/O ports at c000 [size=256]
> 	Region 1: Memory at ef640000 (64-bit, non-prefetchable) [size=64K]
> 	Region 3: Memory at ef600000 (64-bit, non-prefetchable) [size=256K]
> 	...
> 	Capabilities: [c0] MSI-X: Enable+ Count=16 Masked-
> 		Vector table: BAR=1 offset=0000e000
> 		PBA: BAR=1 offset=0000f000
> 
> Here the MSI-X data structures are placed on separate 4K pages at the
> end of a 64KB BAR.  If our host page size is 4K, we're likely fine,
> but at 64KB page size, MSI-X emulation at that location prevents the
> entire BAR from being directly mapped into the VM address space.
> Overlapping performance sensitive registers then starts to be a very
> likely scenario on such a platform.  At this point, the user could
> enable tracing on vfio_region_read and vfio_region_write to determine
> more conclusively if device accesses are being trapped through QEMU.
> 
> Upon finding a device and platform in need of MSI-X relocation, the
> next problem is how to choose target PCI BAR to host the MSI-X data
> structures.  A few key rules to keep in mind for this selection
> include:
> 
>  * There are only 6 BAR slots, bar0..bar5
>  * 64-bit BARs occupy two BAR slots, 'lspci -vvv' lists the first slot
>  * PCI BARs are always a power of 2 in size, extending == doubling
>  * The maximum size of a 32-bit BAR is 2GB
>  * MSI-X data structures must reside in an MMIO BAR
> 
> Using these rules, we can evaluate each BAR of the second example
> device above as follows:
> 
>  bar0: I/O port BAR, incompatible with MSI-X tables
>  bar1: BAR could be extended, incurring another 64KB of MMIO
>  bar2: Unavailable, bar1 is 64-bit, this register is used by bar1
>  bar3: BAR could be extended, incurring another 256KB of MMIO
>  bar4: Unavailable, bar3 is 64bit, this register is used by bar3
>  bar5: Available, empty BAR, minimum additional MMIO
> 
> A secondary optimization we might wish to make in relocating MSI-X
> is to minimize the additional MMIO required for the device, therefore
> we might test the available choices in order of preference as bar5,
> bar1, and finally bar3.  The original proposal for this feature
> included an 'auto' option which would choose bar5 in this case, but
> various drivers have been found that make assumptions about the
> properties of the "first" BAR or the size of BARs such that there
> appears to be no foolproof automatic selection available, requiring
> known good combinations to be sourced from users.  This patch is
> pre-enabled for an 'auto' selection making use of a validated lookup
> table, but no entries are yet identified.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
>  hw/vfio/pci.c        |  101 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/vfio/pci.h        |    1 
>  hw/vfio/trace-events |    2 +
>  3 files changed, 103 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 20252ea7aeb7..7171ba18213c 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1352,6 +1352,100 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
>      }
>  }
>  
> +static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
> +{
> +    int target_bar = -1;
> +    size_t msix_sz;
> +
> +    if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
> +        return;
> +    }
> +
> +    /* The actual minimum size of MSI-X structures */
> +    msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
> +              (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
> +    /* Round up to host pages, we don't want to share a page */
> +    msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
> +    /* PCI BARs must be a power of 2 */
> +    msix_sz = pow2ceil(msix_sz);
> +
> +    if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
> +        /*
> +         * TODO: Lookup table for known devices.
> +         *
> +         * Logically we might use an algorithm here to select the BAR adding
> +         * the least additional MMIO space, but we cannot programatically
> +         * predict the driver dependency on BAR ordering or sizing, therefore
> +         * 'auto' becomes a lookup for combinations reported to work.
> +         */
> +        if (target_bar < 0) {
> +            error_setg_errno(errp, EINVAL, "No automatic MSI-X relocation "
> +                             "available for device %04x:%04x",
> +                             vdev->vendor_id, vdev->device_id);
don't you want error_setg here and below?
> +            return;
> +        }
> +    } else {
> +        target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
> +    }
> +
> +    /* I/O port BARs cannot host MSI-X structures */
> +    if (vdev->bars[target_bar].ioport) {
> +        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
> +                         "I/O port BAR", target_bar);

> +        return;
> +    }
> +
> +    /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
> +    if (!vdev->bars[target_bar].size &&
> +         target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
> +        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
> +                         "consumed by 64-bit BAR %d", target_bar,
> +                         target_bar - 1);
> +        return;
> +    }
> +
> +    /* 2GB max size for 32-bit BARs */
> +    if (vdev->bars[target_bar].size > (1 * 1024 * 1024 * 1024) &&
nit: the comment versus the check is a bit misleading. If I understand
correctly, the x2 size would be > 2GB.
> +        !vdev->bars[target_bar].mem64) {
> +        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
> +                         "no space to extend 32-bit BAR", target_bar);
> +        return;
> +    }
> +
> +    /*
> +     * If adding a new BAR, test if we can make it 64bit.  We make it
> +     * prefetchable since QEMU MSI-X emulation has no read side effects
> +     * and doing so makes mapping more flexible.
> +     */
> +    if (!vdev->bars[target_bar].size) {
> +        if (target_bar < (PCI_ROM_SLOT - 1) &&
> +            !vdev->bars[target_bar + 1].size) {
> +            vdev->bars[target_bar].mem64 = true;
> +            vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
> +        }
> +        vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
> +        vdev->bars[target_bar].size = msix_sz;
> +        vdev->msix->table_offset = 0;
> +    } else {
> +        vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
> +                                          msix_sz * 2);
> +        /*
> +         * Due to above size calc, MSI-X always starts halfway into the BAR,
> +         * which will always be a separate host page.

nit: the spec gives this recommendation.

"If a dedicated Base Address register is not feasible, it is recommended
that a function isolate the MSI-X structures from the non-MSI-X
structures with aligned 8 KB ranges rather than
the mandatory aligned 4 KB ranges."

In some corner circumstances, with 4kB - which is not our main use case
- this may not be enforced here.

Otherwise looks good to me.

Thanks

Eric
> +         */
> +        vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
> +    }
> +
> +    vdev->msix->table_bar = target_bar;
> +    vdev->msix->pba_bar = target_bar;
> +    /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
> +    vdev->msix->pba_offset = vdev->msix->table_offset +
> +                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
> +
> +    trace_vfio_msix_relo(vdev->vbasedev.name,
> +                         vdev->msix->table_bar, vdev->msix->table_offset);
> +}
> +
>  /*
>   * We don't have any control over how pci_add_capability() inserts
>   * capabilities into the chain.  In order to setup MSI-X we need a
> @@ -1430,6 +1524,8 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
>      vdev->msix = msix;
>  
>      vfio_pci_fixup_msix_region(vdev);
> +
> +    vfio_pci_relocate_msix(vdev, errp);
>  }
>  
>  static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> @@ -2845,13 +2941,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>  
>      vfio_pci_size_rom(vdev);
>  
> +    vfio_bars_prepare(vdev);
> +
>      vfio_msix_early_setup(vdev, &err);
>      if (err) {
>          error_propagate(errp, err);
>          goto error;
>      }
>  
> -    vfio_bars_prepare(vdev);
>      vfio_bars_register(vdev);
>  
>      ret = vfio_add_capabilities(vdev, errp);
> @@ -3041,6 +3138,8 @@ static Property vfio_pci_dev_properties[] = {
>      DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
>                                     nv_gpudirect_clique,
>                                     qdev_prop_nv_gpudirect_clique, uint8_t),
> +    DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
> +                                OFF_AUTOPCIBAR_OFF),
>      /*
>       * TODO - support passed fds... is this necessary?
>       * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index dcdb1a806769..588381f201b4 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -135,6 +135,7 @@ typedef struct VFIOPCIDevice {
>                                  (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT)
>      int32_t bootindex;
>      uint32_t igd_gms;
> +    OffAutoPCIBAR msix_relo;
>      uint8_t pm_cap;
>      uint8_t nv_gpudirect_clique;
>      bool pci_aer;
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index fae096c0724f..437ccdd29053 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -16,6 +16,8 @@ vfio_msix_pba_disable(const char *name) " (%s)"
>  vfio_msix_pba_enable(const char *name) " (%s)"
>  vfio_msix_disable(const char *name) " (%s)"
>  vfio_msix_fixup(const char *name, int bar, uint64_t start, uint64_t end) " (%s) MSI-X region %d mmap fixup [0x%"PRIx64" - 0x%"PRIx64"]"
> +vfio_msix_relo_cost(const char *name, int bar, uint64_t cost) " (%s) BAR %d cost 0x%"PRIx64""
> +vfio_msix_relo(const char *name, int bar, uint64_t offset) " (%s) BAR %d offset 0x%"PRIx64""
>  vfio_msi_enable(const char *name, int nr_vectors) " (%s) Enabled %d MSI vectors"
>  vfio_msi_disable(const char *name) " (%s)"
>  vfio_pci_load_rom(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s ROM:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
> 


* Re: [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation
  2018-01-10 19:01 [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alex Williamson
                   ` (5 preceding siblings ...)
  2018-01-15  3:39 ` [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alexey Kardashevskiy
@ 2018-01-19 10:26 ` Auger Eric
  6 siblings, 0 replies; 14+ messages in thread
From: Auger Eric @ 2018-01-19 10:26 UTC (permalink / raw)
  To: Alex Williamson, qemu-devel; +Cc: Eric Blake, Markus Armbruster, aik

Hi Alex,
On 10/01/18 20:01, Alex Williamson wrote:
> v1: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03350.html
> 
> See patch 5/5 for a thorough description.  v2 changes the 'auto'
> behavior as we've determined that there's no algorithm which has even
> a likely chance of success.  Instead, auto is now a placeholder for
> a device/platform lookup for known good combinations (though if I'm
> pessimistic, even that might depend on guest and driver versions).
> Thanks,
> 
> Alex
> 
> ---
> 
> Alex Williamson (5):
>       vfio/pci: Fixup VFIOMSIXInfo comment
>       vfio/pci: Add base BAR MemoryRegion
>       vfio/pci: Emulate BARs
>       qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR
>       vfio/pci: Allow relocating MSI-X MMIO
> 
> 
>  hw/core/qdev-properties.c    |   11 +++
>  hw/vfio/pci.c                |  175 ++++++++++++++++++++++++++++++++++++++----
>  hw/vfio/pci.h                |    6 +
>  hw/vfio/trace-events         |    2 
>  include/hw/qdev-properties.h |    4 +
>  qapi/common.json             |   26 ++++++
>  6 files changed, 206 insertions(+), 18 deletions(-)
> 

For the whole series:
Reviewed-by: Eric Auger <eric.auger@redhat.com>
(only minor comments on 5/5)

and
Tested-by: Eric Auger <eric.auger@redhat.com>
on aarch64 with a Mellanox CX-4 device

Thanks

Eric


* Re: [Qemu-devel] [PATCH v2 5/5] vfio/pci: Allow relocating MSI-X MMIO
  2018-01-19  9:50   ` Auger Eric
@ 2018-01-19 15:23     ` Alex Williamson
  2018-01-19 15:27       ` Auger Eric
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Williamson @ 2018-01-19 15:23 UTC (permalink / raw)
  To: Auger Eric; +Cc: qemu-devel, aik

Hi Eric,

On Fri, 19 Jan 2018 10:50:53 +0100
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Alex,
> 
> On 10/01/18 20:02, Alex Williamson wrote:
> > Recently proposed vfio-pci kernel changes (v4.16) remove the
> > restriction preventing userspace from mmap'ing PCI BARs in areas
> > overlapping the MSI-X vector table.  This change is primarily intended
> > to benefit host platforms which make use of system page sizes larger
> > than the PCI spec recommendation for alignment of MSI-X data
> > structures (ie. not x86_64).  In the case of POWER systems, the SPAPR
> > spec requires the VM to program MSI-X using hypercalls, rendering the
> > MSI-X vector table unused in the VM view of the device.  However,
> > ARM64 platforms also support 64KB pages and rely on QEMU emulation of
> > MSI-X.  Regardless of the kernel driver allowing mmaps overlapping
> > the MSI-X vector table, emulation of the MSI-X vector table also
> > prevents direct mapping of device MMIO spaces overlapping this page.
> > Thanks to the fact that PCI devices have a standard self discovery
> > mechanism, we can try to resolve this by relocating the MSI-X data
> > structures, either by creating a new PCI BAR or extending an existing
> > BAR and updating the MSI-X capability for the new location.  There's
> > even a very slim chance that this could benefit devices which do not
> > adhere to the PCI spec alignment guidelines on x86_64 systems.
> > 
> > This new x-msix-relocation option accepts the following choices:
> > 
> >   off: Disable MSI-X relocation, use native device config (default)
> >   auto: Use a known good combination for the platform/device (none yet)
> >   bar0..bar5: Specify the target BAR for MSI-X data structures
> > 
> > If compatible, the target BAR will either be created or extended and
> > the new portion will be used for MSI-X emulation.
> > 
> > The first obvious user question with this option is how to determine
> > whether a given platform and device might benefit from this option.
> > In most cases, the answer is that it won't, especially on x86_64.
> > Devices often dedicate an entire BAR to MSI-X and therefore no
> > performance sensitive registers overlap the MSI-X area.  Take for
> > example:
> > 
> > # lspci -vvvs 0a:00.0
> > 0a:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network Connection
> > 	...
> > 	Region 0: Memory at db680000 (32-bit, non-prefetchable) [size=512K]
> > 	Region 3: Memory at db7f8000 (32-bit, non-prefetchable) [size=16K]
> > 	...
> > 	Capabilities: [70] MSI-X: Enable+ Count=10 Masked-
> > 		Vector table: BAR=3 offset=00000000
> > 		PBA: BAR=3 offset=00002000
> > 
> > This device uses the 16K bar3 for MSI-X with the vector table at
> > offset zero and the pending bits arrary at offset 8K, fully honoring  
> array
> > the PCI spec alignment guidance.  The data sheet specifically refers
> > to this as an MSI-X BAR.  This device would not see a benefit from
> > MSI-X relocation regardless of the platform, regardless of the page
> > size.
> > 
> > However, here's another example:
> > 
> > # lspci -vvvs 02:00.0
> > 02:00.0 Serial Attached SCSI controller: xxxxxxxx
> > 	...
> > 	Region 0: I/O ports at c000 [size=256]
> > 	Region 1: Memory at ef640000 (64-bit, non-prefetchable) [size=64K]
> > 	Region 3: Memory at ef600000 (64-bit, non-prefetchable) [size=256K]
> > 	...
> > 	Capabilities: [c0] MSI-X: Enable+ Count=16 Masked-
> > 		Vector table: BAR=1 offset=0000e000
> > 		PBA: BAR=1 offset=0000f000
> > 
> > Here the MSI-X data structures are placed on separate 4K pages at the
> > end of a 64KB BAR.  If our host page size is 4K, we're likely fine,
> > but at 64KB page size, MSI-X emulation at that location prevents the
> > entire BAR from being directly mapped into the VM address space.
> > Overlapping performance sensitive registers then starts to be a very
> > likely scenario on such a platform.  At this point, the user could
> > enable tracing on vfio_region_read and vfio_region_write to determine
> > more conclusively if device accesses are being trapped through QEMU.
> > 
> > Upon finding a device and platform in need of MSI-X relocation, the
> > next problem is how to choose target PCI BAR to host the MSI-X data
> > structures.  A few key rules to keep in mind for this selection
> > include:
> > 
> >  * There are only 6 BAR slots, bar0..bar5
> >  * 64-bit BARs occupy two BAR slots, 'lspci -vvv' lists the first slot
> >  * PCI BARs are always a power of 2 in size, extending == doubling
> >  * The maximum size of a 32-bit BAR is 2GB
> >  * MSI-X data structures must reside in an MMIO BAR
> > 
> > Using these rules, we can evaluate each BAR of the second example
> > device above as follows:
> > 
> >  bar0: I/O port BAR, incompatible with MSI-X tables
> >  bar1: BAR could be extended, incurring another 64KB of MMIO
> >  bar2: Unavailable, bar1 is 64-bit, this register is used by bar1
> >  bar3: BAR could be extended, incurring another 256KB of MMIO
> >  bar4: Unavailable, bar3 is 64bit, this register is used by bar3
> >  bar5: Available, empty BAR, minimum additional MMIO
> > 
> > A secondary optimization we might wish to make in relocating MSI-X
> > is to minimize the additional MMIO required for the device, therefore
> > we might test the available choices in order of preference as bar5,
> > bar1, and finally bar3.  The original proposal for this feature
> > included an 'auto' option which would choose bar5 in this case, but
> > various drivers have been found that make assumptions about the
> > properties of the "first" BAR or the size of BARs such that there
> > appears to be no foolproof automatic selection available, requiring
> > known good combinations to be sourced from users.  This patch is
> > pre-enabled for an 'auto' selection making use of a validated lookup
> > table, but no entries are yet identified.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> >  hw/vfio/pci.c        |  101 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  hw/vfio/pci.h        |    1 
> >  hw/vfio/trace-events |    2 +
> >  3 files changed, 103 insertions(+), 1 deletion(-)
> > 
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index 20252ea7aeb7..7171ba18213c 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -1352,6 +1352,100 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
> >      }
> >  }
> >  
> > +static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
> > +{
> > +    int target_bar = -1;
> > +    size_t msix_sz;
> > +
> > +    if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
> > +        return;
> > +    }
> > +
> > +    /* The actual minimum size of MSI-X structures */
> > +    msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
> > +              (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
> > +    /* Round up to host pages, we don't want to share a page */
> > +    msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
> > +    /* PCI BARs must be a power of 2 */
> > +    msix_sz = pow2ceil(msix_sz);
> > +
> > +    if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
> > +        /*
> > +         * TODO: Lookup table for known devices.
> > +         *
> > +         * Logically we might use an algorithm here to select the BAR adding
> > +         * the least additional MMIO space, but we cannot programatically
> > +         * predict the driver dependency on BAR ordering or sizing, therefore
> > +         * 'auto' becomes a lookup for combinations reported to work.
> > +         */
> > +        if (target_bar < 0) {
> > +            error_setg_errno(errp, EINVAL, "No automatic MSI-X relocation "
> > +                             "available for device %04x:%04x",
> > +                             vdev->vendor_id, vdev->device_id);  
> don't you want error_setg here and below?

What's the benefit of one vs the other?  I wasn't sure which to use.

> > +            return;
> > +        }
> > +    } else {
> > +        target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
> > +    }
> > +
> > +    /* I/O port BARs cannot host MSI-X structures */
> > +    if (vdev->bars[target_bar].ioport) {
> > +        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
> > +                         "I/O port BAR", target_bar);  
> 
> > +        return;
> > +    }
> > +
> > +    /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
> > +    if (!vdev->bars[target_bar].size &&
> > +         target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
> > +        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
> > +                         "consumed by 64-bit BAR %d", target_bar,
> > +                         target_bar - 1);
> > +        return;
> > +    }
> > +
> > +    /* 2GB max size for 32-bit BARs */
> > +    if (vdev->bars[target_bar].size > (1 * 1024 * 1024 * 1024) &&  
> nit: the comment versus the check is a bit misleading. If I understand
> correctly, the x2 size would be gt 2GB.

Right, if the BAR is >1G already (ie. 2G), there's no room to make it
bigger.  I'll add "... cannot double if already > 1G".

> > +        !vdev->bars[target_bar].mem64) {
> > +        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
> > +                         "no space to extend 32-bit BAR", target_bar);
> > +        return;
> > +    }
> > +
> > +    /*
> > +     * If adding a new BAR, test if we can make it 64bit.  We make it
> > +     * prefetchable since QEMU MSI-X emulation has no read side effects
> > +     * and doing so makes mapping more flexible.
> > +     */
> > +    if (!vdev->bars[target_bar].size) {
> > +        if (target_bar < (PCI_ROM_SLOT - 1) &&
> > +            !vdev->bars[target_bar + 1].size) {
> > +            vdev->bars[target_bar].mem64 = true;
> > +            vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
> > +        }
> > +        vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
> > +        vdev->bars[target_bar].size = msix_sz;
> > +        vdev->msix->table_offset = 0;
> > +    } else {
> > +        vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
> > +                                          msix_sz * 2);
> > +        /*
> > +         * Due to above size calc, MSI-X always starts halfway into the BAR,
> > +         * which will always be a separate host page.  
> 
> nit: the spec gives this recommendation.
> 
> "If a dedicated Base Address register is not feasible, it is recommended
> that a function isolate the MSI-X structures from the non-MSI-X
> structures with aligned 8 KB ranges rather than
> the mandatory aligned 4 KB ranges."
> 
> In some corner circumstances, with 4kB - which is not our main use case
> - this may not be enforced here.

I think this was an attempt to future proof hardware designs for larger
page sizes, but 8K turned out to be mostly (entirely?) skipped as a
common page size.  We're not building real hardware and I can't think
of any advantage to using a minimum of 8K alignment if the host page
size is 4K, can you?  Thanks,

Alex


> > +         */
> > +        vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
> > +    }
> > +
> > +    vdev->msix->table_bar = target_bar;
> > +    vdev->msix->pba_bar = target_bar;
> > +    /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
> > +    vdev->msix->pba_offset = vdev->msix->table_offset +
> > +                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
> > +
> > +    trace_vfio_msix_relo(vdev->vbasedev.name,
> > +                         vdev->msix->table_bar, vdev->msix->table_offset);
> > +}
> > +
> >  /*
> >   * We don't have any control over how pci_add_capability() inserts
> >   * capabilities into the chain.  In order to setup MSI-X we need a
> > @@ -1430,6 +1524,8 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
> >      vdev->msix = msix;
> >  
> >      vfio_pci_fixup_msix_region(vdev);
> > +
> > +    vfio_pci_relocate_msix(vdev, errp);
> >  }
> >  
> >  static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
> > @@ -2845,13 +2941,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> >  
> >      vfio_pci_size_rom(vdev);
> >  
> > +    vfio_bars_prepare(vdev);
> > +
> >      vfio_msix_early_setup(vdev, &err);
> >      if (err) {
> >          error_propagate(errp, err);
> >          goto error;
> >      }
> >  
> > -    vfio_bars_prepare(vdev);
> >      vfio_bars_register(vdev);
> >  
> >      ret = vfio_add_capabilities(vdev, errp);
> > @@ -3041,6 +3138,8 @@ static Property vfio_pci_dev_properties[] = {
> >      DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
> >                                     nv_gpudirect_clique,
> >                                     qdev_prop_nv_gpudirect_clique, uint8_t),
> > +    DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
> > +                                OFF_AUTOPCIBAR_OFF),
> >      /*
> >       * TODO - support passed fds... is this necessary?
> >       * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
> > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> > index dcdb1a806769..588381f201b4 100644
> > --- a/hw/vfio/pci.h
> > +++ b/hw/vfio/pci.h
> > @@ -135,6 +135,7 @@ typedef struct VFIOPCIDevice {
> >                                  (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT)
> >      int32_t bootindex;
> >      uint32_t igd_gms;
> > +    OffAutoPCIBAR msix_relo;
> >      uint8_t pm_cap;
> >      uint8_t nv_gpudirect_clique;
> >      bool pci_aer;
> > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> > index fae096c0724f..437ccdd29053 100644
> > --- a/hw/vfio/trace-events
> > +++ b/hw/vfio/trace-events
> > @@ -16,6 +16,8 @@ vfio_msix_pba_disable(const char *name) " (%s)"
> >  vfio_msix_pba_enable(const char *name) " (%s)"
> >  vfio_msix_disable(const char *name) " (%s)"
> >  vfio_msix_fixup(const char *name, int bar, uint64_t start, uint64_t end) " (%s) MSI-X region %d mmap fixup [0x%"PRIx64" - 0x%"PRIx64"]"
> > +vfio_msix_relo_cost(const char *name, int bar, uint64_t cost) " (%s) BAR %d cost 0x%"PRIx64""
> > +vfio_msix_relo(const char *name, int bar, uint64_t offset) " (%s) BAR %d offset 0x%"PRIx64""
> >  vfio_msi_enable(const char *name, int nr_vectors) " (%s) Enabled %d MSI vectors"
> >  vfio_msi_disable(const char *name) " (%s)"
> >  vfio_pci_load_rom(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s ROM:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
> >   

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH v2 5/5] vfio/pci: Allow relocating MSI-X MMIO
  2018-01-19 15:23     ` Alex Williamson
@ 2018-01-19 15:27       ` Auger Eric
  0 siblings, 0 replies; 14+ messages in thread
From: Auger Eric @ 2018-01-19 15:27 UTC (permalink / raw)
  To: Alex Williamson; +Cc: qemu-devel, aik

Hi Alex,

On 19/01/18 16:23, Alex Williamson wrote:
> Hi Eric,
> 
> On Fri, 19 Jan 2018 10:50:53 +0100
> Auger Eric <eric.auger@redhat.com> wrote:
> 
>> Hi Alex,
>>
>> On 10/01/18 20:02, Alex Williamson wrote:
>>> Recently proposed vfio-pci kernel changes (v4.16) remove the
>>> restriction preventing userspace from mmap'ing PCI BARs in areas
>>> overlapping the MSI-X vector table.  This change is primarily intended
>>> to benefit host platforms which make use of system page sizes larger
>>> than the PCI spec recommendation for alignment of MSI-X data
>>> structures (i.e. not x86_64).  In the case of POWER systems, the SPAPR
>>> spec requires the VM to program MSI-X using hypercalls, rendering the
>>> MSI-X vector table unused in the VM view of the device.  However,
>>> ARM64 platforms also support 64KB pages and rely on QEMU emulation of
>>> MSI-X.  Regardless of the kernel driver allowing mmaps overlapping
>>> the MSI-X vector table, emulation of the MSI-X vector table also
>>> prevents direct mapping of device MMIO spaces overlapping this page.
>>> Thanks to the fact that PCI devices have a standard self discovery
>>> mechanism, we can try to resolve this by relocating the MSI-X data
>>> structures, either by creating a new PCI BAR or extending an existing
>>> BAR and updating the MSI-X capability for the new location.  There's
>>> even a very slim chance that this could benefit devices which do not
>>> adhere to the PCI spec alignment guidelines on x86_64 systems.
>>>
>>> This new x-msix-relocation option accepts the following choices:
>>>
>>>   off: Disable MSI-X relocation, use native device config (default)
>>>   auto: Use a known good combination for the platform/device (none yet)
>>>   bar0..bar5: Specify the target BAR for MSI-X data structures
>>>
>>> If compatible, the target BAR will either be created or extended and
>>> the new portion will be used for MSI-X emulation.
>>>
>>> The first obvious user question with this option is how to determine
>>> whether a given platform and device might benefit from this option.
>>> In most cases, the answer is that it won't, especially on x86_64.
>>> Devices often dedicate an entire BAR to MSI-X and therefore no
>>> performance sensitive registers overlap the MSI-X area.  Take for
>>> example:
>>>
>>> # lspci -vvvs 0a:00.0
>>> 0a:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network Connection
>>> 	...
>>> 	Region 0: Memory at db680000 (32-bit, non-prefetchable) [size=512K]
>>> 	Region 3: Memory at db7f8000 (32-bit, non-prefetchable) [size=16K]
>>> 	...
>>> 	Capabilities: [70] MSI-X: Enable+ Count=10 Masked-
>>> 		Vector table: BAR=3 offset=00000000
>>> 		PBA: BAR=3 offset=00002000
>>>
>>> This device uses the 16K bar3 for MSI-X with the vector table at
>>> offset zero and the pending bits arrary at offset 8K, fully honoring  
>> array
>>> the PCI spec alignment guidance.  The data sheet specifically refers
>>> to this as an MSI-X BAR.  This device would not see a benefit from
>>> MSI-X relocation regardless of the platform, regardless of the page
>>> size.
>>>
>>> However, here's another example:
>>>
>>> # lspci -vvvs 02:00.0
>>> 02:00.0 Serial Attached SCSI controller: xxxxxxxx
>>> 	...
>>> 	Region 0: I/O ports at c000 [size=256]
>>> 	Region 1: Memory at ef640000 (64-bit, non-prefetchable) [size=64K]
>>> 	Region 3: Memory at ef600000 (64-bit, non-prefetchable) [size=256K]
>>> 	...
>>> 	Capabilities: [c0] MSI-X: Enable+ Count=16 Masked-
>>> 		Vector table: BAR=1 offset=0000e000
>>> 		PBA: BAR=1 offset=0000f000
>>>
>>> Here the MSI-X data structures are placed on separate 4K pages at the
>>> end of a 64KB BAR.  If our host page size is 4K, we're likely fine,
>>> but at 64KB page size, MSI-X emulation at that location prevents the
>>> entire BAR from being directly mapped into the VM address space.
>>> Overlapping performance sensitive registers then starts to be a very
>>> likely scenario on such a platform.  At this point, the user could
>>> enable tracing on vfio_region_read and vfio_region_write to determine
>>> more conclusively if device accesses are being trapped through QEMU.
>>>
>>> Upon finding a device and platform in need of MSI-X relocation, the
>>> next problem is how to choose a target PCI BAR to host the MSI-X data
>>> structures.  A few key rules to keep in mind for this selection
>>> include:
>>>
>>>  * There are only 6 BAR slots, bar0..bar5
>>>  * 64-bit BARs occupy two BAR slots, 'lspci -vvv' lists the first slot
>>>  * PCI BARs are always a power of 2 in size, extending == doubling
>>>  * The maximum size of a 32-bit BAR is 2GB
>>>  * MSI-X data structures must reside in an MMIO BAR
>>>
>>> Using these rules, we can evaluate each BAR of the second example
>>> device above as follows:
>>>
>>>  bar0: I/O port BAR, incompatible with MSI-X tables
>>>  bar1: BAR could be extended, incurring another 64KB of MMIO
>>>  bar2: Unavailable, bar1 is 64-bit, this register is used by bar1
>>>  bar3: BAR could be extended, incurring another 256KB of MMIO
>>>  bar4: Unavailable, bar3 is 64bit, this register is used by bar3
>>>  bar5: Available, empty BAR, minimum additional MMIO
>>>
>>> A secondary optimization we might wish to make in relocating MSI-X
>>> is to minimize the additional MMIO required for the device, therefore
>>> we might test the available choices in order of preference as bar5,
>>> bar1, and finally bar3.  The original proposal for this feature
>>> included an 'auto' option which would choose bar5 in this case, but
>>> various drivers have been found that make assumptions about the
>>> properties of the "first" BAR or the size of BARs such that there
>>> appears to be no foolproof automatic selection available, requiring
>>> known good combinations to be sourced from users.  This patch is
>>> pre-enabled for an 'auto' selection making use of a validated lookup
>>> table, but no entries are yet identified.
>>>
>>> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
>>> ---
>>>  hw/vfio/pci.c        |  101 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  hw/vfio/pci.h        |    1 
>>>  hw/vfio/trace-events |    2 +
>>>  3 files changed, 103 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index 20252ea7aeb7..7171ba18213c 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -1352,6 +1352,100 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
>>>      }
>>>  }
>>>  
>>> +static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
>>> +{
>>> +    int target_bar = -1;
>>> +    size_t msix_sz;
>>> +
>>> +    if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
>>> +        return;
>>> +    }
>>> +
>>> +    /* The actual minimum size of MSI-X structures */
>>> +    msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
>>> +              (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
>>> +    /* Round up to host pages, we don't want to share a page */
>>> +    msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
>>> +    /* PCI BARs must be a power of 2 */
>>> +    msix_sz = pow2ceil(msix_sz);
>>> +
>>> +    if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
>>> +        /*
>>> +         * TODO: Lookup table for known devices.
>>> +         *
>>> +         * Logically we might use an algorithm here to select the BAR adding
>>> +         * the least additional MMIO space, but we cannot programmatically
>>> +         * predict the driver dependency on BAR ordering or sizing, therefore
>>> +         * 'auto' becomes a lookup for combinations reported to work.
>>> +         */
>>> +        if (target_bar < 0) {
>>> +            error_setg_errno(errp, EINVAL, "No automatic MSI-X relocation "
>>> +                             "available for device %04x:%04x",
>>> +                             vdev->vendor_id, vdev->device_id);  
>> don't you want error_setg here and below?
> 
> What's the benefit of one vs the other?  I wasn't sure which to use.
Well, that's nit-picking: using the _errno variant with an EINVAL arg will
append "Invalid argument" at the end of the error message, which does not
bring much value to the end-user.
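
For illustration, a minimal sketch of the difference, assuming QEMU's
Error API (output strings approximate):

    /* error_setg_errno() appends strerror(os_error) to the message: */
    error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d", 3);
    /* -> "Invalid MSI-X relocation BAR 3: Invalid argument" */

    /* error_setg() reports only the formatted message: */
    error_setg(errp, "Invalid MSI-X relocation BAR %d", 3);
    /* -> "Invalid MSI-X relocation BAR 3" */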
> 
>>> +            return;
>>> +        }
>>> +    } else {
>>> +        target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
>>> +    }
>>> +
>>> +    /* I/O port BARs cannot host MSI-X structures */
>>> +    if (vdev->bars[target_bar].ioport) {
>>> +        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
>>> +                         "I/O port BAR", target_bar);  
>>
>>> +        return;
>>> +    }
>>> +
>>> +    /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
>>> +    if (!vdev->bars[target_bar].size &&
>>> +         target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
>>> +        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
>>> +                         "consumed by 64-bit BAR %d", target_bar,
>>> +                         target_bar - 1);
>>> +        return;
>>> +    }
>>> +
>>> +    /* 2GB max size for 32-bit BARs */
>>> +    if (vdev->bars[target_bar].size > (1 * 1024 * 1024 * 1024) &&  
>> nit: the comment versus the check is a bit misleading. If I understand
>> correctly, the doubled size would be greater than 2GB.
> 
> Right, if the BAR is >1G already (i.e. 2G), there's no room to make it
> bigger.  I'll add "... cannot double if already > 1G".
> 
>>> +        !vdev->bars[target_bar].mem64) {
>>> +        error_setg_errno(errp, EINVAL, "Invalid MSI-X relocation BAR %d, "
>>> +                         "no space to extend 32-bit BAR", target_bar);
>>> +        return;
>>> +    }
>>> +
>>> +    /*
>>> +     * If adding a new BAR, test if we can make it 64bit.  We make it
>>> +     * prefetchable since QEMU MSI-X emulation has no read side effects
>>> +     * and doing so makes mapping more flexible.
>>> +     */
>>> +    if (!vdev->bars[target_bar].size) {
>>> +        if (target_bar < (PCI_ROM_SLOT - 1) &&
>>> +            !vdev->bars[target_bar + 1].size) {
>>> +            vdev->bars[target_bar].mem64 = true;
>>> +            vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
>>> +        }
>>> +        vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
>>> +        vdev->bars[target_bar].size = msix_sz;
>>> +        vdev->msix->table_offset = 0;
>>> +    } else {
>>> +        vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
>>> +                                          msix_sz * 2);
>>> +        /*
>>> +         * Due to above size calc, MSI-X always starts halfway into the BAR,
>>> +         * which will always be a separate host page.  
>>
>> nit: the spec gives this recommendation.
>>
>> "If a dedicated Base Address register is not feasible, it is recommended
>> that a function isolate the MSI-X structures from the non-MSI-X
>> structures with aligned 8 KB ranges rather than
>> the mandatory aligned 4 KB ranges."
>>
>> In some corner cases with 4kB pages - which is not our main use case
>> - this may not be enforced here.
> 
> I think this was an attempt to future-proof hardware designs for larger
> page sizes, but 8K turned out to be mostly (entirely?) skipped as a
> common page size.  We're not building real hardware and I can't think
> of any advantage to using a minimum of 8K alignment if the host page
> size is 4K, can you?  Thanks,

OK no worries

Thanks

Eric
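
As a worked instance of the sizing and layout above (device and page
size hypothetical):

    entries = 16:  table = 16 * PCI_MSIX_ENTRY_SIZE   = 256 bytes
                   PBA   = QEMU_ALIGN_UP(16, 64) / 8  =   8 bytes
    msix_sz = pow2ceil(REAL_HOST_PAGE_ALIGN(264)) = 64K (64K host pages)
    existing 64K BAR: new size = MAX(64K * 2, 64K * 2) = 128K
    table_offset = 128K / 2 = 64K, a separate host page as intended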
> 
> Alex
> 
> 
>>> +         */
>>> +        vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
>>> +    }
>>> +
>>> +    vdev->msix->table_bar = target_bar;
>>> +    vdev->msix->pba_bar = target_bar;
>>> +    /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
>>> +    vdev->msix->pba_offset = vdev->msix->table_offset +
>>> +                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
>>> +
>>> +    trace_vfio_msix_relo(vdev->vbasedev.name,
>>> +                         vdev->msix->table_bar, vdev->msix->table_offset);
>>> +}
>>> +
>>>  /*
>>>   * We don't have any control over how pci_add_capability() inserts
>>>   * capabilities into the chain.  In order to setup MSI-X we need a
>>> @@ -1430,6 +1524,8 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
>>>      vdev->msix = msix;
>>>  
>>>      vfio_pci_fixup_msix_region(vdev);
>>> +
>>> +    vfio_pci_relocate_msix(vdev, errp);
>>>  }
>>>  
>>>  static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
>>> @@ -2845,13 +2941,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>  
>>>      vfio_pci_size_rom(vdev);
>>>  
>>> +    vfio_bars_prepare(vdev);
>>> +
>>>      vfio_msix_early_setup(vdev, &err);
>>>      if (err) {
>>>          error_propagate(errp, err);
>>>          goto error;
>>>      }
>>>  
>>> -    vfio_bars_prepare(vdev);
>>>      vfio_bars_register(vdev);
>>>  
>>>      ret = vfio_add_capabilities(vdev, errp);
>>> @@ -3041,6 +3138,8 @@ static Property vfio_pci_dev_properties[] = {
>>>      DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
>>>                                     nv_gpudirect_clique,
>>>                                     qdev_prop_nv_gpudirect_clique, uint8_t),
>>> +    DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
>>> +                                OFF_AUTOPCIBAR_OFF),
>>>      /*
>>>       * TODO - support passed fds... is this necessary?
>>>       * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
>>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>>> index dcdb1a806769..588381f201b4 100644
>>> --- a/hw/vfio/pci.h
>>> +++ b/hw/vfio/pci.h
>>> @@ -135,6 +135,7 @@ typedef struct VFIOPCIDevice {
>>>                                  (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT)
>>>      int32_t bootindex;
>>>      uint32_t igd_gms;
>>> +    OffAutoPCIBAR msix_relo;
>>>      uint8_t pm_cap;
>>>      uint8_t nv_gpudirect_clique;
>>>      bool pci_aer;
>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>> index fae096c0724f..437ccdd29053 100644
>>> --- a/hw/vfio/trace-events
>>> +++ b/hw/vfio/trace-events
>>> @@ -16,6 +16,8 @@ vfio_msix_pba_disable(const char *name) " (%s)"
>>>  vfio_msix_pba_enable(const char *name) " (%s)"
>>>  vfio_msix_disable(const char *name) " (%s)"
>>>  vfio_msix_fixup(const char *name, int bar, uint64_t start, uint64_t end) " (%s) MSI-X region %d mmap fixup [0x%"PRIx64" - 0x%"PRIx64"]"
>>> +vfio_msix_relo_cost(const char *name, int bar, uint64_t cost) " (%s) BAR %d cost 0x%"PRIx64""
>>> +vfio_msix_relo(const char *name, int bar, uint64_t offset) " (%s) BAR %d offset 0x%"PRIx64""
>>>  vfio_msi_enable(const char *name, int nr_vectors) " (%s) Enabled %d MSI vectors"
>>>  vfio_msi_disable(const char *name) " (%s)"
>>>  vfio_pci_load_rom(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s ROM:\n  size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
>>>   
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH v2 4/5] qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 4/5] qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR Alex Williamson
@ 2018-01-24 15:49   ` Alex Williamson
  2018-01-31 13:24   ` Markus Armbruster
  1 sibling, 0 replies; 14+ messages in thread
From: Alex Williamson @ 2018-01-24 15:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Markus Armbruster, Eric Blake, eric.auger, aik


Ping, can any qapi folks comment or provide an ack/review for this
please?  Thanks!

Alex

On Wed, 10 Jan 2018 12:02:24 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:

> Add an option which allows the user to specify a PCI BAR number,
> including an 'off' and 'auto' selection.
> 
> Cc: Markus Armbruster <armbru@redhat.com>
> Cc: Eric Blake <eblake@redhat.com>
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
>  hw/core/qdev-properties.c    |   11 +++++++++++
>  include/hw/qdev-properties.h |    4 ++++
>  qapi/common.json             |   26 ++++++++++++++++++++++++++
>  3 files changed, 41 insertions(+)
> 
> diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c
> index 1dc80fcea2af..e33184e5a342 100644
> --- a/hw/core/qdev-properties.c
> +++ b/hw/core/qdev-properties.c
> @@ -1256,3 +1256,14 @@ const PropertyInfo qdev_prop_link = {
>      .name = "link",
>      .create = create_link_property,
>  };
> +
> +/* --- OffAutoPCIBAR off/auto/bar0/bar1/bar2/bar3/bar4/bar5 --- */
> +
> +const PropertyInfo qdev_prop_off_auto_pcibar = {
> +    .name = "OffAutoPCIBAR",
> +    .description = "off/auto/bar0/bar1/bar2/bar3/bar4/bar5",
> +    .enum_table = &OffAutoPCIBAR_lookup,
> +    .get = get_enum,
> +    .set = set_enum,
> +    .set_default_value = set_default_value_enum,
> +};
> diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h
> index 60b42ac561af..e2643f5126c4 100644
> --- a/include/hw/qdev-properties.h
> +++ b/include/hw/qdev-properties.h
> @@ -33,6 +33,7 @@ extern const PropertyInfo qdev_prop_blocksize;
>  extern const PropertyInfo qdev_prop_pci_host_devaddr;
>  extern const PropertyInfo qdev_prop_arraylen;
>  extern const PropertyInfo qdev_prop_link;
> +extern const PropertyInfo qdev_prop_off_auto_pcibar;
>  
>  #define DEFINE_PROP(_name, _state, _field, _prop, _type) { \
>          .name      = (_name),                                    \
> @@ -213,6 +214,9 @@ extern const PropertyInfo qdev_prop_link;
>      DEFINE_PROP(_n, _s, _f, qdev_prop_pci_host_devaddr, PCIHostDeviceAddress)
>  #define DEFINE_PROP_MEMORY_REGION(_n, _s, _f)             \
>      DEFINE_PROP(_n, _s, _f, qdev_prop_ptr, MemoryRegion *)
> +#define DEFINE_PROP_OFF_AUTO_PCIBAR(_n, _s, _f, _d) \
> +    DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_off_auto_pcibar, \
> +                        OffAutoPCIBAR)
>  
>  #define DEFINE_PROP_END_OF_LIST()               \
>      {}
> diff --git a/qapi/common.json b/qapi/common.json
> index 6eb01821ef59..d9b14dd429f3 100644
> --- a/qapi/common.json
> +++ b/qapi/common.json
> @@ -100,3 +100,29 @@
>  { 'alternate': 'StrOrNull',
>    'data': { 's': 'str',
>              'n': 'null' } }
> +
> +##
> +# @OffAutoPCIBAR:
> +#
> +# An enumeration of options for specifying a PCI BAR
> +#
> +# @off: The specified feature is disabled
> +#
> +# @auto: The PCI BAR for the feature is automatically selected
> +#
> +# @bar0: PCI BAR0 is used for the feature
> +#
> +# @bar1: PCI BAR1 is used for the feature
> +#
> +# @bar2: PCI BAR2 is used for the feature
> +#
> +# @bar3: PCI BAR3 is used for the feature
> +#
> +# @bar4: PCI BAR4 is used for the feature
> +#
> +# @bar5: PCI BAR5 is used for the feature
> +#
> +# Since: 2.12
> +##
> +{ 'enum': 'OffAutoPCIBAR',
> +  'data': [ 'off', 'auto', 'bar0', 'bar1', 'bar2', 'bar3', 'bar4', 'bar5' ] }
> 
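
For context, the enum defined here backs the vfio-pci x-msix-relocation
property added in patch 5/5; a hypothetical invocation (host address
made up) would look like:

    -device vfio-pci,host=0000:02:00.0,x-msix-relocation=bar5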

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH v2 4/5] qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 4/5] qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR Alex Williamson
  2018-01-24 15:49   ` Alex Williamson
@ 2018-01-31 13:24   ` Markus Armbruster
  1 sibling, 0 replies; 14+ messages in thread
From: Markus Armbruster @ 2018-01-31 13:24 UTC (permalink / raw)
  To: Alex Williamson; +Cc: qemu-devel, aik, eric.auger

Alex Williamson <alex.williamson@redhat.com> writes:

> Add an option which allows the user to specify a PCI BAR number,
> including an 'off' and 'auto' selection.
>
> Cc: Markus Armbruster <armbru@redhat.com>
> Cc: Eric Blake <eblake@redhat.com>
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

Reviewed-by: Markus Armbruster <armbru@redhat.com>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [Qemu-devel] [PATCH v3 2/5] vfio/pci: Add base BAR MemoryRegion
  2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 2/5] vfio/pci: Add base BAR MemoryRegion Alex Williamson
@ 2018-02-02  2:19   ` Alex Williamson
  0 siblings, 0 replies; 14+ messages in thread
From: Alex Williamson @ 2018-02-02  2:19 UTC (permalink / raw)
  To: qemu-devel; +Cc: eric.auger, aik

Add one more layer to our stack of MemoryRegions; this base region
allows us to register BARs independently of the vfio region or to
extend the size of BARs which do map to a region.  This will be
useful when we want hypervisor-defined BARs or sections of BARs,
for purposes such as relocating MSI-X emulation.  We therefore call
msix_init() based on this new base MemoryRegion, while the quirks,
which only modify regions, still operate on those sub-MemoryRegions.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

Pre-pull-request testing uncovered an error with this patch: one of my
test VMs assigns USB host controllers with 1K MMIO BARs.
BIOS happens to align these BARs at 4K, so I make use of the sub-page
code that extends these mappings without resizing the BARs.  With this
patch we also need to extend the base BAR if it's undersized, and
remove/re-add the base BAR MemoryRegion from the address space rather
than the vfio region MemoryRegion.  v3 adds the first 4 chunks of the
patch below to update the sub-page mapping support and is otherwise
identical to v2.  I'm only posting this one patch from the series for
v3, the others remain the same.  I dropped R-b/T-b here, but unless
you're testing a sub-page BAR with host alignment to use this support,
this code won't be executed and I'll be happy to re-add those.  Please
review, thanks,

Alex

 hw/vfio/pci.c |   94 ++++++++++++++++++++++++++++++++++++++++++---------------
 hw/vfio/pci.h |    3 ++
 2 files changed, 72 insertions(+), 25 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 2c7129512563..908b8dffca2b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1087,7 +1087,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
 {
     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
     VFIORegion *region = &vdev->bars[bar].region;
-    MemoryRegion *mmap_mr, *mr;
+    MemoryRegion *mmap_mr, *region_mr, *base_mr;
     PCIIORegion *r;
     pcibus_t bar_addr;
     uint64_t size = region->size;
@@ -1100,7 +1100,8 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
 
     r = &pdev->io_regions[bar];
     bar_addr = r->addr;
-    mr = region->mem;
+    base_mr = vdev->bars[bar].mr;
+    region_mr = region->mem;
     mmap_mr = &region->mmaps[0].mem;
 
     /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
@@ -1111,12 +1112,15 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
 
     memory_region_transaction_begin();
 
-    memory_region_set_size(mr, size);
+    if (vdev->bars[bar].size < size) {
+        memory_region_set_size(base_mr, size);
+    }
+    memory_region_set_size(region_mr, size);
     memory_region_set_size(mmap_mr, size);
-    if (size != region->size && memory_region_is_mapped(mr)) {
-        memory_region_del_subregion(r->address_space, mr);
+    if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
+        memory_region_del_subregion(r->address_space, base_mr);
         memory_region_add_subregion_overlap(r->address_space,
-                                            bar_addr, mr, 0);
+                                            bar_addr, base_mr, 0);
     }
 
     memory_region_transaction_commit();
@@ -1218,8 +1222,8 @@ void vfio_pci_write_config(PCIDevice *pdev,
 
         for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
             if (old_addr[bar] != pdev->io_regions[bar].addr &&
-                pdev->io_regions[bar].size > 0 &&
-                pdev->io_regions[bar].size < qemu_real_host_page_size) {
+                vdev->bars[bar].region.size > 0 &&
+                vdev->bars[bar].region.size < qemu_real_host_page_size) {
                 vfio_sub_page_bar_update_mapping(pdev, bar);
             }
         }
@@ -1440,9 +1444,9 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
     vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
                                     sizeof(unsigned long));
     ret = msix_init(&vdev->pdev, vdev->msix->entries,
-                    vdev->bars[vdev->msix->table_bar].region.mem,
+                    vdev->bars[vdev->msix->table_bar].mr,
                     vdev->msix->table_bar, vdev->msix->table_offset,
-                    vdev->bars[vdev->msix->pba_bar].region.mem,
+                    vdev->bars[vdev->msix->pba_bar].mr,
                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
                     &err);
     if (ret < 0) {
@@ -1482,8 +1486,8 @@ static void vfio_teardown_msi(VFIOPCIDevice *vdev)
 
     if (vdev->msix) {
         msix_uninit(&vdev->pdev,
-                    vdev->bars[vdev->msix->table_bar].region.mem,
-                    vdev->bars[vdev->msix->pba_bar].region.mem);
+                    vdev->bars[vdev->msix->table_bar].mr,
+                    vdev->bars[vdev->msix->pba_bar].mr);
         g_free(vdev->msix->pending);
     }
 }
@@ -1500,12 +1504,11 @@ static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
     }
 }
 
-static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr)
+static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
 {
     VFIOBAR *bar = &vdev->bars[nr];
 
     uint32_t pci_bar;
-    uint8_t type;
     int ret;
 
     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
@@ -1524,23 +1527,52 @@ static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr)
     pci_bar = le32_to_cpu(pci_bar);
     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
-    type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
-                                    ~PCI_BASE_ADDRESS_MEM_MASK);
+    bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
+                                         ~PCI_BASE_ADDRESS_MEM_MASK);
+    bar->size = bar->region.size;
+}
+
+static void vfio_bars_prepare(VFIOPCIDevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        vfio_bar_prepare(vdev, i);
+    }
+}
+
+static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
+{
+    VFIOBAR *bar = &vdev->bars[nr];
+    char *name;
 
-    if (vfio_region_mmap(&bar->region)) {
-        error_report("Failed to mmap %s BAR %d. Performance may be slow",
-                     vdev->vbasedev.name, nr);
+    if (!bar->size) {
+        return;
     }
 
-    pci_register_bar(&vdev->pdev, nr, type, bar->region.mem);
+    bar->mr = g_new0(MemoryRegion, 1);
+    name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
+    memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
+    g_free(name);
+
+    if (bar->region.size) {
+        memory_region_add_subregion(bar->mr, 0, bar->region.mem);
+
+        if (vfio_region_mmap(&bar->region)) {
+            error_report("Failed to mmap %s BAR %d. Performance may be slow",
+                         vdev->vbasedev.name, nr);
+        }
+    }
+
+    pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
 }
 
-static void vfio_bars_setup(VFIOPCIDevice *vdev)
+static void vfio_bars_register(VFIOPCIDevice *vdev)
 {
     int i;
 
     for (i = 0; i < PCI_ROM_SLOT; i++) {
-        vfio_bar_setup(vdev, i);
+        vfio_bar_register(vdev, i);
     }
 }
 
@@ -1549,8 +1581,13 @@ static void vfio_bars_exit(VFIOPCIDevice *vdev)
     int i;
 
     for (i = 0; i < PCI_ROM_SLOT; i++) {
+        VFIOBAR *bar = &vdev->bars[i];
+
         vfio_bar_quirk_exit(vdev, i);
-        vfio_region_exit(&vdev->bars[i].region);
+        vfio_region_exit(&bar->region);
+        if (bar->region.size) {
+            memory_region_del_subregion(bar->mr, bar->region.mem);
+        }
     }
 
     if (vdev->vga) {
@@ -1564,8 +1601,14 @@ static void vfio_bars_finalize(VFIOPCIDevice *vdev)
     int i;
 
     for (i = 0; i < PCI_ROM_SLOT; i++) {
+        VFIOBAR *bar = &vdev->bars[i];
+
         vfio_bar_quirk_finalize(vdev, i);
-        vfio_region_finalize(&vdev->bars[i].region);
+        vfio_region_finalize(&bar->region);
+        if (bar->size) {
+            object_unparent(OBJECT(bar->mr));
+            g_free(bar->mr);
+        }
     }
 
     if (vdev->vga) {
@@ -2810,7 +2853,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
         goto error;
     }
 
-    vfio_bars_setup(vdev);
+    vfio_bars_prepare(vdev);
+    vfio_bars_register(vdev);
 
     ret = vfio_add_capabilities(vdev, errp);
     if (ret) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 3d753222ca4c..dcdb1a806769 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -33,6 +33,9 @@ typedef struct VFIOQuirk {
 
 typedef struct VFIOBAR {
     VFIORegion region;
+    MemoryRegion *mr;
+    size_t size;
+    uint8_t type;
     bool ioport;
     bool mem64;
     QLIST_HEAD(, VFIOQuirk) quirks;
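
For orientation, a rough sketch of the MemoryRegion layering this patch
creates (a simplified restatement of the code above, not part of the
series):

    bar->mr                    /* base container, pci_register_bar() */
      bar->region.mem          /* vfio region MR, subregion at offset 0 */
        region->mmaps[i].mem   /* mmap'd portion(s) of the region */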

^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-02-02  2:54 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-10 19:01 [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alex Williamson
2018-01-10 19:01 ` [Qemu-devel] [PATCH v2 1/5] vfio/pci: Fixup VFIOMSIXInfo comment Alex Williamson
2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 2/5] vfio/pci: Add base BAR MemoryRegion Alex Williamson
2018-02-02  2:19   ` [Qemu-devel] [PATCH v3 " Alex Williamson
2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 3/5] vfio/pci: Emulate BARs Alex Williamson
2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 4/5] qapi: Create DEFINE_PROP_OFF_AUTO_PCIBAR Alex Williamson
2018-01-24 15:49   ` Alex Williamson
2018-01-31 13:24   ` Markus Armbruster
2018-01-10 19:02 ` [Qemu-devel] [PATCH v2 5/5] vfio/pci: Allow relocating MSI-X MMIO Alex Williamson
2018-01-19  9:50   ` Auger Eric
2018-01-19 15:23     ` Alex Williamson
2018-01-19 15:27       ` Auger Eric
2018-01-15  3:39 ` [Qemu-devel] [PATCH v2 0/5] vfio/pci: MSI-X MMIO relocation Alexey Kardashevskiy
2018-01-19 10:26 ` Auger Eric
