* [RFC v1 0/4] Expose GPU memory as coherently CPU accessible
@ 2023-06-05 23:50 ankita
  2023-06-05 23:50 ` [RFC v1 1/4] qemu: add GPU memory information as object ankita
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: ankita @ 2023-06-05 23:50 UTC (permalink / raw)
  To: ankita, jgg, alex.williamson; +Cc: qemu-devel

From: Ankit Agrawal <ankita@nvidia.com>

NVIDIA is building systems that allow the CPU to coherently access
GPU memory. This GPU device memory can be added to and managed by the
kernel memory manager. This series holds the changes required in QEMU
to expose this memory to VMs that have the device assigned.

The GPU device memory region is exposed as device BAR1 and QEMU mmaps
it. QEMU then adds new proximity domains to represent the memory in the
VM's ACPI SRAT. This allows the device memory to be added as separate
NUMA nodes inside the VM. The proximity domains (PXM) are passed to the
VM using ACPI DSD properties to help VM kernel modules add the memory.

Current Linux cannot create NUMA nodes on the fly, so enough NUMA nodes
need to be created in ACPI for them to be available at VM boot time. The
physical platform firmware provides 8 NUMA nodes, which QEMU emulates
here.

A new vfio-pci variant driver is added to manage the device memory and
report it as a BAR. The corresponding kernel-side changes, including the
new vfio-pci variant driver, are under review.
Ref: https://lore.kernel.org/lkml/20230405180134.16932-1-ankita@nvidia.com/

Applied over v8.0.2.

Ankit Agrawal (4):
  qemu: add GPU memory information as object
  qemu: patch guest SRAT for GPU memory
  qemu: patch guest DSDT for GPU memory
  qemu: adjust queried bar size to power-of-2

 hw/arm/virt-acpi-build.c    | 54 ++++++++++++++++++++++++++++
 hw/pci-host/gpex-acpi.c     | 71 ++++++++++++++++++++++++++++++++++++
 hw/vfio/common.c            |  2 +-
 hw/vfio/pci-quirks.c        | 13 +++++++
 hw/vfio/pci.c               | 72 +++++++++++++++++++++++++++++++++++++
 hw/vfio/pci.h               |  1 +
 include/hw/pci/pci_device.h |  3 ++
 7 files changed, 215 insertions(+), 1 deletion(-)

-- 
2.17.1




* [RFC v1 1/4] qemu: add GPU memory information as object
  2023-06-05 23:50 [RFC v1 0/4] Expose GPU memory as coherently CPU accessible ankita
@ 2023-06-05 23:50 ` ankita
  2023-06-06 15:19   ` Alex Williamson
  2023-06-05 23:50 ` [RFC v1 2/4] qemu: patch guest SRAT for GPU memory ankita
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: ankita @ 2023-06-05 23:50 UTC (permalink / raw)
  To: ankita, jgg, alex.williamson; +Cc: qemu-devel

From: Ankit Agrawal <ankita@nvidia.com>

The GPU memory is exposed as device BAR1 to the VM and is discovered
by QEMU through the VFIO_DEVICE_GET_REGION_INFO ioctl. QEMU then mmaps
it.
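
In terms of the raw VFIO uAPI, the discovery and mapping boil down to
the following sketch (illustrative only; device_fd stands for an already
opened VFIO device file descriptor, while QEMU itself performs these
steps in vfio_region_setup() and vfio_region_mmap()):

    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/vfio.h>

    struct vfio_region_info info = {
        .argsz = sizeof(info),
        .index = VFIO_PCI_BAR1_REGION_INDEX,
    };
    void *gpu_mem = MAP_FAILED;

    /* Query the BAR1 region exposed by the variant driver ... */
    if (!ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info) &&
        (info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
        /* ... and map the coherent device memory. */
        gpu_mem = mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
                       device_fd, info.offset);
    }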

The GPU memory can be added to the VM as (up to 8) separate NUMA nodes.
To achieve this, QEMU inserts a series of PXM domains in the SRAT and
communicates this range of nodes to the VM through DSD properties.

The PXM start and count are added as object properties and passed to
the SRAT and DSDT builder code.

The code is activated only for a set of NVIDIA devices supporting the
feature.

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 hw/vfio/pci-quirks.c        | 13 +++++++
 hw/vfio/pci.c               | 72 +++++++++++++++++++++++++++++++++++++
 hw/vfio/pci.h               |  1 +
 include/hw/pci/pci_device.h |  3 ++
 4 files changed, 89 insertions(+)

diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index f0147a050a..b7334ccd1d 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -1751,3 +1751,16 @@ int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
 
     return 0;
 }
+
+bool vfio_has_cpu_coherent_devmem(VFIOPCIDevice *vdev)
+{
+    switch (vdev->device_id) {
+    /* Nvidia */
+    case 0x2342:
+    case 0x2343:
+    case 0x2345:
+        return true;
+    }
+
+    return false;
+}
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ec9a854361..403516ffb3 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -42,6 +42,8 @@
 #include "qapi/error.h"
 #include "migration/blocker.h"
 #include "migration/qemu-file.h"
+#include "qapi/visitor.h"
+#include "include/hw/boards.h"
 
 #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
 
@@ -2824,6 +2826,22 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
     }
 }
 
+static void vfio_pci_get_gpu_mem_pxm_start(Object *obj, Visitor *v,
+                                           const char *name,
+                                           void *opaque, Error **errp)
+{
+    uint64_t pxm_start = (uintptr_t) opaque;
+    visit_type_uint64(v, name, &pxm_start, errp);
+}
+
+static void vfio_pci_get_gpu_mem_pxm_count(Object *obj, Visitor *v,
+                                           const char *name,
+                                           void *opaque, Error **errp)
+{
+    uint64_t pxm_count = (uintptr_t) opaque;
+    visit_type_uint64(v, name, &pxm_count, errp);
+}
+
 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
 {
     Error *err = NULL;
@@ -2843,6 +2861,53 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }
 
+static int vfio_pci_nvidia_dev_mem_probe(VFIOPCIDevice *vPciDev,
+                                         Error **errp)
+{
+    unsigned int num_nodes;
+    MemoryRegion *nv2mr = g_malloc0(sizeof(*nv2mr));
+    Object *obj = NULL;
+    VFIODevice *vdev = &vPciDev->vbasedev;
+    MachineState *ms = MACHINE(qdev_get_machine());
+
+    if (!vfio_has_cpu_coherent_devmem(vPciDev)) {
+        return -ENODEV;
+    }
+
+    if (vdev->type == VFIO_DEVICE_TYPE_PCI) {
+        obj = vfio_pci_get_object(vdev);
+    }
+
+    if (!obj) {
+        return -EINVAL;
+    }
+
+    /*
+     * This device has memory that is coherently accessible from the CPU.
+     * The memory can be represented by up to 8 separate memory-only
+     * NUMA nodes.
+     */
+    vPciDev->pdev.has_coherent_memory = true;
+    num_nodes = 8;
+
+    /*
+     * To have 8 unique nodes in the VM, a series of PXM nodes are
+     * required to be added to VM's SRAT. Send the information about
+     * the starting PXM ID and the count to the ACPI builder code.
+     */
+    object_property_add(OBJECT(vPciDev), "gpu_mem_pxm_start", "uint64",
+                        vfio_pci_get_gpu_mem_pxm_start, NULL, NULL,
+                        (void *) (uintptr_t) ms->numa_state->num_nodes);
+
+    object_property_add(OBJECT(vPciDev), "gpu_mem_pxm_count", "uint64",
+                        vfio_pci_get_gpu_mem_pxm_count, NULL, NULL,
+                        (void *) (uintptr_t) num_nodes);
+
+    ms->numa_state->num_nodes += num_nodes;
+
+    return 0;
+}
+
 static void vfio_realize(PCIDevice *pdev, Error **errp)
 {
     VFIOPCIDevice *vdev = VFIO_PCI(pdev);
@@ -3151,6 +3216,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
         }
     }
 
+    if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) {
+        ret = vfio_pci_nvidia_dev_mem_probe(vdev, errp);
+        if (ret && ret != -ENODEV) {
+            error_report("Failed to setup NVIDIA dev_mem with error %d", ret);
+        }
+    }
+
     vfio_register_err_notifier(vdev);
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 177abcc8fb..d8791f8f1f 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -226,4 +226,5 @@ void vfio_display_reset(VFIOPCIDevice *vdev);
 int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
 void vfio_display_finalize(VFIOPCIDevice *vdev);
 
+bool vfio_has_cpu_coherent_devmem(VFIOPCIDevice *vdev);
 #endif /* HW_VFIO_VFIO_PCI_H */
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index d3dd0f64b2..aacd2279ae 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -157,6 +157,9 @@ struct PCIDevice {
     MSIVectorReleaseNotifier msix_vector_release_notifier;
     MSIVectorPollNotifier msix_vector_poll_notifier;
 
+    /* GPU coherent memory */
+    bool has_coherent_memory;
+
     /* ID of standby device in net_failover pair */
     char *failover_pair_id;
     uint32_t acpi_index;
-- 
2.17.1




* [RFC v1 2/4] qemu: patch guest SRAT for GPU memory
  2023-06-05 23:50 [RFC v1 0/4] Expose GPU memory as coherently CPU accessible ankita
  2023-06-05 23:50 ` [RFC v1 1/4] qemu: add GPU memory information as object ankita
@ 2023-06-05 23:50 ` ankita
  2023-06-06  4:58   ` Philippe Mathieu-Daudé
  2023-06-05 23:50 ` [RFC v1 3/4] qemu: patch guest DSDT " ankita
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: ankita @ 2023-06-05 23:50 UTC (permalink / raw)
  To: ankita, jgg, alex.williamson; +Cc: qemu-devel

From: Ankit Agrawal <ankita@nvidia.com>

The guest VM adds the GPU memory as (up to 8) separate memory-less NUMA
nodes. The ACPI SRAT thus needs to insert the proximity domains and tag
them as MEM_AFFINITY_HOTPLUGGABLE. The VM kernel can then parse the SRAT
and create the NUMA nodes.

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 hw/arm/virt-acpi-build.c | 54 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 4156111d49..42f76752b4 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -45,6 +45,7 @@
 #include "hw/acpi/hmat.h"
 #include "hw/pci/pcie_host.h"
 #include "hw/pci/pci.h"
+#include "hw/vfio/pci.h"
 #include "hw/pci/pci_bus.h"
 #include "hw/pci-host/gpex.h"
 #include "hw/arm/virt.h"
@@ -514,6 +515,57 @@ build_spcr(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
     acpi_table_end(linker, &table);
 }
 
+static int devmem_device_list(Object *obj, void *opaque)
+{
+    GSList **list = opaque;
+
+    if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
+        *list = g_slist_append(*list, DEVICE(obj));
+    }
+
+    object_child_foreach(obj, devmem_device_list, opaque);
+    return 0;
+}
+
+static GSList *devmem_get_device_list(void)
+{
+    GSList *list = NULL;
+
+    object_child_foreach(qdev_get_machine(), devmem_device_list, &list);
+    return list;
+}
+
+static void build_srat_devmem(GArray *table_data)
+{
+    GSList *device_list, *list = devmem_get_device_list();
+
+    for (device_list = list; device_list; device_list = device_list->next) {
+        DeviceState *dev = device_list->data;
+        Object *obj = OBJECT(dev);
+        VFIOPCIDevice *pcidev
+            = ((VFIOPCIDevice *)object_dynamic_cast(OBJECT(obj),
+               TYPE_VFIO_PCI));
+
+        if (pcidev->pdev.has_coherent_memory) {
+            uint64_t start_node = object_property_get_uint(obj,
+                                  "gpu_mem_pxm_start", &error_abort);
+            uint64_t node_count = object_property_get_uint(obj,
+                                  "gpu_mem_pxm_count", &error_abort);
+            uint64_t node_index;
+
+            /*
+             * Add the node_count PXM domains starting from start_node as
+             * hot pluggable. The VM kernel parses the PXM domains and
+             * creates NUMA nodes.
+             */
+            for (node_index = 0; node_index < node_count; node_index++)
+                build_srat_memory(table_data, 0, 0, start_node + node_index,
+                    MEM_AFFINITY_ENABLED | MEM_AFFINITY_HOTPLUGGABLE);
+        }
+    }
+    g_slist_free(list);
+}
+
 /*
  * ACPI spec, Revision 5.1
  * 5.2.16 System Resource Affinity Table (SRAT)
@@ -568,6 +620,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
                           MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
     }
 
+    build_srat_devmem(table_data);
+
     acpi_table_end(linker, &table);
 }
 
-- 
2.17.1




* [RFC v1 3/4] qemu: patch guest DSDT for GPU memory
  2023-06-05 23:50 [RFC v1 0/4] Expose GPU memory as coherently CPU accessible ankita
  2023-06-05 23:50 ` [RFC v1 1/4] qemu: add GPU memory information as object ankita
  2023-06-05 23:50 ` [RFC v1 2/4] qemu: patch guest SRAT for GPU memory ankita
@ 2023-06-05 23:50 ` ankita
  2023-06-05 23:50 ` [RFC v1 4/4] qemu: adjust queried bar size to power-of-2 ankita
  2023-06-06 14:54 ` [RFC v1 0/4] Expose GPU memory as coherently CPU accessible Cédric Le Goater
  4 siblings, 0 replies; 11+ messages in thread
From: ankita @ 2023-06-05 23:50 UTC (permalink / raw)
  To: ankita, jgg, alex.williamson; +Cc: qemu-devel

From: Ankit Agrawal <ankita@nvidia.com>

To add the memory in the guest as NUMA nodes, the guest needs the
starting PXM node index and the total count of nodes associated with
the memory. This range of proximity domains is communicated to the VM
as part of the guest ACPI using the nvidia,gpu-mem-pxm-start and
nvidia,gpu-mem-pxm-count DSD properties, which represent the starting
proximity domain ID and the count, respectively. Kernel modules can
then fetch this information and determine the NUMA node ID using
pxm_to_node().
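
As an illustration only (the guest driver is not part of this series,
and the function name below is hypothetical), a guest kernel module
could consume the properties roughly as follows:

    #include <linux/acpi.h>
    #include <linux/pci.h>
    #include <linux/property.h>

    /* Return the first NUMA node backing the coherent GPU memory. */
    static int gpu_mem_first_node(struct pci_dev *pdev)
    {
        u64 pxm_start, pxm_count;
        int ret;

        ret = device_property_read_u64(&pdev->dev,
                                       "nvidia,gpu-mem-pxm-start",
                                       &pxm_start);
        if (ret)
            return ret;

        ret = device_property_read_u64(&pdev->dev,
                                       "nvidia,gpu-mem-pxm-count",
                                       &pxm_count);
        if (ret)
            return ret;

        dev_info(&pdev->dev, "GPU memory: %llu proximity domains from %llu\n",
                 pxm_count, pxm_start);

        /* Translate the starting proximity domain to a Linux node id. */
        return pxm_to_node(pxm_start);
    }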

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 hw/pci-host/gpex-acpi.c | 71 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c
index 7c7316bc96..36873889c1 100644
--- a/hw/pci-host/gpex-acpi.c
+++ b/hw/pci-host/gpex-acpi.c
@@ -49,6 +49,74 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq)
     }
 }
 
+static void acpi_dsdt_add_gpu(Aml *dev, int32_t devfn,
+                             uint64_t gpu_mem_pxm_start,
+                             uint64_t gpu_mem_pxm_count)
+{
+    Aml *dev_gpu = aml_device("GPU%X", PCI_SLOT(devfn));
+    Aml *pkg = aml_package(2);
+    Aml *pkg1 = aml_package(2);
+    Aml *pkg2 = aml_package(2);
+    Aml *dev_pkg = aml_package(2);
+    Aml *UUID;
+
+    aml_append(dev_gpu, aml_name_decl("_ADR", aml_int(PCI_SLOT(devfn) << 16)));
+
+    aml_append(pkg1, aml_string("nvidia,gpu-mem-pxm-start"));
+    aml_append(pkg1, aml_int(gpu_mem_pxm_start));
+
+    aml_append(pkg2, aml_string("nvidia,gpu-mem-pxm-count"));
+    aml_append(pkg2, aml_int(gpu_mem_pxm_count));
+
+    aml_append(pkg, pkg1);
+    aml_append(pkg, pkg2);
+
+    UUID = aml_touuid("DAFFD814-6EBA-4D8C-8A91-BC9BBF4AA301");
+    aml_append(dev_pkg, UUID);
+    aml_append(dev_pkg, pkg);
+
+    aml_append(dev_gpu, aml_name_decl("_DSD", dev_pkg));
+    aml_append(dev, dev_gpu);
+}
+
+static void find_hbm_device(PCIBus *bus, PCIDevice *pdev,
+                            void *opaque)
+{
+    Aml *dev = (Aml *)opaque;
+    uint32_t vendor_id = pci_default_read_config(pdev, PCI_VENDOR_ID, 2);
+
+    if (bus == NULL) {
+        return;
+    }
+
+    if (vendor_id == PCI_VENDOR_ID_NVIDIA &&
+        pdev->has_coherent_memory) {
+        Object *po = OBJECT(pdev);
+
+        if (po == NULL) {
+            return;
+        }
+
+        uint64_t pxm_start
+           = object_property_get_uint(po, "gpu_mem_pxm_start", NULL);
+        uint64_t pxm_count
+           = object_property_get_uint(po, "gpu_mem_pxm_count", NULL);
+
+        acpi_dsdt_add_gpu(dev, pdev->devfn, pxm_start, pxm_count);
+    }
+}
+
+static void acpi_dsdt_find_and_add_gpu(PCIBus *bus, Aml *dev)
+{
+    if (bus == NULL) {
+        return;
+    }
+
+    pci_for_each_device_reverse(bus, pci_bus_num(bus),
+                                find_hbm_device, dev);
+
+}
+
 static void acpi_dsdt_add_pci_osc(Aml *dev)
 {
     Aml *method, *UUID, *ifctx, *ifctx1, *elsectx, *buf;
@@ -207,7 +275,10 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg)
 
     acpi_dsdt_add_pci_route_table(dev, cfg->irq);
 
+    acpi_dsdt_find_and_add_gpu(cfg->bus, dev);
+
     method = aml_method("_CBA", 0, AML_NOTSERIALIZED);
+
     aml_append(method, aml_return(aml_int(cfg->ecam.base)));
     aml_append(dev, method);
 
-- 
2.17.1




* [RFC v1 4/4] qemu: adjust queried bar size to power-of-2
  2023-06-05 23:50 [RFC v1 0/4] Expose GPU memory as coherently CPU accessible ankita
                   ` (2 preceding siblings ...)
  2023-06-05 23:50 ` [RFC v1 3/4] qemu: patch guest DSDT " ankita
@ 2023-06-05 23:50 ` ankita
  2023-06-06  5:03   ` Philippe Mathieu-Daudé
  2023-06-06 12:54   ` Alex Williamson
  2023-06-06 14:54 ` [RFC v1 0/4] Expose GPU memory as coherently CPU accessible Cédric Le Goater
  4 siblings, 2 replies; 11+ messages in thread
From: ankita @ 2023-06-05 23:50 UTC (permalink / raw)
  To: ankita, jgg, alex.williamson; +Cc: qemu-devel

From: Ankit Agrawal <ankita@nvidia.com>

The GPU device memory is reported to the VM as a BAR. The device memory
size may not be a power of 2, but QEMU expects PCI BAR sizes to be.
Align the reported device memory size up to the next power of 2 before
QEMU does the mmap.
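
For example (the size here is purely illustrative), using pow2ceil()
from "qemu/host-utils.h":

    #include "qemu/osdep.h"
    #include "qemu/units.h"
    #include "qemu/host-utils.h"

    /* A 240 GiB device memory region is reported as a 256 GiB BAR. */
    uint64_t devmem_size = 240 * GiB;             /* 0x3c00000000 */
    uint64_t bar_size    = pow2ceil(devmem_size); /* 0x4000000000 */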

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 hw/vfio/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 4d01ea3515..bb49200458 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -2061,7 +2061,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
 
     region->vbasedev = vbasedev;
     region->flags = info->flags;
-    region->size = info->size;
+    region->size = info->size ? pow2ceil(info->size) : info->size;
     region->fd_offset = info->offset;
     region->nr = index;
 
-- 
2.17.1




* Re: [RFC v1 2/4] qemu: patch guest SRAT for GPU memory
  2023-06-05 23:50 ` [RFC v1 2/4] qemu: patch guest SRAT for GPU memory ankita
@ 2023-06-06  4:58   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 11+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-06-06  4:58 UTC (permalink / raw)
  To: ankita, jgg, alex.williamson; +Cc: qemu-devel, qemu-arm

On 6/6/23 01:50, ankita@nvidia.com wrote:
> From: Ankit Agrawal <ankita@nvidia.com>
> 
> The guest VM adds the GPU memory as (up to 8) separate memory-less NUMA
> nodes. The ACPI SRAT thus needs to insert the proximity domains and tag
> them as MEM_AFFINITY_HOTPLUGGABLE. The VM kernel can then parse the SRAT
> and create the NUMA nodes.
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>   hw/arm/virt-acpi-build.c | 54 ++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 54 insertions(+)
> 
> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c


>   /*
>    * ACPI spec, Revision 5.1
>    * 5.2.16 System Resource Affinity Table (SRAT)
> @@ -568,6 +620,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)

There is a x86 build_srat() equivalent.

So some abstraction in hw/acpi/srat.c is possible.

>       }
>   
> +    build_srat_devmem(table_data);
> +
>       acpi_table_end(linker, &table);
>   }
>   




* Re: [RFC v1 4/4] qemu: adjust queried bar size to power-of-2
  2023-06-05 23:50 ` [RFC v1 4/4] qemu: adjust queried bar size to power-of-2 ankita
@ 2023-06-06  5:03   ` Philippe Mathieu-Daudé
  2023-06-06 12:54   ` Alex Williamson
  1 sibling, 0 replies; 11+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-06-06  5:03 UTC (permalink / raw)
  To: ankita, jgg, alex.williamson; +Cc: qemu-devel, Cédric Le Goater

On 6/6/23 01:50, ankita@nvidia.com wrote:
> From: Ankit Agrawal <ankita@nvidia.com>
> 
> The GPU device memory is reported to the VM as a BAR. The device memory
> size may not be a power of 2, but QEMU expects PCI BAR sizes to be.
> Align the reported device memory size up to the next power of 2 before
> QEMU does the mmap.
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>   hw/vfio/common.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 4d01ea3515..bb49200458 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -2061,7 +2061,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
>   
>       region->vbasedev = vbasedev;
>       region->flags = info->flags;
> -    region->size = info->size;
> +    region->size = info->size ? pow2ceil(info->size) : info->size;

        region->size = [REAL_]HOST_PAGE_ALIGN(info->size)?

>       region->fd_offset = info->offset;
>       region->nr = index;
>   




* Re: [RFC v1 4/4] qemu: adjust queried bar size to power-of-2
  2023-06-05 23:50 ` [RFC v1 4/4] qemu: adjust queried bar size to power-of-2 ankita
  2023-06-06  5:03   ` Philippe Mathieu-Daudé
@ 2023-06-06 12:54   ` Alex Williamson
  2023-06-06 14:19     ` Philippe Mathieu-Daudé
  1 sibling, 1 reply; 11+ messages in thread
From: Alex Williamson @ 2023-06-06 12:54 UTC (permalink / raw)
  To: ankita; +Cc: jgg, qemu-devel

On Mon, 5 Jun 2023 16:50:05 -0700
<ankita@nvidia.com> wrote:

> From: Ankit Agrawal <ankita@nvidia.com>
> 
> The GPU device memory is reported to the VM as a BAR. The device memory
> size may not be a power of 2, but QEMU expects PCI BAR sizes to be.
> Align the reported device memory size up to the next power of 2 before
> QEMU does the mmap.
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  hw/vfio/common.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 4d01ea3515..bb49200458 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -2061,7 +2061,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
>  
>      region->vbasedev = vbasedev;
>      region->flags = info->flags;
> -    region->size = info->size;
> +    region->size = info->size ? pow2ceil(info->size) : info->size;
>      region->fd_offset = info->offset;
>      region->nr = index;
>  

Nak, this means our kernel emulation of the BAR is broken, a BAR that
is not naturally aligned is not a PCI BAR.  PCI BAR sizing through the
BAR register still needs to work via the kernel interface alone.  It's
clear now how the kernel resizing the vma on mmap was a hack around
userspace mangling the region size.

Maybe this needs to be exposed as a device specific region, which then
userspace emulates as a BAR for the VM facing device rather than the
kernel emulating it as a BAR.  Thanks,

Alex
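
For reference, a device specific region would be advertised by the
variant driver through a vendor-type region capability and looked up on
the QEMU side with vfio_get_dev_region_info(). A rough sketch (the
subtype value below is hypothetical and would have to be defined by the
variant driver's uAPI):

    struct vfio_region_info *devmem = NULL;

    if (!vfio_get_dev_region_info(&vdev->vbasedev,
                                  VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
                                  PCI_VENDOR_ID_NVIDIA,
                                  0x1 /* hypothetical: coherent devmem */,
                                  &devmem)) {
        /*
         * QEMU, rather than the kernel, would then round up the size
         * and expose this region to the guest as a BAR.
         */
        g_free(devmem);
    }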




* Re: [RFC v1 4/4] qemu: adjust queried bar size to power-of-2
  2023-06-06 12:54   ` Alex Williamson
@ 2023-06-06 14:19     ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 11+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-06-06 14:19 UTC (permalink / raw)
  To: Alex Williamson, ankita; +Cc: jgg, qemu-devel

On 6/6/23 14:54, Alex Williamson wrote:
> On Mon, 5 Jun 2023 16:50:05 -0700
> <ankita@nvidia.com> wrote:
> 
>> From: Ankit Agrawal <ankita@nvidia.com>
>>
>> The GPU device memory is reported to the VM as a BAR. The device memory
>> size may not be a power of 2, but QEMU expects PCI BAR sizes to be.
>> Align the reported device memory size up to the next power of 2 before
>> QEMU does the mmap.
>>
>> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
>> ---
>>   hw/vfio/common.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 4d01ea3515..bb49200458 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -2061,7 +2061,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
>>   
>>       region->vbasedev = vbasedev;
>>       region->flags = info->flags;
>> -    region->size = info->size;
>> +    region->size = info->size ? pow2ceil(info->size) : info->size;
>>       region->fd_offset = info->offset;
>>       region->nr = index;
>>   
> 
> Nak, this means our kernel emulation of the BAR is broken, a BAR that
> is not naturally aligned is not a PCI BAR.

Right. So the common code could check this value is correct, like:

   assert(is_power_of_2(->size));

Or, less violently, using error_report() :)
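
A sketch of the error_report() variant in vfio_region_setup(), purely
illustrative (a real check would presumably have to be restricted to
BAR regions, since other region types need not be power-of-2 sized):

    if (info->size && !is_power_of_2(info->size)) {
        error_report("vfio: region %d size 0x%" PRIx64 " is not a power of 2",
                     index, (uint64_t)info->size);
        return -EINVAL;
    }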



* Re: [RFC v1 0/4] Expose GPU memory as coherently CPU accessible
  2023-06-05 23:50 [RFC v1 0/4] Expose GPU memory as coherently CPU accessible ankita
                   ` (3 preceding siblings ...)
  2023-06-05 23:50 ` [RFC v1 4/4] qemu: adjust queried bar size to power-of-2 ankita
@ 2023-06-06 14:54 ` Cédric Le Goater
  4 siblings, 0 replies; 11+ messages in thread
From: Cédric Le Goater @ 2023-06-06 14:54 UTC (permalink / raw)
  To: ankita, jgg, alex.williamson; +Cc: qemu-devel

Hello Ankit,

On 6/6/23 01:50, ankita@nvidia.com wrote:
> From: Ankit Agrawal <ankita@nvidia.com>
> 
> NVIDIA is building systems that allow the CPU to coherently access
> GPU memory. This GPU device memory can be added to and managed by the
> kernel memory manager. This series holds the changes required in QEMU
> to expose this memory to VMs that have the device assigned.
> 
> The GPU device memory region is exposed as device BAR1 and QEMU mmaps
> it. QEMU then adds new proximity domains to represent the memory in the
> VM's ACPI SRAT. This allows the device memory to be added as separate
> NUMA nodes inside the VM. The proximity domains (PXM) are passed to the
> VM using ACPI DSD properties to help VM kernel modules add the memory.
> 
> Current Linux cannot create NUMA nodes on the fly, so enough NUMA nodes
> need to be created in ACPI for them to be available at VM boot time. The
> physical platform firmware provides 8 NUMA nodes, which QEMU emulates
> here.
> 
> A new vfio-pci variant driver is added to manage the device memory and
> report it as a BAR. The corresponding kernel-side changes, including the
> new vfio-pci variant driver, are under review.
> Ref: https://lore.kernel.org/lkml/20230405180134.16932-1-ankita@nvidia.com/
> 
> Applied over v8.0.2.
> 
> Ankit Agrawal (4):
>    qemu: add GPU memory information as object
>    qemu: patch guest SRAT for GPU memory
>    qemu: patch guest DSDT for GPU memory
>    qemu: adjust queried bar size to power-of-2


Please use the "vfio:" subject prefix when modifying the hw/vfio files.
If you are not sure and want to know the current practice, simply run:
    
   git log --pretty=oneline  <files>

Also, to find out whom the series should be sent to, please use:

   ./scripts/get_maintainer.pl <patches>

Thanks,

C.





* Re: [RFC v1 1/4] qemu: add GPU memory information as object
  2023-06-05 23:50 ` [RFC v1 1/4] qemu: add GPU memory information as object ankita
@ 2023-06-06 15:19   ` Alex Williamson
  0 siblings, 0 replies; 11+ messages in thread
From: Alex Williamson @ 2023-06-06 15:19 UTC (permalink / raw)
  To: ankita; +Cc: jgg, qemu-devel

On Mon, 5 Jun 2023 16:50:02 -0700
<ankita@nvidia.com> wrote:

> From: Ankit Agrawal <ankita@nvidia.com>
> 
> The GPU memory is exposed as device BAR1 to the VM and is discovered
> by QEMU through the VFIO_DEVICE_GET_REGION_INFO ioctl. QEMU then mmaps
> it.
> 
> The GPU memory can be added to the VM as (up to 8) separate NUMA nodes.
> To achieve this, QEMU inserts a series of PXM domains in the SRAT and
> communicates this range of nodes to the VM through DSD properties.
> 
> The PXM start and count are added as object properties and passed to
> the SRAT and DSDT builder code.
> 
> The code is activated only for a set of NVIDIA devices supporting the
> feature.
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  hw/vfio/pci-quirks.c        | 13 +++++++
>  hw/vfio/pci.c               | 72 +++++++++++++++++++++++++++++++++++++
>  hw/vfio/pci.h               |  1 +
>  include/hw/pci/pci_device.h |  3 ++
>  4 files changed, 89 insertions(+)
> 
> diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
> index f0147a050a..b7334ccd1d 100644
> --- a/hw/vfio/pci-quirks.c
> +++ b/hw/vfio/pci-quirks.c
> @@ -1751,3 +1751,16 @@ int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
>  
>      return 0;
>  }
> +
> +bool vfio_has_cpu_coherent_devmem(VFIOPCIDevice *vdev)
> +{
> +    switch (vdev->device_id) {
> +    /* Nvidia */
> +    case 0x2342:
> +    case 0x2343:
> +    case 0x2345:
> +        return true;
> +    }
> +
> +    return false;
> +}

I'm not sure why all of this isn't in pci-quirks.c, but the above
function is misleadingly NVIDIA specific by not testing the vendor ID
here.
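
Something along these lines would make it explicit (sketch only):

    bool vfio_has_cpu_coherent_devmem(VFIOPCIDevice *vdev)
    {
        if (vdev->vendor_id != PCI_VENDOR_ID_NVIDIA) {
            return false;
        }

        switch (vdev->device_id) {
        case 0x2342:
        case 0x2343:
        case 0x2345:
            return true;
        }

        return false;
    }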

Also, none of this looks compatible with hotplug, so shouldn't any of
this be enabled only for the vfio-pci-nohotplug device type?
Thanks,

Alex



