* [PATCH v4] spapr: Add a new level of NUMA for GPUs
@ 2020-07-16 22:56 Reza Arbab
2020-07-17 0:07 ` David Gibson
0 siblings, 1 reply; 2+ messages in thread
From: Reza Arbab @ 2020-07-16 22:56 UTC (permalink / raw)
To: David Gibson, qemu-ppc, qemu-devel
Cc: Alexey Kardashevskiy, Daniel Henrique Barboza,
Daniel Henrique Barboza, Greg Kurz,
Leonardo Augusto Guimaraes Garcia
NUMA nodes corresponding to GPU memory currently have the same
affinity/distance as normal memory nodes. Add a third NUMA associativity
reference point enabling us to give GPU nodes more distance.
This is guest-visible information, which shouldn't change under a
running guest across migration between different QEMU versions, so make
the change effective only in new (pseries > 5.0) machine types.
Before, `numactl -H` output in a guest with 4 GPUs (nodes 2-5):
node distances:
node 0 1 2 3 4 5
0: 10 40 40 40 40 40
1: 40 10 40 40 40 40
2: 40 40 10 40 40 40
3: 40 40 40 10 40 40
4: 40 40 40 40 10 40
5: 40 40 40 40 40 10
After:
node distances:
node 0 1 2 3 4 5
0: 10 40 80 80 80 80
1: 40 10 80 80 80 80
2: 80 80 10 80 80 80
3: 80 80 80 10 80 80
4: 80 80 80 80 10 80
5: 80 80 80 80 80 10
These are the same distances as on the host, mirroring the change made
to host firmware in skiboot commit f845a648b8cb ("numa/associativity:
Add a new level of NUMA for GPU's").
Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
---
v4:
* Use nvslot->numa_id for distinction at all levels of ibm,associativity
* Use ARRAY_SIZE(refpoints)
* Rebase
v3:
* Squash into one patch
* Add PHB compat property
---
hw/ppc/spapr.c | 21 +++++++++++++++++++--
hw/ppc/spapr_pci.c | 2 ++
hw/ppc/spapr_pci_nvlink2.c | 13 ++++++++++---
include/hw/pci-host/spapr.h | 1 +
include/hw/ppc/spapr.h | 1 +
5 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 299908cc7396..0ae293ec9431 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -890,10 +890,16 @@ static int spapr_dt_rng(void *fdt)
static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
{
MachineState *ms = MACHINE(spapr);
+ SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
int rtas;
GString *hypertas = g_string_sized_new(256);
GString *qemu_hypertas = g_string_sized_new(256);
- uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) };
+ uint32_t refpoints[] = {
+ cpu_to_be32(0x4),
+ cpu_to_be32(0x4),
+ cpu_to_be32(0x2),
+ };
+ uint32_t nr_refpoints = ARRAY_SIZE(refpoints);
uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
memory_region_size(&MACHINE(spapr)->device_memory->mr);
uint32_t lrdr_capacity[] = {
@@ -945,8 +951,12 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
qemu_hypertas->str, qemu_hypertas->len));
g_string_free(qemu_hypertas, TRUE);
+ if (smc->pre_5_1_assoc_refpoints) {
+ nr_refpoints = 2;
+ }
+
_FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
- refpoints, sizeof(refpoints)));
+ refpoints, nr_refpoints * sizeof(refpoints[0])));
_FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
maxdomains, sizeof(maxdomains)));
@@ -4584,9 +4594,16 @@ DEFINE_SPAPR_MACHINE(5_1, "5.1", true);
*/
static void spapr_machine_5_0_class_options(MachineClass *mc)
{
+ SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
+ static GlobalProperty compat[] = {
+ { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
+ };
+
spapr_machine_5_1_class_options(mc);
compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
+ compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
mc->numa_mem_supported = true;
+ smc->pre_5_1_assoc_refpoints = true;
}
DEFINE_SPAPR_MACHINE(5_0, "5.0", false);
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 2a6a48744aaa..16739334e35f 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -2035,6 +2035,8 @@ static Property spapr_phb_properties[] = {
pcie_ecs, true),
DEFINE_PROP_UINT64("gpa", SpaprPhbState, nv2_gpa_win_addr, 0),
DEFINE_PROP_UINT64("atsd", SpaprPhbState, nv2_atsd_win_addr, 0),
+ DEFINE_PROP_BOOL("pre-5.1-associativity", SpaprPhbState,
+ pre_5_1_assoc, false),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c
index dd8cd6db9654..76ae77ebc851 100644
--- a/hw/ppc/spapr_pci_nvlink2.c
+++ b/hw/ppc/spapr_pci_nvlink2.c
@@ -362,9 +362,9 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
&error_abort);
uint32_t associativity[] = {
cpu_to_be32(0x4),
- SPAPR_GPU_NUMA_ID,
- SPAPR_GPU_NUMA_ID,
- SPAPR_GPU_NUMA_ID,
+ cpu_to_be32(nvslot->numa_id),
+ cpu_to_be32(nvslot->numa_id),
+ cpu_to_be32(nvslot->numa_id),
cpu_to_be32(nvslot->numa_id)
};
uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL);
@@ -375,6 +375,13 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
_FDT(off);
_FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
_FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));
+
+ if (sphb->pre_5_1_assoc) {
+ associativity[1] = SPAPR_GPU_NUMA_ID;
+ associativity[2] = SPAPR_GPU_NUMA_ID;
+ associativity[3] = SPAPR_GPU_NUMA_ID;
+ }
+
_FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
sizeof(associativity))));
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 8877ff51fbf7..600eb55c3488 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -94,6 +94,7 @@ struct SpaprPhbState {
hwaddr nv2_gpa_win_addr;
hwaddr nv2_atsd_win_addr;
SpaprPhbPciNvGpuConfig *nvgpus;
+ bool pre_5_1_assoc;
};
#define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index c421410e3fb8..3134d339e8fe 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -129,6 +129,7 @@ struct SpaprMachineClass {
bool linux_pci_probe;
bool smp_threads_vsmt; /* set VSMT to smp_threads by default */
hwaddr rma_limit; /* clamp the RMA to this size */
+ bool pre_5_1_assoc_refpoints;
void (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
uint64_t *buid, hwaddr *pio,
--
2.18.2
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH v4] spapr: Add a new level of NUMA for GPUs
2020-07-16 22:56 [PATCH v4] spapr: Add a new level of NUMA for GPUs Reza Arbab
@ 2020-07-17 0:07 ` David Gibson
0 siblings, 0 replies; 2+ messages in thread
From: David Gibson @ 2020-07-17 0:07 UTC (permalink / raw)
To: Reza Arbab
Cc: Leonardo Augusto Guimaraes Garcia, Alexey Kardashevskiy,
Daniel Henrique Barboza, Daniel Henrique Barboza, qemu-devel,
Greg Kurz, qemu-ppc
[-- Attachment #1: Type: text/plain, Size: 7282 bytes --]
On Thu, Jul 16, 2020 at 05:56:55PM -0500, Reza Arbab wrote:
> NUMA nodes corresponding to GPU memory currently have the same
> affinity/distance as normal memory nodes. Add a third NUMA associativity
> reference point enabling us to give GPU nodes more distance.
>
> This is guest visible information, which shouldn't change under a
> running guest across migration between different qemu versions, so make
> the change effective only in new (pseries > 5.0) machine types.
>
> Before, `numactl -H` output in a guest with 4 GPUs (nodes 2-5):
>
> node distances:
> node 0 1 2 3 4 5
> 0: 10 40 40 40 40 40
> 1: 40 10 40 40 40 40
> 2: 40 40 10 40 40 40
> 3: 40 40 40 10 40 40
> 4: 40 40 40 40 10 40
> 5: 40 40 40 40 40 10
>
> After:
>
> node distances:
> node 0 1 2 3 4 5
> 0: 10 40 80 80 80 80
> 1: 40 10 80 80 80 80
> 2: 80 80 10 80 80 80
> 3: 80 80 80 10 80 80
> 4: 80 80 80 80 10 80
> 5: 80 80 80 80 80 10
>
> These are the same distances as on the host, mirroring the change made
> to host firmware in skiboot commit f845a648b8cb ("numa/associativity:
> Add a new level of NUMA for GPU's").
Applied to ppc-for-5.1.
>
> Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
> ---
> v4:
> * Use nvslot->numa_id for distinction at all levels of ibm,associativity
> * Use ARRAY_SIZE(refpoints)
> * Rebase
>
> v3:
> * Squash into one patch
> * Add PHB compat property
> ---
> hw/ppc/spapr.c | 21 +++++++++++++++++++--
> hw/ppc/spapr_pci.c | 2 ++
> hw/ppc/spapr_pci_nvlink2.c | 13 ++++++++++---
> include/hw/pci-host/spapr.h | 1 +
> include/hw/ppc/spapr.h | 1 +
> 5 files changed, 33 insertions(+), 5 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 299908cc7396..0ae293ec9431 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -890,10 +890,16 @@ static int spapr_dt_rng(void *fdt)
> static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
> {
> MachineState *ms = MACHINE(spapr);
> + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
> int rtas;
> GString *hypertas = g_string_sized_new(256);
> GString *qemu_hypertas = g_string_sized_new(256);
> - uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) };
> + uint32_t refpoints[] = {
> + cpu_to_be32(0x4),
> + cpu_to_be32(0x4),
> + cpu_to_be32(0x2),
> + };
> + uint32_t nr_refpoints = ARRAY_SIZE(refpoints);
> uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
> memory_region_size(&MACHINE(spapr)->device_memory->mr);
> uint32_t lrdr_capacity[] = {
> @@ -945,8 +951,12 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
> qemu_hypertas->str, qemu_hypertas->len));
> g_string_free(qemu_hypertas, TRUE);
>
> + if (smc->pre_5_1_assoc_refpoints) {
> + nr_refpoints = 2;
> + }
> +
> _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
> - refpoints, sizeof(refpoints)));
> + refpoints, nr_refpoints * sizeof(refpoints[0])));
>
> _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
> maxdomains, sizeof(maxdomains)));
> @@ -4584,9 +4594,16 @@ DEFINE_SPAPR_MACHINE(5_1, "5.1", true);
> */
> static void spapr_machine_5_0_class_options(MachineClass *mc)
> {
> + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
> + static GlobalProperty compat[] = {
> + { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
> + };
> +
> spapr_machine_5_1_class_options(mc);
> compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
> + compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
> mc->numa_mem_supported = true;
> + smc->pre_5_1_assoc_refpoints = true;
> }
>
> DEFINE_SPAPR_MACHINE(5_0, "5.0", false);
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 2a6a48744aaa..16739334e35f 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -2035,6 +2035,8 @@ static Property spapr_phb_properties[] = {
> pcie_ecs, true),
> DEFINE_PROP_UINT64("gpa", SpaprPhbState, nv2_gpa_win_addr, 0),
> DEFINE_PROP_UINT64("atsd", SpaprPhbState, nv2_atsd_win_addr, 0),
> + DEFINE_PROP_BOOL("pre-5.1-associativity", SpaprPhbState,
> + pre_5_1_assoc, false),
> DEFINE_PROP_END_OF_LIST(),
> };
>
> diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c
> index dd8cd6db9654..76ae77ebc851 100644
> --- a/hw/ppc/spapr_pci_nvlink2.c
> +++ b/hw/ppc/spapr_pci_nvlink2.c
> @@ -362,9 +362,9 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
> &error_abort);
> uint32_t associativity[] = {
> cpu_to_be32(0x4),
> - SPAPR_GPU_NUMA_ID,
> - SPAPR_GPU_NUMA_ID,
> - SPAPR_GPU_NUMA_ID,
> + cpu_to_be32(nvslot->numa_id),
> + cpu_to_be32(nvslot->numa_id),
> + cpu_to_be32(nvslot->numa_id),
> cpu_to_be32(nvslot->numa_id)
> };
> uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL);
> @@ -375,6 +375,13 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
> _FDT(off);
> _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
> _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));
> +
> + if (sphb->pre_5_1_assoc) {
> + associativity[1] = SPAPR_GPU_NUMA_ID;
> + associativity[2] = SPAPR_GPU_NUMA_ID;
> + associativity[3] = SPAPR_GPU_NUMA_ID;
> + }
> +
> _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
> sizeof(associativity))));
>
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 8877ff51fbf7..600eb55c3488 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -94,6 +94,7 @@ struct SpaprPhbState {
> hwaddr nv2_gpa_win_addr;
> hwaddr nv2_atsd_win_addr;
> SpaprPhbPciNvGpuConfig *nvgpus;
> + bool pre_5_1_assoc;
> };
>
> #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index c421410e3fb8..3134d339e8fe 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -129,6 +129,7 @@ struct SpaprMachineClass {
> bool linux_pci_probe;
> bool smp_threads_vsmt; /* set VSMT to smp_threads by default */
> hwaddr rma_limit; /* clamp the RMA to this size */
> + bool pre_5_1_assoc_refpoints;
>
> void (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
> uint64_t *buid, hwaddr *pio,
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2020-07-17 0:09 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-16 22:56 [PATCH v4] spapr: Add a new level of NUMA for GPUs Reza Arbab
2020-07-17 0:07 ` David Gibson
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.