* [PATCH v3] spapr: Add a new level of NUMA for GPUs
@ 2020-05-22 19:53 Reza Arbab
  2020-05-22 20:08 ` Reza Arbab
  2020-05-25  5:05 ` David Gibson
  0 siblings, 2 replies; 9+ messages in thread
From: Reza Arbab @ 2020-05-22 19:53 UTC (permalink / raw)
  To: David Gibson, qemu-ppc, qemu-devel
  Cc: Alexey Kardashevskiy, Daniel Henrique Barboza,
	Leonardo Augusto Guimaraes Garcia, Greg Kurz

NUMA nodes corresponding to GPU memory currently have the same
affinity/distance as normal memory nodes. Add a third NUMA associativity
reference point enabling us to give GPU nodes more distance.

This is guest visible information, which shouldn't change under a
running guest across migration between different qemu versions, so make
the change effective only in new (pseries > 5.0) machine types.

Before, `numactl -H` output in a guest with 4 GPUs (nodes 2-5):

node distances:
node   0   1   2   3   4   5
  0:  10  40  40  40  40  40
  1:  40  10  40  40  40  40
  2:  40  40  10  40  40  40
  3:  40  40  40  10  40  40
  4:  40  40  40  40  10  40
  5:  40  40  40  40  40  10

After:

node distances:
node   0   1   2   3   4   5
  0:  10  40  80  80  80  80
  1:  40  10  80  80  80  80
  2:  80  80  10  80  80  80
  3:  80  80  80  10  80  80
  4:  80  80  80  80  10  80
  5:  80  80  80  80  80  10

These are the same distances as on the host, mirroring the change made
to host firmware in skiboot commit f845a648b8cb ("numa/associativity:
Add a new level of NUMA for GPU's").
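
For illustration (not part of the patch), here is a rough standalone sketch of
how a form-1 affinity guest could turn these reference points into the
distances above. The doubling rule and the normal-memory associativity layout
{ 0x4, 0, 0, 0, node_id } are assumptions loosely modeled on the Linux
implementation in arch/powerpc/mm/numa.c, and GPU_NUMA_ID is taken as 1; none
of it is code from this patch:

/*
 * Toy model: start from a local distance of 10 and double for each
 * associativity-reference-point level at which two nodes' lists differ.
 */
#include <stdio.h>

/* Domain entries 1..4 of ibm,associativity (leading length cell omitted). */
static const unsigned int normal0[4] = { 0, 0, 0, 0 }; /* memory node 0 (assumed) */
static const unsigned int normal1[4] = { 0, 0, 0, 1 }; /* memory node 1 (assumed) */
static const unsigned int gpu2[4]    = { 1, 2, 1, 2 }; /* GPU node 2 (patch layout) */

static int distance(const unsigned int *a, const unsigned int *b,
                    const unsigned int *refpoints, int nr_refpoints)
{
    int d = 10; /* local distance */
    int i;

    for (i = 0; i < nr_refpoints; i++) {
        /* A reference point is a 1-based ordinal into the domain entries. */
        if (a[refpoints[i] - 1] == b[refpoints[i] - 1]) {
            break;
        }
        d *= 2; /* one more level of separation */
    }

    return d;
}

int main(void)
{
    const unsigned int old_ref[] = { 4, 4 };    /* pseries <= 5.0 */
    const unsigned int new_ref[] = { 4, 4, 2 }; /* with this patch */

    printf("old refpoints: mem-mem %d, mem-gpu %d\n",
           distance(normal0, normal1, old_ref, 2),
           distance(normal0, gpu2, old_ref, 2));
    printf("new refpoints: mem-mem %d, mem-gpu %d\n",
           distance(normal0, normal1, new_ref, 3),
           distance(normal0, gpu2, new_ref, 3));

    return 0;
}

Run as-is, this prints 40/40 with the old reference points and 40/80 with the
new ones, matching the tables above.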

Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
---
v3:
* Squash into one patch
* Add PHB compat property
---
 hw/ppc/spapr.c              | 21 +++++++++++++++++++--
 hw/ppc/spapr_pci.c          |  2 ++
 hw/ppc/spapr_pci_nvlink2.c  |  7 ++++++-
 include/hw/pci-host/spapr.h |  1 +
 include/hw/ppc/spapr.h      |  1 +
 5 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index c18eab0a2305..7c304b6c389d 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -889,10 +889,16 @@ static int spapr_dt_rng(void *fdt)
 static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
 {
     MachineState *ms = MACHINE(spapr);
+    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
     int rtas;
     GString *hypertas = g_string_sized_new(256);
     GString *qemu_hypertas = g_string_sized_new(256);
-    uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) };
+    uint32_t refpoints[] = {
+        cpu_to_be32(0x4),
+        cpu_to_be32(0x4),
+        cpu_to_be32(0x2),
+    };
+    uint32_t nr_refpoints = 3;
     uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
         memory_region_size(&MACHINE(spapr)->device_memory->mr);
     uint32_t lrdr_capacity[] = {
@@ -944,8 +950,12 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
                      qemu_hypertas->str, qemu_hypertas->len));
     g_string_free(qemu_hypertas, TRUE);
 
+    if (smc->pre_5_1_assoc_refpoints) {
+        nr_refpoints = 2;
+    }
+
     _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
-                     refpoints, sizeof(refpoints)));
+                     refpoints, nr_refpoints * sizeof(refpoints[0])));
 
     _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
                      maxdomains, sizeof(maxdomains)));
@@ -4607,8 +4617,15 @@ DEFINE_SPAPR_MACHINE(5_1, "5.1", true);
  */
 static void spapr_machine_5_0_class_options(MachineClass *mc)
 {
+    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
+    static GlobalProperty compat[] = {
+        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
+    };
+
     spapr_machine_5_1_class_options(mc);
     compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
+    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
+    smc->pre_5_1_assoc_refpoints = true;
 }
 
 DEFINE_SPAPR_MACHINE(5_0, "5.0", false);
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 61b84a392d65..bcdf1a25ae8b 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -2092,6 +2092,8 @@ static Property spapr_phb_properties[] = {
                      pcie_ecs, true),
     DEFINE_PROP_UINT64("gpa", SpaprPhbState, nv2_gpa_win_addr, 0),
     DEFINE_PROP_UINT64("atsd", SpaprPhbState, nv2_atsd_win_addr, 0),
+    DEFINE_PROP_BOOL("pre-5.1-associativity", SpaprPhbState,
+                     pre_5_1_assoc, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c
index 8332d5694e46..3394ac425eee 100644
--- a/hw/ppc/spapr_pci_nvlink2.c
+++ b/hw/ppc/spapr_pci_nvlink2.c
@@ -362,7 +362,7 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
         uint32_t associativity[] = {
             cpu_to_be32(0x4),
             SPAPR_GPU_NUMA_ID,
-            SPAPR_GPU_NUMA_ID,
+            cpu_to_be32(nvslot->numa_id),
             SPAPR_GPU_NUMA_ID,
             cpu_to_be32(nvslot->numa_id)
         };
@@ -374,6 +374,11 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
         _FDT(off);
         _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
         _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));
+
+        if (sphb->pre_5_1_assoc) {
+            associativity[2] = SPAPR_GPU_NUMA_ID;
+        }
+
         _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
                           sizeof(associativity))));
 
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 8877ff51fbf7..600eb55c3488 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -94,6 +94,7 @@ struct SpaprPhbState {
     hwaddr nv2_gpa_win_addr;
     hwaddr nv2_atsd_win_addr;
     SpaprPhbPciNvGpuConfig *nvgpus;
+    bool pre_5_1_assoc;
 };
 
 #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index e579eaf28c05..8316d9eea405 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -129,6 +129,7 @@ struct SpaprMachineClass {
     bool linux_pci_probe;
     bool smp_threads_vsmt; /* set VSMT to smp_threads by default */
     hwaddr rma_limit;          /* clamp the RMA to this size */
+    bool pre_5_1_assoc_refpoints;
 
     void (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
                           uint64_t *buid, hwaddr *pio, 
-- 
2.18.2




* Re: [PATCH v3] spapr: Add a new level of NUMA for GPUs
  2020-05-22 19:53 [PATCH v3] spapr: Add a new level of NUMA for GPUs Reza Arbab
@ 2020-05-22 20:08 ` Reza Arbab
  2020-05-25  5:06   ` David Gibson
  2020-05-25  5:05 ` David Gibson
  1 sibling, 1 reply; 9+ messages in thread
From: Reza Arbab @ 2020-05-22 20:08 UTC (permalink / raw)
  To: David Gibson, qemu-ppc, qemu-devel
  Cc: Alexey Kardashevskiy, Daniel Henrique Barboza,
	Leonardo Augusto Guimaraes Garcia, Greg Kurz

On Fri, May 22, 2020 at 02:53:33PM -0500, Reza Arbab wrote:
>--- a/hw/ppc/spapr.c
>+++ b/hw/ppc/spapr.c
>@@ -889,10 +889,16 @@ static int spapr_dt_rng(void *fdt)
> static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
> {
>     MachineState *ms = MACHINE(spapr);
>+    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
>     int rtas;
>     GString *hypertas = g_string_sized_new(256);
>     GString *qemu_hypertas = g_string_sized_new(256);
>-    uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) };
>+    uint32_t refpoints[] = {
>+        cpu_to_be32(0x4),
>+        cpu_to_be32(0x4),
>+        cpu_to_be32(0x2),
>+    };
>+    uint32_t nr_refpoints = 3;

Gah, as soon as I hit send I realized this should be

     uint32_t nr_refpoints = ARRAY_SIZE(refpoints);

Can you fixup or should I send a v4?

-- 
Reza Arbab



* Re: [PATCH v3] spapr: Add a new level of NUMA for GPUs
  2020-05-22 19:53 [PATCH v3] spapr: Add a new level of NUMA for GPUs Reza Arbab
  2020-05-22 20:08 ` Reza Arbab
@ 2020-05-25  5:05 ` David Gibson
  2020-05-25 17:49   ` Reza Arbab
  1 sibling, 1 reply; 9+ messages in thread
From: David Gibson @ 2020-05-25  5:05 UTC (permalink / raw)
  To: Reza Arbab
  Cc: Daniel Henrique Barboza, Leonardo Augusto Guimaraes Garcia,
	qemu-ppc, qemu-devel, Greg Kurz


On Fri, May 22, 2020 at 02:53:33PM -0500, Reza Arbab wrote:
> NUMA nodes corresponding to GPU memory currently have the same
> affinity/distance as normal memory nodes. Add a third NUMA associativity
> reference point enabling us to give GPU nodes more distance.
> 
> This is guest visible information, which shouldn't change under a
> running guest across migration between different qemu versions, so make
> the change effective only in new (pseries > 5.0) machine types.
> 
> Before, `numactl -H` output in a guest with 4 GPUs (nodes 2-5):
> 
> node distances:
> node   0   1   2   3   4   5
>   0:  10  40  40  40  40  40
>   1:  40  10  40  40  40  40
>   2:  40  40  10  40  40  40
>   3:  40  40  40  10  40  40
>   4:  40  40  40  40  10  40
>   5:  40  40  40  40  40  10
> 
> After:
> 
> node distances:
> node   0   1   2   3   4   5
>   0:  10  40  80  80  80  80
>   1:  40  10  80  80  80  80
>   2:  80  80  10  80  80  80
>   3:  80  80  80  10  80  80
>   4:  80  80  80  80  10  80
>   5:  80  80  80  80  80  10
> 
> These are the same distances as on the host, mirroring the change made
> to host firmware in skiboot commit f845a648b8cb ("numa/associativity:
> Add a new level of NUMA for GPU's").
> 
> Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
> ---
> v3:
> * Squash into one patch
> * Add PHB compat property
> ---
>  hw/ppc/spapr.c              | 21 +++++++++++++++++++--
>  hw/ppc/spapr_pci.c          |  2 ++
>  hw/ppc/spapr_pci_nvlink2.c  |  7 ++++++-
>  include/hw/pci-host/spapr.h |  1 +
>  include/hw/ppc/spapr.h      |  1 +
>  5 files changed, 29 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index c18eab0a2305..7c304b6c389d 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -889,10 +889,16 @@ static int spapr_dt_rng(void *fdt)
>  static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
>  {
>      MachineState *ms = MACHINE(spapr);
> +    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
>      int rtas;
>      GString *hypertas = g_string_sized_new(256);
>      GString *qemu_hypertas = g_string_sized_new(256);
> -    uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) };
> +    uint32_t refpoints[] = {
> +        cpu_to_be32(0x4),
> +        cpu_to_be32(0x4),
> +        cpu_to_be32(0x2),
> +    };
> +    uint32_t nr_refpoints = 3;
>      uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
>          memory_region_size(&MACHINE(spapr)->device_memory->mr);
>      uint32_t lrdr_capacity[] = {
> @@ -944,8 +950,12 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
>                       qemu_hypertas->str, qemu_hypertas->len));
>      g_string_free(qemu_hypertas, TRUE);
>  
> +    if (smc->pre_5_1_assoc_refpoints) {
> +        nr_refpoints = 2;
> +    }
> +
>      _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
> -                     refpoints, sizeof(refpoints)));
> +                     refpoints, nr_refpoints * sizeof(refpoints[0])));
>  
>      _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains",
>                       maxdomains, sizeof(maxdomains)));
> @@ -4607,8 +4617,15 @@ DEFINE_SPAPR_MACHINE(5_1, "5.1", true);
>   */
>  static void spapr_machine_5_0_class_options(MachineClass *mc)
>  {
> +    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
> +    static GlobalProperty compat[] = {
> +        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
> +    };
> +
>      spapr_machine_5_1_class_options(mc);
>      compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
> +    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
> +    smc->pre_5_1_assoc_refpoints = true;
>  }
>  
>  DEFINE_SPAPR_MACHINE(5_0, "5.0", false);
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 61b84a392d65..bcdf1a25ae8b 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -2092,6 +2092,8 @@ static Property spapr_phb_properties[] = {
>                       pcie_ecs, true),
>      DEFINE_PROP_UINT64("gpa", SpaprPhbState, nv2_gpa_win_addr, 0),
>      DEFINE_PROP_UINT64("atsd", SpaprPhbState, nv2_atsd_win_addr, 0),
> +    DEFINE_PROP_BOOL("pre-5.1-associativity", SpaprPhbState,
> +                     pre_5_1_assoc, false),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c
> index 8332d5694e46..3394ac425eee 100644
> --- a/hw/ppc/spapr_pci_nvlink2.c
> +++ b/hw/ppc/spapr_pci_nvlink2.c
> @@ -362,7 +362,7 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
>          uint32_t associativity[] = {
>              cpu_to_be32(0x4),
>              SPAPR_GPU_NUMA_ID,
> -            SPAPR_GPU_NUMA_ID,
> +            cpu_to_be32(nvslot->numa_id),
>              SPAPR_GPU_NUMA_ID,
>              cpu_to_be32(nvslot->numa_id)


This doesn't look quite right.  In the new case we'll get {
GPU_NUMA_ID, nvslot->numa_id, GPU_NUMA_ID, nvslot->numa_id }.

>          };
> @@ -374,6 +374,11 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
>          _FDT(off);
>          _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
>          _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));
> +
> +        if (sphb->pre_5_1_assoc) {
> +            associativity[2] = SPAPR_GPU_NUMA_ID;
> +        }
> +
>          _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
>                            sizeof(associativity))));
>  
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 8877ff51fbf7..600eb55c3488 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -94,6 +94,7 @@ struct SpaprPhbState {
>      hwaddr nv2_gpa_win_addr;
>      hwaddr nv2_atsd_win_addr;
>      SpaprPhbPciNvGpuConfig *nvgpus;
> +    bool pre_5_1_assoc;
>  };
>  
>  #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index e579eaf28c05..8316d9eea405 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -129,6 +129,7 @@ struct SpaprMachineClass {
>      bool linux_pci_probe;
>      bool smp_threads_vsmt; /* set VSMT to smp_threads by default */
>      hwaddr rma_limit;          /* clamp the RMA to this size */
> +    bool pre_5_1_assoc_refpoints;
>  
>      void (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
>                            uint64_t *buid, hwaddr *pio, 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson



* Re: [PATCH v3] spapr: Add a new level of NUMA for GPUs
  2020-05-22 20:08 ` Reza Arbab
@ 2020-05-25  5:06   ` David Gibson
  0 siblings, 0 replies; 9+ messages in thread
From: David Gibson @ 2020-05-25  5:06 UTC (permalink / raw)
  To: Reza Arbab
  Cc: Daniel Henrique Barboza, Leonardo Augusto Guimaraes Garcia,
	qemu-ppc, qemu-devel, Greg Kurz


On Fri, May 22, 2020 at 03:08:56PM -0500, Reza Arbab wrote:
> On Fri, May 22, 2020 at 02:53:33PM -0500, Reza Arbab wrote:
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -889,10 +889,16 @@ static int spapr_dt_rng(void *fdt)
> > static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
> > {
> >     MachineState *ms = MACHINE(spapr);
> > +    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
> >     int rtas;
> >     GString *hypertas = g_string_sized_new(256);
> >     GString *qemu_hypertas = g_string_sized_new(256);
> > -    uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4) };
> > +    uint32_t refpoints[] = {
> > +        cpu_to_be32(0x4),
> > +        cpu_to_be32(0x4),
> > +        cpu_to_be32(0x2),
> > +    };
> > +    uint32_t nr_refpoints = 3;
> 
> Gah, as soon as I hit send I realized this should be
> 
>     uint32_t nr_refpoints = ARRAY_SIZE(refpoints);
> 
> Can you fixup or should I send a v4?

I had one other comment that needs addressing, so you might as well
send a v4.

> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson



* Re: [PATCH v3] spapr: Add a new level of NUMA for GPUs
  2020-05-25  5:05 ` David Gibson
@ 2020-05-25 17:49   ` Reza Arbab
  2020-07-16  5:04     ` David Gibson
  0 siblings, 1 reply; 9+ messages in thread
From: Reza Arbab @ 2020-05-25 17:49 UTC (permalink / raw)
  To: David Gibson
  Cc: Daniel Henrique Barboza, Leonardo Augusto Guimaraes Garcia,
	qemu-ppc, qemu-devel, Greg Kurz

On Mon, May 25, 2020 at 03:05:50PM +1000, David Gibson wrote:
>On Fri, May 22, 2020 at 02:53:33PM -0500, Reza Arbab wrote:
>> --- a/hw/ppc/spapr_pci_nvlink2.c
>> +++ b/hw/ppc/spapr_pci_nvlink2.c
>> @@ -362,7 +362,7 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
>>          uint32_t associativity[] = {
>>              cpu_to_be32(0x4),
>>              SPAPR_GPU_NUMA_ID,
>> -            SPAPR_GPU_NUMA_ID,
>> +            cpu_to_be32(nvslot->numa_id),
>>              SPAPR_GPU_NUMA_ID,
>>              cpu_to_be32(nvslot->numa_id)
>
>
>This doesn't look quite right.  In the new case we'll get {
>GPU_NUMA_ID, nvslot->numa_id, GPU_NUMA_ID, nvslot->numa_id }.

The associativity reference points are 4 (and now 2), so this is what we 
want. I think what you've noticed is that reference points are 1-based 
ordinals:

	"...the “ibm,associativity-reference-points” property indicates
boundaries between associativity domains presented by the 
“ibm,associativity” property containing “near” and “far” resources. The 
first such boundary in the list represents the 1 based ordinal in the 
associativity lists of the most significant boundary, with subsequent 
entries indicating progressively less significant boundaries."
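
To make that concrete, here's a quick illustrative snippet (not from the
patch, with GPU_NUMA_ID and nvslot->numa_id standing in as 1 and 2 just for
the example) showing which entries the 1-based ordinals in the new refpoints
{ 4, 4, 2 } pick out of the GPU list above:

#include <stdio.h>

int main(void)
{
    /* GPU ibm,associativity as in the patch, leading length cell included. */
    const unsigned int assoc[] = { 4, 1, 2, 1, 2 };
    const unsigned int refpoints[] = { 4, 4, 2 };
    unsigned int i;

    for (i = 0; i < 3; i++) {
        /*
         * Each ordinal counts the domain entries that follow the length
         * cell, so with the length cell at index 0, assoc[refpoints[i]]
         * is the selected domain.
         */
        printf("refpoint %u selects entry %u\n",
               refpoints[i], assoc[refpoints[i]]);
    }

    return 0;
}

Both refpoint 4 and refpoint 2 land on nvslot->numa_id (2 here), which is why
the second and fourth entries carry the per-GPU node id while the first and
third can stay as GPU_NUMA_ID.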

-- 
Reza Arbab



* Re: [PATCH v3] spapr: Add a new level of NUMA for GPUs
  2020-05-25 17:49   ` Reza Arbab
@ 2020-07-16  5:04     ` David Gibson
  2020-07-16  9:42       ` Daniel Henrique Barboza
  0 siblings, 1 reply; 9+ messages in thread
From: David Gibson @ 2020-07-16  5:04 UTC (permalink / raw)
  To: Reza Arbab
  Cc: Daniel Henrique Barboza, Leonardo Augusto Guimaraes Garcia,
	qemu-ppc, qemu-devel, Greg Kurz


On Mon, May 25, 2020 at 12:49:27PM -0500, Reza Arbab wrote:
> On Mon, May 25, 2020 at 03:05:50PM +1000, David Gibson wrote:
> > On Fri, May 22, 2020 at 02:53:33PM -0500, Reza Arbab wrote:
> > > --- a/hw/ppc/spapr_pci_nvlink2.c
> > > +++ b/hw/ppc/spapr_pci_nvlink2.c
> > > @@ -362,7 +362,7 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
> > >          uint32_t associativity[] = {
> > >              cpu_to_be32(0x4),
> > >              SPAPR_GPU_NUMA_ID,
> > > -            SPAPR_GPU_NUMA_ID,
> > > +            cpu_to_be32(nvslot->numa_id),
> > >              SPAPR_GPU_NUMA_ID,
> > >              cpu_to_be32(nvslot->numa_id)
> > 
> > 
> > This doesn't look quite right.  In the new case we'll get {
> > GPU_NUMA_ID, nvslot->numa_id, GPU_NUMA_ID, nvslot->numa_id }.
> 
> The associativity reference points are 4 (and now 2), so this is what we
> want. I think what you've noticed is that reference points are 1-based
> ordinals:
> 
> 	"...the “ibm,associativity-reference-points” property indicates
> boundaries between associativity domains presented by the
> “ibm,associativity” property containing “near” and “far” resources. The
> first such boundary in the list represents the 1 based ordinal in the
> associativity lists of the most significant boundary, with subsequent
> entries indicating progressively less significant boundaries."

Right.. AIUI, associativity-reference-points indicates which levels are
"important" from a NUMA distance point of view (though the spec is
very confusing).  But I'm pretty sure that, ignoring
reference-points, the individual ibm,associativity lists are supposed
to describe a correct hierarchy, even if some levels will get ignored
for distance purposes.  So once you've split up into "numa_id" nodes
at the second level, you can't then go back to just 2 nodes (main
vs. gpu) at the third.


-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson



* Re: [PATCH v3] spapr: Add a new level of NUMA for GPUs
  2020-07-16  5:04     ` David Gibson
@ 2020-07-16  9:42       ` Daniel Henrique Barboza
  2020-07-16 16:00         ` Reza Arbab
  0 siblings, 1 reply; 9+ messages in thread
From: Daniel Henrique Barboza @ 2020-07-16  9:42 UTC (permalink / raw)
  To: David Gibson, Reza Arbab
  Cc: Daniel Henrique Barboza, Leonardo Augusto Guimaraes Garcia,
	qemu-ppc, qemu-devel, Greg Kurz



On 7/16/20 2:04 AM, David Gibson wrote:
> On Mon, May 25, 2020 at 12:49:27PM -0500, Reza Arbab wrote:
>> On Mon, May 25, 2020 at 03:05:50PM +1000, David Gibson wrote:
>>> On Fri, May 22, 2020 at 02:53:33PM -0500, Reza Arbab wrote:
>>>> --- a/hw/ppc/spapr_pci_nvlink2.c
>>>> +++ b/hw/ppc/spapr_pci_nvlink2.c
>>>> @@ -362,7 +362,7 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
>>>>           uint32_t associativity[] = {
>>>>               cpu_to_be32(0x4),
>>>>               SPAPR_GPU_NUMA_ID,
>>>> -            SPAPR_GPU_NUMA_ID,
>>>> +            cpu_to_be32(nvslot->numa_id),
>>>>               SPAPR_GPU_NUMA_ID,
>>>>               cpu_to_be32(nvslot->numa_id)
>>>
>>>
>>> This doesn't look quite right.  In the new case we'll get {
>>> GPU_NUMA_ID, nvslot->numa_id, GPU_NUMA_ID, nvslot->numa_id }.
>>
>> The associativity reference points are 4 (and now 2), so this is what we
>> want. I think what you've noticed is that reference points are 1-based
>> ordinals:
>>
>> 	"...the “ibm,associativity-reference-points” property indicates
>> boundaries between associativity domains presented by the
>> “ibm,associativity” property containing “near” and “far” resources. The
>> first such boundary in the list represents the 1 based ordinal in the
>> associativity lists of the most significant boundary, with subsequent
>> entries indicating progressively less significant boundaries."
> 
> Right.. AIUI, associativity-reference-points indicates which levels are
> "important" from a NUMA distance point of view (though the spec is
> very confusing).  But I'm pretty sure that, ignoring
> reference-points, the individual ibm,associativity lists are supposed
> to describe a correct hierarchy, even if some levels will get ignored
> for distance purposes.  So once you've split up into "numa_id" nodes
> at the second level, you can't then go back to just 2 nodes (main
> vs. gpu) at the third.


I believe Reza should go with what Skiboot already does in this situation:

(hw/npu2.c)

dt_add_property_cells(mem, "ibm,associativity", 4, chip_id, chip_id, chip_id, chip_id);

Which would translate here to:

         uint32_t associativity[] = {
             cpu_to_be32(0x4),
             cpu_to_be32(nvslot->numa_id),
             cpu_to_be32(nvslot->numa_id),
             cpu_to_be32(nvslot->numa_id),
             cpu_to_be32(nvslot->numa_id),
         };


In the end it doesn't matter for the logic, since the refpoints are always
0x4 0x4 0x2, meaning that we're ignoring the 1st and 3rd elements entirely
anyway, but it at least makes the intention clearer: GPUs are always at the
maximum distance from everything else.



Thanks,


DHB



> 
> 



* Re: [PATCH v3] spapr: Add a new level of NUMA for GPUs
  2020-07-16  9:42       ` Daniel Henrique Barboza
@ 2020-07-16 16:00         ` Reza Arbab
  2020-07-16 16:40           ` Daniel Henrique Barboza
  0 siblings, 1 reply; 9+ messages in thread
From: Reza Arbab @ 2020-07-16 16:00 UTC (permalink / raw)
  To: Daniel Henrique Barboza
  Cc: Daniel Henrique Barboza, Leonardo Augusto Guimaraes Garcia,
	Greg Kurz, qemu-devel, qemu-ppc, David Gibson

On Thu, Jul 16, 2020 at 06:42:11AM -0300, Daniel Henrique Barboza wrote:
>Which would translate here to:
>
>        uint32_t associativity[] = {
>            cpu_to_be32(0x4),
>            cpu_to_be32(nvslot->numa_id),
>            cpu_to_be32(nvslot->numa_id),
>            cpu_to_be32(nvslot->numa_id),
>            cpu_to_be32(nvslot->numa_id),
>        };

Sure, that's how it originally was in v1 of the patch.

I'll send a v4 today. It's been a while so I need to rebase anyway.

-- 
Reza Arbab



* Re: [PATCH v3] spapr: Add a new level of NUMA for GPUs
  2020-07-16 16:00         ` Reza Arbab
@ 2020-07-16 16:40           ` Daniel Henrique Barboza
  0 siblings, 0 replies; 9+ messages in thread
From: Daniel Henrique Barboza @ 2020-07-16 16:40 UTC (permalink / raw)
  To: Reza Arbab
  Cc: Daniel Henrique Barboza, Leonardo Augusto Guimaraes Garcia,
	Greg Kurz, qemu-devel, qemu-ppc, David Gibson



On 7/16/20 1:00 PM, Reza Arbab wrote:
> On Thu, Jul 16, 2020 at 06:42:11AM -0300, Daniel Henrique Barboza wrote:
>> Which would translate here to:
>>
>>        uint32_t associativity[] = {
>>            cpu_to_be32(0x4),
>>            cpu_to_be32(nvslot->numa_id),
>>            cpu_to_be32(nvslot->numa_id),
>>            cpu_to_be32(nvslot->numa_id),
>>            cpu_to_be32(nvslot->numa_id),
>>        };
> 
> Sure, that's how it originally was in v1 of the patch.

Yeah, Greg commented on this chunk in v2:

------------
This is a guest visible change. It should theoretically be controlled
with a compat property of the PHB (look for "static GlobalProperty" in
spapr.c). But since this code is only used for GPU passthrough and we
don't support migration of such devices, I guess it's okay. Maybe just
mention it in the changelog.
--------------

By all means you can mention in the changelog or a code comment why the
associativity of the GPU is nvslot->numa_id 4 times in a row, but I believe
this format is still clearer for us to understand. It also matches what
skiboot does.

And it deprecates the SPAPR_GPU_NUMA_ID macro, allowing us to use its value
(1) for other internal NUMA purposes without colliding with the GPU
semantics.



Thanks,


DHB


> 
> I'll send a v4 today. It's been a while so I need to rebase anyway.
> 



