All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [RFC PATCH] numa: enable sparse node numbering
@ 2014-06-23 19:33 Nishanth Aravamudan
  2014-06-24  0:48 ` [Qemu-devel] [RFC PATCH v2] " Nishanth Aravamudan
  2014-06-24  7:33 ` [Qemu-devel] [RFC PATCH] " Igor Mammedov
  0 siblings, 2 replies; 22+ messages in thread
From: Nishanth Aravamudan @ 2014-06-23 19:33 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Alexey Kardashevskiy, Eduardo Habkost, qemu-devel

Add a present field to NodeInfo which indicates if a given nodeid was
present on the command-line or not. Current code relies on a memory
value being passed for a node to indicate presence, which is
insufficient in the presence of memoryless nodes or sparse numbering.
Adjust the iteration of various NUMA loops to use the maximum known NUMA
ID rather than the number of NUMA nodes.

numa.c::set_numa_nodes() has become a bit more convoluted for
round-robin'ing the CPUs over known nodes when not specified by the
user.

Note that architecture-specific code still needs changes (forthcoming)
for both sparse node numbering and memoryless nodes.

Examples:

(1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa
node,nodeid=2 -smp 16

Before:

node 0 cpus: 0 2 4 6 8 10 12 14
node 0 size: 2048 MB
node 1 cpus: 1 3 5 7 9 11 13 15
node 1 size: 2048 MB

After:

node 2 cpus: 0 2 4 6 8 10 12 14
node 2 size: 2048 MB
node 3 cpus: 1 3 5 7 9 11 13 15
node 3 size: 2048 MB

(2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa
node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16

Before:

node 0 cpus: 0 4 8 12
node 0 size: 1024 MB
node 1 cpus: 1 5 9 13
node 1 size: 1024 MB
node 2 cpus: 2 6 10 14
node 2 size: 1024 MB
node 3 cpus: 3 7 11 15
node 3 size: 1024 MB

After:

node 0 cpus: 0 4 8 12
node 0 size: 1024 MB
node 1 cpus: 1 5 9 13
node 1 size: 1024 MB
node 2 cpus: 2 6 10 14
node 2 size: 1024 MB
node 3 cpus: 3 7 11 15
node 3 size: 1024 MB

Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
    
---
Based off mst's for_upstream tag, which has the NodeInfo changes.

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 277230d..b90bf66 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -145,11 +145,13 @@ extern int mem_prealloc;
  */
 #define MAX_CPUMASK_BITS 255
 
-extern int nb_numa_nodes;
+extern int nb_numa_nodes; /* Number of NUMA nodes */
+extern int max_numa_node; /* Highest specified NUMA node ID */
 typedef struct node_info {
     uint64_t node_mem;
     DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
     struct HostMemoryBackend *node_memdev;
+    bool present;
 } NodeInfo;
 extern NodeInfo numa_info[MAX_NODES];
 void set_numa_nodes(void);
diff --git a/monitor.c b/monitor.c
index c7f8797..4721996 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2003,7 +2003,10 @@ static void do_info_numa(Monitor *mon, const QDict *qdict)
     CPUState *cpu;
 
     monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
-    for (i = 0; i < nb_numa_nodes; i++) {
+    for (i = 0; i < max_numa_node; i++) {
+        if (!numa_info[i].present) {
+            continue;
+        }
         monitor_printf(mon, "node %d cpus:", i);
         CPU_FOREACH(cpu) {
             if (cpu->numa_node == i) {
diff --git a/numa.c b/numa.c
index e471afe..c31857d 100644
--- a/numa.c
+++ b/numa.c
@@ -106,6 +106,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
         numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
     }
+    numa_info[nodenr].present = true;
+    if (nodenr >= max_numa_node) {
+        max_numa_node = nodenr + 1;
+    }
 }
 
 int numa_init_func(QemuOpts *opts, void *opaque)
@@ -155,7 +159,7 @@ void set_numa_nodes(void)
 {
     if (nb_numa_nodes > 0) {
         uint64_t numa_total;
-        int i;
+        int i, j, last_j = -1;
 
         if (nb_numa_nodes > MAX_NODES) {
             nb_numa_nodes = MAX_NODES;
@@ -164,27 +168,29 @@ void set_numa_nodes(void)
         /* If no memory size if given for any node, assume the default case
          * and distribute the available memory equally across all nodes
          */
-        for (i = 0; i < nb_numa_nodes; i++) {
-            if (numa_info[i].node_mem != 0) {
+        for (i = 0; i < max_numa_node; i++) {
+            if (numa_info[i].present && numa_info[i].node_mem != 0) {
                 break;
             }
         }
-        if (i == nb_numa_nodes) {
+        if (i == max_numa_node) {
             uint64_t usedmem = 0;
 
             /* On Linux, the each node's border has to be 8MB aligned,
              * the final node gets the rest.
              */
-            for (i = 0; i < nb_numa_nodes - 1; i++) {
-                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
-                                        ~((1 << 23UL) - 1);
-                usedmem += numa_info[i].node_mem;
+            for (i = 0; i < max_numa_node - 1; i++) {
+                if (numa_info[i].present) {
+                    numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
+                                            ~((1 << 23UL) - 1);
+                    usedmem += numa_info[i].node_mem;
+                }
             }
             numa_info[i].node_mem = ram_size - usedmem;
         }
 
         numa_total = 0;
-        for (i = 0; i < nb_numa_nodes; i++) {
+        for (i = 0; i < max_numa_node; i++) {
             numa_total += numa_info[i].node_mem;
         }
         if (numa_total != ram_size) {
@@ -194,8 +200,9 @@ void set_numa_nodes(void)
             exit(1);
         }
 
-        for (i = 0; i < nb_numa_nodes; i++) {
-            if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
+        for (i = 0; i < max_numa_node; i++) {
+            if (numa_info[i].present &&
+                !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
                 break;
             }
         }
@@ -203,9 +210,19 @@ void set_numa_nodes(void)
          * must cope with this anyway, because there are BIOSes out there in
          * real machines which also use this scheme.
          */
-        if (i == nb_numa_nodes) {
+        if (i == max_numa_node) {
             for (i = 0; i < max_cpus; i++) {
-                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
+                for (j = last_j + 1;
+                     j != (last_j == 0 ? max_numa_node : last_j - 1); j++) {
+                    if (j >= max_numa_node) {
+                        j = 0;
+                    }
+                    if (numa_info[j].present) {
+                        break;
+                    }
+                }
+                last_j = j;
+                set_bit(i, numa_info[j].node_cpu);
             }
         }
     }
@@ -217,7 +234,7 @@ void set_numa_modes(void)
     int i;
 
     CPU_FOREACH(cpu) {
-        for (i = 0; i < nb_numa_nodes; i++) {
+        for (i = 0; i < max_numa_node; i++) {
             if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
                 cpu->numa_node = i;
             }
diff --git a/vl.c b/vl.c
index 54b4627..e1a6ab8 100644
--- a/vl.c
+++ b/vl.c
@@ -195,6 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
     QTAILQ_HEAD_INITIALIZER(fw_boot_order);
 
 int nb_numa_nodes;
+int max_numa_node;
 NodeInfo numa_info[MAX_NODES];
 
 uint8_t qemu_uuid[16];
@@ -2967,10 +2968,12 @@ int main(int argc, char **argv, char **envp)
 
     for (i = 0; i < MAX_NODES; i++) {
         numa_info[i].node_mem = 0;
+        numa_info[i].present = false;
         bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
     }
 
     nb_numa_nodes = 0;
+    max_numa_node = -1;
     nb_nics = 0;
 
     bdrv_init_with_whitelist();

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Qemu-devel] [RFC PATCH v2] numa: enable sparse node numbering
  2014-06-23 19:33 [Qemu-devel] [RFC PATCH] numa: enable sparse node numbering Nishanth Aravamudan
@ 2014-06-24  0:48 ` Nishanth Aravamudan
  2014-06-24  1:08   ` [Qemu-devel] [RFC PATCH] ppc/spapr: support sparse NUMA " Nishanth Aravamudan
                     ` (2 more replies)
  2014-06-24  7:33 ` [Qemu-devel] [RFC PATCH] " Igor Mammedov
  1 sibling, 3 replies; 22+ messages in thread
From: Nishanth Aravamudan @ 2014-06-24  0:48 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Alexey Kardashevskiy, Eduardo Habkost, anton, qemu-devel

On 23.06.2014 [12:33:10 -0700], Nishanth Aravamudan wrote:
> Add a present field to NodeInfo which indicates if a given nodeid was
> present on the command-line or not. Current code relies on a memory
> value being passed for a node to indicate presence, which is
> insufficient in the presence of memoryless nodes or sparse numbering.
> Adjust the iteration of various NUMA loops to use the maximum known NUMA
> ID rather than the number of NUMA nodes.
> 
> numa.c::set_numa_nodes() has become a bit more convoluted for
> round-robin'ing the CPUs over known nodes when not specified by the
> user.

Sorry for the noise, but after staring at it for a bit, I think this
version is a bit cleaner for the set_numa_nodes() change.

Add a present field to NodeInfo which indicates if a given nodeid was
present on the command-line or not. Current code relies on a memory
value being passed for a node to indicate presence, which is
insufficient in the presence of memoryless nodes or sparse numbering.
Adjust the iteration of various NUMA loops to use the maximum known NUMA
ID rather than the number of NUMA nodes.

numa.c::set_numa_nodes() has become a bit more convoluted for
round-robin'ing the CPUs over known nodes when not specified by the
user.

Note that architecture-specific code still needs changes (forthcoming)
for both sparse node numbering and memoryless nodes.

Examples:

(1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa
node,nodeid=2 -smp 16

Before:

node 0 cpus: 0 2 4 6 8 10 12 14
node 0 size: 2048 MB
node 1 cpus: 1 3 5 7 9 11 13 15
node 1 size: 2048 MB

After:

node 2 cpus: 0 2 4 6 8 10 12 14
node 2 size: 2048 MB
node 3 cpus: 1 3 5 7 9 11 13 15
node 3 size: 2048 MB

(2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa
node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16

node 0 cpus: 0 4 8 12
node 0 size: 1024 MB
node 1 cpus: 1 5 9 13
node 1 size: 1024 MB
node 2 cpus: 2 6 10 14
node 2 size: 1024 MB
node 3 cpus: 3 7 11 15
node 3 size: 1024 MB

(qemu) info numa
4 nodes
node 0 cpus: 0 4 8 12
node 0 size: 1024 MB
node 1 cpus: 1 5 9 13
node 1 size: 1024 MB
node 2 cpus: 2 6 10 14
node 2 size: 1024 MB
node 3 cpus: 3 7 11 15
node 3 size: 1024 MB

Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: qemu-devel@nongnu.org

---
Based off mst's for_upstream tag, which has the NodeInfo changes.

v1 -> v2:
  Modify set_numa_nodes loop for round-robin'ing CPUs.

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 277230d..b90bf66 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -145,11 +145,13 @@ extern int mem_prealloc;
  */
 #define MAX_CPUMASK_BITS 255
 
-extern int nb_numa_nodes;
+extern int nb_numa_nodes; /* Number of NUMA nodes */
+extern int max_numa_node; /* Highest specified NUMA node ID */
 typedef struct node_info {
     uint64_t node_mem;
     DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
     struct HostMemoryBackend *node_memdev;
+    bool present;
 } NodeInfo;
 extern NodeInfo numa_info[MAX_NODES];
 void set_numa_nodes(void);
diff --git a/monitor.c b/monitor.c
index c7f8797..4721996 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2003,7 +2003,10 @@ static void do_info_numa(Monitor *mon, const QDict *qdict)
     CPUState *cpu;
 
     monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
-    for (i = 0; i < nb_numa_nodes; i++) {
+    for (i = 0; i < max_numa_node; i++) {
+        if (!numa_info[i].present) {
+            continue;
+        }
         monitor_printf(mon, "node %d cpus:", i);
         CPU_FOREACH(cpu) {
             if (cpu->numa_node == i) {
diff --git a/numa.c b/numa.c
index e471afe..2a4c577 100644
--- a/numa.c
+++ b/numa.c
@@ -106,6 +106,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
         numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
     }
+    numa_info[nodenr].present = true;
+    if (nodenr >= max_numa_node) {
+        max_numa_node = nodenr + 1;
+    }
 }
 
 int numa_init_func(QemuOpts *opts, void *opaque)
@@ -155,7 +159,7 @@ void set_numa_nodes(void)
 {
     if (nb_numa_nodes > 0) {
         uint64_t numa_total;
-        int i;
+        int i, j = -1;
 
         if (nb_numa_nodes > MAX_NODES) {
             nb_numa_nodes = MAX_NODES;
@@ -164,27 +168,29 @@ void set_numa_nodes(void)
         /* If no memory size if given for any node, assume the default case
          * and distribute the available memory equally across all nodes
          */
-        for (i = 0; i < nb_numa_nodes; i++) {
-            if (numa_info[i].node_mem != 0) {
+        for (i = 0; i < max_numa_node; i++) {
+            if (numa_info[i].present && numa_info[i].node_mem != 0) {
                 break;
             }
         }
-        if (i == nb_numa_nodes) {
+        if (i == max_numa_node) {
             uint64_t usedmem = 0;
 
             /* On Linux, the each node's border has to be 8MB aligned,
              * the final node gets the rest.
              */
-            for (i = 0; i < nb_numa_nodes - 1; i++) {
-                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
-                                        ~((1 << 23UL) - 1);
-                usedmem += numa_info[i].node_mem;
+            for (i = 0; i < max_numa_node - 1; i++) {
+                if (numa_info[i].present) {
+                    numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
+                                            ~((1 << 23UL) - 1);
+                    usedmem += numa_info[i].node_mem;
+                }
             }
             numa_info[i].node_mem = ram_size - usedmem;
         }
 
         numa_total = 0;
-        for (i = 0; i < nb_numa_nodes; i++) {
+        for (i = 0; i < max_numa_node; i++) {
             numa_total += numa_info[i].node_mem;
         }
         if (numa_total != ram_size) {
@@ -194,8 +200,9 @@ void set_numa_nodes(void)
             exit(1);
         }
 
-        for (i = 0; i < nb_numa_nodes; i++) {
-            if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
+        for (i = 0; i < max_numa_node; i++) {
+            if (numa_info[i].present &&
+                !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
                 break;
             }
         }
@@ -203,9 +210,15 @@ void set_numa_nodes(void)
          * must cope with this anyway, because there are BIOSes out there in
          * real machines which also use this scheme.
          */
-        if (i == nb_numa_nodes) {
+        if (i == max_numa_node) {
             for (i = 0; i < max_cpus; i++) {
-                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
+                do {
+                    j++;
+                    if (j >= max_numa_node) {
+                        j = 0;
+                    }
+                } while (!numa_info[j].present);
+                set_bit(i, numa_info[j].node_cpu);
             }
         }
     }
@@ -217,7 +230,7 @@ void set_numa_modes(void)
     int i;
 
     CPU_FOREACH(cpu) {
-        for (i = 0; i < nb_numa_nodes; i++) {
+        for (i = 0; i < max_numa_node; i++) {
             if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
                 cpu->numa_node = i;
             }
diff --git a/vl.c b/vl.c
index 54b4627..e1a6ab8 100644
--- a/vl.c
+++ b/vl.c
@@ -195,6 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
     QTAILQ_HEAD_INITIALIZER(fw_boot_order);
 
 int nb_numa_nodes;
+int max_numa_node;
 NodeInfo numa_info[MAX_NODES];
 
 uint8_t qemu_uuid[16];
@@ -2967,10 +2968,12 @@ int main(int argc, char **argv, char **envp)
 
     for (i = 0; i < MAX_NODES; i++) {
         numa_info[i].node_mem = 0;
+        numa_info[i].present = false;
         bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
     }
 
     nb_numa_nodes = 0;
+    max_numa_node = -1;
     nb_nics = 0;
 
     bdrv_init_with_whitelist();

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Qemu-devel] [RFC PATCH] ppc/spapr: support sparse NUMA node numbering
  2014-06-24  0:48 ` [Qemu-devel] [RFC PATCH v2] " Nishanth Aravamudan
@ 2014-06-24  1:08   ` Nishanth Aravamudan
  2014-06-24  3:34     ` Nishanth Aravamudan
  2014-06-24  7:43   ` [Qemu-devel] [RFC PATCH v2] numa: enable sparse " Hu Tao
  2014-06-24 17:40   ` [Qemu-devel] [RFC PATCH v3] " Nishanth Aravamudan
  2 siblings, 1 reply; 22+ messages in thread
From: Nishanth Aravamudan @ 2014-06-24  1:08 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Alexey Kardashevskiy, Eduardo Habkost, anton, qemu-devel

With generic sparse NUMA node parsing in place ("numa: enable sparse
node numbering"), ppc can be updated to iterate over only the
user-specified nodes.

qemu-system-ppc64 -machine pseries,accel=kvm,usb=off -m 4096
-realtime mlock=off -numa node,nodeid=3 -numa node,nodeid=2 -smp 4

Before:

info numa:
node 0 cpus: 0 2
node 0 size: 2048 MB
node 1 cpus: 1 3
node 1 size: 2048 MB

numactl --hardware:
available: 2 nodes (0-1)
node 0 cpus: 0 2
node 0 size: 2027 MB
node 0 free: 1875 MB
node 1 cpus: 1 3
node 1 size: 2045 MB
node 1 free: 1980 MB
node distances:
node   0   1
  0:  10  40
  1:  40  10

After:

info numa:
node 2 cpus: 0 2
node 2 size: 2048 MB
node 3 cpus: 1 3
node 3 size: 2048 MB

numactl --hardware:
available: 3 nodes (0,2-3)
node 0 cpus:
node 0 size: 0 MB
node 0 free: 0 MB
node 2 cpus: 0 2
node 2 size: 2027 MB
node 2 free: 1943 MB
node 3 cpus: 1 3
node 3 size: 2045 MB
node 3 free: 1904 MB
node distances:
node   0   2   3
  0:  10  40  40
  2:  40  10  40
  3:  40  40  10

Note, the empty node 0 is due to the Linux kernel.

Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 82f183f..d07857a 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -707,7 +707,10 @@ static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt)
 
     /* RAM: Node 1 and beyond */
     mem_start = node0_size;
-    for (i = 1; i < nb_numa_nodes; i++) {
+    for (i = 1; i < max_numa_node; i++) {
+        if (!numa_info[i].present) {
+            continue;
+        }
         mem_reg_property[0] = cpu_to_be64(mem_start);
         if (mem_start >= ram_size) {
             node_size = 0;

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] ppc/spapr: support sparse NUMA node numbering
  2014-06-24  1:08   ` [Qemu-devel] [RFC PATCH] ppc/spapr: support sparse NUMA " Nishanth Aravamudan
@ 2014-06-24  3:34     ` Nishanth Aravamudan
  0 siblings, 0 replies; 22+ messages in thread
From: Nishanth Aravamudan @ 2014-06-24  3:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Alexey Kardashevskiy, David Rientjes, Eduardo Habkost, anton, qemu-devel

Hi David,

On 23.06.2014 [18:08:33 -0700], Nishanth Aravamudan wrote:
> With generic sparse NUMA node parsing in place ("numa: enable sparse
> node numbering"), ppc can be updated to iterate over only the
> user-specified nodes.
> 
> qemu-system-ppc64 -machine pseries,accel=kvm,usb=off -m 4096
> -realtime mlock=off -numa node,nodeid=3 -numa node,nodeid=2 -smp 4

I'd like to do so something similar for x86 as a follow-on patch, but I
don't really know what the SRAT should look like, or what the APIC
mapping should be to have sparse NUMA numbering? Some cursory googling
indicates you've run into this in practice? Possibly with a bad APIC?

Thanks,
Nish

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] numa: enable sparse node numbering
  2014-06-23 19:33 [Qemu-devel] [RFC PATCH] numa: enable sparse node numbering Nishanth Aravamudan
  2014-06-24  0:48 ` [Qemu-devel] [RFC PATCH v2] " Nishanth Aravamudan
@ 2014-06-24  7:33 ` Igor Mammedov
  2014-06-24  8:58   ` Alexey Kardashevskiy
  2014-06-24 16:39   ` Nishanth Aravamudan
  1 sibling, 2 replies; 22+ messages in thread
From: Igor Mammedov @ 2014-06-24  7:33 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Alexey Kardashevskiy, qemu-devel, Eduardo Habkost, Michael S. Tsirkin

On Mon, 23 Jun 2014 12:33:10 -0700
Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:

> Add a present field to NodeInfo which indicates if a given nodeid was
> present on the command-line or not. Current code relies on a memory
> value being passed for a node to indicate presence, which is
> insufficient in the presence of memoryless nodes or sparse numbering.
> Adjust the iteration of various NUMA loops to use the maximum known NUMA
> ID rather than the number of NUMA nodes.
Why would you need sparse numbering?
What task exactly are you trying to solve?

> 
> numa.c::set_numa_nodes() has become a bit more convoluted for
> round-robin'ing the CPUs over known nodes when not specified by the
> user.
> 
> Note that architecture-specific code still needs changes (forthcoming)
> for both sparse node numbering and memoryless nodes.
> 
> Examples:
> 
> (1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa
> node,nodeid=2 -smp 16
> 
> Before:
> 
> node 0 cpus: 0 2 4 6 8 10 12 14
> node 0 size: 2048 MB
> node 1 cpus: 1 3 5 7 9 11 13 15
> node 1 size: 2048 MB
> 
> After:
> 
> node 2 cpus: 0 2 4 6 8 10 12 14
> node 2 size: 2048 MB
> node 3 cpus: 1 3 5 7 9 11 13 15
> node 3 size: 2048 MB
> 
> (2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa
> node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16
> 
> Before:
> 
> node 0 cpus: 0 4 8 12
> node 0 size: 1024 MB
> node 1 cpus: 1 5 9 13
> node 1 size: 1024 MB
> node 2 cpus: 2 6 10 14
> node 2 size: 1024 MB
> node 3 cpus: 3 7 11 15
> node 3 size: 1024 MB
> 
> After:
> 
> node 0 cpus: 0 4 8 12
> node 0 size: 1024 MB
> node 1 cpus: 1 5 9 13
> node 1 size: 1024 MB
> node 2 cpus: 2 6 10 14
> node 2 size: 1024 MB
> node 3 cpus: 3 7 11 15
> node 3 size: 1024 MB
> 
> Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
>     
> ---
> Based off mst's for_upstream tag, which has the NodeInfo changes.
> 
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 277230d..b90bf66 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -145,11 +145,13 @@ extern int mem_prealloc;
>   */
>  #define MAX_CPUMASK_BITS 255
>  
> -extern int nb_numa_nodes;
> +extern int nb_numa_nodes; /* Number of NUMA nodes */
> +extern int max_numa_node; /* Highest specified NUMA node ID */
>  typedef struct node_info {
>      uint64_t node_mem;
>      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
>      struct HostMemoryBackend *node_memdev;
> +    bool present;
>  } NodeInfo;
>  extern NodeInfo numa_info[MAX_NODES];
>  void set_numa_nodes(void);
> diff --git a/monitor.c b/monitor.c
> index c7f8797..4721996 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -2003,7 +2003,10 @@ static void do_info_numa(Monitor *mon, const QDict *qdict)
>      CPUState *cpu;
>  
>      monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
> -    for (i = 0; i < nb_numa_nodes; i++) {
> +    for (i = 0; i < max_numa_node; i++) {
> +        if (!numa_info[i].present) {
> +            continue;
> +        }
>          monitor_printf(mon, "node %d cpus:", i);
>          CPU_FOREACH(cpu) {
>              if (cpu->numa_node == i) {
> diff --git a/numa.c b/numa.c
> index e471afe..c31857d 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -106,6 +106,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
>          numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
>          numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
>      }
> +    numa_info[nodenr].present = true;
> +    if (nodenr >= max_numa_node) {
> +        max_numa_node = nodenr + 1;
> +    }
>  }
>  
>  int numa_init_func(QemuOpts *opts, void *opaque)
> @@ -155,7 +159,7 @@ void set_numa_nodes(void)
>  {
>      if (nb_numa_nodes > 0) {
>          uint64_t numa_total;
> -        int i;
> +        int i, j, last_j = -1;
>  
>          if (nb_numa_nodes > MAX_NODES) {
>              nb_numa_nodes = MAX_NODES;
> @@ -164,27 +168,29 @@ void set_numa_nodes(void)
>          /* If no memory size if given for any node, assume the default case
>           * and distribute the available memory equally across all nodes
>           */
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (numa_info[i].node_mem != 0) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present && numa_info[i].node_mem != 0) {
>                  break;
>              }
>          }
> -        if (i == nb_numa_nodes) {
> +        if (i == max_numa_node) {
>              uint64_t usedmem = 0;
>  
>              /* On Linux, the each node's border has to be 8MB aligned,
>               * the final node gets the rest.
>               */
> -            for (i = 0; i < nb_numa_nodes - 1; i++) {
> -                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> -                                        ~((1 << 23UL) - 1);
> -                usedmem += numa_info[i].node_mem;
> +            for (i = 0; i < max_numa_node - 1; i++) {
> +                if (numa_info[i].present) {
> +                    numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> +                                            ~((1 << 23UL) - 1);
> +                    usedmem += numa_info[i].node_mem;
> +                }
>              }
>              numa_info[i].node_mem = ram_size - usedmem;
>          }
>  
>          numa_total = 0;
> -        for (i = 0; i < nb_numa_nodes; i++) {
> +        for (i = 0; i < max_numa_node; i++) {
>              numa_total += numa_info[i].node_mem;
>          }
>          if (numa_total != ram_size) {
> @@ -194,8 +200,9 @@ void set_numa_nodes(void)
>              exit(1);
>          }
>  
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present &&
> +                !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
>                  break;
>              }
>          }
> @@ -203,9 +210,19 @@ void set_numa_nodes(void)
>           * must cope with this anyway, because there are BIOSes out there in
>           * real machines which also use this scheme.
>           */
> -        if (i == nb_numa_nodes) {
> +        if (i == max_numa_node) {
>              for (i = 0; i < max_cpus; i++) {
> -                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> +                for (j = last_j + 1;
> +                     j != (last_j == 0 ? max_numa_node : last_j - 1); j++) {
> +                    if (j >= max_numa_node) {
> +                        j = 0;
> +                    }
> +                    if (numa_info[j].present) {
> +                        break;
> +                    }
> +                }
> +                last_j = j;
> +                set_bit(i, numa_info[j].node_cpu);
>              }
>          }
>      }
> @@ -217,7 +234,7 @@ void set_numa_modes(void)
>      int i;
>  
>      CPU_FOREACH(cpu) {
> -        for (i = 0; i < nb_numa_nodes; i++) {
> +        for (i = 0; i < max_numa_node; i++) {
>              if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
>                  cpu->numa_node = i;
>              }
> diff --git a/vl.c b/vl.c
> index 54b4627..e1a6ab8 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -195,6 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
>      QTAILQ_HEAD_INITIALIZER(fw_boot_order);
>  
>  int nb_numa_nodes;
> +int max_numa_node;
>  NodeInfo numa_info[MAX_NODES];
>  
>  uint8_t qemu_uuid[16];
> @@ -2967,10 +2968,12 @@ int main(int argc, char **argv, char **envp)
>  
>      for (i = 0; i < MAX_NODES; i++) {
>          numa_info[i].node_mem = 0;
> +        numa_info[i].present = false;
>          bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
>      }
>  
>      nb_numa_nodes = 0;
> +    max_numa_node = -1;
>      nb_nics = 0;
>  
>      bdrv_init_with_whitelist();
> 
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v2] numa: enable sparse node numbering
  2014-06-24  0:48 ` [Qemu-devel] [RFC PATCH v2] " Nishanth Aravamudan
  2014-06-24  1:08   ` [Qemu-devel] [RFC PATCH] ppc/spapr: support sparse NUMA " Nishanth Aravamudan
@ 2014-06-24  7:43   ` Hu Tao
  2014-06-24 17:40   ` [Qemu-devel] [RFC PATCH v3] " Nishanth Aravamudan
  2 siblings, 0 replies; 22+ messages in thread
From: Hu Tao @ 2014-06-24  7:43 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Alexey Kardashevskiy, qemu-devel, Eduardo Habkost, anton,
	Michael S. Tsirkin

On Mon, Jun 23, 2014 at 05:48:25PM -0700, Nishanth Aravamudan wrote:
> On 23.06.2014 [12:33:10 -0700], Nishanth Aravamudan wrote:
> > Add a present field to NodeInfo which indicates if a given nodeid was
> > present on the command-line or not. Current code relies on a memory
> > value being passed for a node to indicate presence, which is
> > insufficient in the presence of memoryless nodes or sparse numbering.
> > Adjust the iteration of various NUMA loops to use the maximum known NUMA
> > ID rather than the number of NUMA nodes.
> > 
> > numa.c::set_numa_nodes() has become a bit more convoluted for
> > round-robin'ing the CPUs over known nodes when not specified by the
> > user.
> 
> Sorry for the noise, but after staring at it for a bit, I think this
> version is a bit cleaner for the set_numa_nodes() change.
> 
> Add a present field to NodeInfo which indicates if a given nodeid was
> present on the command-line or not. Current code relies on a memory
> value being passed for a node to indicate presence, which is
> insufficient in the presence of memoryless nodes or sparse numbering.
> Adjust the iteration of various NUMA loops to use the maximum known NUMA
> ID rather than the number of NUMA nodes.
> 
> numa.c::set_numa_nodes() has become a bit more convoluted for
> round-robin'ing the CPUs over known nodes when not specified by the
> user.
> 
> Note that architecture-specific code still needs changes (forthcoming)
> for both sparse node numbering and memoryless nodes.
> 
> Examples:
> 
> (1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa
> node,nodeid=2 -smp 16
> 
> Before:
> 
> node 0 cpus: 0 2 4 6 8 10 12 14
> node 0 size: 2048 MB
> node 1 cpus: 1 3 5 7 9 11 13 15
> node 1 size: 2048 MB
> 
> After:
> 
> node 2 cpus: 0 2 4 6 8 10 12 14
> node 2 size: 2048 MB
> node 3 cpus: 1 3 5 7 9 11 13 15
> node 3 size: 2048 MB
> 
> (2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa
> node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16
> 
> node 0 cpus: 0 4 8 12
> node 0 size: 1024 MB
> node 1 cpus: 1 5 9 13
> node 1 size: 1024 MB
> node 2 cpus: 2 6 10 14
> node 2 size: 1024 MB
> node 3 cpus: 3 7 11 15
> node 3 size: 1024 MB
> 
> (qemu) info numa
> 4 nodes
> node 0 cpus: 0 4 8 12
> node 0 size: 1024 MB
> node 1 cpus: 1 5 9 13
> node 1 size: 1024 MB
> node 2 cpus: 2 6 10 14
> node 2 size: 1024 MB
> node 3 cpus: 3 7 11 15
> node 3 size: 1024 MB
> 
> Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
> Cc: Eduardo Habkost <ehabkost@redhat.com>
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: qemu-devel@nongnu.org

Thanks! The patch fixes sparse NUMA nodes problem for me.

Tested-by: Hu Tao <hutao@cn.fujitsu.com>

Some comments below:

> 
> ---
> Based off mst's for_upstream tag, which has the NodeInfo changes.
> 
> v1 -> v2:
>   Modify set_numa_nodes loop for round-robin'ing CPUs.
> 
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 277230d..b90bf66 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -145,11 +145,13 @@ extern int mem_prealloc;
>   */
>  #define MAX_CPUMASK_BITS 255
>  
> -extern int nb_numa_nodes;
> +extern int nb_numa_nodes; /* Number of NUMA nodes */
> +extern int max_numa_node; /* Highest specified NUMA node ID */
>  typedef struct node_info {
>      uint64_t node_mem;
>      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
>      struct HostMemoryBackend *node_memdev;
> +    bool present;
>  } NodeInfo;
>  extern NodeInfo numa_info[MAX_NODES];
>  void set_numa_nodes(void);
> diff --git a/monitor.c b/monitor.c
> index c7f8797..4721996 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -2003,7 +2003,10 @@ static void do_info_numa(Monitor *mon, const QDict *qdict)
>      CPUState *cpu;
>  
>      monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
> -    for (i = 0; i < nb_numa_nodes; i++) {
> +    for (i = 0; i < max_numa_node; i++) {
> +        if (!numa_info[i].present) {
> +            continue;
> +        }

The same change can be applied to check against MAX_NODES in
memory_region_allocate_system_memory.

>          monitor_printf(mon, "node %d cpus:", i);
>          CPU_FOREACH(cpu) {
>              if (cpu->numa_node == i) {
> diff --git a/numa.c b/numa.c
> index e471afe..2a4c577 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -106,6 +106,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
>          numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
>          numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
>      }
> +    numa_info[nodenr].present = true;
> +    if (nodenr >= max_numa_node) {
> +        max_numa_node = nodenr + 1;
> +    }
>  }
>  
>  int numa_init_func(QemuOpts *opts, void *opaque)
> @@ -155,7 +159,7 @@ void set_numa_nodes(void)
>  {
>      if (nb_numa_nodes > 0) {
>          uint64_t numa_total;
> -        int i;
> +        int i, j = -1;
>  
>          if (nb_numa_nodes > MAX_NODES) {
>              nb_numa_nodes = MAX_NODES;
> @@ -164,27 +168,29 @@ void set_numa_nodes(void)
>          /* If no memory size if given for any node, assume the default case
>           * and distribute the available memory equally across all nodes
>           */
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (numa_info[i].node_mem != 0) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present && numa_info[i].node_mem != 0) {
>                  break;
>              }
>          }
> -        if (i == nb_numa_nodes) {
> +        if (i == max_numa_node) {
>              uint64_t usedmem = 0;
>  
>              /* On Linux, the each node's border has to be 8MB aligned,
>               * the final node gets the rest.
>               */
> -            for (i = 0; i < nb_numa_nodes - 1; i++) {
> -                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> -                                        ~((1 << 23UL) - 1);
> -                usedmem += numa_info[i].node_mem;
> +            for (i = 0; i < max_numa_node - 1; i++) {
> +                if (numa_info[i].present) {
> +                    numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> +                                            ~((1 << 23UL) - 1);
> +                    usedmem += numa_info[i].node_mem;
> +                }
>              }
>              numa_info[i].node_mem = ram_size - usedmem;
>          }
>  
>          numa_total = 0;
> -        for (i = 0; i < nb_numa_nodes; i++) {
> +        for (i = 0; i < max_numa_node; i++) {
>              numa_total += numa_info[i].node_mem;
>          }
>          if (numa_total != ram_size) {
> @@ -194,8 +200,9 @@ void set_numa_nodes(void)
>              exit(1);
>          }
>  
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present &&
> +                !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
>                  break;
>              }
>          }
> @@ -203,9 +210,15 @@ void set_numa_nodes(void)
>           * must cope with this anyway, because there are BIOSes out there in
>           * real machines which also use this scheme.
>           */
> -        if (i == nb_numa_nodes) {
> +        if (i == max_numa_node) {
>              for (i = 0; i < max_cpus; i++) {
> -                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> +                do {
> +                    j++;
> +                    if (j >= max_numa_node) {
> +                        j = 0;
> +                    }

can be even simple:

  j = (j + 1) % max_numa_node;

> +                } while (!numa_info[j].present);
> +                set_bit(i, numa_info[j].node_cpu);
>              }
>          }
>      }
> @@ -217,7 +230,7 @@ void set_numa_modes(void)
>      int i;
>  
>      CPU_FOREACH(cpu) {
> -        for (i = 0; i < nb_numa_nodes; i++) {
> +        for (i = 0; i < max_numa_node; i++) {
>              if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
>                  cpu->numa_node = i;
>              }
> diff --git a/vl.c b/vl.c
> index 54b4627..e1a6ab8 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -195,6 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
>      QTAILQ_HEAD_INITIALIZER(fw_boot_order);
>  
>  int nb_numa_nodes;
> +int max_numa_node;
>  NodeInfo numa_info[MAX_NODES];
>  
>  uint8_t qemu_uuid[16];
> @@ -2967,10 +2968,12 @@ int main(int argc, char **argv, char **envp)
>  
>      for (i = 0; i < MAX_NODES; i++) {
>          numa_info[i].node_mem = 0;
> +        numa_info[i].present = false;
>          bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
>      }
>  
>      nb_numa_nodes = 0;
> +    max_numa_node = -1;
>      nb_nics = 0;
>  
>      bdrv_init_with_whitelist();
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] numa: enable sparse node numbering
  2014-06-24  7:33 ` [Qemu-devel] [RFC PATCH] " Igor Mammedov
@ 2014-06-24  8:58   ` Alexey Kardashevskiy
  2014-06-24 16:39   ` Nishanth Aravamudan
  1 sibling, 0 replies; 22+ messages in thread
From: Alexey Kardashevskiy @ 2014-06-24  8:58 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: qemu-devel, Nishanth Aravamudan, Eduardo Habkost, Michael S. Tsirkin

On 06/24/2014 05:33 PM, Igor Mammedov wrote:
> On Mon, 23 Jun 2014 12:33:10 -0700
> Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> 
>> Add a present field to NodeInfo which indicates if a given nodeid was
>> present on the command-line or not. Current code relies on a memory
>> value being passed for a node to indicate presence, which is
>> insufficient in the presence of memoryless nodes or sparse numbering.
>> Adjust the iteration of various NUMA loops to use the maximum known NUMA
>> ID rather than the number of NUMA nodes.
> Why would you need sparse numbering?
> What task exactly are you trying to solve?


This is to reproduce+debug the real POWERPC hardware behavior. No practical
use as far as I can tell.




-- 
Alexey

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH] numa: enable sparse node numbering
  2014-06-24  7:33 ` [Qemu-devel] [RFC PATCH] " Igor Mammedov
  2014-06-24  8:58   ` Alexey Kardashevskiy
@ 2014-06-24 16:39   ` Nishanth Aravamudan
  1 sibling, 0 replies; 22+ messages in thread
From: Nishanth Aravamudan @ 2014-06-24 16:39 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: Alexey Kardashevskiy, qemu-devel, Eduardo Habkost, Michael S. Tsirkin

On 24.06.2014 [09:33:48 +0200], Igor Mammedov wrote:
> On Mon, 23 Jun 2014 12:33:10 -0700
> Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> 
> > Add a present field to NodeInfo which indicates if a given nodeid was
> > present on the command-line or not. Current code relies on a memory
> > value being passed for a node to indicate presence, which is
> > insufficient in the presence of memoryless nodes or sparse numbering.
> > Adjust the iteration of various NUMA loops to use the maximum known NUMA
> > ID rather than the number of NUMA nodes.
> Why would you need sparse numbering?

We already see this in practice under PowerVM on ppc. This would allow
qemu/KVM to emulate the same behaviors (which often lead to bugs/testing
of underused code-paths in the kernel).

Also, sparse numbering is explicitly allowed by qemu already but doesn't
work at all as expected, at least to me. That is, if I specify various
node IDs, the nodes are allocated linearly in qemu and the NUMA node IDs
in Linux don't correspond to the command-line. Further, the location of
CPUs/memory doesn't correspond to the nodes requested by the user.

> What task exactly are you trying to solve?

I would like to be able to leverage qemu's command-line to reproduce
the NUMA topology seen under PowerVM.

I'll send v3 with a better changelog and a few changes from Hu Tao.

Thanks,
Nish

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-24  0:48 ` [Qemu-devel] [RFC PATCH v2] " Nishanth Aravamudan
  2014-06-24  1:08   ` [Qemu-devel] [RFC PATCH] ppc/spapr: support sparse NUMA " Nishanth Aravamudan
  2014-06-24  7:43   ` [Qemu-devel] [RFC PATCH v2] numa: enable sparse " Hu Tao
@ 2014-06-24 17:40   ` Nishanth Aravamudan
  2014-06-25 11:21     ` Igor Mammedov
  2014-06-26 19:37     ` Eduardo Habkost
  2 siblings, 2 replies; 22+ messages in thread
From: Nishanth Aravamudan @ 2014-06-24 17:40 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Eduardo Habkost, Alexey Kardashevskiy, Hu Tao, qemu-devel,
	Anton Blanchard, David Rientjes, Igor Mammedov

Sparse node numbering occurs on powerpc in practice under PowerVM. In
order to emulate the same NUMA topology under qemu, the assumption that
NUMA nodes are linearly ordered has to be removed. qemu actually already
supports (inasmuch as it doesn't throw an error) sparse node numbering
by the end-user, but the information is effectively ignored and the
nodes are populated linearly starting at 0. This means a user's node ID
requests are not honored in the Linux kernel, and that the topology may
not be as requested (in terms of the CPU/memory placement).

Add a present field to NodeInfo which indicates if a given nodeid was
present on the command-line or not. Current code relies on a memory
value being passed for a node to indicate presence, which is
insufficient in the presence of memoryless nodes or sparse numbering.
Adjust the iteration of various NUMA loops to use the maximum known NUMA
ID rather than the number of NUMA nodes.

numa.c::set_numa_nodes() has become a bit more convoluted for
round-robin'ing the CPUs over known nodes when not specified by the
user.

Note that architecture-specific code still possibly needs changes
(forthcoming) for both sparse node numbering and memoryless nodes. This
only puts in the generic infrastructure to support that.

Examples:

(1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa
node,nodeid=2 -smp 16

Before:

node 0 cpus: 0 2 4 6 8 10 12 14
node 0 size: 2048 MB
node 1 cpus: 1 3 5 7 9 11 13 15
node 1 size: 2048 MB

After:

node 2 cpus: 0 2 4 6 8 10 12 14
node 2 size: 2048 MB
node 3 cpus: 1 3 5 7 9 11 13 15
node 3 size: 2048 MB

(2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa
node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16

node 0 cpus: 0 4 8 12
node 0 size: 1024 MB
node 1 cpus: 1 5 9 13
node 1 size: 1024 MB
node 2 cpus: 2 6 10 14
node 2 size: 1024 MB
node 3 cpus: 3 7 11 15
node 3 size: 1024 MB

(qemu) info numa
4 nodes
node 0 cpus: 0 4 8 12
node 0 size: 1024 MB
node 1 cpus: 1 5 9 13
node 1 size: 1024 MB
node 2 cpus: 2 6 10 14
node 2 size: 1024 MB
node 3 cpus: 3 7 11 15
node 3 size: 1024 MB

Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Tested-by: Hu Tao <hutao@cn.fujitsu.com>

---
Based off mst's for_upstream tag, which has the NodeInfo changes.

v1 -> v2:
  Modify set_numa_nodes loop for round-robin'ing CPUs.

v2 -> v3:
  Updated changelog to indicate problem being solved.
  Updated memory_region_allocate_system_memory based upon feedback from
    Hu.
  Updated set_numa_nodes loop again to be simpler based upon feedback
    from Hu.
  Fixed bug with a mix of nodes with nodeid specified and without, where
    the same nodeid would be used by the explicit specification and the
    auto-numbering code.

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 277230d..b90bf66 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -145,11 +145,13 @@ extern int mem_prealloc;
  */
 #define MAX_CPUMASK_BITS 255
 
-extern int nb_numa_nodes;
+extern int nb_numa_nodes; /* Number of NUMA nodes */
+extern int max_numa_node; /* Highest specified NUMA node ID */
 typedef struct node_info {
     uint64_t node_mem;
     DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
     struct HostMemoryBackend *node_memdev;
+    bool present;
 } NodeInfo;
 extern NodeInfo numa_info[MAX_NODES];
 void set_numa_nodes(void);
diff --git a/monitor.c b/monitor.c
index c7f8797..4721996 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2003,7 +2003,10 @@ static void do_info_numa(Monitor *mon, const QDict *qdict)
     CPUState *cpu;
 
     monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
-    for (i = 0; i < nb_numa_nodes; i++) {
+    for (i = 0; i < max_numa_node; i++) {
+        if (!numa_info[i].present) {
+            continue;
+        }
         monitor_printf(mon, "node %d cpus:", i);
         CPU_FOREACH(cpu) {
             if (cpu->numa_node == i) {
diff --git a/numa.c b/numa.c
index e471afe..d6b86ab 100644
--- a/numa.c
+++ b/numa.c
@@ -53,7 +53,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
     if (node->has_nodeid) {
         nodenr = node->nodeid;
     } else {
-        nodenr = nb_numa_nodes;
+        nodenr = 0;
+        while (numa_info[nodenr].present) {
+            nodenr++;
+        }
     }
 
     if (nodenr >= MAX_NODES) {
@@ -106,6 +109,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
         numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
     }
+    numa_info[nodenr].present = true;
+    if (nodenr >= max_numa_node) {
+        max_numa_node = nodenr + 1;
+    }
 }
 
 int numa_init_func(QemuOpts *opts, void *opaque)
@@ -155,7 +162,7 @@ void set_numa_nodes(void)
 {
     if (nb_numa_nodes > 0) {
         uint64_t numa_total;
-        int i;
+        int i, j = -1;
 
         if (nb_numa_nodes > MAX_NODES) {
             nb_numa_nodes = MAX_NODES;
@@ -164,27 +171,29 @@ void set_numa_nodes(void)
         /* If no memory size if given for any node, assume the default case
          * and distribute the available memory equally across all nodes
          */
-        for (i = 0; i < nb_numa_nodes; i++) {
-            if (numa_info[i].node_mem != 0) {
+        for (i = 0; i < max_numa_node; i++) {
+            if (numa_info[i].present && numa_info[i].node_mem != 0) {
                 break;
             }
         }
-        if (i == nb_numa_nodes) {
+        if (i == max_numa_node) {
             uint64_t usedmem = 0;
 
             /* On Linux, the each node's border has to be 8MB aligned,
              * the final node gets the rest.
              */
-            for (i = 0; i < nb_numa_nodes - 1; i++) {
-                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
-                                        ~((1 << 23UL) - 1);
-                usedmem += numa_info[i].node_mem;
+            for (i = 0; i < max_numa_node - 1; i++) {
+                if (numa_info[i].present) {
+                    numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
+                                            ~((1 << 23UL) - 1);
+                    usedmem += numa_info[i].node_mem;
+                }
             }
             numa_info[i].node_mem = ram_size - usedmem;
         }
 
         numa_total = 0;
-        for (i = 0; i < nb_numa_nodes; i++) {
+        for (i = 0; i < max_numa_node; i++) {
             numa_total += numa_info[i].node_mem;
         }
         if (numa_total != ram_size) {
@@ -194,8 +203,9 @@ void set_numa_nodes(void)
             exit(1);
         }
 
-        for (i = 0; i < nb_numa_nodes; i++) {
-            if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
+        for (i = 0; i < max_numa_node; i++) {
+            if (numa_info[i].present &&
+                !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
                 break;
             }
         }
@@ -203,9 +213,12 @@ void set_numa_nodes(void)
          * must cope with this anyway, because there are BIOSes out there in
          * real machines which also use this scheme.
          */
-        if (i == nb_numa_nodes) {
+        if (i == max_numa_node) {
             for (i = 0; i < max_cpus; i++) {
-                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
+                do {
+                    j = (j + 1) % max_numa_node;
+                } while (!numa_info[j].present);
+                set_bit(i, numa_info[j].node_cpu);
             }
         }
     }
@@ -217,8 +230,9 @@ void set_numa_modes(void)
     int i;
 
     CPU_FOREACH(cpu) {
-        for (i = 0; i < nb_numa_nodes; i++) {
-            if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
+        for (i = 0; i < max_numa_node; i++) {
+            if (numa_info[i].present &&
+                test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
                 cpu->numa_node = i;
             }
         }
@@ -266,10 +280,13 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
     }
 
     memory_region_init(mr, owner, name, ram_size);
-    for (i = 0; i < MAX_NODES; i++) {
+    for (i = 0; i < max_numa_node; i++) {
         Error *local_err = NULL;
         uint64_t size = numa_info[i].node_mem;
         HostMemoryBackend *backend = numa_info[i].node_memdev;
+        if (!numa_info[i].present) {
+            continue;
+        }
         if (!backend) {
             continue;
         }
diff --git a/vl.c b/vl.c
index 54b4627..e1a6ab8 100644
--- a/vl.c
+++ b/vl.c
@@ -195,6 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
     QTAILQ_HEAD_INITIALIZER(fw_boot_order);
 
 int nb_numa_nodes;
+int max_numa_node;
 NodeInfo numa_info[MAX_NODES];
 
 uint8_t qemu_uuid[16];
@@ -2967,10 +2968,12 @@ int main(int argc, char **argv, char **envp)
 
     for (i = 0; i < MAX_NODES; i++) {
         numa_info[i].node_mem = 0;
+        numa_info[i].present = false;
         bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
     }
 
     nb_numa_nodes = 0;
+    max_numa_node = -1;
     nb_nics = 0;
 
     bdrv_init_with_whitelist();

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-24 17:40   ` [Qemu-devel] [RFC PATCH v3] " Nishanth Aravamudan
@ 2014-06-25 11:21     ` Igor Mammedov
  2014-06-25 16:13       ` Nishanth Aravamudan
  2014-06-26 19:37     ` Eduardo Habkost
  1 sibling, 1 reply; 22+ messages in thread
From: Igor Mammedov @ 2014-06-25 11:21 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Eduardo Habkost, Michael S. Tsirkin, Alexey Kardashevskiy,
	Hu Tao, qemu-devel, Anton Blanchard, David Rientjes

On Tue, 24 Jun 2014 10:40:38 -0700
Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:

> Sparse node numbering occurs on powerpc in practice under PowerVM. In
> order to emulate the same NUMA topology under qemu, the assumption that
> NUMA nodes are linearly ordered has to be removed. qemu actually already
> supports (inasmuch as it doesn't throw an error) sparse node numbering
> by the end-user, but the information is effectively ignored and the
> nodes are populated linearly starting at 0. This means a user's node ID
> requests are not honored in the Linux kernel, and that the topology may
> not be as requested (in terms of the CPU/memory placement).
> 
> Add a present field to NodeInfo which indicates if a given nodeid was
> present on the command-line or not. Current code relies on a memory
> value being passed for a node to indicate presence, which is
> insufficient in the presence of memoryless nodes or sparse numbering.
> Adjust the iteration of various NUMA loops to use the maximum known NUMA
> ID rather than the number of NUMA nodes.
> 
> numa.c::set_numa_nodes() has become a bit more convoluted for
> round-robin'ing the CPUs over known nodes when not specified by the
> user.
> 
> Note that architecture-specific code still possibly needs changes
> (forthcoming) for both sparse node numbering and memoryless nodes. This
> only puts in the generic infrastructure to support that.
> 
> Examples:
> 
> (1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa
> node,nodeid=2 -smp 16
> 
> Before:
> 
> node 0 cpus: 0 2 4 6 8 10 12 14
> node 0 size: 2048 MB
> node 1 cpus: 1 3 5 7 9 11 13 15
> node 1 size: 2048 MB
> 
> After:
> 
> node 2 cpus: 0 2 4 6 8 10 12 14
> node 2 size: 2048 MB
> node 3 cpus: 1 3 5 7 9 11 13 15
> node 3 size: 2048 MB
> 
> (2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa
> node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16
> 
> node 0 cpus: 0 4 8 12
> node 0 size: 1024 MB
> node 1 cpus: 1 5 9 13
> node 1 size: 1024 MB
> node 2 cpus: 2 6 10 14
> node 2 size: 1024 MB
> node 3 cpus: 3 7 11 15
> node 3 size: 1024 MB
> 
> (qemu) info numa
> 4 nodes
> node 0 cpus: 0 4 8 12
> node 0 size: 1024 MB
> node 1 cpus: 1 5 9 13
> node 1 size: 1024 MB
> node 2 cpus: 2 6 10 14
> node 2 size: 1024 MB
> node 3 cpus: 3 7 11 15
> node 3 size: 1024 MB
> 
> Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
> Tested-by: Hu Tao <hutao@cn.fujitsu.com>
> 
> ---
> Based off mst's for_upstream tag, which has the NodeInfo changes.
> 
> v1 -> v2:
>   Modify set_numa_nodes loop for round-robin'ing CPUs.
> 
> v2 -> v3:
>   Updated changelog to indicate problem being solved.
>   Updated memory_region_allocate_system_memory based upon feedback from
>     Hu.
>   Updated set_numa_nodes loop again to be simpler based upon feedback
>     from Hu.
>   Fixed bug with a mix of nodes with nodeid specified and without, where
>     the same nodeid would be used by the explicit specification and the
>     auto-numbering code.
> 
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 277230d..b90bf66 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -145,11 +145,13 @@ extern int mem_prealloc;
>   */
>  #define MAX_CPUMASK_BITS 255
>  
> -extern int nb_numa_nodes;
> +extern int nb_numa_nodes; /* Number of NUMA nodes */
> +extern int max_numa_node; /* Highest specified NUMA node ID */
>  typedef struct node_info {
>      uint64_t node_mem;
>      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
>      struct HostMemoryBackend *node_memdev;
> +    bool present;
How about dropping 'present' and replacing array with a list
of only present nodes?
That way it will be one more step closer to converting numa
infrastructure to a set of QOM objects.


>  } NodeInfo;
>  extern NodeInfo numa_info[MAX_NODES];
>  void set_numa_nodes(void);
> diff --git a/monitor.c b/monitor.c
> index c7f8797..4721996 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -2003,7 +2003,10 @@ static void do_info_numa(Monitor *mon, const QDict *qdict)
>      CPUState *cpu;
>  
>      monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
> -    for (i = 0; i < nb_numa_nodes; i++) {
> +    for (i = 0; i < max_numa_node; i++) {
> +        if (!numa_info[i].present) {
> +            continue;
> +        }
>          monitor_printf(mon, "node %d cpus:", i);
>          CPU_FOREACH(cpu) {
>              if (cpu->numa_node == i) {
> diff --git a/numa.c b/numa.c
> index e471afe..d6b86ab 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -53,7 +53,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
>      if (node->has_nodeid) {
>          nodenr = node->nodeid;
>      } else {
> -        nodenr = nb_numa_nodes;
> +        nodenr = 0;
> +        while (numa_info[nodenr].present) {
> +            nodenr++;
> +        }
>      }
>  
>      if (nodenr >= MAX_NODES) {
> @@ -106,6 +109,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
>          numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
>          numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
>      }
> +    numa_info[nodenr].present = true;
> +    if (nodenr >= max_numa_node) {
> +        max_numa_node = nodenr + 1;
> +    }
>  }
>  
>  int numa_init_func(QemuOpts *opts, void *opaque)
> @@ -155,7 +162,7 @@ void set_numa_nodes(void)
>  {
>      if (nb_numa_nodes > 0) {
>          uint64_t numa_total;
> -        int i;
> +        int i, j = -1;
>  
>          if (nb_numa_nodes > MAX_NODES) {
>              nb_numa_nodes = MAX_NODES;
> @@ -164,27 +171,29 @@ void set_numa_nodes(void)
>          /* If no memory size if given for any node, assume the default case
>           * and distribute the available memory equally across all nodes
>           */
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (numa_info[i].node_mem != 0) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present && numa_info[i].node_mem != 0) {
>                  break;
>              }
>          }
> -        if (i == nb_numa_nodes) {
> +        if (i == max_numa_node) {
>              uint64_t usedmem = 0;
>  
>              /* On Linux, the each node's border has to be 8MB aligned,
>               * the final node gets the rest.
>               */
> -            for (i = 0; i < nb_numa_nodes - 1; i++) {
> -                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> -                                        ~((1 << 23UL) - 1);
> -                usedmem += numa_info[i].node_mem;
> +            for (i = 0; i < max_numa_node - 1; i++) {
> +                if (numa_info[i].present) {
> +                    numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> +                                            ~((1 << 23UL) - 1);
> +                    usedmem += numa_info[i].node_mem;
> +                }
>              }
>              numa_info[i].node_mem = ram_size - usedmem;
>          }
>  
>          numa_total = 0;
> -        for (i = 0; i < nb_numa_nodes; i++) {
> +        for (i = 0; i < max_numa_node; i++) {
>              numa_total += numa_info[i].node_mem;
>          }
>          if (numa_total != ram_size) {
> @@ -194,8 +203,9 @@ void set_numa_nodes(void)
>              exit(1);
>          }
>  
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present &&
> +                !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
>                  break;
>              }
>          }
> @@ -203,9 +213,12 @@ void set_numa_nodes(void)
>           * must cope with this anyway, because there are BIOSes out there in
>           * real machines which also use this scheme.
>           */
> -        if (i == nb_numa_nodes) {
> +        if (i == max_numa_node) {
>              for (i = 0; i < max_cpus; i++) {
> -                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> +                do {
> +                    j = (j + 1) % max_numa_node;
> +                } while (!numa_info[j].present);
> +                set_bit(i, numa_info[j].node_cpu);
>              }
>          }
>      }
> @@ -217,8 +230,9 @@ void set_numa_modes(void)
>      int i;
>  
>      CPU_FOREACH(cpu) {
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present &&
> +                test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
>                  cpu->numa_node = i;
>              }
>          }
> @@ -266,10 +280,13 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
>      }
>  
>      memory_region_init(mr, owner, name, ram_size);
> -    for (i = 0; i < MAX_NODES; i++) {
> +    for (i = 0; i < max_numa_node; i++) {
>          Error *local_err = NULL;
>          uint64_t size = numa_info[i].node_mem;
>          HostMemoryBackend *backend = numa_info[i].node_memdev;
> +        if (!numa_info[i].present) {
> +            continue;
> +        }
>          if (!backend) {
>              continue;
>          }
> diff --git a/vl.c b/vl.c
> index 54b4627..e1a6ab8 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -195,6 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
>      QTAILQ_HEAD_INITIALIZER(fw_boot_order);
>  
>  int nb_numa_nodes;
> +int max_numa_node;
>  NodeInfo numa_info[MAX_NODES];
>  
>  uint8_t qemu_uuid[16];
> @@ -2967,10 +2968,12 @@ int main(int argc, char **argv, char **envp)
>  
>      for (i = 0; i < MAX_NODES; i++) {
>          numa_info[i].node_mem = 0;
> +        numa_info[i].present = false;
>          bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
>      }
>  
>      nb_numa_nodes = 0;
> +    max_numa_node = -1;
>      nb_nics = 0;
>  
>      bdrv_init_with_whitelist();
> 
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-25 11:21     ` Igor Mammedov
@ 2014-06-25 16:13       ` Nishanth Aravamudan
  2014-06-25 16:52         ` Eduardo Habkost
  0 siblings, 1 reply; 22+ messages in thread
From: Nishanth Aravamudan @ 2014-06-25 16:13 UTC (permalink / raw)
  To: Igor Mammedov
  Cc: Eduardo Habkost, Michael S. Tsirkin, Alexey Kardashevskiy,
	Hu Tao, qemu-devel, Anton Blanchard, David Rientjes

On 25.06.2014 [13:21:34 +0200], Igor Mammedov wrote:
> On Tue, 24 Jun 2014 10:40:38 -0700
> Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
<snip>
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > index 277230d..b90bf66 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -145,11 +145,13 @@ extern int mem_prealloc;
> >   */
> >  #define MAX_CPUMASK_BITS 255
> >  
> > -extern int nb_numa_nodes;
> > +extern int nb_numa_nodes; /* Number of NUMA nodes */
> > +extern int max_numa_node; /* Highest specified NUMA node ID */
> >  typedef struct node_info {
> >      uint64_t node_mem;
> >      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> >      struct HostMemoryBackend *node_memdev;
> > +    bool present;
> How about dropping 'present' and replacing array with a list
> of only present nodes?

If that would be preferred, I can move to that. I assume a simple
linked-list is fine. Does qemu provide any infrastructure for defining
lists? I'll look through the source but any pointers would be helpful.

Generally speaking, sparse NUMA nodes aren't that common and when they
exist, the gaps aren't large. But it does seem to make sense if we have
sparse IDs at all, we might as well move to a list.

In any case, moving to the list means we'd have a nodeid as part of the
structure instead.

> That way it will be one more step closer to converting numa
> infrastructure to a set of QOM objects.

Sounds like a good idea to me. I'll respin the patch soon.

Thanks,
Nish

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-25 16:13       ` Nishanth Aravamudan
@ 2014-06-25 16:52         ` Eduardo Habkost
  2014-06-25 17:04           ` Nishanth Aravamudan
  2014-06-25 18:23           ` Michael S. Tsirkin
  0 siblings, 2 replies; 22+ messages in thread
From: Eduardo Habkost @ 2014-06-25 16:52 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Michael S. Tsirkin, Alexey Kardashevskiy, Hu Tao, qemu-devel,
	Anton Blanchard, David Rientjes, Igor Mammedov

On Wed, Jun 25, 2014 at 09:13:59AM -0700, Nishanth Aravamudan wrote:
> On 25.06.2014 [13:21:34 +0200], Igor Mammedov wrote:
> > On Tue, 24 Jun 2014 10:40:38 -0700
> > Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> <snip>
> > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > index 277230d..b90bf66 100644
> > > --- a/include/sysemu/sysemu.h
> > > +++ b/include/sysemu/sysemu.h
> > > @@ -145,11 +145,13 @@ extern int mem_prealloc;
> > >   */
> > >  #define MAX_CPUMASK_BITS 255
> > >  
> > > -extern int nb_numa_nodes;
> > > +extern int nb_numa_nodes; /* Number of NUMA nodes */
> > > +extern int max_numa_node; /* Highest specified NUMA node ID */
> > >  typedef struct node_info {
> > >      uint64_t node_mem;
> > >      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> > >      struct HostMemoryBackend *node_memdev;
> > > +    bool present;
> > How about dropping 'present' and replacing array with a list
> > of only present nodes?
> 
> If that would be preferred, I can move to that. I assume a simple
> linked-list is fine. Does qemu provide any infrastructure for defining
> lists? I'll look through the source but any pointers would be helpful.
> 
> Generally speaking, sparse NUMA nodes aren't that common and when they
> exist, the gaps aren't large. But it does seem to make sense if we have
> sparse IDs at all, we might as well move to a list.
> 
> In any case, moving to the list means we'd have a nodeid as part of the
> structure instead.
> 
> > That way it will be one more step closer to converting numa
> > infrastructure to a set of QOM objects.
> 
> Sounds like a good idea to me. I'll respin the patch soon.

Having a list makes sense, the only difference is that keeping a sparse
array sorted is much easier than making a sorted list (because the ACPI
tables are nodeid-ordered). That's why I suggested keeping the array
initially.

Adding a "present" field to the array is a trivial and easy-to-review
change. Changing NodeInfo to use linked lists is a more complex change
that I wouldn't want to include after soft freeze.

In other words:
 * Having a list is better than a sparse array; but:
 * Having a small sparse array with the "present" field is better
   than broken sparse nodeid support (IMO).

<rant>
Improving the code is always welcome (and adding a list would be an
improvement), but we don't need to implement all refactoring changes we
can dream of, before including a new feature. I like to evaluate how the
code looks like compared to what we have today, not with what we want it
to look like in one year.

Could we (the QEMU community) please learn to sometimes accept simple
features or fixes (which don't make the code worse) without requiring
the whole code to be rewritten first?
</rant>

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-25 16:52         ` Eduardo Habkost
@ 2014-06-25 17:04           ` Nishanth Aravamudan
  2014-06-25 18:24             ` Michael S. Tsirkin
  2014-06-26  6:29             ` Igor Mammedov
  2014-06-25 18:23           ` Michael S. Tsirkin
  1 sibling, 2 replies; 22+ messages in thread
From: Nishanth Aravamudan @ 2014-06-25 17:04 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: Michael S. Tsirkin, Alexey Kardashevskiy, Hu Tao, qemu-devel,
	Anton Blanchard, David Rientjes, Igor Mammedov

On 25.06.2014 [13:52:56 -0300], Eduardo Habkost wrote:
> On Wed, Jun 25, 2014 at 09:13:59AM -0700, Nishanth Aravamudan wrote:
> > On 25.06.2014 [13:21:34 +0200], Igor Mammedov wrote:
> > > On Tue, 24 Jun 2014 10:40:38 -0700
> > > Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> > <snip>
> > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > index 277230d..b90bf66 100644
> > > > --- a/include/sysemu/sysemu.h
> > > > +++ b/include/sysemu/sysemu.h
> > > > @@ -145,11 +145,13 @@ extern int mem_prealloc;
> > > >   */
> > > >  #define MAX_CPUMASK_BITS 255
> > > >  
> > > > -extern int nb_numa_nodes;
> > > > +extern int nb_numa_nodes; /* Number of NUMA nodes */
> > > > +extern int max_numa_node; /* Highest specified NUMA node ID */
> > > >  typedef struct node_info {
> > > >      uint64_t node_mem;
> > > >      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> > > >      struct HostMemoryBackend *node_memdev;
> > > > +    bool present;
> > > How about dropping 'present' and replacing array with a list
> > > of only present nodes?
> > 
> > If that would be preferred, I can move to that. I assume a simple
> > linked-list is fine. Does qemu provide any infrastructure for defining
> > lists? I'll look through the source but any pointers would be helpful.
> > 
> > Generally speaking, sparse NUMA nodes aren't that common and when they
> > exist, the gaps aren't large. But it does seem to make sense if we have
> > sparse IDs at all, we might as well move to a list.
> > 
> > In any case, moving to the list means we'd have a nodeid as part of the
> > structure instead.
> > 
> > > That way it will be one more step closer to converting numa
> > > infrastructure to a set of QOM objects.
> > 
> > Sounds like a good idea to me. I'll respin the patch soon.
> 
> Having a list makes sense, the only difference is that keeping a sparse
> array sorted is much easier than making a sorted list (because the ACPI
> tables are nodeid-ordered). That's why I suggested keeping the array
> initially.

And for non-ACPI platforms, it does feel like keeping the list sorted is
ideal, as it simplifies various loops, etc.

> Adding a "present" field to the array is a trivial and easy-to-review
> change. Changing NodeInfo to use linked lists is a more complex change
> that I wouldn't want to include after soft freeze.
> 
> In other words:
>  * Having a list is better than a sparse array; but:
>  * Having a small sparse array with the "present" field is better
>    than broken sparse nodeid support (IMO).

Perhaps as a compromise I can work on the list conversion as a follow-on
patch?

Thanks,
Nish

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-25 16:52         ` Eduardo Habkost
  2014-06-25 17:04           ` Nishanth Aravamudan
@ 2014-06-25 18:23           ` Michael S. Tsirkin
  2014-06-26  9:09             ` Hu Tao
  1 sibling, 1 reply; 22+ messages in thread
From: Michael S. Tsirkin @ 2014-06-25 18:23 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: Alexey Kardashevskiy, Hu Tao, Nishanth Aravamudan, qemu-devel,
	Anton Blanchard, David Rientjes, Igor Mammedov

On Wed, Jun 25, 2014 at 01:52:56PM -0300, Eduardo Habkost wrote:
> On Wed, Jun 25, 2014 at 09:13:59AM -0700, Nishanth Aravamudan wrote:
> > On 25.06.2014 [13:21:34 +0200], Igor Mammedov wrote:
> > > On Tue, 24 Jun 2014 10:40:38 -0700
> > > Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> > <snip>
> > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > index 277230d..b90bf66 100644
> > > > --- a/include/sysemu/sysemu.h
> > > > +++ b/include/sysemu/sysemu.h
> > > > @@ -145,11 +145,13 @@ extern int mem_prealloc;
> > > >   */
> > > >  #define MAX_CPUMASK_BITS 255
> > > >  
> > > > -extern int nb_numa_nodes;
> > > > +extern int nb_numa_nodes; /* Number of NUMA nodes */
> > > > +extern int max_numa_node; /* Highest specified NUMA node ID */
> > > >  typedef struct node_info {
> > > >      uint64_t node_mem;
> > > >      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> > > >      struct HostMemoryBackend *node_memdev;
> > > > +    bool present;
> > > How about dropping 'present' and replacing array with a list
> > > of only present nodes?
> > 
> > If that would be preferred, I can move to that. I assume a simple
> > linked-list is fine. Does qemu provide any infrastructure for defining
> > lists? I'll look through the source but any pointers would be helpful.
> > 
> > Generally speaking, sparse NUMA nodes aren't that common and when they
> > exist, the gaps aren't large. But it does seem to make sense if we have
> > sparse IDs at all, we might as well move to a list.
> > 
> > In any case, moving to the list means we'd have a nodeid as part of the
> > structure instead.
> > 
> > > That way it will be one more step closer to converting numa
> > > infrastructure to a set of QOM objects.
> > 
> > Sounds like a good idea to me. I'll respin the patch soon.
> 
> Having a list makes sense, the only difference is that keeping a sparse
> array sorted is much easier than making a sorted list (because the ACPI
> tables are nodeid-ordered). That's why I suggested keeping the array
> initially.
> 
> Adding a "present" field to the array is a trivial and easy-to-review
> change. Changing NodeInfo to use linked lists is a more complex change
> that I wouldn't want to include after soft freeze.
> 
> In other words:
>  * Having a list is better than a sparse array; but:
>  * Having a small sparse array with the "present" field is better
>    than broken sparse nodeid support (IMO).

I agree here. This patchset is still RFC but if that's
the only issue I might apply it.

> <rant>
> Improving the code is always welcome (and adding a list would be an
> improvement), but we don't need to implement all refactoring changes we
> can dream of, before including a new feature. I like to evaluate how the
> code looks like compared to what we have today, not with what we want it
> to look like in one year.
> 
> Could we (the QEMU community) please learn to sometimes accept simple
> features or fixes (which don't make the code worse) without requiring
> the whole code to be rewritten first?
> </rant>

Whether this would be a win for the community depends on the specifics
of each case. Contributors are often inclined not to look beyond their
immediate needs and their specific small project.  Who gets to clean it
up then?

> -- 
> Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-25 17:04           ` Nishanth Aravamudan
@ 2014-06-25 18:24             ` Michael S. Tsirkin
  2014-06-26  6:29             ` Igor Mammedov
  1 sibling, 0 replies; 22+ messages in thread
From: Michael S. Tsirkin @ 2014-06-25 18:24 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Eduardo Habkost, Alexey Kardashevskiy, Hu Tao, qemu-devel,
	Anton Blanchard, David Rientjes, Igor Mammedov

On Wed, Jun 25, 2014 at 10:04:18AM -0700, Nishanth Aravamudan wrote:
> On 25.06.2014 [13:52:56 -0300], Eduardo Habkost wrote:
> > On Wed, Jun 25, 2014 at 09:13:59AM -0700, Nishanth Aravamudan wrote:
> > > On 25.06.2014 [13:21:34 +0200], Igor Mammedov wrote:
> > > > On Tue, 24 Jun 2014 10:40:38 -0700
> > > > Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> > > <snip>
> > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > index 277230d..b90bf66 100644
> > > > > --- a/include/sysemu/sysemu.h
> > > > > +++ b/include/sysemu/sysemu.h
> > > > > @@ -145,11 +145,13 @@ extern int mem_prealloc;
> > > > >   */
> > > > >  #define MAX_CPUMASK_BITS 255
> > > > >  
> > > > > -extern int nb_numa_nodes;
> > > > > +extern int nb_numa_nodes; /* Number of NUMA nodes */
> > > > > +extern int max_numa_node; /* Highest specified NUMA node ID */
> > > > >  typedef struct node_info {
> > > > >      uint64_t node_mem;
> > > > >      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> > > > >      struct HostMemoryBackend *node_memdev;
> > > > > +    bool present;
> > > > How about dropping 'present' and replacing array with a list
> > > > of only present nodes?
> > > 
> > > If that would be preferred, I can move to that. I assume a simple
> > > linked-list is fine. Does qemu provide any infrastructure for defining
> > > lists? I'll look through the source but any pointers would be helpful.
> > > 
> > > Generally speaking, sparse NUMA nodes aren't that common and when they
> > > exist, the gaps aren't large. But it does seem to make sense if we have
> > > sparse IDs at all, we might as well move to a list.
> > > 
> > > In any case, moving to the list means we'd have a nodeid as part of the
> > > structure instead.
> > > 
> > > > That way it will be one more step closer to converting numa
> > > > infrastructure to a set of QOM objects.
> > > 
> > > Sounds like a good idea to me. I'll respin the patch soon.
> > 
> > Having a list makes sense, the only difference is that keeping a sparse
> > array sorted is much easier than making a sorted list (because the ACPI
> > tables are nodeid-ordered). That's why I suggested keeping the array
> > initially.
> 
> And for non-ACPI platforms, it does feel like keeping the list sorted is
> ideal, as it simplifies various loops, etc.
> 
> > Adding a "present" field to the array is a trivial and easy-to-review
> > change. Changing NodeInfo to use linked lists is a more complex change
> > that I wouldn't want to include after soft freeze.
> > 
> > In other words:
> >  * Having a list is better than a sparse array; but:
> >  * Having a small sparse array with the "present" field is better
> >    than broken sparse nodeid support (IMO).
> 
> Perhaps as a compromise I can work on the list conversion as a follow-on
> patch?
> 
> Thanks,
> Nish

Yes, that's fine I think, we can do this after 2.1.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-25 17:04           ` Nishanth Aravamudan
  2014-06-25 18:24             ` Michael S. Tsirkin
@ 2014-06-26  6:29             ` Igor Mammedov
  1 sibling, 0 replies; 22+ messages in thread
From: Igor Mammedov @ 2014-06-26  6:29 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Eduardo Habkost, Michael S. Tsirkin, Alexey Kardashevskiy,
	Hu Tao, qemu-devel, Anton Blanchard, David Rientjes

On Wed, 25 Jun 2014 10:04:18 -0700
Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:

> On 25.06.2014 [13:52:56 -0300], Eduardo Habkost wrote:
> > On Wed, Jun 25, 2014 at 09:13:59AM -0700, Nishanth Aravamudan wrote:
> > > On 25.06.2014 [13:21:34 +0200], Igor Mammedov wrote:
> > > > On Tue, 24 Jun 2014 10:40:38 -0700
> > > > Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> > > <snip>
> > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > index 277230d..b90bf66 100644
> > > > > --- a/include/sysemu/sysemu.h
> > > > > +++ b/include/sysemu/sysemu.h
> > > > > @@ -145,11 +145,13 @@ extern int mem_prealloc;
> > > > >   */
> > > > >  #define MAX_CPUMASK_BITS 255
> > > > >  
> > > > > -extern int nb_numa_nodes;
> > > > > +extern int nb_numa_nodes; /* Number of NUMA nodes */
> > > > > +extern int max_numa_node; /* Highest specified NUMA node ID */
> > > > >  typedef struct node_info {
> > > > >      uint64_t node_mem;
> > > > >      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> > > > >      struct HostMemoryBackend *node_memdev;
> > > > > +    bool present;
> > > > How about dropping 'present' and replacing array with a list
> > > > of only present nodes?
> > > 
> > > If that would be preferred, I can move to that. I assume a simple
> > > linked-list is fine. Does qemu provide any infrastructure for defining
> > > lists? I'll look through the source but any pointers would be helpful.
> > > 
> > > Generally speaking, sparse NUMA nodes aren't that common and when they
> > > exist, the gaps aren't large. But it does seem to make sense if we have
> > > sparse IDs at all, we might as well move to a list.
> > > 
> > > In any case, moving to the list means we'd have a nodeid as part of the
> > > structure instead.
> > > 
> > > > That way it will be one more step closer to converting numa
> > > > infrastructure to a set of QOM objects.
> > > 
> > > Sounds like a good idea to me. I'll respin the patch soon.
> > 
> > Having a list makes sense, the only difference is that keeping a sparse
> > array sorted is much easier than making a sorted list (because the ACPI
> > tables are nodeid-ordered). That's why I suggested keeping the array
> > initially.
> 
> And for non-ACPI platforms, it does feel like keeping the list sorted is
> ideal, as it simplifies various loops, etc.
> 
> > Adding a "present" field to the array is a trivial and easy-to-review
> > change. Changing NodeInfo to use linked lists is a more complex change
> > that I wouldn't want to include after soft freeze.
> > 
> > In other words:
> >  * Having a list is better than a sparse array; but:
> >  * Having a small sparse array with the "present" field is better
> >    than broken sparse nodeid support (IMO).
> 
> Perhaps as a compromise I can work on the list conversion as a follow-on
> patch?
Sure, it was just a suggestion that might make our lives easier in the future. 

> 
> Thanks,
> Nish
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-25 18:23           ` Michael S. Tsirkin
@ 2014-06-26  9:09             ` Hu Tao
  2014-06-26 17:58               ` Nishanth Aravamudan
  0 siblings, 1 reply; 22+ messages in thread
From: Hu Tao @ 2014-06-26  9:09 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Eduardo Habkost, Alexey Kardashevskiy, Nishanth Aravamudan,
	qemu-devel, Anton Blanchard, David Rientjes, Igor Mammedov

On Wed, Jun 25, 2014 at 09:23:17PM +0300, Michael S. Tsirkin wrote:
> On Wed, Jun 25, 2014 at 01:52:56PM -0300, Eduardo Habkost wrote:
> > On Wed, Jun 25, 2014 at 09:13:59AM -0700, Nishanth Aravamudan wrote:
> > > On 25.06.2014 [13:21:34 +0200], Igor Mammedov wrote:
> > > > On Tue, 24 Jun 2014 10:40:38 -0700
> > > > Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> > > <snip>
> > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > index 277230d..b90bf66 100644
> > > > > --- a/include/sysemu/sysemu.h
> > > > > +++ b/include/sysemu/sysemu.h
> > > > > @@ -145,11 +145,13 @@ extern int mem_prealloc;
> > > > >   */
> > > > >  #define MAX_CPUMASK_BITS 255
> > > > >  
> > > > > -extern int nb_numa_nodes;
> > > > > +extern int nb_numa_nodes; /* Number of NUMA nodes */
> > > > > +extern int max_numa_node; /* Highest specified NUMA node ID */
> > > > >  typedef struct node_info {
> > > > >      uint64_t node_mem;
> > > > >      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> > > > >      struct HostMemoryBackend *node_memdev;
> > > > > +    bool present;
> > > > How about dropping 'present' and replacing array with a list
> > > > of only present nodes?
> > > 
> > > If that would be preferred, I can move to that. I assume a simple
> > > linked-list is fine. Does qemu provide any infrastructure for defining
> > > lists? I'll look through the source but any pointers would be helpful.
> > > 
> > > Generally speaking, sparse NUMA nodes aren't that common and when they
> > > exist, the gaps aren't large. But it does seem to make sense if we have
> > > sparse IDs at all, we might as well move to a list.
> > > 
> > > In any case, moving to the list means we'd have a nodeid as part of the
> > > structure instead.
> > > 
> > > > That way it will be one more step closer to converting numa
> > > > infrastructure to a set of QOM objects.
> > > 
> > > Sounds like a good idea to me. I'll respin the patch soon.
> > 
> > Having a list makes sense, the only difference is that keeping a sparse
> > array sorted is much easier than making a sorted list (because the ACPI
> > tables are nodeid-ordered). That's why I suggested keeping the array
> > initially.
> > 
> > Adding a "present" field to the array is a trivial and easy-to-review
> > change. Changing NodeInfo to use linked lists is a more complex change
> > that I wouldn't want to include after soft freeze.
> > 
> > In other words:
> >  * Having a list is better than a sparse array; but:
> >  * Having a small sparse array with the "present" field is better
> >    than broken sparse nodeid support (IMO).
> 
> I agree here. This patchset is still RFC but if that's
> the only issue I might apply it.

I don't think it is ready for 2.1 since it lacks arch-specific changes.
The patch itself only makes qemu be able to bring up guest with sparse
numa nodes, without arch-specific changes, guest sees only one node(at
least on x86).

Hu

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-26  9:09             ` Hu Tao
@ 2014-06-26 17:58               ` Nishanth Aravamudan
  2014-06-26 18:39                 ` Eduardo Habkost
  0 siblings, 1 reply; 22+ messages in thread
From: Nishanth Aravamudan @ 2014-06-26 17:58 UTC (permalink / raw)
  To: Hu Tao
  Cc: Eduardo Habkost, Michael S. Tsirkin, Alexey Kardashevskiy,
	qemu-devel, Anton Blanchard, David Rientjes, Igor Mammedov

On 26.06.2014 [17:09:25 +0800], Hu Tao wrote:
> On Wed, Jun 25, 2014 at 09:23:17PM +0300, Michael S. Tsirkin wrote:
> > On Wed, Jun 25, 2014 at 01:52:56PM -0300, Eduardo Habkost wrote:
> > > On Wed, Jun 25, 2014 at 09:13:59AM -0700, Nishanth Aravamudan wrote:
> > > > On 25.06.2014 [13:21:34 +0200], Igor Mammedov wrote:
> > > > > On Tue, 24 Jun 2014 10:40:38 -0700
> > > > > Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> > > > <snip>
> > > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > > index 277230d..b90bf66 100644
> > > > > > --- a/include/sysemu/sysemu.h
> > > > > > +++ b/include/sysemu/sysemu.h
> > > > > > @@ -145,11 +145,13 @@ extern int mem_prealloc;
> > > > > >   */
> > > > > >  #define MAX_CPUMASK_BITS 255
> > > > > >  
> > > > > > -extern int nb_numa_nodes;
> > > > > > +extern int nb_numa_nodes; /* Number of NUMA nodes */
> > > > > > +extern int max_numa_node; /* Highest specified NUMA node ID */
> > > > > >  typedef struct node_info {
> > > > > >      uint64_t node_mem;
> > > > > >      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> > > > > >      struct HostMemoryBackend *node_memdev;
> > > > > > +    bool present;
> > > > > How about dropping 'present' and replacing array with a list
> > > > > of only present nodes?
> > > > 
> > > > If that would be preferred, I can move to that. I assume a simple
> > > > linked-list is fine. Does qemu provide any infrastructure for defining
> > > > lists? I'll look through the source but any pointers would be helpful.
> > > > 
> > > > Generally speaking, sparse NUMA nodes aren't that common and when they
> > > > exist, the gaps aren't large. But it does seem to make sense if we have
> > > > sparse IDs at all, we might as well move to a list.
> > > > 
> > > > In any case, moving to the list means we'd have a nodeid as part of the
> > > > structure instead.
> > > > 
> > > > > That way it will be one more step closer to converting numa
> > > > > infrastructure to a set of QOM objects.
> > > > 
> > > > Sounds like a good idea to me. I'll respin the patch soon.
> > > 
> > > Having a list makes sense, the only difference is that keeping a sparse
> > > array sorted is much easier than making a sorted list (because the ACPI
> > > tables are nodeid-ordered). That's why I suggested keeping the array
> > > initially.
> > > 
> > > Adding a "present" field to the array is a trivial and easy-to-review
> > > change. Changing NodeInfo to use linked lists is a more complex change
> > > that I wouldn't want to include after soft freeze.
> > > 
> > > In other words:
> > >  * Having a list is better than a sparse array; but:
> > >  * Having a small sparse array with the "present" field is better
> > >    than broken sparse nodeid support (IMO).
> > 
> > I agree here. This patchset is still RFC but if that's
> > the only issue I might apply it.
> 
> I don't think it is ready for 2.1 since it lacks arch-specific changes.
> The patch itself only makes qemu be able to bring up guest with sparse
> numa nodes, without arch-specific changes, guest sees only one node(at
> least on x86).

Well, it does no harm or foul to have it, on its own. In that, nothing
is worse or better than it was. Although I would argue the qemu core is
better than it was :) I have sent the powerpc enabling patch, but even
with that, we need Alexey's series of 6 patches (which do fix a
regression) to fully support the sparse numbering. Maybe that means I've
convinced myself it's best to wait until after 2.1.

I honestly am not sure how to "fix" the x86 code to support the sparse
numbering -- the APIC code seems to rely on it being continuous. From
some googling, it seems like the most common reason for non-continuous
numbering on x86 is faulty hardware. I will keep fiddling if no one else
steps up, but I'm not an x86 NUMA expert :)

Thanks,
Nish

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-26 17:58               ` Nishanth Aravamudan
@ 2014-06-26 18:39                 ` Eduardo Habkost
  0 siblings, 0 replies; 22+ messages in thread
From: Eduardo Habkost @ 2014-06-26 18:39 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Michael S. Tsirkin, Alexey Kardashevskiy, Hu Tao, qemu-devel,
	Anton Blanchard, David Rientjes, Igor Mammedov

On Thu, Jun 26, 2014 at 10:58:07AM -0700, Nishanth Aravamudan wrote:
> On 26.06.2014 [17:09:25 +0800], Hu Tao wrote:
> > On Wed, Jun 25, 2014 at 09:23:17PM +0300, Michael S. Tsirkin wrote:
> > > On Wed, Jun 25, 2014 at 01:52:56PM -0300, Eduardo Habkost wrote:
> > > > On Wed, Jun 25, 2014 at 09:13:59AM -0700, Nishanth Aravamudan wrote:
> > > > > On 25.06.2014 [13:21:34 +0200], Igor Mammedov wrote:
> > > > > > On Tue, 24 Jun 2014 10:40:38 -0700
> > > > > > Nishanth Aravamudan <nacc@linux.vnet.ibm.com> wrote:
> > > > > <snip>
> > > > > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > > > > index 277230d..b90bf66 100644
> > > > > > > --- a/include/sysemu/sysemu.h
> > > > > > > +++ b/include/sysemu/sysemu.h
> > > > > > > @@ -145,11 +145,13 @@ extern int mem_prealloc;
> > > > > > >   */
> > > > > > >  #define MAX_CPUMASK_BITS 255
> > > > > > >  
> > > > > > > -extern int nb_numa_nodes;
> > > > > > > +extern int nb_numa_nodes; /* Number of NUMA nodes */
> > > > > > > +extern int max_numa_node; /* Highest specified NUMA node ID */
> > > > > > >  typedef struct node_info {
> > > > > > >      uint64_t node_mem;
> > > > > > >      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> > > > > > >      struct HostMemoryBackend *node_memdev;
> > > > > > > +    bool present;
> > > > > > How about dropping 'present' and replacing array with a list
> > > > > > of only present nodes?
> > > > > 
> > > > > If that would be preferred, I can move to that. I assume a simple
> > > > > linked-list is fine. Does qemu provide any infrastructure for defining
> > > > > lists? I'll look through the source but any pointers would be helpful.
> > > > > 
> > > > > Generally speaking, sparse NUMA nodes aren't that common and when they
> > > > > exist, the gaps aren't large. But it does seem to make sense if we have
> > > > > sparse IDs at all, we might as well move to a list.
> > > > > 
> > > > > In any case, moving to the list means we'd have a nodeid as part of the
> > > > > structure instead.
> > > > > 
> > > > > > That way it will be one more step closer to converting numa
> > > > > > infrastructure to a set of QOM objects.
> > > > > 
> > > > > Sounds like a good idea to me. I'll respin the patch soon.
> > > > 
> > > > Having a list makes sense, the only difference is that keeping a sparse
> > > > array sorted is much easier than making a sorted list (because the ACPI
> > > > tables are nodeid-ordered). That's why I suggested keeping the array
> > > > initially.
> > > > 
> > > > Adding a "present" field to the array is a trivial and easy-to-review
> > > > change. Changing NodeInfo to use linked lists is a more complex change
> > > > that I wouldn't want to include after soft freeze.
> > > > 
> > > > In other words:
> > > >  * Having a list is better than a sparse array; but:
> > > >  * Having a small sparse array with the "present" field is better
> > > >    than broken sparse nodeid support (IMO).
> > > 
> > > I agree here. This patchset is still RFC but if that's
> > > the only issue I might apply it.
> > 
> > I don't think it is ready for 2.1 since it lacks arch-specific changes.
> > The patch itself only makes qemu be able to bring up guest with sparse
> > numa nodes, without arch-specific changes, guest sees only one node(at
> > least on x86).
> 
> Well, it does no harm or foul to have it, on it's own. In that, nothing
> is worse or better than it was. Although I would argue the qemu core is
> better than it was :) I have sent the powerpc enabling patch, but even
> with that, we need Alexey's series of 6 patches (which do fix a
> regression) to fully support the sparse numbering. Maybe that means I've
> convinced myself it's best to wait until after 2.1.
> 
> I honestly am not sure how to "fix" the x86 code to support the sparse
> numbering -- the APIC code seems to rely on it being continuous. From
> some googling, it seems like the most common reason for non-continuous
> numbering on x86 is faulty hardware. I will keep fiddling if no one else
> steps up, but I'm not an x86 NUMA expert :)

There are two ACPI table building methods: the fw_cfg-based (where
SeaBIOS builds the SRAT table), and the acpi-build.c code.

The fw_cfg-based method assumes contiguous nodeids: it builds a table
with nb_numa_nodes entries, carrying only the memory size of each node.

The acpi-build.c code could easily use the "present" flag, and doesn't
seem to require contiguous nodeids. The ACPI specs don't seem to have
any requirement about contiguous proximity domain IDs, but I am not sure
about how guest OSes would react.

acpi-build.c has some confusing code to make sure the table has at least
nb_numa_nodes+2 entries (adding disabled entries at the end of the table
if necessary), but I don't understand why. Maybe it is just for
compatibility with SeaBIOS versions that calculated the SRAT table size
based on nb_numa_nodes beforehand.

But we need to do this carefully, as changing the behavior again later
would require compatibility code, and would make things worse.  I don't
even know if guest OSes would properly handle non-contiguous node IDs.

In either case, QEMU is currently broken when nodeids are not contiguous
or nodeids are repeated in the command-line. So I suggest we do this on
QEMU 2.1:

 * Add the "present" field (required to allow us to detect
   command-lines with missing nodeids);
 * Reject the command-line if a nodeid is specified twice;
 * Make the PC initialization code reject the configuration if some
   nodeid is not present. This is a bug fix, not a regression, as
   non-contiguous nodeids were never supported.

This way, we are not introducing new behavior to keep compatibility in
the future, but we are fixing the bug where QEMU doesn't complain about
non-contiguous nodeids. And after 2.1, we can make PC support
non-contiguous nodeids if we agree it is useful.

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-24 17:40   ` [Qemu-devel] [RFC PATCH v3] " Nishanth Aravamudan
  2014-06-25 11:21     ` Igor Mammedov
@ 2014-06-26 19:37     ` Eduardo Habkost
  2014-06-30 18:26       ` Nishanth Aravamudan
  1 sibling, 1 reply; 22+ messages in thread
From: Eduardo Habkost @ 2014-06-26 19:37 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Michael S. Tsirkin, Alexey Kardashevskiy, Hu Tao, qemu-devel,
	Anton Blanchard, David Rientjes, Igor Mammedov

On Tue, Jun 24, 2014 at 10:40:38AM -0700, Nishanth Aravamudan wrote:
> Sparse node numbering occurs on powerpc in practice under PowerVM. In
> order to emulate the same NUMA topology under qemu, the assumption that
> NUMA nodes are linearly ordered has to be removed. qemu actually already
> supports (inasmuch as it doesn't throw an error) sparse node numbering
> by the end-user, but the information is effectively ignored and the
> nodes are populated linearly starting at 0. This means a user's node ID
> requests are not honored in the Linux kernel, and that the topology may
> not be as requested (in terms of the CPU/memory placement).
> 
> Add a present field to NodeInfo which indicates if a given nodeid was
> present on the command-line or not. Current code relies on a memory
> value being passed for a node to indicate presence, which is
> insufficient in the presence of memoryless nodes or sparse numbering.
> Adjust the iteration of various NUMA loops to use the maximum known NUMA
> ID rather than the number of NUMA nodes.
> 
> numa.c::set_numa_nodes() has become a bit more convoluted for
> round-robin'ing the CPUs over known nodes when not specified by the
> user.
> 
> Note that architecture-specific code still possibly needs changes
> (forthcoming) for both sparse node numbering and memoryless nodes. This
> only puts in the generic infrastructure to support that.
> 
> Examples:
> 
> (1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa
> node,nodeid=2 -smp 16
> 
> Before:
> 
> node 0 cpus: 0 2 4 6 8 10 12 14
> node 0 size: 2048 MB
> node 1 cpus: 1 3 5 7 9 11 13 15
> node 1 size: 2048 MB
> 
> After:
> 
> node 2 cpus: 0 2 4 6 8 10 12 14                                               |
> node 2 size: 2048 MB                                                          |
> node 3 cpus: 1 3 5 7 9 11 13 15                                               |
> node 3 size: 2048 MB
> 
> (2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa
> node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16
> 
> node 0 cpus: 0 4 8 12                                                         |
> node 0 size: 1024 MB                                                          |
> node 1 cpus: 1 5 9 13                                                         |
> node 1 size: 1024 MB                                                          |
> node 2 cpus: 2 6 10 14                                                        |
> node 2 size: 1024 MB                                                          |
> node 3 cpus: 3 7 11 15                                                        |
> node 3 size: 1024 MB
> 
> (qemu) info numa
> 4 nodes
> node 0 cpus: 0 4 8 12
> node 0 size: 1024 MB
> node 1 cpus: 1 5 9 13
> node 1 size: 1024 MB
> node 2 cpus: 2 6 10 14
> node 2 size: 1024 MB
> node 3 cpus: 3 7 11 15
> node 3 size: 1024 MB
> 
> Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
> Tested-by: Hu Tao <hutao@cn.fujitsu.com>
> 
> ---
> Based off mst's for_upstream tag, which has the NodeInfo changes.
> 
> v1 -> v2:
>   Modify set_numa_nodes loop for round-robin'ing CPUs.
> 
> v2 -> v3:
>   Updated changelog to indicate problem being solved.
>   Updated memory_region_allocate_system_memory based upon feedback from
>     Hu.
>   Updated set_numa_nodes loop again to be simpler based upon feedback
>     from Hu.
>   Fixed bug with a mix of nodes with nodeid specified and without, where
>     the same nodeid would be used by the explicit specification and the
>     auto-numbering code.
> 
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 277230d..b90bf66 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -145,11 +145,13 @@ extern int mem_prealloc;
>   */
>  #define MAX_CPUMASK_BITS 255
>  
> -extern int nb_numa_nodes;
> +extern int nb_numa_nodes; /* Number of NUMA nodes */

I would rename it to num_numa_nodes, so we can easily ensure all code
using nb_numa_nodes will be converted appropriately.

> +extern int max_numa_node; /* Highest specified NUMA node ID */

I would rename it max_numa_nodeid, to make it clear it is the maximum
ID, not the maximum number of nodes.

The comment is inaccurate: it is the highest specified node ID, plus
one. I would specify it as:

/* Highest NUMA node ID, plus one, so for all present NUMA nodes,
 * nodeid < max_numa_nodeid
 */

>  typedef struct node_info {
>      uint64_t node_mem;
>      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
>      struct HostMemoryBackend *node_memdev;
> +    bool present;
>  } NodeInfo;
>  extern NodeInfo numa_info[MAX_NODES];
>  void set_numa_nodes(void);
> diff --git a/monitor.c b/monitor.c
> index c7f8797..4721996 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -2003,7 +2003,10 @@ static void do_info_numa(Monitor *mon, const QDict *qdict)
>      CPUState *cpu;
>  
>      monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
> -    for (i = 0; i < nb_numa_nodes; i++) {
> +    for (i = 0; i < max_numa_node; i++) {
> +        if (!numa_info[i].present) {
> +            continue;
> +        }
>          monitor_printf(mon, "node %d cpus:", i);
>          CPU_FOREACH(cpu) {
>              if (cpu->numa_node == i) {
> diff --git a/numa.c b/numa.c
> index e471afe..d6b86ab 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -53,7 +53,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
>      if (node->has_nodeid) {
>          nodenr = node->nodeid;
>      } else {
> -        nodenr = nb_numa_nodes;
> +        nodenr = 0;
> +        while (numa_info[nodenr].present) {
> +            nodenr++;
> +        }
>      }
>  
>      if (nodenr >= MAX_NODES) {
> @@ -106,6 +109,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
>          numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
>          numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
>      }
> +    numa_info[nodenr].present = true;

What about rejecting nodes which are already present? A node shouldn't
appear twice on the command-line.

By repeating nodeids, a user can make nb_numa_nodes > max_numa_node,
and make existing code access memory beyond memory_info[].


> +    if (nodenr >= max_numa_node) {
> +        max_numa_node = nodenr + 1;
> +    }
>  }
>  
>  int numa_init_func(QemuOpts *opts, void *opaque)
> @@ -155,7 +162,7 @@ void set_numa_nodes(void)
>  {
>      if (nb_numa_nodes > 0) {
>          uint64_t numa_total;
> -        int i;
> +        int i, j = -1;

Can you please initialize j closer to the loop where it is used?

>  
>          if (nb_numa_nodes > MAX_NODES) {
>              nb_numa_nodes = MAX_NODES;
> @@ -164,27 +171,29 @@ void set_numa_nodes(void)
>          /* If no memory size if given for any node, assume the default case
>           * and distribute the available memory equally across all nodes
>           */
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (numa_info[i].node_mem != 0) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present && numa_info[i].node_mem != 0) {
>                  break;
>              }
>          }
> -        if (i == nb_numa_nodes) {
> +        if (i == max_numa_node) {
>              uint64_t usedmem = 0;
>  
>              /* On Linux, the each node's border has to be 8MB aligned,
>               * the final node gets the rest.
>               */
> -            for (i = 0; i < nb_numa_nodes - 1; i++) {
> -                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> -                                        ~((1 << 23UL) - 1);
> -                usedmem += numa_info[i].node_mem;
> +            for (i = 0; i < max_numa_node - 1; i++) {
> +                if (numa_info[i].present) {
> +                    numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> +                                            ~((1 << 23UL) - 1);
> +                    usedmem += numa_info[i].node_mem;
> +                }
>              }

This part is tricky: the following line works only because
numa_info[max_numa_node-1] is always present:

>              numa_info[i].node_mem = ram_size - usedmem;

So, what about adding assert(numa_info[i].present) here?

>          }
>  
>          numa_total = 0;
> -        for (i = 0; i < nb_numa_nodes; i++) {
> +        for (i = 0; i < max_numa_node; i++) {
>              numa_total += numa_info[i].node_mem;
>          }
>          if (numa_total != ram_size) {
> @@ -194,8 +203,9 @@ void set_numa_nodes(void)
>              exit(1);
>          }
>  
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present &&
> +                !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
>                  break;
>              }
>          }
> @@ -203,9 +213,12 @@ void set_numa_nodes(void)
>           * must cope with this anyway, because there are BIOSes out there in
>           * real machines which also use this scheme.
>           */
> -        if (i == nb_numa_nodes) {
> +        if (i == max_numa_node) {
>              for (i = 0; i < max_cpus; i++) {
> -                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> +                do {
> +                    j = (j + 1) % max_numa_node;
> +                } while (!numa_info[j].present);

If you change it from "do { } while" to "while { }", you don't need to
initialize j to -1.

> +                set_bit(i, numa_info[j].node_cpu);
>              }
>          }
>      }
> @@ -217,8 +230,9 @@ void set_numa_modes(void)
>      int i;
>  
>      CPU_FOREACH(cpu) {
> -        for (i = 0; i < nb_numa_nodes; i++) {
> -            if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
> +        for (i = 0; i < max_numa_node; i++) {
> +            if (numa_info[i].present &&
> +                test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
>                  cpu->numa_node = i;
>              }
>          }
> @@ -266,10 +280,13 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
>      }
>  
>      memory_region_init(mr, owner, name, ram_size);
> -    for (i = 0; i < MAX_NODES; i++) {
> +    for (i = 0; i < max_numa_node; i++) {
>          Error *local_err = NULL;
>          uint64_t size = numa_info[i].node_mem;
>          HostMemoryBackend *backend = numa_info[i].node_memdev;
> +        if (!numa_info[i].present) {
> +            continue;
> +        }
>          if (!backend) {
>              continue;
>          }
> diff --git a/vl.c b/vl.c
> index 54b4627..e1a6ab8 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -195,6 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
>      QTAILQ_HEAD_INITIALIZER(fw_boot_order);
>  
>  int nb_numa_nodes;
> +int max_numa_node;
>  NodeInfo numa_info[MAX_NODES];
>  
>  uint8_t qemu_uuid[16];
> @@ -2967,10 +2968,12 @@ int main(int argc, char **argv, char **envp)
>  
>      for (i = 0; i < MAX_NODES; i++) {
>          numa_info[i].node_mem = 0;
> +        numa_info[i].present = false;
>          bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
>      }
>  
>      nb_numa_nodes = 0;
> +    max_numa_node = -1;

As we just need nodeids to be < max_numa_node, max_numa_node can be
safely initialized to 0 instead of -1.

>      nb_nics = 0;
>  
>      bdrv_init_with_whitelist();
> 

Considering the size of this patch (and the rest of the machine-specific
code which still needs to be changed), I have a suggestion:

We can first make QEMU reject non-contiguous nodeid on all
architectures, by introducing the "present" field, rejecting duplicate
nodeids, and rejecting the configuration if max_numa_node !=
nb_numa_nodes at the beginning of set_numa_nodes() (preferably with an
assert(node_info[i].present) loop, too). This way, we won't need to
touch much of the code to keep existing behavior, and the patch will fix
bugs but be small and very easy to review.

After we do that, we can review and consider a patch that adds support
to non-contiguous nodeids, which would be a larger and more intrusive
patch. Maybe it will be 2.1 material, maybe it won't, I don't know.

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-26 19:37     ` Eduardo Habkost
@ 2014-06-30 18:26       ` Nishanth Aravamudan
  2014-06-30 21:48         ` Eduardo Habkost
  0 siblings, 1 reply; 22+ messages in thread
From: Nishanth Aravamudan @ 2014-06-30 18:26 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: Michael S. Tsirkin, Alexey Kardashevskiy, Hu Tao, qemu-devel,
	Anton Blanchard, David Rientjes, Igor Mammedov

On 26.06.2014 [16:37:05 -0300], Eduardo Habkost wrote:
> On Tue, Jun 24, 2014 at 10:40:38AM -0700, Nishanth Aravamudan wrote:
<snip>
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > index 277230d..b90bf66 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -145,11 +145,13 @@ extern int mem_prealloc;
> >   */
> >  #define MAX_CPUMASK_BITS 255
> >  
> > -extern int nb_numa_nodes;
> > +extern int nb_numa_nodes; /* Number of NUMA nodes */
> 
> I would rename it to num_numa_nodes, so we can easily ensure all code
> using nb_numa_nodes will be converted appropriately.
> 
> > +extern int max_numa_node; /* Highest specified NUMA node ID */
> 
> I would rename it max_numa_nodeid, to make it clear it is the maximum
> ID, not the maximum number of nodes.

Thanks, I'm rebasing onto your series now.

<snip>

> >  int numa_init_func(QemuOpts *opts, void *opaque)
> > @@ -155,7 +162,7 @@ void set_numa_nodes(void)
> >  {
> >      if (nb_numa_nodes > 0) {
> >          uint64_t numa_total;
> > -        int i;
> > +        int i, j = -1;
> 
> Can you please initialize j closer to the loop where it is used?

Yep.

<snip>

> >              /* On Linux, the each node's border has to be 8MB aligned,
> >               * the final node gets the rest.
> >               */
> > -            for (i = 0; i < nb_numa_nodes - 1; i++) {
> > -                numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> > -                                        ~((1 << 23UL) - 1);
> > -                usedmem += numa_info[i].node_mem;
> > +            for (i = 0; i < max_numa_node - 1; i++) {
> > +                if (numa_info[i].present) {
> > +                    numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> > +                                            ~((1 << 23UL) - 1);
> > +                    usedmem += numa_info[i].node_mem;
> > +                }
> >              }
> 
> This part is tricky: the following line works only because
> numa_info[max_numa_node-1] is always present:
> 
> >              numa_info[i].node_mem = ram_size - usedmem;
> 
> So, what about adding assert(numa_info[i].present) here?

Yep.

<snip>

> > @@ -203,9 +213,12 @@ void set_numa_nodes(void)
> >           * must cope with this anyway, because there are BIOSes out there in
> >           * real machines which also use this scheme.
> >           */
> > -        if (i == nb_numa_nodes) {
> > +        if (i == max_numa_node) {
> >              for (i = 0; i < max_cpus; i++) {
> > -                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> > +                do {
> > +                    j = (j + 1) % max_numa_node;
> > +                } while (!numa_info[j].present);
> 
> If you change it from "do { } while" to "while { }", you don't need to
> initialize j to -1.

I don't think that's quite as simple as you make it out to be. If you
use a while() loop, we won't always increment j, which means once we've
found a present node, we'll always use that node? j here basically
represents the *last* used nodeid, which we don't want to use again when
we re-enter the for-loop, we want to use the next present nodeid. It
seems like the do {} while() does this fine?  I could use a while() if I
added another increment outside the loop, as follows:

        if (i == max_numa_nodeid) {
            for (i = 0, j = 0; i < max_cpus; i++) {
                while (!numa_info[j].present) {
                    j = (j + 1) % (max_numa_nodeid);
                }
                set_bit(i, numa_info[j].node_cpu);
                j = (j + 1) % (max_numa_nodeid);
            }
        }

If you think that is cleaner, I'll use that version.

Thanks,
Nish

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering
  2014-06-30 18:26       ` Nishanth Aravamudan
@ 2014-06-30 21:48         ` Eduardo Habkost
  0 siblings, 0 replies; 22+ messages in thread
From: Eduardo Habkost @ 2014-06-30 21:48 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Michael S. Tsirkin, Alexey Kardashevskiy, Hu Tao, qemu-devel,
	Anton Blanchard, David Rientjes, Igor Mammedov

On Mon, Jun 30, 2014 at 11:26:10AM -0700, Nishanth Aravamudan wrote:
[...]
> > > -        if (i == nb_numa_nodes) {
> > > +        if (i == max_numa_node) {
> > >              for (i = 0; i < max_cpus; i++) {
> > > -                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> > > +                do {
> > > +                    j = (j + 1) % max_numa_node;
> > > +                } while (!numa_info[j].present);
> > 
> > If you change it from "do { } while" to "while { }", you don't need to
> > initialize j to -1.
> 
> I don't think that's quite as simple as you make it out to be. If you
> use a while() loop, we won't always increment j, which means once we've
> found a present node, we'll always use that node? j here basically
> represents the *last* used nodeid, which we don't want to use again when
> we re-enter the for-loop, we want to use the next present nodeid. It
> seems like the do {} while() does this fine?  I could use a while() if I
> added another increment outside the loop, as follows:
> 
>         if (i == max_numa_nodeid) {
>             for (i = 0, j = 0; i < max_cpus; i++) {
>                 while (!numa_info[j].present) {
>                     j = (j + 1) % (max_numa_nodeid);
>                 }
>                 set_bit(i, numa_info[j].node_cpu);
>                 j = (j + 1) % (max_numa_nodeid);
>             }
>         }
> 
> If you think that is cleaner, I'll use that version.

You are right, and your "do { } while" version looks cleaner than
duplicating the "j = (j + 1) % (max_numa_nodeid)" line.

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2014-06-30 21:49 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-06-23 19:33 [Qemu-devel] [RFC PATCH] numa: enable sparse node numbering Nishanth Aravamudan
2014-06-24  0:48 ` [Qemu-devel] [RFC PATCH v2] " Nishanth Aravamudan
2014-06-24  1:08   ` [Qemu-devel] [RFC PATCH] ppc/spapr: support sparse NUMA " Nishanth Aravamudan
2014-06-24  3:34     ` Nishanth Aravamudan
2014-06-24  7:43   ` [Qemu-devel] [RFC PATCH v2] numa: enable sparse " Hu Tao
2014-06-24 17:40   ` [Qemu-devel] [RFC PATCH v3] " Nishanth Aravamudan
2014-06-25 11:21     ` Igor Mammedov
2014-06-25 16:13       ` Nishanth Aravamudan
2014-06-25 16:52         ` Eduardo Habkost
2014-06-25 17:04           ` Nishanth Aravamudan
2014-06-25 18:24             ` Michael S. Tsirkin
2014-06-26  6:29             ` Igor Mammedov
2014-06-25 18:23           ` Michael S. Tsirkin
2014-06-26  9:09             ` Hu Tao
2014-06-26 17:58               ` Nishanth Aravamudan
2014-06-26 18:39                 ` Eduardo Habkost
2014-06-26 19:37     ` Eduardo Habkost
2014-06-30 18:26       ` Nishanth Aravamudan
2014-06-30 21:48         ` Eduardo Habkost
2014-06-24  7:33 ` [Qemu-devel] [RFC PATCH] " Igor Mammedov
2014-06-24  8:58   ` Alexey Kardashevskiy
2014-06-24 16:39   ` Nishanth Aravamudan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.