All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/4] add NUMA emulation
@ 2009-03-31 13:28 Andre Przywara
  2009-03-31 13:28 ` [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser Andre Przywara
  2009-03-31 14:37 ` [Qemu-devel] Re: [PATCH 0/4] add NUMA emulation Anthony Liguori
  0 siblings, 2 replies; 12+ messages in thread
From: Andre Przywara @ 2009-03-31 13:28 UTC (permalink / raw)
  To: qemu-devel

Hi,

the following patches add NUMA emulation to QEMU guests.
Although the ultimate goal is KVM with host side support, these
patches are pure QEMU with no host side binding.
This is a reworked version from the end of last year; I adapted the command line
syntax to Anthony's wishes:
-numa node[,mem=<size>[MG]][,cpus=<from>[-<to>]][,nodeid=<nr>]
If we agree to this scheme (which drops mem=from-to and requires at least
one -numa node for each NUMA node), I will provide more detailed documentation.
Patch 1/4 adds the -numa command line parameter and sets a QEMU global
array with the parsed values. If no specific values for memory and CPUs are
given, all resources will be split equally across all nodes.
Patch 2/4 adds an "info numa" command to the monitor to output the current
topology. Since NUMA is advertised via static ACPI tables, no changes are
possible during runtime.
Patch 3/4 uses the QEMU firmware configuration interface to send the NUMA
topology to the BIOS, which has to set up the tables. Only one channel is used.
Patch 4/4 finally adds the BIOS support, which creates the appropriate
SRAT table reflecting the given topology.

Looking forward to any comments.

Regards,
Andre.

P.S. I hope I have adhered to the latest QEMU coding style ;-)

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 448 3567 12
----to satisfy European law for business letters:
Advanced Micro Devices GmbH
Karl-Hammerschmidt-Str. 34, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Jochen Polster; Thomas M. McCoy; Giuliano Meroni
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser
  2009-03-31 13:28 [Qemu-devel] [PATCH 0/4] add NUMA emulation Andre Przywara
@ 2009-03-31 13:28 ` Andre Przywara
  2009-03-31 13:28   ` [Qemu-devel] [PATCH 2/4] add info numa command to monitor Andre Przywara
  2009-03-31 13:42   ` [Qemu-devel] Re: [PATCH 1/4] added -numa cmdline parameter parser Anthony Liguori
  2009-03-31 14:37 ` [Qemu-devel] Re: [PATCH 0/4] add NUMA emulation Anthony Liguori
  1 sibling, 2 replies; 12+ messages in thread
From: Andre Przywara @ 2009-03-31 13:28 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andre Przywara, Andre Przywara

From: Andre Przywara <aprzywar@amd.com>


Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
 qemu-options.hx |    8 ++++
 sysemu.h        |    7 +++-
 vl.c            |  114 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 124 insertions(+), 5 deletions(-)

diff --git a/qemu-options.hx b/qemu-options.hx
index 6c58e2a..f3f1389 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -40,6 +40,14 @@ CPUs are supported. On Sparc32 target, Linux limits the number of usable CPUs
 to 4.
 ETEXI
 
+DEF("numa", HAS_ARG, QEMU_OPTION_numa,
+    "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n")
+STEXI
+@item -numa @var{opts}
+Simulate a multi node NUMA system. If mem and cpus are omitted, resources
+are split equally.
+ETEXI
+
 DEF("fda", HAS_ARG, QEMU_OPTION_fda,
     "-fda/-fdb file  use 'file' as floppy disk 0/1 image\n")
 DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "")
diff --git a/sysemu.h b/sysemu.h
index 3eab34b..b83a66c 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -108,6 +108,11 @@ extern const char *bootp_filename;
 extern int kqemu_allowed;
 #endif
 
+#define MAX_NODES 64
+extern int nb_numa_nodes;
+extern uint64_t node_mem[MAX_NODES];
+extern uint64_t node_cpumask[MAX_NODES];
+
 #define MAX_OPTION_ROMS 16
 extern const char *option_rom[MAX_OPTION_ROMS];
 extern int nb_option_roms;
@@ -248,7 +253,7 @@ void do_usb_add(Monitor *mon, const char *devname);
 void do_usb_del(Monitor *mon, const char *devname);
 void usb_info(Monitor *mon);
 
-const char *get_opt_name(char *buf, int buf_size, const char *p);
+const char *get_opt_name(char *buf, int buf_size, const char *p, char delim);
 const char *get_opt_value(char *buf, int buf_size, const char *p);
 int get_param_value(char *buf, int buf_size,
                     const char *tag, const char *str);
diff --git a/vl.c b/vl.c
index 5e6c621..3f9c713 100644
--- a/vl.c
+++ b/vl.c
@@ -261,6 +261,10 @@ const char *prom_envs[MAX_PROM_ENVS];
 int nb_drives_opt;
 struct drive_opt drives_opt[MAX_DRIVES];
 
+int nb_numa_nodes;
+uint64_t node_mem[MAX_NODES];
+uint64_t node_cpumask[MAX_NODES];
+
 static CPUState *cur_cpu;
 static CPUState *next_cpu;
 static int event_pending = 1;
@@ -1860,12 +1864,12 @@ static int socket_init(void)
 }
 #endif
 
-const char *get_opt_name(char *buf, int buf_size, const char *p)
+const char *get_opt_name(char *buf, int buf_size, const char *p, char delim)
 {
     char *q;
 
     q = buf;
-    while (*p != '\0' && *p != '=') {
+    while (*p != '\0' && *p != delim) {
         if (q && (q - buf) < buf_size - 1)
             *q++ = *p;
         p++;
@@ -1905,7 +1909,7 @@ int get_param_value(char *buf, int buf_size,
 
     p = str;
     for(;;) {
-        p = get_opt_name(option, sizeof(option), p);
+        p = get_opt_name(option, sizeof(option), p, '=');
         if (*p != '=')
             break;
         p++;
@@ -1930,7 +1934,7 @@ int check_params(char *buf, int buf_size,
 
     p = str;
     for(;;) {
-        p = get_opt_name(buf, buf_size, p);
+        p = get_opt_name(buf, buf_size, p, '=');
         if (*p != '=')
             return -1;
         p++;
@@ -2623,6 +2627,53 @@ int drive_init(struct drive_opt *arg, int snapshot, void *opaque)
     return drives_table_idx;
 }
 
+static void numa_add(const char* optarg)
+{
+char option[128];
+char *endptr;
+unsigned long long value, endvalue;
+int nodenr;
+
+    optarg = get_opt_name(option, 128, optarg, ',') + 1;
+    if (!strcmp(option, "node")) {
+        if (get_param_value(option, 128, "nodeid", optarg) == 0) {
+            nodenr = nb_numa_nodes;
+        } else {
+            nodenr = strtoull(option, NULL, 10);
+        }
+
+        if (get_param_value(option, 128, "mem", optarg) == 0) {
+            node_mem[nodenr] = 0;
+        } else
+        {
+            value = strtoull(option, &endptr, 0);
+            switch (*endptr) {
+            case 0: case 'M': case 'm':
+                value <<= 20;
+                break;
+            case 'G': case 'g':
+                value <<= 30;
+                break;
+            }
+            node_mem[nodenr] = value;
+        }
+        if (get_param_value(option, 128, "cpus", optarg) == 0) {
+            node_cpumask[nodenr] = 0;
+        } else {
+            value = strtoull(option, &endptr, 10);
+            if (*endptr == '-') {
+                endvalue = strtoull(endptr+1, &endptr, 10);
+                value = (1 << (endvalue + 1)) - (1 << value);
+            } else {
+                value = 1 << value;
+            }
+            node_cpumask[nodenr] = value;
+        }
+        nb_numa_nodes++;
+    }
+    return;
+}
+
 /***********************************************************/
 /* USB devices */
 
@@ -4337,12 +4388,18 @@ int main(int argc, char **argv, char **envp)
         virtio_consoles[i] = NULL;
     virtio_console_index = 0;
 
+    for (i = 0; i < MAX_NODES; i++) {
+        node_cpumask[i] = 0;
+        node_mem[i] = 0;
+    }
+
     usb_devices_index = 0;
 
     nb_net_clients = 0;
     nb_bt_opts = 0;
     nb_drives = 0;
     nb_drives_opt = 0;
+    nb_numa_nodes = 0;
     hda_index = -1;
 
     nb_nics = 0;
@@ -4492,6 +4549,13 @@ int main(int argc, char **argv, char **envp)
 			             ",trans=none" : "");
                 }
                 break;
+            case QEMU_OPTION_numa:
+                if (nb_numa_nodes >= MAX_NODES) {
+                    fprintf(stderr, "qemu: too many NUMA nodes\n");
+                    exit(1);
+                }
+                numa_add(optarg);
+                break;
             case QEMU_OPTION_nographic:
                 nographic = 1;
                 break;
@@ -5192,6 +5256,48 @@ int main(int argc, char **argv, char **envp)
         }
     }
 
+    if (nb_numa_nodes > 0) {
+        int i;
+
+        if (nb_numa_nodes > smp_cpus) {
+            nb_numa_nodes = smp_cpus;
+        }
+
+        /* If no memory size if given for any node, assume the default case
+         * and distribute the available memory equally across all nodes
+         */
+        for (i = 0; i < nb_numa_nodes; i++) {
+            if (node_mem[i] != 0)
+                break;
+        }
+        if (i == nb_numa_nodes) {
+            uint64_t usedmem = 0;
+
+            /* On Linux, the each node's border has to be 8MB aligned,
+             * the final node gets the rest.
+             */
+            for (i = 0; i < nb_numa_nodes - 1; i++) {
+                node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1);
+                usedmem += node_mem[i];
+            }
+            node_mem[i] = ram_size - usedmem;
+        }
+
+        for (i = 0; i < nb_numa_nodes; i++) {
+            if (node_cpumask[i] != 0)
+                break;
+        }
+        /* assigning the VCPUs round-robin is easier to implement, guest OSes
+         * must cope with this anyway, because there are BIOSes out there in
+         * real machines which also use this scheme.
+         */
+        if (i == nb_numa_nodes) {
+            for (i = 0; i < smp_cpus; i++) {
+                node_cpumask[i % nb_numa_nodes] |= 1<<i;
+            }
+        }
+    }
+
     if (kvm_enabled()) {
         int ret;
 
-- 
1.6.1.3

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 2/4] add info numa command to monitor
  2009-03-31 13:28 ` [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser Andre Przywara
@ 2009-03-31 13:28   ` Andre Przywara
  2009-03-31 13:28     ` [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS Andre Przywara
  2009-03-31 13:42   ` [Qemu-devel] Re: [PATCH 1/4] added -numa cmdline parameter parser Anthony Liguori
  1 sibling, 1 reply; 12+ messages in thread
From: Andre Przywara @ 2009-03-31 13:28 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andre Przywara, Andre Przywara

From: Andre Przywara <aprzywar@amd.com>

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
 monitor.c |   18 ++++++++++++++++++
 1 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/monitor.c b/monitor.c
index c6fe968..eb7b6cb 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1397,6 +1397,22 @@ static void do_info_kvm(Monitor *mon)
 #endif
 }
 
+static void do_info_numa(Monitor *mon)
+{
+    int i, j;
+
+    monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
+    for (i = 0; i < nb_numa_nodes; i++) {
+        monitor_printf(mon, "node %d cpus:", i);
+        for (j = 0; j < 64; j++) {
+            if (node_cpumask[i] & (1ULL << j)) monitor_printf(mon, " %d", j);
+        }
+        monitor_printf(mon, "\n");
+        monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i,
+            node_mem[i] >> 20);
+    }
+}
+
 #ifdef CONFIG_PROFILER
 
 int64_t kqemu_time;
@@ -1770,6 +1786,8 @@ static const mon_cmd_t info_cmds[] = {
       "", "show KQEMU information", },
     { "kvm", "", do_info_kvm,
       "", "show KVM information", },
+    { "numa", "", do_info_numa,
+      "", "show NUMA information", },
     { "usb", "", usb_info,
       "", "show guest USB devices", },
     { "usbhost", "", usb_host_info,
-- 
1.6.1.3

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS
  2009-03-31 13:28   ` [Qemu-devel] [PATCH 2/4] add info numa command to monitor Andre Przywara
@ 2009-03-31 13:28     ` Andre Przywara
  2009-03-31 13:28       ` [Qemu-devel] [PATCH 4/4] add BIOS support for an ACPI SRAT table (needed for NUMA support) Andre Przywara
  2009-03-31 16:00       ` [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS Blue Swirl
  0 siblings, 2 replies; 12+ messages in thread
From: Andre Przywara @ 2009-03-31 13:28 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andre Przywara, Andre Przywara

From: Andre Przywara <aprzywar@amd.com>


Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
 hw/fw_cfg.h |    1 +
 hw/pc.c     |   22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/hw/fw_cfg.h b/hw/fw_cfg.h
index 41a3dd0..f616ed2 100644
--- a/hw/fw_cfg.h
+++ b/hw/fw_cfg.h
@@ -14,6 +14,7 @@
 #define FW_CFG_INITRD_ADDR      0x0a
 #define FW_CFG_INITRD_SIZE      0x0b
 #define FW_CFG_BOOT_DEVICE      0x0c
+#define FW_CFG_NUMA             0x0d
 #define FW_CFG_MAX_ENTRY        0x10
 
 #define FW_CFG_WRITE_CHANNEL    0x4000
diff --git a/hw/pc.c b/hw/pc.c
index f9cfd1f..5c066b8 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -425,6 +425,8 @@ static void bochs_bios_write(void *opaque, uint32_t addr, uint32_t val)
 static void bochs_bios_init(void)
 {
     void *fw_cfg;
+    uint64_t *numa_fw_cfg;
+    int i, j;
 
     register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
     register_ioport_write(0x401, 1, 2, bochs_bios_write, NULL);
@@ -442,6 +444,26 @@ static void bochs_bios_init(void)
     fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)ram_size);
     fw_cfg_add_bytes(fw_cfg, FW_CFG_ACPI_TABLES, (uint8_t *)acpi_tables,
                      acpi_tables_len);
+
+    /* allocate memory for the NUMA channel: one (64bit) word for the number
+     * of nodes, one word for each VCPU->node and one word for each node to
+     * hold the amount of memory.
+     */
+    numa_fw_cfg = qemu_mallocz ((1 + smp_cpus + nb_numa_nodes) * 8);
+    numa_fw_cfg[0] = nb_numa_nodes;
+    for (i = 0; i < smp_cpus; i++) {
+        for (j = 0; j < nb_numa_nodes; j++) {
+            if (node_cpumask[j] & (1 << i)) {
+                numa_fw_cfg[i + 1] = j;
+                break;
+            }
+        }
+    }
+    for (i = 0; i < nb_numa_nodes; i++) {
+        numa_fw_cfg[smp_cpus + 1 + i] = node_mem[i];
+    }
+    fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t*)numa_fw_cfg,
+                     (1 + smp_cpus + nb_numa_nodes) * 8);
 }
 
 /* Generate an initial boot sector which sets state and jump to
-- 
1.6.1.3

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 4/4] add BIOS support for an ACPI SRAT table (needed for NUMA support)
  2009-03-31 13:28     ` [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS Andre Przywara
@ 2009-03-31 13:28       ` Andre Przywara
  2009-03-31 13:44         ` [Qemu-devel] " Anthony Liguori
  2009-03-31 16:00       ` [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS Blue Swirl
  1 sibling, 1 reply; 12+ messages in thread
From: Andre Przywara @ 2009-03-31 13:28 UTC (permalink / raw)
  To: qemu-devel; +Cc: Andre Przywara


Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
 .../bios-pq/0012_add-SRAT-ACPI-table-support.patch |  307 ++++++++++++++++++++
 pc-bios/bios-pq/series                             |    1 +
 2 files changed, 308 insertions(+), 0 deletions(-)
 create mode 100644 pc-bios/bios-pq/0012_add-SRAT-ACPI-table-support.patch

diff --git a/pc-bios/bios-pq/0012_add-SRAT-ACPI-table-support.patch b/pc-bios/bios-pq/0012_add-SRAT-ACPI-table-support.patch
new file mode 100644
index 0000000..a98c072
--- /dev/null
+++ b/pc-bios/bios-pq/0012_add-SRAT-ACPI-table-support.patch
@@ -0,0 +1,307 @@
+From 40b77280bf956be2e1d5cbd6b2662e861b480112 Mon Sep 17 00:00:00 2001
+From: Andre Przywara <andre.przywara@amd.com>
+Date: Fri, 20 Mar 2009 00:35:06 +0100
+Subject: [PATCH] add SRAT ACPI table support
+
+Take NUMA topology info from the QEMU firmware configuration interface
+(number of nodes, node for each (V)CPU and amount of memory) and build
+a SRAT table describing this topology for the guest OS. Handles more than
+4 GB of RAM by including a hole for 32bit PCI memory mapping.
+---
+ bios/rombios32.c |  175 ++++++++++++++++++++++++++++++++++++++++++++++++++----
+ 1 files changed, 164 insertions(+), 11 deletions(-)
+
+diff --git a/bios/rombios32.c b/bios/rombios32.c
+index 7be4216..02379c0 100644
+--- a/bios/rombios32.c
++++ b/bios/rombios32.c
+@@ -451,6 +451,11 @@ int pm_sci_int;
+ unsigned long bios_table_cur_addr;
+ unsigned long bios_table_end_addr;
+ 
++static inline uint64_t le64_to_cpu(uint64_t x)
++{
++    return x;
++}
++
+ void wrmsr_smp(uint32_t index, uint64_t val)
+ {
+     static struct { uint32_t ecx, eax, edx; } *p = (void *)SMP_MSR_ADDR;
+@@ -469,6 +474,7 @@ void wrmsr_smp(uint32_t index, uint64_t val)
+ #define QEMU_CFG_SIGNATURE  0x00
+ #define QEMU_CFG_ID         0x01
+ #define QEMU_CFG_UUID       0x02
++#define QEMU_CFG_NUMA       0x0D
+ #define QEMU_CFG_ARCH_LOCAL     0x8000
+ #define QEMU_CFG_ACPI_TABLES  (QEMU_CFG_ARCH_LOCAL + 0)
+ 
+@@ -519,6 +525,14 @@ static int acpi_load_table(int i, uint32_t addr, uint16_t *len)
+     qemu_cfg_read((uint8_t*)addr, *len);
+     return 0;
+ }
++
++uint64_t qemu_cfg_get64 (void)
++{
++    uint64_t ret;
++
++    qemu_cfg_read((uint8_t*)&ret, 8);
++    return le64_to_cpu(ret);
++}
+ #endif
+ 
+ void uuid_probe(void)
+@@ -1273,7 +1287,7 @@ struct rsdt_descriptor_rev1
+ {
+ 	ACPI_TABLE_HEADER_DEF                           /* ACPI common table header */
+ #ifdef BX_QEMU
+-	uint32_t                             table_offset_entry [4]; /* Array of pointers to other */
++	uint32_t                             table_offset_entry [5]; /* Array of pointers to other */
+ #else
+ 	uint32_t                             table_offset_entry [3]; /* Array of pointers to other */
+ #endif
+@@ -1381,7 +1395,7 @@ struct multiple_apic_table
+ } __attribute__((__packed__));
+ 
+ 
+-/* Values for Type in APIC_HEADER_DEF */
++/* Values for Type in APIC sub-headers */
+ 
+ #define APIC_PROCESSOR          0
+ #define APIC_IO                 1
+@@ -1394,18 +1408,18 @@ struct multiple_apic_table
+ #define APIC_XRUPT_SOURCE       8
+ #define APIC_RESERVED           9           /* 9 and greater are reserved */
+ 
+-/*
+- * MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE)
+- */
+-#define APIC_HEADER_DEF                     /* Common APIC sub-structure header */\
++#define ACPI_SUB_HEADER_DEF                 /* Common ACPI sub-structure header */\
+ 	uint8_t                              type; \
+ 	uint8_t                              length;
+ 
++/*
++ * MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE)
++ */
+ /* Sub-structures for MADT */
+ 
+ struct madt_processor_apic
+ {
+-	APIC_HEADER_DEF
++	ACPI_SUB_HEADER_DEF
+ 	uint8_t                              processor_id;           /* ACPI processor id */
+ 	uint8_t                              local_apic_id;          /* Processor's local APIC id */
+ #if 0
+@@ -1416,6 +1430,43 @@ struct madt_processor_apic
+ #endif
+ } __attribute__((__packed__));
+ 
++/*
++ * SRAT (NUMA topology description) table
++ */
++
++#define SRAT_PROCESSOR          0
++#define SRAT_MEMORY             1
++
++struct system_resource_affinity_table
++{
++    ACPI_TABLE_HEADER_DEF
++    uint32_t    reserved1;
++    uint32_t    reserved2[2];
++};
++
++struct srat_processor_affinity
++{
++    ACPI_SUB_HEADER_DEF
++    uint8_t     proximity_lo;
++    uint8_t     local_apic_id;
++    uint32_t    flags;
++    uint8_t     local_sapic_eid;
++    uint8_t     proximity_hi[3];
++    uint32_t    reserved;
++};
++
++struct srat_memory_affinity
++{
++    ACPI_SUB_HEADER_DEF
++    uint8_t     proximity[4];
++    uint16_t    reserved1;
++    uint32_t    base_addr_low,base_addr_high;
++    uint32_t    length_low,length_high;
++    uint32_t    reserved2;
++    uint32_t    flags;
++    uint32_t    reserved3[2];
++};
++
+ #ifdef BX_QEMU
+ /*
+  *  * ACPI 2.0 Generic Address Space definition.
+@@ -1444,7 +1495,7 @@ struct acpi_20_hpet {
+ 
+ struct madt_io_apic
+ {
+-	APIC_HEADER_DEF
++	ACPI_SUB_HEADER_DEF
+ 	uint8_t                              io_apic_id;             /* I/O APIC ID */
+ 	uint8_t                              reserved;               /* Reserved - must be zero */
+ 	uint32_t                             address;                /* APIC physical address */
+@@ -1455,7 +1506,7 @@ struct madt_io_apic
+ #ifdef BX_QEMU
+ struct madt_int_override
+ {
+-	APIC_HEADER_DEF
++	ACPI_SUB_HEADER_DEF
+ 	uint8_t                bus;     /* Identifies ISA Bus */
+ 	uint8_t                source;  /* Bus-relative interrupt source */
+ 	uint32_t               gsi;     /* GSI that source will signal */
+@@ -1559,6 +1610,21 @@ int acpi_build_processor_ssdt(uint8_t *ssdt)
+     return ssdt_ptr - ssdt;
+ }
+ 
++static void acpi_build_srat_memory(struct srat_memory_affinity *numamem,
++    uint64_t base, uint64_t len, int node, int enabled)
++{
++     numamem->type = SRAT_MEMORY;
++     numamem->length = sizeof(*numamem);
++     memset (numamem->proximity, 0 ,4);
++     numamem->proximity[0] = node;
++     numamem->flags = cpu_to_le32(!!enabled);
++     numamem->base_addr_low = base & 0xFFFFFFFF;
++     numamem->base_addr_high = base >> 32;
++     numamem->length_low = len & 0xFFFFFFFF;
++     numamem->length_high = len >> 32;
++     return;
++}
++
+ /* base_addr must be a multiple of 4KB */
+ void acpi_bios_init(void)
+ {
+@@ -1569,12 +1635,15 @@ void acpi_bios_init(void)
+     struct multiple_apic_table *madt;
+     uint8_t *dsdt, *ssdt;
+ #ifdef BX_QEMU
++    struct system_resource_affinity_table *srat;
+     struct acpi_20_hpet *hpet;
+     uint32_t hpet_addr;
+ #endif
+     uint32_t base_addr, rsdt_addr, fadt_addr, addr, facs_addr, dsdt_addr, ssdt_addr;
+     uint32_t acpi_tables_size, madt_addr, madt_size, rsdt_size;
++    uint32_t srat_addr,srat_size;
+     uint16_t i, external_tables;
++    int nb_numa_nodes;
+ 
+     /* reserve memory space for tables */
+ #ifdef BX_USE_EBDA_TABLES
+@@ -1616,6 +1685,25 @@ void acpi_bios_init(void)
+     ssdt_addr = addr;
+     ssdt = (void *)(addr);
+     addr += acpi_build_processor_ssdt(ssdt);
++#ifdef BX_QEMU
++    qemu_cfg_select(QEMU_CFG_NUMA);
++    nb_numa_nodes = qemu_cfg_get64();
++#else
++    nb_numa_nodes = 0;
++#endif
++    if (nb_numa_nodes > 0) {
++        addr = (addr + 7) & ~7;
++        srat_addr = addr;
++        srat_size = sizeof(*srat) +
++            sizeof(struct srat_processor_affinity) * smp_cpus +
++            sizeof(struct srat_memory_affinity) * (nb_numa_nodes + 2);
++        srat = (void *)(addr);
++        addr += srat_size;
++    } else {
++        srat_addr = addr;
++        srat = (void*)(addr);
++        srat_size = 0;
++    }
+ 
+     addr = (addr + 7) & ~7;
+     madt_addr = addr;
+@@ -1725,6 +1813,69 @@ void acpi_bios_init(void)
+ 
+     memset(rsdt, 0, rsdt_size);
+ #ifdef BX_QEMU
++    /* SRAT */
++    if (nb_numa_nodes > 0) {
++        struct srat_processor_affinity *core;
++        struct srat_memory_affinity *numamem;
++        int slots;
++        uint64_t mem_len, mem_base, next_base = 0, curnode;
++
++        qemu_cfg_select(QEMU_CFG_NUMA);
++        qemu_cfg_get64();
++        memset (srat, 0 , srat_size);
++        srat->reserved1=1;
++ 
++        core = (void*)(srat + 1);
++        for (i = 0; i < smp_cpus; ++i) {
++             core->type = SRAT_PROCESSOR;
++             core->length = sizeof(*core);
++             core->local_apic_id = i;
++             curnode = qemu_cfg_get64();
++             core->proximity_lo = curnode;
++             memset (core->proximity_hi, 0, 3);
++             core->local_sapic_eid = 0;
++             if (i < smp_cpus)
++                 core->flags = cpu_to_le32(1);
++             else
++                 core->flags = 0;
++             core++;
++        }
++
++        /* the memory map is a bit tricky, it contains at least one hole
++         * from 640k-1M and possibly another one from 3.5G-4G.
++         */
++        numamem = (void*)core; slots = 0;
++        acpi_build_srat_memory(numamem, 0, 640*1024, 0, 1);
++        next_base = 1024 * 1024; numamem++;slots++;
++        for (i = 1; i < nb_numa_nodes + 1; ++i) {
++            mem_base = next_base;
++            mem_len = qemu_cfg_get64();
++            if (i == 1) mem_len -= 1024 * 1024;
++            next_base = mem_base + mem_len;
++ 
++            /* Cut out the PCI hole */
++            if (mem_base <= ram_size && next_base > ram_size) {
++                mem_len -= next_base - ram_size;
++                if (mem_len > 0) {
++                    acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
++                    numamem++; slots++;
++                }
++                mem_base = 1ULL << 32;
++                mem_len = next_base - ram_size;
++                next_base += (1ULL << 32) - ram_size;
++            }
++            acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
++            numamem++; slots++;
++        }
++        for (; slots < nb_numa_nodes + 2; slots++) {
++            acpi_build_srat_memory(numamem, 0, 0, 0, 0);
++            numamem++;
++        }
++
++         acpi_build_table_header((struct acpi_table_header *)srat,
++                                "SRAT", srat_size, 1);
++    }
++
+     /* HPET */
+     memset(hpet, 0, sizeof(*hpet));
+     /* Note timer_block_id value must be kept in sync with value advertised by
+@@ -1753,9 +1904,11 @@ void acpi_bios_init(void)
+     rsdt->table_offset_entry[2] = cpu_to_le32(ssdt_addr);
+ #ifdef BX_QEMU
+     rsdt->table_offset_entry[3] = cpu_to_le32(hpet_addr);
++    if (nb_numa_nodes > 0)
++        rsdt->table_offset_entry[4] = cpu_to_le32(srat_addr);
+ #endif
+-    acpi_build_table_header((struct acpi_table_header *)rsdt,
+-                            "RSDT", rsdt_size, 1);
++    acpi_build_table_header((struct acpi_table_header *)rsdt, "RSDT",
++        rsdt_size - (nb_numa_nodes > 0? 0: sizeof(uint32_t)), 1);
+ 
+     acpi_tables_size = addr - base_addr;
+ 
+-- 
+1.6.1.3
+
diff --git a/pc-bios/bios-pq/series b/pc-bios/bios-pq/series
index 5a29df9..bd04a1a 100644
--- a/pc-bios/bios-pq/series
+++ b/pc-bios/bios-pq/series
@@ -9,3 +9,4 @@
 0009_qemu-bios-pci-hotplug-support.patch
 0010_bios-mark-the-acpi-sci-interrupt-as-connected-to-irq-9.patch
 0011_read-additional-acpi-tables-from-a-vm.patch
+0012_add-SRAT-ACPI-table-support.patch
-- 
1.6.1.3

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [Qemu-devel] Re: [PATCH 1/4] added -numa cmdline parameter parser
  2009-03-31 13:28 ` [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser Andre Przywara
  2009-03-31 13:28   ` [Qemu-devel] [PATCH 2/4] add info numa command to monitor Andre Przywara
@ 2009-03-31 13:42   ` Anthony Liguori
  2009-03-31 20:34     ` Andre Przywara
  1 sibling, 1 reply; 12+ messages in thread
From: Anthony Liguori @ 2009-03-31 13:42 UTC (permalink / raw)
  To: Andre Przywara; +Cc: qemu-devel, Andre Przywara

Andre Przywara wrote:
> diff --git a/sysemu.h b/sysemu.h
> index 3eab34b..b83a66c 100644
> --- a/sysemu.h
> +++ b/sysemu.h
> @@ -108,6 +108,11 @@ extern const char *bootp_filename;
>  extern int kqemu_allowed;
>  #endif
>  
> +#define MAX_NODES 64
> +extern int nb_numa_nodes;
> +extern uint64_t node_mem[MAX_NODES];
>   

Using ram_addr_t would be better here although ram_addr_t is just a 
uint64_t so it's not a big deal.

> +extern uint64_t node_cpumask[MAX_NODES];
>   

This is going to cause some pain because it won't be long before someone 
wants to support more than 64 cpus.  I think there are two 
possibilities.  We could go the cpuset route and introduce a type with 
special accessors to store a CPU bitmap.

Or, we could rely on the property that each CPU can only be part of one 
node and make the node association part of the CPUState.  If for some 
reason it's necessary to enumerate all of the CPUs for a given node, we 
would have to walk the CPU list to get at that information.  I don't 
think that'll be a common thing though.

> +static void numa_add(const char* optarg)
> +{
> +char option[128];
> +char *endptr;
> +unsigned long long value, endvalue;
> +int nodenr;
>   

That doesn't seem right indent-wise.

> +        /* assigning the VCPUs round-robin is easier to implement, guest OSes
> +         * must cope with this anyway, because there are BIOSes out there in
> +         * real machines which also use this scheme.
> +         */
> +        if (i == nb_numa_nodes) {
> +            for (i = 0; i < smp_cpus; i++) {
> +                node_cpumask[i % nb_numa_nodes] |= 1<<i;
> +            }
> +        }
>   

The only thing that I don't like about this is that I don't think the 
current -numa syntax can be used to describe a round-robin allocation.  
IIUC, you can say -numa cpus=3 or -numa cpus=3-4 but there's no way to 
say -numa cpus=3:5.

That means that if we ever change the default behavior, there's no way 
that a management app could recreate the guest with that particular 
topology (think live migration).

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Qemu-devel] Re: [PATCH 4/4] add BIOS support for an ACPI SRAT table (needed for NUMA support)
  2009-03-31 13:28       ` [Qemu-devel] [PATCH 4/4] add BIOS support for an ACPI SRAT table (needed for NUMA support) Andre Przywara
@ 2009-03-31 13:44         ` Anthony Liguori
  2009-03-31 20:04           ` [Qemu-devel] [PATCH 4/4] add SRAT ACPI table support Andre Przywara
  0 siblings, 1 reply; 12+ messages in thread
From: Anthony Liguori @ 2009-03-31 13:44 UTC (permalink / raw)
  To: Andre Przywara; +Cc: qemu-devel

Andre Przywara wrote:
> Signed-off-by: Andre Przywara <andre.przywara@amd.com>
>   
Please send this as a normal patch and CC bochs-devel.  I'll stick it in 
the patch queue directly when I commit it.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Qemu-devel] Re: [PATCH 0/4] add NUMA emulation
  2009-03-31 13:28 [Qemu-devel] [PATCH 0/4] add NUMA emulation Andre Przywara
  2009-03-31 13:28 ` [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser Andre Przywara
@ 2009-03-31 14:37 ` Anthony Liguori
  1 sibling, 0 replies; 12+ messages in thread
From: Anthony Liguori @ 2009-03-31 14:37 UTC (permalink / raw)
  To: Andre Przywara; +Cc: qemu-devel

Andre Przywara wrote:
> Hi,
>
> the following patches add NUMA emulation to QEMU guests.
> Although the ultimate goal is KVM with host side support, these
> patches are pure QEMU with no host side binding.
> This is a reworked version from end of last year, I adapted the command line
> syntax to Anthony's wishes:
> -numa node[,mem=<size>[MG]][,cpus=<from>[-<to>]][,nodeid=<nr>]
> If we agree to this scheme (which drops mem=from-to and requires at least
> one -numa node for each NUMA node), I will provide more detailed documentation.
> Patch 1/4 adds the -numa command line parameter and sets a QEMU global
> array with the parsed values. If no specific values for memory and CPUs are
> given, all resources will be split equally across all nodes.
> Patch 2/4 adds an "info numa" command to the monitor to output the current
> topology. Since NUMA is advertised via static ACPI tables, no changes are
> possible during runtime.
> Patch 3/4 uses the QEMU firmware configuration interface to send the NUMA
> topology to the BIOS, which has to setup the tables. Only one channel is used.
> Patch 4/4 finally adds the BIOS support, which create the appropriate
> SRAT table reflecting the given topology.
>
> Looking forward to any comments.
>   

Other than a few minor comments, it looks really good.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS
  2009-03-31 13:28     ` [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS Andre Przywara
  2009-03-31 13:28       ` [Qemu-devel] [PATCH 4/4] add BIOS support for an ACPI SRAT table (needed for NUMA support) Andre Przywara
@ 2009-03-31 16:00       ` Blue Swirl
  2009-03-31 21:33         ` Andre Przywara
  1 sibling, 1 reply; 12+ messages in thread
From: Blue Swirl @ 2009-03-31 16:00 UTC (permalink / raw)
  To: qemu-devel

On 3/31/09, Andre Przywara <andre.przywara@amd.com> wrote:
> From: Andre Przywara <aprzywar@amd.com>
>
>
>  Signed-off-by: Andre Przywara <andre.przywara@amd.com>


>   static void bochs_bios_init(void)

>  +    uint64_t *numa_fw_cfg;

>  +                numa_fw_cfg[i + 1] = j;

>  +        numa_fw_cfg[smp_cpus + 1 + i] = node_mem[i];

>  +    fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t*)numa_fw_cfg,
>  +                     (1 + smp_cpus + nb_numa_nodes) * 8);

This would break on a big endian host, please use cpu_to_le64.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 4/4] add SRAT ACPI table support
  2009-03-31 13:44         ` [Qemu-devel] " Anthony Liguori
@ 2009-03-31 20:04           ` Andre Przywara
  0 siblings, 0 replies; 12+ messages in thread
From: Andre Przywara @ 2009-03-31 20:04 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: bochs-developers, qemu-devel

Take NUMA topology info from the QEMU firmware configuration interface
(number of nodes, node for each (V)CPU and amount of memory) and build
a SRAT table describing this topology for the guest OS. Handles more than
4 GB of RAM by including a hole for 32bit PCI memory mapping.
---
 bios/rombios32.c |  175 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 164 insertions(+), 11 deletions(-)

Dear Bochs developers, this patch is part of a series to introduce NUMA
support in QEMU. Since this requires the BIOS to build an ACPI table, this
patch is needed. Please review it if you like. This applies against the QEMU
BOCHS base:
commit 04387139e3b5ac97b5633cd40b3d87cdf45efd6c
Author: sshwarts <sshwarts>
Date:   Mon Feb 9 19:46:34 2009 +0000
    Fixed compilation error + x86-64 correctness fix
plus the eleven QEMU BOCHS patches on top of it.
If you'd like to take it upstream, drop me a note and I will rebase it.

Thanks!
Andre.

diff --git a/bios/rombios32.c b/bios/rombios32.c
index 7be4216..02379c0 100644
--- a/bios/rombios32.c
+++ b/bios/rombios32.c
@@ -451,6 +451,11 @@ int pm_sci_int;
 unsigned long bios_table_cur_addr;
 unsigned long bios_table_end_addr;
 
+static inline uint64_t le64_to_cpu(uint64_t x)
+{
+    return x;
+}
+
 void wrmsr_smp(uint32_t index, uint64_t val)
 {
     static struct { uint32_t ecx, eax, edx; } *p = (void *)SMP_MSR_ADDR;
@@ -469,6 +474,7 @@ void wrmsr_smp(uint32_t index, uint64_t val)
 #define QEMU_CFG_SIGNATURE  0x00
 #define QEMU_CFG_ID         0x01
 #define QEMU_CFG_UUID       0x02
+#define QEMU_CFG_NUMA       0x0D
 #define QEMU_CFG_ARCH_LOCAL     0x8000
 #define QEMU_CFG_ACPI_TABLES  (QEMU_CFG_ARCH_LOCAL + 0)
 
@@ -519,6 +525,14 @@ static int acpi_load_table(int i, uint32_t addr, uint16_t *len)
     qemu_cfg_read((uint8_t*)addr, *len);
     return 0;
 }
+
+uint64_t qemu_cfg_get64 (void)
+{
+    uint64_t ret;
+
+    qemu_cfg_read((uint8_t*)&ret, 8);
+    return le64_to_cpu(ret);
+}
 #endif
 
 void uuid_probe(void)
@@ -1273,7 +1287,7 @@ struct rsdt_descriptor_rev1
 {
 	ACPI_TABLE_HEADER_DEF                           /* ACPI common table header */
 #ifdef BX_QEMU
-	uint32_t                             table_offset_entry [4]; /* Array of pointers to other */
+	uint32_t                             table_offset_entry [5]; /* Array of pointers to other */
 #else
 	uint32_t                             table_offset_entry [3]; /* Array of pointers to other */
 #endif
@@ -1381,7 +1395,7 @@ struct multiple_apic_table
 } __attribute__((__packed__));
 
 
-/* Values for Type in APIC_HEADER_DEF */
+/* Values for Type in APIC sub-headers */
 
 #define APIC_PROCESSOR          0
 #define APIC_IO                 1
@@ -1394,18 +1408,18 @@ struct multiple_apic_table
 #define APIC_XRUPT_SOURCE       8
 #define APIC_RESERVED           9           /* 9 and greater are reserved */
 
-/*
- * MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE)
- */
-#define APIC_HEADER_DEF                     /* Common APIC sub-structure header */\
+#define ACPI_SUB_HEADER_DEF                 /* Common ACPI sub-structure header */\
 	uint8_t                              type; \
 	uint8_t                              length;
 
+/*
+ * MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE)
+ */
 /* Sub-structures for MADT */
 
 struct madt_processor_apic
 {
-	APIC_HEADER_DEF
+	ACPI_SUB_HEADER_DEF
 	uint8_t                              processor_id;           /* ACPI processor id */
 	uint8_t                              local_apic_id;          /* Processor's local APIC id */
 #if 0
@@ -1416,6 +1430,43 @@ struct madt_processor_apic
 #endif
 } __attribute__((__packed__));
 
+/*
+ * SRAT (NUMA topology description) table
+ */
+
+#define SRAT_PROCESSOR          0
+#define SRAT_MEMORY             1
+
+struct system_resource_affinity_table
+{
+    ACPI_TABLE_HEADER_DEF
+    uint32_t    reserved1;
+    uint32_t    reserved2[2];
+};
+
+struct srat_processor_affinity
+{
+    ACPI_SUB_HEADER_DEF
+    uint8_t     proximity_lo;
+    uint8_t     local_apic_id;
+    uint32_t    flags;
+    uint8_t     local_sapic_eid;
+    uint8_t     proximity_hi[3];
+    uint32_t    reserved;
+};
+
+struct srat_memory_affinity
+{
+    ACPI_SUB_HEADER_DEF
+    uint8_t     proximity[4];
+    uint16_t    reserved1;
+    uint32_t    base_addr_low,base_addr_high;
+    uint32_t    length_low,length_high;
+    uint32_t    reserved2;
+    uint32_t    flags;
+    uint32_t    reserved3[2];
+};
+
 #ifdef BX_QEMU
 /*
  *  * ACPI 2.0 Generic Address Space definition.
@@ -1444,7 +1495,7 @@ struct acpi_20_hpet {
 
 struct madt_io_apic
 {
-	APIC_HEADER_DEF
+	ACPI_SUB_HEADER_DEF
 	uint8_t                              io_apic_id;             /* I/O APIC ID */
 	uint8_t                              reserved;               /* Reserved - must be zero */
 	uint32_t                             address;                /* APIC physical address */
@@ -1455,7 +1506,7 @@ struct madt_io_apic
 #ifdef BX_QEMU
 struct madt_int_override
 {
-	APIC_HEADER_DEF
+	ACPI_SUB_HEADER_DEF
 	uint8_t                bus;     /* Identifies ISA Bus */
 	uint8_t                source;  /* Bus-relative interrupt source */
 	uint32_t               gsi;     /* GSI that source will signal */
@@ -1559,6 +1610,21 @@ int acpi_build_processor_ssdt(uint8_t *ssdt)
     return ssdt_ptr - ssdt;
 }
 
+static void acpi_build_srat_memory(struct srat_memory_affinity *numamem,
+    uint64_t base, uint64_t len, int node, int enabled)
+{
+     numamem->type = SRAT_MEMORY;
+     numamem->length = sizeof(*numamem);
+     memset (numamem->proximity, 0 ,4);
+     numamem->proximity[0] = node;
+     numamem->flags = cpu_to_le32(!!enabled);
+     numamem->base_addr_low = base & 0xFFFFFFFF;
+     numamem->base_addr_high = base >> 32;
+     numamem->length_low = len & 0xFFFFFFFF;
+     numamem->length_high = len >> 32;
+     return;
+}
+
 /* base_addr must be a multiple of 4KB */
 void acpi_bios_init(void)
 {
@@ -1569,12 +1635,15 @@ void acpi_bios_init(void)
     struct multiple_apic_table *madt;
     uint8_t *dsdt, *ssdt;
 #ifdef BX_QEMU
+    struct system_resource_affinity_table *srat;
     struct acpi_20_hpet *hpet;
     uint32_t hpet_addr;
 #endif
     uint32_t base_addr, rsdt_addr, fadt_addr, addr, facs_addr, dsdt_addr, ssdt_addr;
     uint32_t acpi_tables_size, madt_addr, madt_size, rsdt_size;
+    uint32_t srat_addr,srat_size;
     uint16_t i, external_tables;
+    int nb_numa_nodes;
 
     /* reserve memory space for tables */
 #ifdef BX_USE_EBDA_TABLES
@@ -1616,6 +1685,25 @@ void acpi_bios_init(void)
     ssdt_addr = addr;
     ssdt = (void *)(addr);
     addr += acpi_build_processor_ssdt(ssdt);
+#ifdef BX_QEMU
+    qemu_cfg_select(QEMU_CFG_NUMA);
+    nb_numa_nodes = qemu_cfg_get64();
+#else
+    nb_numa_nodes = 0;
+#endif
+    if (nb_numa_nodes > 0) {
+        addr = (addr + 7) & ~7;
+        srat_addr = addr;
+        srat_size = sizeof(*srat) +
+            sizeof(struct srat_processor_affinity) * smp_cpus +
+            sizeof(struct srat_memory_affinity) * (nb_numa_nodes + 2);
+        srat = (void *)(addr);
+        addr += srat_size;
+    } else {
+        srat_addr = addr;
+        srat = (void*)(addr);
+        srat_size = 0;
+    }
 
     addr = (addr + 7) & ~7;
     madt_addr = addr;
@@ -1725,6 +1813,69 @@ void acpi_bios_init(void)
 
     memset(rsdt, 0, rsdt_size);
 #ifdef BX_QEMU
+    /* SRAT */
+    if (nb_numa_nodes > 0) {
+        struct srat_processor_affinity *core;
+        struct srat_memory_affinity *numamem;
+        int slots;
+        uint64_t mem_len, mem_base, next_base = 0, curnode;
+
+        qemu_cfg_select(QEMU_CFG_NUMA);
+        qemu_cfg_get64();
+        memset (srat, 0 , srat_size);
+        srat->reserved1=1;
+ 
+        core = (void*)(srat + 1);
+        for (i = 0; i < smp_cpus; ++i) {
+             core->type = SRAT_PROCESSOR;
+             core->length = sizeof(*core);
+             core->local_apic_id = i;
+             curnode = qemu_cfg_get64();
+             core->proximity_lo = curnode;
+             memset (core->proximity_hi, 0, 3);
+             core->local_sapic_eid = 0;
+             if (i < smp_cpus)
+                 core->flags = cpu_to_le32(1);
+             else
+                 core->flags = 0;
+             core++;
+        }
+
+        /* the memory map is a bit tricky, it contains at least one hole
+         * from 640k-1M and possibly another one from 3.5G-4G.
+         */
+        numamem = (void*)core; slots = 0;
+        acpi_build_srat_memory(numamem, 0, 640*1024, 0, 1);
+        next_base = 1024 * 1024; numamem++;slots++;
+        for (i = 1; i < nb_numa_nodes + 1; ++i) {
+            mem_base = next_base;
+            mem_len = qemu_cfg_get64();
+            if (i == 1) mem_len -= 1024 * 1024;
+            next_base = mem_base + mem_len;
+ 
+            /* Cut out the PCI hole */
+            if (mem_base <= ram_size && next_base > ram_size) {
+                mem_len -= next_base - ram_size;
+                if (mem_len > 0) {
+                    acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+                    numamem++; slots++;
+                }
+                mem_base = 1ULL << 32;
+                mem_len = next_base - ram_size;
+                next_base += (1ULL << 32) - ram_size;
+            }
+            acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+            numamem++; slots++;
+        }
+        for (; slots < nb_numa_nodes + 2; slots++) {
+            acpi_build_srat_memory(numamem, 0, 0, 0, 0);
+            numamem++;
+        }
+
+         acpi_build_table_header((struct acpi_table_header *)srat,
+                                "SRAT", srat_size, 1);
+    }
+
     /* HPET */
     memset(hpet, 0, sizeof(*hpet));
     /* Note timer_block_id value must be kept in sync with value advertised by
@@ -1753,9 +1904,11 @@ void acpi_bios_init(void)
     rsdt->table_offset_entry[2] = cpu_to_le32(ssdt_addr);
 #ifdef BX_QEMU
     rsdt->table_offset_entry[3] = cpu_to_le32(hpet_addr);
+    if (nb_numa_nodes > 0)
+        rsdt->table_offset_entry[4] = cpu_to_le32(srat_addr);
 #endif
-    acpi_build_table_header((struct acpi_table_header *)rsdt,
-                            "RSDT", rsdt_size, 1);
+    acpi_build_table_header((struct acpi_table_header *)rsdt, "RSDT",
+        rsdt_size - (nb_numa_nodes > 0? 0: sizeof(uint32_t)), 1);
 
     acpi_tables_size = addr - base_addr;
 
-- 
1.6.1.3

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [Qemu-devel] Re: [PATCH 1/4] added -numa cmdline parameter parser
  2009-03-31 13:42   ` [Qemu-devel] Re: [PATCH 1/4] added -numa cmdline parameter parser Anthony Liguori
@ 2009-03-31 20:34     ` Andre Przywara
  0 siblings, 0 replies; 12+ messages in thread
From: Andre Przywara @ 2009-03-31 20:34 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

Anthony Liguori wrote:
> Andre Przywara wrote:
>> diff --git a/sysemu.h b/sysemu.h
>> index 3eab34b..b83a66c 100644
>> --- a/sysemu.h
>> +++ b/sysemu.h
>> @@ -108,6 +108,11 @@ extern const char *bootp_filename;
> 
>> +extern uint64_t node_cpumask[MAX_NODES];
>>   
> 
> This is going to cause some pain because it won't be long before someone 
> wants to support more than 64 cpus.  I think there are two 
> possibilities.  We could go the cpuset route and introduce a type with 
> special accessors to store a CPU bitmap.
Right, I was thinking about that one, too. I couldn't find an already 
defined type for this, so I went the easy way for the first version of 
the patch to make a review easier. Please note that the interface to the 
BIOS is not limited in any way (besides a max of 2**64 nodes), so I could 
send a patch to overcome this limitation later (I suppose more than 64 
vCPUs break something in other parts of code before that).
> 
> Or, we could rely on the property that each CPU can only be part of one 
> node and make the node association part of the CPUState.  If for some 
> reason it's necessary to enumerate all of the CPUs for a given node, we 
> would have to walk the CPU list to get at that information.  I don't 
> think that'll be a common thing though.
Sounds reasonable, I will take a look at it.
> 
>> +static void numa_add(const char* optarg)
>> +{
>> +char option[128];
>> +char *endptr;
>> +unsigned long long value, endvalue;
>> +int nodenr;
>>   
> 
> That doesn't seem right indent-wise.
I knew I missed something....
> 
>> +        /* assigning the VCPUs round-robin is easier to implement, 
>> guest OSes
>> +         * must cope with this anyway, because there are BIOSes out 
>> there in
>> +         * real machines which also use this scheme.
>> +         */
>> +        if (i == nb_numa_nodes) {
>> +            for (i = 0; i < smp_cpus; i++) {
>> +                node_cpumask[i % nb_numa_nodes] |= 1<<i;
>> +            }
>> +        }
>>   
> 
> The only thing that I don't like about this is that I don't think the 
> current -numa syntax can be used to describe a round-robin allocation.  
> IIUC, you can say -numa cpus=3 or -numa cpus=3-4 but there's no way to 
> say -numa cpus=3:5.
> 
> That means that if we ever change the default behavior, there's no way 
> that a management app could recreate the guest with that particular 
> topology (think live migration).
Good point, I was also not happy with the missing possibility to just 
specify a list of vCPUs (since we already used the comma). If you think 
that the colon could be a valid delimiter here, I can introduce that (like 
-numa cpus=0:4:8). That doesn't look very neat, so shall we use the 
colon to separate the various numa sub-parameters (exchange comma and 
colon)?


Thanks for the review and the comments!
Andre.

-- 
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 488-3567-12
----to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Karl-Hammerschmidt-Str. 34, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Jochen Polster; Thomas M. McCoy; Giuliano Meroni
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS
  2009-03-31 16:00       ` [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS Blue Swirl
@ 2009-03-31 21:33         ` Andre Przywara
  0 siblings, 0 replies; 12+ messages in thread
From: Andre Przywara @ 2009-03-31 21:33 UTC (permalink / raw)
  To: blauwirbel; +Cc: qemu-devel, Andre Przywara

From: Andre Przywara <aprzywar@hagen.osrc.amd.com>

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
 hw/fw_cfg.h |    1 +
 hw/pc.c     |   22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 0 deletions(-)

Blueswirl, thanks for that catch!

Regards,
Andre.

diff --git a/hw/fw_cfg.h b/hw/fw_cfg.h
index 41a3dd0..f616ed2 100644
--- a/hw/fw_cfg.h
+++ b/hw/fw_cfg.h
@@ -14,6 +14,7 @@
 #define FW_CFG_INITRD_ADDR      0x0a
 #define FW_CFG_INITRD_SIZE      0x0b
 #define FW_CFG_BOOT_DEVICE      0x0c
+#define FW_CFG_NUMA             0x0d
 #define FW_CFG_MAX_ENTRY        0x10
 
 #define FW_CFG_WRITE_CHANNEL    0x4000
diff --git a/hw/pc.c b/hw/pc.c
index f9cfd1f..efceae2 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -425,6 +425,8 @@ static void bochs_bios_write(void *opaque, uint32_t addr, uint32_t val)
 static void bochs_bios_init(void)
 {
     void *fw_cfg;
+    uint64_t *numa_fw_cfg;
+    int i, j;
 
     register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
     register_ioport_write(0x401, 1, 2, bochs_bios_write, NULL);
@@ -442,6 +444,26 @@ static void bochs_bios_init(void)
     fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)ram_size);
     fw_cfg_add_bytes(fw_cfg, FW_CFG_ACPI_TABLES, (uint8_t *)acpi_tables,
                      acpi_tables_len);
+
+    /* allocate memory for the NUMA channel: one (64bit) word for the number
+     * of nodes, one word for each VCPU->node and one word for each node to
+     * hold the amount of memory.
+     */
+    numa_fw_cfg = qemu_mallocz ((1 + smp_cpus + nb_numa_nodes) * 8);
+    numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
+    for (i = 0; i < smp_cpus; i++) {
+        for (j = 0; j < nb_numa_nodes; j++) {
+            if (node_cpumask[j] & (1 << i)) {
+                numa_fw_cfg[i + 1] = cpu_to_le64(j);
+                break;
+            }
+        }
+    }
+    for (i = 0; i < nb_numa_nodes; i++) {
+        numa_fw_cfg[smp_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
+    }
+    fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t*)numa_fw_cfg,
+                     (1 + smp_cpus + nb_numa_nodes) * 8);
 }
 
 /* Generate an initial boot sector which sets state and jump to
-- 
1.6.1.3

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2009-03-31 21:34 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-31 13:28 [Qemu-devel] [PATCH 0/4] add NUMA emulation Andre Przywara
2009-03-31 13:28 ` [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser Andre Przywara
2009-03-31 13:28   ` [Qemu-devel] [PATCH 2/4] add info numa command to monitor Andre Przywara
2009-03-31 13:28     ` [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS Andre Przywara
2009-03-31 13:28       ` [Qemu-devel] [PATCH 4/4] add BIOS support for an ACPI SRAT table (needed for NUMA support) Andre Przywara
2009-03-31 13:44         ` [Qemu-devel] " Anthony Liguori
2009-03-31 20:04           ` [Qemu-devel] [PATCH 4/4] add SRAT ACPI table support Andre Przywara
2009-03-31 16:00       ` [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS Blue Swirl
2009-03-31 21:33         ` Andre Przywara
2009-03-31 13:42   ` [Qemu-devel] Re: [PATCH 1/4] added -numa cmdline parameter parser Anthony Liguori
2009-03-31 20:34     ` Andre Przywara
2009-03-31 14:37 ` [Qemu-devel] Re: [PATCH 0/4] add NUMA emulation Anthony Liguori

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.