* [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser
2009-04-08 14:50 [Qemu-devel] [PATCH 0/4] v2: add NUMA emulation Andre Przywara
@ 2009-04-08 14:50 ` Andre Przywara
2009-04-08 14:50 ` [Qemu-devel] [PATCH 2/4] add info numa command to monitor Andre Przywara
` (3 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Andre Przywara @ 2009-04-08 14:50 UTC (permalink / raw)
To: anthony; +Cc: qemu-devel
From: Andre Przywara <aprzywar@hagen.osrc.amd.com>
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
cpu-defs.h | 1 +
exec.c | 1 +
qemu-options.hx | 8 ++++
sysemu.h | 6 ++-
vl.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
5 files changed, 134 insertions(+), 5 deletions(-)
diff --git a/cpu-defs.h b/cpu-defs.h
index b462a9f..7cbf85d 100644
--- a/cpu-defs.h
+++ b/cpu-defs.h
@@ -205,6 +205,7 @@ typedef struct CPUWatchpoint {
\
CPUState *next_cpu; /* next CPU sharing TB cache */ \
int cpu_index; /* CPU index (informative) */ \
+ int numa_node; /* NUMA node this cpu is belonging to */ \
int running; /* Nonzero if cpu is currently running(usermode). */ \
/* user data */ \
void *opaque; \
diff --git a/exec.c b/exec.c
index b41edef..08bf62d 100644
--- a/exec.c
+++ b/exec.c
@@ -544,6 +544,7 @@ void cpu_exec_init(CPUState *env)
cpu_index++;
}
env->cpu_index = cpu_index;
+ env->numa_node = 0;
TAILQ_INIT(&env->breakpoints);
TAILQ_INIT(&env->watchpoints);
*penv = env;
diff --git a/qemu-options.hx b/qemu-options.hx
index f551775..e1319a7 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -47,6 +47,14 @@ CPUs are supported. On Sparc32 target, Linux limits the number of usable CPUs
to 4.
ETEXI
+DEF("numa", HAS_ARG, QEMU_OPTION_numa,
+ "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n")
+STEXI
+@item -numa @var{opts}
+Simulate a multi node NUMA system. If mem and cpus are omitted, resources
+are split equally.
+ETEXI
+
DEF("fda", HAS_ARG, QEMU_OPTION_fda,
"-fda/-fdb file use 'file' as floppy disk 0/1 image\n")
DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "")
diff --git a/sysemu.h b/sysemu.h
index 3eab34b..fbdbe62 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -108,6 +108,10 @@ extern const char *bootp_filename;
extern int kqemu_allowed;
#endif
+#define MAX_NODES 64
+extern int nb_numa_nodes;
+extern uint64_t node_mem[MAX_NODES];
+
#define MAX_OPTION_ROMS 16
extern const char *option_rom[MAX_OPTION_ROMS];
extern int nb_option_roms;
@@ -248,7 +252,7 @@ void do_usb_add(Monitor *mon, const char *devname);
void do_usb_del(Monitor *mon, const char *devname);
void usb_info(Monitor *mon);
-const char *get_opt_name(char *buf, int buf_size, const char *p);
+const char *get_opt_name(char *buf, int buf_size, const char *p, char delim);
const char *get_opt_value(char *buf, int buf_size, const char *p);
int get_param_value(char *buf, int buf_size,
const char *tag, const char *str);
diff --git a/vl.c b/vl.c
index ddbcc6c..6065f6c 100644
--- a/vl.c
+++ b/vl.c
@@ -264,6 +264,10 @@ const char *prom_envs[MAX_PROM_ENVS];
int nb_drives_opt;
struct drive_opt drives_opt[MAX_DRIVES];
+int nb_numa_nodes;
+uint64_t node_mem[MAX_NODES];
+uint64_t node_cpumask[MAX_NODES];
+
static CPUState *cur_cpu;
static CPUState *next_cpu;
static int event_pending = 1;
@@ -1864,12 +1868,12 @@ static int socket_init(void)
}
#endif
-const char *get_opt_name(char *buf, int buf_size, const char *p)
+const char *get_opt_name(char *buf, int buf_size, const char *p, char delim)
{
char *q;
q = buf;
- while (*p != '\0' && *p != '=') {
+ while (*p != '\0' && *p != delim) {
if (q && (q - buf) < buf_size - 1)
*q++ = *p;
p++;
@@ -1909,7 +1913,7 @@ int get_param_value(char *buf, int buf_size,
p = str;
for(;;) {
- p = get_opt_name(option, sizeof(option), p);
+ p = get_opt_name(option, sizeof(option), p, '=');
if (*p != '=')
break;
p++;
@@ -1934,7 +1938,7 @@ int check_params(char *buf, int buf_size,
p = str;
for(;;) {
- p = get_opt_name(buf, buf_size, p);
+ p = get_opt_name(buf, buf_size, p, '=');
if (*p != '=')
return -1;
p++;
@@ -2627,6 +2631,52 @@ int drive_init(struct drive_opt *arg, int snapshot, void *opaque)
return drives_table_idx;
}
+/*
+ * Parse one "-numa" command line argument of the form
+ *     node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]
+ * and record it in node_mem[] / node_cpumask[].
+ *
+ * Arguments whose leading keyword is not "node" are silently ignored.
+ * A "mem" or "cpus" value that is not given is stored as 0, which the
+ * start-up code treats as "distribute automatically".
+ *
+ * An out-of-range nodeid or an invalid CPU range terminates QEMU with an
+ * error message instead of corrupting the node tables.
+ */
+static void numa_add(const char *optarg)
+{
+    char option[128];
+    char *endptr;
+    unsigned long long value, endvalue;
+    int nodenr;
+
+    optarg = get_opt_name(option, 128, optarg, ',');
+    /* Only step over the separator if there is one: for a bare "node"
+     * the returned pointer already sits on the terminating NUL and must
+     * not be advanced past the end of the string. */
+    if (*optarg == ',') {
+        optarg++;
+    }
+    if (strcmp(option, "node") != 0) {
+        return;
+    }
+
+    if (get_param_value(option, 128, "nodeid", optarg) == 0) {
+        /* no explicit id: use the next free slot */
+        nodenr = nb_numa_nodes;
+    } else {
+        value = strtoull(option, NULL, 10);
+        /* nodenr indexes node_mem[] and node_cpumask[] */
+        if (value >= MAX_NODES) {
+            fprintf(stderr, "qemu: invalid NUMA nodeid: %s\n", option);
+            exit(1);
+        }
+        nodenr = value;
+    }
+
+    if (get_param_value(option, 128, "mem", optarg) == 0) {
+        node_mem[nodenr] = 0;
+    } else {
+        value = strtoull(option, &endptr, 0);
+        /* No suffix or 'M' means megabytes, 'G' means gigabytes; any
+         * other suffix leaves the number unscaled. */
+        switch (*endptr) {
+        case 0: case 'M': case 'm':
+            value <<= 20;
+            break;
+        case 'G': case 'g':
+            value <<= 30;
+            break;
+        }
+        node_mem[nodenr] = value;
+    }
+
+    if (get_param_value(option, 128, "cpus", optarg) == 0) {
+        node_cpumask[nodenr] = 0;
+    } else {
+        value = strtoull(option, &endptr, 10);
+        endvalue = value;
+        if (*endptr == '-') {
+            endvalue = strtoull(endptr + 1, &endptr, 10);
+        }
+        /* node_cpumask[] holds one bit per CPU in a uint64_t, so only
+         * CPUs 0..63 can be represented. */
+        if (value > 63 || endvalue > 63 || endvalue < value) {
+            fprintf(stderr, "qemu: invalid NUMA CPU range: %s\n", option);
+            exit(1);
+        }
+        /* Bits value..endvalue set, built with 64-bit shifts whose
+         * counts always stay within 0..63. */
+        node_cpumask[nodenr] = (~0ULL >> (63 - endvalue)) & (~0ULL << value);
+    }
+    nb_numa_nodes++;
+}
+
/***********************************************************/
/* USB devices */
@@ -4282,6 +4332,7 @@ int main(int argc, char **argv, char **envp)
const char *chroot_dir = NULL;
const char *run_as = NULL;
#endif
+ CPUState *env;
qemu_cache_utils_init(envp);
@@ -4345,12 +4396,18 @@ int main(int argc, char **argv, char **envp)
virtio_consoles[i] = NULL;
virtio_console_index = 0;
+ for (i = 0; i < MAX_NODES; i++) {
+ node_mem[i] = 0;
+ node_cpumask[i] = 0;
+ }
+
usb_devices_index = 0;
nb_net_clients = 0;
nb_bt_opts = 0;
nb_drives = 0;
nb_drives_opt = 0;
+ nb_numa_nodes = 0;
hda_index = -1;
nb_nics = 0;
@@ -4500,6 +4557,13 @@ int main(int argc, char **argv, char **envp)
",trans=none" : "");
}
break;
+ case QEMU_OPTION_numa:
+ if (nb_numa_nodes >= MAX_NODES) {
+ fprintf(stderr, "qemu: too many NUMA nodes\n");
+ exit(1);
+ }
+ numa_add(optarg);
+ break;
case QEMU_OPTION_nographic:
nographic = 1;
break;
@@ -5207,6 +5271,48 @@ int main(int argc, char **argv, char **envp)
}
}
+ if (nb_numa_nodes > 0) {
+ int i;
+
+ if (nb_numa_nodes > smp_cpus) {
+ nb_numa_nodes = smp_cpus;
+ }
+
+ /* If no memory size is given for any node, assume the default case
+ * and distribute the available memory equally across all nodes
+ */
+ for (i = 0; i < nb_numa_nodes; i++) {
+ if (node_mem[i] != 0)
+ break;
+ }
+ if (i == nb_numa_nodes) {
+ uint64_t usedmem = 0;
+
+ /* On Linux, each node's border has to be 8MB aligned,
+ * the final node gets the rest.
+ */
+ for (i = 0; i < nb_numa_nodes - 1; i++) {
+ node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1);
+ usedmem += node_mem[i];
+ }
+ node_mem[i] = ram_size - usedmem;
+ }
+
+ for (i = 0; i < nb_numa_nodes; i++) {
+ if (node_cpumask[i] != 0)
+ break;
+ }
+ /* assigning the VCPUs round-robin is easier to implement, guest OSes
+ * must cope with this anyway, because there are BIOSes out there in
+ * real machines which also use this scheme.
+ */
+ if (i == nb_numa_nodes) {
+ for (i = 0; i < smp_cpus; i++) {
+ node_cpumask[i % nb_numa_nodes] |= 1 << i;
+ }
+ }
+ }
+
if (kvm_enabled()) {
int ret;
@@ -5270,6 +5376,15 @@ int main(int argc, char **argv, char **envp)
machine->init(ram_size, vga_ram_size, boot_devices,
kernel_filename, kernel_cmdline, initrd_filename, cpu_model);
+
+ for (env = first_cpu; env != NULL; env = env->next_cpu) {
+ for (i = 0; i < nb_numa_nodes; i++) {
+ if (node_cpumask[i] & (1 << env->cpu_index)) {
+ env->numa_node = i;
+ }
+ }
+ }
+
current_machine = machine;
/* Set KVM's vcpu state to qemu's initial CPUState. */
--
1.6.1.3
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [Qemu-devel] [PATCH 2/4] add info numa command to monitor
2009-04-08 14:50 [Qemu-devel] [PATCH 0/4] v2: add NUMA emulation Andre Przywara
2009-04-08 14:50 ` [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser Andre Przywara
@ 2009-04-08 14:50 ` Andre Przywara
2009-04-08 14:50 ` [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS Andre Przywara
` (2 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Andre Przywara @ 2009-04-08 14:50 UTC (permalink / raw)
To: anthony; +Cc: qemu-devel
From: Andre Przywara <aprzywar@hagen.osrc.amd.com>
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
monitor.c | 21 +++++++++++++++++++++
1 files changed, 21 insertions(+), 0 deletions(-)
diff --git a/monitor.c b/monitor.c
index e764b5d..ed54a62 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1409,6 +1409,25 @@ static void do_info_kvm(Monitor *mon)
#endif
}
+/*
+ * Monitor command "info numa": print the number of NUMA nodes and, for
+ * each node, the indexes of the CPUs whose numa_node field matches it
+ * and the node's memory size in megabytes.
+ */
+static void do_info_numa(Monitor *mon)
+{
+    int i;
+    CPUState *env;
+
+    monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
+    for (i = 0; i < nb_numa_nodes; i++) {
+        monitor_printf(mon, "node %d cpus:", i);
+        for (env = first_cpu; env != NULL; env = env->next_cpu) {
+            if (env->numa_node == i) {
+                monitor_printf(mon, " %d", env->cpu_index);
+            }
+        }
+        monitor_printf(mon, "\n");
+        /* node_mem[] is uint64_t, so use the unsigned format macro */
+        monitor_printf(mon, "node %d size: %" PRIu64 " MB\n", i,
+                       node_mem[i] >> 20);
+    }
+}
+
#ifdef CONFIG_PROFILER
int64_t kqemu_time;
@@ -1788,6 +1807,8 @@ static const mon_cmd_t info_cmds[] = {
"", "show KQEMU information", },
{ "kvm", "", do_info_kvm,
"", "show KVM information", },
+ { "numa", "", do_info_numa,
+ "", "show NUMA information", },
{ "usb", "", usb_info,
"", "show guest USB devices", },
{ "usbhost", "", usb_host_info,
--
1.6.1.3
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS
2009-04-08 14:50 [Qemu-devel] [PATCH 0/4] v2: add NUMA emulation Andre Przywara
2009-04-08 14:50 ` [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser Andre Przywara
2009-04-08 14:50 ` [Qemu-devel] [PATCH 2/4] add info numa command to monitor Andre Przywara
@ 2009-04-08 14:50 ` Andre Przywara
2009-04-08 14:50 ` [Qemu-devel] [PATCH 4/4] add SRAT ACPI table support Andre Przywara
2009-04-17 14:34 ` [Qemu-devel] Re: [PATCH 0/4] v2: add NUMA emulation Anthony Liguori
4 siblings, 0 replies; 10+ messages in thread
From: Andre Przywara @ 2009-04-08 14:50 UTC (permalink / raw)
To: anthony; +Cc: qemu-devel
From: Andre Przywara <aprzywar@hagen.osrc.amd.com>
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
hw/fw_cfg.h | 1 +
hw/pc.c | 24 ++++++++++++++++++++++++
2 files changed, 25 insertions(+), 0 deletions(-)
diff --git a/hw/fw_cfg.h b/hw/fw_cfg.h
index 41a3dd0..f616ed2 100644
--- a/hw/fw_cfg.h
+++ b/hw/fw_cfg.h
@@ -14,6 +14,7 @@
#define FW_CFG_INITRD_ADDR 0x0a
#define FW_CFG_INITRD_SIZE 0x0b
#define FW_CFG_BOOT_DEVICE 0x0c
+#define FW_CFG_NUMA 0x0d
#define FW_CFG_MAX_ENTRY 0x10
#define FW_CFG_WRITE_CHANNEL 0x4000
diff --git a/hw/pc.c b/hw/pc.c
index f9cfd1f..f4585df 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -422,9 +422,13 @@ static void bochs_bios_write(void *opaque, uint32_t addr, uint32_t val)
}
}
+extern uint64_t node_cpumask[MAX_NODES];
+
static void bochs_bios_init(void)
{
void *fw_cfg;
+ uint64_t *numa_fw_cfg;
+ int i, j;
register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
register_ioport_write(0x401, 1, 2, bochs_bios_write, NULL);
@@ -442,6 +446,26 @@ static void bochs_bios_init(void)
fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)ram_size);
fw_cfg_add_bytes(fw_cfg, FW_CFG_ACPI_TABLES, (uint8_t *)acpi_tables,
acpi_tables_len);
+
+ /* allocate memory for the NUMA channel: one (64bit) word for the number
+ * of nodes, one word for each VCPU->node and one word for each node to
+ * hold the amount of memory.
+ */
+ numa_fw_cfg = qemu_mallocz((1 + smp_cpus + nb_numa_nodes) * 8);
+ numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
+ for (i = 0; i < smp_cpus; i++) {
+ for (j = 0; j < nb_numa_nodes; j++) {
+ if (node_cpumask[j] & (1 << i)) {
+ numa_fw_cfg[i + 1] = cpu_to_le64(j);
+ break;
+ }
+ }
+ }
+ for (i = 0; i < nb_numa_nodes; i++) {
+ numa_fw_cfg[smp_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
+ }
+ fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
+ (1 + smp_cpus + nb_numa_nodes) * 8);
}
/* Generate an initial boot sector which sets state and jump to
--
1.6.1.3
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [Qemu-devel] [PATCH 4/4] add SRAT ACPI table support
2009-04-08 14:50 [Qemu-devel] [PATCH 0/4] v2: add NUMA emulation Andre Przywara
` (2 preceding siblings ...)
2009-04-08 14:50 ` [Qemu-devel] [PATCH 3/4] sending NUMA topology to BIOS Andre Przywara
@ 2009-04-08 14:50 ` Andre Przywara
2009-04-17 14:33 ` [Qemu-devel] " Anthony Liguori
2009-04-17 14:34 ` [Qemu-devel] Re: [PATCH 0/4] v2: add NUMA emulation Anthony Liguori
4 siblings, 1 reply; 10+ messages in thread
From: Andre Przywara @ 2009-04-08 14:50 UTC (permalink / raw)
To: anthony; +Cc: qemu-devel
Take NUMA topology info from the QEMU firmware configuration interface
(number of nodes, node for each (V)CPU and amount of memory) and build
a SRAT table describing this topology for the guest OS. Handles more than
4 GB of RAM by including a hole for 32bit PCI memory mapping.
---
bios/rombios32.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 164 insertions(+), 11 deletions(-)
Dear Bochs developers, this patch is part of a series to introduce NUMA
support in QEMU. Since this requires the BIOS to build an ACPI table, this
patch is needed. Please review it if you like. This applies against the QEMU
BOCHS base:
commit 04387139e3b5ac97b5633cd40b3d87cdf45efd6c
Author: sshwarts <sshwarts>
Date: Mon Feb 9 19:46:34 2009 +0000
Fixed compilation error + x86-64 correctness fix
plus the eleven QEMU BOCHS patches on top of it.
If you'd like to take it upstream, drop me a note and I will rebase it.
Thanks!
Andre.
diff --git a/bios/rombios32.c b/bios/rombios32.c
index 7be4216..02379c0 100644
--- a/bios/rombios32.c
+++ b/bios/rombios32.c
@@ -451,6 +451,11 @@ int pm_sci_int;
unsigned long bios_table_cur_addr;
unsigned long bios_table_end_addr;
+/* Convert a little-endian 64-bit value to CPU byte order. The value is
+ * passed through unchanged (presumably because this BIOS code only runs
+ * on little-endian x86 -- confirm before reusing elsewhere). */
+static inline uint64_t le64_to_cpu(uint64_t x)
+{
+    const uint64_t host_order = x;
+
+    return host_order;
+}
+
void wrmsr_smp(uint32_t index, uint64_t val)
{
static struct { uint32_t ecx, eax, edx; } *p = (void *)SMP_MSR_ADDR;
@@ -469,6 +474,7 @@ void wrmsr_smp(uint32_t index, uint64_t val)
#define QEMU_CFG_SIGNATURE 0x00
#define QEMU_CFG_ID 0x01
#define QEMU_CFG_UUID 0x02
+#define QEMU_CFG_NUMA 0x0D
#define QEMU_CFG_ARCH_LOCAL 0x8000
#define QEMU_CFG_ACPI_TABLES (QEMU_CFG_ARCH_LOCAL + 0)
@@ -519,6 +525,14 @@ static int acpi_load_table(int i, uint32_t addr, uint16_t *len)
qemu_cfg_read((uint8_t*)addr, *len);
return 0;
}
+
+/* Read the next eight bytes from the currently selected firmware
+ * configuration entry and return them as a 64-bit value converted from
+ * little-endian with le64_to_cpu(). */
+uint64_t qemu_cfg_get64 (void)
+{
+    uint64_t raw;
+
+    qemu_cfg_read((uint8_t*)&raw, sizeof(raw));
+
+    return le64_to_cpu(raw);
+}
#endif
void uuid_probe(void)
@@ -1273,7 +1287,7 @@ struct rsdt_descriptor_rev1
{
ACPI_TABLE_HEADER_DEF /* ACPI common table header */
#ifdef BX_QEMU
- uint32_t table_offset_entry [4]; /* Array of pointers to other */
+ uint32_t table_offset_entry [5]; /* Array of pointers to other */
#else
uint32_t table_offset_entry [3]; /* Array of pointers to other */
#endif
@@ -1381,7 +1395,7 @@ struct multiple_apic_table
} __attribute__((__packed__));
-/* Values for Type in APIC_HEADER_DEF */
+/* Values for Type in APIC sub-headers */
#define APIC_PROCESSOR 0
#define APIC_IO 1
@@ -1394,18 +1408,18 @@ struct multiple_apic_table
#define APIC_XRUPT_SOURCE 8
#define APIC_RESERVED 9 /* 9 and greater are reserved */
-/*
- * MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE)
- */
-#define APIC_HEADER_DEF /* Common APIC sub-structure header */\
+#define ACPI_SUB_HEADER_DEF /* Common ACPI sub-structure header */\
uint8_t type; \
uint8_t length;
+/*
+ * MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE)
+ */
/* Sub-structures for MADT */
struct madt_processor_apic
{
- APIC_HEADER_DEF
+ ACPI_SUB_HEADER_DEF
uint8_t processor_id; /* ACPI processor id */
uint8_t local_apic_id; /* Processor's local APIC id */
#if 0
@@ -1416,6 +1430,43 @@ struct madt_processor_apic
#endif
} __attribute__((__packed__));
+/*
+ * SRAT (NUMA topology description) table
+ */
+
+#define SRAT_PROCESSOR 0
+#define SRAT_MEMORY 1
+
+/* Fixed-size head of the SRAT: the common ACPI table header followed by
+ * three reserved 32-bit words. acpi_bios_init() zeroes the whole table,
+ * stores 1 in reserved1 and places the processor and memory affinity
+ * sub-structures directly behind this struct. */
+struct system_resource_affinity_table
+{
+ ACPI_TABLE_HEADER_DEF
+ uint32_t reserved1;
+ uint32_t reserved2[2];
+};
+
+/* SRAT sub-structure of type SRAT_PROCESSOR; acpi_bios_init() emits one
+ * per CPU. */
+struct srat_processor_affinity
+{
+ ACPI_SUB_HEADER_DEF
+ /* node number of this CPU as read from the firmware config channel */
+ uint8_t proximity_lo;
+ /* set to the CPU's loop index by the builder */
+ uint8_t local_apic_id;
+ /* builder stores 1 for every emitted CPU (presumably the "enabled"
+ * bit -- confirm against the ACPI SRAT definition) */
+ uint32_t flags;
+ uint8_t local_sapic_eid;
+ /* cleared by the builder; presumably the upper bytes of the proximity
+ * domain */
+ uint8_t proximity_hi[3];
+ uint32_t reserved;
+};
+
+/* SRAT sub-structure of type SRAT_MEMORY describing one memory range;
+ * filled in by acpi_build_srat_memory(). */
+struct srat_memory_affinity
+{
+ ACPI_SUB_HEADER_DEF
+ /* byte 0 receives the node number, bytes 1..3 are cleared */
+ uint8_t proximity[4];
+ uint16_t reserved1;
+ /* 64-bit base address and length, each split into 32-bit halves */
+ uint32_t base_addr_low,base_addr_high;
+ uint32_t length_low,length_high;
+ uint32_t reserved2;
+ /* 1 if the range is enabled, 0 otherwise */
+ uint32_t flags;
+ uint32_t reserved3[2];
+};
+
#ifdef BX_QEMU
/*
* * ACPI 2.0 Generic Address Space definition.
@@ -1444,7 +1495,7 @@ struct acpi_20_hpet {
struct madt_io_apic
{
- APIC_HEADER_DEF
+ ACPI_SUB_HEADER_DEF
uint8_t io_apic_id; /* I/O APIC ID */
uint8_t reserved; /* Reserved - must be zero */
uint32_t address; /* APIC physical address */
@@ -1455,7 +1506,7 @@ struct madt_io_apic
#ifdef BX_QEMU
struct madt_int_override
{
- APIC_HEADER_DEF
+ ACPI_SUB_HEADER_DEF
uint8_t bus; /* Identifies ISA Bus */
uint8_t source; /* Bus-relative interrupt source */
uint32_t gsi; /* GSI that source will signal */
@@ -1559,6 +1610,21 @@ int acpi_build_processor_ssdt(uint8_t *ssdt)
return ssdt_ptr - ssdt;
}
+/*
+ * Fill in one SRAT memory affinity entry.
+ *
+ * numamem: entry to write
+ * base:    start address of the range
+ * len:     size of the range in bytes
+ * node:    node number, stored in the first proximity byte
+ * enabled: non-zero marks the entry as enabled (flags = 1), zero as
+ *          disabled (flags = 0)
+ *
+ * Only the fields assigned below are written; the reserved fields are
+ * left as the caller provided them.
+ */
+static void acpi_build_srat_memory(struct srat_memory_affinity *numamem,
+                                   uint64_t base, uint64_t len, int node, int enabled)
+{
+    const uint32_t enable_bit = enabled ? 1 : 0;
+
+    /* sub-structure header */
+    numamem->type = SRAT_MEMORY;
+    numamem->length = sizeof(*numamem);
+
+    /* node number in byte 0, remaining proximity bytes cleared */
+    numamem->proximity[0] = node;
+    numamem->proximity[1] = 0;
+    numamem->proximity[2] = 0;
+    numamem->proximity[3] = 0;
+
+    /* 64-bit base and length split into their 32-bit halves */
+    numamem->base_addr_low = (uint32_t)base;
+    numamem->base_addr_high = (uint32_t)(base >> 32);
+    numamem->length_low = (uint32_t)len;
+    numamem->length_high = (uint32_t)(len >> 32);
+
+    numamem->flags = cpu_to_le32(enable_bit);
+}
+
/* base_addr must be a multiple of 4KB */
void acpi_bios_init(void)
{
@@ -1569,12 +1635,15 @@ void acpi_bios_init(void)
struct multiple_apic_table *madt;
uint8_t *dsdt, *ssdt;
#ifdef BX_QEMU
+ struct system_resource_affinity_table *srat;
struct acpi_20_hpet *hpet;
uint32_t hpet_addr;
#endif
uint32_t base_addr, rsdt_addr, fadt_addr, addr, facs_addr, dsdt_addr, ssdt_addr;
uint32_t acpi_tables_size, madt_addr, madt_size, rsdt_size;
+ uint32_t srat_addr,srat_size;
uint16_t i, external_tables;
+ int nb_numa_nodes;
/* reserve memory space for tables */
#ifdef BX_USE_EBDA_TABLES
@@ -1616,6 +1685,25 @@ void acpi_bios_init(void)
ssdt_addr = addr;
ssdt = (void *)(addr);
addr += acpi_build_processor_ssdt(ssdt);
+#ifdef BX_QEMU
+ qemu_cfg_select(QEMU_CFG_NUMA);
+ nb_numa_nodes = qemu_cfg_get64();
+#else
+ nb_numa_nodes = 0;
+#endif
+ if (nb_numa_nodes > 0) {
+ addr = (addr + 7) & ~7;
+ srat_addr = addr;
+ srat_size = sizeof(*srat) +
+ sizeof(struct srat_processor_affinity) * smp_cpus +
+ sizeof(struct srat_memory_affinity) * (nb_numa_nodes + 2);
+ srat = (void *)(addr);
+ addr += srat_size;
+ } else {
+ srat_addr = addr;
+ srat = (void*)(addr);
+ srat_size = 0;
+ }
addr = (addr + 7) & ~7;
madt_addr = addr;
@@ -1725,6 +1813,69 @@ void acpi_bios_init(void)
memset(rsdt, 0, rsdt_size);
#ifdef BX_QEMU
+ /* SRAT */
+ if (nb_numa_nodes > 0) {
+ struct srat_processor_affinity *core;
+ struct srat_memory_affinity *numamem;
+ int slots;
+ uint64_t mem_len, mem_base, next_base = 0, curnode;
+
+ qemu_cfg_select(QEMU_CFG_NUMA);
+ qemu_cfg_get64();
+ memset (srat, 0 , srat_size);
+ srat->reserved1=1;
+
+ core = (void*)(srat + 1);
+ for (i = 0; i < smp_cpus; ++i) {
+ core->type = SRAT_PROCESSOR;
+ core->length = sizeof(*core);
+ core->local_apic_id = i;
+ curnode = qemu_cfg_get64();
+ core->proximity_lo = curnode;
+ memset (core->proximity_hi, 0, 3);
+ core->local_sapic_eid = 0;
+ if (i < smp_cpus)
+ core->flags = cpu_to_le32(1);
+ else
+ core->flags = 0;
+ core++;
+ }
+
+ /* the memory map is a bit tricky, it contains at least one hole
+ * from 640k-1M and possibly another one from 3.5G-4G.
+ */
+ numamem = (void*)core; slots = 0;
+ acpi_build_srat_memory(numamem, 0, 640*1024, 0, 1);
+ next_base = 1024 * 1024; numamem++;slots++;
+ for (i = 1; i < nb_numa_nodes + 1; ++i) {
+ mem_base = next_base;
+ mem_len = qemu_cfg_get64();
+ if (i == 1) mem_len -= 1024 * 1024;
+ next_base = mem_base + mem_len;
+
+ /* Cut out the PCI hole */
+ if (mem_base <= ram_size && next_base > ram_size) {
+ mem_len -= next_base - ram_size;
+ if (mem_len > 0) {
+ acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+ numamem++; slots++;
+ }
+ mem_base = 1ULL << 32;
+ mem_len = next_base - ram_size;
+ next_base += (1ULL << 32) - ram_size;
+ }
+ acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+ numamem++; slots++;
+ }
+ for (; slots < nb_numa_nodes + 2; slots++) {
+ acpi_build_srat_memory(numamem, 0, 0, 0, 0);
+ numamem++;
+ }
+
+ acpi_build_table_header((struct acpi_table_header *)srat,
+ "SRAT", srat_size, 1);
+ }
+
/* HPET */
memset(hpet, 0, sizeof(*hpet));
/* Note timer_block_id value must be kept in sync with value advertised by
@@ -1753,9 +1904,11 @@ void acpi_bios_init(void)
rsdt->table_offset_entry[2] = cpu_to_le32(ssdt_addr);
#ifdef BX_QEMU
rsdt->table_offset_entry[3] = cpu_to_le32(hpet_addr);
+ if (nb_numa_nodes > 0)
+ rsdt->table_offset_entry[4] = cpu_to_le32(srat_addr);
#endif
- acpi_build_table_header((struct acpi_table_header *)rsdt,
- "RSDT", rsdt_size, 1);
+ acpi_build_table_header((struct acpi_table_header *)rsdt, "RSDT",
+ rsdt_size - (nb_numa_nodes > 0? 0: sizeof(uint32_t)), 1);
acpi_tables_size = addr - base_addr;
--
1.6.1.3
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [Qemu-devel] Re: [PATCH 4/4] add SRAT ACPI table support
2009-04-08 14:50 ` [Qemu-devel] [PATCH 4/4] add SRAT ACPI table support Andre Przywara
@ 2009-04-17 14:33 ` Anthony Liguori
0 siblings, 0 replies; 10+ messages in thread
From: Anthony Liguori @ 2009-04-17 14:33 UTC (permalink / raw)
To: Andre Przywara; +Cc: qemu-devel
Andre Przywara wrote:
> Take NUMA topology info from the QEMU firmware configuration interface
> (number of nodes, node for each (V)CPU and amount of memory) and build
> a SRAT table describing this topology for the guest OS. Handles more than
> 4 GB of RAM by including a hole for 32bit PCI memory mapping.
> ---
> bios/rombios32.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++----
> 1 files changed, 164 insertions(+), 11 deletions(-)
>
> Dear Bochs developers, this patch is part of a series to introduce NUMA
> support in QEMU. Since this requires the BIOS to build an ACPI table, this
> patch is needed. Please review it if you like. This applies against the QEMU
> BOCH base:
> commit 04387139e3b5ac97b5633cd40b3d87cdf45efd6c
> Author: sshwarts <sshwarts>
> Date: Mon Feb 9 19:46:34 2009 +0000
> Fixed compilation error + x86-64 correctness fix
> plus the eleven QEMU BOCHS patches on top of it.
> If you'd like to take it upstream, drop me a note and I will rebase it.
>
> Thanks!
> Andre.
>
This needs a SoB.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Qemu-devel] Re: [PATCH 0/4] v2: add NUMA emulation
2009-04-08 14:50 [Qemu-devel] [PATCH 0/4] v2: add NUMA emulation Andre Przywara
` (3 preceding siblings ...)
2009-04-08 14:50 ` [Qemu-devel] [PATCH 4/4] add SRAT ACPI table support Andre Przywara
@ 2009-04-17 14:34 ` Anthony Liguori
2009-04-21 10:26 ` Andre Przywara
4 siblings, 1 reply; 10+ messages in thread
From: Anthony Liguori @ 2009-04-17 14:34 UTC (permalink / raw)
To: Andre Przywara; +Cc: qemu-devel
Hi Andre,
I don't understand why it's so difficult to eliminate the 64 CPU limit.
I'm willing to take these patches with the limit but I'd like to see the
following:
The -numa stuff needs to be more defensive when -smp > 64.
Specifically, it needs to explicitly check and warn the user if a NUMA
node contains a CPU > 64.
Each patch needs some description of what it's doing. All patches need
a SoB.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Qemu-devel] Re: [PATCH 0/4] v2: add NUMA emulation
2009-04-17 14:34 ` [Qemu-devel] Re: [PATCH 0/4] v2: add NUMA emulation Anthony Liguori
@ 2009-04-21 10:26 ` Andre Przywara
0 siblings, 0 replies; 10+ messages in thread
From: Andre Przywara @ 2009-04-21 10:26 UTC (permalink / raw)
To: Anthony Liguori; +Cc: qemu-devel
Anthony Liguori wrote:
> Hi Andre,
>
> I don't understand why it's so difficult to eliminate the 64 CPU limit.
Because bitmap operations are not trivial to get right (look at
Linux's implementation). I will send a patch later (introducing generic
bitmap operations), but I didn't want to merge this with the NUMA patchset.
> I'm willing to take these patches with the limit but I'd like to see the
> following:
>
> The -numa stuff needs to be more defensive when -smp > 64.
> Specificially, it needs to explicitly check and warn the user if a NUMA
> node contains a CPU > 64.
OK, done.
>
> Each patch needs some description of what it's doing. All patches need
> a SoB.
Agreed. But git should have already added a SoB line for every patch, no?
Regards,
Andre.
--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 448 3567 12
----to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Karl-Hammerschmidt-Str. 34, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Jochen Polster; Thomas M. McCoy; Giuliano Meroni
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser
2009-04-21 11:02 [Qemu-devel] [PATCH 0/4] v3: " Andre Przywara
@ 2009-04-21 11:02 ` Andre Przywara
0 siblings, 0 replies; 10+ messages in thread
From: Andre Przywara @ 2009-04-21 11:02 UTC (permalink / raw)
To: aliguori; +Cc: Andre Przywara, qemu-devel
Adds a -numa command line parameter and sets a QEMU global array with
the memory sizes. The CPU-to-node assignment is written into the
CPUState. If no specific values for memory and CPUs are given,
all resources will be split equally across all nodes.
This code currently supports only up to 64 virtual CPUs.
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
cpu-defs.h | 1 +
exec.c | 1 +
qemu-options.hx | 8 +++
sysemu.h | 6 ++-
vl.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
5 files changed, 144 insertions(+), 5 deletions(-)
diff --git a/cpu-defs.h b/cpu-defs.h
index b462a9f..7cbf85d 100644
--- a/cpu-defs.h
+++ b/cpu-defs.h
@@ -205,6 +205,7 @@ typedef struct CPUWatchpoint {
\
CPUState *next_cpu; /* next CPU sharing TB cache */ \
int cpu_index; /* CPU index (informative) */ \
+ int numa_node; /* NUMA node this cpu is belonging to */ \
int running; /* Nonzero if cpu is currently running(usermode). */ \
/* user data */ \
void *opaque; \
diff --git a/exec.c b/exec.c
index fc7e08c..8245ac0 100644
--- a/exec.c
+++ b/exec.c
@@ -554,6 +554,7 @@ void cpu_exec_init(CPUState *env)
cpu_index++;
}
env->cpu_index = cpu_index;
+ env->numa_node = 0;
TAILQ_INIT(&env->breakpoints);
TAILQ_INIT(&env->watchpoints);
*penv = env;
diff --git a/qemu-options.hx b/qemu-options.hx
index 1d783e5..1e39f25 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -47,6 +47,14 @@ CPUs are supported. On Sparc32 target, Linux limits the number of usable CPUs
to 4.
ETEXI
+DEF("numa", HAS_ARG, QEMU_OPTION_numa,
+ "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n")
+STEXI
+@item -numa @var{opts}
+Simulate a multi node NUMA system. If mem and cpus are omitted, resources
+are split equally.
+ETEXI
+
DEF("fda", HAS_ARG, QEMU_OPTION_fda,
"-fda/-fdb file use 'file' as floppy disk 0/1 image\n")
DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "")
diff --git a/sysemu.h b/sysemu.h
index 24b4bd1..cbfbb8e 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -108,6 +108,10 @@ extern int old_param;
extern int kqemu_allowed;
#endif
+#define MAX_NODES 64
+extern int nb_numa_nodes;
+extern uint64_t node_mem[MAX_NODES];
+
#define MAX_OPTION_ROMS 16
extern const char *option_rom[MAX_OPTION_ROMS];
extern int nb_option_roms;
@@ -248,7 +252,7 @@ void do_usb_add(Monitor *mon, const char *devname);
void do_usb_del(Monitor *mon, const char *devname);
void usb_info(Monitor *mon);
-const char *get_opt_name(char *buf, int buf_size, const char *p);
+const char *get_opt_name(char *buf, int buf_size, const char *p, char delim);
const char *get_opt_value(char *buf, int buf_size, const char *p);
int get_param_value(char *buf, int buf_size,
const char *tag, const char *str);
diff --git a/vl.c b/vl.c
index 55a9bc5..a6d6801 100644
--- a/vl.c
+++ b/vl.c
@@ -265,6 +265,10 @@ const char *prom_envs[MAX_PROM_ENVS];
int nb_drives_opt;
struct drive_opt drives_opt[MAX_DRIVES];
+int nb_numa_nodes;
+uint64_t node_mem[MAX_NODES];
+uint64_t node_cpumask[MAX_NODES];
+
static CPUState *cur_cpu;
static CPUState *next_cpu;
static int event_pending = 1;
@@ -1865,12 +1869,12 @@ static int socket_init(void)
}
#endif
-const char *get_opt_name(char *buf, int buf_size, const char *p)
+const char *get_opt_name(char *buf, int buf_size, const char *p, char delim)
{
char *q;
q = buf;
- while (*p != '\0' && *p != '=') {
+ while (*p != '\0' && *p != delim) {
if (q && (q - buf) < buf_size - 1)
*q++ = *p;
p++;
@@ -1910,7 +1914,7 @@ int get_param_value(char *buf, int buf_size,
p = str;
for(;;) {
- p = get_opt_name(option, sizeof(option), p);
+ p = get_opt_name(option, sizeof(option), p, '=');
if (*p != '=')
break;
p++;
@@ -1935,7 +1939,7 @@ int check_params(char *buf, int buf_size,
p = str;
for(;;) {
- p = get_opt_name(buf, buf_size, p);
+ p = get_opt_name(buf, buf_size, p, '=');
if (*p != '=')
return -1;
p++;
@@ -2628,6 +2632,62 @@ int drive_init(struct drive_opt *arg, int snapshot, void *opaque)
return drives_table_idx;
}
+/*
+ * Parse one "-numa" command line argument of the form
+ *     node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]
+ * and record it in node_mem[] / node_cpumask[].
+ *
+ * Arguments whose leading keyword is not "node" are silently ignored.
+ * A "mem" or "cpus" value that is not given is stored as 0, which the
+ * start-up code treats as "distribute automatically".
+ *
+ * CPU numbers above 63 cannot be represented in the 64-bit mask; they
+ * are clamped to 63 and a warning is printed. An out-of-range nodeid or
+ * a reversed CPU range terminates QEMU with an error message.
+ */
+static void numa_add(const char *optarg)
+{
+    char option[128];
+    char *endptr;
+    unsigned long long value, endvalue;
+    int nodenr;
+
+    optarg = get_opt_name(option, 128, optarg, ',');
+    /* Only step over the separator if there is one: for a bare "node"
+     * the returned pointer already sits on the terminating NUL and must
+     * not be advanced past the end of the string. */
+    if (*optarg == ',') {
+        optarg++;
+    }
+    if (strcmp(option, "node") != 0) {
+        return;
+    }
+
+    if (get_param_value(option, 128, "nodeid", optarg) == 0) {
+        /* no explicit id: use the next free slot */
+        nodenr = nb_numa_nodes;
+    } else {
+        value = strtoull(option, NULL, 10);
+        /* nodenr indexes node_mem[] and node_cpumask[] */
+        if (value >= MAX_NODES) {
+            fprintf(stderr, "qemu: invalid NUMA nodeid: %s\n", option);
+            exit(1);
+        }
+        nodenr = value;
+    }
+
+    if (get_param_value(option, 128, "mem", optarg) == 0) {
+        node_mem[nodenr] = 0;
+    } else {
+        value = strtoull(option, &endptr, 0);
+        /* No suffix or 'M' means megabytes, 'G' means gigabytes; any
+         * other suffix leaves the number unscaled. */
+        switch (*endptr) {
+        case 0: case 'M': case 'm':
+            value <<= 20;
+            break;
+        case 'G': case 'g':
+            value <<= 30;
+            break;
+        }
+        node_mem[nodenr] = value;
+    }
+
+    if (get_param_value(option, 128, "cpus", optarg) == 0) {
+        node_cpumask[nodenr] = 0;
+    } else {
+        value = strtoull(option, &endptr, 10);
+        endvalue = value;
+        if (*endptr == '-') {
+            endvalue = strtoull(endptr + 1, &endptr, 10);
+        }
+        if (endvalue < value) {
+            fprintf(stderr, "qemu: invalid NUMA CPU range: %s\n", option);
+            exit(1);
+        }
+        /* node_cpumask[] holds one bit per CPU in a uint64_t: warn and
+         * clamp anything beyond CPU 63 rather than shifting out of
+         * range. */
+        if (endvalue > 63) {
+            fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
+            endvalue = 63;
+            if (value > 63) {
+                value = 63;
+            }
+        }
+        /* Bits value..endvalue set, built with 64-bit shifts whose
+         * counts always stay within 0..63. */
+        node_cpumask[nodenr] = (~0ULL >> (63 - endvalue)) & (~0ULL << value);
+    }
+    nb_numa_nodes++;
+}
+
/***********************************************************/
/* USB devices */
@@ -4290,6 +4350,7 @@ int main(int argc, char **argv, char **envp)
const char *chroot_dir = NULL;
const char *run_as = NULL;
#endif
+ CPUState *env;
qemu_cache_utils_init(envp);
@@ -4353,12 +4414,18 @@ int main(int argc, char **argv, char **envp)
virtio_consoles[i] = NULL;
virtio_console_index = 0;
+ for (i = 0; i < MAX_NODES; i++) {
+ node_mem[i] = 0;
+ node_cpumask[i] = 0;
+ }
+
usb_devices_index = 0;
nb_net_clients = 0;
nb_bt_opts = 0;
nb_drives = 0;
nb_drives_opt = 0;
+ nb_numa_nodes = 0;
hda_index = -1;
nb_nics = 0;
@@ -4508,6 +4575,13 @@ int main(int argc, char **argv, char **envp)
",trans=none" : "");
}
break;
+ case QEMU_OPTION_numa:
+ if (nb_numa_nodes >= MAX_NODES) {
+ fprintf(stderr, "qemu: too many NUMA nodes\n");
+ exit(1);
+ }
+ numa_add(optarg);
+ break;
case QEMU_OPTION_nographic:
nographic = 1;
break;
@@ -5211,6 +5285,48 @@ int main(int argc, char **argv, char **envp)
}
}
+ if (nb_numa_nodes > 0) {
+ int i;
+
+ if (nb_numa_nodes > smp_cpus) {
+ nb_numa_nodes = smp_cpus;
+ }
+
+ /* If no memory size is given for any node, assume the default case
+ * and distribute the available memory equally across all nodes
+ */
+ for (i = 0; i < nb_numa_nodes; i++) {
+ if (node_mem[i] != 0)
+ break;
+ }
+ if (i == nb_numa_nodes) {
+ uint64_t usedmem = 0;
+
+ /* On Linux, each node's border has to be 8MB aligned,
+ * the final node gets the rest.
+ */
+ for (i = 0; i < nb_numa_nodes - 1; i++) {
+ node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1);
+ usedmem += node_mem[i];
+ }
+ node_mem[i] = ram_size - usedmem;
+ }
+
+ for (i = 0; i < nb_numa_nodes; i++) {
+ if (node_cpumask[i] != 0)
+ break;
+ }
+ /* assigning the VCPUs round-robin is easier to implement, guest OSes
+ * must cope with this anyway, because there are BIOSes out there in
+ * real machines which also use this scheme.
+ */
+ if (i == nb_numa_nodes) {
+ for (i = 0; i < smp_cpus; i++) {
+ node_cpumask[i % nb_numa_nodes] |= 1 << i;
+ }
+ }
+ }
+
if (kvm_enabled()) {
int ret;
@@ -5274,6 +5390,15 @@ int main(int argc, char **argv, char **envp)
machine->init(ram_size, vga_ram_size, boot_devices,
kernel_filename, kernel_cmdline, initrd_filename, cpu_model);
+
+ for (env = first_cpu; env != NULL; env = env->next_cpu) {
+ for (i = 0; i < nb_numa_nodes; i++) {
+ if (node_cpumask[i] & (1 << env->cpu_index)) {
+ env->numa_node = i;
+ }
+ }
+ }
+
current_machine = machine;
/* Set KVM's vcpu state to qemu's initial CPUState. */
--
1.6.1.3
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [Qemu-devel] [PATCH 1/4] added -numa cmdline parameter parser
2009-03-31 13:28 [Qemu-devel] [PATCH 0/4] add NUMA emulation Andre Przywara
@ 2009-03-31 13:28 ` Andre Przywara
0 siblings, 0 replies; 10+ messages in thread
From: Andre Przywara @ 2009-03-31 13:28 UTC (permalink / raw)
To: qemu-devel; +Cc: Andre Przywara, Andre Przywara
From: Andre Przywara <aprzywar@amd.com>
Signed-off-by: Andre Przywara <andre.przywara@amd.com>
---
qemu-options.hx | 8 ++++
sysemu.h | 7 +++-
vl.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 124 insertions(+), 5 deletions(-)
diff --git a/qemu-options.hx b/qemu-options.hx
index 6c58e2a..f3f1389 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -40,6 +40,14 @@ CPUs are supported. On Sparc32 target, Linux limits the number of usable CPUs
to 4.
ETEXI
+DEF("numa", HAS_ARG, QEMU_OPTION_numa,
+ "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n")
+STEXI
+@item -numa @var{opts}
+Simulate a multi node NUMA system. If mem and cpus are omitted, resources
+are split equally.
+ETEXI
+
DEF("fda", HAS_ARG, QEMU_OPTION_fda,
"-fda/-fdb file use 'file' as floppy disk 0/1 image\n")
DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "")
diff --git a/sysemu.h b/sysemu.h
index 3eab34b..b83a66c 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -108,6 +108,11 @@ extern const char *bootp_filename;
extern int kqemu_allowed;
#endif
+#define MAX_NODES 64
+extern int nb_numa_nodes;
+extern uint64_t node_mem[MAX_NODES];
+extern uint64_t node_cpumask[MAX_NODES];
+
#define MAX_OPTION_ROMS 16
extern const char *option_rom[MAX_OPTION_ROMS];
extern int nb_option_roms;
@@ -248,7 +253,7 @@ void do_usb_add(Monitor *mon, const char *devname);
void do_usb_del(Monitor *mon, const char *devname);
void usb_info(Monitor *mon);
-const char *get_opt_name(char *buf, int buf_size, const char *p);
+const char *get_opt_name(char *buf, int buf_size, const char *p, char delim);
const char *get_opt_value(char *buf, int buf_size, const char *p);
int get_param_value(char *buf, int buf_size,
const char *tag, const char *str);
diff --git a/vl.c b/vl.c
index 5e6c621..3f9c713 100644
--- a/vl.c
+++ b/vl.c
@@ -261,6 +261,10 @@ const char *prom_envs[MAX_PROM_ENVS];
int nb_drives_opt;
struct drive_opt drives_opt[MAX_DRIVES];
+int nb_numa_nodes;
+uint64_t node_mem[MAX_NODES];
+uint64_t node_cpumask[MAX_NODES];
+
static CPUState *cur_cpu;
static CPUState *next_cpu;
static int event_pending = 1;
@@ -1860,12 +1864,12 @@ static int socket_init(void)
}
#endif
-const char *get_opt_name(char *buf, int buf_size, const char *p)
+const char *get_opt_name(char *buf, int buf_size, const char *p, char delim)
{
char *q;
q = buf;
- while (*p != '\0' && *p != '=') {
+ while (*p != '\0' && *p != delim) {
if (q && (q - buf) < buf_size - 1)
*q++ = *p;
p++;
@@ -1905,7 +1909,7 @@ int get_param_value(char *buf, int buf_size,
p = str;
for(;;) {
- p = get_opt_name(option, sizeof(option), p);
+ p = get_opt_name(option, sizeof(option), p, '=');
if (*p != '=')
break;
p++;
@@ -1930,7 +1934,7 @@ int check_params(char *buf, int buf_size,
p = str;
for(;;) {
- p = get_opt_name(buf, buf_size, p);
+ p = get_opt_name(buf, buf_size, p, '=');
if (*p != '=')
return -1;
p++;
@@ -2623,6 +2627,53 @@ int drive_init(struct drive_opt *arg, int snapshot, void *opaque)
return drives_table_idx;
}
+/*
+ * Parse the argument of one "-numa" command line option, which has the form
+ *     node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]
+ * and store the result in node_mem[] and node_cpumask[].
+ *
+ * - If the first comma separated token is not "node", nothing is recorded.
+ * - nodeid defaults to the number of nodes registered so far; an id outside
+ *   [0, MAX_NODES) is a fatal error, because it is used as an array index.
+ * - mem is taken in megabytes unless it carries a 'G'/'g' suffix; any other
+ *   suffix leaves the parsed number unscaled.  A missing mem= stores 0.
+ * - cpus is a single CPU index or an inclusive range.  The mask is 64 bits
+ *   wide, so indices above 63 are clamped with a warning.  A range whose
+ *   end lies below its start yields an empty mask.  A missing cpus= stores 0.
+ *
+ * On success nb_numa_nodes is incremented.
+ */
+static void numa_add(const char *optarg)
+{
+    char option[128];
+    char *endptr;
+    unsigned long long value, endvalue, cpu;
+    uint64_t mask;
+    int nodenr;
+
+    optarg = get_opt_name(option, 128, optarg, ',');
+    /* Only step over a real ',' - a bare "node" ends at the terminating
+     * NUL and we must not read past it. */
+    if (*optarg == ',') {
+        optarg++;
+    }
+    if (strcmp(option, "node") != 0) {
+        return;
+    }
+
+    if (get_param_value(option, 128, "nodeid", optarg) == 0) {
+        nodenr = nb_numa_nodes;
+    } else {
+        value = strtoull(option, NULL, 10);
+        /* Reject huge values before they get truncated to int. */
+        nodenr = (value < MAX_NODES) ? (int)value : -1;
+    }
+    if (nodenr < 0 || nodenr >= MAX_NODES) {
+        fprintf(stderr, "qemu: invalid NUMA nodeid\n");
+        exit(1);
+    }
+
+    if (get_param_value(option, 128, "mem", optarg) == 0) {
+        node_mem[nodenr] = 0;
+    } else {
+        value = strtoull(option, &endptr, 0);
+        switch (*endptr) {
+        case 0: case 'M': case 'm':
+            value <<= 20;
+            break;
+        case 'G': case 'g':
+            value <<= 30;
+            break;
+        default:
+            /* unknown suffix: keep the number as parsed */
+            break;
+        }
+        node_mem[nodenr] = value;
+    }
+
+    if (get_param_value(option, 128, "cpus", optarg) == 0) {
+        node_cpumask[nodenr] = 0;
+    } else {
+        value = strtoull(option, &endptr, 10);
+        endvalue = value;
+        if (*endptr == '-') {
+            endvalue = strtoull(endptr + 1, &endptr, 10);
+        }
+        if (value > 63 || endvalue > 63) {
+            fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
+            if (value > 63) {
+                value = 63;
+            }
+            if (endvalue > 63) {
+                endvalue = 63;
+            }
+        }
+        /* Build the mask bit by bit with a 64 bit constant: shifting a
+         * plain int is undefined from bit 31 on, and a closed form like
+         * (1 << (end + 1)) would shift by the full width for CPU 63. */
+        mask = 0;
+        for (cpu = value; cpu <= endvalue; cpu++) {
+            mask |= 1ULL << cpu;
+        }
+        node_cpumask[nodenr] = mask;
+    }
+    nb_numa_nodes++;
+}
+
/***********************************************************/
/* USB devices */
@@ -4337,12 +4388,18 @@ int main(int argc, char **argv, char **envp)
virtio_consoles[i] = NULL;
virtio_console_index = 0;
+ for (i = 0; i < MAX_NODES; i++) {
+ node_cpumask[i] = 0;
+ node_mem[i] = 0;
+ }
+
usb_devices_index = 0;
nb_net_clients = 0;
nb_bt_opts = 0;
nb_drives = 0;
nb_drives_opt = 0;
+ nb_numa_nodes = 0;
hda_index = -1;
nb_nics = 0;
@@ -4492,6 +4549,13 @@ int main(int argc, char **argv, char **envp)
",trans=none" : "");
}
break;
+ case QEMU_OPTION_numa:
+ if (nb_numa_nodes >= MAX_NODES) {
+ fprintf(stderr, "qemu: too many NUMA nodes\n");
+ exit(1);
+ }
+ numa_add(optarg);
+ break;
case QEMU_OPTION_nographic:
nographic = 1;
break;
@@ -5192,6 +5256,48 @@ int main(int argc, char **argv, char **envp)
}
}
+ if (nb_numa_nodes > 0) {
+ int i;
+
+ if (nb_numa_nodes > smp_cpus) {
+ nb_numa_nodes = smp_cpus;
+ }
+
+ /* If no memory size is given for any node, assume the default case
+ * and distribute the available memory equally across all nodes
+ */
+ for (i = 0; i < nb_numa_nodes; i++) {
+ if (node_mem[i] != 0)
+ break;
+ }
+ if (i == nb_numa_nodes) {
+ uint64_t usedmem = 0;
+
+ /* On Linux, each node's border has to be 8MB aligned,
+ * the final node gets the rest.
+ */
+ for (i = 0; i < nb_numa_nodes - 1; i++) {
+ node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1);
+ usedmem += node_mem[i];
+ }
+ node_mem[i] = ram_size - usedmem;
+ }
+
+ for (i = 0; i < nb_numa_nodes; i++) {
+ if (node_cpumask[i] != 0)
+ break;
+ }
+ /* assigning the VCPUs round-robin is easier to implement, guest OSes
+ * must cope with this anyway, because there are BIOSes out there in
+ * real machines which also use this scheme.
+ */
+ if (i == nb_numa_nodes) {
+ for (i = 0; i < smp_cpus; i++) {
+ node_cpumask[i % nb_numa_nodes] |= 1<<i;
+ }
+ }
+ }
+
if (kvm_enabled()) {
int ret;
--
1.6.1.3
^ permalink raw reply related [flat|nested] 10+ messages in thread