From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:43524) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1UrloP-00051d-Ev for qemu-devel@nongnu.org; Wed, 26 Jun 2013 05:14:51 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1UrloM-0002Zu-5n for qemu-devel@nongnu.org; Wed, 26 Jun 2013 05:14:49 -0400 Received: from [222.73.24.84] (port=10916 helo=song.cn.fujitsu.com) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1UrloL-0002Wy-7O for qemu-devel@nongnu.org; Wed, 26 Jun 2013 05:14:46 -0400 From: Hu Tao Date: Wed, 26 Jun 2013 17:13:33 +0800 Message-Id: <52ef8c94b2e3af11ea918b2c84756d81a61db15f.1372234719.git.hutao@cn.fujitsu.com> In-Reply-To: References: Subject: [Qemu-devel] [PATCH v5 10/14] pc: Add dimm paravirt SRAT info List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: Vasilis Liaskovitis The numa_fw_cfg paravirt interface is extended to include SRAT information for all hotplug-able dimms. There are 3 words for each hotplug-able memory slot, denoting start address, size and node proximity. The new info is appended after existing numa info, so that the fw_cfg layout does not break. This information is used by SeaBIOS to build hotplug memory device objects at runtime. nb_numa_nodes is set to 1 by default (not 0), so that we always pass srat info to SeaBIOS. v3->v4: numa_fw_cfg needs to be initialized after memory controller sets up dimm ranges. Make changes for pc_piix and pc_q35 to set numa_fw_cfg after i440fx initialization. 
v2->v3: setting nb_numa_nodes to 1 is not needed v1->v2: Dimm SRAT info (#dimms) is appended at end of existing numa fw_cfg in order not to break existing layout Documentation of the new fwcfg layout is included in docs/specs/fwcfg.txt Signed-off-by: Vasilis Liaskovitis Signed-off-by: Hu Tao --- docs/specs/fwcfg.txt | 28 ++++++++++++++++++++++++++++ hw/i386/pc.c | 30 ++++++++++++++++++++++++------ hw/i386/pc_piix.c | 1 + hw/i386/pc_q35.c | 7 +++++-- include/hw/i386/pc.h | 1 + include/sysemu/sysemu.h | 1 + 6 files changed, 60 insertions(+), 8 deletions(-) create mode 100644 docs/specs/fwcfg.txt diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt new file mode 100644 index 0000000..e6fcd8f --- /dev/null +++ b/docs/specs/fwcfg.txt @@ -0,0 +1,28 @@ +QEMU<->BIOS Paravirt Documentation +-------------------------------------- + +This document describes paravirt data structures passed from QEMU to BIOS. + +fw_cfg SRAT paravirt info +-------------------- +The SRAT info passed from QEMU to BIOS has the following layout: + +----------------------------------------------------------------------------------------------- +#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem | ... | nodelast_mem + +----------------------------------------------------------------------------------------------- +#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start | dimmlast_sz | dimmlast_pxm + +Entry 0 contains the number of numa nodes (nb_numa_nodes). + +Entries 1..max_cpus: The next max_cpus entries describe node proximity for each +one of the vCPUs in the system. + +Entries max_cpus+1..max_cpus+nb_numa_nodes: The next nb_numa_nodes entries +describe the memory size for each one of the NUMA nodes in the system. 
+ +Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms) + +The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains +the physical address offset, size (in bytes), and node proximity for the +respective dimm. diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 65838a6..b51d3b5 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -55,6 +55,7 @@ #include "hw/acpi/acpi.h" #include "hw/cpu/icc_bus.h" #include "hw/boards.h" +#include "hw/mem-hotplug/dimm.h" /* debug PC/ISA interrupts */ //#define DEBUG_IRQ @@ -606,8 +607,6 @@ static FWCfgState *bochs_bios_init(void) FWCfgState *fw_cfg; uint8_t *smbios_table; size_t smbios_len; - uint64_t *numa_fw_cfg; - int i, j; unsigned int apic_id_limit = pc_apic_id_limit(max_cpus); fw_cfg = fw_cfg_init(BIOS_CFG_IOPORT, BIOS_CFG_IOPORT + 1, 0, 0); @@ -640,11 +639,25 @@ static FWCfgState *bochs_bios_init(void) &e820_table, sizeof(e820_table)); fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_cfg, sizeof(hpet_cfg)); + + return fw_cfg; +} + +void bochs_meminfo_bios_init(void *fw_cfg) +{ + uint64_t *numa_fw_cfg; + uint64_t *hp_dimms_fw_cfg; + int i, j; + unsigned int apic_id_limit = pc_apic_id_limit(max_cpus); + /* allocate memory for the NUMA channel: one (64bit) word for the number * of nodes, one word for each VCPU->node and one word for each node to * hold the amount of memory. + * Finally one word for the number of hotplug memory slots and three words + * for each hotplug memory slot (start address, size and node proximity). 
*/ - numa_fw_cfg = g_new0(uint64_t, 1 + apic_id_limit + nb_numa_nodes); + numa_fw_cfg = g_new0(uint64_t, + 2 + apic_id_limit + nb_numa_nodes + 3 * nb_hp_dimms); numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes); for (i = 0; i < max_cpus; i++) { unsigned int apic_id = x86_cpu_apic_id_from_index(i); @@ -659,11 +672,16 @@ static FWCfgState *bochs_bios_init(void) for (i = 0; i < nb_numa_nodes; i++) { numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]); } + + numa_fw_cfg[1 + apic_id_limit + nb_numa_nodes] = cpu_to_le64(nb_hp_dimms); + + hp_dimms_fw_cfg = numa_fw_cfg + 2 + apic_id_limit + nb_numa_nodes; + if (nb_hp_dimms) { + dimm_setup_fwcfg_layout(hp_dimms_fw_cfg); + } fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, - (1 + apic_id_limit + nb_numa_nodes) * + (2 + apic_id_limit + nb_numa_nodes + 3 * nb_hp_dimms) * sizeof(*numa_fw_cfg)); - - return fw_cfg; } static long get_file_size(FILE *f) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index fb056df..6e18343 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -138,6 +138,7 @@ static void pc_init1(MemoryRegion *system_memory, if (!xen_enabled()) { fw_cfg = pc_memory_init(kernel_filename, kernel_cmdline, initrd_filename, below_4g_mem_size, above_4g_mem_size); + bochs_meminfo_bios_init(fw_cfg); } if (kvm_irqchip_in_kernel()) { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 5fe14bb..2c14977 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -74,6 +74,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args) ICH9LPCState *ich9_lpc; PCIDevice *ahci; DeviceState *icc_bridge; + void *fw_cfg = NULL; icc_bridge = qdev_create(NULL, TYPE_ICC_BRIDGE); object_property_add_child(qdev_get_machine(), "icc-bridge", @@ -97,8 +98,9 @@ static void pc_q35_init(QEMUMachineInitArgs *args) /* allocate ram and load rom/bios */ if (!xen_enabled()) { - pc_memory_init(kernel_filename, kernel_cmdline, - initrd_filename, below_4g_mem_size, above_4g_mem_size); + fw_cfg = pc_memory_init(kernel_filename, kernel_cmdline, + 
initrd_filename, below_4g_mem_size, + above_4g_mem_size); } /* irq lines */ @@ -116,6 +118,7 @@ static void pc_q35_init(QEMUMachineInitArgs *args) q35_host->mch.address_space_io = get_system_io(); /* pci */ qdev_init_nofail(DEVICE(q35_host)); + bochs_meminfo_bios_init(fw_cfg); host_bus = q35_host->host.pci.bus; /* create ISA bus */ lpc = pci_create_simple_multifunction(host_bus, PCI_DEVFN(ICH9_LPC_DEV, diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 959b92b..4a29e6e 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -231,6 +231,7 @@ int pvpanic_init(ISABus *bus); #define E820_UNUSABLE 5 int e820_add_entry(uint64_t, uint64_t, uint32_t); +void bochs_meminfo_bios_init(void *fw_cfg); #define PC_COMPAT_1_5 \ {\ diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 2fb71af..2644faa 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -132,6 +132,7 @@ extern QEMUClock *rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +extern int nb_hp_dimms; #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { -- 1.8.3.1