From mboxrd@z Thu Jan 1 00:00:00 1970 From: Vasilis Liaskovitis Subject: Re: [Qemu-devel] [RFC PATCH v2 09/21] pc: Add dimm paravirt SRAT info Date: Fri, 13 Jul 2012 19:40:32 +0200 Message-ID: <20120713174032.GD13216@dhcp-192-168-178-175.profitbricks.localdomain> References: <1342002726-18258-1-git-send-email-vasilis.liaskovitis@profitbricks.com> <1342002726-18258-10-git-send-email-vasilis.liaskovitis@profitbricks.com> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org, seabios@seabios.org, gleb@redhat.com, kevin@koconnor.net, avi@redhat.com, anthony@codemonkey.ws, imammedo@redhat.com To: Blue Swirl Return-path: Received: from mail-bk0-f46.google.com ([209.85.214.46]:59483 "EHLO mail-bk0-f46.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751800Ab2GMRkg (ORCPT ); Fri, 13 Jul 2012 13:40:36 -0400 Received: by bkwj10 with SMTP id j10so3387885bkw.19 for ; Fri, 13 Jul 2012 10:40:34 -0700 (PDT) Content-Disposition: inline In-Reply-To: Sender: kvm-owner@vger.kernel.org List-ID: On Thu, Jul 12, 2012 at 07:48:04PM +0000, Blue Swirl wrote: > On Wed, Jul 11, 2012 at 10:31 AM, Vasilis Liaskovitis > wrote: > > The numa_fw_cfg paravirt interface is extended to include SRAT information for > > all hotplug-able dimms. There are 3 words for each hotplug-able memory slot, > > denoting start address, size and node proximity. The new info is appended after > > existing numa info, so that the fw_cfg layout does not break. This information > > is used by Seabios to build hotplug memory device objects at runtime. > > nb_numa_nodes is set to 1 by default (not 0), so that we always pass srat info > > to SeaBIOS. 
> > > > v1->v2: > > Dimm SRAT info (#dimms) is appended at end of existing numa fw_cfg in order not > > to break existing layout > > Documentation of the new fwcfg layout is included in docs/specs/fwcfg.txt > > > > Signed-off-by: Vasilis Liaskovitis > > --- > > docs/specs/fwcfg.txt | 28 ++++++++++++++++++++++++++ > > hw/pc.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++- > > vl.c | 2 +- > > 3 files changed, 80 insertions(+), 3 deletions(-) > > create mode 100644 docs/specs/fwcfg.txt > > > > diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt > > new file mode 100644 > > index 0000000..e6fcd8f > > --- /dev/null > > +++ b/docs/specs/fwcfg.txt > > @@ -0,0 +1,28 @@ > > +QEMU<->BIOS Paravirt Documentation > > +-------------------------------------- > > + > > +This document describes paravirt data structures passed from QEMU to BIOS. > > + > > +fw_cfg SRAT paravirt info > > +-------------------- > > +The SRAT info passed from QEMU to BIOS has the following layout: > > + > > +----------------------------------------------------------------------------------------------- > > +#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem | ... | nodelast_mem > > + > > +----------------------------------------------------------------------------------------------- > > +#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start | dimmlast_sz | dimmlast_pxm > > + > > +Entry 0 contains the number of numa nodes (nb_numa_nodes). > > + > > +Entries 1..max_cpus: The next max_cpus entries describe node proximity for each > > +one of the vCPUs in the system. > > + > > +Entries max_cpus+1..max_cpus+nb_numa_nodes+1: The next nb_numa_nodes entries > > +describe the memory size for each one of the NUMA nodes in the system. 
> > + > > +Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms) > > + > > +The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains > > +the physical address offset, size (in bytes), and node proximity for the > > +respective dimm. > > The size and endianness are not specified, you are using LE 64 bit > values for each item. thanks, I'll update. > > > diff --git a/hw/pc.c b/hw/pc.c > > index ef9901a..cf651d0 100644 > > --- a/hw/pc.c > > +++ b/hw/pc.c > > @@ -598,12 +598,15 @@ int e820_add_entry(uint64_t address, uint64_t length, uint32_t type) > > return index; > > } > > > > +static void setup_hp_dimms(uint64_t *fw_cfg_slots); > > + > > static void *bochs_bios_init(void) > > { > > void *fw_cfg; > > uint8_t *smbios_table; > > size_t smbios_len; > > uint64_t *numa_fw_cfg; > > + uint64_t *hp_dimms_fw_cfg; > > int i, j; > > > > register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL); > > @@ -638,8 +641,10 @@ static void *bochs_bios_init(void) > > /* allocate memory for the NUMA channel: one (64bit) word for the number > > * of nodes, one word for each VCPU->node and one word for each node to > > * hold the amount of memory. > > + * Finally one word for the number of hotplug memory slots and three words > > + * for each hotplug memory slot (start address, size and node proximity). 
> > */ > > - numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8); > > + numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8); > > numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes); > > for (i = 0; i < max_cpus; i++) { > > for (j = 0; j < nb_numa_nodes; j++) { > > @@ -652,8 +657,15 @@ static void *bochs_bios_init(void) > > for (i = 0; i < nb_numa_nodes; i++) { > > numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]); > > } > > + > > + numa_fw_cfg[1 + max_cpus + nb_numa_nodes] = cpu_to_le64(nb_hp_dimms); > > + > > + hp_dimms_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes; > > + if (nb_hp_dimms) > > + setup_hp_dimms(hp_dimms_fw_cfg); > > Braces. > > > + > > fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg, > > - (1 + max_cpus + nb_numa_nodes) * 8); > > + (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8); > > > > return fw_cfg; > > } > > @@ -1223,3 +1235,40 @@ target_phys_addr_t pc_set_hp_memory_offset(uint64_t size) > > > > return ret; > > } > > + > > +static void setup_hp_dimms(uint64_t *fw_cfg_slots) > > +{ > > + int i = 0; > > + Error *err = NULL; > > + DeviceState *dev; > > + DimmState *slot; > > + const char *type; > > + BusChild *kid; > > + BusState *bus = sysbus_get_default(); > > + > > + QTAILQ_FOREACH(kid, &bus->children, sibling) { > > + dev = kid->child; > > + type = object_property_get_str(OBJECT(dev), "type", &err); > > + if (err) { > > + error_free(err); > > + fprintf(stderr, "error getting device type\n"); > > + exit(1); > > + } > > + > > + if (!strcmp(type, "dimm")) { > > + if (!dev->id) { > > + fprintf(stderr, "error getting dimm device id\n"); > > + exit(1); > > + } > > + slot = DIMM(dev); > > + /* determine starting physical address for this memory slot */ > > + assert(slot->start); > > + fw_cfg_slots[3 * slot->idx] = cpu_to_le64(slot->start); > > + fw_cfg_slots[3 * slot->idx + 1] = cpu_to_le64(slot->size); > > + fw_cfg_slots[3 * slot->idx + 2] = cpu_to_le64(slot->node); > > + i++; > > + } > > + } 
> > + assert(i == nb_hp_dimms); > > +} > > + > > diff --git a/vl.c b/vl.c > > index 0ff8818..37c9798 100644 > > --- a/vl.c > > +++ b/vl.c > > @@ -2335,7 +2335,7 @@ int main(int argc, char **argv, char **envp) > > node_cpumask[i] = 0; > > } > > > > - nb_numa_nodes = 0; > > + nb_numa_nodes = 1; > > nb_nics = 0; > > > > autostart= 1; > > -- > > 1.7.9 > > > > From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([208.118.235.92]:57719) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Spjr3-00080s-Gn for qemu-devel@nongnu.org; Fri, 13 Jul 2012 13:40:39 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1Spjr2-0006l1-2c for qemu-devel@nongnu.org; Fri, 13 Jul 2012 13:40:37 -0400 Received: from mail-bk0-f45.google.com ([209.85.214.45]:38837) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Spjr1-0006kk-O5 for qemu-devel@nongnu.org; Fri, 13 Jul 2012 13:40:35 -0400 Received: by bkcji1 with SMTP id ji1so2838936bkc.4 for ; Fri, 13 Jul 2012 10:40:35 -0700 (PDT) Date: Fri, 13 Jul 2012 19:40:32 +0200 From: Vasilis Liaskovitis Message-ID: <20120713174032.GD13216@dhcp-192-168-178-175.profitbricks.localdomain> References: <1342002726-18258-1-git-send-email-vasilis.liaskovitis@profitbricks.com> <1342002726-18258-10-git-send-email-vasilis.liaskovitis@profitbricks.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: Subject: Re: [Qemu-devel] [RFC PATCH v2 09/21] pc: Add dimm paravirt SRAT info List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Blue Swirl Cc: kvm@vger.kernel.org, gleb@redhat.com, seabios@seabios.org, qemu-devel@nongnu.org, kevin@koconnor.net, avi@redhat.com, anthony@codemonkey.ws, imammedo@redhat.com On Thu, Jul 12, 2012 at 07:48:04PM +0000, Blue Swirl wrote: > On Wed, Jul 11, 2012 at 10:31 AM, Vasilis Liaskovitis > wrote: > > The numa_fw_cfg paravirt interface is extended to include SRAT 
information for > > all hotplug-able dimms. There are 3 words for each hotplug-able memory slot, > > denoting start address, size and node proximity. The new info is appended after > > existing numa info, so that the fw_cfg layout does not break. This information > > is used by Seabios to build hotplug memory device objects at runtime. > > nb_numa_nodes is set to 1 by default (not 0), so that we always pass srat info > > to SeaBIOS. > > > > v1->v2: > > Dimm SRAT info (#dimms) is appended at end of existing numa fw_cfg in order not > > to break existing layout > > Documentation of the new fwcfg layout is included in docs/specs/fwcfg.txt > > > > Signed-off-by: Vasilis Liaskovitis > > --- > > docs/specs/fwcfg.txt | 28 ++++++++++++++++++++++++++ > > hw/pc.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++- > > vl.c | 2 +- > > 3 files changed, 80 insertions(+), 3 deletions(-) > > create mode 100644 docs/specs/fwcfg.txt > > > > diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt > > new file mode 100644 > > index 0000000..e6fcd8f > > --- /dev/null > > +++ b/docs/specs/fwcfg.txt > > @@ -0,0 +1,28 @@ > > +QEMU<->BIOS Paravirt Documentation > > +-------------------------------------- > > + > > +This document describes paravirt data structures passed from QEMU to BIOS. > > + > > +fw_cfg SRAT paravirt info > > +-------------------- > > +The SRAT info passed from QEMU to BIOS has the following layout: > > + > > +----------------------------------------------------------------------------------------------- > > +#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem | ... | nodelast_mem > > + > > +----------------------------------------------------------------------------------------------- > > +#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start | dimmlast_sz | dimmlast_pxm > > + > > +Entry 0 contains the number of numa nodes (nb_numa_nodes). 
> > + > > +Entries 1..max_cpus: The next max_cpus entries describe node proximity for each > > +one of the vCPUs in the system. > > + > > +Entries max_cpus+1..max_cpus+nb_numa_nodes+1: The next nb_numa_nodes entries > > +describe the memory size for each one of the NUMA nodes in the system. > > + > > +Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms) > > + > > +The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains > > +the physical address offset, size (in bytes), and node proximity for the > > +respective dimm. > > The size and endianness are not specified, you are using LE 64 bit > values for each item. thanks, I'll update. > > > diff --git a/hw/pc.c b/hw/pc.c > > index ef9901a..cf651d0 100644 > > --- a/hw/pc.c > > +++ b/hw/pc.c > > @@ -598,12 +598,15 @@ int e820_add_entry(uint64_t address, uint64_t length, uint32_t type) > > return index; > > } > > > > +static void setup_hp_dimms(uint64_t *fw_cfg_slots); > > + > > static void *bochs_bios_init(void) > > { > > void *fw_cfg; > > uint8_t *smbios_table; > > size_t smbios_len; > > uint64_t *numa_fw_cfg; > > + uint64_t *hp_dimms_fw_cfg; > > int i, j; > > > > register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL); > > @@ -638,8 +641,10 @@ static void *bochs_bios_init(void) > > /* allocate memory for the NUMA channel: one (64bit) word for the number > > * of nodes, one word for each VCPU->node and one word for each node to > > * hold the amount of memory. > > + * Finally one word for the number of hotplug memory slots and three words > > + * for each hotplug memory slot (start address, size and node proximity). 
> > */ > > - numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8); > > + numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8); > > numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes); > > for (i = 0; i < max_cpus; i++) { > > for (j = 0; j < nb_numa_nodes; j++) { > > @@ -652,8 +657,15 @@ static void *bochs_bios_init(void) > > for (i = 0; i < nb_numa_nodes; i++) { > > numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]); > > } > > + > > + numa_fw_cfg[1 + max_cpus + nb_numa_nodes] = cpu_to_le64(nb_hp_dimms); > > + > > + hp_dimms_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes; > > + if (nb_hp_dimms) > > + setup_hp_dimms(hp_dimms_fw_cfg); > > Braces. > > > + > > fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg, > > - (1 + max_cpus + nb_numa_nodes) * 8); > > + (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8); > > > > return fw_cfg; > > } > > @@ -1223,3 +1235,40 @@ target_phys_addr_t pc_set_hp_memory_offset(uint64_t size) > > > > return ret; > > } > > + > > +static void setup_hp_dimms(uint64_t *fw_cfg_slots) > > +{ > > + int i = 0; > > + Error *err = NULL; > > + DeviceState *dev; > > + DimmState *slot; > > + const char *type; > > + BusChild *kid; > > + BusState *bus = sysbus_get_default(); > > + > > + QTAILQ_FOREACH(kid, &bus->children, sibling) { > > + dev = kid->child; > > + type = object_property_get_str(OBJECT(dev), "type", &err); > > + if (err) { > > + error_free(err); > > + fprintf(stderr, "error getting device type\n"); > > + exit(1); > > + } > > + > > + if (!strcmp(type, "dimm")) { > > + if (!dev->id) { > > + fprintf(stderr, "error getting dimm device id\n"); > > + exit(1); > > + } > > + slot = DIMM(dev); > > + /* determine starting physical address for this memory slot */ > > + assert(slot->start); > > + fw_cfg_slots[3 * slot->idx] = cpu_to_le64(slot->start); > > + fw_cfg_slots[3 * slot->idx + 1] = cpu_to_le64(slot->size); > > + fw_cfg_slots[3 * slot->idx + 2] = cpu_to_le64(slot->node); > > + i++; > > + } > > + } 
> > + assert(i == nb_hp_dimms); > > +} > > + > > diff --git a/vl.c b/vl.c > > index 0ff8818..37c9798 100644 > > --- a/vl.c > > +++ b/vl.c > > @@ -2335,7 +2335,7 @@ int main(int argc, char **argv, char **envp) > > node_cpumask[i] = 0; > > } > > > > - nb_numa_nodes = 0; > > + nb_numa_nodes = 1; > > nb_nics = 0; > > > > autostart= 1; > > -- > > 1.7.9 > > > >