From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:35259) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1YLmyV-0001pn-PE for qemu-devel@nongnu.org; Thu, 12 Feb 2015 01:10:09 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1YLmyT-0005bM-Gp for qemu-devel@nongnu.org; Thu, 12 Feb 2015 01:10:07 -0500 Received: from ozlabs.org ([2401:3900:2:1::2]:38353) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1YLmyS-0005Vo-SJ for qemu-devel@nongnu.org; Thu, 12 Feb 2015 01:10:05 -0500 Date: Thu, 12 Feb 2015 17:02:24 +1100 From: David Gibson Message-ID: <20150212060224.GC20593@voom.redhat.com> References: <1420697420-16053-1-git-send-email-bharata@linux.vnet.ibm.com> <1420697420-16053-13-git-send-email-bharata@linux.vnet.ibm.com> MIME-Version: 1.0 Content-Type: multipart/signed; micalg=pgp-sha1; protocol="application/pgp-signature"; boundary="i7F3eY7HS/tUJxUd" Content-Disposition: inline In-Reply-To: <1420697420-16053-13-git-send-email-bharata@linux.vnet.ibm.com> Subject: Re: [Qemu-devel] [RFC PATCH v1 12/13] spapr: Support ibm, dynamic-reconfiguration-memory List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Bharata B Rao Cc: imammedo@redhat.com, agraf@suse.de, qemu-devel@nongnu.org, mdroth@linux.vnet.ibm.com --i7F3eY7HS/tUJxUd Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Content-Transfer-Encoding: quoted-printable On Thu, Jan 08, 2015 at 11:40:19AM +0530, Bharata B Rao wrote: > Parse ibm,architecture.vec table obtained from the guest and enable > memory node configuration via ibm,dynamic-reconfiguration-memory if guest > supports it. This is in preparation to support memory hotplug for > sPAPR guests. >=20 > This changes the way memory node configuration is done. Currently all > memory nodes are built upfront. But after this patch, only memory@0 node > for RMA is built upfront. Guest kernel boots with just that and rest of > the memory nodes (via memory@XXX or ibm,dynamic-reconfiguration-memory) > are built when guest does ibm,client-architecture-support call. >=20 > Note: This patch was tested with an enhancement to SLOF that supports > addition of device tree nodes from ibm,client-architecture-support call. >=20 > TODO: Enforce lmb-size alignment for node memory. I think this todo needs to be done before you're ready to merge. > Signed-off-by: Bharata B Rao > --- > hw/ppc/spapr.c | 232 ++++++++++++++++++++++++++++++++++++++++---= ------ > hw/ppc/spapr_hcall.c | 51 +++++++++-- > include/hw/ppc/spapr.h | 12 ++- > 3 files changed, 246 insertions(+), 49 deletions(-) >=20 > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 9ff08ff..6964b06 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -631,42 +631,6 @@ static void *spapr_create_fdt_skel(hwaddr initrd_bas= e, > return fdt; > } > =20 > -int spapr_h_cas_compose_response(target_ulong addr, target_ulong size) > -{ > - void *fdt, *fdt_skel; > - sPAPRDeviceTreeUpdateHeader hdr =3D { .version_id =3D 1 }; > - > - size -=3D sizeof(hdr); > - > - /* Create sceleton */ > - fdt_skel =3D g_malloc0(size); > - _FDT((fdt_create(fdt_skel, size))); > - _FDT((fdt_begin_node(fdt_skel, ""))); > - _FDT((fdt_end_node(fdt_skel))); > - _FDT((fdt_finish(fdt_skel))); > - fdt =3D g_malloc0(size); > - _FDT((fdt_open_into(fdt_skel, fdt, size))); > - g_free(fdt_skel); > - > - /* Fix skeleton up */ > - _FDT((spapr_fixup_cpu_dt(fdt, spapr))); > - > - /* Pack resulting tree */ > - _FDT((fdt_pack(fdt))); > - > - if (fdt_totalsize(fdt) + sizeof(hdr) > size) { > - trace_spapr_cas_failed(size); > - return -1; > - } > - > - cpu_physical_memory_write(addr, &hdr, sizeof(hdr)); > - cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt= )); > - trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr)); > - g_free(fdt); > - > - return 0; > -} > - > static void spapr_populate_memory_node(void *fdt, int nodeid, hwaddr sta= rt, > hwaddr size) > { > @@ -720,7 +684,6 @@ static int spapr_populate_memory(sPAPREnvironment *sp= apr, void *fdt) > } > if (!mem_start) { > /* ppc_spapr_init() checks for rma_size <=3D node0_size alre= ady */ > - spapr_populate_memory_node(fdt, i, 0, spapr->rma_size); > mem_start +=3D spapr->rma_size; > node_size -=3D spapr->rma_size; > } > @@ -741,6 +704,190 @@ static int spapr_populate_memory(sPAPREnvironment *= spapr, void *fdt) > return 0; > } > =20 > +/* > + * TODO: Take care of sparsemem configuration ? > + */ > +static uint64_t numa_node_end(uint32_t nodeid) > +{ > + uint32_t i =3D 0; > + uint64_t addr =3D 0; > + > + do { > + addr +=3D numa_info[i].node_mem; > + } while (++i <=3D nodeid); > + > + return addr; > +} > + > +static uint64_t numa_node_start(uint32_t nodeid) > +{ > + if (!nodeid) { > + return 0; > + } else { > + return numa_node_end(nodeid - 1); > + } > +} There's really no better generic way of finding the addresses of numa nodes? > +/* > + * Given the addr, return the NUMA node to which the address belongs to. > + */ > +static uint32_t get_numa_node(uint64_t addr) > +{ > + uint32_t i; > + > + for (i =3D 0; i < nb_numa_nodes; i++) { > + if ((addr >=3D numa_node_start(i)) && (addr < numa_node_end(i)))= { This is O(nb_numa_nodes^2) which is kind of nasty, althoguh that's unlikely to be large enough to be a real problem. > + return i; > + } > + } > + > + /* Unassigned memory goes to node 0 by default */ > + return 0; > +} > + > +/* Adds ibm,dynamic-reconfiguration-memory node */ > +static int spapr_populate_drconf_memory(sPAPREnvironment *spapr, void *f= dt) > +{ > + int root_offset, ret, i, offset; > + uint32_t lmb_size =3D SPAPR_MIN_MEMORY_BLOCK_SIZE; > + uint32_t prop_lmb_size[] =3D {0, cpu_to_be32(lmb_size)}; > + uint32_t dynamic_memory[DR_LMB_LIST_ENTRY_SIZE]; > + uint32_t nr_rma_lmbs =3D spapr->rma_size/lmb_size; > + uint32_t nr_lmbs =3D spapr->maxram_limit/lmb_size - nr_rma_lmbs; > + uint32_t nr_assigned_lmbs =3D spapr->ram_limit/lmb_size - nr_rma_lmb= s; > + uint32_t *int_buf, *cur_index, buf_len; > + > + /* Allocate enough buffer size to fit in ibm,dynamic-memory */ > + buf_len =3D nr_lmbs * DR_LMB_LIST_ENTRY_SIZE * sizeof(uint32_t) + > + sizeof(uint32_t); > + cur_index =3D int_buf =3D g_malloc0(buf_len); > + root_offset =3D fdt_path_offset(fdt, "/"); You don't need this, the fdt offset of / is always 0 and you're allowed to count on that. > + > + > + offset =3D fdt_add_subnode(fdt, root_offset, > + "ibm,dynamic-reconfiguration-memory"); > + > + ret =3D fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size, > + sizeof(prop_lmb_size)); > + if (ret < 0) { > + goto out; > + } Current versions of libfdt have fdt_setprop_u64 which you could use for this. > + ret =3D fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", > + cpu_to_be32(0xff)); fdt_setprop_cell has the byteswap built-in, so adding your own as well will make it incorrect for an LE host. > + if (ret < 0) { > + goto out; > + } > + > + ret =3D fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", > + cpu_to_be32(0x0)); > + if (ret < 0) { > + goto out; > + } > + > + /* ibm,dynamic-memory */ > + int_buf[0] =3D cpu_to_be32(nr_lmbs); > + cur_index++; > + for (i =3D 0; i < nr_lmbs; i++) { > + sPAPRDRConnector *drc; > + sPAPRDRConnectorClass *drck; > + uint64_t addr; > + > + if (i < nr_assigned_lmbs) { > + addr =3D (i + nr_rma_lmbs) * lmb_size; > + } else { > + addr =3D (i - nr_assigned_lmbs) * lmb_size + > + SPAPR_MACHINE(qdev_get_machine())->hotplug_memory_base; > + } > + drc =3D spapr_dr_connector_new(qdev_get_machine(), > + SPAPR_DR_CONNECTOR_TYPE_LMB, addr/lmb_size); > + drck =3D SPAPR_DR_CONNECTOR_GET_CLASS(drc); > + > + dynamic_memory[0] =3D cpu_to_be32(addr >> 32); > + dynamic_memory[1] =3D cpu_to_be32(addr & 0xffffffff); > + dynamic_memory[2] =3D cpu_to_be32(drck->get_index(drc)); > + dynamic_memory[3] =3D cpu_to_be32(0); /* reserved */ > + dynamic_memory[4] =3D cpu_to_be32(get_numa_node(addr)); As noted above your current get_numa_node() implementation is O(N^2) making this routine O(N^3). > + dynamic_memory[5] =3D (addr < spapr->ram_limit) ? > + cpu_to_be32(LMB_FLAGS_ASSIGNED) : > + cpu_to_be32(0); > + > + memcpy(cur_index, dynamic_memory, sizeof(dynamic_memory)); You could just use uint32_t *dynamic_memory =3D cur_index at the beginning of the loop block to avoid this memcpy(). > + cur_index +=3D DR_LMB_LIST_ENTRY_SIZE; > + } > + ret =3D fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_= len); > + if (ret < 0) { > + goto out; > + } > + > + /* ibm,associativity-lookup-arrays */ > + cur_index =3D int_buf; > + int_buf[0] =3D cpu_to_be32(nb_numa_nodes); > + int_buf[1] =3D cpu_to_be32(4); Something explaining the significance of this 4 for those of us that don't have access to PAPR+ would be nice. > + cur_index +=3D 2; > + for (i =3D 0; i < nb_numa_nodes; i++) { > + uint32_t associativity[] =3D { > + cpu_to_be32(0x0), > + cpu_to_be32(0x0), > + cpu_to_be32(0x0), > + cpu_to_be32(i) > + }; > + memcpy(cur_index, associativity, sizeof(associativity)); > + cur_index +=3D 4; > + } > + ret =3D fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", = int_buf, > + (cur_index - int_buf) * sizeof(uint32_t)); > +out: > + g_free(int_buf); > + return ret; > +} > + > +int spapr_h_cas_compose_response(target_ulong addr, target_ulong size, > + bool cpu_update, bool memory_update) > +{ > + void *fdt, *fdt_skel; > + sPAPRDeviceTreeUpdateHeader hdr =3D { .version_id =3D 1 }; > + > + size -=3D sizeof(hdr); > + > + /* Create sceleton */ > + fdt_skel =3D g_malloc0(size); > + _FDT((fdt_create(fdt_skel, size))); > + _FDT((fdt_begin_node(fdt_skel, ""))); > + _FDT((fdt_end_node(fdt_skel))); > + _FDT((fdt_finish(fdt_skel))); > + fdt =3D g_malloc0(size); > + _FDT((fdt_open_into(fdt_skel, fdt, size))); > + g_free(fdt_skel); > + > + /* Fixup cpu nodes */ > + if (cpu_update) { > + _FDT((spapr_fixup_cpu_dt(fdt, spapr))); > + } > + > + /* Generate memory nodes or ibm,dynamic-reconfiguration-memory node = */ > + if (memory_update) { > + _FDT((spapr_populate_drconf_memory(spapr, fdt))); > + } else { > + _FDT((spapr_populate_memory(spapr, fdt))); > + } > + > + /* Pack resulting tree */ > + _FDT((fdt_pack(fdt))); > + > + if (fdt_totalsize(fdt) + sizeof(hdr) > size) { You could just set the correct size (size - sizeof(hdr)) to fdt_create() and fdt_open_into() and avoid this failure case. > + trace_spapr_cas_failed(size); > + return -1; > + } > + > + cpu_physical_memory_write(addr, &hdr, sizeof(hdr)); > + cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt= )); > + trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr)); > + g_free(fdt); > + > + return 0; > +} > + > static void spapr_finalize_fdt(sPAPREnvironment *spapr, > hwaddr fdt_addr, > hwaddr rtas_addr, > @@ -757,11 +904,12 @@ static void spapr_finalize_fdt(sPAPREnvironment *sp= apr, > /* open out the base tree into a temp buffer for the final tweaks */ > _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE))); > =20 > - ret =3D spapr_populate_memory(spapr, fdt); > - if (ret < 0) { > - fprintf(stderr, "couldn't setup memory nodes in fdt\n"); > - exit(1); > - } > + /* > + * Add memory@0 node to represent RMA. Rest of the memory is either > + * represented by memory nodes or ibm,dynamic-reconfiguration-memory > + * node later during ibm,client-architecture-support call. > + */ > + spapr_populate_memory_node(fdt, 0, 0, spapr->rma_size); > =20 > ret =3D spapr_populate_vdevice(spapr->vio_bus, fdt); > if (ret < 0) { > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c > index 8651447..10f05f4 100644 > --- a/hw/ppc/spapr_hcall.c > +++ b/hw/ppc/spapr_hcall.c > @@ -805,6 +805,32 @@ static target_ulong h_set_mode(PowerPCCPU *cpu, sPAP= REnvironment *spapr, > return ret; > } > =20 > +/* > + * Return the offset to the requested option vector @vector in the > + * option vector table @table. > + */ > +static target_ulong cas_get_option_vector(int vector, target_ulong table) > +{ > + int i; > + char nr_vectors, nr_entries; > + > + if (!table) { > + return 0; > + } > + > + nr_vectors =3D (rtas_ld(table, 0) >> 24) + 1; This is kind of abusing the rtas_ld() function. It's really only intended for accessing rtas arguments, not an arbitrary array of u32s. > + if (!vector || vector > nr_vectors) { > + return 0; > + } > + table++; /* skip nr option vectors */ > + > + for (i =3D 0; i < vector - 1; i++) { > + nr_entries =3D rtas_ld(table, 0) >> 24; > + table +=3D nr_entries + 2; > + } > + return table; > +} > + > typedef struct { > PowerPCCPU *cpu; > uint32_t cpu_version; > @@ -825,19 +851,22 @@ static void do_set_compat(void *arg) > ((cpuver) =3D=3D CPU_POWERPC_LOGICAL_2_06_PLUS) ? 2061 : \ > ((cpuver) =3D=3D CPU_POWERPC_LOGICAL_2_07) ? 2070 : 0) > =20 > +#define OV5_DRCONF_MEMORY 0x20 > + > static target_ulong h_client_architecture_support(PowerPCCPU *cpu_, > sPAPREnvironment *spap= r, > target_ulong opcode, > target_ulong *args) > { > - target_ulong list =3D args[0]; > + target_ulong list =3D args[0], ov_table; > PowerPCCPUClass *pcc_ =3D POWERPC_CPU_GET_CLASS(cpu_); > CPUState *cs; > - bool cpu_match =3D false; > + bool cpu_match =3D false, cpu_update =3D true, memory_update =3D fal= se; > unsigned old_cpu_version =3D cpu_->cpu_version; > unsigned compat_lvl =3D 0, cpu_version =3D 0; > unsigned max_lvl =3D get_compat_level(cpu_->max_compat); > int counter; > + char ov5_byte2; > =20 > /* Parse PVR list */ > for (counter =3D 0; counter < 512; ++counter) { > @@ -887,8 +916,6 @@ static target_ulong h_client_architecture_support(Pow= erPCCPU *cpu_, > } > } > =20 > - /* For the future use: here @list points to the first capability */ > - > /* Parsing finished */ > trace_spapr_cas_pvr(cpu_->cpu_version, cpu_match, > cpu_version, pcc_->pcr_mask); > @@ -912,14 +939,26 @@ static target_ulong h_client_architecture_support(P= owerPCCPU *cpu_, > } > =20 > if (!cpu_version) { > - return H_SUCCESS; > + cpu_update =3D false; > } > =20 > + /* For the future use: here @ov_table points to the first option vec= tor */ > + ov_table =3D list; > + > + list =3D cas_get_option_vector(5, ov_table); What's the literal 5 mean? > if (!list) { > return H_SUCCESS; > } > =20 > - if (spapr_h_cas_compose_response(args[1], args[2])) { > + /* @list now points to OV 5 */ > + list +=3D 2; > + ov5_byte2 =3D rtas_ld(list, 0) >> 24; > + if (ov5_byte2 & OV5_DRCONF_MEMORY) { > + memory_update =3D true; > + } > + > + if (spapr_h_cas_compose_response(args[1], args[2], cpu_update, > + memory_update)) { > qemu_system_reset_request(); > } > =20 > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index 64681c4..10283f9 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -485,9 +485,19 @@ struct sPAPRTCETable { > /* Support a min of 1TB hotplug memory assuming 256MB per slot */ > #define SPAPR_MAX_RAM_SLOTS (1ULL << 12) > =20 > +/* > + * Number of 32 bit words in each LMB list entry in ibm,dynamic-memory > + * property under ibm,dynamic-reconfiguration-memory node. > + */ > +#define DR_LMB_LIST_ENTRY_SIZE 6 > + > +/* Flag values in Option Vector 5 ibm architecture vector table. */ > +#define LMB_FLAGS_ASSIGNED 0x00000008 Things declared in a public header should have some sort of namespacing, so, SPAPR_DR_LBM_LIST_ENTRY_SIZE for example. > void spapr_events_init(sPAPREnvironment *spapr); > void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq); > -int spapr_h_cas_compose_response(target_ulong addr, target_ulong size); > +int spapr_h_cas_compose_response(target_ulong addr, target_ulong size, > + bool cpu_update, bool memory_update); > sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn, > uint64_t bus_offset, > uint32_t page_shift, --=20 David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson --i7F3eY7HS/tUJxUd Content-Type: application/pgp-signature -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQIcBAEBAgAGBQJU3EHwAAoJEGw4ysog2bOSIl0QAKMqPnDSj6UC0URw9Pls6eIp DHJvUzyLMz0O7hTHiaOp+vrFZfhJiReWCq20vnYv2CPymoYYvU3s6G2NBith8+YS YFGgtzpFgGPArN5tBzWc+Fp4Zdfs8eMruhWlYp1SptrgYx4qxMKATvypSadEAWGs QgV6cALNgy1X9C0ct5CeDRsyl6no6GX5Y19qLrEk3NVVLR6Cn/s7fFOfZ9f/9511 dr8z/ln8R2+HUB8xd69/MyChXSJ0caBD3o9/XIlYy6ggfdck63UkNL48pKnKPeVu obES858w6Hhfxi6UfNWKJs+8SwIlSoo31oLhvG20bvsIvoRDv6/zzPCCIwpS1HL1 iHheiCd/7a2UyVaKaQqieSzApKP9b7vmLmhj2uNI2VvFUvSOJjztWGdR+84MgAzt Yyngxu7HmGdYyD3jGPCHQgduvI8XOoSBqIsjnmPez3qTay8Bns5T2fWUyyuH12dk Ych0Qxf7U8CMbrZblwmOQEimCKpXDvalPXvfsKpQcUH4NVTg3zU+M29FFE9CUd+C BtffEEXxWWhUw9PIyXi9+shg3ZbgpcqStquaBR8uPPw/piZII18M82VowSPiuVes ZBZxAjqeCleBtWNs/GcxxrPiNZ7SwN74bqXcCp6BHLCUVdQco5fw1lEtiK3bhYuY HlBNe6NbRrznzPn+UbEV =tA1G -----END PGP SIGNATURE----- --i7F3eY7HS/tUJxUd--