Date: Fri, 23 Jan 2015 13:41:38 +0100
From: Igor Mammedov
Message-ID: <20150123134138.34334599@nial.brq.redhat.com>
In-Reply-To: <1420697420-16053-7-git-send-email-bharata@linux.vnet.ibm.com>
References: <1420697420-16053-1-git-send-email-bharata@linux.vnet.ibm.com>
 <1420697420-16053-7-git-send-email-bharata@linux.vnet.ibm.com>
Subject: Re: [Qemu-devel] [RFC PATCH v1 06/13] spapr: CPU hotplug support
To: Bharata B Rao
Cc: agraf@suse.de, qemu-devel@nongnu.org, mdroth@linux.vnet.ibm.com

On Thu, 8 Jan 2015 11:40:13 +0530
Bharata B Rao wrote:

> Support CPU hotplug via device-add command. Use the existing EPOW event
> infrastructure to send CPU hotplug notification to the guest.
>
> Signed-off-by: Bharata B Rao
> ---
>  hw/ppc/spapr.c              | 205 +++++++++++++++++++++++++++++++++++++++++++-
>  hw/ppc/spapr_events.c       |   8 +-
>  target-ppc/translate_init.c |   6 ++
>  3 files changed, 215 insertions(+), 4 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 515d770..a293a59 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -330,6 +330,8 @@ static void add_str(GString *s, const gchar *s1)
>      g_string_append_len(s, s1, strlen(s1) + 1);
>  }
>
> +uint32_t cpus_per_socket;

static ???

> +
>  static void *spapr_create_fdt_skel(hwaddr initrd_base,
>                                     hwaddr initrd_size,
>                                     hwaddr kernel_size,
> @@ -350,9 +352,9 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base,
>      unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
>      QemuOpts *opts = qemu_opts_find(qemu_find_opts("smp-opts"), NULL);
>      unsigned sockets = opts ? qemu_opt_get_number(opts, "sockets", 0) : 0;
> -    uint32_t cpus_per_socket = sockets ? (smp_cpus / sockets) : 1;
>      char *buf;
>
> +    cpus_per_socket = sockets ? (smp_cpus / sockets) : 1;
>      add_str(hypertas, "hcall-pft");
>      add_str(hypertas, "hcall-term");
>      add_str(hypertas, "hcall-dabr");
> @@ -1744,12 +1746,209 @@ static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
>      }
>  }
>
> +/* TODO: Duplicates code from spapr_create_fdt_skel(), Fix this */
> +static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
> +                                  int drc_index)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    CPUPPCState *env = &cpu->env;
> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
> +    int index = ppc_get_vcpu_dt_id(cpu);
> +    uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
> +                       0xffffffff, 0xffffffff};
> +    uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ;
> +    uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
> +    uint32_t page_sizes_prop[64];
> +    size_t page_sizes_prop_size;
> +    int smpt = ppc_get_compat_smt_threads(cpu);
> +    uint32_t servers_prop[smpt];
> +    uint32_t gservers_prop[smpt * 2];
> +    int i;
> +    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
> +    uint32_t associativity[] = {cpu_to_be32(0x5),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(0x0),
> +                                cpu_to_be32(cs->numa_node),
> +                                cpu_to_be32(index)};
> +
> +    _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
> +    _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
> +
> +    _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));
> +    _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size",
> +                           env->dcache_line_size)));
> +    _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size",
> +                           env->dcache_line_size)));
> +    _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size",
> +                           env->icache_line_size)));
> +    _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size",
> +                           env->icache_line_size)));
> +
> +    if (pcc->l1_dcache_size) {
> +        _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size",
> +                               pcc->l1_dcache_size)));
> +    } else {
> +        fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n");
> +    }
> +    if (pcc->l1_icache_size) {
> +        _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size",
> +                               pcc->l1_icache_size)));
> +    } else {
> +        fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n");
> +    }
> +
> +    _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq)));
> +    _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq)));
> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", env->slb_nr)));
> +    _FDT((fdt_setprop_string(fdt, offset, "status", "okay")));
> +    _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0)));
> +
> +    if (env->spr_cb[SPR_PURR].oea_read) {
> +        _FDT((fdt_setprop(fdt, offset, "ibm,purr", NULL, 0)));
> +    }
> +
> +    if (env->mmu_model & POWERPC_MMU_1TSEG) {
> +        _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes",
> +                          segs, sizeof(segs))));
> +    }
> +
> +    /* Advertise VMX/VSX (vector extensions) if available
> +     *   0 / no property == no vector extensions
> +     *   1               == VMX / Altivec available
> +     *   2               == VSX available */
> +    if (env->insns_flags & PPC_ALTIVEC) {
> +        uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1;
> +
> +        _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", vmx)));
> +    }
> +
> +    /* Advertise DFP (Decimal Floating Point) if available
> +     *   0 / no property == no DFP
> +     *   1               == DFP available */
> +    if (env->insns_flags2 & PPC2_DFP) {
> +        _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
> +    }
> +
> +    page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop,
> +                                                  sizeof(page_sizes_prop));
> +    if (page_sizes_prop_size) {
> +        _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes",
> +                          page_sizes_prop, page_sizes_prop_size)));
> +    }
> +
> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
> +                           cs->cpu_index / cpus_per_socket)));
> +
> +    _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
> +
> +    /* Build interrupt servers and gservers properties */
> +    for (i = 0; i < smpt; i++) {
> +        servers_prop[i] = cpu_to_be32(index + i);
> +        /* Hack, direct the group queues back to cpu 0 */
> +        gservers_prop[i*2] = cpu_to_be32(index + i);
> +        gservers_prop[i*2 + 1] = 0;
> +    }
> +    _FDT(fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
> +                     servers_prop, sizeof(servers_prop)));
> +    _FDT(fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
> +                     gservers_prop, sizeof(gservers_prop)));
> +    _FDT(fdt_setprop(fdt, offset, "ibm,pft-size",
> +                     pft_size_prop, sizeof(pft_size_prop)));
> +
> +    if (nb_numa_nodes > 1) {
> +        _FDT(fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> +                         sizeof(associativity)));
> +    }
> +}
> +
> +static void spapr_cpu_hotplug_add(DeviceState *dev, CPUState *cs)
> +{
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    DeviceClass *dc = DEVICE_GET_CLASS(cs);
> +    int id = ppc_get_vcpu_dt_id(cpu);
> +    sPAPRDRConnector *drc =
> +        spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, id);
> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +    int drc_index = drck->get_index(drc);
> +    void *fdt, *fdt_orig;
> +    int offset, i;
> +    char *nodename;
> +
> +    /* add OF node for CPU and required OF DT properties */
> +    fdt_orig = g_malloc0(FDT_MAX_SIZE);
> +    offset = fdt_create(fdt_orig, FDT_MAX_SIZE);
> +    fdt_begin_node(fdt_orig, "");
> +    fdt_end_node(fdt_orig);
> +    fdt_finish(fdt_orig);
> +
> +    fdt = g_malloc0(FDT_MAX_SIZE);
> +    fdt_open_into(fdt_orig, fdt, FDT_MAX_SIZE);
> +
> +    nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
> +
> +    offset = fdt_add_subnode(fdt, offset, nodename);
> +
> +    /* Set NUMA node for the added CPU */
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        if (test_bit(cs->cpu_index, numa_info[i].node_cpu)) {
> +            cs->numa_node = i;
> +            break;
> +        }
> +    }
> +
> +    spapr_populate_cpu_dt(cs, fdt, offset, drc_index);
> +    g_free(fdt_orig);
> +    g_free(nodename);
> +
> +    drck->attach(drc, dev, fdt, offset, false);
> +}
> +
> +static void spapr_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> +                           Error **errp)
> +{
> +    Error *local_err = NULL;
> +    CPUState *cs = CPU(dev);
> +    PowerPCCPU *cpu = POWERPC_CPU(cs);
> +    sPAPRDRConnector *drc =
> +        spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, cpu->cpu_dt_id);

Just a rant: does this have any relation to hotplug_dev? The point here is to
get these data from the hotplug_dev object (or some child of it) rather than
via a direct ad-hoc call.

> +
> +    /* TODO: Check if DR is enabled ? */
> +    g_assert(drc);
> +
> +    spapr_cpu_reset(POWERPC_CPU(CPU(dev)));

The reset should probably be done at realize time; see x86_cpu_realizefn() for
an example.
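
A minimal sketch of what I mean, assuming the respin keeps a per-CPU realize
path (the body below is condensed and only illustrative, it is not the code
from this series):

    static void ppc_cpu_realizefn(DeviceState *dev, Error **errp)
    {
        CPUState *cs = CPU(dev);

        /* ... existing realize work: feature checks, qemu_init_vcpu(), ... */

        /*
         * Reset the CPU here, as x86_cpu_realizefn() does, so the machine's
         * plug handler only has to wire the device up and notify the guest.
         */
        cpu_reset(cs);
    }

Whether that ends up as a plain cpu_reset() in realize or as registering
spapr_cpu_reset() via qemu_register_reset() is a detail for the respin.
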
> +    spapr_cpu_hotplug_add(dev, cs);
> +    spapr_hotplug_req_add_event(drc);
> +    error_propagate(errp, local_err);
> +    return;
> +}
> +
> +static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
> +                                      DeviceState *dev, Error **errp)
> +{
> +    if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
> +        if (dev->hotplugged) {
> +            spapr_cpu_plug(hotplug_dev, dev, errp);

It would be nicer if this could do the wiring of cold-plugged CPUs too.

> +        }
> +    }
> +}
> +
> +static HotplugHandler *spapr_get_hotpug_handler(MachineState *machine,
> +                                                DeviceState *dev)
> +{
> +    if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
> +        return HOTPLUG_HANDLER(machine);
> +    }
> +    return NULL;
> +}
> +
>  static void spapr_machine_class_init(ObjectClass *oc, void *data)
>  {
>      MachineClass *mc = MACHINE_CLASS(oc);
>      sPAPRMachineClass *smc = SPAPR_MACHINE_CLASS(oc);
>      FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
>      NMIClass *nc = NMI_CLASS(oc);
> +    HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
>
>      mc->init = ppc_spapr_init;
>      mc->reset = ppc_spapr_reset;
> @@ -1759,6 +1958,9 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>      mc->default_boot_order = NULL;
>      mc->kvm_type = spapr_kvm_type;
>      mc->has_dynamic_sysbus = true;
> +    mc->get_hotplug_handler = spapr_get_hotpug_handler;
> +
> +    hc->plug = spapr_machine_device_plug;
>      smc->dr_phb_enabled = false;
>      smc->dr_cpu_enabled = false;
>      smc->dr_lmb_enabled = false;
> @@ -1778,6 +1980,7 @@ static const TypeInfo spapr_machine_info = {
>      .interfaces = (InterfaceInfo[]) {
>          { TYPE_FW_PATH_PROVIDER },
>          { TYPE_NMI },
> +        { TYPE_HOTPLUG_HANDLER },
>          { }
>      },
>  };
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 434a75d..035d8c9 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -364,14 +364,16 @@ static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action)
>      hp->hdr.section_length = cpu_to_be16(sizeof(*hp));
>      hp->hdr.section_version = 1; /* includes extended modifier */
>      hp->hotplug_action = hp_action;
> -
> +    hp->drc.index = cpu_to_be32(drck->get_index(drc));
> +    hp->hotplug_identifier = RTAS_LOG_V6_HP_ID_DRC_INDEX;
>
>      switch (drc_type) {
>      case SPAPR_DR_CONNECTOR_TYPE_PCI:
> -        hp->drc.index = cpu_to_be32(drck->get_index(drc));
> -        hp->hotplug_identifier = RTAS_LOG_V6_HP_ID_DRC_INDEX;
>          hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_PCI;
>          break;
> +    case SPAPR_DR_CONNECTOR_TYPE_CPU:
> +        hp->hotplug_type = RTAS_LOG_V6_HP_TYPE_CPU;
> +        break;
>      default:
>          /* skip notification for unknown connector types */
>          g_free(new_hp);
> diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c
> index 9c642a5..cf9d8d3 100644
> --- a/target-ppc/translate_init.c
> +++ b/target-ppc/translate_init.c
> @@ -32,6 +32,7 @@
>  #include "hw/qdev-properties.h"
>  #include "hw/ppc/spapr.h"
>  #include "hw/ppc/ppc.h"
> +#include "sysemu/sysemu.h"
>
>  //#define PPC_DUMP_CPU
>  //#define PPC_DEBUG_SPR
> @@ -8909,6 +8910,11 @@ static void ppc_cpu_realizefn(DeviceState *dev, Error **errp)
>          return;
>      }
>
> +    if (cs->cpu_index >= max_cpus) {

Please note that cpu_index increases monotonically, so when you do an unplug
followed by a plug it will go above max_cpus; the same happens if one
device_add fails in the middle. In either case the next CPU add will fail
because cs->cpu_index goes overboard. I'd suggest not relying on cpu_index
for any purpose and using some other means to identify where the CPU is
plugged in.
On x86 we are slowly getting rid of this dependency in favor of the apic_id
(topology information); eventually it could become:

  -device cpu_foo,socket=X,core=Y[,thread=Z][,node=N]

You could probably do the same; a rough sketch of that direction follows at
the end of this mail. It doesn't have to be in this series, just be aware of
the potential issues.

> +        error_setg(errp, "Can't have more CPUs, maxcpus limit reached");
> +        return;
> +    }
> +
>      cpu->cpu_dt_id = (cs->cpu_index / smp_threads) * max_smt
>          + (cs->cpu_index % smp_threads);
>  #endif
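
To illustrate the topology-property direction mentioned above, here is a
purely hypothetical sketch: none of these fields or property names exist in
PowerPCCPU today, they are only meant to show the shape of the interface.

    /*
     * Hypothetical: describe the plug location with qdev properties so that
     * device_add says where the CPU goes, instead of deriving the slot from
     * the monotonically growing cpu_index.
     */
    static Property ppc_cpu_topology_props[] = {
        DEFINE_PROP_UINT32("socket-id", PowerPCCPU, socket_id, 0),
        DEFINE_PROP_UINT32("core-id",   PowerPCCPU, core_id,   0),
        DEFINE_PROP_UINT32("thread-id", PowerPCCPU, thread_id, 0),
        DEFINE_PROP_UINT32("node-id",   PowerPCCPU, node_id,   0),
        DEFINE_PROP_END_OF_LIST(),
    };

With something like that in place the command line from above would become

    -device cpu_foo,socket-id=X,core-id=Y,thread-id=Z,node-id=N

and the machine code could compute the DRC index and device tree id from the
topology rather than from cs->cpu_index.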