On Mon, May 13, 2019 at 04:28:36AM -0500, Shivaprasad G Bhat wrote:
> This patch implements few of the necessary hcalls for the nvdimm support.
> 
> PAPR semantics is such that each NVDIMM device is comprising of multiple
> SCM(Storage Class Memory) blocks. The guest requests the hypervisor to bind
> each of the SCM blocks of the NVDIMM device using hcalls. There can be
> SCM block unbind requests in case of driver errors or unplug(not supported now)
> use cases. The NVDIMM label read/writes are done through hcalls.
> 
> Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind,
> unbind, and queries using hcalls on those blocks can come independently. This
> doesn't fit well into the qemu device semantics, where the map/unmap are done
> at the (whole)device/object level granularity. The patch doesnt actually
> bind/unbind on hcalls but let it happen at the object_add/del phase itself
> instead.
> 
> The guest kernel makes bind/unbind requests for the virtual NVDIMM device at the
> region level granularity. Without interleaving, each virtual NVDIMM device is
> presented as separate region. There is no way to configure the virtual NVDIMM
> interleaving for the guests today. So, there is no way a partial bind/unbind
> request can come for the vNVDIMM in a hcall for a subset of SCM blocks of a
> virtual NVDIMM. Hence it is safe to do bind/unbind everything during the
> object_add/del.
> 
> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
> ---
>  hw/ppc/spapr_hcall.c   |  202 ++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr.h |    7 +-
>  2 files changed, 208 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 6c16d2b120..b6e7d04dcf 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -3,11 +3,13 @@
>  #include "sysemu/hw_accel.h"
>  #include "sysemu/sysemu.h"
>  #include "qemu/log.h"
> +#include "qemu/range.h"
>  #include "qemu/error-report.h"
>  #include "cpu.h"
>  #include "exec/exec-all.h"
>  #include "helper_regs.h"
>  #include "hw/ppc/spapr.h"
> +#include "hw/ppc/spapr_drc.h"
>  #include "hw/ppc/spapr_cpu_core.h"
>  #include "mmu-hash64.h"
>  #include "cpu-models.h"
> @@ -16,6 +18,7 @@
>  #include "hw/ppc/spapr_ovec.h"
>  #include "mmu-book3s-v3.h"
>  #include "hw/mem/memory-device.h"
> +#include "hw/mem/nvdimm.h"
>  
>  static bool has_spr(PowerPCCPU *cpu, int spr)
>  {
> @@ -1795,6 +1798,199 @@ static target_ulong h_update_dt(PowerPCCPU *cpu, SpaprMachineState *spapr,
>      return H_SUCCESS;
>  }
>  
> +static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
> +                                        SpaprMachineState *spapr,
> +                                        target_ulong opcode,
> +                                        target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t offset = args[1];
> +    uint64_t numBytesToRead = args[2];
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    NVDIMMDevice *nvdimm = NULL;
> +    NVDIMMClass *ddc = NULL;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (numBytesToRead != 1 && numBytesToRead != 2 &&
> +        numBytesToRead != 4 && numBytesToRead != 8) {
> +        return H_P3;
> +    }
> +
> +    nvdimm = NVDIMM(drc->dev);
> +    if ((offset + numBytesToRead < offset) ||
> +        (nvdimm->label_size < numBytesToRead + offset)) {
> +        return H_P2;
> +    }
> +
> +    ddc = NVDIMM_GET_CLASS(nvdimm);
> +    ddc->read_label_data(nvdimm, &args[0], numBytesToRead, offset);

I'm pretty sure this will need some sort of byteswap.  args[0] is
effectively in host native order (since it maps to a register, not
memory).  But the guest will have to assume some byteorder (BE, I'm
guessing, because PAPR) in order to map that register to an in-memory
byteorder.  So, you'll need to compensate for that here.

> +    return H_SUCCESS;
> +}
> +
> +
> +static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
> +                                         SpaprMachineState *spapr,
> +                                         target_ulong opcode,
> +                                         target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t offset = args[1];
> +    uint64_t data = args[2];
> +    int8_t numBytesToWrite = args[3];
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    NVDIMMDevice *nvdimm = NULL;
> +    DeviceState *dev = NULL;
> +    NVDIMMClass *ddc = NULL;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (numBytesToWrite != 1 && numBytesToWrite != 2 &&
> +        numBytesToWrite != 4 && numBytesToWrite != 8) {
> +        return H_P4;
> +    }
> +
> +    dev = drc->dev;
> +    nvdimm = NVDIMM(dev);
> +    if ((nvdimm->label_size < numBytesToWrite + offset) ||
> +        (offset + numBytesToWrite < offset)) {
> +        return H_P2;
> +    }
> +
> +    ddc = NVDIMM_GET_CLASS(nvdimm);
> +    ddc->write_label_data(nvdimm, &data, numBytesToWrite, offset);

Likewise here.

> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                        target_ulong opcode,
> +                                        target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t starting_idx = args[1];
> +    uint64_t no_of_scm_blocks_to_bind = args[2];
> +    uint64_t target_logical_mem_addr = args[3];
> +    uint64_t continue_token = args[4];
> +    uint64_t size;
> +    uint64_t total_no_of_scm_blocks;
> +
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    hwaddr addr;
> +    DeviceState *dev = NULL;
> +    PCDIMMDevice *dimm = NULL;
> +    Error *local_err = NULL;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    dev = drc->dev;
> +    dimm = PC_DIMM(dev);
> +
> +    size = object_property_get_uint(OBJECT(dimm),
> +                                    PC_DIMM_SIZE_PROP, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return H_PARAMETER;
> +    }
> +
> +    total_no_of_scm_blocks = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +
> +    if ((starting_idx > total_no_of_scm_blocks) ||
> +        (no_of_scm_blocks_to_bind > total_no_of_scm_blocks)) {
> +        return H_P2;
> +    }
> +
> +    if (((starting_idx + no_of_scm_blocks_to_bind) < starting_idx) ||
> +        ((starting_idx + no_of_scm_blocks_to_bind) > total_no_of_scm_blocks)) {
> +        return H_P3;
> +    }
> +
> +    /* Currently qemu assigns the address. */
> +    if (target_logical_mem_addr != 0xffffffffffffffff) {
> +        return H_OVERLAP;

So, only supporting one mode of the interface is reasonable.  However,
it seems a somewhat unnatural way to handle the PAPR interface.  It
seems to me it would be more natural to instantiate the underlying
NVDIMM objects so that they're not in address_space_memory, but rather
in their own initially inaccssible address_space.  Then the BIND call
would alias a chunk of address_space_memory into the NVDIMMs address
space.

> +    }
> +
> +    /*
> +     * Currently continue token should be zero qemu has already bound
> +     * everything and this hcall doesnt return H_BUSY.
> +     */
> +    if (continue_token > 0) {
> +        return H_P5;
> +    }
> +
> +    /* NB : Already bound, Return target logical address in R4 */
> +    addr = object_property_get_uint(OBJECT(dimm),
> +                                    PC_DIMM_ADDR_PROP, &local_err);
> +    if (local_err) {
> +        error_report_err(local_err);
> +        return H_PARAMETER;
> +    }
> +
> +    args[1] = addr;

Don't you need to adjust this if start_idx is non-zero?

> +    args[2] = no_of_scm_blocks_to_bind;
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                        target_ulong opcode,
> +                                        target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t starting_scm_logical_addr = args[1];
> +    uint64_t no_of_scm_blocks_to_unbind = args[2];
> +    uint64_t size_to_unbind;
> +    uint64_t continue_token = args[3];
> +    Range blockrange = range_empty;
> +    Range nvdimmrange = range_empty;
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    DeviceState *dev = NULL;
> +    PCDIMMDevice *dimm = NULL;
> +    uint64_t size, addr;
> +
> +    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    /* Check if starting_scm_logical_addr is block aligned */
> +    if (!QEMU_IS_ALIGNED(starting_scm_logical_addr,
> +                         SPAPR_MINIMUM_SCM_BLOCK_SIZE)) {
> +        return H_P2;
> +    }
> +
> +    dev = drc->dev;
> +    dimm = PC_DIMM(dev);
> +    size = object_property_get_int(OBJECT(dimm), PC_DIMM_SIZE_PROP, NULL);
> +    addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, NULL);
> +
> +    range_init_nofail(&nvdimmrange, addr, size);
> +
> +    size_to_unbind = no_of_scm_blocks_to_unbind * SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +
> +
> +    range_init_nofail(&blockrange, starting_scm_logical_addr, size_to_unbind);
> +
> +    if (!range_contains_range(&nvdimmrange, &blockrange)) {
> +        return H_P3;
> +    }
> +
> +    if (continue_token > 0) {
> +        return H_P3;
> +    }
> +
> +    args[1] = no_of_scm_blocks_to_unbind;
> +
> +    /*NB : dont do anything, let object_del take care of this for now. */
> +    return H_SUCCESS;
> +}
> +
>  static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
>  static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX - KVMPPC_HCALL_BASE + 1];
>  
> @@ -1894,6 +2090,12 @@ static void hypercall_register_types(void)
>      /* qemu/KVM-PPC specific hcalls */
>      spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas);
>  
> +    /* qemu/scm specific hcalls */
> +    spapr_register_hypercall(H_SCM_READ_METADATA, h_scm_read_metadata);
> +    spapr_register_hypercall(H_SCM_WRITE_METADATA, h_scm_write_metadata);
> +    spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
> +    spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
> +
>      /* ibm,client-architecture-support support */
>      spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support);
>  
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 394ea26335..48e2cc9d67 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -283,6 +283,7 @@ struct SpaprMachineState {
>  #define H_P7              -60
>  #define H_P8              -61
>  #define H_P9              -62
> +#define H_OVERLAP         -68
>  #define H_UNSUPPORTED_FLAG -256
>  #define H_MULTI_THREADS_ACTIVE -9005
>  
> @@ -490,8 +491,12 @@ struct SpaprMachineState {
>  #define H_INT_ESB               0x3C8
>  #define H_INT_SYNC              0x3CC
>  #define H_INT_RESET             0x3D0
> +#define H_SCM_READ_METADATA     0x3E4
> +#define H_SCM_WRITE_METADATA    0x3E8
> +#define H_SCM_BIND_MEM          0x3EC
> +#define H_SCM_UNBIND_MEM        0x3F0
>  
> -#define MAX_HCALL_OPCODE        H_INT_RESET
> +#define MAX_HCALL_OPCODE        H_SCM_UNBIND_MEM
>  
>  /* The hcalls above are standardized in PAPR and implemented by pHyp
>   * as well.
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson