On Tue, Mar 23, 2021 at 09:47:38AM -0400, Shivaprasad G Bhat wrote:
> The patch adds support for the SCM flush hcall for the nvdimm devices.
> To be available for exploitation by guest through the next patch.
> 
> The hcall expects the semantics such that the flush to return
> with H_BUSY when the operation is expected to take longer time along
> with a continue_token. The hcall to be called again providing the
> continue_token to get the status. So, all fresh requsts are put into
> a 'pending' list and flush worker is submitted to the thread pool.
> The thread pool completion callbacks move the requests to 'completed'
> list, which are cleaned up after reporting to guest in subsequent
> hcalls to get the status.
> 
> The semantics makes it necessary to preserve the continue_tokens
> and their return status even across migrations. So, the pre_save
> handler for the device waits for the flush worker to complete and
> collects all the hcall states from 'completed' list. The necessary
> nvdimm flush specific vmstate structures are added to the spapr
> machine vmstate.
> 
> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>

An overal question: surely the same issue must arise on x86 with
file-backed NVDIMMs.  How do they handle this case?

> ---
>  hw/ppc/spapr.c                |    6 +
>  hw/ppc/spapr_nvdimm.c         |  240 +++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr.h        |   11 ++
>  include/hw/ppc/spapr_nvdimm.h |   12 ++
>  4 files changed, 268 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index d56418ca29..fdb0c73a2c 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1607,6 +1607,8 @@ static void spapr_machine_reset(MachineState *machine)
>          spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
>      }
>  
> +    spapr_nvdimm_finish_flushes();
> +
>      /* DRC reset may cause a device to be unplugged. This will cause troubles
>       * if this device is used by another device (eg, a running vhost backend
>       * will crash QEMU if the DIMM holding the vring goes away). To avoid such
> @@ -2003,6 +2005,7 @@ static const VMStateDescription vmstate_spapr = {
>          &vmstate_spapr_cap_ccf_assist,
>          &vmstate_spapr_cap_fwnmi,
>          &vmstate_spapr_fwnmi,
> +        &vmstate_spapr_nvdimm_flush_states,
>          NULL
>      }
>  };
> @@ -2997,6 +3000,9 @@ static void spapr_machine_init(MachineState *machine)
>      }
>  
>      qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
> +    qemu_mutex_init(&spapr->spapr_nvdimm_flush_states_lock);

Do you actually need an extra mutex, or can you rely on the BQL?

> +    QLIST_INIT(&spapr->pending_flush_states);
> +    QLIST_INIT(&spapr->completed_flush_states);
>  }
>  
>  #define DEFAULT_KVM_TYPE "auto"
> diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
> index 8cf3fb2ffb..883317c1ed 100644
> --- a/hw/ppc/spapr_nvdimm.c
> +++ b/hw/ppc/spapr_nvdimm.c
> @@ -22,14 +22,17 @@
>   * THE SOFTWARE.
>   */
>  #include "qemu/osdep.h"
> +#include "qemu/cutils.h"
>  #include "qapi/error.h"
>  #include "hw/ppc/spapr_drc.h"
>  #include "hw/ppc/spapr_nvdimm.h"
>  #include "hw/mem/nvdimm.h"
> +#include "qemu/guest-random.h"
>  #include "qemu/nvdimm-utils.h"
>  #include "hw/ppc/fdt.h"
>  #include "qemu/range.h"
>  #include "hw/ppc/spapr_numa.h"
> +#include "block/thread-pool.h"
>  
>  /*
>   * The nvdimm size should be aligned to SCM block size.
> @@ -371,6 +374,242 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
>      return H_SUCCESS;
>  }
>  
> +static const VMStateDescription vmstate_spapr_nvdimm_entry = {
> +     .name = "spapr_nvdimm_states",
> +     .version_id = 1,
> +     .minimum_version_id = 1,
> +     .fields = (VMStateField[]) {
> +         VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState),
> +         VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState),
> +         VMSTATE_END_OF_LIST()
> +     },
> +};
> +
> +static bool spapr_nvdimm_states_needed(void *opaque)
> +{
> +     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +
> +     return (!QLIST_EMPTY(&spapr->pending_flush_states) ||
> +             !QLIST_EMPTY(&spapr->completed_flush_states));
> +}
> +
> +static int spapr_nvdimm_pre_save(void *opaque)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +
> +    while (!QLIST_EMPTY(&spapr->pending_flush_states)) {
> +        aio_poll(qemu_get_aio_context(), true);

Hmm... how long could waiting for all the pending flushes to complete
take?  This could add substanially to the guest's migration downtime,
couldn't it?

> +    }
> +
> +    return 0;
> +}
> +
> +const VMStateDescription vmstate_spapr_nvdimm_flush_states = {
> +    .name = "spapr_nvdimm_hcall_states",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = spapr_nvdimm_states_needed,
> +    .pre_save = spapr_nvdimm_pre_save,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_QLIST_V(completed_flush_states, SpaprMachineState, 1,
> +                        vmstate_spapr_nvdimm_entry,
> +                        SpaprNVDIMMDeviceFlushState, node),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
> +/*
> + * Acquire a unique token and reserve it for the new flush state.
> + */
> +static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(void)
> +{
> +    Error *err = NULL;
> +    uint64_t token;
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    SpaprNVDIMMDeviceFlushState *tmp, *next, *state;
> +
> +    state = g_malloc0(sizeof(*state));
> +
> +    qemu_mutex_lock(&spapr->spapr_nvdimm_flush_states_lock);
> +retry:
> +    if (qemu_guest_getrandom(&token, sizeof(token), &err) < 0) {

Using getrandom seems like overkill, why not just use a counter?

> +        error_report_err(err);
> +        g_free(state);
> +        qemu_mutex_unlock(&spapr->spapr_nvdimm_flush_states_lock);
> +        return NULL;
> +    }
> +
> +    if (!token) /* Token should be non-zero */
> +        goto retry;
> +
> +    /* If the token already in use, get a new one */
> +    QLIST_FOREACH_SAFE(tmp, &(spapr->pending_flush_states), node, next) {
> +        if (tmp->continue_token == token) {
> +            goto retry;
> +        }
> +    }
> +    QLIST_FOREACH_SAFE(tmp, &(spapr->completed_flush_states), node, next) {
> +        if (tmp->continue_token == token) {
> +            goto retry;
> +        }
> +    }
> +
> +    state->continue_token = token;
> +    QLIST_INSERT_HEAD(&spapr->pending_flush_states, state, node);
> +
> +    qemu_mutex_unlock(&spapr->spapr_nvdimm_flush_states_lock);
> +
> +    return state;
> +}
> +
> +/*
> + * spapr_nvdimm_finish_flushes
> + *      Waits for all pending flush requests to complete
> + *      their execution and free the states
> + */
> +void spapr_nvdimm_finish_flushes(void)
> +{
> +    SpaprNVDIMMDeviceFlushState *state, *next;
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());

The caller has natural access to the machine, so pass it in rather
than using the global.

> +
> +    /*
> +     * No contention here when called on reset path, the main loop thread
> +     * which calls the pending BHs has gotten out running in the reset path,
> +     * finally reaching here. Other code path being guest
> +     * h_client_architecture_support, thats early boot up.
> +     */
> +
> +    while (!QLIST_EMPTY(&spapr->pending_flush_states)) {
> +        aio_poll(qemu_get_aio_context(), true);
> +    }
> +
> +    QLIST_FOREACH_SAFE(state, &spapr->completed_flush_states, node, next) {
> +        QLIST_REMOVE(state, node);
> +        g_free(state);
> +    }
> +}
> +
> +/*
> + * spapr_nvdimm_get_hcall_status
> + *      Fetches the status of the hcall worker and returns H_BUSY
> + *      if the worker is still running.
> + */
> +static int spapr_nvdimm_get_flush_status(uint64_t token)
> +{
> +    int ret = H_LONG_BUSY_ORDER_10_MSEC;
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());

The callers have natural access to spapr, so pass it in rather than
using the global.

> +    SpaprNVDIMMDeviceFlushState *state, *node;
> +
> +    qemu_mutex_lock(&spapr->spapr_nvdimm_flush_states_lock);
> +    QLIST_FOREACH_SAFE(state, &spapr->pending_flush_states, node, node) {
> +        if (state->continue_token == token) {
> +            goto exit;
> +        }
> +    }
> +    ret = H_P2; /* If not found in complete list too, invalid token */
> +    QLIST_FOREACH_SAFE(state, &spapr->completed_flush_states, node, node) {
> +        if (state->continue_token == token) {
> +            ret = state->hcall_ret;
> +            QLIST_REMOVE(state, node);
> +            g_free(state);
> +            break;
> +        }
> +    }
> +exit:
> +    qemu_mutex_unlock(&spapr->spapr_nvdimm_flush_states_lock);
> +
> +    return ret;
> +}
> +
> +static int flush_worker_cb(void *opaque)
> +{
> +    int ret = H_SUCCESS;
> +    SpaprNVDIMMDeviceFlushState *state = opaque;
> +
> +    /* flush raw backing image */
> +    if (qemu_fdatasync(state->backend_fd) < 0) {
> +        error_report("papr_scm: Could not sync nvdimm to backend file: %s",
> +                     strerror(errno));
> +        ret = H_HARDWARE;
> +    }
> +
> +    return ret;
> +}
> +
> +static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    SpaprNVDIMMDeviceFlushState *state = opaque;
> +
> +    qemu_mutex_lock(&spapr->spapr_nvdimm_flush_states_lock);
> +
> +    state->hcall_ret = hcall_ret;
> +    QLIST_REMOVE(state, node);
> +    QLIST_INSERT_HEAD(&spapr->completed_flush_states, state, node);
> +
> +    qemu_mutex_unlock(&spapr->spapr_nvdimm_flush_states_lock);
> +}
> +
> +/*
> + * H_SCM_FLUSH
> + * Input: drc_index, continue-token
> + * Out: continue-token
> + * Return Value: H_SUCCESS, H_Parameter, H_P2, H_BUSY
> + *
> + * Given a DRC Index Flush the data to backend NVDIMM device.
> + * The hcall returns H_BUSY when the flush takes longer time and the hcall

It returns one of the H_LONG_BUSY values, not actual H_BUSY, doesn't
it?


> + * needs to be issued multiple times in order to be completely serviced.
> + * The continue-token from the output to be passed in the argument list of
> + * subsequent hcalls until the hcall is completely serviced at which
> + * point H_SUCCESS or other error is returned.
> + */
> +static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                      target_ulong opcode, target_ulong *args)
> +{
> +    int ret;
> +    uint32_t drc_index = args[0];
> +    uint64_t continue_token = args[1];
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    PCDIMMDevice *dimm;
> +    HostMemoryBackend *backend = NULL;
> +    SpaprNVDIMMDeviceFlushState *state;
> +    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
> +
> +    if (!drc || !drc->dev ||
> +        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (continue_token != 0) {
> +        ret = spapr_nvdimm_get_flush_status(continue_token);
> +        if (H_IS_LONG_BUSY(ret)) {
> +            args[0] = continue_token;
> +        }
> +
> +        return ret;
> +    }
> +
> +    dimm = PC_DIMM(drc->dev);
> +    backend = MEMORY_BACKEND(dimm->hostmem);
> +
> +    state = spapr_nvdimm_init_new_flush_state();
> +    if (!state) {
> +        return H_P2;

AFAICT the only way init_new_flush_state() fails is a failure in the
RNG, which definitely isn't a parameter problem.

> +    }
> +
> +    state->backend_fd = memory_region_get_fd(&backend->mr);

Is this guaranteed to return a usable fd in all configurations?

> +    thread_pool_submit_aio(pool, flush_worker_cb, state,
> +                           spapr_nvdimm_flush_completion_cb, state);
> +
> +    ret = spapr_nvdimm_get_flush_status(state->continue_token);
> +    if (H_IS_LONG_BUSY(ret)) {
> +        args[0] = state->continue_token;
> +    }
> +
> +    return ret;

I believe you can rearrange this so the get_flush_status / check /
return is shared between the args[0] == 0 and args[0] == token paths.

> +}
> +
>  static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
>                                       target_ulong opcode, target_ulong *args)
>  {
> @@ -487,6 +726,7 @@ static void spapr_scm_register_types(void)
>      spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
>      spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
>      spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all);
> +    spapr_register_hypercall(H_SCM_FLUSH, h_scm_flush);
>  }
>  
>  type_init(spapr_scm_register_types)
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 47cebaf3ac..7c27fb3e2d 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -12,10 +12,12 @@
>  #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
>  #include "hw/ppc/xics.h"        /* For ICSState */
>  #include "hw/ppc/spapr_tpm_proxy.h"
> +#include "hw/ppc/spapr_nvdimm.h"
>  
>  struct SpaprVioBus;
>  struct SpaprPhbState;
>  struct SpaprNvram;
> +struct SpaprNVDIMMDeviceFlushState;
>  
>  typedef struct SpaprEventLogEntry SpaprEventLogEntry;
>  typedef struct SpaprEventSource SpaprEventSource;
> @@ -245,6 +247,12 @@ struct SpaprMachineState {
>      uint32_t numa_assoc_array[MAX_NODES + NVGPU_MAX_NUM][NUMA_ASSOC_SIZE];
>  
>      Error *fwnmi_migration_blocker;
> +
> +    /* nvdimm flush states */
> +    QemuMutex spapr_nvdimm_flush_states_lock;
> +    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_flush_states;
> +    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_flush_states;
> +
>  };
>  
>  #define H_SUCCESS         0
> @@ -538,8 +546,9 @@ struct SpaprMachineState {
>  #define H_SCM_BIND_MEM          0x3EC
>  #define H_SCM_UNBIND_MEM        0x3F0
>  #define H_SCM_UNBIND_ALL        0x3FC
> +#define H_SCM_FLUSH             0x44C
>  
> -#define MAX_HCALL_OPCODE        H_SCM_UNBIND_ALL
> +#define MAX_HCALL_OPCODE        H_SCM_FLUSH
>  
>  /* The hcalls above are standardized in PAPR and implemented by pHyp
>   * as well.
> diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h
> index abcacda5d7..c88df2c590 100644
> --- a/include/hw/ppc/spapr_nvdimm.h
> +++ b/include/hw/ppc/spapr_nvdimm.h
> @@ -11,6 +11,7 @@
>  #define HW_SPAPR_NVDIMM_H
>  
>  #include "hw/mem/nvdimm.h"
> +#include "migration/vmstate.h"
>  
>  struct SpaprDrc;
>  struct SpaprMachineState;
> @@ -22,5 +23,16 @@ void spapr_dt_persistent_memory(struct SpaprMachineState *spapr, void *fdt);
>  bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
>                             uint64_t size, Error **errp);
>  void spapr_add_nvdimm(DeviceState *dev, uint64_t slot);
> +void spapr_nvdimm_finish_flushes(void);
> +
> +typedef struct SpaprNVDIMMDeviceFlushState {
> +    uint64_t continue_token;
> +    int64_t hcall_ret;
> +    int backend_fd;
> +
> +    QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node;
> +} SpaprNVDIMMDeviceFlushState;
> +
> +extern const VMStateDescription vmstate_spapr_nvdimm_flush_states;
>  
>  #endif
> 
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson