On Tue, Mar 23, 2021 at 09:47:38AM -0400, Shivaprasad G Bhat wrote: > The patch adds support for the SCM flush hcall for the nvdimm devices. > To be available for exploitation by guest through the next patch. > > The hcall expects the semantics such that the flush to return > with H_BUSY when the operation is expected to take longer time along > with a continue_token. The hcall to be called again providing the > continue_token to get the status. So, all fresh requsts are put into > a 'pending' list and flush worker is submitted to the thread pool. > The thread pool completion callbacks move the requests to 'completed' > list, which are cleaned up after reporting to guest in subsequent > hcalls to get the status. > > The semantics makes it necessary to preserve the continue_tokens > and their return status even across migrations. So, the pre_save > handler for the device waits for the flush worker to complete and > collects all the hcall states from 'completed' list. The necessary > nvdimm flush specific vmstate structures are added to the spapr > machine vmstate. > > Signed-off-by: Shivaprasad G Bhat An overal question: surely the same issue must arise on x86 with file-backed NVDIMMs. How do they handle this case? > --- > hw/ppc/spapr.c | 6 + > hw/ppc/spapr_nvdimm.c | 240 +++++++++++++++++++++++++++++++++++++++++ > include/hw/ppc/spapr.h | 11 ++ > include/hw/ppc/spapr_nvdimm.h | 12 ++ > 4 files changed, 268 insertions(+), 1 deletion(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index d56418ca29..fdb0c73a2c 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -1607,6 +1607,8 @@ static void spapr_machine_reset(MachineState *machine) > spapr->ov5_cas = spapr_ovec_clone(spapr->ov5); > } > > + spapr_nvdimm_finish_flushes(); > + > /* DRC reset may cause a device to be unplugged. This will cause troubles > * if this device is used by another device (eg, a running vhost backend > * will crash QEMU if the DIMM holding the vring goes away). To avoid such > @@ -2003,6 +2005,7 @@ static const VMStateDescription vmstate_spapr = { > &vmstate_spapr_cap_ccf_assist, > &vmstate_spapr_cap_fwnmi, > &vmstate_spapr_fwnmi, > + &vmstate_spapr_nvdimm_flush_states, > NULL > } > }; > @@ -2997,6 +3000,9 @@ static void spapr_machine_init(MachineState *machine) > } > > qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond); > + qemu_mutex_init(&spapr->spapr_nvdimm_flush_states_lock); Do you actually need an extra mutex, or can you rely on the BQL? > + QLIST_INIT(&spapr->pending_flush_states); > + QLIST_INIT(&spapr->completed_flush_states); > } > > #define DEFAULT_KVM_TYPE "auto" > diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c > index 8cf3fb2ffb..883317c1ed 100644 > --- a/hw/ppc/spapr_nvdimm.c > +++ b/hw/ppc/spapr_nvdimm.c > @@ -22,14 +22,17 @@ > * THE SOFTWARE. > */ > #include "qemu/osdep.h" > +#include "qemu/cutils.h" > #include "qapi/error.h" > #include "hw/ppc/spapr_drc.h" > #include "hw/ppc/spapr_nvdimm.h" > #include "hw/mem/nvdimm.h" > +#include "qemu/guest-random.h" > #include "qemu/nvdimm-utils.h" > #include "hw/ppc/fdt.h" > #include "qemu/range.h" > #include "hw/ppc/spapr_numa.h" > +#include "block/thread-pool.h" > > /* > * The nvdimm size should be aligned to SCM block size. > @@ -371,6 +374,242 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr, > return H_SUCCESS; > } > > +static const VMStateDescription vmstate_spapr_nvdimm_entry = { > + .name = "spapr_nvdimm_states", > + .version_id = 1, > + .minimum_version_id = 1, > + .fields = (VMStateField[]) { > + VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState), > + VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState), > + VMSTATE_END_OF_LIST() > + }, > +}; > + > +static bool spapr_nvdimm_states_needed(void *opaque) > +{ > + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); > + > + return (!QLIST_EMPTY(&spapr->pending_flush_states) || > + !QLIST_EMPTY(&spapr->completed_flush_states)); > +} > + > +static int spapr_nvdimm_pre_save(void *opaque) > +{ > + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); > + > + while (!QLIST_EMPTY(&spapr->pending_flush_states)) { > + aio_poll(qemu_get_aio_context(), true); Hmm... how long could waiting for all the pending flushes to complete take? This could add substanially to the guest's migration downtime, couldn't it? > + } > + > + return 0; > +} > + > +const VMStateDescription vmstate_spapr_nvdimm_flush_states = { > + .name = "spapr_nvdimm_hcall_states", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = spapr_nvdimm_states_needed, > + .pre_save = spapr_nvdimm_pre_save, > + .fields = (VMStateField[]) { > + VMSTATE_QLIST_V(completed_flush_states, SpaprMachineState, 1, > + vmstate_spapr_nvdimm_entry, > + SpaprNVDIMMDeviceFlushState, node), > + VMSTATE_END_OF_LIST() > + }, > +}; > + > +/* > + * Acquire a unique token and reserve it for the new flush state. > + */ > +static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(void) > +{ > + Error *err = NULL; > + uint64_t token; > + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); > + SpaprNVDIMMDeviceFlushState *tmp, *next, *state; > + > + state = g_malloc0(sizeof(*state)); > + > + qemu_mutex_lock(&spapr->spapr_nvdimm_flush_states_lock); > +retry: > + if (qemu_guest_getrandom(&token, sizeof(token), &err) < 0) { Using getrandom seems like overkill, why not just use a counter? > + error_report_err(err); > + g_free(state); > + qemu_mutex_unlock(&spapr->spapr_nvdimm_flush_states_lock); > + return NULL; > + } > + > + if (!token) /* Token should be non-zero */ > + goto retry; > + > + /* If the token already in use, get a new one */ > + QLIST_FOREACH_SAFE(tmp, &(spapr->pending_flush_states), node, next) { > + if (tmp->continue_token == token) { > + goto retry; > + } > + } > + QLIST_FOREACH_SAFE(tmp, &(spapr->completed_flush_states), node, next) { > + if (tmp->continue_token == token) { > + goto retry; > + } > + } > + > + state->continue_token = token; > + QLIST_INSERT_HEAD(&spapr->pending_flush_states, state, node); > + > + qemu_mutex_unlock(&spapr->spapr_nvdimm_flush_states_lock); > + > + return state; > +} > + > +/* > + * spapr_nvdimm_finish_flushes > + * Waits for all pending flush requests to complete > + * their execution and free the states > + */ > +void spapr_nvdimm_finish_flushes(void) > +{ > + SpaprNVDIMMDeviceFlushState *state, *next; > + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); The caller has natural access to the machine, so pass it in rather than using the global. > + > + /* > + * No contention here when called on reset path, the main loop thread > + * which calls the pending BHs has gotten out running in the reset path, > + * finally reaching here. Other code path being guest > + * h_client_architecture_support, thats early boot up. > + */ > + > + while (!QLIST_EMPTY(&spapr->pending_flush_states)) { > + aio_poll(qemu_get_aio_context(), true); > + } > + > + QLIST_FOREACH_SAFE(state, &spapr->completed_flush_states, node, next) { > + QLIST_REMOVE(state, node); > + g_free(state); > + } > +} > + > +/* > + * spapr_nvdimm_get_hcall_status > + * Fetches the status of the hcall worker and returns H_BUSY > + * if the worker is still running. > + */ > +static int spapr_nvdimm_get_flush_status(uint64_t token) > +{ > + int ret = H_LONG_BUSY_ORDER_10_MSEC; > + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); The callers have natural access to spapr, so pass it in rather than using the global. > + SpaprNVDIMMDeviceFlushState *state, *node; > + > + qemu_mutex_lock(&spapr->spapr_nvdimm_flush_states_lock); > + QLIST_FOREACH_SAFE(state, &spapr->pending_flush_states, node, node) { > + if (state->continue_token == token) { > + goto exit; > + } > + } > + ret = H_P2; /* If not found in complete list too, invalid token */ > + QLIST_FOREACH_SAFE(state, &spapr->completed_flush_states, node, node) { > + if (state->continue_token == token) { > + ret = state->hcall_ret; > + QLIST_REMOVE(state, node); > + g_free(state); > + break; > + } > + } > +exit: > + qemu_mutex_unlock(&spapr->spapr_nvdimm_flush_states_lock); > + > + return ret; > +} > + > +static int flush_worker_cb(void *opaque) > +{ > + int ret = H_SUCCESS; > + SpaprNVDIMMDeviceFlushState *state = opaque; > + > + /* flush raw backing image */ > + if (qemu_fdatasync(state->backend_fd) < 0) { > + error_report("papr_scm: Could not sync nvdimm to backend file: %s", > + strerror(errno)); > + ret = H_HARDWARE; > + } > + > + return ret; > +} > + > +static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret) > +{ > + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); > + SpaprNVDIMMDeviceFlushState *state = opaque; > + > + qemu_mutex_lock(&spapr->spapr_nvdimm_flush_states_lock); > + > + state->hcall_ret = hcall_ret; > + QLIST_REMOVE(state, node); > + QLIST_INSERT_HEAD(&spapr->completed_flush_states, state, node); > + > + qemu_mutex_unlock(&spapr->spapr_nvdimm_flush_states_lock); > +} > + > +/* > + * H_SCM_FLUSH > + * Input: drc_index, continue-token > + * Out: continue-token > + * Return Value: H_SUCCESS, H_Parameter, H_P2, H_BUSY > + * > + * Given a DRC Index Flush the data to backend NVDIMM device. > + * The hcall returns H_BUSY when the flush takes longer time and the hcall It returns one of the H_LONG_BUSY values, not actual H_BUSY, doesn't it? > + * needs to be issued multiple times in order to be completely serviced. > + * The continue-token from the output to be passed in the argument list of > + * subsequent hcalls until the hcall is completely serviced at which > + * point H_SUCCESS or other error is returned. > + */ > +static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr, > + target_ulong opcode, target_ulong *args) > +{ > + int ret; > + uint32_t drc_index = args[0]; > + uint64_t continue_token = args[1]; > + SpaprDrc *drc = spapr_drc_by_index(drc_index); > + PCDIMMDevice *dimm; > + HostMemoryBackend *backend = NULL; > + SpaprNVDIMMDeviceFlushState *state; > + ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); > + > + if (!drc || !drc->dev || > + spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) { > + return H_PARAMETER; > + } > + > + if (continue_token != 0) { > + ret = spapr_nvdimm_get_flush_status(continue_token); > + if (H_IS_LONG_BUSY(ret)) { > + args[0] = continue_token; > + } > + > + return ret; > + } > + > + dimm = PC_DIMM(drc->dev); > + backend = MEMORY_BACKEND(dimm->hostmem); > + > + state = spapr_nvdimm_init_new_flush_state(); > + if (!state) { > + return H_P2; AFAICT the only way init_new_flush_state() fails is a failure in the RNG, which definitely isn't a parameter problem. > + } > + > + state->backend_fd = memory_region_get_fd(&backend->mr); Is this guaranteed to return a usable fd in all configurations? > + thread_pool_submit_aio(pool, flush_worker_cb, state, > + spapr_nvdimm_flush_completion_cb, state); > + > + ret = spapr_nvdimm_get_flush_status(state->continue_token); > + if (H_IS_LONG_BUSY(ret)) { > + args[0] = state->continue_token; > + } > + > + return ret; I believe you can rearrange this so the get_flush_status / check / return is shared between the args[0] == 0 and args[0] == token paths. > +} > + > static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr, > target_ulong opcode, target_ulong *args) > { > @@ -487,6 +726,7 @@ static void spapr_scm_register_types(void) > spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem); > spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem); > spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all); > + spapr_register_hypercall(H_SCM_FLUSH, h_scm_flush); > } > > type_init(spapr_scm_register_types) > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index 47cebaf3ac..7c27fb3e2d 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -12,10 +12,12 @@ > #include "hw/ppc/spapr_xive.h" /* For SpaprXive */ > #include "hw/ppc/xics.h" /* For ICSState */ > #include "hw/ppc/spapr_tpm_proxy.h" > +#include "hw/ppc/spapr_nvdimm.h" > > struct SpaprVioBus; > struct SpaprPhbState; > struct SpaprNvram; > +struct SpaprNVDIMMDeviceFlushState; > > typedef struct SpaprEventLogEntry SpaprEventLogEntry; > typedef struct SpaprEventSource SpaprEventSource; > @@ -245,6 +247,12 @@ struct SpaprMachineState { > uint32_t numa_assoc_array[MAX_NODES + NVGPU_MAX_NUM][NUMA_ASSOC_SIZE]; > > Error *fwnmi_migration_blocker; > + > + /* nvdimm flush states */ > + QemuMutex spapr_nvdimm_flush_states_lock; > + QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_flush_states; > + QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_flush_states; > + > }; > > #define H_SUCCESS 0 > @@ -538,8 +546,9 @@ struct SpaprMachineState { > #define H_SCM_BIND_MEM 0x3EC > #define H_SCM_UNBIND_MEM 0x3F0 > #define H_SCM_UNBIND_ALL 0x3FC > +#define H_SCM_FLUSH 0x44C > > -#define MAX_HCALL_OPCODE H_SCM_UNBIND_ALL > +#define MAX_HCALL_OPCODE H_SCM_FLUSH > > /* The hcalls above are standardized in PAPR and implemented by pHyp > * as well. > diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h > index abcacda5d7..c88df2c590 100644 > --- a/include/hw/ppc/spapr_nvdimm.h > +++ b/include/hw/ppc/spapr_nvdimm.h > @@ -11,6 +11,7 @@ > #define HW_SPAPR_NVDIMM_H > > #include "hw/mem/nvdimm.h" > +#include "migration/vmstate.h" > > struct SpaprDrc; > struct SpaprMachineState; > @@ -22,5 +23,16 @@ void spapr_dt_persistent_memory(struct SpaprMachineState *spapr, void *fdt); > bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, > uint64_t size, Error **errp); > void spapr_add_nvdimm(DeviceState *dev, uint64_t slot); > +void spapr_nvdimm_finish_flushes(void); > + > +typedef struct SpaprNVDIMMDeviceFlushState { > + uint64_t continue_token; > + int64_t hcall_ret; > + int backend_fd; > + > + QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node; > +} SpaprNVDIMMDeviceFlushState; > + > +extern const VMStateDescription vmstate_spapr_nvdimm_flush_states; > > #endif > > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson