From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrew Cooper Subject: [PATCH v5 RFC 09/14] tools/libxc: x86 PV restore code Date: Wed, 11 Jun 2014 19:14:37 +0100 Message-ID: <1402510482-21099-10-git-send-email-andrew.cooper3@citrix.com> References: <1402510482-21099-1-git-send-email-andrew.cooper3@citrix.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <1402510482-21099-1-git-send-email-andrew.cooper3@citrix.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Xen-devel Cc: Andrew Cooper , Frediano Ziglio , David Vrabel List-Id: xen-devel@lists.xenproject.org Signed-off-by: Andrew Cooper Signed-off-by: Frediano Ziglio Signed-off-by: David Vrabel --- tools/libxc/saverestore/common.h | 2 + tools/libxc/saverestore/restore_x86_pv.c | 965 ++++++++++++++++++++++++++++++ 2 files changed, 967 insertions(+) create mode 100644 tools/libxc/saverestore/restore_x86_pv.c diff --git a/tools/libxc/saverestore/common.h b/tools/libxc/saverestore/common.h index 5c8a370..bb21e01 100644 --- a/tools/libxc/saverestore/common.h +++ b/tools/libxc/saverestore/common.h @@ -248,6 +248,8 @@ extern struct common_ops common_ops_x86_pv; extern struct save_ops save_ops_x86_pv; +extern struct restore_ops restore_ops_x86_pv; + struct record { uint32_t type; diff --git a/tools/libxc/saverestore/restore_x86_pv.c b/tools/libxc/saverestore/restore_x86_pv.c new file mode 100644 index 0000000..3174d4c --- /dev/null +++ b/tools/libxc/saverestore/restore_x86_pv.c @@ -0,0 +1,965 @@ +#include + +#include "common_x86_pv.h" + +/* + * Expand our local tracking information for the p2m table and domains maximum + * size. Normally this will be called once to expand from 0 to max_pfn, but + * is liable to expand multiple times if the domain grows on the sending side + * after migration has started. 
+ */
+static int expand_p2m(struct context *ctx, unsigned long max_pfn)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned long old_max = ctx->x86_pv.max_pfn, i;
+    /* Number of p2m entries which fit in one page for this guest width. */
+    unsigned int fpp = PAGE_SIZE / ctx->x86_pv.width;
+    unsigned long end_frame = (max_pfn + fpp) / fpp;
+    unsigned long old_end_frame = (old_max + fpp) / fpp;
+    xen_pfn_t *p2m = NULL, *p2m_pfns = NULL;
+    uint32_t *pfn_types = NULL;
+    size_t p2msz, p2m_pfnsz, pfn_typesz;
+
+    /* Callers must only ever ask for growth. */
+    assert(max_pfn > old_max);
+
+    /*
+     * Each realloc() result is checked before the context pointer is
+     * overwritten, so a failure leaves the previous array reachable for
+     * the cleanup path.
+     */
+    p2msz = (max_pfn + 1) * ctx->x86_pv.width;
+    p2m = realloc(ctx->x86_pv.p2m, p2msz);
+    if ( !p2m )
+    {
+        ERROR("Failed to (re)alloc %zu bytes for p2m", p2msz);
+        return -1;
+    }
+    ctx->x86_pv.p2m = p2m;
+
+    pfn_typesz = (max_pfn + 1) * sizeof(*pfn_types);
+    pfn_types = realloc(ctx->x86_pv.pfn_types, pfn_typesz);
+    if ( !pfn_types )
+    {
+        ERROR("Failed to (re)alloc %zu bytes for pfn_types", pfn_typesz);
+        return -1;
+    }
+    ctx->x86_pv.pfn_types = pfn_types;
+
+    p2m_pfnsz = (end_frame + 1) * sizeof(*p2m_pfns);
+    p2m_pfns = realloc(ctx->x86_pv.p2m_pfns, p2m_pfnsz);
+    if ( !p2m_pfns )
+    {
+        ERROR("Failed to (re)alloc %zu bytes for p2m frame list", p2m_pfnsz);
+        return -1;
+    }
+    ctx->x86_pv.p2m_frames = end_frame;
+    ctx->x86_pv.p2m_pfns = p2m_pfns;
+
+    /* Mark every newly covered pfn as unpopulated and untyped. */
+    ctx->x86_pv.max_pfn = max_pfn;
+    for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
+    {
+        ctx->ops.set_gfn(ctx, i, INVALID_MFN);
+        ctx->ops.set_page_type(ctx, i, 0);
+    }
+
+    /*
+     * Invalidate the newly added p2m frame list slots.  old_end_frame is
+     * never 0 (it is at least 1 even when old_max is 0), so the "first
+     * expansion" case has to be detected via old_max, exactly as for the
+     * pfn loop above; otherwise slots 0 and 1 would be left uninitialised.
+     */
+    for ( i = (old_max ? old_end_frame + 1 : 0); i <= end_frame; ++i )
+        ctx->x86_pv.p2m_pfns[i] = INVALID_MFN;
+
+    DPRINTF("Expanded p2m from %#lx to %#lx", old_max, max_pfn);
+    return 0;
+}
+
+/*
+ * Pin all of the pagetables.  TODO - batch the hypercalls.
+ */
+static int pin_pagetables(struct context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned long i;
+    struct mmuext_op pin;
+
+    DPRINTF("Pinning pagetables");
+
+    for ( i = 0; i <= ctx->x86_pv.max_pfn; ++i )
+    {
+        /* Only frames the stream flagged as pinned are of interest. */
+        if ( (ctx->x86_pv.pfn_types[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
+            continue;
+
+        /* Select the pin command matching the pagetable level. */
+        switch ( ctx->x86_pv.pfn_types[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
+        {
+        case XEN_DOMCTL_PFINFO_L1TAB:
+            pin.cmd = MMUEXT_PIN_L1_TABLE;
+            break;
+        case XEN_DOMCTL_PFINFO_L2TAB:
+            pin.cmd = MMUEXT_PIN_L2_TABLE;
+            break;
+        case XEN_DOMCTL_PFINFO_L3TAB:
+            pin.cmd = MMUEXT_PIN_L3_TABLE;
+            break;
+        case XEN_DOMCTL_PFINFO_L4TAB:
+            pin.cmd = MMUEXT_PIN_L4_TABLE;
+            break;
+        default:
+            /* Pinned flag without a pagetable type: nothing to pin. */
+            continue;
+        }
+
+        pin.arg1.mfn = ctx->ops.pfn_to_gfn(ctx, i);
+
+        if ( xc_mmuext_op(xch, &pin, 1, ctx->domid) != 0 )
+        {
+            PERROR("Failed to pin page table for pfn %#lx", i);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Update details in a guest's start_info structure.
+ *
+ * On entry the vcpu's edx register holds the pfn of the start_info page.
+ * The xenstore and console frame references inside that page are converted
+ * from pfns to mfns, the event channels and other domain specific fields are
+ * rewritten, and finally edx is replaced with the mfn of the start_info page
+ * itself.
+ *
+ * Returns 0 on success, or -1 on error (in which case the vcpu context is
+ * left unmodified).
+ */
+static int process_start_info(struct context *ctx, vcpu_guest_context_any_t *vcpu)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t pfn, mfn, start_info_mfn;
+    start_info_any_t *guest_start_info = NULL;
+    int rc = -1;
+
+    pfn = GET_FIELD(ctx, vcpu, user_regs.edx);
+
+    if ( pfn > ctx->x86_pv.max_pfn )
+    {
+        ERROR("Start Info pfn %#lx out of range", pfn);
+        goto err;
+    }
+    else if ( ctx->x86_pv.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
+    {
+        ERROR("Start Info pfn %#lx has bad type %"PRIu32, pfn,
+              ctx->x86_pv.pfn_types[pfn] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+        goto err;
+    }
+
+    mfn = ctx->ops.pfn_to_gfn(ctx, pfn);
+    if ( !mfn_in_pseudophysmap(ctx, mfn) )
+    {
+        ERROR("Start Info has bad mfn");
+        dump_bad_pseudophysmap_entry(ctx, mfn);
+        goto err;
+    }
+
+    /*
+     * Remember the start_info mfn: 'mfn' is reused below for the xenstore
+     * and console frames, and edx must end up referring to start_info.
+     */
+    start_info_mfn = mfn;
+
+    guest_start_info = xc_map_foreign_range(
+        xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
+    if ( !guest_start_info )
+    {
+        PERROR("Failed to map Start Info at mfn %#lx", mfn);
+        goto err;
+    }
+
+    /* Deal with xenstore stuff */
+    pfn = GET_FIELD(ctx, guest_start_info, store_mfn);
+    if ( pfn > ctx->x86_pv.max_pfn )
+    {
+        ERROR("XenStore pfn %#lx out of range", pfn);
+        goto err;
+    }
+
+    mfn = ctx->ops.pfn_to_gfn(ctx, pfn);
+    if ( !mfn_in_pseudophysmap(ctx, mfn) )
+    {
+        ERROR("XenStore pfn has bad mfn");
+        dump_bad_pseudophysmap_entry(ctx, mfn);
+        goto err;
+    }
+
+    ctx->restore.xenstore_mfn = mfn;
+    SET_FIELD(ctx, guest_start_info, store_mfn, mfn);
+    SET_FIELD(ctx, guest_start_info, store_evtchn, ctx->restore.xenstore_evtchn);
+
+    /* Deal with console stuff */
+    pfn = GET_FIELD(ctx, guest_start_info, console.domU.mfn);
+    if ( pfn > ctx->x86_pv.max_pfn )
+    {
+        ERROR("Console pfn %#lx out of range", pfn);
+        goto err;
+    }
+
+    mfn = ctx->ops.pfn_to_gfn(ctx, pfn);
+    if ( !mfn_in_pseudophysmap(ctx, mfn) )
+    {
+        ERROR("Console pfn has bad mfn");
+        dump_bad_pseudophysmap_entry(ctx, mfn);
+        goto err;
+    }
+
+    ctx->restore.console_mfn = mfn;
+    SET_FIELD(ctx, guest_start_info, console.domU.mfn, mfn);
+    SET_FIELD(ctx, guest_start_info, console.domU.evtchn, ctx->restore.console_evtchn);
+
+    /* Set other information */
+    SET_FIELD(ctx, guest_start_info, nr_pages, ctx->x86_pv.max_pfn + 1);
+    SET_FIELD(ctx, guest_start_info, shared_info,
+              ctx->dominfo.shared_info_frame << PAGE_SHIFT);
+    SET_FIELD(ctx, guest_start_info, flags, 0);
+
+    /* edx must name the start_info frame, not the last frame translated. */
+    SET_FIELD(ctx, vcpu, user_regs.edx, start_info_mfn);
+    rc = 0;
+
+err:
+    if ( guest_start_info )
+        munmap(guest_start_info, PAGE_SIZE);
+
+    return rc;
+}
+
+/*
+ * Copy the p2m which has been constructed locally as memory has been
+ * allocated, over the p2m in guest, so the guest can find its memory again on
+ * resume.
+ */
+static int update_guest_p2m(struct context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t mfn, pfn, *guest_p2m = NULL;
+    unsigned i;
+    int rc = -1; /* Pessimistic default; only cleared once the copy is done. */
+
+    for ( i = 0; i < ctx->x86_pv.p2m_frames; ++i ) /* Validate and translate each p2m frame. */
+    {
+        pfn = ctx->x86_pv.p2m_pfns[i];
+
+        if ( pfn > ctx->x86_pv.max_pfn )
+        {
+            ERROR("pfn (%#lx) for p2m_frame_list[%u] out of range",
+                  pfn, i);
+            goto err;
+        }
+        else if ( ctx->x86_pv.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB ) /* Must be a plain data page. */
+        {
+            ERROR("pfn (%#lx) for p2m_frame_list[%u] has bad type %"PRIu32, pfn, i,
+                  ctx->x86_pv.pfn_types[pfn] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+            goto err;
+        }
+
+        mfn = ctx->ops.pfn_to_gfn(ctx, pfn);
+        if ( !mfn_in_pseudophysmap(ctx, mfn) )
+        {
+            ERROR("p2m_frame_list[%u] has bad mfn", i);
+            dump_bad_pseudophysmap_entry(ctx, mfn);
+            goto err;
+        }
+
+        ctx->x86_pv.p2m_pfns[i] = mfn; /* The frame list is converted in place from pfns to mfns. */
+    }
+
+    guest_p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_WRITE,
+                                     ctx->x86_pv.p2m_pfns,
+                                     ctx->x86_pv.p2m_frames ); /* Map all p2m frames as one writable range. */
+    if ( !guest_p2m )
+    {
+        PERROR("Failed to map p2m frames");
+        goto err;
+    }
+
+    memcpy(guest_p2m, ctx->x86_pv.p2m,
+           (ctx->x86_pv.max_pfn + 1) * ctx->x86_pv.width); /* One entry of guest width per pfn, 0..max_pfn inclusive. */
+    rc = 0;
+ err:
+    if ( guest_p2m )
+        munmap(guest_p2m, ctx->x86_pv.p2m_frames * PAGE_SIZE);
+
+    return rc;
+}
+
+/*
+ * Process a toolstack record.  TODO - remove from spec and code once libxl
+ * framing is sorted.
+ *
+ * The record payload is handed, unparsed, to the toolstack_restore callback
+ * together with the callback's private data pointer.  If no callbacks, or no
+ * toolstack_restore callback, were supplied the record is silently accepted.
+ *
+ * Returns 0 when there is no callback, otherwise the callback's return value
+ * (a negative value is additionally logged).
+ */
+static int handle_toolstack(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    int rc;
+
+    if ( !ctx->restore.callbacks || !ctx->restore.callbacks->toolstack_restore )
+        return 0;
+
+    rc = ctx->restore.callbacks->toolstack_restore(ctx->domid, rec->data, rec->length,
+                                                   ctx->restore.callbacks->data);
+    if ( rc < 0 )
+        PERROR("restoring toolstack"); /* NOTE(review): assumes the callback leaves errno meaningful - confirm. */
+    return rc;
+}
+
+/*
+ * Process an X86_PV_INFO record.
+ */
+static int handle_x86_pv_info(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_info *info = rec->data;
+
+    /*
+     * First validate the record contents on their own.  These checks must
+     * all run regardless of whether the domain's width has to be changed
+     * below, so they are kept separate from the comparisons against the
+     * domain.
+     */
+    if ( rec->length < sizeof(*info) )
+    {
+        ERROR("X86_PV_INFO record truncated: length %"PRIu32", expected %zu",
+              rec->length, sizeof(*info));
+        return -1;
+    }
+    else if ( info->guest_width != 4 &&
+              info->guest_width != 8 )
+    {
+        ERROR("Unexpected guest width %"PRIu32", Expected 4 or 8",
+              info->guest_width);
+        return -1;
+    }
+    else if ( info->pt_levels != 3 &&
+              info->pt_levels != 4 )
+    {
+        ERROR("Unexpected guest levels %"PRIu32", Expected 3 or 4",
+              info->pt_levels);
+        return -1;
+    }
+
+    if ( info->guest_width != ctx->x86_pv.width )
+    {
+        int rc;
+        struct xen_domctl domctl;
+
+        /* Try to set address size, domain is always created 64 bit. */
+        memset(&domctl, 0, sizeof(domctl));
+        domctl.domain = ctx->domid;
+        domctl.cmd = XEN_DOMCTL_set_address_size;
+        domctl.u.address_size.size = info->guest_width * 8;
+        rc = do_domctl(xch, &domctl);
+        if ( rc != 0 )
+        {
+            ERROR("Width of guest in stream (%"PRIu32
+                  " bits) differs with existing domain (%"PRIu32" bits)",
+                  info->guest_width * 8, ctx->x86_pv.width * 8);
+            return -1;
+        }
+
+        /* Domain information changed, better to refresh. */
+        rc = x86_pv_domain_info(ctx);
+        if ( rc != 0 )
+        {
+            ERROR("Unable to refresh guest informations");
+            return -1;
+        }
+    }
+
+    /*
+     * Sanity check the (possibly refreshed) domain settings against the
+     * stream.  This runs even after a successful width switch, which the
+     * levels comparison previously did not.
+     */
+    if ( info->guest_width != ctx->x86_pv.width )
+    {
+        ERROR("Width of guest in stream (%"PRIu32
+              " bits) differs with existing domain (%"PRIu32" bits)",
+              info->guest_width * 8, ctx->x86_pv.width * 8);
+        return -1;
+    }
+    else if ( info->pt_levels != ctx->x86_pv.levels )
+    {
+        ERROR("Levels of guest in stream (%"PRIu32
+              ") differs with existing domain (%"PRIu32")",
+              info->pt_levels, ctx->x86_pv.levels);
+        return -1;
+    }
+
+    DPRINTF("X86_PV_INFO record: %d bits, %d levels",
+            ctx->x86_pv.width * 8, ctx->x86_pv.levels);
+    return 0;
+}
+
+/*
+ * Process an X86_PV_P2M_FRAMES record.  Takes care of expanding the local p2m
+ * state if needed.
+ */
+static int handle_x86_pv_p2m_frames(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_p2m_frames *data = rec->data;
+    unsigned start, end, x, fpp = PAGE_SIZE / ctx->x86_pv.width;
+    int rc;
+
+    if ( rec->length < sizeof(*data) )
+    {
+        ERROR("X86_PV_P2M_FRAMES record truncated: length %"PRIu32", min %zu",
+              rec->length, sizeof(*data) + sizeof(uint64_t));
+        return -1;
+    }
+    else if ( data->start_pfn > data->end_pfn )
+    {
+        /* Report the condition actually detected: start beyond end. */
+        ERROR("Start pfn in stream (%#"PRIx32") exceeds End (%#"PRIx32")",
+              data->start_pfn, data->end_pfn);
+        return -1;
+    }
+
+    /* Convert the inclusive pfn range into a half-open range of p2m frames. */
+    start = data->start_pfn / fpp;
+    end = data->end_pfn / fpp + 1;
+
+    if ( rec->length != sizeof(*data) + ((end - start) * sizeof(uint64_t)) )
+    {
+        ERROR("X86_PV_P2M_FRAMES record wrong size: start_pfn %#"PRIx32
+              ", end_pfn %#"PRIx32", length %"PRIu32
+              ", expected %zu + (%u - %u) * %zu",
+              data->start_pfn, data->end_pfn, rec->length,
+              sizeof(*data), end, start, sizeof(uint64_t));
+        return -1;
+    }
+
+    if ( data->end_pfn > ctx->x86_pv.max_pfn )
+    {
+        rc = expand_p2m(ctx, data->end_pfn);
+        if ( rc )
+            return rc;
+    }
+
+    for ( x = 0; x < (end - start); ++x )
+        ctx->x86_pv.p2m_pfns[start + x] = data->p2m_pfns[x];
+
+    DPRINTF("X86_PV_P2M_FRAMES record: GFNs %#"PRIx32"->%#"PRIx32,
+            data->start_pfn, data->end_pfn);
+    return 0;
+}
+
+/*
+ * Process an X86_PV_VCPU_BASIC record from the stream.
+ *
+ * The vcpu context in the record refers to guest memory by pfn.  Every such
+ * reference (start_info for vcpu 0, the GDT frames, cr3 and, for 4-level
+ * guests with bit 0 of cr1 set, cr1) is validated and converted to an mfn
+ * before the context is handed to Xen.
+ *
+ * Note that max_pfn is inclusive (the tracking arrays hold max_pfn + 1
+ * entries), so a pfn is only out of range when strictly greater than it.
+ *
+ * Returns 0 on success, nonzero on error.
+ */
+static int handle_x86_pv_vcpu_basic(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_vcpu_hdr *vhdr = rec->data;
+    vcpu_guest_context_any_t vcpu;
+    size_t vcpusz = ctx->x86_pv.width == 8 ? sizeof(vcpu.x64) : sizeof(vcpu.x32);
+    xen_pfn_t pfn, mfn;
+    unsigned long tmp;
+    unsigned i;
+    int rc = -1;
+
+    if ( rec->length <= sizeof(*vhdr) )
+    {
+        ERROR("X86_PV_VCPU_BASIC record truncated: length %"PRIu32", min %zu",
+              rec->length, sizeof(*vhdr) + 1);
+        goto err;
+    }
+    else if ( rec->length != sizeof(*vhdr) + vcpusz )
+    {
+        ERROR("X86_PV_VCPU_BASIC record wrong size: length %"PRIu32
+              ", expected %zu", rec->length, sizeof(*vhdr) + vcpusz);
+        goto err;
+    }
+    else if ( vhdr->vcpu_id > ctx->dominfo.max_vcpu_id )
+    {
+        ERROR("X86_PV_VCPU_BASIC record vcpu_id (%"PRIu32
+              ") exceeds domain max (%u)",
+              vhdr->vcpu_id, ctx->dominfo.max_vcpu_id);
+        goto err;
+    }
+
+    /* Work on a local copy; the record buffer itself is left untouched. */
+    memcpy(&vcpu, &vhdr->context, vcpusz);
+
+    SET_FIELD(ctx, &vcpu, flags, GET_FIELD(ctx, &vcpu, flags) | VGCF_online);
+
+    /* Vcpu 0 is special: Convert the suspend record to an mfn. */
+    if ( vhdr->vcpu_id == 0 )
+    {
+        rc = process_start_info(ctx, &vcpu);
+        if ( rc )
+            return rc;
+        rc = -1;
+    }
+
+    tmp = GET_FIELD(ctx, &vcpu, gdt_ents);
+    if ( tmp > 8192 )
+    {
+        ERROR("GDT entry count (%lu) out of range", tmp);
+        errno = ERANGE;
+        goto err;
+    }
+
+    /* Convert GDT frames to mfns. */
+    for ( i = 0; (i * 512) < tmp; ++i )
+    {
+        pfn = GET_FIELD(ctx, &vcpu, gdt_frames[i]);
+        if ( pfn > ctx->x86_pv.max_pfn )
+        {
+            ERROR("GDT frame %u (pfn %#lx) out of range", i, pfn);
+            goto err;
+        }
+        else if ( ctx->x86_pv.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
+        {
+            ERROR("GDT frame %u (pfn %#lx) has bad type %"PRIu32, i, pfn,
+                  ctx->x86_pv.pfn_types[pfn] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+            goto err;
+        }
+
+        mfn = ctx->ops.pfn_to_gfn(ctx, pfn);
+        if ( !mfn_in_pseudophysmap(ctx, mfn) )
+        {
+            ERROR("GDT frame %u has bad mfn", i);
+            dump_bad_pseudophysmap_entry(ctx, mfn);
+            goto err;
+        }
+
+        SET_FIELD(ctx, &vcpu, gdt_frames[i], mfn);
+    }
+
+    /* Convert CR3 to an mfn. */
+    pfn = cr3_to_mfn(ctx, GET_FIELD(ctx, &vcpu, ctrlreg[3]));
+    if ( pfn > ctx->x86_pv.max_pfn )
+    {
+        ERROR("cr3 (pfn %#lx) out of range", pfn);
+        goto err;
+    }
+    else if ( (ctx->x86_pv.pfn_types[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK ) !=
+              (((xen_pfn_t)ctx->x86_pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
+    {
+        ERROR("cr3 (pfn %#lx) has bad type %"PRIu32", expected %"PRIu32, pfn,
+              ctx->x86_pv.pfn_types[pfn] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT,
+              ctx->x86_pv.levels);
+        goto err;
+    }
+
+    mfn = ctx->ops.pfn_to_gfn(ctx, pfn);
+    if ( !mfn_in_pseudophysmap(ctx, mfn) )
+    {
+        ERROR("cr3 has bad mfn");
+        dump_bad_pseudophysmap_entry(ctx, mfn);
+        goto err;
+    }
+
+    SET_FIELD(ctx, &vcpu, ctrlreg[3], mfn_to_cr3(ctx, mfn));
+
+    /* 64bit guests: Convert CR1 (guest pagetables) to mfn. */
+    if ( ctx->x86_pv.levels == 4 && (vcpu.x64.ctrlreg[1] & 1) )
+    {
+        pfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT;
+
+        if ( pfn > ctx->x86_pv.max_pfn )
+        {
+            ERROR("cr1 (pfn %#lx) out of range", pfn);
+            goto err;
+        }
+        else if ( (ctx->x86_pv.pfn_types[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
+                  (((xen_pfn_t)ctx->x86_pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
+        {
+            ERROR("cr1 (pfn %#lx) has bad type %"PRIu32", expected %"PRIu32, pfn,
+                  ctx->x86_pv.pfn_types[pfn] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT,
+                  ctx->x86_pv.levels);
+            goto err;
+        }
+
+        mfn = ctx->ops.pfn_to_gfn(ctx, pfn);
+        if ( !mfn_in_pseudophysmap(ctx, mfn) )
+        {
+            ERROR("cr1 has bad mfn");
+            dump_bad_pseudophysmap_entry(ctx, mfn);
+            goto err;
+        }
+
+        vcpu.x64.ctrlreg[1] = (uint64_t)mfn << PAGE_SHIFT;
+    }
+
+    if ( xc_vcpu_setcontext(xch, ctx->domid, vhdr->vcpu_id, &vcpu) )
+    {
+        PERROR("Failed to set vcpu%"PRIu32"'s basic info", vhdr->vcpu_id);
+        goto err;
+    }
+
+    rc = 0;
+    DPRINTF("vcpu%"PRId32" X86_PV_VCPU_BASIC record", vhdr->vcpu_id);
+ err:
+    return rc;
+}
+
+/*
+ * Process an X86_PV_VCPU_EXTENDED record from the stream.
+ */
+static int handle_x86_pv_vcpu_extended(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_vcpu_hdr *vcpu = rec->data;
+    DECLARE_DOMCTL;
+
+    if ( rec->length <= sizeof(*vcpu) ) /* A header with no payload is rejected. */
+    {
+        ERROR("X86_PV_VCPU_EXTENDED record truncated: length %"PRIu32", min %zu",
+              rec->length, sizeof(*vcpu) + 1);
+        return -1;
+    }
+    else if ( rec->length > sizeof(*vcpu) + 128 ) /* Caps the memcpy() below; NOTE(review): 128 presumably matches the domctl union size - confirm. */
+    {
+        ERROR("X86_PV_VCPU_EXTENDED record too long: length %"PRIu32", max %zu",
+              rec->length, sizeof(*vcpu) + 128);
+        return -1;
+    }
+    else if ( vcpu->vcpu_id > ctx->dominfo.max_vcpu_id )
+    {
+        ERROR("X86_PV_VCPU_EXTENDED record vcpu_id (%"PRIu32
+              ") exceeds domain max (%u)",
+              vcpu->vcpu_id, ctx->dominfo.max_vcpu_id);
+        return -1;
+    }
+
+    domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
+    domctl.domain = ctx->domid;
+    memcpy(&domctl.u.ext_vcpucontext, &vcpu->context,
+           rec->length - sizeof(*vcpu)); /* The payload is used verbatim as the domctl argument. */
+
+    if ( xc_domctl(xch, &domctl) != 0 )
+    {
+        PERROR("Failed to set vcpu%"PRIu32"'s extended info", vcpu->vcpu_id);
+        return -1;
+    }
+
+    DPRINTF("vcpu%"PRId32" X86_PV_VCPU_EXTENDED record", vcpu->vcpu_id);
+    return 0;
+}
+
+/*
+ * Process an X86_PV_VCPU_XSAVE record from the stream.
+ */
+static int handle_x86_pv_vcpu_xsave(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_vcpu_hdr *vhdr = rec->data;
+    int rc;
+    DECLARE_DOMCTL;
+    DECLARE_HYPERCALL_BUFFER(void, buffer);
+    size_t buffersz;
+
+    if ( rec->length <= sizeof(*vhdr) )
+    {
+        ERROR("X86_PV_VCPU_XSAVE record truncated: length %"PRIu32", min %zu",
+              rec->length, sizeof(*vhdr) + 1);
+        return -1;
+    }
+    else if ( vhdr->vcpu_id > ctx->dominfo.max_vcpu_id )
+    {
+        ERROR("X86_PV_VCPU_XSAVE record vcpu_id (%"PRIu32
+              ") exceeds domain max (%u)",
+              vhdr->vcpu_id, ctx->dominfo.max_vcpu_id);
+        return -1;
+    }
+
+    /* Everything after the vcpu header is the xsave payload. */
+    buffersz = rec->length - sizeof(*vhdr);
+    buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz);
+    if ( !buffer )
+    {
+        /* buffersz is a size_t, so it must be printed with %zu. */
+        ERROR("Unable to allocate %zu bytes for xsave hypercall buffer",
+              buffersz);
+        return -1;
+    }
+
+    domctl.cmd = XEN_DOMCTL_setvcpuextstate;
+    domctl.domain = ctx->domid;
+    domctl.u.vcpuextstate.vcpu = vhdr->vcpu_id;
+    domctl.u.vcpuextstate.size = buffersz;
+    set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
+
+    memcpy(buffer, vhdr->context, buffersz);
+
+    rc = xc_domctl(xch, &domctl);
+
+    /* The buffer is released on both the success and the failure path. */
+    xc_hypercall_buffer_free(xch, buffer);
+
+    if ( rc )
+        PERROR("Failed to set vcpu%"PRIu32"'s xsave info", vhdr->vcpu_id);
+    else
+        DPRINTF("vcpu%"PRId32" X86_PV_VCPU_XSAVE record", vhdr->vcpu_id);
+
+    return rc;
+}
+
+/*
+ * Process an X86_PV_VCPU_MSRS record from the stream.
+ */
+static int handle_x86_pv_vcpu_msrs(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_vcpu_hdr *vhdr = rec->data;
+    int rc;
+    DECLARE_DOMCTL;
+    DECLARE_HYPERCALL_BUFFER(void, buffer);
+    size_t buffersz;
+
+    if ( rec->length <= sizeof(*vhdr) )
+    {
+        ERROR("X86_PV_VCPU_MSRS record truncated: length %"PRIu32", min %zu",
+              rec->length, sizeof(*vhdr) + 1);
+        return -1;
+    }
+    else if ( vhdr->vcpu_id > ctx->dominfo.max_vcpu_id )
+    {
+        ERROR("X86_PV_VCPU_MSRS record vcpu_id (%"PRIu32
+              ") exceeds domain max (%u)",
+              vhdr->vcpu_id, ctx->dominfo.max_vcpu_id);
+        return -1;
+    }
+
+    /*
+     * Only compute the payload size once the record is known to be longer
+     * than the header, so the subtraction cannot wrap.
+     */
+    buffersz = rec->length - sizeof(*vhdr);
+    if ( buffersz % sizeof(xen_domctl_vcpu_msr_t) != 0 )
+    {
+        ERROR("X86_PV_VCPU_MSRS payload size %zu"
+              " expected to be a multiple of %zu",
+              buffersz, sizeof(xen_domctl_vcpu_msr_t));
+        return -1;
+    }
+
+    buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz);
+    if ( !buffer )
+    {
+        ERROR("Unable to allocate %zu bytes for msr hypercall buffer",
+              buffersz);
+        return -1;
+    }
+
+    domctl.cmd = XEN_DOMCTL_set_vcpu_msrs;
+    domctl.domain = ctx->domid;
+    domctl.u.vcpu_msrs.vcpu = vhdr->vcpu_id;
+    /* Number of MSR entries is the quotient; the remainder is always 0 here. */
+    domctl.u.vcpu_msrs.msr_count = buffersz / sizeof(xen_domctl_vcpu_msr_t);
+    /* The handle must be set in the union member matching domctl.cmd. */
+    set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);
+
+    memcpy(buffer, vhdr->context, buffersz);
+
+    rc = xc_domctl(xch, &domctl);
+
+    /* The buffer is released on both the success and the failure path. */
+    xc_hypercall_buffer_free(xch, buffer);
+
+    if ( rc )
+        PERROR("Failed to set vcpu%"PRIu32"'s msrs", vhdr->vcpu_id);
+    else
+        DPRINTF("vcpu%"PRId32" X86_PV_VCPU_MSRS record", vhdr->vcpu_id);
+
+    return rc;
+}
+
+/*
+ * Process a SHARED_INFO record from the stream.
+ */
+static int handle_shared_info(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned i;
+    int rc = -1;
+    shared_info_any_t *guest_shared_info = NULL;
+    shared_info_any_t *stream_shared_info = rec->data;
+
+    if ( rec->length != PAGE_SIZE ) /* The payload must be exactly one page. */
+    {
+        ERROR("X86_PV_SHARED_INFO record wrong size: length %"PRIu32
+              ", expected %lu", rec->length, PAGE_SIZE);
+        goto err;
+    }
+
+    guest_shared_info = xc_map_foreign_range(
+        xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
+        ctx->dominfo.shared_info_frame);
+    if ( !guest_shared_info )
+    {
+        PERROR("Failed to map Shared Info at mfn %#lx",
+               ctx->dominfo.shared_info_frame);
+        goto err;
+    }
+
+    MEMCPY_FIELD(ctx, guest_shared_info, stream_shared_info, vcpu_info); /* Only vcpu_info and arch are taken from the stream. */
+    MEMCPY_FIELD(ctx, guest_shared_info, stream_shared_info, arch);
+
+    SET_FIELD(ctx, guest_shared_info, arch.pfn_to_mfn_frame_list_list, 0); /* Overrides the value just copied from the stream. */
+
+    MEMSET_ARRAY_FIELD(ctx, guest_shared_info, evtchn_pending, 0); /* Clear all pending event channel state... */
+    for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
+        SET_FIELD(ctx, guest_shared_info, vcpu_info[i].evtchn_pending_sel, 0);
+
+    MEMSET_ARRAY_FIELD(ctx, guest_shared_info, evtchn_mask, 0xff); /* ...and set every bit of the event channel mask. */
+
+    rc = 0;
+ err:
+
+    if ( guest_shared_info )
+        munmap(guest_shared_info, PAGE_SIZE);
+
+    return rc;
+}
+
+/*
+ * restore_ops function.  Convert pfns back to mfns in pagetables.  Possibly
+ * needs to populate new frames if a PTE is found referring to a frame which
+ * hasn't yet been seen from PAGE_DATA records.
+ *
+ * 'type' is the page type from the stream and 'page' the page contents, which
+ * are modified in place.  Pages which are not L1-L4 pagetables are left
+ * untouched.  Returns 0 on success, -1 on error.
+ */
+static int x86_pv_localise_page(struct context *ctx, uint32_t type, void *page)
+{
+    xc_interface *xch = ctx->xch;
+    uint64_t *table = page;
+    uint64_t pte;
+    unsigned i;
+
+    type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+    /* Only page tables need localisation. */
+    if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
+        return 0;
+
+    for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i ) /* The page is walked as an array of 64bit entries. */
+    {
+        pte = table[i];
+
+        if ( pte & _PAGE_PRESENT ) /* Non-present entries are left alone. */
+        {
+            xen_pfn_t mfn, pfn;
+
+            pfn = pte_to_frame(ctx, pte);
+            mfn = ctx->ops.pfn_to_gfn(ctx, pfn); /* NOTE(review): pfn is not range checked against max_pfn here - confirm pfn_to_gfn() copes. */
+
+            if ( mfn == INVALID_MFN )
+            {
+                if ( populate_pfns(ctx, 1, &pfn, &type) ) /* Frame not seen yet; populate it now and look it up again. */
+                    return -1;
+
+                mfn = ctx->ops.pfn_to_gfn(ctx, pfn);
+            }
+
+            if ( !mfn_in_pseudophysmap(ctx, mfn) )
+            {
+                ERROR("Bad mfn for L%"PRIu32"[%u]",
+                      type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i);
+                dump_bad_pseudophysmap_entry(ctx, mfn);
+                errno = ERANGE;
+                return -1;
+            }
+
+            update_pte(ctx, &pte, mfn);
+
+            table[i] = pte;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * restore_ops function.  Confirm that the incoming stream matches the type of
+ * domain we are attempting to restore into.
+ *
+ * Also gathers the domain information and maps the m2p table.  Returns 0 on
+ * success, nonzero on error.
+ */
+static int x86_pv_setup(struct context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int rc;
+
+    if ( ctx->restore.guest_type != DHDR_TYPE_X86_PV )
+    {
+        ERROR("Unable to restore %s domain into an x86_pv domain",
+              dhdr_type_to_str(ctx->restore.guest_type));
+        return -1;
+    }
+    else if ( ctx->restore.guest_page_size != PAGE_SIZE )
+    {
+        ERROR("Invalid page size %d for x86_pv domains",
+              ctx->restore.guest_page_size);
+        return -1;
+    }
+
+    rc = x86_pv_domain_info(ctx);
+    if ( rc )
+        return rc;
+
+    rc = x86_pv_map_m2p(ctx);
+    if ( rc )
+        return rc;
+
+    return rc;
+}
+
+/*
+ * restore_ops function.
+ */
+static int x86_pv_process_record(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+
+    switch ( rec->type ) /* Dispatch to the handler for this record type. */
+    {
+    case REC_TYPE_X86_PV_INFO:
+        return handle_x86_pv_info(ctx, rec);
+
+    case REC_TYPE_X86_PV_P2M_FRAMES:
+        return handle_x86_pv_p2m_frames(ctx, rec);
+
+    case REC_TYPE_X86_PV_VCPU_BASIC:
+        return handle_x86_pv_vcpu_basic(ctx, rec);
+
+    case REC_TYPE_X86_PV_VCPU_EXTENDED:
+        return handle_x86_pv_vcpu_extended(ctx, rec);
+
+    case REC_TYPE_X86_PV_VCPU_XSAVE:
+        return handle_x86_pv_vcpu_xsave(ctx, rec);
+
+    case REC_TYPE_SHARED_INFO:
+        return handle_shared_info(ctx, rec);
+
+    case REC_TYPE_TOOLSTACK:
+        return handle_toolstack(ctx, rec);
+
+    case REC_TYPE_TSC_INFO:
+        return handle_tsc_info(ctx, rec);
+
+    case REC_TYPE_X86_PV_VCPU_MSRS:
+        return handle_x86_pv_vcpu_msrs(ctx, rec);
+
+    default:
+        if ( rec->type & REC_TYPE_OPTIONAL ) /* Unknown records flagged optional are skipped rather than failing the restore. */
+        {
+            IPRINTF("Ignoring optional record (0x%"PRIx32", %s)",
+                    rec->type, rec_type_to_str(rec->type));
+            return 0;
+        }
+
+        ERROR("Invalid record type (0x%"PRIx32", %s) for x86_pv domains",
+              rec->type, rec_type_to_str(rec->type));
+        return -1;
+    }
+}
+
+/*
+ * restore_ops function.  Pin the pagetables, rewrite the p2m and seed the
+ * grant table.
+ *
+ * The three steps run in that order and the first failure is returned
+ * immediately.  Returns 0 on success, nonzero on error.
+ */
+static int x86_pv_stream_complete(struct context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int rc;
+
+    rc = pin_pagetables(ctx);
+    if ( rc )
+        return rc;
+
+    rc = update_guest_p2m(ctx);
+    if ( rc )
+        return rc;
+
+    rc = xc_dom_gnttab_seed(xch, ctx->domid,
+                            ctx->restore.console_mfn,
+                            ctx->restore.xenstore_mfn,
+                            ctx->restore.console_domid,
+                            ctx->restore.xenstore_domid); /* Uses the console/xenstore mfns recorded while processing start_info. */
+    if ( rc )
+    {
+        PERROR("Failed to seed grant table");
+        return rc;
+    }
+
+    return rc;
+}
+
+/*
+ * restore_ops function.
+ */
+static int x86_pv_cleanup(struct context *ctx)
+{
+    free(ctx->x86_pv.p2m); /* free(NULL) is a no-op, so no guards are needed. */
+    free(ctx->x86_pv.p2m_pfns);
+    free(ctx->x86_pv.pfn_types);
+
+    if ( ctx->x86_pv.m2p ) /* Only unmap the m2p if it was mapped. */
+        munmap(ctx->x86_pv.m2p, ctx->x86_pv.nr_m2p_frames * PAGE_SIZE);
+
+    return 0; /* Cleanup always reports success. */
+}
+
+struct restore_ops restore_ops_x86_pv = /* x86 PV restore hooks; declared extern in common.h. */
+{
+    .localise_page   = x86_pv_localise_page,
+    .setup           = x86_pv_setup,
+    .process_record  = x86_pv_process_record,
+    .stream_complete = x86_pv_stream_complete,
+    .cleanup         = x86_pv_cleanup,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
-- 
1.7.10.4