From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrew Cooper Subject: [PATCH 1/6] tools/libxc: Remove legacy migration implementation Date: Mon, 20 Jul 2015 11:37:54 +0100 Message-ID: <1437388679-16468-2-git-send-email-andrew.cooper3@citrix.com> References: <1437388679-16468-1-git-send-email-andrew.cooper3@citrix.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <1437388679-16468-1-git-send-email-andrew.cooper3@citrix.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Xen-devel Cc: Andrew Cooper , Ian Jackson , Ian Campbell , Wei Liu List-Id: xen-devel@lists.xenproject.org It is no longer used. One complication is that xc_map_m2p() has users in xc_offline_page.c, xen-mfndump and xen-mceinj. Move its implementation into xc_offline_page (for want of a better location) beside its current user. Signed-off-by: Andrew Cooper CC: Ian Campbell CC: Ian Jackson CC: Wei Liu --- Going forwards, xc_map_m2p() should move into libxc (being host specific rather than guest specific) and gain a slightly more rational API to consolidate several open-coded instances throughout tools/. 
However, that is very much 4.7 work, given the current timescale on 4.6 --- tools/libxc/Makefile | 1 - tools/libxc/xc_domain_restore.c | 2411 --------------------------------------- tools/libxc/xc_domain_save.c | 2198 ----------------------------------- tools/libxc/xc_offline_page.c | 59 + tools/libxc/xg_save_restore.h | 247 ---- 5 files changed, 59 insertions(+), 4857 deletions(-) delete mode 100644 tools/libxc/xc_domain_restore.c delete mode 100644 tools/libxc/xc_domain_save.c diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile index 1aec848..a026c4e 100644 --- a/tools/libxc/Makefile +++ b/tools/libxc/Makefile @@ -53,7 +53,6 @@ CTRL_SRCS-$(CONFIG_MiniOS) += xc_minios.c GUEST_SRCS-y := GUEST_SRCS-y += xg_private.c xc_suspend.c ifeq ($(CONFIG_MIGRATE),y) -GUEST_SRCS-y += xc_domain_restore.c xc_domain_save.c GUEST_SRCS-y += xc_sr_common.c GUEST_SRCS-$(CONFIG_X86) += xc_sr_common_x86.c GUEST_SRCS-$(CONFIG_X86) += xc_sr_common_x86_pv.c diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c deleted file mode 100644 index 8435f6b..0000000 --- a/tools/libxc/xc_domain_restore.c +++ /dev/null @@ -1,2411 +0,0 @@ -/****************************************************************************** - * xc_domain_restore.c - * - * Restore the state of a guest session. - * - * Copyright (c) 2003, K A Fraser. - * Copyright (c) 2006, Intel Corporation - * Copyright (c) 2007, XenSource Inc. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* - * The superpages flag in restore has two different meanings depending on - * the type of domain. - * - * For an HVM domain, the flag means to look for properly aligned contiguous - * pages and try to allocate a superpage to satisfy it. If that fails, - * fall back to small pages. - * - * For a PV domain, the flag means allocate all memory as superpages. If that - * fails, the restore fails. This behavior is required for PV guests who - * want to use superpages. - */ - -#include -#include -#include - -#include "xg_private.h" -#include "xg_save_restore.h" -#include "xc_dom.h" - -#include -#include - -struct restore_ctx { - unsigned long max_mfn; /* max mfn of the current host machine */ - unsigned long hvirt_start; /* virtual starting address of the hypervisor */ - unsigned int pt_levels; /* #levels of page tables used by the current guest */ - unsigned long nr_pfns; /* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */ - xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */ - xen_pfn_t *p2m; /* A table mapping each PFN to its new MFN. */ - xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region. */ - xen_pfn_t *p2m_saved_batch; /* Copy of p2m_batch array for pv superpage alloc */ - int superpages; /* Superpage allocation has been requested */ - int hvm; /* This is an hvm domain */ - int completed; /* Set when a consistent image is available */ - int last_checkpoint; /* Set when we should commit to the current checkpoint when it completes. 
*/ - int compressing; /* Set when sender signals that pages would be sent compressed (for Remus) */ - struct domain_info_context dinfo; -}; - -#define HEARTBEAT_MS 1000 - -#ifndef __MINIOS__ -static ssize_t rdexact(xc_interface *xch, struct restore_ctx *ctx, - int fd, void* buf, size_t size) -{ - size_t offset = 0; - ssize_t len; - struct timeval tv; - fd_set rfds; - - while ( offset < size ) - { - if ( ctx->completed ) { - /* expect a heartbeat every HEARBEAT_MS ms maximum */ - tv.tv_sec = HEARTBEAT_MS / 1000; - tv.tv_usec = (HEARTBEAT_MS % 1000) * 1000; - - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - len = select(fd + 1, &rfds, NULL, NULL, &tv); - if ( len == -1 && errno == EINTR ) - continue; - if ( !FD_ISSET(fd, &rfds) ) { - ERROR("%s failed (select returned %zd)", __func__, len); - errno = ETIMEDOUT; - return -1; - } - } - - len = read(fd, buf + offset, size - offset); - if ( (len == -1) && ((errno == EINTR) || (errno == EAGAIN)) ) - continue; - if ( len == 0 ) { - ERROR("0-length read"); - errno = 0; - } - if ( len <= 0 ) { - ERROR("%s failed (read rc: %zd, errno: %d)", __func__, len, errno); - return -1; - } - offset += len; - } - - return 0; -} - -#define RDEXACT(fd,buf,size) rdexact(xch, ctx, fd, buf, size) -#else -#define RDEXACT read_exact -#endif - -#define SUPERPAGE_PFN_SHIFT 9 -#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT) -#define SUPERPAGE(_pfn) ((_pfn) & (~(SUPERPAGE_NR_PFNS-1))) -#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 ) - -/* -** When we're restoring into a pv superpage-allocated guest, we take -** a copy of the p2m_batch array to preserve the pfn, then allocate the -** corresponding superpages. We then fill in the p2m array using the saved -** pfns. 
-*/ -static int alloc_superpage_mfns( - xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, int nr_mfns) -{ - int i, j, max = 0; - unsigned long pfn, base_pfn, mfn; - - for (i = 0; i < nr_mfns; i++) - { - pfn = ctx->p2m_batch[i]; - base_pfn = SUPERPAGE(pfn); - if (ctx->p2m[base_pfn] != (INVALID_P2M_ENTRY-2)) - { - ctx->p2m_saved_batch[max] = base_pfn; - ctx->p2m_batch[max] = base_pfn; - max++; - ctx->p2m[base_pfn] = INVALID_P2M_ENTRY-2; - } - } - if (xc_domain_populate_physmap_exact(xch, dom, max, SUPERPAGE_PFN_SHIFT, - 0, ctx->p2m_batch) != 0) - return 1; - - for (i = 0; i < max; i++) - { - mfn = ctx->p2m_batch[i]; - pfn = ctx->p2m_saved_batch[i]; - for (j = 0; j < SUPERPAGE_NR_PFNS; j++) - ctx->p2m[pfn++] = mfn++; - } - return 0; -} -/* -** In the state file (or during transfer), all page-table pages are -** converted into a 'canonical' form where references to actual mfns -** are replaced with references to the corresponding pfns. -** This function inverts that operation, replacing the pfn values with -** the (now known) appropriate mfn values. -*/ -static int uncanonicalize_pagetable( - xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, void *page) -{ - int i, rc, pte_last, nr_mfns = 0; - unsigned long pfn; - uint64_t pte; - struct domain_info_context *dinfo = &ctx->dinfo; - - pte_last = PAGE_SIZE / 8; - - /* First pass: work out how many (if any) MFNs we need to alloc */ - for ( i = 0; i < pte_last; i++ ) - { - pte = ((uint64_t *)page)[i]; - - /* XXX SMH: below needs fixing for PROT_NONE etc */ - if ( !(pte & _PAGE_PRESENT) ) - continue; - - pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86; - - if ( pfn >= dinfo->p2m_size ) - { - /* This "page table page" is probably not one; bail. 
*/ - ERROR("Frame number in page table is out of range: " - "i=%d pfn=0x%lx p2m_size=%lu", - i, pfn, dinfo->p2m_size); - return 0; - } - - if ( ctx->p2m[pfn] == INVALID_P2M_ENTRY ) - { - /* Have a 'valid' PFN without a matching MFN - need to alloc */ - ctx->p2m_batch[nr_mfns++] = pfn; - ctx->p2m[pfn]--; - } - } - - /* Allocate the requisite number of mfns. */ - if (nr_mfns) - { - if (!ctx->hvm && ctx->superpages) - rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns); - else - rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0, - ctx->p2m_batch); - - if (rc) - { - ERROR("Failed to allocate memory for batch.!\n"); - errno = ENOMEM; - return 0; - } - } - - /* Second pass: uncanonicalize each present PTE */ - nr_mfns = 0; - for ( i = 0; i < pte_last; i++ ) - { - pte = ((uint64_t *)page)[i]; - - /* XXX SMH: below needs fixing for PROT_NONE etc */ - if ( !(pte & _PAGE_PRESENT) ) - continue; - - pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86; - - if ( ctx->p2m[pfn] == (INVALID_P2M_ENTRY-1) ) - ctx->p2m[pfn] = ctx->p2m_batch[nr_mfns++]; - - pte &= ~MADDR_MASK_X86; - pte |= (uint64_t)ctx->p2m[pfn] << PAGE_SHIFT; - - ((uint64_t *)page)[i] = (uint64_t)pte; - } - - return 1; -} - - -/* Load the p2m frame list, plus potential extended info chunk */ -static xen_pfn_t *load_p2m_frame_list( - xc_interface *xch, struct restore_ctx *ctx, - int io_fd, int *pae_extended_cr3, int *ext_vcpucontext, - uint32_t *vcpuextstate_size) -{ - xen_pfn_t *p2m_frame_list; - vcpu_guest_context_any_t ctxt; - xen_pfn_t p2m_fl_zero; - struct domain_info_context *dinfo = &ctx->dinfo; - - /* Read first entry of P2M list, or extended-info signature (~0UL). */ - if ( RDEXACT(io_fd, &p2m_fl_zero, sizeof(long)) ) - { - PERROR("read extended-info signature failed"); - return NULL; - } - - if ( p2m_fl_zero == ~0UL ) - { - uint32_t tot_bytes; - - /* Next 4 bytes: total size of following extended info. 
*/ - if ( RDEXACT(io_fd, &tot_bytes, sizeof(tot_bytes)) ) - { - PERROR("read extended-info size failed"); - return NULL; - } - - while ( tot_bytes ) - { - uint32_t chunk_bytes; - char chunk_sig[4]; - - /* 4-character chunk signature + 4-byte remaining chunk size. */ - if ( RDEXACT(io_fd, chunk_sig, sizeof(chunk_sig)) || - RDEXACT(io_fd, &chunk_bytes, sizeof(chunk_bytes)) || - (tot_bytes < (chunk_bytes + 8)) ) - { - PERROR("read extended-info chunk signature failed"); - return NULL; - } - tot_bytes -= 8; - - /* VCPU context structure? */ - if ( !strncmp(chunk_sig, "vcpu", 4) ) - { - /* Pick a guest word-size and PT depth from the ctxt size */ - if ( chunk_bytes == sizeof (ctxt.x32) ) - { - dinfo->guest_width = 4; - ctx->pt_levels = 3; - } - else if ( chunk_bytes == sizeof (ctxt.x64) ) - { - dinfo->guest_width = 8; - ctx->pt_levels = 4; - } - else - { - ERROR("bad extended-info context size %d", chunk_bytes); - return NULL; - } - - if ( RDEXACT(io_fd, &ctxt, chunk_bytes) ) - { - PERROR("read extended-info vcpu context failed"); - return NULL; - } - tot_bytes -= chunk_bytes; - chunk_bytes = 0; - - if ( GET_FIELD(&ctxt, vm_assist, dinfo->guest_width) - & (1UL << VMASST_TYPE_pae_extended_cr3) ) - *pae_extended_cr3 = 1; - } - else if ( !strncmp(chunk_sig, "extv", 4) ) - { - *ext_vcpucontext = 1; - } - else if ( !strncmp(chunk_sig, "xcnt", 4) ) - { - if ( RDEXACT(io_fd, vcpuextstate_size, sizeof(*vcpuextstate_size)) ) - { - PERROR("read extended vcpu state size failed"); - return NULL; - } - tot_bytes -= chunk_bytes; - chunk_bytes = 0; - } - - /* Any remaining bytes of this chunk: read and discard. */ - while ( chunk_bytes ) - { - unsigned long sz = min_t(unsigned long, chunk_bytes, sizeof(xen_pfn_t)); - if ( RDEXACT(io_fd, &p2m_fl_zero, sz) ) - { - PERROR("read-and-discard extended-info chunk bytes failed"); - return NULL; - } - chunk_bytes -= sz; - tot_bytes -= sz; - } - } - - /* Now read the real first entry of P2M list. 
*/ - if ( RDEXACT(io_fd, &p2m_fl_zero, sizeof(xen_pfn_t)) ) - { - PERROR("read first entry of p2m_frame_list failed"); - return NULL; - } - } - - /* Now that we know the guest's word-size, can safely allocate - * the p2m frame list */ - if ( (p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) == NULL ) - { - ERROR("Couldn't allocate p2m_frame_list array"); - return NULL; - } - - /* First entry has already been read. */ - p2m_frame_list[0] = p2m_fl_zero; - if ( RDEXACT(io_fd, &p2m_frame_list[1], - (P2M_FL_ENTRIES - 1) * sizeof(xen_pfn_t)) ) - { - PERROR("read p2m_frame_list failed"); - free(p2m_frame_list); - return NULL; - } - - return p2m_frame_list; -} - -typedef struct { - int ishvm; - union { - struct tailbuf_pv { - unsigned int pfncount; - unsigned long* pfntab; - unsigned int vcpucount; - unsigned char* vcpubuf; - unsigned char shared_info_page[PAGE_SIZE]; - } pv; - struct tailbuf_hvm { - uint64_t magicpfns[3]; - uint32_t hvmbufsize, reclen; - uint8_t* hvmbuf; - struct { - uint32_t magic; - uint32_t version; - uint64_t len; - } qemuhdr; - uint32_t qemubufsize; - uint8_t* qemubuf; - } hvm; - } u; -} tailbuf_t; - -/* read stream until EOF, growing buffer as necssary */ -static int compat_buffer_qemu(xc_interface *xch, struct restore_ctx *ctx, - int fd, struct tailbuf_hvm *buf) -{ - uint8_t *qbuf, *tmp; - int blen = 0, dlen = 0; - int rc; - - /* currently save records tend to be about 7K */ - blen = 8192; - if ( !(qbuf = malloc(blen)) ) { - ERROR("Error allocating QEMU buffer"); - return -1; - } - - while( (rc = read(fd, qbuf+dlen, blen-dlen)) > 0 ) { - DPRINTF("Read %d bytes of QEMU data\n", rc); - dlen += rc; - - if (dlen == blen) { - DPRINTF("%d-byte QEMU buffer full, reallocating...\n", dlen); - blen += 4096; - tmp = realloc(qbuf, blen); - if ( !tmp ) { - ERROR("Error growing QEMU buffer to %d bytes", blen); - free(qbuf); - return -1; - } - qbuf = tmp; - } - } - - if ( rc < 0 ) { - ERROR("Error reading QEMU data"); - free(qbuf); - return -1; - } - - if ( 
memcmp(qbuf, "QEVM", 4) ) { - ERROR("Invalid QEMU magic: 0x%08"PRIx32, *(uint32_t*)qbuf); - free(qbuf); - return -1; - } - - buf->qemubuf = qbuf; - buf->qemubufsize = dlen; - - return 0; -} - -static int buffer_qemu(xc_interface *xch, struct restore_ctx *ctx, - int fd, struct tailbuf_hvm *buf) -{ - uint32_t qlen; - uint8_t *tmp; - - if ( RDEXACT(fd, &qlen, sizeof(qlen)) ) { - PERROR("Error reading QEMU header length"); - return -1; - } - - if ( qlen > buf->qemubufsize ) { - if ( buf->qemubuf) { - tmp = realloc(buf->qemubuf, qlen); - if ( tmp ) - buf->qemubuf = tmp; - else { - ERROR("Error reallocating QEMU state buffer"); - return -1; - } - } else { - buf->qemubuf = malloc(qlen); - if ( !buf->qemubuf ) { - ERROR("Error allocating QEMU state buffer"); - return -1; - } - } - } - buf->qemubufsize = qlen; - - if ( RDEXACT(fd, buf->qemubuf, buf->qemubufsize) ) { - PERROR("Error reading QEMU state"); - return -1; - } - - return 0; -} - -static int dump_qemu(xc_interface *xch, uint32_t dom, struct tailbuf_hvm *buf) -{ - int saved_errno; - char path[256]; - FILE *fp; - - sprintf(path, XC_DEVICE_MODEL_RESTORE_FILE".%u", dom); - fp = fopen(path, "wb"); - if ( !fp ) - return -1; - - DPRINTF("Writing %d bytes of QEMU data\n", buf->qemubufsize); - if ( fwrite(buf->qemubuf, 1, buf->qemubufsize, fp) != buf->qemubufsize) { - saved_errno = errno; - fclose(fp); - errno = saved_errno; - return -1; - } - - fclose(fp); - - return 0; -} - -static int buffer_tail_hvm(xc_interface *xch, struct restore_ctx *ctx, - struct tailbuf_hvm *buf, int fd, - unsigned int max_vcpu_id, uint64_t *vcpumap, - int ext_vcpucontext, - uint32_t vcpuextstate_size) -{ - uint8_t *tmp; - unsigned char qemusig[21]; - - if ( RDEXACT(fd, buf->magicpfns, sizeof(buf->magicpfns)) ) { - PERROR("Error reading magic PFNs"); - return -1; - } - - if ( RDEXACT(fd, &buf->reclen, sizeof(buf->reclen)) ) { - PERROR("Error reading HVM params size"); - return -1; - } - - if ( buf->reclen > buf->hvmbufsize ) { - if ( buf->hvmbuf) 
{ - tmp = realloc(buf->hvmbuf, buf->reclen); - if ( tmp ) { - buf->hvmbuf = tmp; - buf->hvmbufsize = buf->reclen; - } else { - ERROR("Error reallocating HVM param buffer"); - return -1; - } - } else { - buf->hvmbuf = malloc(buf->reclen); - if ( !buf->hvmbuf ) { - ERROR("Error allocating HVM param buffer"); - return -1; - } - buf->hvmbufsize = buf->reclen; - } - } - - if ( RDEXACT(fd, buf->hvmbuf, buf->reclen) ) { - PERROR("Error reading HVM params"); - return -1; - } - - if ( RDEXACT(fd, qemusig, sizeof(qemusig)) ) { - PERROR("Error reading QEMU signature"); - return -1; - } - - /* The legacy live-migration QEMU record has no length information. - * Short of reimplementing the QEMU parser, we're forced to just read - * until EOF. - * - * Gets around this by sending a different signatures for the new - * live-migration QEMU record and Remus which includes a length - * prefix - */ - if ( !memcmp(qemusig, "QemuDeviceModelRecord", sizeof(qemusig)) ) - return compat_buffer_qemu(xch, ctx, fd, buf); - else if ( !memcmp(qemusig, "DeviceModelRecord0002", sizeof(qemusig)) || - !memcmp(qemusig, "RemusDeviceModelState", sizeof(qemusig)) ) - return buffer_qemu(xch, ctx, fd, buf); - - qemusig[20] = '\0'; - ERROR("Invalid QEMU signature: %s", qemusig); - return -1; -} - -static int buffer_tail_pv(xc_interface *xch, struct restore_ctx *ctx, - struct tailbuf_pv *buf, int fd, - unsigned int max_vcpu_id, uint64_t *vcpumap, - int ext_vcpucontext, - uint32_t vcpuextstate_size) -{ - unsigned int i; - size_t pfnlen, vcpulen; - struct domain_info_context *dinfo = &ctx->dinfo; - - /* TODO: handle changing pfntab and vcpu counts */ - /* PFN tab */ - if ( RDEXACT(fd, &buf->pfncount, sizeof(buf->pfncount)) || - (buf->pfncount > (1U << 28)) ) /* up to 1TB of address space */ - { - PERROR("Error when reading pfn count"); - return -1; - } - pfnlen = sizeof(unsigned long) * buf->pfncount; - if ( !(buf->pfntab) ) { - if ( !(buf->pfntab = malloc(pfnlen)) ) { - ERROR("Error allocating PFN tail 
buffer"); - return -1; - } - } - // DPRINTF("Reading PFN tab: %d bytes\n", pfnlen); - if ( RDEXACT(fd, buf->pfntab, pfnlen) ) { - PERROR("Error when reading pfntab"); - goto free_pfntab; - } - - /* VCPU contexts */ - buf->vcpucount = 0; - for (i = 0; i <= max_vcpu_id; i++) { - // DPRINTF("vcpumap: %llx, cpu: %d, bit: %llu\n", vcpumap[i/64], i, (vcpumap[i/64] & (1ULL << (i%64)))); - if ( (!(vcpumap[i/64] & (1ULL << (i%64)))) ) - continue; - buf->vcpucount++; - } - // DPRINTF("VCPU count: %d\n", buf->vcpucount); - vcpulen = ((dinfo->guest_width == 8) ? sizeof(vcpu_guest_context_x86_64_t) - : sizeof(vcpu_guest_context_x86_32_t)) * buf->vcpucount; - if ( ext_vcpucontext ) - vcpulen += 128 * buf->vcpucount; - vcpulen += vcpuextstate_size * buf->vcpucount; - - if ( !(buf->vcpubuf) ) { - if ( !(buf->vcpubuf = malloc(vcpulen)) ) { - ERROR("Error allocating VCPU ctxt tail buffer"); - goto free_pfntab; - } - } - // DPRINTF("Reading VCPUS: %d bytes\n", vcpulen); - if ( RDEXACT(fd, buf->vcpubuf, vcpulen) ) { - PERROR("Error when reading ctxt"); - goto free_vcpus; - } - - /* load shared_info_page */ - // DPRINTF("Reading shared info: %lu bytes\n", PAGE_SIZE); - if ( RDEXACT(fd, buf->shared_info_page, PAGE_SIZE) ) { - PERROR("Error when reading shared info page"); - goto free_vcpus; - } - - return 0; - - free_vcpus: - if (buf->vcpubuf) { - free (buf->vcpubuf); - buf->vcpubuf = NULL; - } - free_pfntab: - if (buf->pfntab) { - free (buf->pfntab); - buf->pfntab = NULL; - } - - return -1; -} - -static int buffer_tail(xc_interface *xch, struct restore_ctx *ctx, - tailbuf_t *buf, int fd, unsigned int max_vcpu_id, - uint64_t *vcpumap, int ext_vcpucontext, - uint32_t vcpuextstate_size) -{ - if ( buf->ishvm ) - return buffer_tail_hvm(xch, ctx, &buf->u.hvm, fd, max_vcpu_id, vcpumap, - ext_vcpucontext, vcpuextstate_size); - else - return buffer_tail_pv(xch, ctx, &buf->u.pv, fd, max_vcpu_id, vcpumap, - ext_vcpucontext, vcpuextstate_size); -} - -static void tailbuf_free_hvm(struct tailbuf_hvm 
*buf) -{ - if ( buf->hvmbuf ) { - free(buf->hvmbuf); - buf->hvmbuf = NULL; - } - if ( buf->qemubuf ) { - free(buf->qemubuf); - buf->qemubuf = NULL; - } -} - -static void tailbuf_free_pv(struct tailbuf_pv *buf) -{ - if ( buf->vcpubuf ) { - free(buf->vcpubuf); - buf->vcpubuf = NULL; - } - if ( buf->pfntab ) { - free(buf->pfntab); - buf->pfntab = NULL; - } -} - -static void tailbuf_free(tailbuf_t *buf) -{ - if ( buf->ishvm ) - tailbuf_free_hvm(&buf->u.hvm); - else - tailbuf_free_pv(&buf->u.pv); -} - -struct toolstack_data_t { - uint8_t *data; - uint32_t len; -}; - -typedef struct { - void* pages; - /* pages is of length nr_physpages, pfn_types is of length nr_pages */ - unsigned int nr_physpages, nr_pages; - - /* checkpoint compression state */ - int compressing; - unsigned long compbuf_pos, compbuf_size; - - /* Types of the pfns in the current region */ - unsigned long* pfn_types; - - int verify; - - int new_ctxt_format; - int max_vcpu_id; - uint64_t vcpumap[XC_SR_MAX_VCPUS/64]; - uint64_t identpt; - uint64_t paging_ring_pfn; - uint64_t monitor_ring_pfn; - uint64_t sharing_ring_pfn; - uint64_t vm86_tss; - uint64_t console_pfn; - uint64_t acpi_ioport_location; - uint64_t viridian; - uint64_t vm_generationid_addr; - uint64_t ioreq_server_pfn; - uint64_t nr_ioreq_server_pages; - - struct toolstack_data_t tdata; -} pagebuf_t; - -static int pagebuf_init(pagebuf_t* buf) -{ - memset(buf, 0, sizeof(*buf)); - return 0; -} - -static void pagebuf_free(pagebuf_t* buf) -{ - if (buf->tdata.data != NULL) { - free(buf->tdata.data); - buf->tdata.data = NULL; - } - if (buf->pages) { - free(buf->pages); - buf->pages = NULL; - } - if(buf->pfn_types) { - free(buf->pfn_types); - buf->pfn_types = NULL; - } -} - -static int pagebuf_get_one(xc_interface *xch, struct restore_ctx *ctx, - pagebuf_t* buf, int fd, uint32_t dom) -{ - int count, countpages, oldcount, i; - void* ptmp; - unsigned long compbuf_size; - - if ( RDEXACT(fd, &count, sizeof(count)) ) - { - PERROR("Error when reading batch 
size"); - return -1; - } - - // DPRINTF("reading batch of %d pages\n", count); - - switch ( count ) - { - case 0: - // DPRINTF("Last batch read\n"); - return 0; - - case XC_SAVE_ID_ENABLE_VERIFY_MODE: - DPRINTF("Entering page verify mode\n"); - buf->verify = 1; - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_VCPU_INFO: - buf->new_ctxt_format = 1; - if ( RDEXACT(fd, &buf->max_vcpu_id, sizeof(buf->max_vcpu_id)) || - buf->max_vcpu_id >= XC_SR_MAX_VCPUS || - RDEXACT(fd, buf->vcpumap, vcpumap_sz(buf->max_vcpu_id)) ) { - PERROR("Error when reading max_vcpu_id"); - return -1; - } - // DPRINTF("Max VCPU ID: %d, vcpumap: %llx\n", buf->max_vcpu_id, buf->vcpumap[0]); - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_HVM_IDENT_PT: - /* Skip padding 4 bytes then read the EPT identity PT location. */ - if ( RDEXACT(fd, &buf->identpt, sizeof(uint32_t)) || - RDEXACT(fd, &buf->identpt, sizeof(uint64_t)) ) - { - PERROR("error read the address of the EPT identity map"); - return -1; - } - // DPRINTF("EPT identity map address: %llx\n", buf->identpt); - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_HVM_PAGING_RING_PFN: - /* Skip padding 4 bytes then read the paging ring location. */ - if ( RDEXACT(fd, &buf->paging_ring_pfn, sizeof(uint32_t)) || - RDEXACT(fd, &buf->paging_ring_pfn, sizeof(uint64_t)) ) - { - PERROR("error read the paging ring pfn"); - return -1; - } - // DPRINTF("paging ring pfn address: %llx\n", buf->paging_ring_pfn); - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_HVM_MONITOR_RING_PFN: - /* Skip padding 4 bytes then read the mem access ring location. 
*/ - if ( RDEXACT(fd, &buf->monitor_ring_pfn, sizeof(uint32_t)) || - RDEXACT(fd, &buf->monitor_ring_pfn, sizeof(uint64_t)) ) - { - PERROR("error read the access ring pfn"); - return -1; - } - // DPRINTF("monitor ring pfn address: %llx\n", buf->monitor_ring_pfn); - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_HVM_SHARING_RING_PFN: - /* Skip padding 4 bytes then read the sharing ring location. */ - if ( RDEXACT(fd, &buf->sharing_ring_pfn, sizeof(uint32_t)) || - RDEXACT(fd, &buf->sharing_ring_pfn, sizeof(uint64_t)) ) - { - PERROR("error read the sharing ring pfn"); - return -1; - } - // DPRINTF("sharing ring pfn address: %llx\n", buf->sharing_ring_pfn); - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_HVM_VM86_TSS: - /* Skip padding 4 bytes then read the vm86 TSS location. */ - if ( RDEXACT(fd, &buf->vm86_tss, sizeof(uint32_t)) || - RDEXACT(fd, &buf->vm86_tss, sizeof(uint64_t)) ) - { - PERROR("error read the address of the vm86 TSS"); - return -1; - } - // DPRINTF("VM86 TSS location: %llx\n", buf->vm86_tss); - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_TMEM: - DPRINTF("xc_domain_restore start tmem\n"); - if ( xc_tmem_restore(xch, dom, fd) ) { - PERROR("error reading/restoring tmem"); - return -1; - } - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_TMEM_EXTRA: - if ( xc_tmem_restore_extra(xch, dom, fd) ) { - PERROR("error reading/restoring tmem extra"); - return -1; - } - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_TSC_INFO: - { - uint32_t tsc_mode, khz, incarn; - uint64_t nsec; - if ( RDEXACT(fd, &tsc_mode, sizeof(uint32_t)) || - RDEXACT(fd, &nsec, sizeof(uint64_t)) || - RDEXACT(fd, &khz, sizeof(uint32_t)) || - RDEXACT(fd, &incarn, sizeof(uint32_t)) || - xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) { - PERROR("error reading/restoring tsc info"); - return -1; - } - return pagebuf_get_one(xch, ctx, buf, fd, dom); - } - - case 
XC_SAVE_ID_HVM_CONSOLE_PFN : - /* Skip padding 4 bytes then read the console pfn location. */ - if ( RDEXACT(fd, &buf->console_pfn, sizeof(uint32_t)) || - RDEXACT(fd, &buf->console_pfn, sizeof(uint64_t)) ) - { - PERROR("error read the address of the console pfn"); - return -1; - } - // DPRINTF("console pfn location: %llx\n", buf->console_pfn); - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_LAST_CHECKPOINT: - ctx->last_checkpoint = 1; - // DPRINTF("last checkpoint indication received"); - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION: - /* Skip padding 4 bytes then read the acpi ioport location. */ - if ( RDEXACT(fd, &buf->acpi_ioport_location, sizeof(uint32_t)) || - RDEXACT(fd, &buf->acpi_ioport_location, sizeof(uint64_t)) ) - { - PERROR("error read the acpi ioport location"); - return -1; - } - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_HVM_VIRIDIAN: - /* Skip padding 4 bytes then read the acpi ioport location. */ - if ( RDEXACT(fd, &buf->viridian, sizeof(uint32_t)) || - RDEXACT(fd, &buf->viridian, sizeof(uint64_t)) ) - { - PERROR("error reading the viridian enlightenments"); - return -1; - } - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_TOOLSTACK: - { - if ( RDEXACT(fd, &buf->tdata.len, sizeof(buf->tdata.len)) ) - { - PERROR("error read toolstack id size"); - return -1; - } - buf->tdata.data = (uint8_t*) realloc(buf->tdata.data, buf->tdata.len); - if ( buf->tdata.data == NULL ) - { - PERROR("error memory allocation"); - return -1; - } - if ( RDEXACT(fd, buf->tdata.data, buf->tdata.len) ) - { - PERROR("error read toolstack id"); - return -1; - } - return pagebuf_get_one(xch, ctx, buf, fd, dom); - } - - case XC_SAVE_ID_ENABLE_COMPRESSION: - /* We cannot set compression flag directly in pagebuf structure, - * since this pagebuf still has uncompressed pages that are yet to - * be applied. 
We enable the compression field in pagebuf structure - * after receiving the first tailbuf. - */ - ctx->compressing = 1; - // DPRINTF("compression flag received"); - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_COMPRESSED_DATA: - - /* read the length of compressed chunk coming in */ - if ( RDEXACT(fd, &compbuf_size, sizeof(unsigned long)) ) - { - PERROR("Error when reading compbuf_size"); - return -1; - } - if (!compbuf_size) return 1; - - buf->compbuf_size += compbuf_size; - if (!(ptmp = realloc(buf->pages, buf->compbuf_size))) { - ERROR("Could not (re)allocate compression buffer"); - return -1; - } - buf->pages = ptmp; - - if ( RDEXACT(fd, buf->pages + (buf->compbuf_size - compbuf_size), - compbuf_size) ) { - PERROR("Error when reading compression buffer"); - return -1; - } - return compbuf_size; - - case XC_SAVE_ID_HVM_GENERATION_ID_ADDR: - /* Skip padding 4 bytes then read the generation id buffer location. */ - if ( RDEXACT(fd, &buf->vm_generationid_addr, sizeof(uint32_t)) || - RDEXACT(fd, &buf->vm_generationid_addr, sizeof(uint64_t)) ) - { - PERROR("error read the generation id buffer location"); - return -1; - } - DPRINTF("read generation id buffer address"); - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_HVM_IOREQ_SERVER_PFN: - /* Skip padding 4 bytes then read the ioreq server gmfn base. */ - if ( RDEXACT(fd, &buf->ioreq_server_pfn, sizeof(uint32_t)) || - RDEXACT(fd, &buf->ioreq_server_pfn, sizeof(uint64_t)) ) - { - PERROR("error read the ioreq server gmfn base"); - return -1; - } - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - case XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES: - /* Skip padding 4 bytes then read the ioreq server gmfn count. 
*/ - if ( RDEXACT(fd, &buf->nr_ioreq_server_pages, sizeof(uint32_t)) || - RDEXACT(fd, &buf->nr_ioreq_server_pages, sizeof(uint64_t)) ) - { - PERROR("error read the ioreq server gmfn count"); - return -1; - } - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - default: - if ( (count > MAX_BATCH_SIZE) || (count < 0) ) { - ERROR("Max batch size exceeded (%d). Giving up.", count); - errno = EMSGSIZE; - return -1; - } - break; - } - - oldcount = buf->nr_pages; - buf->nr_pages += count; - if (!buf->pfn_types) { - if (!(buf->pfn_types = malloc(buf->nr_pages * sizeof(*(buf->pfn_types))))) { - ERROR("Could not allocate PFN type buffer"); - return -1; - } - } else { - if (!(ptmp = realloc(buf->pfn_types, buf->nr_pages * sizeof(*(buf->pfn_types))))) { - ERROR("Could not reallocate PFN type buffer"); - return -1; - } - buf->pfn_types = ptmp; - } - if ( RDEXACT(fd, buf->pfn_types + oldcount, count * sizeof(*(buf->pfn_types)))) { - PERROR("Error when reading region pfn types"); - return -1; - } - - countpages = count; - for (i = oldcount; i < buf->nr_pages; ++i) - { - unsigned long pagetype; - - pagetype = buf->pfn_types[i] & XEN_DOMCTL_PFINFO_LTAB_MASK; - if ( pagetype == XEN_DOMCTL_PFINFO_XTAB || - pagetype == XEN_DOMCTL_PFINFO_BROKEN || - pagetype == XEN_DOMCTL_PFINFO_XALLOC ) - --countpages; - } - - if (!countpages) - return count; - - /* If Remus Checkpoint Compression is turned on, we will only be - * receiving the pfn lists now. The compressed pages will come in later, - * following a tuple. 
- */ - if (buf->compressing) - return pagebuf_get_one(xch, ctx, buf, fd, dom); - - oldcount = buf->nr_physpages; - buf->nr_physpages += countpages; - if (!buf->pages) { - if (!(buf->pages = malloc(buf->nr_physpages * PAGE_SIZE))) { - ERROR("Could not allocate page buffer"); - return -1; - } - } else { - if (!(ptmp = realloc(buf->pages, buf->nr_physpages * PAGE_SIZE))) { - ERROR("Could not reallocate page buffer"); - return -1; - } - buf->pages = ptmp; - } - if ( RDEXACT(fd, buf->pages + oldcount * PAGE_SIZE, countpages * PAGE_SIZE) ) { - PERROR("Error when reading pages"); - return -1; - } - - return count; -} - -static int pagebuf_get(xc_interface *xch, struct restore_ctx *ctx, - pagebuf_t* buf, int fd, uint32_t dom) -{ - int rc; - - buf->nr_physpages = buf->nr_pages = 0; - buf->compbuf_pos = buf->compbuf_size = 0; - - do { - rc = pagebuf_get_one(xch, ctx, buf, fd, dom); - } while (rc > 0); - - if (rc < 0) - pagebuf_free(buf); - - return rc; -} - -static int apply_batch(xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, - xen_pfn_t* region_mfn, unsigned long* pfn_type, int pae_extended_cr3, - struct xc_mmu* mmu, - pagebuf_t* pagebuf, int curbatch, int *invalid_pages) -{ - int i, j, curpage, nr_mfns; - int k, scount; - unsigned long superpage_start=INVALID_P2M_ENTRY; - /* used by debug verify code */ - unsigned long buf[PAGE_SIZE/sizeof(unsigned long)]; - /* Our mapping of the current region (batch) */ - char *region_base; - /* A temporary mapping, and a copy, of one frame of guest memory. */ - unsigned long *page = NULL; - int nraces = 0; - struct domain_info_context *dinfo = &ctx->dinfo; - int* pfn_err = NULL; - int rc = -1; - int local_invalid_pages = 0; - /* We have handled curbatch pages before this batch, and there are - * *invalid_pages pages that are not in pagebuf->pages. So the first - * page for this page is (curbatch - *invalid_pages) page. 
- */ - int first_page = curbatch - *invalid_pages; - - unsigned long mfn, pfn, pagetype; - - j = pagebuf->nr_pages - curbatch; - if (j > MAX_BATCH_SIZE) - j = MAX_BATCH_SIZE; - - /* First pass for this batch: work out how much memory to alloc, and detect superpages */ - nr_mfns = scount = 0; - for ( i = 0; i < j; i++ ) - { - unsigned long pfn, pagetype; - pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; - pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK; - - /* For allocation purposes, treat XEN_DOMCTL_PFINFO_XALLOC as a normal page */ - if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) && - (ctx->p2m[pfn] == INVALID_P2M_ENTRY) ) - { - /* Have a live PFN which hasn't had an MFN allocated */ - - /* Logic if we're in the middle of detecting a candidate superpage */ - if ( superpage_start != INVALID_P2M_ENTRY ) - { - /* Is this the next expected continuation? */ - if ( pfn == superpage_start + scount ) - { - if ( !ctx->superpages ) - { - ERROR("Unexpexted codepath with no superpages"); - return -1; - } - - scount++; - - /* If we've found a whole superpage, allocate it and update p2m */ - if ( scount == SUPERPAGE_NR_PFNS ) - { - unsigned long supermfn; - - - supermfn=superpage_start; - if ( xc_domain_populate_physmap_exact(xch, dom, 1, - SUPERPAGE_PFN_SHIFT, 0, &supermfn) != 0 ) - { - DPRINTF("No 2M page available for pfn 0x%lx, fall back to 4K page.\n", - superpage_start); - /* If we're falling back from a failed allocation, subtract one - * from count, since the last page == pfn, which will behandled - * anyway. 
*/ - scount--; - goto fallback; - } - - DPRINTF("Mapping superpage (%d) pfn %lx, mfn %lx\n", scount, superpage_start, supermfn); - for (k=0; k<scount; k++) - { - ctx->p2m[superpage_start+k] = supermfn+k; - ctx->nr_pfns++; - /* region_map[] will be set below */ - } - superpage_start=INVALID_P2M_ENTRY; - scount=0; - } - continue; - } - - fallback: - DPRINTF("Falling back %d pages pfn %lx\n", scount, superpage_start); - for (k=0; k<scount; k++) - { - ctx->p2m_batch[nr_mfns++] = superpage_start+k; - ctx->p2m[superpage_start+k]--; - } - superpage_start = INVALID_P2M_ENTRY; - scount=0; - } - - /* Are we ready to start a new superpage candidate? */ - if ( ctx->hvm && ctx->superpages && SUPER_PAGE_START(pfn) ) - { - superpage_start=pfn; - scount++; - } - else - { - /* Add the current pfn to pfn_batch */ - ctx->p2m_batch[nr_mfns++] = pfn; - ctx->p2m[pfn]--; - } - } - } - - /* Clean up any partial superpage candidates */ - if ( superpage_start != INVALID_P2M_ENTRY ) - { - DPRINTF("Falling back %d pages pfn %lx\n", scount, superpage_start); - for (k=0; k<scount; k++) - { - ctx->p2m_batch[nr_mfns++] = superpage_start+k; - ctx->p2m[superpage_start+k]--; - } - superpage_start = INVALID_P2M_ENTRY; - } - - /* Now allocate a bunch of mfns for this batch */ - if ( nr_mfns ) - { - DPRINTF("Mapping order 0, %d; first pfn %lx\n", nr_mfns, ctx->p2m_batch[0]); - - if (!ctx->hvm && ctx->superpages) - rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns); - else - rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0, - ctx->p2m_batch); - - if (rc) - { - ERROR("Failed to allocate memory for batch.!\n"); - errno = ENOMEM; - return -1; - } - } - - /* Second pass for this batch: update p2m[] and region_mfn[] */ - nr_mfns = 0; - for ( i = 0; i < j; i++ ) - { - unsigned long pfn, pagetype; - pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; - pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK; - - if ( pagetype != XEN_DOMCTL_PFINFO_XTAB - && ctx->p2m[pfn] == (INVALID_P2M_ENTRY-1) ) - { - /* We just allocated a new mfn 
above; update p2m */ - ctx->p2m[pfn] = ctx->p2m_batch[nr_mfns++]; - ctx->nr_pfns++; - } - - /* setup region_mfn[] for batch map, if necessary. - * For HVM guests, this interface takes PFNs, not MFNs */ - if ( pagetype == XEN_DOMCTL_PFINFO_XTAB - || pagetype == XEN_DOMCTL_PFINFO_XALLOC ) - region_mfn[i] = ~0UL; /* map will fail but we don't care */ - else - region_mfn[i] = ctx->hvm ? pfn : ctx->p2m[pfn]; - } - - /* Map relevant mfns */ - pfn_err = calloc(j, sizeof(*pfn_err)); - if ( pfn_err == NULL ) - { - PERROR("allocation for pfn_err failed"); - return -1; - } - region_base = xc_map_foreign_bulk( - xch, dom, PROT_WRITE, region_mfn, pfn_err, j); - - if ( region_base == NULL ) - { - PERROR("map batch failed"); - free(pfn_err); - return -1; - } - - for ( i = 0, curpage = -1; i < j; i++ ) - { - pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; - pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK; - - if ( pagetype == XEN_DOMCTL_PFINFO_XTAB - || pagetype == XEN_DOMCTL_PFINFO_XALLOC) - { - local_invalid_pages++; - /* a bogus/unmapped/allocate-only page: skip it */ - continue; - } - - if ( pagetype == XEN_DOMCTL_PFINFO_BROKEN ) - { - if ( xc_set_broken_page_p2m(xch, dom, pfn) ) - { - ERROR("Set p2m for broken page failed, " - "dom=%d, pfn=%lx\n", dom, pfn); - goto err_mapped; - } - - local_invalid_pages++; - continue; - } - - if (pfn_err[i]) - { - ERROR("unexpected PFN mapping failure pfn %lx map_mfn %lx p2m_mfn %lx", - pfn, region_mfn[i], ctx->p2m[pfn]); - goto err_mapped; - } - - ++curpage; - - if ( pfn > dinfo->p2m_size ) - { - ERROR("pfn out of range"); - goto err_mapped; - } - - pfn_type[pfn] = pagetype; - - mfn = ctx->p2m[pfn]; - - /* In verify mode, we use a copy; otherwise we work in place */ - page = pagebuf->verify ? 
(void *)buf : (region_base + i*PAGE_SIZE); - - /* Remus - page decompression */ - if (pagebuf->compressing) - { - if (xc_compression_uncompress_page(xch, pagebuf->pages, - pagebuf->compbuf_size, - &pagebuf->compbuf_pos, - (char *)page)) - { - ERROR("Failed to uncompress page (pfn=%lx)\n", pfn); - goto err_mapped; - } - } - else - memcpy(page, pagebuf->pages + (first_page + curpage) * PAGE_SIZE, - PAGE_SIZE); - - pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; - - if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && - (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) ) - { - /* - ** A page table page - need to 'uncanonicalize' it, i.e. - ** replace all the references to pfns with the corresponding - ** mfns for the new domain. - ** - ** On PAE we need to ensure that PGDs are in MFNs < 4G, and - ** so we may need to update the p2m after the main loop. - ** Hence we defer canonicalization of L1s until then. - */ - if ((ctx->pt_levels != 3) || - pae_extended_cr3 || - (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) { - - if (!uncanonicalize_pagetable(xch, dom, ctx, page)) { - /* - ** Failing to uncanonicalize a page table can be ok - ** under live migration since the pages type may have - ** changed by now (and we'll get an update later). 
- */ - DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n", - pagetype >> 28, pfn, mfn); - nraces++; - continue; - } - } - } - else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB ) - { - ERROR("Bogus page type %lx page table is out of range: " - "i=%d p2m_size=%lu", pagetype, i, dinfo->p2m_size); - goto err_mapped; - } - - if ( pagebuf->verify ) - { - int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE); - if ( res ) - { - int v; - - DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx " - "actualcs=%08lx\n", pfn, pfn_type[pfn], - csum_page(region_base + i * PAGE_SIZE), - csum_page(buf)); - - for ( v = 0; v < 4; v++ ) - { - unsigned long *p = (unsigned long *) - (region_base + i*PAGE_SIZE); - if ( buf[v] != p[v] ) - DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]); - } - } - } - - if ( !ctx->hvm && - xc_add_mmu_update(xch, mmu, - (((unsigned long long)mfn) << PAGE_SHIFT) - | MMU_MACHPHYS_UPDATE, pfn) ) - { - PERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn); - goto err_mapped; - } - } /* end of 'batch' for loop */ - - rc = nraces; - *invalid_pages += local_invalid_pages; - - err_mapped: - munmap(region_base, j*PAGE_SIZE); - free(pfn_err); - - return rc; -} - -int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom, - unsigned int store_evtchn, unsigned long *store_mfn, - domid_t store_domid, unsigned int console_evtchn, - unsigned long *console_mfn, domid_t console_domid, - unsigned int hvm, unsigned int pae, int superpages, - int checkpointed_stream, - struct restore_callbacks *callbacks) -{ - DECLARE_DOMCTL; - xc_dominfo_t info; - int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0; - uint32_t vcpuextstate_size = 0; - unsigned long mfn, pfn; - int nraces = 0; - - /* The new domain's shared-info frame number. 
*/ - unsigned long shared_info_frame; - unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */ - shared_info_any_t *old_shared_info = - (shared_info_any_t *)shared_info_page; - shared_info_any_t *new_shared_info; - - /* A copy of the CPU context of the guest. */ - DECLARE_HYPERCALL_BUFFER(vcpu_guest_context_any_t, ctxt); - - /* A copy of the CPU eXtended States of the guest. */ - DECLARE_HYPERCALL_BUFFER(void, buffer); - - /* A table containing the type of each PFN (/not/ MFN!). */ - unsigned long *pfn_type = NULL; - - /* A table of MFNs to map in the current region */ - xen_pfn_t *region_mfn = NULL; - - /* A copy of the pfn-to-mfn table frame list. */ - xen_pfn_t *p2m_frame_list = NULL; - - /* A temporary mapping of the guest's start_info page. */ - start_info_any_t *start_info; - - /* Our mapping of the current region (batch) */ - char *region_base; - - struct xc_mmu *mmu = NULL; - - struct mmuext_op pin[MAX_PIN_BATCH]; - unsigned int nr_pins; - - uint64_t vcpumap[XC_SR_MAX_VCPUS/64] = { 1ULL }; - unsigned int max_vcpu_id = 0; - int new_ctxt_format = 0; - - pagebuf_t pagebuf; - tailbuf_t tailbuf, tmptail; - struct toolstack_data_t tdata, tdatatmp; - void* vcpup; - uint64_t console_pfn = 0; - - int orig_io_fd_flags; - - struct restore_ctx _ctx; - struct restore_ctx *ctx = &_ctx; - struct domain_info_context *dinfo = &ctx->dinfo; - - if ( getenv("XG_MIGRATION_V2") ) - { - return xc_domain_restore2( - xch, io_fd, dom, store_evtchn, store_mfn, - store_domid, console_evtchn, console_mfn, console_domid, - hvm, pae, superpages, checkpointed_stream, callbacks); - } - - DPRINTF("%s: starting restore of new domid %u", __func__, dom); - - pagebuf_init(&pagebuf); - memset(&tailbuf, 0, sizeof(tailbuf)); - tailbuf.ishvm = hvm; - memset(&tdata, 0, sizeof(tdata)); - - memset(ctx, 0, sizeof(*ctx)); - - ctx->superpages = superpages; - ctx->hvm = hvm; - ctx->last_checkpoint = !checkpointed_stream; - - ctxt = xc_hypercall_buffer_alloc(xch, ctxt, sizeof(*ctxt)); - 
- if ( ctxt == NULL ) - { - PERROR("Unable to allocate VCPU ctxt buffer"); - return 1; - } - - - if ( (orig_io_fd_flags = fcntl(io_fd, F_GETFL, 0)) < 0 ) { - PERROR("unable to read IO FD flags"); - goto out; - } - - if ( RDEXACT(io_fd, &dinfo->p2m_size, sizeof(unsigned long)) ) - { - PERROR("read: p2m_size"); - goto out; - } - DPRINTF("%s: p2m_size = %lx\n", __func__, dinfo->p2m_size); - - if ( !get_platform_info(xch, dom, - &ctx->max_mfn, &ctx->hvirt_start, &ctx->pt_levels, &dinfo->guest_width) ) - { - ERROR("Unable to get platform info."); - return 1; - } - - /* The *current* word size of the guest isn't very interesting; for now - * assume the guest will be the same as we are. We'll fix that later - * if we discover otherwise. */ - dinfo->guest_width = sizeof(unsigned long); - ctx->pt_levels = (dinfo->guest_width == 8) ? 4 : 3; - - if ( !hvm ) - { - /* Load the p2m frame list, plus potential extended info chunk */ - p2m_frame_list = load_p2m_frame_list(xch, ctx, - io_fd, &pae_extended_cr3, &ext_vcpucontext, - &vcpuextstate_size); - - if ( !p2m_frame_list ) - goto out; - - /* Now that we know the word size, tell Xen about it */ - memset(&domctl, 0, sizeof(domctl)); - domctl.domain = dom; - domctl.cmd = XEN_DOMCTL_set_address_size; - domctl.u.address_size.size = dinfo->guest_width * 8; - frc = do_domctl(xch, &domctl); - if ( frc != 0 ) - { - PERROR("Unable to set guest address size."); - goto out; - } - } - - /* We want zeroed memory so use calloc rather than malloc. 
*/ - ctx->p2m = calloc(dinfo->p2m_size, sizeof(xen_pfn_t)); - pfn_type = calloc(dinfo->p2m_size, sizeof(unsigned long)); - - region_mfn = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); - ctx->p2m_batch = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); - if (!ctx->hvm && ctx->superpages) - { - ctx->p2m_saved_batch = - malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); - if ( ctx->p2m_saved_batch == NULL ) - { - ERROR("saved batch memory alloc failed"); - errno = ENOMEM; - goto out; - } - } - - if ( (ctx->p2m == NULL) || (pfn_type == NULL) || - (region_mfn == NULL) || (ctx->p2m_batch == NULL) ) - { - ERROR("memory alloc failed"); - errno = ENOMEM; - goto out; - } - - memset(region_mfn, 0, - ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); - memset(ctx->p2m_batch, 0, - ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); - - /* Get the domain's shared-info frame. */ - if ( xc_domain_getinfo(xch, (domid_t)dom, 1, &info) != 1 ) - { - PERROR("Could not get information on new domain"); - goto out; - } - shared_info_frame = info.shared_info_frame; - - /* Mark all PFNs as invalid; we allocate on demand */ - for ( pfn = 0; pfn < dinfo->p2m_size; pfn++ ) - ctx->p2m[pfn] = INVALID_P2M_ENTRY; - - mmu = xc_alloc_mmu_updates(xch, dom); - if ( mmu == NULL ) - { - PERROR("Could not initialise for MMU updates"); - goto out; - } - - xc_set_progress_prefix(xch, "Reloading memory pages"); - xc_report_progress_step(xch, 0, dinfo->p2m_size); - - /* - * Now simply read each saved frame into its new machine frame. - * We uncanonicalise page tables as we go. 
- */ - - n = m = 0; - loadpages: - for ( ; ; ) - { - int j, curbatch, invalid_pages; - - xc_report_progress_step(xch, n, dinfo->p2m_size); - - if ( !ctx->completed ) { - pagebuf.nr_physpages = pagebuf.nr_pages = 0; - pagebuf.compbuf_pos = pagebuf.compbuf_size = 0; - if ( pagebuf_get_one(xch, ctx, &pagebuf, io_fd, dom) < 0 ) { - PERROR("Error when reading batch"); - goto out; - } - } - j = pagebuf.nr_pages; - - DBGPRINTF("batch %d\n",j); - - if ( j == 0 ) { - /* catch vcpu updates */ - if (pagebuf.new_ctxt_format) { - max_vcpu_id = pagebuf.max_vcpu_id; - memcpy(vcpumap, pagebuf.vcpumap, vcpumap_sz(max_vcpu_id)); - } - /* should this be deferred? does it change? */ - if ( pagebuf.identpt ) - xc_hvm_param_set(xch, dom, HVM_PARAM_IDENT_PT, pagebuf.identpt); - if ( pagebuf.paging_ring_pfn ) - xc_hvm_param_set(xch, dom, HVM_PARAM_PAGING_RING_PFN, pagebuf.paging_ring_pfn); - if ( pagebuf.monitor_ring_pfn ) - xc_hvm_param_set(xch, dom, HVM_PARAM_MONITOR_RING_PFN, pagebuf.monitor_ring_pfn); - if ( pagebuf.sharing_ring_pfn ) - xc_hvm_param_set(xch, dom, HVM_PARAM_SHARING_RING_PFN, pagebuf.sharing_ring_pfn); - if ( pagebuf.vm86_tss ) - xc_hvm_param_set(xch, dom, HVM_PARAM_VM86_TSS, pagebuf.vm86_tss); - if ( pagebuf.console_pfn ) - console_pfn = pagebuf.console_pfn; - if ( pagebuf.vm_generationid_addr ) - xc_hvm_param_set(xch, dom, HVM_PARAM_VM_GENERATION_ID_ADDR, - pagebuf.vm_generationid_addr); - - break; /* our work here is done */ - } - - /* break pagebuf into batches */ - curbatch = 0; - invalid_pages = 0; - while ( curbatch < j ) { - int brc; - - brc = apply_batch(xch, dom, ctx, region_mfn, pfn_type, - pae_extended_cr3, mmu, &pagebuf, curbatch, - &invalid_pages); - if ( brc < 0 ) - goto out; - - nraces += brc; - - curbatch += MAX_BATCH_SIZE; - } - - pagebuf.nr_physpages = pagebuf.nr_pages = 0; - pagebuf.compbuf_pos = pagebuf.compbuf_size = 0; - - n += j; /* crude stats */ - - /* - * Discard cache for portion of file read so far up to last - * page boundary every 16MB or 
so. - */ - m += j; - if ( m > MAX_PAGECACHE_USAGE ) - { - discard_file_cache(xch, io_fd, 0 /* no flush */); - m = 0; - } - } - - /* - * Ensure we flush all machphys updates before potential PAE-specific - * reallocations below. - */ - if ( !hvm && xc_flush_mmu_updates(xch, mmu) ) - { - PERROR("Error doing flush_mmu_updates()"); - goto out; - } - - // DPRINTF("Received all pages (%d races)\n", nraces); - - if ( !ctx->completed ) { - - if ( buffer_tail(xch, ctx, &tailbuf, io_fd, max_vcpu_id, vcpumap, - ext_vcpucontext, vcpuextstate_size) < 0 ) { - ERROR ("error buffering image tail"); - goto out; - } - - ctx->completed = 1; - - /* - * If more checkpoints are expected then shift into - * nonblocking mode for the remainder. - */ - if ( !ctx->last_checkpoint ) - fcntl(io_fd, F_SETFL, orig_io_fd_flags | O_NONBLOCK); - - /* - * If sender had sent enable compression flag, switch to compressed - * checkpoints mode once the first checkpoint is received. - */ - if (ctx->compressing) - pagebuf.compressing = 1; - } - - if (pagebuf.viridian != 0) - xc_hvm_param_set(xch, dom, HVM_PARAM_VIRIDIAN, pagebuf.viridian); - - /* - * If we are migrating in from a host that does not support - * secondary emulators then nr_ioreq_server_pages will be 0, since - * there will be no XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES chunk in - * the image. - * If we are migrating from a host that does support secondary - * emulators then the XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES chunk - * will exist and is guaranteed to have a non-zero value. The - * existence of that chunk also implies the existence of the - * XC_SAVE_ID_HVM_IOREQ_SERVER_PFN chunk, which is also guaranteed - * to have a non-zero value. 
- */ - if (!pagebuf.nr_ioreq_server_pages ^ !pagebuf.ioreq_server_pfn) { - ERROR("Inconsistent IOREQ Server settings (nr=%"PRIx64", pfn=%"PRIx64")", - pagebuf.nr_ioreq_server_pages, pagebuf.ioreq_server_pfn); - } else { - if (pagebuf.nr_ioreq_server_pages != 0 && - pagebuf.ioreq_server_pfn != 0) { - xc_hvm_param_set(xch, dom, HVM_PARAM_NR_IOREQ_SERVER_PAGES, - pagebuf.nr_ioreq_server_pages); - xc_hvm_param_set(xch, dom, HVM_PARAM_IOREQ_SERVER_PFN, - pagebuf.ioreq_server_pfn); - } - } - - if (pagebuf.acpi_ioport_location == 1) { - DBGPRINTF("Use new firmware ioport from the checkpoint\n"); - xc_hvm_param_set(xch, dom, HVM_PARAM_ACPI_IOPORTS_LOCATION, 1); - } else if (pagebuf.acpi_ioport_location == 0) { - DBGPRINTF("Use old firmware ioport from the checkpoint\n"); - } else { - ERROR("Error, unknow acpi ioport location (%"PRId64")", pagebuf.acpi_ioport_location); - } - - tdatatmp = tdata; - tdata = pagebuf.tdata; - pagebuf.tdata = tdatatmp; - - if ( ctx->last_checkpoint ) - { - // DPRINTF("Last checkpoint, finishing\n"); - goto finish; - } - - // DPRINTF("Buffered checkpoint\n"); - - if ( pagebuf_get(xch, ctx, &pagebuf, io_fd, dom) ) { - PERROR("error when buffering batch, finishing"); - /* - * Remus: discard the current incomplete checkpoint and restore - * backup from the last complete checkpoint. - */ - goto finish; - } - memset(&tmptail, 0, sizeof(tmptail)); - tmptail.ishvm = hvm; - if ( buffer_tail(xch, ctx, &tmptail, io_fd, max_vcpu_id, vcpumap, - ext_vcpucontext, vcpuextstate_size) < 0 ) { - ERROR ("error buffering image tail, finishing"); - /* - * Remus: discard the current incomplete checkpoint and restore - * backup from the last complete checkpoint. 
- */ - goto finish; - } - tailbuf_free(&tailbuf); - memcpy(&tailbuf, &tmptail, sizeof(tailbuf)); - - goto loadpages; - - /* With Remus: restore from last complete checkpoint */ - finish: - if ( hvm ) - goto finish_hvm; - - if ( (ctx->pt_levels == 3) && !pae_extended_cr3 ) - { - /* - ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This - ** is a little awkward and involves (a) finding all such PGDs and - ** replacing them with 'lowmem' versions; (b) upating the p2m[] - ** with the new info; and (c) canonicalizing all the L1s using the - ** (potentially updated) p2m[]. - ** - ** This is relatively slow (and currently involves two passes through - ** the pfn_type[] array), but at least seems to be correct. May wish - ** to consider more complex approaches to optimize this later. - */ - - int j, k; - - /* First pass: find all L3TABs current in > 4G mfns and get new mfns */ - for ( i = 0; i < dinfo->p2m_size; i++ ) - { - if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) == - XEN_DOMCTL_PFINFO_L3TAB) && - (ctx->p2m[i] > 0xfffffUL) ) - { - unsigned long new_mfn; - uint64_t l3ptes[4]; - uint64_t *l3tab; - - l3tab = (uint64_t *) - xc_map_foreign_range(xch, dom, PAGE_SIZE, - PROT_READ, ctx->p2m[i]); - if ( l3tab == NULL ) - { - PERROR("xc_map_foreign_range failed (for l3tab)"); - goto out; - } - - for ( j = 0; j < 4; j++ ) - l3ptes[j] = l3tab[j]; - - munmap(l3tab, PAGE_SIZE); - - new_mfn = xc_make_page_below_4G(xch, dom, ctx->p2m[i]); - if ( !new_mfn ) - { - PERROR("Couldn't get a page below 4GB :-("); - goto out; - } - - ctx->p2m[i] = new_mfn; - if ( xc_add_mmu_update(xch, mmu, - (((unsigned long long)new_mfn) - << PAGE_SHIFT) | - MMU_MACHPHYS_UPDATE, i) ) - { - PERROR("Couldn't m2p on PAE root pgdir"); - goto out; - } - - l3tab = (uint64_t *) - xc_map_foreign_range(xch, dom, PAGE_SIZE, - PROT_READ | PROT_WRITE, ctx->p2m[i]); - if ( l3tab == NULL ) - { - PERROR("xc_map_foreign_range failed (for l3tab, 2nd)"); - goto out; - } - - for ( j = 0; j < 4; j++ ) - 
l3tab[j] = l3ptes[j]; - - munmap(l3tab, PAGE_SIZE); - } - } - - /* Second pass: find all L1TABs and uncanonicalize them */ - j = 0; - - for ( i = 0; i < dinfo->p2m_size; i++ ) - { - if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) == - XEN_DOMCTL_PFINFO_L1TAB) ) - { - region_mfn[j] = ctx->p2m[i]; - j++; - } - - if ( (i == (dinfo->p2m_size-1)) || (j == MAX_BATCH_SIZE) ) - { - region_base = xc_map_foreign_pages( - xch, dom, PROT_READ | PROT_WRITE, region_mfn, j); - if ( region_base == NULL ) - { - PERROR("map batch failed"); - goto out; - } - - for ( k = 0; k < j; k++ ) - { - if ( !uncanonicalize_pagetable( - xch, dom, ctx, - region_base + k*PAGE_SIZE) ) - { - ERROR("failed uncanonicalize pt!"); - goto out; - } - } - - munmap(region_base, j*PAGE_SIZE); - j = 0; - } - } - - if ( xc_flush_mmu_updates(xch, mmu) ) - { - PERROR("Error doing xc_flush_mmu_updates()"); - goto out; - } - } - - /* - * Pin page tables. Do this after writing to them as otherwise Xen - * will barf when doing the type-checking. - */ - nr_pins = 0; - for ( i = 0; i < dinfo->p2m_size; i++ ) - { - if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 ) - continue; - - switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK ) - { - case XEN_DOMCTL_PFINFO_L1TAB: - pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE; - break; - - case XEN_DOMCTL_PFINFO_L2TAB: - pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE; - break; - - case XEN_DOMCTL_PFINFO_L3TAB: - pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE; - break; - - case XEN_DOMCTL_PFINFO_L4TAB: - pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE; - break; - - default: - continue; - } - - pin[nr_pins].arg1.mfn = ctx->p2m[i]; - nr_pins++; - - /* Batch full? Then flush. */ - if ( nr_pins == MAX_PIN_BATCH ) - { - if ( xc_mmuext_op(xch, pin, nr_pins, dom) < 0 ) - { - PERROR("Failed to pin batch of %d page tables", nr_pins); - goto out; - } - nr_pins = 0; - } - } - - /* Flush final partial batch. 
*/ - if ( (nr_pins != 0) && (xc_mmuext_op(xch, pin, nr_pins, dom) < 0) ) - { - PERROR("Failed to pin batch of %d page tables", nr_pins); - goto out; - } - - DPRINTF("Memory reloaded (%ld pages)\n", ctx->nr_pfns); - - /* Get the list of PFNs that are not in the psuedo-phys map */ - { - int nr_frees = 0; - - for ( i = 0; i < tailbuf.u.pv.pfncount; i++ ) - { - unsigned long pfn = tailbuf.u.pv.pfntab[i]; - - if ( ctx->p2m[pfn] != INVALID_P2M_ENTRY ) - { - /* pfn is not in physmap now, but was at some point during - the save/migration process - need to free it */ - tailbuf.u.pv.pfntab[nr_frees++] = ctx->p2m[pfn]; - ctx->p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */ - } - } - - if ( nr_frees > 0 ) - { - if ( (frc = xc_domain_decrease_reservation(xch, dom, nr_frees, 0, tailbuf.u.pv.pfntab)) != nr_frees ) - { - PERROR("Could not decrease reservation : %d", frc); - goto out; - } - else - DPRINTF("Decreased reservation by %d pages\n", tailbuf.u.pv.pfncount); - } - } - - vcpup = tailbuf.u.pv.vcpubuf; - for ( i = 0; i <= max_vcpu_id; i++ ) - { - if ( !(vcpumap[i/64] & (1ULL << (i%64))) ) - continue; - - memcpy(ctxt, vcpup, ((dinfo->guest_width == 8) ? sizeof(ctxt->x64) - : sizeof(ctxt->x32))); - vcpup += (dinfo->guest_width == 8) ? sizeof(ctxt->x64) : sizeof(ctxt->x32); - - DPRINTF("read VCPU %d\n", i); - - if ( !new_ctxt_format ) - SET_FIELD(ctxt, flags, - GET_FIELD(ctxt, flags, dinfo->guest_width) | VGCF_online, - dinfo->guest_width); - - if ( i == 0 ) - { - /* - * Uncanonicalise the start info frame number and poke in - * updated values into the start info itself. - * - * The start info MFN is the 3rd argument to the - * HYPERVISOR_sched_op hypercall when op==SCHEDOP_shutdown - * and reason==SHUTDOWN_suspend, it is canonicalised in - * xc_domain_save and therefore the PFN is found in the - * edx register. 
- */ - pfn = GET_FIELD(ctxt, user_regs.edx, dinfo->guest_width); - if ( (pfn >= dinfo->p2m_size) || - (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) ) - { - ERROR("Suspend record frame number is bad"); - goto out; - } - mfn = ctx->p2m[pfn]; - SET_FIELD(ctxt, user_regs.edx, mfn, dinfo->guest_width); - start_info = xc_map_foreign_range( - xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn); - if ( start_info == NULL ) - { - PERROR("xc_map_foreign_range failed (for start_info)"); - goto out; - } - - SET_FIELD(start_info, nr_pages, dinfo->p2m_size, dinfo->guest_width); - SET_FIELD(start_info, shared_info, shared_info_frame<<PAGE_SHIFT, dinfo->guest_width); - SET_FIELD(start_info, flags, 0, dinfo->guest_width); - if ( GET_FIELD(start_info, store_mfn, dinfo->guest_width) > dinfo->p2m_size ) - { - ERROR("Suspend record xenstore frame number is bad"); - munmap(start_info, PAGE_SIZE); - goto out; - } - *store_mfn = ctx->p2m[GET_FIELD(start_info, store_mfn, dinfo->guest_width)]; - SET_FIELD(start_info, store_mfn, *store_mfn, dinfo->guest_width); - SET_FIELD(start_info, store_evtchn, store_evtchn, dinfo->guest_width); - if ( GET_FIELD(start_info, console.domU.mfn, dinfo->guest_width) > dinfo->p2m_size ) - { - ERROR("Suspend record console frame number is bad"); - munmap(start_info, PAGE_SIZE); - goto out; - } - *console_mfn = ctx->p2m[GET_FIELD(start_info, console.domU.mfn, dinfo->guest_width)]; - SET_FIELD(start_info, console.domU.mfn, *console_mfn, dinfo->guest_width); - SET_FIELD(start_info, console.domU.evtchn, console_evtchn, dinfo->guest_width); - munmap(start_info, PAGE_SIZE); - } - /* Uncanonicalise each GDT frame number. 
*/ - if ( GET_FIELD(ctxt, gdt_ents, dinfo->guest_width) > 8192 ) - { - ERROR("GDT entry count out of range"); - goto out; - } - - for ( j = 0; (512*j) < GET_FIELD(ctxt, gdt_ents, dinfo->guest_width); j++ ) - { - pfn = GET_FIELD(ctxt, gdt_frames[j], dinfo->guest_width); - if ( (pfn >= dinfo->p2m_size) || - (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) ) - { - ERROR("GDT frame number %i (0x%lx) is bad", - j, (unsigned long)pfn); - goto out; - } - SET_FIELD(ctxt, gdt_frames[j], ctx->p2m[pfn], dinfo->guest_width); - } - /* Uncanonicalise the page table base pointer. */ - pfn = UNFOLD_CR3(GET_FIELD(ctxt, ctrlreg[3], dinfo->guest_width)); - - if ( pfn >= dinfo->p2m_size ) - { - ERROR("PT base is bad: pfn=%lu p2m_size=%lu type=%08lx", - pfn, dinfo->p2m_size, pfn_type[pfn]); - goto out; - } - - if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) != - ((unsigned long)ctx->pt_levels<p2m_size, pfn_type[pfn], - (unsigned long)ctx->pt_levels<p2m[pfn]), dinfo->guest_width); - - /* Guest pagetable (x86/64) stored in otherwise-unused CR1. 
*/ - if ( (ctx->pt_levels == 4) && (ctxt->x64.ctrlreg[1] & 1) ) - { - pfn = UNFOLD_CR3(ctxt->x64.ctrlreg[1] & ~1); - if ( pfn >= dinfo->p2m_size ) - { - ERROR("User PT base is bad: pfn=%lu p2m_size=%lu", - pfn, dinfo->p2m_size); - goto out; - } - if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) != - ((unsigned long)ctx->pt_levels<p2m_size, pfn_type[pfn], - (unsigned long)ctx->pt_levels<x64.ctrlreg[1] = FOLD_CR3(ctx->p2m[pfn]); - } - frc = xc_vcpu_setcontext(xch, dom, i, ctxt); - if ( frc != 0 ) - { - PERROR("Couldn't build vcpu%d", i); - goto out; - } - - if ( !ext_vcpucontext ) - goto vcpu_ext_state_restore; - memcpy(&domctl.u.ext_vcpucontext, vcpup, 128); - vcpup += 128; - domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext; - domctl.domain = dom; - frc = xc_domctl(xch, &domctl); - if ( frc != 0 ) - { - PERROR("Couldn't set extended vcpu%d info", i); - goto out; - } - - vcpu_ext_state_restore: - if ( !vcpuextstate_size ) - continue; - - memcpy(&domctl.u.vcpuextstate.xfeature_mask, vcpup, - sizeof(domctl.u.vcpuextstate.xfeature_mask)); - vcpup += sizeof(domctl.u.vcpuextstate.xfeature_mask); - memcpy(&domctl.u.vcpuextstate.size, vcpup, - sizeof(domctl.u.vcpuextstate.size)); - vcpup += sizeof(domctl.u.vcpuextstate.size); - - buffer = xc_hypercall_buffer_alloc(xch, buffer, - domctl.u.vcpuextstate.size); - if ( !buffer ) - { - PERROR("Could not allocate buffer to restore eXtended States"); - goto out; - } - memcpy(buffer, vcpup, domctl.u.vcpuextstate.size); - vcpup += domctl.u.vcpuextstate.size; - - domctl.cmd = XEN_DOMCTL_setvcpuextstate; - domctl.domain = dom; - domctl.u.vcpuextstate.vcpu = i; - set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer); - frc = xc_domctl(xch, &domctl); - if ( frc != 0 ) - { - PERROR("Couldn't set eXtended States for vcpu%d", i); - goto out; - } - xc_hypercall_buffer_free(xch, buffer); - } - - memcpy(shared_info_page, tailbuf.u.pv.shared_info_page, PAGE_SIZE); - - DPRINTF("Completed checkpoint load\n"); - - /* Restore contents of 
shared-info page. No checking needed. */ - new_shared_info = xc_map_foreign_range( - xch, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame); - if ( new_shared_info == NULL ) - { - PERROR("xc_map_foreign_range failed (for new_shared_info)"); - goto out; - } - - /* restore saved vcpu_info and arch specific info */ - MEMCPY_FIELD(new_shared_info, old_shared_info, vcpu_info, dinfo->guest_width); - MEMCPY_FIELD(new_shared_info, old_shared_info, arch, dinfo->guest_width); - - /* clear any pending events and the selector */ - MEMSET_ARRAY_FIELD(new_shared_info, evtchn_pending, 0, dinfo->guest_width); - for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) - SET_FIELD(new_shared_info, vcpu_info[i].evtchn_pending_sel, 0, dinfo->guest_width); - - /* mask event channels */ - MEMSET_ARRAY_FIELD(new_shared_info, evtchn_mask, 0xff, dinfo->guest_width); - - /* leave wallclock time. set by hypervisor */ - munmap(new_shared_info, PAGE_SIZE); - - /* Uncanonicalise the pfn-to-mfn table frame-number list. */ - for ( i = 0; i < P2M_FL_ENTRIES; i++ ) - { - pfn = p2m_frame_list[i]; - if ( (pfn >= dinfo->p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) ) - { - ERROR("PFN-to-MFN frame number %i (%#lx) is bad", i, pfn); - goto out; - } - p2m_frame_list[i] = ctx->p2m[pfn]; - } - - /* Copy the P2M we've constructed to the 'live' P2M */ - if ( !(ctx->live_p2m = xc_map_foreign_pages(xch, dom, PROT_WRITE, - p2m_frame_list, P2M_FL_ENTRIES)) ) - { - PERROR("Couldn't map p2m table"); - goto out; - } - - /* If the domain we're restoring has a different word size to ours, - * we need to adjust the live_p2m assignment appropriately */ - if ( dinfo->guest_width > sizeof (xen_pfn_t) ) - for ( i = dinfo->p2m_size - 1; i >= 0; i-- ) - ((int64_t *)ctx->live_p2m)[i] = (long)ctx->p2m[i]; - else if ( dinfo->guest_width < sizeof (xen_pfn_t) ) - for ( i = 0; i < dinfo->p2m_size; i++ ) - ((uint32_t *)ctx->live_p2m)[i] = ctx->p2m[i]; - else - memcpy(ctx->live_p2m, ctx->p2m, dinfo->p2m_size * sizeof(xen_pfn_t)); - 
munmap(ctx->live_p2m, P2M_FL_ENTRIES * PAGE_SIZE); - - frc = xc_dom_gnttab_seed(xch, dom, *console_mfn, *store_mfn, - console_domid, store_domid); - if (frc != 0) - { - ERROR("error seeding grant table"); - goto out; - } - - DPRINTF("Domain ready to be built.\n"); - rc = 0; - goto out; - - finish_hvm: - if ( tdata.data != NULL ) - { - if ( callbacks != NULL && callbacks->toolstack_restore != NULL ) - { - frc = callbacks->toolstack_restore(dom, tdata.data, tdata.len, - callbacks->data); - free(tdata.data); - if ( frc < 0 ) - { - PERROR("error calling toolstack_restore"); - goto out; - } - } else { - rc = -1; - ERROR("toolstack data available but no callback provided\n"); - free(tdata.data); - goto out; - } - } - - /* Dump the QEMU state to a state file for QEMU to load */ - if ( dump_qemu(xch, dom, &tailbuf.u.hvm) ) { - PERROR("Error dumping QEMU state to file"); - goto out; - } - - /* These comms pages need to be zeroed at the start of day */ - if ( xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[0]) || - xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[1]) || - xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[2]) ) - { - PERROR("error zeroing magic pages"); - goto out; - } - - if ( (frc = xc_hvm_param_set(xch, dom, - HVM_PARAM_IOREQ_PFN, tailbuf.u.hvm.magicpfns[0])) - || (frc = xc_hvm_param_set(xch, dom, - HVM_PARAM_BUFIOREQ_PFN, tailbuf.u.hvm.magicpfns[1])) - || (frc = xc_hvm_param_set(xch, dom, - HVM_PARAM_STORE_PFN, tailbuf.u.hvm.magicpfns[2])) - || (frc = xc_hvm_param_set(xch, dom, - HVM_PARAM_PAE_ENABLED, pae)) - || (frc = xc_hvm_param_set(xch, dom, - HVM_PARAM_STORE_EVTCHN, - store_evtchn)) - || (frc = xc_hvm_param_set(xch, dom, - HVM_PARAM_CONSOLE_EVTCHN, - console_evtchn)) ) - { - PERROR("error setting HVM params: %i", frc); - goto out; - } - *store_mfn = tailbuf.u.hvm.magicpfns[2]; - - if ( console_pfn ) { - if ( xc_clear_domain_page(xch, dom, console_pfn) ) { - PERROR("error zeroing console page"); - goto out; - } - if ( (frc = 
xc_hvm_param_set(xch, dom, - HVM_PARAM_CONSOLE_PFN, console_pfn)) ) { - PERROR("error setting HVM param: %i", frc); - goto out; - } - *console_mfn = console_pfn; - } - - frc = xc_domain_hvm_setcontext(xch, dom, tailbuf.u.hvm.hvmbuf, - tailbuf.u.hvm.reclen); - if ( frc ) - { - PERROR("error setting the HVM context"); - goto out; - } - - frc = xc_dom_gnttab_hvm_seed(xch, dom, *console_mfn, *store_mfn, - console_domid, store_domid); - if (frc != 0) - { - ERROR("error seeding grant table"); - goto out; - } - - /* HVM success! */ - rc = 0; - - out: - if ( (rc != 0) && (dom != 0) ) - xc_domain_destroy(xch, dom); - xc_hypercall_buffer_free(xch, ctxt); - free(mmu); - free(ctx->p2m); - free(pfn_type); - free(region_mfn); - free(ctx->p2m_batch); - pagebuf_free(&pagebuf); - tailbuf_free(&tailbuf); - - /* discard cache for save file */ - discard_file_cache(xch, io_fd, 1 /*flush*/); - - fcntl(io_fd, F_SETFL, orig_io_fd_flags); - - DPRINTF("Restore exit of domid %u with rc=%d\n", dom, rc); - - return rc; -} -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c deleted file mode 100644 index 3222473..0000000 --- a/tools/libxc/xc_domain_save.c +++ /dev/null @@ -1,2198 +0,0 @@ -/****************************************************************************** - * xc_linux_save.c - * - * Save the state of a running Linux session. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * Copyright (c) 2003, K A Fraser. - */ - -#include -#include -#include -#include -#include -#include - -#include "xc_private.h" -#include "xc_bitops.h" -#include "xc_dom.h" -#include "xg_private.h" -#include "xg_save_restore.h" - -#include - -/* -** Default values for important tuning parameters. Can override by passing -** non-zero replacement values to xc_domain_save(). -** -** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too. -** -*/ -#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */ -#define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */ - -struct save_ctx { - unsigned long hvirt_start; /* virtual starting address of the hypervisor */ - unsigned int pt_levels; /* #levels of page tables used by the current guest */ - unsigned long max_mfn; /* max mfn of the whole machine */ - xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */ - xen_pfn_t *live_m2p; /* Live mapping of system MFN to PFN table. */ - unsigned long m2p_mfn0; - struct domain_info_context dinfo; -}; - -/* buffer for output */ -struct outbuf { - void* buf; - size_t size; - size_t pos; - int write_count; -}; - -#define OUTBUF_SIZE (16384 * 1024) - -/* grep fodder: machine_to_phys */ - -#define mfn_to_pfn(_mfn) (ctx->live_m2p[(_mfn)]) - -#define pfn_to_mfn(_pfn) \ - ((xen_pfn_t) ((dinfo->guest_width==8) \ - ? (((uint64_t *)ctx->live_p2m)[(_pfn)]) \ - : ((((uint32_t *)ctx->live_p2m)[(_pfn)]) == 0xffffffffU \ - ? (-1UL) : (((uint32_t *)ctx->live_p2m)[(_pfn)])))) - -/* - * Returns TRUE if the given machine frame number has a unique mapping - * in the guest's pseudophysical map. 
- */ -#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \ - (((_mfn) < (ctx->max_mfn)) && \ - ((mfn_to_pfn(_mfn) < (dinfo->p2m_size)) && \ - (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn)))) - -#define SUPERPAGE_PFN_SHIFT 9 -#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT) - -#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 ) - -static uint64_t tv_to_us(struct timeval *new) -{ - return (new->tv_sec * 1000000) + new->tv_usec; -} - -static uint64_t llgettimeofday(void) -{ - struct timeval now; - gettimeofday(&now, NULL); - return tv_to_us(&now); -} - -static uint64_t tv_delta(struct timeval *new, struct timeval *old) -{ - return (((new->tv_sec - old->tv_sec)*1000000) + - (new->tv_usec - old->tv_usec)); -} - -static int noncached_write(xc_interface *xch, - struct outbuf* ob, - int fd, void *buffer, int len) -{ - int rc = (write_exact(fd, buffer, len) == 0) ? len : -1; - - ob->write_count += len; - if ( ob->write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) ) - { - /* Time to discard cache - dont care if this fails */ - int saved_errno = errno; - discard_file_cache(xch, fd, 0 /* no flush */); - errno = saved_errno; - ob->write_count = 0; - } - - return rc; -} - -static int outbuf_init(xc_interface *xch, struct outbuf* ob, size_t size) -{ - memset(ob, 0, sizeof(*ob)); - - if ( !(ob->buf = malloc(size)) ) { - DPRINTF("error allocating output buffer of size %zu\n", size); - return -1; - } - - ob->size = size; - - return 0; -} - -static int outbuf_free(struct outbuf *ob) -{ - free(ob->buf); - ob->buf = NULL; - return 0; -} - -static inline int outbuf_write(xc_interface *xch, - struct outbuf* ob, void* buf, size_t len) -{ - if ( len > ob->size - ob->pos ) { - errno = ERANGE; - DBGPRINTF("outbuf_write: %zu > %zu@%zu\n", len, ob->size - ob->pos, ob->pos); - return -1; - } - - memcpy(ob->buf + ob->pos, buf, len); - ob->pos += len; - - return 0; -} - -/* prep for nonblocking I/O */ -static int outbuf_flush(xc_interface *xch, struct outbuf* ob, int fd) -{ - int rc; - int 
cur = 0; - - if ( !ob->pos ) - return 0; - - rc = write(fd, ob->buf, ob->pos); - while (rc < 0 || cur + rc < ob->pos) { - if (rc < 0 && errno != EAGAIN && errno != EINTR) { - DPRINTF("error flushing output: %d\n", errno); - return -1; - } - if (rc > 0) - cur += rc; - - rc = write(fd, ob->buf + cur, ob->pos - cur); - } - - ob->pos = 0; - - return 0; -} - -/* if there's no room in the buffer, flush it and try again. */ -static inline int outbuf_hardwrite(xc_interface *xch, - struct outbuf* ob, int fd, void* buf, - size_t len) -{ - if ( !len ) - return 0; - - if ( !outbuf_write(xch, ob, buf, len) ) - return 0; - - if ( outbuf_flush(xch, ob, fd) < 0 ) - return -1; - - return outbuf_write(xch, ob, buf, len); -} - -/* start buffering output once we've reached checkpoint mode. */ -static inline int write_buffer(xc_interface *xch, - int dobuf, struct outbuf* ob, int fd, void* buf, - size_t len) -{ - if ( dobuf ) - return outbuf_hardwrite(xch, ob, fd, buf, len); - else - return write_exact(fd, buf, len); -} - -/* like write_buffer for noncached, which returns number of bytes written */ -static inline int write_uncached(xc_interface *xch, - int dobuf, struct outbuf* ob, int fd, - void* buf, size_t len) -{ - if ( dobuf ) - return outbuf_hardwrite(xch, ob, fd, buf, len) ? 
-1 : len; - else - return noncached_write(xch, ob, fd, buf, len); -} - -static int write_compressed(xc_interface *xch, comp_ctx *compress_ctx, - int dobuf, struct outbuf* ob, int fd) -{ - int rc = 0; - int header = sizeof(int) + sizeof(unsigned long); - int marker = XC_SAVE_ID_COMPRESSED_DATA; - unsigned long compbuf_len = 0; - - for(;;) - { - /* check for available space (atleast 8k) */ - if ((ob->pos + header + XC_PAGE_SIZE * 2) > ob->size) - { - if (outbuf_flush(xch, ob, fd) < 0) - { - ERROR("Error when flushing outbuf intermediate"); - return -1; - } - } - - rc = xc_compression_compress_pages(xch, compress_ctx, - ob->buf + ob->pos + header, - ob->size - ob->pos - header, - &compbuf_len); - if (!rc) - break; - - if (outbuf_hardwrite(xch, ob, fd, &marker, sizeof(marker)) < 0) - { - PERROR("Error when writing marker (errno %d)", errno); - return -1; - } - - if (outbuf_hardwrite(xch, ob, fd, &compbuf_len, sizeof(compbuf_len)) < 0) - { - PERROR("Error when writing compbuf_len (errno %d)", errno); - return -1; - } - - ob->pos += (size_t) compbuf_len; - if (!dobuf && outbuf_flush(xch, ob, fd) < 0) - { - ERROR("Error when writing compressed chunk"); - return -1; - } - } - - return 0; -} - -struct time_stats { - struct timeval wall; - long long d0_cpu, d1_cpu; -}; - -static int print_stats(xc_interface *xch, uint32_t domid, int pages_sent, - struct time_stats *last, - xc_shadow_op_stats_t *stats, int print) -{ - struct time_stats now; - - gettimeofday(&now.wall, NULL); - - now.d0_cpu = xc_domain_get_cpu_usage(xch, 0, /* FIXME */ 0)/1000; - now.d1_cpu = xc_domain_get_cpu_usage(xch, domid, /* FIXME */ 0)/1000; - - if ( (now.d0_cpu == -1) || (now.d1_cpu == -1) ) - DPRINTF("ARRHHH!!\n"); - - if ( print ) - { - long long wall_delta; - long long d0_cpu_delta; - long long d1_cpu_delta; - - wall_delta = tv_delta(&now.wall,&last->wall)/1000; - if ( wall_delta == 0 ) - wall_delta = 1; - - d0_cpu_delta = (now.d0_cpu - last->d0_cpu)/1000; - d1_cpu_delta = (now.d1_cpu - 
last->d1_cpu)/1000; - - DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, " - "dirtied %dMb/s %" PRId32 " pages\n", - wall_delta, - (int)((d0_cpu_delta*100)/wall_delta), - (int)((d1_cpu_delta*100)/wall_delta), - (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))), - (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))), - stats->dirty_count); - } - - *last = now; - - return 0; -} - - -static int analysis_phase(xc_interface *xch, uint32_t domid, struct save_ctx *ctx, - xc_hypercall_buffer_t *arr, int runs) -{ - long long start, now; - xc_shadow_op_stats_t stats; - int j; - struct domain_info_context *dinfo = &ctx->dinfo; - - start = llgettimeofday(); - - for ( j = 0; j < runs; j++ ) - { - int i; - - xc_shadow_control(xch, domid, XEN_DOMCTL_SHADOW_OP_CLEAN, - arr, dinfo->p2m_size, NULL, 0, NULL); - DPRINTF("#Flush\n"); - for ( i = 0; i < 40; i++ ) - { - usleep(50000); - now = llgettimeofday(); - xc_shadow_control(xch, domid, XEN_DOMCTL_SHADOW_OP_PEEK, - NULL, 0, NULL, 0, &stats); - DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n", - ((now-start)+500)/1000, - stats.fault_count, stats.dirty_count); - } - } - - return -1; -} - -static int suspend_and_state(int (*suspend)(void*), void* data, - xc_interface *xch, int io_fd, int dom, - xc_dominfo_t *info) -{ - if ( !(*suspend)(data) ) - { - ERROR("Suspend request failed"); - return -1; - } - - if ( (xc_domain_getinfo(xch, dom, 1, info) != 1) || - !info->shutdown || (info->shutdown_reason != SHUTDOWN_suspend) ) - { - ERROR("Domain not in suspended state"); - return -1; - } - - return 0; -} - -/* -** Map the top-level page of MFNs from the guest. The guest might not have -** finished resuming from a previous restore operation, so we wait a while for -** it to update the MFN to a reasonable value. 
-*/ -static void *map_frame_list_list(xc_interface *xch, uint32_t dom, - struct save_ctx *ctx, - shared_info_any_t *shinfo) -{ - int count = 100; - void *p; - struct domain_info_context *dinfo = &ctx->dinfo; - uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list, dinfo->guest_width); - - while ( count-- && (fll == 0) ) - { - usleep(10000); - fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list, dinfo->guest_width); - } - - if ( fll == 0 ) - { - ERROR("Timed out waiting for frame list updated."); - return NULL; - } - - p = xc_map_foreign_range(xch, dom, PAGE_SIZE, PROT_READ, fll); - if ( p == NULL ) - PERROR("Couldn't map p2m_frame_list_list (errno %d)", errno); - - return p; -} - -/* -** During transfer (or in the state file), all page-table pages must be -** converted into a 'canonical' form where references to actual mfns -** are replaced with references to the corresponding pfns. -** -** This function performs the appropriate conversion, taking into account -** which entries do not require canonicalization (in particular, those -** entries which map the virtual address reserved for the hypervisor). -*/ -static int canonicalize_pagetable(struct save_ctx *ctx, - unsigned long type, unsigned long pfn, - const void *spage, void *dpage) -{ - struct domain_info_context *dinfo = &ctx->dinfo; - int i, pte_last, xen_start, xen_end, race = 0; - uint64_t pte; - - /* - ** We need to determine which entries in this page table hold - ** reserved hypervisor mappings. This depends on the current - ** page table type as well as the number of paging levels. - */ - xen_start = xen_end = pte_last = PAGE_SIZE / 8; - - if ( (ctx->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) ) - xen_start = L3_PAGETABLE_ENTRIES_PAE; - - /* - ** In PAE only the L2 mapping the top 1GB contains Xen mappings. - ** We can spot this by looking for the guest's mappingof the m2p. - ** Guests must ensure that this check will fail for other L2s. 
- */ - if ( (ctx->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) ) - { - int hstart; - uint64_t he; - - hstart = (ctx->hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; - he = ((const uint64_t *) spage)[hstart]; - - if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == ctx->m2p_mfn0 ) - { - /* hvirt starts with xen stuff... */ - xen_start = hstart; - } - else if ( ctx->hvirt_start != 0xf5800000 ) - { - /* old L2s from before hole was shrunk... */ - hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; - he = ((const uint64_t *) spage)[hstart]; - if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == ctx->m2p_mfn0 ) - xen_start = hstart; - } - } - - if ( (ctx->pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) ) - { - /* - ** XXX SMH: should compute these from hvirt_start (which we have) - ** and hvirt_end (which we don't) - */ - xen_start = 256; - xen_end = 272; - } - - /* Now iterate through the page table, canonicalizing each PTE */ - for (i = 0; i < pte_last; i++ ) - { - unsigned long pfn, mfn; - - pte = ((const uint64_t*)spage)[i]; - - if ( (i >= xen_start) && (i < xen_end) ) - pte = 0; - - if ( pte & _PAGE_PRESENT ) - { - mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86; - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) - { - /* This will happen if the type info is stale which - is quite feasible under live migration */ - pfn = 0; /* zap it - we'll retransmit this page later */ - /* XXX: We can't spot Xen mappings in compat-mode L2es - * from 64-bit tools, but the only thing in them is the - * compat m2p, so we quietly zap them. This doesn't - * count as a race, so don't report it. */ - if ( !(type == XEN_DOMCTL_PFINFO_L2TAB - && sizeof (unsigned long) > dinfo->guest_width) ) - race = 1; /* inform the caller; fatal if !live */ - } - else - pfn = mfn_to_pfn(mfn); - - pte &= ~MADDR_MASK_X86; - pte |= (uint64_t)pfn << PAGE_SHIFT; - - /* - * PAE guest L3Es can contain these flags when running on - * a 64bit hypervisor. We zap these here to avoid any - * surprise at restore time... 
- */ - if ( (ctx->pt_levels == 3) && - (type == XEN_DOMCTL_PFINFO_L3TAB) && - (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) ) - pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); - } - - ((uint64_t*)dpage)[i] = pte; - } - - return race; -} - -xen_pfn_t *xc_map_m2p(xc_interface *xch, - unsigned long max_mfn, - int prot, - unsigned long *mfn0) -{ - privcmd_mmap_entry_t *entries; - unsigned long m2p_chunks, m2p_size; - xen_pfn_t *m2p; - xen_pfn_t *extent_start; - int i; - - m2p = NULL; - m2p_size = M2P_SIZE(max_mfn); - m2p_chunks = M2P_CHUNKS(max_mfn); - - extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t)); - if ( !extent_start ) - { - ERROR("failed to allocate space for m2p mfns"); - goto err0; - } - - if ( xc_machphys_mfn_list(xch, m2p_chunks, extent_start) ) - { - PERROR("xc_get_m2p_mfns"); - goto err1; - } - - entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t)); - if (entries == NULL) - { - ERROR("failed to allocate space for mmap entries"); - goto err1; - } - - for ( i = 0; i < m2p_chunks; i++ ) - entries[i].mfn = extent_start[i]; - - m2p = xc_map_foreign_ranges(xch, DOMID_XEN, - m2p_size, prot, M2P_CHUNK_SIZE, - entries, m2p_chunks); - if (m2p == NULL) - { - PERROR("xc_mmap_foreign_ranges failed"); - goto err2; - } - - if (mfn0) - *mfn0 = entries[0].mfn; - -err2: - free(entries); -err1: - free(extent_start); - -err0: - return m2p; -} - - -static xen_pfn_t *map_and_save_p2m_table(xc_interface *xch, - int io_fd, - uint32_t dom, - struct save_ctx *ctx, - shared_info_any_t *live_shinfo) -{ - vcpu_guest_context_any_t ctxt; - struct domain_info_context *dinfo = &ctx->dinfo; - - /* Double and single indirect references to the live P2M table */ - void *live_p2m_frame_list_list = NULL; - void *live_p2m_frame_list = NULL; - - /* Copies of the above. 
*/ - xen_pfn_t *p2m_frame_list_list = NULL; - xen_pfn_t *p2m_frame_list = NULL; - - /* The mapping of the live p2m table itself */ - xen_pfn_t *p2m = NULL; - - int i, success = 0; - - live_p2m_frame_list_list = map_frame_list_list(xch, dom, ctx, - live_shinfo); - if ( !live_p2m_frame_list_list ) - goto out; - - /* Get a local copy of the live_P2M_frame_list_list */ - if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) ) - { - ERROR("Couldn't allocate p2m_frame_list_list array"); - goto out; - } - memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE); - - /* Canonicalize guest's unsigned long vs ours */ - if ( dinfo->guest_width > sizeof(unsigned long) ) - for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ ) - if ( i < PAGE_SIZE/dinfo->guest_width ) - p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i]; - else - p2m_frame_list_list[i] = 0; - else if ( dinfo->guest_width < sizeof(unsigned long) ) - for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- ) - p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i]; - - live_p2m_frame_list = - xc_map_foreign_pages(xch, dom, PROT_READ, - p2m_frame_list_list, - P2M_FLL_ENTRIES); - if ( !live_p2m_frame_list ) - { - PERROR("Couldn't map p2m_frame_list"); - goto out; - } - - /* Get a local copy of the live_P2M_frame_list */ - if ( !(p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) ) - { - ERROR("Couldn't allocate p2m_frame_list array"); - goto out; - } - memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE); - memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE); - - munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE); - live_p2m_frame_list = NULL; - - /* Canonicalize guest's unsigned long vs ours */ - if ( dinfo->guest_width > sizeof(unsigned long) ) - for ( i = 0; i < P2M_FL_ENTRIES; i++ ) - p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i]; - else if ( dinfo->guest_width < sizeof(unsigned long) ) - for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- ) - p2m_frame_list[i] = ((uint32_t 
*)p2m_frame_list)[i]; - - - /* Map all the frames of the pfn->mfn table. For migrate to succeed, - the guest must not change which frames are used for this purpose. - (its not clear why it would want to change them, and we'll be OK - from a safety POV anyhow. */ - - p2m = xc_map_foreign_pages(xch, dom, PROT_READ, - p2m_frame_list, - P2M_FL_ENTRIES); - if ( !p2m ) - { - PERROR("Couldn't map p2m table"); - goto out; - } - ctx->live_p2m = p2m; /* So that translation macros will work */ - - /* Canonicalise the pfn-to-mfn table frame-number list. */ - for ( i = 0; i < dinfo->p2m_size; i += FPP ) - { - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) ) - { - ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys"); - ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx", - i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], ctx->max_mfn); - if ( p2m_frame_list[i/FPP] < ctx->max_mfn ) - { - ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64, - (uint64_t)p2m_frame_list[i/FPP], - (uint64_t)ctx->live_m2p[p2m_frame_list[i/FPP]]); - ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64, - (uint64_t)ctx->live_m2p[p2m_frame_list[i/FPP]], - (uint64_t)p2m[ctx->live_m2p[p2m_frame_list[i/FPP]]]); - - } - goto out; - } - p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]); - } - - if ( xc_vcpu_getcontext(xch, dom, 0, &ctxt) ) - { - PERROR("Could not get vcpu context"); - goto out; - } - - /* - * Write an extended-info structure to inform the restore code that - * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off - * slow paths in the restore code. - */ - { - unsigned long signature = ~0UL; - uint32_t chunk1_sz = ((dinfo->guest_width==8) - ? 
sizeof(ctxt.x64) - : sizeof(ctxt.x32)); - uint32_t chunk2_sz = 0; - uint32_t chunk3_sz = 4; - uint32_t xcnt_size = 0; - uint32_t tot_sz; - DECLARE_DOMCTL; - - domctl.cmd = XEN_DOMCTL_getvcpuextstate; - domctl.domain = dom; - domctl.u.vcpuextstate.vcpu = 0; - domctl.u.vcpuextstate.size = 0; - domctl.u.vcpuextstate.xfeature_mask = 0; - if ( xc_domctl(xch, &domctl) < 0 ) - { - PERROR("No extended context for VCPU%d", i); - goto out; - } - xcnt_size = domctl.u.vcpuextstate.size + 2 * sizeof(uint64_t); - - tot_sz = (chunk1_sz + 8) + (chunk2_sz + 8); - if ( domctl.u.vcpuextstate.xfeature_mask ) - tot_sz += chunk3_sz + 8; - - if ( write_exact(io_fd, &signature, sizeof(signature)) || - write_exact(io_fd, &tot_sz, sizeof(tot_sz)) || - write_exact(io_fd, "vcpu", 4) || - write_exact(io_fd, &chunk1_sz, sizeof(chunk1_sz)) || - write_exact(io_fd, &ctxt, chunk1_sz) || - write_exact(io_fd, "extv", 4) || - write_exact(io_fd, &chunk2_sz, sizeof(chunk2_sz)) || - (domctl.u.vcpuextstate.xfeature_mask) ? - (write_exact(io_fd, "xcnt", 4) || - write_exact(io_fd, &chunk3_sz, sizeof(chunk3_sz)) || - write_exact(io_fd, &xcnt_size, 4)) : - 0 ) - { - PERROR("write: extended info"); - goto out; - } - } - - if ( write_exact(io_fd, p2m_frame_list, - P2M_FL_ENTRIES * sizeof(xen_pfn_t)) ) - { - PERROR("write: p2m_frame_list"); - goto out; - } - - success = 1; - - out: - - if ( !success && p2m ) - munmap(p2m, P2M_FL_ENTRIES * PAGE_SIZE); - - if ( live_p2m_frame_list_list ) - munmap(live_p2m_frame_list_list, PAGE_SIZE); - - if ( live_p2m_frame_list ) - munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE); - - free(p2m_frame_list_list); - - free(p2m_frame_list); - - return success ? 
p2m : NULL; -} - -/* must be done AFTER suspend_and_state() */ -static int save_tsc_info(xc_interface *xch, uint32_t dom, int io_fd) -{ - int marker = XC_SAVE_ID_TSC_INFO; - uint32_t tsc_mode, khz, incarn; - uint64_t nsec; - - if ( xc_domain_get_tsc_info(xch, dom, &tsc_mode, - &nsec, &khz, &incarn) < 0 || - write_exact(io_fd, &marker, sizeof(marker)) || - write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) || - write_exact(io_fd, &nsec, sizeof(nsec)) || - write_exact(io_fd, &khz, sizeof(khz)) || - write_exact(io_fd, &incarn, sizeof(incarn)) ) - return -1; - return 0; -} - -int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iters, - uint32_t max_factor, uint32_t flags, - struct save_callbacks* callbacks, int hvm) -{ - xc_dominfo_t info; - DECLARE_DOMCTL; - - int rc, frc, i, j, last_iter = 0, iter = 0; - int live = (flags & XCFLAGS_LIVE); - int debug = (flags & XCFLAGS_DEBUG); - int superpages = !!hvm; - int race = 0, skip_this_iter = 0; - unsigned int sent_this_iter = 0; - int tmem_saved = 0; - - /* The new domain's shared-info frame number. */ - unsigned long shared_info_frame; - - /* A copy of the CPU context of the guest. */ - vcpu_guest_context_any_t ctxt; - - /* A table containing the type of each PFN (/not/ MFN!). */ - xen_pfn_t *pfn_type = NULL; - unsigned long *pfn_batch = NULL; - int *pfn_err = NULL; - - /* A copy of one frame of guest memory. */ - char page[PAGE_SIZE]; - - /* Live mapping of shared info structure */ - shared_info_any_t *live_shinfo = NULL; - - /* base of the region in which domain memory is mapped */ - unsigned char *region_base = NULL; - - /* A copy of the CPU eXtended States of the guest. 
*/ - DECLARE_HYPERCALL_BUFFER(void, buffer); - - /* bitmap of pages: - - that should be sent this iteration (unless later marked as skip); - - to skip this iteration because already dirty; - - to fixup by sending at the end if not already resent; */ - DECLARE_HYPERCALL_BUFFER(unsigned long, to_skip); - DECLARE_HYPERCALL_BUFFER(unsigned long, to_send); - unsigned long *to_fix = NULL; - - struct time_stats time_stats; - xc_shadow_op_stats_t shadow_stats; - - unsigned long needed_to_fix = 0; - unsigned long total_sent = 0; - - uint64_t vcpumap[XC_SR_MAX_VCPUS/64] = { 1ULL }; - - /* HVM: a buffer for holding HVM context */ - uint32_t hvm_buf_size = 0; - uint8_t *hvm_buf = NULL; - - /* HVM: magic frames for ioreqs and xenstore comms. */ - uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */ - - unsigned long mfn; - - /* Without checkpoint compression, the dirty pages, pfn arrays - * and tailbuf (vcpu ctx, shared info page, etc.) are written - * directly to outbuf. All of this is done while the domain is - * suspended. - * - * When checkpoint compression is enabled, the dirty pages are - * buffered, compressed "after" the domain is resumed and then - * written to outbuf. Since tailbuf data are collected while a - * domain is suspended, they cannot be directly written to the - * outbuf as there is no dirty page data preceeding tailbuf. - * - * So,two output buffers are maintained. Tailbuf data goes into - * ob_tailbuf. The dirty pages are compressed after resuming the - * domain and written to ob_pagebuf. ob_tailbuf is then appended - * to ob_pagebuf and finally flushed out. - */ - struct outbuf ob_pagebuf, ob_tailbuf, *ob = NULL; - struct save_ctx _ctx; - struct save_ctx *ctx = &_ctx; - struct domain_info_context *dinfo = &ctx->dinfo; - - /* Compression context */ - comp_ctx *compress_ctx= NULL; - /* Even if XCFLAGS_CHECKPOINT_COMPRESS is set, we enable compression only - * after sending XC_SAVE_ID_ENABLE_COMPRESSION and the tailbuf for - * first time. 
- */ - int compressing = 0; - - int completed = 0; - - if ( getenv("XG_MIGRATION_V2") ) - { - return xc_domain_save2(xch, io_fd, dom, max_iters, - max_factor, flags, callbacks, hvm); - } - - DPRINTF("%s: starting save of domid %u", __func__, dom); - - if ( hvm && !callbacks->switch_qemu_logdirty ) - { - ERROR("No switch_qemu_logdirty callback provided."); - errno = EINVAL; - goto exit; - } - - outbuf_init(xch, &ob_pagebuf, OUTBUF_SIZE); - - memset(ctx, 0, sizeof(*ctx)); - - /* If no explicit control parameters given, use defaults */ - max_iters = max_iters ? : DEF_MAX_ITERS; - max_factor = max_factor ? : DEF_MAX_FACTOR; - - if ( !get_platform_info(xch, dom, - &ctx->max_mfn, &ctx->hvirt_start, &ctx->pt_levels, &dinfo->guest_width) ) - { - ERROR("Unable to get platform info."); - goto exit; - } - - if ( xc_domain_getinfo(xch, dom, 1, &info) != 1 ) - { - PERROR("Could not get domain info"); - goto exit; - } - - shared_info_frame = info.shared_info_frame; - - /* Map the shared info frame */ - if ( !hvm ) - { - live_shinfo = xc_map_foreign_range(xch, dom, PAGE_SIZE, - PROT_READ, shared_info_frame); - if ( !live_shinfo ) - { - PERROR("Couldn't map live_shinfo"); - goto out; - } - } - - /* Get the size of the P2M table */ - if ( xc_domain_nr_gpfns(xch, dom, &dinfo->p2m_size) < 0 ) - { - ERROR("Could not get maximum GPFN!"); - goto out; - } - - if ( dinfo->p2m_size > ~XEN_DOMCTL_PFINFO_LTAB_MASK ) - { - errno = E2BIG; - ERROR("Cannot save this big a guest"); - goto out; - } - - /* Domain is still running at this point */ - if ( live ) - { - /* Live suspend. Enable log-dirty mode. */ - if ( xc_shadow_control(xch, dom, - XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, - NULL, 0, NULL, 0, NULL) < 0 ) - { - /* log-dirty already enabled? 
There's no test op, - so attempt to disable then reenable it */ - frc = xc_shadow_control(xch, dom, XEN_DOMCTL_SHADOW_OP_OFF, - NULL, 0, NULL, 0, NULL); - if ( frc >= 0 ) - { - frc = xc_shadow_control(xch, dom, - XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, - NULL, 0, NULL, 0, NULL); - } - - if ( frc < 0 ) - { - PERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno ); - goto out; - } - } - - /* Enable qemu-dm logging dirty pages to xen */ - if ( hvm && callbacks->switch_qemu_logdirty(dom, 1, callbacks->data) ) - { - PERROR("Couldn't enable qemu log-dirty mode (errno %d)", errno); - goto out; - } - } - else - { - /* This is a non-live suspend. Suspend the domain .*/ - if ( suspend_and_state(callbacks->suspend, callbacks->data, xch, - io_fd, dom, &info) ) - { - ERROR("Domain appears not to have suspended"); - goto out; - } - } - - if ( flags & XCFLAGS_CHECKPOINT_COMPRESS ) - { - if (!(compress_ctx = xc_compression_create_context(xch, dinfo->p2m_size))) - { - ERROR("Failed to create compression context"); - goto out; - } - outbuf_init(xch, &ob_tailbuf, OUTBUF_SIZE/4); - } - - last_iter = !live; - - /* Setup to_send / to_fix and to_skip bitmaps */ - to_send = xc_hypercall_buffer_alloc_pages(xch, to_send, NRPAGES(bitmap_size(dinfo->p2m_size))); - to_skip = xc_hypercall_buffer_alloc_pages(xch, to_skip, NRPAGES(bitmap_size(dinfo->p2m_size))); - to_fix = calloc(1, bitmap_size(dinfo->p2m_size)); - - if ( !to_send || !to_fix || !to_skip ) - { - errno = ENOMEM; - ERROR("Couldn't allocate to_send array"); - goto out; - } - - memset(to_send, 0xff, bitmap_size(dinfo->p2m_size)); - - if ( hvm ) - { - /* Need another buffer for HVM context */ - hvm_buf_size = xc_domain_hvm_getcontext(xch, dom, 0, 0); - if ( hvm_buf_size == -1 ) - { - PERROR("Couldn't get HVM context size from Xen"); - goto out; - } - hvm_buf = malloc(hvm_buf_size); - if ( !hvm_buf ) - { - errno = ENOMEM; - ERROR("Couldn't allocate memory"); - goto out; - } - } - - analysis_phase(xch, dom, ctx, 
HYPERCALL_BUFFER(to_skip), 0); - - pfn_type = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT)); - pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch)); - pfn_err = malloc(MAX_BATCH_SIZE * sizeof(*pfn_err)); - if ( (pfn_type == NULL) || (pfn_batch == NULL) || (pfn_err == NULL) ) - { - ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays"); - errno = ENOMEM; - goto out; - } - memset(pfn_type, 0, - ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT)); - - /* Setup the mfn_to_pfn table mapping */ - if ( !(ctx->live_m2p = xc_map_m2p(xch, ctx->max_mfn, PROT_READ, &ctx->m2p_mfn0)) ) - { - PERROR("Failed to map live M2P table"); - goto out; - } - - /* Start writing out the saved-domain record. */ - if ( write_exact(io_fd, &dinfo->p2m_size, sizeof(unsigned long)) ) - { - PERROR("write: p2m_size"); - goto out; - } - - if ( !hvm ) - { - int err = 0; - - /* Map the P2M table, and write the list of P2M frames */ - ctx->live_p2m = map_and_save_p2m_table(xch, io_fd, dom, ctx, live_shinfo); - if ( ctx->live_p2m == NULL ) - { - PERROR("Failed to map/save the p2m frame list"); - goto out; - } - - /* - * Quick belt and braces sanity check. 
- */ - - for ( i = 0; i < dinfo->p2m_size; i++ ) - { - mfn = pfn_to_mfn(i); - if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) ) - { - DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i, - mfn, mfn_to_pfn(mfn)); - err++; - } - } - DPRINTF("Had %d unexplained entries in p2m table\n", err); - } - - print_stats(xch, dom, 0, &time_stats, &shadow_stats, 0); - - tmem_saved = xc_tmem_save(xch, dom, io_fd, live, XC_SAVE_ID_TMEM); - if ( tmem_saved == -1 ) - { - PERROR("Error when writing to state file (tmem)"); - goto out; - } - - if ( !live && save_tsc_info(xch, dom, io_fd) < 0 ) - { - PERROR("Error when writing to state file (tsc)"); - goto out; - } - - copypages: -#define wrexact(fd, buf, len) write_buffer(xch, last_iter, ob, (fd), (buf), (len)) -#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, ob, (fd), (buf), (len)) -#define wrcompressed(fd) write_compressed(xch, compress_ctx, last_iter, ob, (fd)) - - ob = &ob_pagebuf; /* Holds pfn_types, pages/compressed pages */ - /* Now write out each data page, canonicalising page tables as we go... */ - for ( ; ; ) - { - unsigned int N, batch, run; - char reportbuf[80]; - - snprintf(reportbuf, sizeof(reportbuf), - "Saving memory: iter %d (last sent %u skipped %u)", - iter, sent_this_iter, skip_this_iter); - - xc_set_progress_prefix(xch, reportbuf); - xc_report_progress_step(xch, 0, dinfo->p2m_size); - - iter++; - sent_this_iter = 0; - skip_this_iter = 0; - N = 0; - - while ( N < dinfo->p2m_size ) - { - xc_report_progress_step(xch, N, dinfo->p2m_size); - - if ( !last_iter ) - { - /* Slightly wasteful to peek the whole array every time, - but this is fast enough for the moment. */ - frc = xc_shadow_control( - xch, dom, XEN_DOMCTL_SHADOW_OP_PEEK, HYPERCALL_BUFFER(to_skip), - dinfo->p2m_size, NULL, 0, NULL); - if ( frc != dinfo->p2m_size ) - { - ERROR("Error peeking shadow bitmap"); - goto out; - } - } - - /* load pfn_type[] with the mfn of all the pages we're doing in - this batch. 
*/ - for ( batch = 0; - (batch < MAX_BATCH_SIZE) && (N < dinfo->p2m_size); - N++ ) - { - int n = N; - - if ( debug ) - { - DPRINTF("%d pfn= %08lx mfn= %08lx %d", - iter, (unsigned long)n, - hvm ? 0 : pfn_to_mfn(n), - test_bit(n, to_send)); - if ( !hvm && is_mapped(pfn_to_mfn(n)) ) - DPRINTF(" [mfn]= %08lx", - mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF)); - DPRINTF("\n"); - } - - if ( completed ) - { - /* for sparse bitmaps, word-by-word may save time */ - if ( !to_send[N >> ORDER_LONG] ) - { - /* incremented again in for loop! */ - N += BITS_PER_LONG - 1; - continue; - } - - if ( !test_bit(n, to_send) ) - continue; - - pfn_batch[batch] = n; - if ( hvm ) - pfn_type[batch] = n; - else - pfn_type[batch] = pfn_to_mfn(n); - } - else - { - int dont_skip = (last_iter || (superpages && iter==1)); - - if ( !dont_skip && - test_bit(n, to_send) && - test_bit(n, to_skip) ) - skip_this_iter++; /* stats keeping */ - - if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) || - (test_bit(n, to_send) && dont_skip) || - (test_bit(n, to_fix) && last_iter)) ) - continue; - - /* First time through, try to keep superpages in the same batch */ - if ( superpages && iter == 1 - && SUPER_PAGE_START(n) - && batch + SUPERPAGE_NR_PFNS > MAX_BATCH_SIZE ) - break; - - /* - ** we get here if: - ** 1. page is marked to_send & hasn't already been re-dirtied - ** 2. (ignore to_skip in first and last iterations) - ** 3. add in pages that still need fixup (net bufs) - */ - - pfn_batch[batch] = n; - - /* Hypercall interfaces operate in PFNs for HVM guests - * and MFNs for PV guests */ - if ( hvm ) - pfn_type[batch] = n; - else - pfn_type[batch] = pfn_to_mfn(n); - - if ( !is_mapped(pfn_type[batch]) ) - { - /* - ** not currently in psuedo-physical map -- set bit - ** in to_fix since we must send this page in last_iter - ** unless its sent sooner anyhow, or it never enters - ** pseudo-physical map (e.g. 
for ballooned down doms) - */ - set_bit(n, to_fix); - continue; - } - - if ( last_iter && - test_bit(n, to_fix) && - !test_bit(n, to_send) ) - { - needed_to_fix++; - DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n", - iter, n, pfn_type[batch]); - } - - clear_bit(n, to_fix); - } - - batch++; - } - - if ( batch == 0 ) - goto skip; /* vanishingly unlikely... */ - - region_base = xc_map_foreign_bulk( - xch, dom, PROT_READ, pfn_type, pfn_err, batch); - if ( region_base == NULL ) - { - PERROR("map batch failed"); - goto out; - } - - /* Get page types */ - if ( xc_get_pfn_type_batch(xch, dom, batch, pfn_type) ) - { - PERROR("get_pfn_type_batch failed"); - goto out; - } - - for ( run = j = 0; j < batch; j++ ) - { - unsigned long gmfn = pfn_batch[j]; - - if ( !hvm ) - gmfn = pfn_to_mfn(gmfn); - - if ( pfn_type[j] == XEN_DOMCTL_PFINFO_BROKEN ) - { - pfn_type[j] |= pfn_batch[j]; - ++run; - continue; - } - - if ( pfn_err[j] ) - { - if ( pfn_type[j] == XEN_DOMCTL_PFINFO_XTAB ) - continue; - - DPRINTF("map fail: page %i mfn %08lx err %d\n", - j, gmfn, pfn_err[j]); - pfn_type[j] = XEN_DOMCTL_PFINFO_XTAB; - continue; - } - - if ( pfn_type[j] == XEN_DOMCTL_PFINFO_XTAB ) - { - DPRINTF("type fail: page %i mfn %08lx\n", j, gmfn); - continue; - } - - if ( superpages && iter==1 && test_bit(gmfn, to_skip)) - pfn_type[j] = XEN_DOMCTL_PFINFO_XALLOC; - - /* canonicalise mfn->pfn */ - pfn_type[j] |= pfn_batch[j]; - ++run; - - if ( debug ) - { - if ( hvm ) - DPRINTF("%d pfn=%08lx sum=%08lx\n", - iter, - pfn_type[j], - csum_page(region_base + (PAGE_SIZE*j))); - else - DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx" - " sum= %08lx\n", - iter, - pfn_type[j], - gmfn, - mfn_to_pfn(gmfn), - csum_page(region_base + (PAGE_SIZE*j))); - } - } - - if ( !run ) - { - munmap(region_base, batch*PAGE_SIZE); - continue; /* bail on this batch: no valid pages */ - } - - if ( wrexact(io_fd, &batch, sizeof(unsigned int)) ) - { - PERROR("Error when writing to state file (2)"); - goto out; - } - - if ( sizeof(unsigned 
long) < sizeof(*pfn_type) ) - for ( j = 0; j < batch; j++ ) - ((unsigned long *)pfn_type)[j] = pfn_type[j]; - if ( wrexact(io_fd, pfn_type, sizeof(unsigned long)*batch) ) - { - PERROR("Error when writing to state file (3)"); - goto out; - } - if ( sizeof(unsigned long) < sizeof(*pfn_type) ) - while ( --j >= 0 ) - pfn_type[j] = ((unsigned long *)pfn_type)[j]; - - /* entering this loop, pfn_type is now in pfns (Not mfns) */ - run = 0; - for ( j = 0; j < batch; j++ ) - { - unsigned long pfn, pagetype; - void *spage = (char *)region_base + (PAGE_SIZE*j); - - pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; - pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK; - - if ( pagetype != 0 ) - { - /* If the page is not a normal data page, write out any - run of pages we may have previously acumulated */ - if ( !compressing && run ) - { - if ( wruncached(io_fd, live, - (char*)region_base+(PAGE_SIZE*(j-run)), - PAGE_SIZE*run) != PAGE_SIZE*run ) - { - PERROR("Error when writing to state file (4a)" - " (errno %d)", errno); - goto out; - } - run = 0; - } - } - - /* - * skip pages that aren't present, - * or are broken, or are alloc-only - */ - if ( pagetype == XEN_DOMCTL_PFINFO_XTAB - || pagetype == XEN_DOMCTL_PFINFO_BROKEN - || pagetype == XEN_DOMCTL_PFINFO_XALLOC ) - continue; - - pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; - - if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && - (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) ) - { - /* We have a pagetable page: need to rewrite it. 
*/ - race = - canonicalize_pagetable(ctx, pagetype, pfn, spage, page); - - if ( race && !live ) - { - ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn, - pagetype); - goto out; - } - - if (compressing) - { - int c_err; - /* Mark pagetable page to be sent uncompressed */ - c_err = xc_compression_add_page(xch, compress_ctx, page, - pfn, 1 /* raw page */); - if (c_err == -2) /* OOB PFN */ - { - ERROR("Could not add pagetable page " - "(pfn:%" PRIpfn "to page buffer\n", pfn); - goto out; - } - - if (c_err == -1) - { - /* - * We are out of buffer space to hold dirty - * pages. Compress and flush the current buffer - * to make space. This is a corner case, that - * slows down checkpointing as the compression - * happens while domain is suspended. Happens - * seldom and if you find this occuring - * frequently, increase the PAGE_BUFFER_SIZE - * in xc_compression.c. - */ - if (wrcompressed(io_fd) < 0) - { - ERROR("Error when writing compressed" - " data (4b)\n"); - goto out; - } - } - } - else if ( wruncached(io_fd, live, page, - PAGE_SIZE) != PAGE_SIZE ) - { - PERROR("Error when writing to state file (4b)" - " (errno %d)", errno); - goto out; - } - } - else - { - /* We have a normal page: accumulate it for writing. */ - if (compressing) - { - int c_err; - /* For checkpoint compression, accumulate the page in the - * page buffer, to be compressed later. 
- */ - c_err = xc_compression_add_page(xch, compress_ctx, spage, - pfn, 0 /* not raw page */); - - if (c_err == -2) /* OOB PFN */ - { - ERROR("Could not add page " - "(pfn:%" PRIpfn "to page buffer\n", pfn); - goto out; - } - - if (c_err == -1) - { - if (wrcompressed(io_fd) < 0) - { - ERROR("Error when writing compressed" - " data (4c)\n"); - goto out; - } - } - } - else - run++; - } - } /* end of the write out for this batch */ - - if ( run ) - { - /* write out the last accumulated run of pages */ - if ( wruncached(io_fd, live, - (char*)region_base+(PAGE_SIZE*(j-run)), - PAGE_SIZE*run) != PAGE_SIZE*run ) - { - PERROR("Error when writing to state file (4c)" - " (errno %d)", errno); - goto out; - } - } - - sent_this_iter += batch; - - munmap(region_base, batch*PAGE_SIZE); - - } /* end of this while loop for this iteration */ - - skip: - - xc_report_progress_step(xch, dinfo->p2m_size, dinfo->p2m_size); - - total_sent += sent_this_iter; - - if ( last_iter ) - { - print_stats( xch, dom, sent_this_iter, &time_stats, &shadow_stats, 1); - - DPRINTF("Total pages sent= %ld (%.2fx)\n", - total_sent, ((float)total_sent)/dinfo->p2m_size ); - DPRINTF("(of which %ld were fixups)\n", needed_to_fix ); - } - - if ( last_iter && debug ) - { - int id = XC_SAVE_ID_ENABLE_VERIFY_MODE; - memset(to_send, 0xff, bitmap_size(dinfo->p2m_size)); - debug = 0; - DPRINTF("Entering debug resend-all mode\n"); - - /* send "-1" to put receiver into debug mode */ - if ( wrexact(io_fd, &id, sizeof(int)) ) - { - PERROR("Error when writing to state file (6)"); - goto out; - } - - continue; - } - - if ( last_iter ) - break; - - if ( live ) - { - if ( (iter >= max_iters) || - (sent_this_iter+skip_this_iter < 50) || - (total_sent > dinfo->p2m_size*max_factor) ) - { - DPRINTF("Start last iteration\n"); - last_iter = 1; - - if ( suspend_and_state(callbacks->suspend, callbacks->data, - xch, io_fd, dom, &info) ) - { - ERROR("Domain appears not to have suspended"); - goto out; - } - - DPRINTF("SUSPEND shinfo 
%08lx\n", info.shared_info_frame); - if ( (tmem_saved > 0) && - (xc_tmem_save_extra(xch,dom,io_fd,XC_SAVE_ID_TMEM_EXTRA) == -1) ) - { - PERROR("Error when writing to state file (tmem)"); - goto out; - } - - if ( save_tsc_info(xch, dom, io_fd) < 0 ) - { - PERROR("Error when writing to state file (tsc)"); - goto out; - } - - - } - - if ( xc_shadow_control(xch, dom, - XEN_DOMCTL_SHADOW_OP_CLEAN, HYPERCALL_BUFFER(to_send), - dinfo->p2m_size, NULL, 0, &shadow_stats) != dinfo->p2m_size ) - { - PERROR("Error flushing shadow PT"); - goto out; - } - - print_stats(xch, dom, sent_this_iter, &time_stats, &shadow_stats, 1); - - } - } /* end of infinite for loop */ - - DPRINTF("All memory is saved\n"); - - /* After last_iter, buffer the rest of pagebuf & tailbuf data into a - * separate output buffer and flush it after the compressed page chunks. - */ - if (compressing) - { - ob = &ob_tailbuf; - ob->pos = 0; - } - - { - struct chunk { - int id; - int max_vcpu_id; - uint64_t vcpumap[XC_SR_MAX_VCPUS/64]; - } chunk = { XC_SAVE_ID_VCPU_INFO, info.max_vcpu_id }; - - if ( info.max_vcpu_id >= XC_SR_MAX_VCPUS ) - { - errno = E2BIG; - ERROR("Too many VCPUS in guest!"); - goto out; - } - - for ( i = 1; i <= info.max_vcpu_id; i++ ) - { - xc_vcpuinfo_t vinfo; - if ( (xc_vcpu_getinfo(xch, dom, i, &vinfo) == 0) && - vinfo.online ) - vcpumap[i/64] |= 1ULL << (i%64); - } - - memcpy(chunk.vcpumap, vcpumap, vcpumap_sz(info.max_vcpu_id)); - if ( wrexact(io_fd, &chunk, offsetof(struct chunk, vcpumap) - + vcpumap_sz(info.max_vcpu_id)) ) - { - PERROR("Error when writing to state file"); - goto out; - } - } - - if ( hvm ) - { - struct { - int id; - uint32_t pad; - uint64_t data; - } chunk = { 0, }; - - chunk.id = XC_SAVE_ID_HVM_GENERATION_ID_ADDR; - xc_hvm_param_get(xch, dom, HVM_PARAM_VM_GENERATION_ID_ADDR, &chunk.data); - - if ( (chunk.data != 0) && - wrexact(io_fd, &chunk, sizeof(chunk)) ) - { - PERROR("Error when writing the generation id buffer location for guest"); - goto out; - } - - chunk.id = 
XC_SAVE_ID_HVM_IDENT_PT; - chunk.data = 0; - xc_hvm_param_get(xch, dom, HVM_PARAM_IDENT_PT, &chunk.data); - - if ( (chunk.data != 0) && - wrexact(io_fd, &chunk, sizeof(chunk)) ) - { - PERROR("Error when writing the ident_pt for EPT guest"); - goto out; - } - - chunk.id = XC_SAVE_ID_HVM_PAGING_RING_PFN; - chunk.data = 0; - xc_hvm_param_get(xch, dom, HVM_PARAM_PAGING_RING_PFN, &chunk.data); - - if ( (chunk.data != 0) && - wrexact(io_fd, &chunk, sizeof(chunk)) ) - { - PERROR("Error when writing the paging ring pfn for guest"); - goto out; - } - - chunk.id = XC_SAVE_ID_HVM_MONITOR_RING_PFN; - chunk.data = 0; - xc_hvm_param_get(xch, dom, HVM_PARAM_MONITOR_RING_PFN, &chunk.data); - - if ( (chunk.data != 0) && - wrexact(io_fd, &chunk, sizeof(chunk)) ) - { - PERROR("Error when writing the access ring pfn for guest"); - goto out; - } - - chunk.id = XC_SAVE_ID_HVM_SHARING_RING_PFN; - chunk.data = 0; - xc_hvm_param_get(xch, dom, HVM_PARAM_SHARING_RING_PFN, &chunk.data); - - if ( (chunk.data != 0) && - wrexact(io_fd, &chunk, sizeof(chunk)) ) - { - PERROR("Error when writing the sharing ring pfn for guest"); - goto out; - } - - chunk.id = XC_SAVE_ID_HVM_VM86_TSS; - chunk.data = 0; - xc_hvm_param_get(xch, dom, HVM_PARAM_VM86_TSS, &chunk.data); - - if ( (chunk.data != 0) && - wrexact(io_fd, &chunk, sizeof(chunk)) ) - { - PERROR("Error when writing the vm86 TSS for guest"); - goto out; - } - - chunk.id = XC_SAVE_ID_HVM_CONSOLE_PFN; - chunk.data = 0; - xc_hvm_param_get(xch, dom, HVM_PARAM_CONSOLE_PFN, &chunk.data); - - if ( (chunk.data != 0) && - wrexact(io_fd, &chunk, sizeof(chunk)) ) - { - PERROR("Error when writing the console pfn for guest"); - goto out; - } - - chunk.id = XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION; - chunk.data = 0; - xc_hvm_param_get(xch, dom, HVM_PARAM_ACPI_IOPORTS_LOCATION, &chunk.data); - - if ((chunk.data != 0) && wrexact(io_fd, &chunk, sizeof(chunk))) - { - PERROR("Error when writing the firmware ioport version"); - goto out; - } - - chunk.id = 
XC_SAVE_ID_HVM_VIRIDIAN; - chunk.data = 0; - xc_hvm_param_get(xch, dom, HVM_PARAM_VIRIDIAN, &chunk.data); - - if ( (chunk.data != 0) && - wrexact(io_fd, &chunk, sizeof(chunk)) ) - { - PERROR("Error when writing the viridian flag"); - goto out; - } - - chunk.id = XC_SAVE_ID_HVM_IOREQ_SERVER_PFN; - chunk.data = 0; - xc_hvm_param_get(xch, dom, HVM_PARAM_IOREQ_SERVER_PFN, &chunk.data); - - if ( (chunk.data != 0) && - wrexact(io_fd, &chunk, sizeof(chunk)) ) - { - PERROR("Error when writing the ioreq server gmfn base"); - goto out; - } - - chunk.id = XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES; - chunk.data = 0; - xc_hvm_param_get(xch, dom, HVM_PARAM_NR_IOREQ_SERVER_PAGES, &chunk.data); - - if ( (chunk.data != 0) && - wrexact(io_fd, &chunk, sizeof(chunk)) ) - { - PERROR("Error when writing the ioreq server gmfn count"); - goto out; - } - } - - if ( callbacks != NULL && callbacks->toolstack_save != NULL ) - { - int id = XC_SAVE_ID_TOOLSTACK; - uint8_t *buf; - uint32_t len; - - if ( callbacks->toolstack_save(dom, &buf, &len, callbacks->data) < 0 ) - { - PERROR("Error calling toolstack_save"); - goto out; - } - wrexact(io_fd, &id, sizeof(id)); - wrexact(io_fd, &len, sizeof(len)); - wrexact(io_fd, buf, len); - free(buf); - } - - if ( !callbacks->checkpoint ) - { - /* - * If this is not a checkpointed save then this must be the first and - * last checkpoint. - */ - i = XC_SAVE_ID_LAST_CHECKPOINT; - if ( wrexact(io_fd, &i, sizeof(int)) ) - { - PERROR("Error when writing last checkpoint chunk"); - goto out; - } - } - - /* Enable compression logic on both sides by sending this - * one time marker. - * NOTE: We could have simplified this procedure by sending - * the enable/disable compression flag before the beginning of - * the main for loop. But this would break compatibility for - * live migration code, with older versions of xen. So we have - * to enable it after the last_iter, when the XC_SAVE_ID_* - * elements are sent. 
- */ - if (!compressing && (flags & XCFLAGS_CHECKPOINT_COMPRESS)) - { - i = XC_SAVE_ID_ENABLE_COMPRESSION; - if ( wrexact(io_fd, &i, sizeof(int)) ) - { - PERROR("Error when writing enable_compression marker"); - goto out; - } - } - - /* Zero terminate */ - i = 0; - if ( wrexact(io_fd, &i, sizeof(int)) ) - { - PERROR("Error when writing to state file (6')"); - goto out; - } - - if ( hvm ) - { - uint32_t rec_size; - - /* Save magic-page locations. */ - memset(magic_pfns, 0, sizeof(magic_pfns)); - xc_hvm_param_get(xch, dom, HVM_PARAM_IOREQ_PFN, &magic_pfns[0]); - xc_hvm_param_get(xch, dom, HVM_PARAM_BUFIOREQ_PFN, &magic_pfns[1]); - xc_hvm_param_get(xch, dom, HVM_PARAM_STORE_PFN, &magic_pfns[2]); - if ( wrexact(io_fd, magic_pfns, sizeof(magic_pfns)) ) - { - PERROR("Error when writing to state file (7)"); - goto out; - } - - /* Get HVM context from Xen and save it too */ - if ( (rec_size = xc_domain_hvm_getcontext(xch, dom, hvm_buf, - hvm_buf_size)) == -1 ) - { - PERROR("HVM:Could not get hvm buffer"); - goto out; - } - - if ( wrexact(io_fd, &rec_size, sizeof(uint32_t)) ) - { - PERROR("error write hvm buffer size"); - goto out; - } - - if ( wrexact(io_fd, hvm_buf, rec_size) ) - { - PERROR("write HVM info failed!"); - goto out; - } - - /* HVM guests are done now */ - goto success; - } - - /* PV guests only from now on */ - - /* Send through a list of all the PFNs that were not in map at the close */ - { - unsigned int i,j; - unsigned long pfntab[1024]; - - for ( i = 0, j = 0; i < dinfo->p2m_size; i++ ) - { - if ( !is_mapped(pfn_to_mfn(i)) ) - j++; - } - - if ( wrexact(io_fd, &j, sizeof(unsigned int)) ) - { - PERROR("Error when writing to state file (6a)"); - goto out; - } - - for ( i = 0, j = 0; i < dinfo->p2m_size; ) - { - if ( !is_mapped(pfn_to_mfn(i)) ) - pfntab[j++] = i; - - i++; - if ( (j == 1024) || (i == dinfo->p2m_size) ) - { - if ( wrexact(io_fd, &pfntab, sizeof(unsigned long)*j) ) - { - PERROR("Error when writing to state file (6b)"); - goto out; - } - j = 0; - 
} - } - } - - if ( xc_vcpu_getcontext(xch, dom, 0, &ctxt) ) - { - PERROR("Could not get vcpu context"); - goto out; - } - - /* - * Canonicalise the start info frame number. - * - * The start info MFN is the 3rd argument to the - * HYPERVISOR_sched_op hypercall when op==SCHEDOP_shutdown and - * reason==SHUTDOWN_suspend and is therefore found in the edx - * register. - */ - mfn = GET_FIELD(&ctxt, user_regs.edx, dinfo->guest_width); - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) - { - errno = ERANGE; - ERROR("Suspend record is not in range of pseudophys map"); - goto out; - } - SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn), dinfo->guest_width); - - for ( i = 0; i <= info.max_vcpu_id; i++ ) - { - if ( !(vcpumap[i/64] & (1ULL << (i%64))) ) - continue; - - if ( (i != 0) && xc_vcpu_getcontext(xch, dom, i, &ctxt) ) - { - PERROR("No context for VCPU%d", i); - goto out; - } - - /* Canonicalise each GDT frame number. */ - for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents, dinfo->guest_width); j++ ) - { - mfn = GET_FIELD(&ctxt, gdt_frames[j], dinfo->guest_width); - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) - { - errno = ERANGE; - ERROR("GDT frame is not in range of pseudophys map"); - goto out; - } - SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn), dinfo->guest_width); - } - - /* Canonicalise the page table base pointer. */ - if ( !MFN_IS_IN_PSEUDOPHYS_MAP( - UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3], dinfo->guest_width))) ) - { - errno = ERANGE; - ERROR("PT base is not in range of pseudophys map"); - goto out; - } - SET_FIELD(&ctxt, ctrlreg[3], - FOLD_CR3(mfn_to_pfn(UNFOLD_CR3( - GET_FIELD(&ctxt, ctrlreg[3], dinfo->guest_width) - ))), dinfo->guest_width); - - /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */ - if ( (ctx->pt_levels == 4) && ctxt.x64.ctrlreg[1] ) - { - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(ctxt.x64.ctrlreg[1])) ) - { - errno = ERANGE; - ERROR("PT base is not in range of pseudophys map"); - goto out; - } - /* Least-significant bit means 'valid PFN'. 
*/ - ctxt.x64.ctrlreg[1] = 1 | - FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(ctxt.x64.ctrlreg[1]))); - } - - if ( wrexact(io_fd, &ctxt, ((dinfo->guest_width==8) - ? sizeof(ctxt.x64) - : sizeof(ctxt.x32))) ) - { - PERROR("Error when writing to state file (1)"); - goto out; - } - - domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext; - domctl.domain = dom; - memset(&domctl.u, 0, sizeof(domctl.u)); - domctl.u.ext_vcpucontext.vcpu = i; - if ( xc_domctl(xch, &domctl) < 0 ) - { - PERROR("No extended context for VCPU%d", i); - goto out; - } - if ( wrexact(io_fd, &domctl.u.ext_vcpucontext, 128) ) - { - PERROR("Error when writing to state file (2)"); - goto out; - } - - /* Check there are no PV MSRs in use. */ - domctl.cmd = XEN_DOMCTL_get_vcpu_msrs; - domctl.domain = dom; - memset(&domctl.u, 0, sizeof(domctl.u)); - domctl.u.vcpu_msrs.vcpu = i; - domctl.u.vcpu_msrs.msr_count = 0; - set_xen_guest_handle_raw(domctl.u.vcpu_msrs.msrs, (void*)1); - - if ( xc_domctl(xch, &domctl) < 0 ) - { - if ( errno == ENOBUFS ) - { - errno = EOPNOTSUPP; - PERROR("Unable to migrate PV guest using MSRs (yet)"); - } - else - PERROR("Error querying maximum number of MSRs for VCPU%d", i); - goto out; - } - - /* Start to fetch CPU eXtended States */ - /* Get buffer size first */ - domctl.cmd = XEN_DOMCTL_getvcpuextstate; - domctl.domain = dom; - domctl.u.vcpuextstate.vcpu = i; - domctl.u.vcpuextstate.xfeature_mask = 0; - domctl.u.vcpuextstate.size = 0; - if ( xc_domctl(xch, &domctl) < 0 ) - { - PERROR("No eXtended states (XSAVE) for VCPU%d", i); - goto out; - } - - if ( !domctl.u.vcpuextstate.xfeature_mask ) - continue; - - /* Getting eXtended states data */ - buffer = xc_hypercall_buffer_alloc(xch, buffer, domctl.u.vcpuextstate.size); - if ( !buffer ) - { - PERROR("Insufficient memory for getting eXtended states for" - "VCPU%d", i); - goto out; - } - set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer); - if ( xc_domctl(xch, &domctl) < 0 ) - { - PERROR("No eXtended states (XSAVE) for VCPU%d", i); - 
xc_hypercall_buffer_free(xch, buffer); - goto out; - } - - if ( wrexact(io_fd, &domctl.u.vcpuextstate.xfeature_mask, - sizeof(domctl.u.vcpuextstate.xfeature_mask)) || - wrexact(io_fd, &domctl.u.vcpuextstate.size, - sizeof(domctl.u.vcpuextstate.size)) || - wrexact(io_fd, buffer, domctl.u.vcpuextstate.size) ) - { - PERROR("Error when writing to state file VCPU extended state"); - xc_hypercall_buffer_free(xch, buffer); - goto out; - } - xc_hypercall_buffer_free(xch, buffer); - } - - /* - * Reset the MFN to be a known-invalid value. See map_frame_list_list(). - */ - memcpy(page, live_shinfo, PAGE_SIZE); - SET_FIELD(((shared_info_any_t *)page), - arch.pfn_to_mfn_frame_list_list, 0, dinfo->guest_width); - if ( wrexact(io_fd, page, PAGE_SIZE) ) - { - PERROR("Error when writing to state file (1)"); - goto out; - } - - /* Flush last write and check for errors. */ - if ( fsync(io_fd) && errno != EINVAL ) - { - PERROR("Error when flushing state file"); - goto out; - } - - /* Success! */ - success: - rc = errno = 0; - goto out_rc; - - out: - rc = errno; - assert(rc); - out_rc: - completed = 1; - - if ( !rc && callbacks->postcopy ) - callbacks->postcopy(callbacks->data); - - /* guest has been resumed. Now we can compress data - * at our own pace. - */ - if (!rc && compressing) - { - ob = &ob_pagebuf; - if (wrcompressed(io_fd) < 0) - { - ERROR("Error when writing compressed data, after postcopy\n"); - goto out; - } - /* Append the tailbuf data to the main outbuf */ - if ( wrexact(io_fd, ob_tailbuf.buf, ob_tailbuf.pos) ) - { - PERROR("Error when copying tailbuf into outbuf"); - goto out; - } - } - - /* Flush last write and discard cache for file. 
*/ - if ( ob && outbuf_flush(xch, ob, io_fd) < 0 ) { - PERROR("Error when flushing output buffer"); - if (!rc) - rc = errno; - } - - discard_file_cache(xch, io_fd, 1 /* flush */); - - /* Enable compression now, finally */ - compressing = (flags & XCFLAGS_CHECKPOINT_COMPRESS); - - /* checkpoint_cb can spend arbitrarily long in between rounds */ - if (!rc && callbacks->checkpoint && - callbacks->checkpoint(callbacks->data) > 0) - { - /* reset stats timer */ - print_stats(xch, dom, 0, &time_stats, &shadow_stats, 0); - - /* last_iter = 1; */ - if ( suspend_and_state(callbacks->suspend, callbacks->data, xch, - io_fd, dom, &info) ) - { - ERROR("Domain appears not to have suspended"); - goto out; - } - DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame); - print_stats(xch, dom, 0, &time_stats, &shadow_stats, 1); - - if ( xc_shadow_control(xch, dom, - XEN_DOMCTL_SHADOW_OP_CLEAN, HYPERCALL_BUFFER(to_send), - dinfo->p2m_size, NULL, 0, &shadow_stats) != dinfo->p2m_size ) - { - PERROR("Error flushing shadow PT"); - } - - goto copypages; - } - - if ( tmem_saved != 0 && live ) - xc_tmem_save_done(xch, dom); - - if ( live ) - { - if ( xc_shadow_control(xch, dom, - XEN_DOMCTL_SHADOW_OP_OFF, - NULL, 0, NULL, 0, NULL) < 0 ) - DPRINTF("Warning - couldn't disable shadow mode"); - if ( hvm && callbacks->switch_qemu_logdirty(dom, 0, callbacks->data) ) - DPRINTF("Warning - couldn't disable qemu log-dirty mode"); - } - - if (compress_ctx) - xc_compression_free_context(xch, compress_ctx); - - if ( live_shinfo ) - munmap(live_shinfo, PAGE_SIZE); - - if ( ctx->live_p2m ) - munmap(ctx->live_p2m, P2M_FL_ENTRIES * PAGE_SIZE); - - if ( ctx->live_m2p ) - munmap(ctx->live_m2p, M2P_SIZE(ctx->max_mfn)); - - xc_hypercall_buffer_free_pages(xch, to_send, NRPAGES(bitmap_size(dinfo->p2m_size))); - xc_hypercall_buffer_free_pages(xch, to_skip, NRPAGES(bitmap_size(dinfo->p2m_size))); - - free(pfn_type); - free(pfn_batch); - free(pfn_err); - free(to_fix); - free(hvm_buf); - outbuf_free(&ob_pagebuf); - 
- errno = rc; -exit: - DPRINTF("Save exit of domid %u with errno=%d\n", dom, errno); - - return !!errno; -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xc_offline_page.c b/tools/libxc/xc_offline_page.c index b1d169c..c2a8230 100644 --- a/tools/libxc/xc_offline_page.c +++ b/tools/libxc/xc_offline_page.c @@ -396,6 +396,65 @@ static int is_page_exchangable(xc_interface *xch, int domid, xen_pfn_t mfn, return 1; } +xen_pfn_t *xc_map_m2p(xc_interface *xch, + unsigned long max_mfn, + int prot, + unsigned long *mfn0) +{ + privcmd_mmap_entry_t *entries; + unsigned long m2p_chunks, m2p_size; + xen_pfn_t *m2p; + xen_pfn_t *extent_start; + int i; + + m2p = NULL; + m2p_size = M2P_SIZE(max_mfn); + m2p_chunks = M2P_CHUNKS(max_mfn); + + extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t)); + if ( !extent_start ) + { + ERROR("failed to allocate space for m2p mfns"); + goto err0; + } + + if ( xc_machphys_mfn_list(xch, m2p_chunks, extent_start) ) + { + PERROR("xc_get_m2p_mfns"); + goto err1; + } + + entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t)); + if (entries == NULL) + { + ERROR("failed to allocate space for mmap entries"); + goto err1; + } + + for ( i = 0; i < m2p_chunks; i++ ) + entries[i].mfn = extent_start[i]; + + m2p = xc_map_foreign_ranges(xch, DOMID_XEN, + m2p_size, prot, M2P_CHUNK_SIZE, + entries, m2p_chunks); + if (m2p == NULL) + { + PERROR("xc_mmap_foreign_ranges failed"); + goto err2; + } + + if (mfn0) + *mfn0 = entries[0].mfn; + +err2: + free(entries); +err1: + free(extent_start); + +err0: + return m2p; +} + /* The domain should be suspended when called here */ int xc_exchange_page(xc_interface *xch, int domid, xen_pfn_t mfn) { diff --git a/tools/libxc/xg_save_restore.h b/tools/libxc/xg_save_restore.h index 57d4e8f..c2686af 100644 --- a/tools/libxc/xg_save_restore.h +++ b/tools/libxc/xg_save_restore.h @@ -22,248 +22,6 @@ #include /* - * 
SAVE/RESTORE/MIGRATE PROTOCOL - * ============================= - * - * The general form of a stream of chunks is a header followed by a - * body consisting of a variable number of chunks (terminated by a - * chunk with type 0) followed by a trailer. - * - * For a rolling/checkpoint (e.g. remus) migration then the body and - * trailer phases can be repeated until an external event - * (e.g. failure) causes the process to terminate and commit to the - * most recent complete checkpoint. - * - * HEADER - * ------ - * - * unsigned long : p2m_size - * - * extended-info (PV-only, optional): - * - * If first unsigned long == ~0UL then extended info is present, - * otherwise unsigned long is part of p2m. Note that p2m_size above - * does not include the length of the extended info. - * - * extended-info: - * - * unsigned long : signature == ~0UL - * uint32_t : number of bytes remaining in extended-info - * - * 1 or more extended-info blocks of form: - * char[4] : block identifier - * uint32_t : block data size - * bytes : block data - * - * defined extended-info blocks: - * "vcpu" : VCPU context info containing vcpu_guest_context_t. - * The precise variant of the context structure - * (e.g. 32 vs 64 bit) is distinguished by - * the block size. - * "extv" : Presence indicates use of extended VCPU context in - * tail, data size is 0. - * - * p2m (PV-only): - * - * consists of p2m_size bytes comprising an array of xen_pfn_t sized entries. - * - * BODY PHASE - Format A (for live migration or Remus without compression) - * ---------- - * - * A series of chunks with a common header: - * int : chunk type - * - * If the chunk type is +ve then chunk contains guest memory data, and the - * type contains the number of pages in the batch: - * - * unsigned long[] : PFN array, length == number of pages in batch - * Each entry consists of XEN_DOMCTL_PFINFO_* - * in bits 31-28 and the PFN number in bits 27-0. 
- * page data : PAGE_SIZE bytes for each page marked present in PFN - * array - * - * If the chunk type is -ve then chunk consists of one of a number of - * metadata types. See definitions of XC_SAVE_ID_* below. - * - * If chunk type is 0 then body phase is complete. - * - * - * BODY PHASE - Format B (for Remus with compression) - * ---------- - * - * A series of chunks with a common header: - * int : chunk type - * - * If the chunk type is +ve then chunk contains array of PFNs corresponding - * to guest memory and type contains the number of PFNs in the batch: - * - * unsigned long[] : PFN array, length == number of pages in batch - * Each entry consists of XEN_DOMCTL_PFINFO_* - * in bits 31-28 and the PFN number in bits 27-0. - * - * If the chunk type is -ve then chunk consists of one of a number of - * metadata types. See definitions of XC_SAVE_ID_* below. - * - * If the chunk type is -ve and equals XC_SAVE_ID_COMPRESSED_DATA, then the - * chunk consists of compressed page data, in the following format: - * - * unsigned long : Size of the compressed chunk to follow - * compressed data : variable length data of size indicated above. - * This chunk consists of compressed page data. - * The number of pages in one chunk depends on - * the amount of space available in the sender's - * output buffer. - * - * Format of compressed data: - * compressed_data = * - * delta = - * marker = (RUNFLAG|SKIPFLAG) bitwise-or RUNLEN [1 byte marker] - * RUNFLAG = 0 - * SKIPFLAG = 1 << 7 - * RUNLEN = 7-bit unsigned value indicating number of WORDS in the run - * run = string of bytes of length sizeof(WORD) * RUNLEN - * - * If marker contains RUNFLAG, then RUNLEN * sizeof(WORD) bytes of data following - * the marker is copied into the target page at the appropriate offset indicated by - * the offset_ptr - * If marker contains SKIPFLAG, then the offset_ptr is advanced - * by RUNLEN * sizeof(WORD). - * - * If chunk type is 0 then body phase is complete. 
- * - * There can be one or more chunks with type XC_SAVE_ID_COMPRESSED_DATA, - * containing compressed pages. The compressed chunks are collated to form - * one single compressed chunk for the entire iteration. The number of pages - * present in this final compressed chunk will be equal to the total number - * of valid PFNs specified by the +ve chunks. - * - * At the sender side, compressed pages are inserted into the output stream - * in the same order as they would have been if compression logic was absent. - * - * Until last iteration, the BODY is sent in Format A, to maintain live - * migration compatibility with receivers of older Xen versions. - * At the last iteration, if Remus compression was enabled, the sender sends - * a trigger, XC_SAVE_ID_ENABLE_COMPRESSION to tell the receiver to parse the - * BODY in Format B from the next iteration onwards. - * - * An example sequence of chunks received in Format B: - * +16 +ve chunk - * unsigned long[16] PFN array - * +100 +ve chunk - * unsigned long[100] PFN array - * +50 +ve chunk - * unsigned long[50] PFN array - * - * XC_SAVE_ID_COMPRESSED_DATA TAG - * N Length of compressed data - * N bytes of DATA Decompresses to 166 pages - * - * XC_SAVE_ID_* other xc save chunks - * 0 END BODY TAG - * - * Corner case with checkpoint compression: - * At sender side, after pausing the domain, dirty pages are usually - * copied out to a temporary buffer. After the domain is resumed, - * compression is done and the compressed chunk(s) are sent, followed by - * other XC_SAVE_ID_* chunks. - * If the temporary buffer gets full while scanning for dirty pages, - * the sender stops buffering of dirty pages, compresses the temporary - * buffer and sends the compressed data with XC_SAVE_ID_COMPRESSED_DATA. - * The sender then resumes the buffering of dirty pages and continues - * scanning for the dirty pages. - * For e.g., assume that the temporary buffer can hold 4096 pages and - * there are 5000 dirty pages. 
The following is the sequence of chunks - * that the receiver will see: - * - * +1024 +ve chunk - * unsigned long[1024] PFN array - * +1024 +ve chunk - * unsigned long[1024] PFN array - * +1024 +ve chunk - * unsigned long[1024] PFN array - * +1024 +ve chunk - * unsigned long[1024] PFN array - * - * XC_SAVE_ID_COMPRESSED_DATA TAG - * N Length of compressed data - * N bytes of DATA Decompresses to 4096 pages - * - * +4 +ve chunk - * unsigned long[4] PFN array - * - * XC_SAVE_ID_COMPRESSED_DATA TAG - * M Length of compressed data - * M bytes of DATA Decompresses to 4 pages - * - * XC_SAVE_ID_* other xc save chunks - * 0 END BODY TAG - * - * In other words, XC_SAVE_ID_COMPRESSED_DATA can be interleaved with - * +ve chunks arbitrarily. But at the receiver end, the following condition - * always holds true until the end of BODY PHASE: - * num(PFN entries +ve chunks) >= num(pages received in compressed form) - * - * TAIL PHASE - * ---------- - * - * Content differs for PV and HVM guests. - * - * HVM TAIL: - * - * "Magic" pages: - * uint64_t : I/O req PFN - * uint64_t : Buffered I/O req PFN - * uint64_t : Store PFN - * Xen HVM Context: - * uint32_t : Length of context in bytes - * bytes : Context data - * Qemu context: - * char[21] : Signature: - * "QemuDeviceModelRecord" : Read Qemu save data until EOF - * "DeviceModelRecord0002" : uint32_t length field followed by that many - * bytes of Qemu save data - * "RemusDeviceModelState" : Currently the same as "DeviceModelRecord0002". - * - * PV TAIL: - * - * Unmapped PFN list : list of all the PFNs that were not in map at the close - * unsigned int : Number of unmapped pages - * unsigned long[] : PFNs of unmapped pages - * - * VCPU context data : A series of VCPU records, one per present VCPU - * Maximum and present map supplied in XC_SAVE_ID_VCPUINFO - * bytes: : VCPU context structure. 
Size is determined by size - * provided in extended-info header - * bytes[128] : Extended VCPU context (present IFF "extv" block - * present in extended-info header) - * - * Shared Info Page : 4096 bytes of shared info page - */ - -#define XC_SAVE_ID_ENABLE_VERIFY_MODE -1 /* Switch to validation phase. */ -#define XC_SAVE_ID_VCPU_INFO -2 /* Additional VCPU info */ -#define XC_SAVE_ID_HVM_IDENT_PT -3 /* (HVM-only) */ -#define XC_SAVE_ID_HVM_VM86_TSS -4 /* (HVM-only) */ -#define XC_SAVE_ID_TMEM -5 -#define XC_SAVE_ID_TMEM_EXTRA -6 -#define XC_SAVE_ID_TSC_INFO -7 -#define XC_SAVE_ID_HVM_CONSOLE_PFN -8 /* (HVM-only) */ -#define XC_SAVE_ID_LAST_CHECKPOINT -9 /* Commit to restoring after completion of current iteration. */ -#define XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION -10 -#define XC_SAVE_ID_HVM_VIRIDIAN -11 -#define XC_SAVE_ID_COMPRESSED_DATA -12 /* Marker to indicate arrival of compressed data */ -#define XC_SAVE_ID_ENABLE_COMPRESSION -13 /* Marker to enable compression logic at receiver side */ -#define XC_SAVE_ID_HVM_GENERATION_ID_ADDR -14 -/* Markers for the pfn's hosting these mem event rings */ -#define XC_SAVE_ID_HVM_PAGING_RING_PFN -15 -#define XC_SAVE_ID_HVM_MONITOR_RING_PFN -16 -#define XC_SAVE_ID_HVM_SHARING_RING_PFN -17 -#define XC_SAVE_ID_TOOLSTACK -18 /* Optional toolstack specific info */ -/* These are a pair; it is an error for one to exist without the other */ -#define XC_SAVE_ID_HVM_IOREQ_SERVER_PFN -19 -#define XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES -20 - -/* ** We process save/restore/migrate in batches of pages; the below ** determines how many pages we (at maximum) deal with in each batch. */ @@ -272,11 +30,6 @@ /* When pinning page tables at the end of restore, we also use batching. */ #define MAX_PIN_BATCH 1024 -/* Maximum #VCPUs currently supported for save/restore. 
*/ -#define XC_SR_MAX_VCPUS 4096 -#define vcpumap_sz(max_id) (((max_id)/64+1)*sizeof(uint64_t)) - - /* ** Determine various platform information required for save/restore, in ** particular: -- 1.7.10.4