xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset
@ 2020-04-23 15:30 Tamas K Lengyel
  2020-04-23 15:30 ` [PATCH v17 2/2] xen/tools: VM forking toolstack side Tamas K Lengyel
  2020-04-24  9:43 ` [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset Roger Pau Monné
  0 siblings, 2 replies; 12+ messages in thread
From: Tamas K Lengyel @ 2020-04-23 15:30 UTC (permalink / raw)
  To: xen-devel
  Cc: Tamas K Lengyel, Tamas K Lengyel, Wei Liu, Andrew Cooper,
	George Dunlap, Jan Beulich, Roger Pau Monné

When resetting a VM fork we ought to only remove pages that were allocated for
the fork during it's execution and the contents copied over from the parent.
This can be determined if the page is sharable as special pages used by the
fork for other purposes will not pass this test. Unfortunately during the fork
reset loop we only partially check whether that's the case. A page's type may
indicate it is sharable (pass p2m_is_sharable) but that's not a sufficient
check by itself. All checks that are normally performed before a page is
converted to the sharable type need to be performed to avoid removing pages
from the p2m that may be used for other purposes. For example, currently the
reset loop also removes the vcpu info pages from the p2m, potentially putting
the guest into infinite page-fault loops.

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
v17: Changes based on feedback from Roger
---
 xen/arch/x86/mm/mem_sharing.c | 83 ++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 36 deletions(-)

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index bb74595351..344a5bfb3d 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -633,31 +633,33 @@ unsigned int mem_sharing_get_nr_shared_mfns(void)
 /* Functions that change a page's type and ownership */
 static int page_make_sharable(struct domain *d,
                               struct page_info *page,
-                              int expected_refcnt)
+                              unsigned int expected_refcnt,
+                              bool validate_only)
 {
-    bool_t drop_dom_ref;
+    int rc = 0;
+    bool drop_dom_ref = false;
 
-    spin_lock(&d->page_alloc_lock);
+    spin_lock_recursive(&d->page_alloc_lock);
 
     if ( d->is_dying )
     {
-        spin_unlock(&d->page_alloc_lock);
-        return -EBUSY;
+        rc = -EBUSY;
+        goto out;
     }
 
     /* Change page type and count atomically */
     if ( !get_page_and_type(page, d, PGT_shared_page) )
     {
-        spin_unlock(&d->page_alloc_lock);
-        return -EINVAL;
+        rc = -EINVAL;
+        goto out;
     }
 
     /* Check it wasn't already sharable and undo if it was */
     if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
     {
-        spin_unlock(&d->page_alloc_lock);
         put_page_and_type(page);
-        return -EEXIST;
+        rc = -EEXIST;
+        goto out;
     }
 
     /*
@@ -666,20 +668,26 @@ static int page_make_sharable(struct domain *d,
      */
     if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) )
     {
-        spin_unlock(&d->page_alloc_lock);
         /* Return type count back to zero */
         put_page_and_type(page);
-        return -E2BIG;
+        rc = -E2BIG;
+        goto out;
     }
 
-    page_set_owner(page, dom_cow);
-    drop_dom_ref = !domain_adjust_tot_pages(d, -1);
-    page_list_del(page, &d->page_list);
-    spin_unlock(&d->page_alloc_lock);
+    if ( !validate_only )
+    {
+        page_set_owner(page, dom_cow);
+        drop_dom_ref = !domain_adjust_tot_pages(d, -1);
+        page_list_del(page, &d->page_list);
+    }
+
+out:
+    spin_unlock_recursive(&d->page_alloc_lock);
 
     if ( drop_dom_ref )
         put_domain(d);
-    return 0;
+
+    return rc;
 }
 
 static int page_make_private(struct domain *d, struct page_info *page)
@@ -810,7 +818,8 @@ static int debug_gref(struct domain *d, grant_ref_t ref)
 }
 
 static int nominate_page(struct domain *d, gfn_t gfn,
-                         int expected_refcnt, shr_handle_t *phandle)
+                         unsigned int expected_refcnt, bool validate_only,
+                         shr_handle_t *phandle)
 {
     struct p2m_domain *hp2m = p2m_get_hostp2m(d);
     p2m_type_t p2mt;
@@ -879,8 +888,8 @@ static int nominate_page(struct domain *d, gfn_t gfn,
     }
 
     /* Try to convert the mfn to the sharable type */
-    ret = page_make_sharable(d, page, expected_refcnt);
-    if ( ret )
+    ret = page_make_sharable(d, page, expected_refcnt, validate_only);
+    if ( ret || validate_only )
         goto out;
 
     /*
@@ -1392,13 +1401,13 @@ static int range_share(struct domain *d, struct domain *cd,
          * We only break out if we run out of memory as individual pages may
          * legitimately be unsharable and we just want to skip over those.
          */
-        rc = nominate_page(d, _gfn(start), 0, &sh);
+        rc = nominate_page(d, _gfn(start), 0, false, &sh);
         if ( rc == -ENOMEM )
             break;
 
         if ( !rc )
         {
-            rc = nominate_page(cd, _gfn(start), 0, &ch);
+            rc = nominate_page(cd, _gfn(start), 0, false, &ch);
             if ( rc == -ENOMEM )
                 break;
 
@@ -1478,7 +1487,7 @@ int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
         /* For read-only accesses we just add a shared entry to the physmap */
         while ( parent )
         {
-            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
+            if ( !(rc = nominate_page(parent, gfn, 0, false, &handle)) )
                 break;
 
             parent = parent->parent;
@@ -1775,20 +1784,22 @@ static int mem_sharing_fork_reset(struct domain *d, struct domain *pd)
     spin_lock_recursive(&d->page_alloc_lock);
     page_list_for_each_safe(page, tmp, &d->page_list)
     {
-        p2m_type_t p2mt;
-        p2m_access_t p2ma;
+        shr_handle_t sh;
         mfn_t mfn = page_to_mfn(page);
         gfn_t gfn = mfn_to_gfn(d, mfn);
 
-        mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
-                                    0, NULL, false);
-
-        /* only reset pages that are sharable */
-        if ( !p2m_is_sharable(p2mt) )
-            continue;
-
-        /* take an extra reference or just skip if can't for whatever reason */
-        if ( !get_page(page, d) )
+        /*
+         * We only want to remove pages from the fork here that were copied
+         * from the parent but could be potentially re-populated using memory
+         * sharing after the reset. These pages all must be regular pages with
+         * no extra reference held to them, thus should be possible to make
+         * them sharable. Unfortunately p2m_is_sharable check is not sufficient
+         * to test this as it doesn't check the page's reference count. We thus
+         * check whether the page is convertable to the shared type using
+         * nominate_page. In case the page is already shared (ie. a share
+         * handle is returned) then we don't remove it.
+         */
+        if ( (rc = nominate_page(d, gfn, 0, true, &sh)) || sh )
             continue;
 
         /* forked memory is 4k, not splitting large pages so this must work */
@@ -1797,7 +1808,7 @@ static int mem_sharing_fork_reset(struct domain *d, struct domain *pd)
         ASSERT(!rc);
 
         put_page_alloc_ref(page);
-        put_page(page);
+        put_page_and_type(page);
     }
     spin_unlock_recursive(&d->page_alloc_lock);
 
@@ -1839,7 +1850,7 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
     {
         shr_handle_t handle;
 
-        rc = nominate_page(d, _gfn(mso.u.nominate.u.gfn), 0, &handle);
+        rc = nominate_page(d, _gfn(mso.u.nominate.u.gfn), 0, false, &handle);
         mso.u.nominate.handle = handle;
     }
     break;
@@ -1854,7 +1865,7 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         if ( rc < 0 )
             goto out;
 
-        rc = nominate_page(d, gfn, 3, &handle);
+        rc = nominate_page(d, gfn, 3, false, &handle);
         mso.u.nominate.handle = handle;
     }
     break;
-- 
2.20.1



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v17 2/2] xen/tools: VM forking toolstack side
  2020-04-23 15:30 [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset Tamas K Lengyel
@ 2020-04-23 15:30 ` Tamas K Lengyel
  2020-04-24 13:11   ` Jan Beulich
  2020-05-01 13:59   ` Tamas K Lengyel
  2020-04-24  9:43 ` [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset Roger Pau Monné
  1 sibling, 2 replies; 12+ messages in thread
From: Tamas K Lengyel @ 2020-04-23 15:30 UTC (permalink / raw)
  To: xen-devel; +Cc: Anthony PERARD, Ian Jackson, Tamas K Lengyel, Wei Liu

Add necessary bits to implement "xl fork-vm" commands. The command allows the
user to specify how to launch the device model allowing for a late-launch model
in which the user can execute the fork without the device model and decide to
only later launch it.

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
 docs/man/xl.1.pod.in          |  49 +++++
 tools/libxc/include/xenctrl.h |  14 ++
 tools/libxc/xc_memshr.c       |  26 +++
 tools/libxl/libxl.h           |  12 ++
 tools/libxl/libxl_create.c    | 361 +++++++++++++++++++---------------
 tools/libxl/libxl_dm.c        |   2 +-
 tools/libxl/libxl_dom.c       |  43 +++-
 tools/libxl/libxl_internal.h  |   7 +
 tools/libxl/libxl_types.idl   |   1 +
 tools/libxl/libxl_x86.c       |  42 ++++
 tools/xl/Makefile             |   2 +-
 tools/xl/xl.h                 |   5 +
 tools/xl/xl_cmdtable.c        |  15 ++
 tools/xl/xl_forkvm.c          | 149 ++++++++++++++
 tools/xl/xl_vmcontrol.c       |  14 ++
 15 files changed, 576 insertions(+), 166 deletions(-)
 create mode 100644 tools/xl/xl_forkvm.c

diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
index 09339282e6..67b4e8588a 100644
--- a/docs/man/xl.1.pod.in
+++ b/docs/man/xl.1.pod.in
@@ -708,6 +708,55 @@ above).
 
 =back
 
+=item B<fork-vm> [I<OPTIONS>] I<domain-id>
+
+Create a fork of a running VM.  The domain will be paused after the operation
+and remains paused while forks of it exist.  Experimental and x86 only.
+Forks can only be made of domains with HAP enabled and on Intel hardware.  The
+parent domain must be created with the xl toolstack and its configuration must
+not manually define max_grant_frames, max_maptrack_frames or max_event_channels.
+
+B<OPTIONS>
+
+=over 4
+
+=item B<-p>
+
+Leave the fork paused after creating it.
+
+=item B<--launch-dm>
+
+Specify whether the device model (QEMU) should be launched for the fork. Late
+launch allows to start the device model for an already running fork.
+
+=item B<-C>
+
+The config file to use when launching the device model.  Currently required when
+launching the device model.  Most config settings MUST match the parent domain
+exactly, only change VM name, disk path and network configurations.
+
+=item B<-Q>
+
+The path to the qemu save file to use when launching the device model.  Currently
+required when launching the device model.
+
+=item B<--fork-reset>
+
+Perform a reset operation of an already running fork.  Note that resetting may
+be less performant then creating a new fork depending on how much memory the
+fork has deduplicated during its runtime.
+
+=item B<--max-vcpus>
+
+Specify the max-vcpus matching the parent domain when not launching the dm.
+
+=item B<--allow-iommu>
+
+Specify to allow forking a domain that has IOMMU enabled. Only compatible with
+forks using --launch-dm no.
+
+=back
+
 =item B<sharing> [I<domain-id>]
 
 Display the number of shared pages for a specified domain. If no domain is
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 5f25c5a6d4..0a6ff93229 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2232,6 +2232,20 @@ int xc_memshr_range_share(xc_interface *xch,
                           uint64_t first_gfn,
                           uint64_t last_gfn);
 
+int xc_memshr_fork(xc_interface *xch,
+                   uint32_t source_domain,
+                   uint32_t client_domain,
+                   bool allow_with_iommu);
+
+/*
+ * Note: this function is only intended to be used on short-lived forks that
+ * haven't yet aquired a lot of memory. In case the fork has a lot of memory
+ * it is likely more performant to create a new fork with xc_memshr_fork.
+ *
+ * With VMs that have a lot of memory this call may block for a long time.
+ */
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t forked_domain);
+
 /* Debug calls: return the number of pages referencing the shared frame backing
  * the input argument. Should be one or greater.
  *
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index 97e2e6a8d9..2300cc7075 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -239,6 +239,32 @@ int xc_memshr_debug_gref(xc_interface *xch,
     return xc_memshr_memop(xch, domid, &mso);
 }
 
+int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
+                   bool allow_with_iommu)
+{
+    xen_mem_sharing_op_t mso;
+
+    memset(&mso, 0, sizeof(mso));
+
+    mso.op = XENMEM_sharing_op_fork;
+    mso.u.fork.parent_domain = pdomid;
+
+    if ( allow_with_iommu )
+        mso.u.fork.flags |= XENMEM_FORK_WITH_IOMMU_ALLOWED;
+
+    return xc_memshr_memop(xch, domid, &mso);
+}
+
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t domid)
+{
+    xen_mem_sharing_op_t mso;
+
+    memset(&mso, 0, sizeof(mso));
+    mso.op = XENMEM_sharing_op_fork_reset;
+
+    return xc_memshr_memop(xch, domid, &mso);
+}
+
 int xc_memshr_audit(xc_interface *xch)
 {
     xen_mem_sharing_op_t mso;
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 71709dc585..4bbb0a773d 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -2666,6 +2666,18 @@ int libxl_psr_get_hw_info(libxl_ctx *ctx, libxl_psr_feat_type type,
                           unsigned int lvl, unsigned int *nr,
                           libxl_psr_hw_info **info);
 void libxl_psr_hw_info_list_free(libxl_psr_hw_info *list, unsigned int nr);
+
+int libxl_domain_fork_vm(libxl_ctx *ctx, uint32_t pdomid, uint32_t max_vcpus,
+                         bool allow_with_iommu, uint32_t *domid)
+                         LIBXL_EXTERNAL_CALLERS_ONLY;
+
+int libxl_domain_fork_launch_dm(libxl_ctx *ctx, libxl_domain_config *d_config,
+                                uint32_t domid,
+                                const libxl_asyncprogress_how *aop_console_how)
+                                LIBXL_EXTERNAL_CALLERS_ONLY;
+
+int libxl_domain_fork_reset(libxl_ctx *ctx, uint32_t domid)
+                            LIBXL_EXTERNAL_CALLERS_ONLY;
 #endif
 
 /* misc */
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index e7cb2dbc2b..11001d19dc 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -538,12 +538,12 @@ out:
     return ret;
 }
 
-int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
-                       libxl__domain_build_state *state,
-                       uint32_t *domid, bool soft_reset)
+static int libxl__domain_make_xs_entries(libxl__gc *gc, libxl_domain_config *d_config,
+                                         libxl__domain_build_state *state,
+                                         uint32_t domid)
 {
     libxl_ctx *ctx = libxl__gc_owner(gc);
-    int ret, rc, nb_vm;
+    int rc, nb_vm;
     const char *dom_type;
     char *uuid_string;
     char *dom_path, *vm_path, *libxl_path;
@@ -555,9 +555,6 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
 
     /* convenience aliases */
     libxl_domain_create_info *info = &d_config->c_info;
-    libxl_domain_build_info *b_info = &d_config->b_info;
-
-    assert(soft_reset || *domid == INVALID_DOMID);
 
     uuid_string = libxl__uuid2string(gc, info->uuid);
     if (!uuid_string) {
@@ -565,137 +562,7 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
         goto out;
     }
 
-    if (!soft_reset) {
-        struct xen_domctl_createdomain create = {
-            .ssidref = info->ssidref,
-            .max_vcpus = b_info->max_vcpus,
-            .max_evtchn_port = b_info->event_channels,
-            .max_grant_frames = b_info->max_grant_frames,
-            .max_maptrack_frames = b_info->max_maptrack_frames,
-        };
-
-        if (info->type != LIBXL_DOMAIN_TYPE_PV) {
-            create.flags |= XEN_DOMCTL_CDF_hvm;
-            create.flags |=
-                libxl_defbool_val(info->hap) ? XEN_DOMCTL_CDF_hap : 0;
-            create.flags |=
-                libxl_defbool_val(info->oos) ? 0 : XEN_DOMCTL_CDF_oos_off;
-        }
-
-        assert(info->passthrough != LIBXL_PASSTHROUGH_DEFAULT);
-        LOG(DETAIL, "passthrough: %s",
-            libxl_passthrough_to_string(info->passthrough));
-
-        if (info->passthrough != LIBXL_PASSTHROUGH_DISABLED)
-            create.flags |= XEN_DOMCTL_CDF_iommu;
-
-        if (info->passthrough == LIBXL_PASSTHROUGH_SYNC_PT)
-            create.iommu_opts |= XEN_DOMCTL_IOMMU_no_sharept;
-
-        /* Ultimately, handle is an array of 16 uint8_t, same as uuid */
-        libxl_uuid_copy(ctx, (libxl_uuid *)&create.handle, &info->uuid);
-
-        ret = libxl__arch_domain_prepare_config(gc, d_config, &create);
-        if (ret < 0) {
-            LOGED(ERROR, *domid, "fail to get domain config");
-            rc = ERROR_FAIL;
-            goto out;
-        }
-
-        for (;;) {
-            uint32_t local_domid;
-            bool recent;
-
-            if (info->domid == RANDOM_DOMID) {
-                uint16_t v;
-
-                ret = libxl__random_bytes(gc, (void *)&v, sizeof(v));
-                if (ret < 0)
-                    break;
-
-                v &= DOMID_MASK;
-                if (!libxl_domid_valid_guest(v))
-                    continue;
-
-                local_domid = v;
-            } else {
-                local_domid = info->domid; /* May not be valid */
-            }
-
-            ret = xc_domain_create(ctx->xch, &local_domid, &create);
-            if (ret < 0) {
-                /*
-                 * If we generated a random domid and creation failed
-                 * because that domid already exists then simply try
-                 * again.
-                 */
-                if (errno == EEXIST && info->domid == RANDOM_DOMID)
-                    continue;
-
-                LOGED(ERROR, local_domid, "domain creation fail");
-                rc = ERROR_FAIL;
-                goto out;
-            }
-
-            /* A new domain now exists */
-            *domid = local_domid;
-
-            rc = libxl__is_domid_recent(gc, local_domid, &recent);
-            if (rc)
-                goto out;
-
-            /* The domid is not recent, so we're done */
-            if (!recent)
-                break;
-
-            /*
-             * If the domid was specified then there's no point in
-             * trying again.
-             */
-            if (libxl_domid_valid_guest(info->domid)) {
-                LOGED(ERROR, local_domid, "domain id recently used");
-                rc = ERROR_FAIL;
-                goto out;
-            }
-
-            /*
-             * The domain is recent and so cannot be used. Clear domid
-             * here since, if xc_domain_destroy() fails below there is
-             * little point calling it again in the error path.
-             */
-            *domid = INVALID_DOMID;
-
-            ret = xc_domain_destroy(ctx->xch, local_domid);
-            if (ret < 0) {
-                LOGED(ERROR, local_domid, "domain destroy fail");
-                rc = ERROR_FAIL;
-                goto out;
-            }
-
-            /* The domain was successfully destroyed, so we can try again */
-        }
-
-        rc = libxl__arch_domain_save_config(gc, d_config, state, &create);
-        if (rc < 0)
-            goto out;
-    }
-
-    /*
-     * If soft_reset is set the the domid will have been valid on entry.
-     * If it was not set then xc_domain_create() should have assigned a
-     * valid value. Either way, if we reach this point, domid should be
-     * valid.
-     */
-    assert(libxl_domid_valid_guest(*domid));
-
-    ret = xc_cpupool_movedomain(ctx->xch, info->poolid, *domid);
-    if (ret < 0) {
-        LOGED(ERROR, *domid, "domain move fail");
-        rc = ERROR_FAIL;
-        goto out;
-    }
-
-    dom_path = libxl__xs_get_dompath(gc, *domid);
+    dom_path = libxl__xs_get_dompath(gc, domid);
     if (!dom_path) {
         rc = ERROR_FAIL;
         goto out;
@@ -703,12 +570,12 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
 
     vm_path = GCSPRINTF("/vm/%s", uuid_string);
     if (!vm_path) {
-        LOGD(ERROR, *domid, "cannot allocate create paths");
+        LOGD(ERROR, domid, "cannot allocate create paths");
         rc = ERROR_FAIL;
         goto out;
     }
 
-    libxl_path = libxl__xs_libxl_path(gc, *domid);
+    libxl_path = libxl__xs_libxl_path(gc, domid);
     if (!libxl_path) {
         rc = ERROR_FAIL;
         goto out;
@@ -719,10 +586,10 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
 
     roperm[0].id = 0;
     roperm[0].perms = XS_PERM_NONE;
-    roperm[1].id = *domid;
+    roperm[1].id = domid;
     roperm[1].perms = XS_PERM_READ;
 
-    rwperm[0].id = *domid;
+    rwperm[0].id = domid;
     rwperm[0].perms = XS_PERM_NONE;
 
 retry_transaction:
@@ -740,7 +607,7 @@ retry_transaction:
                     noperm, ARRAY_SIZE(noperm));
 
     xs_write(ctx->xsh, t, GCSPRINTF("%s/vm", dom_path), vm_path, strlen(vm_path));
-    rc = libxl__domain_rename(gc, *domid, 0, info->name, t);
+    rc = libxl__domain_rename(gc, domid, 0, info->name, t);
     if (rc)
         goto out;
 
@@ -830,7 +697,7 @@ retry_transaction:
 
     vm_list = libxl_list_vm(ctx, &nb_vm);
     if (!vm_list) {
-        LOGD(ERROR, *domid, "cannot get number of running guests");
+        LOGD(ERROR, domid, "cannot get number of running guests");
         rc = ERROR_FAIL;
         goto out;
     }
@@ -854,7 +721,7 @@ retry_transaction:
             t = 0;
             goto retry_transaction;
         }
-        LOGED(ERROR, *domid, "domain creation ""xenstore transaction commit failed");
+        LOGED(ERROR, domid, "domain creation ""xenstore transaction commit failed");
         rc = ERROR_FAIL;
         goto out;
     }
@@ -866,6 +733,155 @@ retry_transaction:
     return rc;
 }
 
+int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
+                       libxl__domain_build_state *state,
+                       uint32_t *domid, bool soft_reset)
+{
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+    int ret, rc;
+
+    /* convenience aliases */
+    libxl_domain_create_info *info = &d_config->c_info;
+    libxl_domain_build_info *b_info = &d_config->b_info;
+
+    assert(soft_reset || *domid == INVALID_DOMID);
+
+    if (!soft_reset) {
+        struct xen_domctl_createdomain create = {
+            .ssidref = info->ssidref,
+            .max_vcpus = b_info->max_vcpus,
+            .max_evtchn_port = b_info->event_channels,
+            .max_grant_frames = b_info->max_grant_frames,
+            .max_maptrack_frames = b_info->max_maptrack_frames,
+        };
+
+        if (info->type != LIBXL_DOMAIN_TYPE_PV) {
+            create.flags |= XEN_DOMCTL_CDF_hvm;
+            create.flags |=
+                libxl_defbool_val(info->hap) ? XEN_DOMCTL_CDF_hap : 0;
+            create.flags |=
+                libxl_defbool_val(info->oos) ? 0 : XEN_DOMCTL_CDF_oos_off;
+        }
+
+        assert(info->passthrough != LIBXL_PASSTHROUGH_DEFAULT);
+        LOG(DETAIL, "passthrough: %s",
+            libxl_passthrough_to_string(info->passthrough));
+
+        if (info->passthrough != LIBXL_PASSTHROUGH_DISABLED)
+            create.flags |= XEN_DOMCTL_CDF_iommu;
+
+        if (info->passthrough == LIBXL_PASSTHROUGH_SYNC_PT)
+            create.iommu_opts |= XEN_DOMCTL_IOMMU_no_sharept;
+
+        /* Ultimately, handle is an array of 16 uint8_t, same as uuid */
+        libxl_uuid_copy(ctx, (libxl_uuid *)&create.handle, &info->uuid);
+
+        ret = libxl__arch_domain_prepare_config(gc, d_config, &create);
+        if (ret < 0) {
+            LOGED(ERROR, *domid, "fail to get domain config");
+            rc = ERROR_FAIL;
+            goto out;
+        }
+
+        for (;;) {
+            uint32_t local_domid;
+            bool recent;
+
+            if (info->domid == RANDOM_DOMID) {
+                uint16_t v;
+
+                ret = libxl__random_bytes(gc, (void *)&v, sizeof(v));
+                if (ret < 0)
+                    break;
+
+                v &= DOMID_MASK;
+                if (!libxl_domid_valid_guest(v))
+                    continue;
+
+                local_domid = v;
+            } else {
+                local_domid = info->domid; /* May not be valid */
+            }
+
+            ret = xc_domain_create(ctx->xch, &local_domid, &create);
+            if (ret < 0) {
+                /*
+                 * If we generated a random domid and creation failed
+                 * because that domid already exists then simply try
+                 * again.
+                 */
+                if (errno == EEXIST && info->domid == RANDOM_DOMID)
+                    continue;
+
+                LOGED(ERROR, local_domid, "domain creation fail");
+                rc = ERROR_FAIL;
+                goto out;
+            }
+
+            /* A new domain now exists */
+            *domid = local_domid;
+
+            rc = libxl__is_domid_recent(gc, local_domid, &recent);
+            if (rc)
+                goto out;
+
+            /* The domid is not recent, so we're done */
+            if (!recent)
+                break;
+
+            /*
+             * If the domid was specified then there's no point in
+             * trying again.
+             */
+            if (libxl_domid_valid_guest(info->domid)) {
+                LOGED(ERROR, local_domid, "domain id recently used");
+                rc = ERROR_FAIL;
+                goto out;
+            }
+
+            /*
+             * The domain is recent and so cannot be used. Clear domid
+             * here since, if xc_domain_destroy() fails below there is
+             * little point calling it again in the error path.
+             */
+            *domid = INVALID_DOMID;
+
+            ret = xc_domain_destroy(ctx->xch, local_domid);
+            if (ret < 0) {
+                LOGED(ERROR, local_domid, "domain destroy fail");
+                rc = ERROR_FAIL;
+                goto out;
+            }
+
+            /* The domain was successfully destroyed, so we can try again */
+        }
+
+        rc = libxl__arch_domain_save_config(gc, d_config, state, &create);
+        if (rc < 0)
+            goto out;
+    }
+
+    /*
+     * If soft_reset is set the the domid will have been valid on entry.
+     * If it was not set then xc_domain_create() should have assigned a
+     * valid value. Either way, if we reach this point, domid should be
+     * valid.
+     */
+    assert(libxl_domid_valid_guest(*domid));
+
+    ret = xc_cpupool_movedomain(ctx->xch, info->poolid, *domid);
+    if (ret < 0) {
+        LOGED(ERROR, *domid, "domain move fail");
+        rc = ERROR_FAIL;
+        goto out;
+    }
+
+    rc = libxl__domain_make_xs_entries(gc, d_config, state, *domid);
+
+out:
+    return rc;
+}
+
 static int store_libxl_entry(libxl__gc *gc, uint32_t domid,
                              libxl_domain_build_info *b_info)
 {
@@ -1191,16 +1207,32 @@ static void initiate_domain_create(libxl__egc *egc,
     ret = libxl__domain_config_setdefault(gc,d_config,domid);
     if (ret) goto error_out;
 
-    ret = libxl__domain_make(gc, d_config, &dcs->build_state, &domid,
-                             dcs->soft_reset);
-    if (ret) {
-        LOGD(ERROR, domid, "cannot make domain: %d", ret);
+    if (!d_config->dm_restore_file)
+    {
+        ret = libxl__domain_make(gc, d_config, &dcs->build_state, &domid,
+                                 dcs->soft_reset);
         dcs->guest_domid = domid;
+
+        if (ret) {
+            LOGD(ERROR, domid, "cannot make domain: %d", ret);
+            ret = ERROR_FAIL;
+            goto error_out;
+        }
+    } else if (dcs->guest_domid != INVALID_DOMID) {
+        domid = dcs->guest_domid;
+
+        ret = libxl__domain_make_xs_entries(gc, d_config, &dcs->build_state, domid);
+        if (ret) {
+            LOGD(ERROR, domid, "cannot make domain: %d", ret);
+            ret = ERROR_FAIL;
+            goto error_out;
+        }
+    } else {
+        LOGD(ERROR, domid, "cannot make domain");
         ret = ERROR_FAIL;
         goto error_out;
     }
 
-    dcs->guest_domid = domid;
     dcs->sdss.dm.guest_domid = 0; /* means we haven't spawned */
 
     /* post-4.13 todo: move these next bits of defaulting to
@@ -1236,7 +1268,7 @@ static void initiate_domain_create(libxl__egc *egc,
     if (ret)
         goto error_out;
 
-    if (restore_fd >= 0 || dcs->soft_reset) {
+    if (restore_fd >= 0 || dcs->soft_reset || d_config->dm_restore_file) {
         LOGD(DEBUG, domid, "restoring, not running bootloader");
         domcreate_bootloader_done(egc, &dcs->bl, 0);
     } else  {
@@ -1312,7 +1344,16 @@ static void domcreate_bootloader_done(libxl__egc *egc,
     dcs->sdss.dm.callback = domcreate_devmodel_started;
     dcs->sdss.callback = domcreate_devmodel_started;
 
-    if (restore_fd < 0 && !dcs->soft_reset) {
+    if (restore_fd < 0 && !dcs->soft_reset && !d_config->dm_restore_file) {
+        rc = libxl__domain_build(gc, d_config, domid, state);
+        domcreate_rebuild_done(egc, dcs, rc);
+        return;
+    }
+
+    if (d_config->dm_restore_file) {
+        dcs->srs.dcs = dcs;
+        dcs->srs.ao = ao;
+        state->forked_vm = true;
         rc = libxl__domain_build(gc, d_config, domid, state);
         domcreate_rebuild_done(egc, dcs, rc);
         return;
@@ -1510,6 +1551,7 @@ static void domcreate_rebuild_done(libxl__egc *egc,
     /* convenience aliases */
     const uint32_t domid = dcs->guest_domid;
     libxl_domain_config *const d_config = dcs->guest_config;
+    libxl__domain_build_state *const state = &dcs->build_state;
 
     if (ret) {
         LOGD(ERROR, domid, "cannot (re-)build domain: %d", ret);
@@ -1517,6 +1559,9 @@ static void domcreate_rebuild_done(libxl__egc *egc,
         goto error_out;
     }
 
+    if (d_config->dm_restore_file)
+        state->saved_state = GCSPRINTF("%s", d_config->dm_restore_file);
+
     store_libxl_entry(gc, domid, &d_config->b_info);
 
     libxl__multidev_begin(ao, &dcs->multidev);
@@ -1947,7 +1992,7 @@ static void domain_create_cb(libxl__egc *egc,
                              libxl__domain_create_state *dcs,
                              int rc, uint32_t domid);
 
-static int do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
+int libxl__do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
                             uint32_t *domid, int restore_fd, int send_back_fd,
                             const libxl_domain_restore_params *params,
                             const libxl_asyncop_how *ao_how,
@@ -1960,6 +2005,8 @@ static int do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
     GCNEW(cdcs);
     cdcs->dcs.ao = ao;
     cdcs->dcs.guest_config = d_config;
+    cdcs->dcs.guest_domid = *domid;
+
     libxl_domain_config_init(&cdcs->dcs.guest_config_saved);
     libxl_domain_config_copy(ctx, &cdcs->dcs.guest_config_saved, d_config);
     cdcs->dcs.restore_fd = cdcs->dcs.libxc_fd = restore_fd;
@@ -2204,8 +2251,8 @@ int libxl_domain_create_new(libxl_ctx *ctx, libxl_domain_config *d_config,
                             const libxl_asyncprogress_how *aop_console_how)
 {
     unset_disk_colo_restore(d_config);
-    return do_domain_create(ctx, d_config, domid, -1, -1, NULL,
-                            ao_how, aop_console_how);
+    return libxl__do_domain_create(ctx, d_config, domid, -1, -1, NULL,
+                                  ao_how, aop_console_how);
 }
 
 int libxl_domain_create_restore(libxl_ctx *ctx, libxl_domain_config *d_config,
@@ -2221,8 +2268,8 @@ int libxl_domain_create_restore(libxl_ctx *ctx, libxl_domain_config *d_config,
         unset_disk_colo_restore(d_config);
     }
 
-    return do_domain_create(ctx, d_config, domid, restore_fd, send_back_fd,
-                            params, ao_how, aop_console_how);
+    return libxl__do_domain_create(ctx, d_config, domid, restore_fd, send_back_fd,
+                                   params, ao_how, aop_console_how);
 }
 
 int libxl_domain_soft_reset(libxl_ctx *ctx,
diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index f4007bbe50..b615f1fc88 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -2803,7 +2803,7 @@ static void device_model_spawn_outcome(libxl__egc *egc,
 
     libxl__domain_build_state *state = dmss->build_state;
 
-    if (state->saved_state) {
+    if (state->saved_state && !state->forked_vm) {
         ret2 = unlink(state->saved_state);
         if (ret2) {
             LOGED(ERROR, dmss->guest_domid, "%s: failed to remove device-model state %s",
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 71cb578923..9e47162f67 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -249,9 +249,12 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
     libxl_domain_build_info *const info = &d_config->b_info;
     libxl_ctx *ctx = libxl__gc_owner(gc);
     char *xs_domid, *con_domid;
-    int rc;
+    int rc = 0;
     uint64_t size;
 
+    if (state->forked_vm)
+        goto skip_fork;
+
     if (xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus) != 0) {
         LOG(ERROR, "Couldn't set max vcpu count");
         return ERROR_FAIL;
@@ -362,7 +365,6 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
         }
     }
 
-
     rc = libxl__arch_extra_memory(gc, info, &size);
     if (rc < 0) {
         LOGE(ERROR, "Couldn't get arch extra constant memory size");
@@ -374,6 +376,11 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
         return ERROR_FAIL;
     }
 
+    rc = libxl__arch_domain_create(gc, d_config, domid);
+    if (rc)
+        goto out;
+
+skip_fork:
     xs_domid = xs_read(ctx->xsh, XBT_NULL, "/tool/xenstored/domid", NULL);
     state->store_domid = xs_domid ? atoi(xs_domid) : 0;
     free(xs_domid);
@@ -385,8 +392,7 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
     state->store_port = xc_evtchn_alloc_unbound(ctx->xch, domid, state->store_domid);
     state->console_port = xc_evtchn_alloc_unbound(ctx->xch, domid, state->console_domid);
 
-    rc = libxl__arch_domain_create(gc, d_config, domid);
-
+out:
     return rc;
 }
 
@@ -444,6 +450,9 @@ int libxl__build_post(libxl__gc *gc, uint32_t domid,
     char **ents;
     int i, rc;
 
+    if (state->forked_vm)
+        goto skip_fork;
+
     if (info->num_vnuma_nodes && !info->num_vcpu_soft_affinity) {
         rc = set_vnuma_affinity(gc, domid, info);
         if (rc)
@@ -466,6 +475,7 @@ int libxl__build_post(libxl__gc *gc, uint32_t domid,
         }
     }
 
+skip_fork:
     ents = libxl__calloc(gc, 12 + (info->max_vcpus * 2) + 2, sizeof(char *));
     ents[0] = "memory/static-max";
     ents[1] = GCSPRINTF("%"PRId64, info->max_memkb);
@@ -728,14 +738,16 @@ static int hvm_build_set_params(xc_interface *handle, uint32_t domid,
                                 libxl_domain_build_info *info,
                                 int store_evtchn, unsigned long *store_mfn,
                                 int console_evtchn, unsigned long *console_mfn,
-                                domid_t store_domid, domid_t console_domid)
+                                domid_t store_domid, domid_t console_domid,
+                                bool forked_vm)
 {
     struct hvm_info_table *va_hvm;
     uint8_t *va_map, sum;
     uint64_t str_mfn, cons_mfn;
     int i;
 
-    if (info->type == LIBXL_DOMAIN_TYPE_HVM) {
+    if (info->type == LIBXL_DOMAIN_TYPE_HVM && !forked_vm)
+    {
         va_map = xc_map_foreign_range(handle, domid,
                                       XC_PAGE_SIZE, PROT_READ | PROT_WRITE,
                                       HVM_INFO_PFN);
@@ -1051,6 +1063,23 @@ int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
     struct xc_dom_image *dom = NULL;
     bool device_model = info->type == LIBXL_DOMAIN_TYPE_HVM ? true : false;
 
+    if (state->forked_vm)
+    {
+        rc = hvm_build_set_params(ctx->xch, domid, info, state->store_port,
+                                  &state->store_mfn, state->console_port,
+                                  &state->console_mfn, state->store_domid,
+                                  state->console_domid, state->forked_vm);
+
+        if ( rc )
+            return rc;
+
+        return xc_dom_gnttab_seed(ctx->xch, domid, true,
+                                  state->console_mfn,
+                                  state->store_mfn,
+                                  state->console_domid,
+                                  state->store_domid);
+    }
+
     xc_dom_loginit(ctx->xch);
 
     /*
@@ -1175,7 +1204,7 @@ int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
     rc = hvm_build_set_params(ctx->xch, domid, info, state->store_port,
                                &state->store_mfn, state->console_port,
                                &state->console_mfn, state->store_domid,
-                               state->console_domid);
+                               state->console_domid, false);
     if (rc != 0) {
         LOG(ERROR, "hvm build set params failed");
         goto out;
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 5f39e44cb9..d05ff31e83 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -1374,6 +1374,7 @@ typedef struct {
 
     char *saved_state;
     int dm_monitor_fd;
+    bool forked_vm;
 
     libxl__file_reference pv_kernel;
     libxl__file_reference pv_ramdisk;
@@ -4818,6 +4819,12 @@ _hidden int libxl__domain_pvcontrol(libxl__egc *egc,
 /* Check whether a domid is recent */
 int libxl__is_domid_recent(libxl__gc *gc, uint32_t domid, bool *recent);
 
+_hidden int libxl__do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
+                                    uint32_t *domid, int restore_fd, int send_back_fd,
+                                    const libxl_domain_restore_params *params,
+                                    const libxl_asyncop_how *ao_how,
+                                    const libxl_asyncprogress_how *aop_console_how);
+
 #endif
 
 /*
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index f7c473be74..2bb5e6319e 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -958,6 +958,7 @@ libxl_domain_config = Struct("domain_config", [
     ("on_watchdog", libxl_action_on_shutdown),
     ("on_crash", libxl_action_on_shutdown),
     ("on_soft_reset", libxl_action_on_shutdown),
+    ("dm_restore_file", string, {'const': True}),
     ], dir=DIR_IN)
 
 libxl_diskinfo = Struct("diskinfo", [
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index f8bc828e62..f6d7daa8fe 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -2,6 +2,7 @@
 #include "libxl_arch.h"
 
 #include <xc_dom.h>
+#include <xen-xsm/flask/flask.h>
 
 int libxl__arch_domain_prepare_config(libxl__gc *gc,
                                       libxl_domain_config *d_config,
@@ -842,6 +843,47 @@ int libxl__arch_passthrough_mode_setdefault(libxl__gc *gc,
     return rc;
 }
 
+/*
+ * The parent domain is expected to be created with default settings for
+ * - max_evtch_port
+ * - max_grant_frames
+ * - max_maptrack_frames
+ */
+int libxl_domain_fork_vm(libxl_ctx *ctx, uint32_t pdomid, uint32_t max_vcpus,
+                         bool allow_with_iommu, uint32_t *domid)
+{
+    int rc;
+    struct xen_domctl_createdomain create = {0};
+    create.flags |= XEN_DOMCTL_CDF_hvm;
+    create.flags |= XEN_DOMCTL_CDF_hap;
+    create.flags |= XEN_DOMCTL_CDF_oos_off;
+    create.arch.emulation_flags = (XEN_X86_EMU_ALL & ~XEN_X86_EMU_VPCI);
+    create.ssidref = SECINITSID_DOMU;
+    create.max_vcpus = max_vcpus;
+    create.max_evtchn_port = 1023;
+    create.max_grant_frames = LIBXL_MAX_GRANT_FRAMES_DEFAULT;
+    create.max_maptrack_frames = LIBXL_MAX_MAPTRACK_FRAMES_DEFAULT;
+
+    if ( (rc = xc_domain_create(ctx->xch, domid, &create)) )
+        return rc;
+
+    if ( (rc = xc_memshr_fork(ctx->xch, pdomid, *domid, allow_with_iommu)) )
+        xc_domain_destroy(ctx->xch, *domid);
+
+    return rc;
+}
+
+int libxl_domain_fork_launch_dm(libxl_ctx *ctx, libxl_domain_config *d_config,
+                                uint32_t domid,
+                                const libxl_asyncprogress_how *aop_console_how)
+{
+    return libxl__do_domain_create(ctx, d_config, &domid, -1, -1, 0, 0, aop_console_how);
+}
+
+int libxl_domain_fork_reset(libxl_ctx *ctx, uint32_t domid)
+{
+    return xc_memshr_fork_reset(ctx->xch, domid);
+}
 
 /*
  * Local variables:
diff --git a/tools/xl/Makefile b/tools/xl/Makefile
index af4912e67a..073222233b 100644
--- a/tools/xl/Makefile
+++ b/tools/xl/Makefile
@@ -15,7 +15,7 @@ LDFLAGS += $(PTHREAD_LDFLAGS)
 CFLAGS_XL += $(CFLAGS_libxenlight)
 CFLAGS_XL += -Wshadow
 
-XL_OBJS-$(CONFIG_X86) = xl_psr.o
+XL_OBJS-$(CONFIG_X86) = xl_psr.o xl_forkvm.o
 XL_OBJS = xl.o xl_cmdtable.o xl_sxp.o xl_utils.o $(XL_OBJS-y)
 XL_OBJS += xl_parse.o xl_cpupool.o xl_flask.o
 XL_OBJS += xl_vtpm.o xl_block.o xl_nic.o xl_usb.o
diff --git a/tools/xl/xl.h b/tools/xl/xl.h
index 06569c6c4a..1105c34b15 100644
--- a/tools/xl/xl.h
+++ b/tools/xl/xl.h
@@ -31,6 +31,7 @@ struct cmd_spec {
 };
 
 struct domain_create {
+    uint32_t ddomid; /* fork launch dm for this domid */
     int debug;
     int daemonize;
     int monitor; /* handle guest reboots etc */
@@ -45,6 +46,7 @@ struct domain_create {
     const char *config_file;
     char *extra_config; /* extra config string */
     const char *restore_file;
+    const char *dm_restore_file;
     char *colo_proxy_script;
     bool userspace_colo_proxy;
     int migrate_fd; /* -1 means none */
@@ -128,6 +130,8 @@ int main_pciassignable_remove(int argc, char **argv);
 int main_pciassignable_list(int argc, char **argv);
 #ifndef LIBXL_HAVE_NO_SUSPEND_RESUME
 int main_restore(int argc, char **argv);
+int main_fork_launch_dm(int argc, char **argv);
+int main_fork_reset(int argc, char **argv);
 int main_migrate_receive(int argc, char **argv);
 int main_save(int argc, char **argv);
 int main_migrate(int argc, char **argv);
@@ -212,6 +216,7 @@ int main_psr_cat_cbm_set(int argc, char **argv);
 int main_psr_cat_show(int argc, char **argv);
 int main_psr_mba_set(int argc, char **argv);
 int main_psr_mba_show(int argc, char **argv);
+int main_fork_vm(int argc, char **argv);
 #endif
 int main_qemu_monitor_command(int argc, char **argv);
 
diff --git a/tools/xl/xl_cmdtable.c b/tools/xl/xl_cmdtable.c
index 08335394e5..ef634abf32 100644
--- a/tools/xl/xl_cmdtable.c
+++ b/tools/xl/xl_cmdtable.c
@@ -187,6 +187,21 @@ struct cmd_spec cmd_table[] = {
       "Restore a domain from a saved state",
       "- for internal use only",
     },
+#if defined(__i386__) || defined(__x86_64__)
+    { "fork-vm",
+      &main_fork_vm, 0, 1,
+      "Fork a domain from the running parent domid. Experimental. Most config settings must match parent.",
+      "[options] <Domid>",
+      "-h                           Print this help.\n"
+      "-C <config>                  Use config file for VM fork.\n"
+      "-Q <qemu-save-file>          Use qemu save file for VM fork.\n"
+      "--launch-dm <yes|no|late>    Launch device model (QEMU) for VM fork.\n"
+      "--fork-reset                 Reset VM fork.\n"
+      "--max-vcpus                  Specify max-vcpus matching the parent domain when not launching dm\n"
+      "-p                           Do not unpause fork VM after operation.\n"
+      "-d                           Enable debug messages.\n"
+    },
+#endif
 #endif
     { "dump-core",
       &main_dump_core, 0, 1,
diff --git a/tools/xl/xl_forkvm.c b/tools/xl/xl_forkvm.c
new file mode 100644
index 0000000000..46805b84f3
--- /dev/null
+++ b/tools/xl/xl_forkvm.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2020 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <libxl.h>
+#include <libxl_utils.h>
+#include <libxlutil.h>
+
+#include "xl.h"
+#include "xl_utils.h"
+#include "xl_parse.h"
+
+int main_fork_vm(int argc, char **argv)
+{
+    int rc, debug = 0;
+    uint32_t domid_in = INVALID_DOMID, domid_out = INVALID_DOMID;
+    int launch_dm = 1;
+    bool reset = 0;
+    bool pause = 0;
+    bool allow_iommu = 0;
+    const char *config_file = NULL;
+    const char *dm_restore_file = NULL;
+    uint32_t max_vcpus = 0;
+
+    int opt;
+    static struct option opts[] = {
+        {"launch-dm", 1, 0, 'l'},
+        {"fork-reset", 0, 0, 'r'},
+        {"max-vcpus", 1, 0, 'm'},
+        {"allow-iommu", 1, 0, 'i'},
+        COMMON_LONG_OPTS
+    };
+
+    SWITCH_FOREACH_OPT(opt, "phdC:Q:l:rm:i", opts, "fork-vm", 1) {
+    case 'd':
+        debug = 1;
+        break;
+    case 'p':
+        pause = 1;
+        break;
+    case 'm':
+        max_vcpus = atoi(optarg);
+        break;
+    case 'C':
+        config_file = optarg;
+        break;
+    case 'Q':
+        dm_restore_file = optarg;
+        break;
+    case 'l':
+        if ( !strcmp(optarg, "no") )
+            launch_dm = 0;
+        if ( !strcmp(optarg, "yes") )
+            launch_dm = 1;
+        if ( !strcmp(optarg, "late") )
+            launch_dm = 2;
+        break;
+    case 'r':
+        reset = 1;
+        break;
+    case 'i':
+        allow_iommu = 1;
+        break;
+    default:
+        fprintf(stderr, "Unimplemented option(s)\n");
+        return EXIT_FAILURE;
+    }
+
+    if (argc-optind == 1) {
+        domid_in = atoi(argv[optind]);
+    } else {
+        help("fork-vm");
+        return EXIT_FAILURE;
+    }
+
+    if (launch_dm && (!config_file || !dm_restore_file)) {
+        fprintf(stderr, "Currently you must provide both -C and -Q options\n");
+        return EXIT_FAILURE;
+    }
+
+    if (reset) {
+        domid_out = domid_in;
+        if (libxl_domain_fork_reset(ctx, domid_in) == EXIT_FAILURE)
+            return EXIT_FAILURE;
+    }
+
+    if (launch_dm == 2 || reset) {
+        domid_out = domid_in;
+        rc = EXIT_SUCCESS;
+    } else {
+        if ( !max_vcpus )
+        {
+            fprintf(stderr, "Currently you must parent's max_vcpu for this option\n");
+            return EXIT_FAILURE;
+        }
+
+        rc = libxl_domain_fork_vm(ctx, domid_in, max_vcpus, allow_iommu, &domid_out);
+    }
+
+    if (rc == EXIT_SUCCESS) {
+        if ( launch_dm ) {
+            struct domain_create dom_info;
+            memset(&dom_info, 0, sizeof(dom_info));
+            dom_info.ddomid = domid_out;
+            dom_info.dm_restore_file = dm_restore_file;
+            dom_info.debug = debug;
+            dom_info.paused = pause;
+            dom_info.config_file = config_file;
+            dom_info.migrate_fd = -1;
+            dom_info.send_back_fd = -1;
+            rc = create_domain(&dom_info) < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+        } else if ( !pause )
+            rc = libxl_domain_unpause(ctx, domid_out, NULL);
+    }
+
+    if (rc == EXIT_SUCCESS)
+        fprintf(stderr, "fork-vm command successfully returned domid: %u\n", domid_out);
+    else if ( domid_out != INVALID_DOMID )
+        libxl_domain_destroy(ctx, domid_out, 0);
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c
index 17b4514c94..c64123d0a1 100644
--- a/tools/xl/xl_vmcontrol.c
+++ b/tools/xl/xl_vmcontrol.c
@@ -676,6 +676,12 @@ int create_domain(struct domain_create *dom_info)
 
     int restoring = (restore_file || (migrate_fd >= 0));
 
+#if defined(__i386__) || defined(__x86_64__)
+    /* VM forking */
+    uint32_t ddomid = dom_info->ddomid; // launch dm for this domain iff set
+    const char *dm_restore_file = dom_info->dm_restore_file;
+#endif
+
     libxl_domain_config_init(&d_config);
 
     if (restoring) {
@@ -928,6 +934,14 @@ start:
          * restore/migrate-receive it again.
          */
         restoring = 0;
+#if defined(__i386__) || defined(__x86_64__)
+    } else if ( ddomid ) {
+        d_config.dm_restore_file = dm_restore_file;
+        ret = libxl_domain_fork_launch_dm(ctx, &d_config, ddomid,
+                                          autoconnect_console_how);
+        domid = ddomid;
+        ddomid = INVALID_DOMID;
+#endif
     } else if (domid_soft_reset != INVALID_DOMID) {
         /* Do soft reset. */
         ret = libxl_domain_soft_reset(ctx, &d_config, domid_soft_reset,
-- 
2.20.1



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset
  2020-04-23 15:30 [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset Tamas K Lengyel
  2020-04-23 15:30 ` [PATCH v17 2/2] xen/tools: VM forking toolstack side Tamas K Lengyel
@ 2020-04-24  9:43 ` Roger Pau Monné
  2020-04-24 12:18   ` Tamas K Lengyel
  1 sibling, 1 reply; 12+ messages in thread
From: Roger Pau Monné @ 2020-04-24  9:43 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Tamas K Lengyel, Wei Liu, Andrew Cooper, George Dunlap,
	Jan Beulich, xen-devel

On Thu, Apr 23, 2020 at 08:30:06AM -0700, Tamas K Lengyel wrote:
> When resetting a VM fork we ought to only remove pages that were allocated for
> the fork during it's execution and the contents copied over from the parent.
> This can be determined if the page is sharable as special pages used by the
> fork for other purposes will not pass this test. Unfortunately during the fork
> reset loop we only partially check whether that's the case. A page's type may
> indicate it is sharable (pass p2m_is_sharable) but that's not a sufficient
> check by itself. All checks that are normally performed before a page is
> converted to the sharable type need to be performed to avoid removing pages
> from the p2m that may be used for other purposes. For example, currently the
> reset loop also removes the vcpu info pages from the p2m, potentially putting
> the guest into infinite page-fault loops.
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

I've been looking however and I'm not able to spot where you copy the
shared_info data, which I think it's also quite critical for the
domain, as it contains the info about event channels (when using L2).
Also for HVM forks the shared info should be mapped at the same gfn as
the parent, or else the child trying to access it will trigger an EPT
fault that won't be able to populate such gfn, because the shared_info
on the parent is already shared between Xen and the parent.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset
  2020-04-24  9:43 ` [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset Roger Pau Monné
@ 2020-04-24 12:18   ` Tamas K Lengyel
  2020-04-24 12:37     ` Roger Pau Monné
  2020-04-25  9:01     ` Roger Pau Monné
  0 siblings, 2 replies; 12+ messages in thread
From: Tamas K Lengyel @ 2020-04-24 12:18 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, Wei Liu, Andrew Cooper, George Dunlap,
	Jan Beulich, Xen-devel

On Fri, Apr 24, 2020 at 3:44 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Thu, Apr 23, 2020 at 08:30:06AM -0700, Tamas K Lengyel wrote:
> > When resetting a VM fork we ought to only remove pages that were allocated for
> > the fork during it's execution and the contents copied over from the parent.
> > This can be determined if the page is sharable as special pages used by the
> > fork for other purposes will not pass this test. Unfortunately during the fork
> > reset loop we only partially check whether that's the case. A page's type may
> > indicate it is sharable (pass p2m_is_sharable) but that's not a sufficient
> > check by itself. All checks that are normally performed before a page is
> > converted to the sharable type need to be performed to avoid removing pages
> > from the p2m that may be used for other purposes. For example, currently the
> > reset loop also removes the vcpu info pages from the p2m, potentially putting
> > the guest into infinite page-fault loops.
> >
> > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks!

>
> I've been looking however and I'm not able to spot where you copy the
> shared_info data, which I think it's also quite critical for the
> domain, as it contains the info about event channels (when using L2).
> Also for HVM forks the shared info should be mapped at the same gfn as
> the parent, or else the child trying to access it will trigger an EPT
> fault that won't be able to populate such gfn, because the shared_info
> on the parent is already shared between Xen and the parent.

Pages that cause an EPT fault but can't be made shared get a new page
allocated for them and copied in mem_sharing_fork_page. There are many
pages like that, mostly due to QEMU mapping them and thus holding an
extra reference. That said, shared_info is manually copied during
forking in copy_special_pages, at the end of the function you will
find:

old_mfn = _mfn(virt_to_mfn(d->shared_info));
new_mfn = _mfn(virt_to_mfn(cd->shared_info));

copy_domain_page(new_mfn, old_mfn);

Cheers,
Tamas


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset
  2020-04-24 12:18   ` Tamas K Lengyel
@ 2020-04-24 12:37     ` Roger Pau Monné
  2020-04-25  9:01     ` Roger Pau Monné
  1 sibling, 0 replies; 12+ messages in thread
From: Roger Pau Monné @ 2020-04-24 12:37 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Tamas K Lengyel, Wei Liu, Andrew Cooper, George Dunlap,
	Jan Beulich, Xen-devel

On Fri, Apr 24, 2020 at 06:18:24AM -0600, Tamas K Lengyel wrote:
> On Fri, Apr 24, 2020 at 3:44 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Thu, Apr 23, 2020 at 08:30:06AM -0700, Tamas K Lengyel wrote:
> > > When resetting a VM fork we ought to only remove pages that were allocated for
> > > the fork during it's execution and the contents copied over from the parent.
> > > This can be determined if the page is sharable as special pages used by the
> > > fork for other purposes will not pass this test. Unfortunately during the fork
> > > reset loop we only partially check whether that's the case. A page's type may
> > > indicate it is sharable (pass p2m_is_sharable) but that's not a sufficient
> > > check by itself. All checks that are normally performed before a page is
> > > converted to the sharable type need to be performed to avoid removing pages
> > > from the p2m that may be used for other purposes. For example, currently the
> > > reset loop also removes the vcpu info pages from the p2m, potentially putting
> > > the guest into infinite page-fault loops.
> > >
> > > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> >
> > Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> Thanks!
> 
> >
> > I've been looking however and I'm not able to spot where you copy the
> > shared_info data, which I think it's also quite critical for the
> > domain, as it contains the info about event channels (when using L2).
> > Also for HVM forks the shared info should be mapped at the same gfn as
> > the parent, or else the child trying to access it will trigger an EPT
> > fault that won't be able to populate such gfn, because the shared_info
> > on the parent is already shared between Xen and the parent.
> 
> Pages that cause an EPT fault but can't be made shared get a new page
> allocated for them and copied in mem_sharing_fork_page. There are many
> pages like that, mostly due to QEMU mapping them and thus holding an
> extra reference. That said, shared_info is manually copied during
> forking in copy_special_pages, at the end of the function you will
> find:
> 
> old_mfn = _mfn(virt_to_mfn(d->shared_info));
> new_mfn = _mfn(virt_to_mfn(cd->shared_info));
> 
> copy_domain_page(new_mfn, old_mfn);

Oh, sorry for the noise, I somehow missed it.

Roger.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v17 2/2] xen/tools: VM forking toolstack side
  2020-04-23 15:30 ` [PATCH v17 2/2] xen/tools: VM forking toolstack side Tamas K Lengyel
@ 2020-04-24 13:11   ` Jan Beulich
  2020-05-01 13:59   ` Tamas K Lengyel
  1 sibling, 0 replies; 12+ messages in thread
From: Jan Beulich @ 2020-04-24 13:11 UTC (permalink / raw)
  To: Tamas K Lengyel, Anthony PERARD, Ian Jackson, Wei Liu; +Cc: xen-devel

On 23.04.2020 17:30, Tamas K Lengyel wrote:
> Add necessary bits to implement "xl fork-vm" commands. The command allows the
> user to specify how to launch the device model allowing for a late-launch model
> in which the user can execute the fork without the device model and decide to
> only later launch it.
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>

As before this will be left for the tool stack maintainers to
ack and commit.

Jan


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset
  2020-04-24 12:18   ` Tamas K Lengyel
  2020-04-24 12:37     ` Roger Pau Monné
@ 2020-04-25  9:01     ` Roger Pau Monné
  2020-04-25 12:31       ` Tamas K Lengyel
  1 sibling, 1 reply; 12+ messages in thread
From: Roger Pau Monné @ 2020-04-25  9:01 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Tamas K Lengyel, Wei Liu, Andrew Cooper, George Dunlap,
	Jan Beulich, Xen-devel

On Fri, Apr 24, 2020 at 06:18:24AM -0600, Tamas K Lengyel wrote:
> On Fri, Apr 24, 2020 at 3:44 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Thu, Apr 23, 2020 at 08:30:06AM -0700, Tamas K Lengyel wrote:
> > > When resetting a VM fork we ought to only remove pages that were allocated for
> > > the fork during it's execution and the contents copied over from the parent.
> > > This can be determined if the page is sharable as special pages used by the
> > > fork for other purposes will not pass this test. Unfortunately during the fork
> > > reset loop we only partially check whether that's the case. A page's type may
> > > indicate it is sharable (pass p2m_is_sharable) but that's not a sufficient
> > > check by itself. All checks that are normally performed before a page is
> > > converted to the sharable type need to be performed to avoid removing pages
> > > from the p2m that may be used for other purposes. For example, currently the
> > > reset loop also removes the vcpu info pages from the p2m, potentially putting
> > > the guest into infinite page-fault loops.
> > >
> > > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> >
> > Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> Thanks!
> 
> >
> > I've been looking however and I'm not able to spot where you copy the
> > shared_info data, which I think it's also quite critical for the
> > domain, as it contains the info about event channels (when using L2).
> > Also for HVM forks the shared info should be mapped at the same gfn as
> > the parent, or else the child trying to access it will trigger an EPT
> > fault that won't be able to populate such gfn, because the shared_info
> > on the parent is already shared between Xen and the parent.
> 
> Pages that cause an EPT fault but can't be made shared get a new page
> allocated for them and copied in mem_sharing_fork_page. There are many
> pages like that, mostly due to QEMU mapping them and thus holding an
> extra reference. That said, shared_info is manually copied during
> forking in copy_special_pages, at the end of the function you will
> find:
> 
> old_mfn = _mfn(virt_to_mfn(d->shared_info));
> new_mfn = _mfn(virt_to_mfn(cd->shared_info));
> 
> copy_domain_page(new_mfn, old_mfn);

Yes, that's indeed fine, you need to copy the contents of the shared
info page, but for HVM you also need to make sure the shared info page
is mapped at the same gfn as the parent. HVM guest issue a hypercall
to request the mapping of the shared info page to a specific gfn,
since it's not added to the guest physmap by default.
XENMAPSPACE_shared_info is used in order to request the shared info
page to be mapped at a specific gfn.

AFAICT during fork such shared info mapping is not replicated to the
child, and hence the child trying to access the gfn of the shared info
page would just result in EPT faults that won't be fixed (because the
parent shared info page cannot be shared with the child)?

You should be able to use get_gpfn_from_mfn in order to get the parent
gfn where the shared info mfn is mapped (if any), and then replicate
this in the child using it's own shared info.

On fork reset you should check if the child shared info gfn != parent
shared info gfn and reinstate the parent state if different from the
child.

Roger.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset
  2020-04-25  9:01     ` Roger Pau Monné
@ 2020-04-25 12:31       ` Tamas K Lengyel
  2020-04-27  8:27         ` Roger Pau Monné
  0 siblings, 1 reply; 12+ messages in thread
From: Tamas K Lengyel @ 2020-04-25 12:31 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, Wei Liu, Andrew Cooper, George Dunlap,
	Jan Beulich, Xen-devel

On Sat, Apr 25, 2020 at 3:01 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Fri, Apr 24, 2020 at 06:18:24AM -0600, Tamas K Lengyel wrote:
> > On Fri, Apr 24, 2020 at 3:44 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > >
> > > On Thu, Apr 23, 2020 at 08:30:06AM -0700, Tamas K Lengyel wrote:
> > > > When resetting a VM fork we ought to only remove pages that were allocated for
> > > > the fork during it's execution and the contents copied over from the parent.
> > > > This can be determined if the page is sharable as special pages used by the
> > > > fork for other purposes will not pass this test. Unfortunately during the fork
> > > > reset loop we only partially check whether that's the case. A page's type may
> > > > indicate it is sharable (pass p2m_is_sharable) but that's not a sufficient
> > > > check by itself. All checks that are normally performed before a page is
> > > > converted to the sharable type need to be performed to avoid removing pages
> > > > from the p2m that may be used for other purposes. For example, currently the
> > > > reset loop also removes the vcpu info pages from the p2m, potentially putting
> > > > the guest into infinite page-fault loops.
> > > >
> > > > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> > >
> > > Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
> >
> > Thanks!
> >
> > >
> > > I've been looking however and I'm not able to spot where you copy the
> > > shared_info data, which I think it's also quite critical for the
> > > domain, as it contains the info about event channels (when using L2).
> > > Also for HVM forks the shared info should be mapped at the same gfn as
> > > the parent, or else the child trying to access it will trigger an EPT
> > > fault that won't be able to populate such gfn, because the shared_info
> > > on the parent is already shared between Xen and the parent.
> >
> > Pages that cause an EPT fault but can't be made shared get a new page
> > allocated for them and copied in mem_sharing_fork_page. There are many
> > pages like that, mostly due to QEMU mapping them and thus holding an
> > extra reference. That said, shared_info is manually copied during
> > forking in copy_special_pages, at the end of the function you will
> > find:
> >
> > old_mfn = _mfn(virt_to_mfn(d->shared_info));
> > new_mfn = _mfn(virt_to_mfn(cd->shared_info));
> >
> > copy_domain_page(new_mfn, old_mfn);
>
> Yes, that's indeed fine, you need to copy the contents of the shared
> info page, but for HVM you also need to make sure the shared info page
> is mapped at the same gfn as the parent. HVM guest issue a hypercall
> to request the mapping of the shared info page to a specific gfn,
> since it's not added to the guest physmap by default.
> XENMAPSPACE_shared_info is used in order to request the shared info
> page to be mapped at a specific gfn.
>
> AFAICT during fork such shared info mapping is not replicated to the
> child, and hence the child trying to access the gfn of the shared info
> page would just result in EPT faults that won't be fixed (because the
> parent shared info page cannot be shared with the child)?
>
> You should be able to use get_gpfn_from_mfn in order to get the parent
> gfn where the shared info mfn is mapped (if any), and then replicate
> this in the child using it's own shared info.
>
> On fork reset you should check if the child shared info gfn != parent
> shared info gfn and reinstate the parent state if different from the
> child.

OK, I see what you mean. In the way things set up currently the EPT
fault-loop problem doesn't happen since the page does get copied, it
just gets copied to a new mfn not the one d->shared_info points to. So
whatever issue that may bring it must be subtle because we haven't
noticed any instability.

Also, looking at the save/restore code in libxc it seems to me that
shared_info is actually a PV specific page and it doesn't get
saved/restored with an HVM domain. The hvm code paths don't process
REC_TYPE_SHARED_INFO at all. So since forks are exclusively for HVM
domains, do we really need it and if so, why doesn't HVM save/restore
need it?

Tamas


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset
  2020-04-25 12:31       ` Tamas K Lengyel
@ 2020-04-27  8:27         ` Roger Pau Monné
  0 siblings, 0 replies; 12+ messages in thread
From: Roger Pau Monné @ 2020-04-27  8:27 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Tamas K Lengyel, Wei Liu, Andrew Cooper, George Dunlap,
	Jan Beulich, Xen-devel

On Sat, Apr 25, 2020 at 06:31:46AM -0600, Tamas K Lengyel wrote:
> On Sat, Apr 25, 2020 at 3:01 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Fri, Apr 24, 2020 at 06:18:24AM -0600, Tamas K Lengyel wrote:
> > > On Fri, Apr 24, 2020 at 3:44 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > > >
> > > > On Thu, Apr 23, 2020 at 08:30:06AM -0700, Tamas K Lengyel wrote:
> > > > > When resetting a VM fork we ought to only remove pages that were allocated for
> > > > > the fork during it's execution and the contents copied over from the parent.
> > > > > This can be determined if the page is sharable as special pages used by the
> > > > > fork for other purposes will not pass this test. Unfortunately during the fork
> > > > > reset loop we only partially check whether that's the case. A page's type may
> > > > > indicate it is sharable (pass p2m_is_sharable) but that's not a sufficient
> > > > > check by itself. All checks that are normally performed before a page is
> > > > > converted to the sharable type need to be performed to avoid removing pages
> > > > > from the p2m that may be used for other purposes. For example, currently the
> > > > > reset loop also removes the vcpu info pages from the p2m, potentially putting
> > > > > the guest into infinite page-fault loops.
> > > > >
> > > > > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> > > >
> > > > Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
> > >
> > > Thanks!
> > >
> > > >
> > > > I've been looking however and I'm not able to spot where you copy the
> > > > shared_info data, which I think it's also quite critical for the
> > > > domain, as it contains the info about event channels (when using L2).
> > > > Also for HVM forks the shared info should be mapped at the same gfn as
> > > > the parent, or else the child trying to access it will trigger an EPT
> > > > fault that won't be able to populate such gfn, because the shared_info
> > > > on the parent is already shared between Xen and the parent.
> > >
> > > Pages that cause an EPT fault but can't be made shared get a new page
> > > allocated for them and copied in mem_sharing_fork_page. There are many
> > > pages like that, mostly due to QEMU mapping them and thus holding an
> > > extra reference. That said, shared_info is manually copied during
> > > forking in copy_special_pages, at the end of the function you will
> > > find:
> > >
> > > old_mfn = _mfn(virt_to_mfn(d->shared_info));
> > > new_mfn = _mfn(virt_to_mfn(cd->shared_info));
> > >
> > > copy_domain_page(new_mfn, old_mfn);
> >
> > Yes, that's indeed fine, you need to copy the contents of the shared
> > info page, but for HVM you also need to make sure the shared info page
> > is mapped at the same gfn as the parent. HVM guest issue a hypercall
> > to request the mapping of the shared info page to a specific gfn,
> > since it's not added to the guest physmap by default.
> > XENMAPSPACE_shared_info is used in order to request the shared info
> > page to be mapped at a specific gfn.
> >
> > AFAICT during fork such shared info mapping is not replicated to the
> > child, and hence the child trying to access the gfn of the shared info
> > page would just result in EPT faults that won't be fixed (because the
> > parent shared info page cannot be shared with the child)?
> >
> > You should be able to use get_gpfn_from_mfn in order to get the parent
> > gfn where the shared info mfn is mapped (if any), and then replicate
> > this in the child using it's own shared info.
> >
> > On fork reset you should check if the child shared info gfn != parent
> > shared info gfn and reinstate the parent state if different from the
> > child.
> 
> OK, I see what you mean. In the way things set up currently the EPT
> fault-loop problem doesn't happen since the page does get copied, it
> just gets copied to a new mfn not the one d->shared_info points to. So
> whatever issue that may bring it must be subtle because we haven't
> noticed any instability.

In any case I think it would be good if you could add such mapping on
domain fork and reset, as you already partially handle the shared info
page by copying the data from the parent into the child. Or at least
add a FIXME comment to note the fact that a child trying to access the
shared info page won't work.

> Also, looking at the save/restore code in libxc it seems to me that
> shared_info is actually a PV specific page and it doesn't get
> saved/restored with an HVM domain. The hvm code paths don't process
> REC_TYPE_SHARED_INFO at all. So since forks are exclusively for HVM
> domains, do we really need it and if so, why doesn't HVM save/restore
> need it?

HVM domains on suspend/resume are aware of the need to re-instate the
shared info mapping, as it's part of the protocol and the guest is
actively cooperating on such actions. The same happens with the vcpu
info pages or the PV timers for example: they are not saved during a
suspend/resume and the guest needs to setup them again on restore.

Fork however is completely transparent from a guest PoV, and hence
there's no way to signal the child it needs to setup the shared info
mapping (or any other mapping or interface FWIW).

Roger.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v17 2/2] xen/tools: VM forking toolstack side
  2020-04-23 15:30 ` [PATCH v17 2/2] xen/tools: VM forking toolstack side Tamas K Lengyel
  2020-04-24 13:11   ` Jan Beulich
@ 2020-05-01 13:59   ` Tamas K Lengyel
  2020-05-06 13:00     ` Wei Liu
  1 sibling, 1 reply; 12+ messages in thread
From: Tamas K Lengyel @ 2020-05-01 13:59 UTC (permalink / raw)
  To: Tamas K Lengyel; +Cc: Anthony PERARD, Xen-devel, Ian Jackson, Wei Liu

On Thu, Apr 23, 2020 at 9:33 AM Tamas K Lengyel <tamas.lengyel@intel.com> wrote:
>
> Add necessary bits to implement "xl fork-vm" commands. The command allows the
> user to specify how to launch the device model allowing for a late-launch model
> in which the user can execute the fork without the device model and decide to
> only later launch it.
>
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>

Patch ping. If nothing else at least the libxc parts would be nice to
get merged before the freeze.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v17 2/2] xen/tools: VM forking toolstack side
  2020-05-01 13:59   ` Tamas K Lengyel
@ 2020-05-06 13:00     ` Wei Liu
  2020-05-06 13:17       ` Tamas K Lengyel
  0 siblings, 1 reply; 12+ messages in thread
From: Wei Liu @ 2020-05-06 13:00 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Anthony PERARD, Xen-devel, Tamas K Lengyel, Ian Jackson, Wei Liu

On Fri, May 01, 2020 at 07:59:45AM -0600, Tamas K Lengyel wrote:
> On Thu, Apr 23, 2020 at 9:33 AM Tamas K Lengyel <tamas.lengyel@intel.com> wrote:
> >
> > Add necessary bits to implement "xl fork-vm" commands. The command allows the
> > user to specify how to launch the device model allowing for a late-launch model
> > in which the user can execute the fork without the device model and decide to
> > only later launch it.
> >
> > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> 
> Patch ping. If nothing else at least the libxc parts would be nice to
> get merged before the freeze.

Changes to libxc looks good to me.

Please split it out to a patch with proper commit message.

Wei.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v17 2/2] xen/tools: VM forking toolstack side
  2020-05-06 13:00     ` Wei Liu
@ 2020-05-06 13:17       ` Tamas K Lengyel
  0 siblings, 0 replies; 12+ messages in thread
From: Tamas K Lengyel @ 2020-05-06 13:17 UTC (permalink / raw)
  To: Wei Liu; +Cc: Anthony PERARD, Xen-devel, Tamas K Lengyel, Ian Jackson

On Wed, May 6, 2020 at 7:00 AM Wei Liu <wl@xen.org> wrote:
>
> On Fri, May 01, 2020 at 07:59:45AM -0600, Tamas K Lengyel wrote:
> > On Thu, Apr 23, 2020 at 9:33 AM Tamas K Lengyel <tamas.lengyel@intel.com> wrote:
> > >
> > > Add necessary bits to implement "xl fork-vm" commands. The command allows the
> > > user to specify how to launch the device model allowing for a late-launch model
> > > in which the user can execute the fork without the device model and decide to
> > > only later launch it.
> > >
> > > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> >
> > Patch ping. If nothing else at least the libxc parts would be nice to
> > get merged before the freeze.
>
> Changes to libxc looks good to me.
>
> Please split it out to a patch with proper commit message.
>

Sounds good, will do.

Thanks,
Tamas


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2020-05-06 13:18 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-23 15:30 [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset Tamas K Lengyel
2020-04-23 15:30 ` [PATCH v17 2/2] xen/tools: VM forking toolstack side Tamas K Lengyel
2020-04-24 13:11   ` Jan Beulich
2020-05-01 13:59   ` Tamas K Lengyel
2020-05-06 13:00     ` Wei Liu
2020-05-06 13:17       ` Tamas K Lengyel
2020-04-24  9:43 ` [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset Roger Pau Monné
2020-04-24 12:18   ` Tamas K Lengyel
2020-04-24 12:37     ` Roger Pau Monné
2020-04-25  9:01     ` Roger Pau Monné
2020-04-25 12:31       ` Tamas K Lengyel
2020-04-27  8:27         ` Roger Pau Monné

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).