All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
@ 2022-03-22 17:41 Tamas K Lengyel
  2022-03-22 17:41 ` [PATCH 2/3] x86/mem_sharing: use dom_cow as placeholder parent until fork is complete Tamas K Lengyel
                   ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-22 17:41 UTC (permalink / raw)
  To: xen-devel
  Cc: Tamas K Lengyel, Jan Beulich, Andrew Cooper, Roger Pau Monné,
	Wei Liu, George Dunlap, Julien Grall, Stefano Stabellini,
	Tamas K Lengyel

Add option to the fork memop to skip populating the fork with special pages.
These special pages are only necessary when setting up forks to be fully
functional with a toolstack. For short-lived forks where no toolstack is active
these pages are uneccesary.

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
 xen/arch/x86/include/asm/hvm/domain.h |  4 +++-
 xen/arch/x86/mm/mem_sharing.c         | 33 +++++++++++++++++----------
 xen/include/public/memory.h           |  4 ++--
 3 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
index 698455444e..446cd06411 100644
--- a/xen/arch/x86/include/asm/hvm/domain.h
+++ b/xen/arch/x86/include/asm/hvm/domain.h
@@ -31,7 +31,9 @@
 #ifdef CONFIG_MEM_SHARING
 struct mem_sharing_domain
 {
-    bool enabled, block_interrupts;
+    bool enabled;
+    bool block_interrupts;
+    bool skip_special_pages;
 
     /*
      * When releasing shared gfn's in a preemptible manner, recall where
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 15e6a7ed81..84c04ddfa3 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1643,7 +1643,8 @@ static int bring_up_vcpus(struct domain *cd, struct domain *d)
     return 0;
 }
 
-static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
+static int copy_vcpu_settings(struct domain *cd, const struct domain *d,
+                              bool skip_special_pages)
 {
     unsigned int i;
     struct p2m_domain *p2m = p2m_get_hostp2m(cd);
@@ -1660,7 +1661,7 @@ static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
 
         /* Copy & map in the vcpu_info page if the guest uses one */
         vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
-        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
+        if ( !skip_special_pages && !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
         {
             mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
 
@@ -1807,17 +1808,18 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
     return 0;
 }
 
-static int copy_settings(struct domain *cd, struct domain *d)
+static int copy_settings(struct domain *cd, struct domain *d,
+                         bool skip_special_pages)
 {
     int rc;
 
-    if ( (rc = copy_vcpu_settings(cd, d)) )
+    if ( (rc = copy_vcpu_settings(cd, d, skip_special_pages)) )
         return rc;
 
     if ( (rc = hvm_copy_context_and_params(cd, d)) )
         return rc;
 
-    if ( (rc = copy_special_pages(cd, d)) )
+    if ( !skip_special_pages && (rc = copy_special_pages(cd, d)) )
         return rc;
 
     copy_tsc(cd, d);
@@ -1826,9 +1828,11 @@ static int copy_settings(struct domain *cd, struct domain *d)
     return rc;
 }
 
-static int fork(struct domain *cd, struct domain *d)
+static int fork(struct domain *cd, struct domain *d, uint16_t flags)
 {
     int rc = -EBUSY;
+    bool block_interrupts = flags & XENMEM_FORK_BLOCK_INTERRUPTS;
+    bool skip_special_pages = flags & XENMEM_FORK_SKIP_SPECIAL_PAGES;
 
     if ( !cd->controller_pause_count )
         return rc;
@@ -1856,7 +1860,13 @@ static int fork(struct domain *cd, struct domain *d)
     if ( (rc = bring_up_vcpus(cd, d)) )
         goto done;
 
-    rc = copy_settings(cd, d);
+    if ( !(rc = copy_settings(cd, d, skip_special_pages)) )
+    {
+        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
+        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
+        /* skip mapping the vAPIC page on unpause if skipping special pages */
+        cd->creation_finished = skip_special_pages;
+    }
 
  done:
     if ( rc && rc != -ERESTART )
@@ -1920,7 +1930,7 @@ static int mem_sharing_fork_reset(struct domain *d)
     }
     spin_unlock_recursive(&d->page_alloc_lock);
 
-    rc = copy_settings(d, pd);
+    rc = copy_settings(d, pd, d->arch.hvm.mem_sharing.skip_special_pages);
 
     domain_unpause(d);
 
@@ -2190,7 +2200,8 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         if ( mso.u.fork.pad )
             goto out;
         if ( mso.u.fork.flags &
-             ~(XENMEM_FORK_WITH_IOMMU_ALLOWED | XENMEM_FORK_BLOCK_INTERRUPTS) )
+             ~(XENMEM_FORK_WITH_IOMMU_ALLOWED | XENMEM_FORK_BLOCK_INTERRUPTS |
+               XENMEM_FORK_SKIP_SPECIAL_PAGES) )
             goto out;
 
         rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
@@ -2212,14 +2223,12 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
             goto out;
         }
 
-        rc = fork(d, pd);
+        rc = fork(d, pd, mso.u.fork.flags);
 
         if ( rc == -ERESTART )
             rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
                                                "lh", XENMEM_sharing_op,
                                                arg);
-        else if ( !rc && (mso.u.fork.flags & XENMEM_FORK_BLOCK_INTERRUPTS) )
-            d->arch.hvm.mem_sharing.block_interrupts = true;
 
         rcu_unlock_domain(pd);
         break;
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index a1a0f0233a..208d8dcbd9 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -543,10 +543,10 @@ struct xen_mem_sharing_op {
         } debug;
         struct mem_sharing_op_fork {      /* OP_FORK */
             domid_t parent_domain;        /* IN: parent's domain id */
-/* Only makes sense for short-lived forks */
+/* These flags only makes sense for short-lived forks */
 #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
-/* Only makes sense for short-lived forks */
 #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
+#define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
             uint16_t flags;               /* IN: optional settings */
             uint32_t pad;                 /* Must be set to 0 */
         } fork;
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 2/3] x86/mem_sharing: use dom_cow as placeholder parent until fork is complete
  2022-03-22 17:41 [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork Tamas K Lengyel
@ 2022-03-22 17:41 ` Tamas K Lengyel
  2022-03-28 13:32   ` Jan Beulich
  2022-03-22 17:41 ` [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable Tamas K Lengyel
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-22 17:41 UTC (permalink / raw)
  To: xen-devel
  Cc: Tamas K Lengyel, Jan Beulich, Andrew Cooper, Roger Pau Monné,
	Wei Liu, Tamas K Lengyel, George Dunlap

For the duration of the fork memop set dom_cow as a placeholder parent. This
gets updated to the real parent when the fork operation completes, or to NULL
in case the fork failed. Doing this allows us to skip populating the physmap
with any entries until the fork operation successfully completes. Currently
bringing up vCPUs may inadvertantly map in some pages that can turn out to be
unecessary, like the CR3 gfn when paging is disabled.

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
 xen/arch/x86/include/asm/mem_sharing.h | 2 +-
 xen/arch/x86/mm/mem_sharing.c          | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/include/asm/mem_sharing.h b/xen/arch/x86/include/asm/mem_sharing.h
index cf7a12f4d2..b4a8e8795a 100644
--- a/xen/arch/x86/include/asm/mem_sharing.h
+++ b/xen/arch/x86/include/asm/mem_sharing.h
@@ -79,7 +79,7 @@ static inline int mem_sharing_unshare_page(struct domain *d,
 
 static inline bool mem_sharing_is_fork(const struct domain *d)
 {
-    return d->parent;
+    return d->parent && d->parent != dom_cow;
 }
 
 int mem_sharing_fork_page(struct domain *d, gfn_t gfn,
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 84c04ddfa3..a21c781452 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1850,7 +1850,9 @@ static int fork(struct domain *cd, struct domain *d, uint16_t flags)
         *cd->arch.cpuid = *d->arch.cpuid;
         *cd->arch.msr = *d->arch.msr;
         cd->vmtrace_size = d->vmtrace_size;
-        cd->parent = d;
+
+        /* use dom_cow as a placeholder until we are all done */
+        cd->parent = dom_cow;
     }
 
     /* This is preemptible so it's the first to get done */
@@ -1862,6 +1864,7 @@ static int fork(struct domain *cd, struct domain *d, uint16_t flags)
 
     if ( !(rc = copy_settings(cd, d, skip_special_pages)) )
     {
+        cd->parent = d;
         cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
         cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
         /* skip mapping the vAPIC page on unpause if skipping special pages */
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-22 17:41 [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork Tamas K Lengyel
  2022-03-22 17:41 ` [PATCH 2/3] x86/mem_sharing: use dom_cow as placeholder parent until fork is complete Tamas K Lengyel
@ 2022-03-22 17:41 ` Tamas K Lengyel
  2022-03-24 15:45   ` Roger Pau Monné
  2022-03-25 13:42   ` Roger Pau Monné
  2022-03-24 14:50 ` [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork Roger Pau Monné
  2022-03-25 10:59 ` Roger Pau Monné
  3 siblings, 2 replies; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-22 17:41 UTC (permalink / raw)
  To: xen-devel
  Cc: Tamas K Lengyel, Wei Liu, Anthony PERARD, Juergen Gross,
	Andrew Cooper, George Dunlap, Jan Beulich, Julien Grall,
	Stefano Stabellini, Roger Pau Monné,
	Tamas K Lengyel, Alexandru Isaila, Petre Pircalabu

Allow specify distinct parts of the fork VM to be reset. This is useful when a
fuzzing operation involves mapping in only a handful of pages that are known
ahead of time. Throwing these pages away just to be re-copied immediately is
expensive, thus allowing to specify partial resets can speed things up.

Also allow resetting to be initiated from vm_event responses as an
optiomization.

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
 tools/include/xenctrl.h                |  3 ++-
 tools/libs/ctrl/xc_memshr.c            |  7 ++++++-
 xen/arch/x86/include/asm/mem_sharing.h |  9 +++++++++
 xen/arch/x86/mm/mem_sharing.c          | 22 +++++++++++++++++-----
 xen/common/vm_event.c                  | 14 ++++++++++++++
 xen/include/public/memory.h            |  4 +++-
 xen/include/public/vm_event.h          |  8 ++++++++
 7 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/tools/include/xenctrl.h b/tools/include/xenctrl.h
index 95bd5eca67..1b089a2c02 100644
--- a/tools/include/xenctrl.h
+++ b/tools/include/xenctrl.h
@@ -2290,7 +2290,8 @@ int xc_memshr_fork(xc_interface *xch,
  *
  * With VMs that have a lot of memory this call may block for a long time.
  */
-int xc_memshr_fork_reset(xc_interface *xch, uint32_t forked_domain);
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t forked_domain,
+                         bool reset_state, bool reset_memory);
 
 /* Debug calls: return the number of pages referencing the shared frame backing
  * the input argument. Should be one or greater.
diff --git a/tools/libs/ctrl/xc_memshr.c b/tools/libs/ctrl/xc_memshr.c
index a6cfd7dccf..a0d0b894e2 100644
--- a/tools/libs/ctrl/xc_memshr.c
+++ b/tools/libs/ctrl/xc_memshr.c
@@ -257,12 +257,17 @@ int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
     return xc_memshr_memop(xch, domid, &mso);
 }
 
-int xc_memshr_fork_reset(xc_interface *xch, uint32_t domid)
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t domid, bool reset_state,
+                         bool reset_memory)
 {
     xen_mem_sharing_op_t mso;
 
     memset(&mso, 0, sizeof(mso));
     mso.op = XENMEM_sharing_op_fork_reset;
+    if ( reset_state )
+        mso.u.fork.flags |= XENMEM_FORK_RESET_STATE;
+    if ( reset_memory )
+        mso.u.fork.flags |= XENMEM_FORK_RESET_MEMORY;
 
     return xc_memshr_memop(xch, domid, &mso);
 }
diff --git a/xen/arch/x86/include/asm/mem_sharing.h b/xen/arch/x86/include/asm/mem_sharing.h
index b4a8e8795a..fca5ec8aeb 100644
--- a/xen/arch/x86/include/asm/mem_sharing.h
+++ b/xen/arch/x86/include/asm/mem_sharing.h
@@ -85,6 +85,9 @@ static inline bool mem_sharing_is_fork(const struct domain *d)
 int mem_sharing_fork_page(struct domain *d, gfn_t gfn,
                           bool unsharing);
 
+int mem_sharing_fork_reset(struct domain *d, bool reset_state,
+                           bool reset_memory);
+
 /*
  * If called by a foreign domain, possible errors are
  *   -EBUSY -> ring full
@@ -148,6 +151,12 @@ static inline int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool lock)
     return -EOPNOTSUPP;
 }
 
+static inline int mem_sharing_fork_reset(struct domain *d, bool reset_state,
+                                         bool reset_memory)
+{
+    return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* __MEM_SHARING_H__ */
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index a21c781452..bfa6082f13 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1892,15 +1892,19 @@ static int fork(struct domain *cd, struct domain *d, uint16_t flags)
  * footprints the hypercall continuation should be implemented (or if this
  * feature needs to be become "stable").
  */
-static int mem_sharing_fork_reset(struct domain *d)
+int mem_sharing_fork_reset(struct domain *d, bool reset_state,
+                           bool reset_memory)
 {
-    int rc;
+    int rc = 0;
     struct domain *pd = d->parent;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
     struct page_info *page, *tmp;
 
     domain_pause(d);
 
+    if ( !reset_memory )
+        goto state;
+
     /* need recursive lock because we will free pages */
     spin_lock_recursive(&d->page_alloc_lock);
     page_list_for_each_safe(page, tmp, &d->page_list)
@@ -1933,7 +1937,9 @@ static int mem_sharing_fork_reset(struct domain *d)
     }
     spin_unlock_recursive(&d->page_alloc_lock);
 
-    rc = copy_settings(d, pd, d->arch.hvm.mem_sharing.skip_special_pages);
+ state:
+    if ( reset_state )
+        rc = copy_settings(d, pd, d->arch.hvm.mem_sharing.skip_special_pages);
 
     domain_unpause(d);
 
@@ -2239,15 +2245,21 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 
     case XENMEM_sharing_op_fork_reset:
     {
+        bool reset_state = mso.u.fork.flags & XENMEM_FORK_RESET_STATE;
+        bool reset_memory = mso.u.fork.flags & XENMEM_FORK_RESET_MEMORY;
+
         rc = -EINVAL;
-        if ( mso.u.fork.pad || mso.u.fork.flags )
+        if ( mso.u.fork.pad || (!reset_state && !reset_memory) )
+            goto out;
+        if ( mso.u.fork.flags &
+             ~(XENMEM_FORK_RESET_STATE | XENMEM_FORK_RESET_MEMORY) )
             goto out;
 
         rc = -ENOSYS;
         if ( !d->parent )
             goto out;
 
-        rc = mem_sharing_fork_reset(d);
+        rc = mem_sharing_fork_reset(d, reset_state, reset_memory);
         break;
     }
 
diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
index 84cf52636b..a7b192be0d 100644
--- a/xen/common/vm_event.c
+++ b/xen/common/vm_event.c
@@ -28,6 +28,11 @@
 #include <asm/p2m.h>
 #include <asm/monitor.h>
 #include <asm/vm_event.h>
+
+#ifdef CONFIG_MEM_SHARING
+#include <asm/mem_sharing.h>
+#endif
+
 #include <xsm/xsm.h>
 #include <public/hvm/params.h>
 
@@ -394,6 +399,15 @@ static int vm_event_resume(struct domain *d, struct vm_event_domain *ved)
             if ( rsp.reason == VM_EVENT_REASON_MEM_PAGING )
                 p2m_mem_paging_resume(d, &rsp);
 #endif
+#ifdef CONFIG_MEM_SHARING
+            do {
+                bool reset_state = rsp.flags & VM_EVENT_FLAG_RESET_FORK_STATE;
+                bool reset_mem = rsp.flags & VM_EVENT_FLAG_RESET_FORK_MEMORY;
+
+                if ( reset_state || reset_mem )
+                    mem_sharing_fork_reset(d, reset_state, reset_mem);
+            } while(0);
+#endif
 
             /*
              * Check emulation flags in the arch-specific handler only, as it
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 208d8dcbd9..30ce23c5a7 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
                 uint32_t gref;     /* IN: gref to debug         */
             } u;
         } debug;
-        struct mem_sharing_op_fork {      /* OP_FORK */
+        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
             domid_t parent_domain;        /* IN: parent's domain id */
 /* These flags only makes sense for short-lived forks */
 #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
 #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
 #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
+#define XENMEM_FORK_RESET_STATE        (1u << 3)
+#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
             uint16_t flags;               /* IN: optional settings */
             uint32_t pad;                 /* Must be set to 0 */
         } fork;
diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
index bb003d21d0..81c2ee28cc 100644
--- a/xen/include/public/vm_event.h
+++ b/xen/include/public/vm_event.h
@@ -127,6 +127,14 @@
  * Reset the vmtrace buffer (if vmtrace is enabled)
  */
 #define VM_EVENT_FLAG_RESET_VMTRACE      (1 << 13)
+/*
+ * Reset the VM state (if VM is fork)
+ */
+#define VM_EVENT_FLAG_RESET_FORK_STATE   (1 << 14)
+/*
+ * Remove unshared entried from physmap (if VM is fork)
+ */
+#define VM_EVENT_FLAG_RESET_FORK_MEMORY  (1 << 15)
 
 /*
  * Reasons for the vm event request
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
  2022-03-22 17:41 [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork Tamas K Lengyel
  2022-03-22 17:41 ` [PATCH 2/3] x86/mem_sharing: use dom_cow as placeholder parent until fork is complete Tamas K Lengyel
  2022-03-22 17:41 ` [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable Tamas K Lengyel
@ 2022-03-24 14:50 ` Roger Pau Monné
  2022-03-24 15:15   ` Tamas K Lengyel
  2022-03-25 10:59 ` Roger Pau Monné
  3 siblings, 1 reply; 26+ messages in thread
From: Roger Pau Monné @ 2022-03-24 14:50 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: xen-devel, Jan Beulich, Andrew Cooper, Wei Liu, George Dunlap,
	Julien Grall, Stefano Stabellini, Tamas K Lengyel

On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> Add option to the fork memop to skip populating the fork with special pages.
> These special pages are only necessary when setting up forks to be fully
> functional with a toolstack. For short-lived forks where no toolstack is active
> these pages are uneccesary.

I'm not sure those are strictly related to having a toolstack. For
example the vcpu_info has nothing to do with having a toolstack, and
is only used by the guest in order to receive events or fetch time
related data. So while a short-lived fork might not make use of those,
that has nothing to do with having a toolstack or not.

> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> ---
>  xen/arch/x86/include/asm/hvm/domain.h |  4 +++-
>  xen/arch/x86/mm/mem_sharing.c         | 33 +++++++++++++++++----------
>  xen/include/public/memory.h           |  4 ++--
>  3 files changed, 26 insertions(+), 15 deletions(-)
> 
> diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
> index 698455444e..446cd06411 100644
> --- a/xen/arch/x86/include/asm/hvm/domain.h
> +++ b/xen/arch/x86/include/asm/hvm/domain.h
> @@ -31,7 +31,9 @@
>  #ifdef CONFIG_MEM_SHARING
>  struct mem_sharing_domain
>  {
> -    bool enabled, block_interrupts;
> +    bool enabled;
> +    bool block_interrupts;
> +    bool skip_special_pages;
>  
>      /*
>       * When releasing shared gfn's in a preemptible manner, recall where
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index 15e6a7ed81..84c04ddfa3 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -1643,7 +1643,8 @@ static int bring_up_vcpus(struct domain *cd, struct domain *d)
>      return 0;
>  }
>  
> -static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> +static int copy_vcpu_settings(struct domain *cd, const struct domain *d,
> +                              bool skip_special_pages)
>  {
>      unsigned int i;
>      struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> @@ -1660,7 +1661,7 @@ static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
>  
>          /* Copy & map in the vcpu_info page if the guest uses one */
>          vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> -        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> +        if ( !skip_special_pages && !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
>          {
>              mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
>  
> @@ -1807,17 +1808,18 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
>      return 0;
>  }
>  
> -static int copy_settings(struct domain *cd, struct domain *d)
> +static int copy_settings(struct domain *cd, struct domain *d,
> +                         bool skip_special_pages)
>  {
>      int rc;
>  
> -    if ( (rc = copy_vcpu_settings(cd, d)) )
> +    if ( (rc = copy_vcpu_settings(cd, d, skip_special_pages)) )
>          return rc;
>  
>      if ( (rc = hvm_copy_context_and_params(cd, d)) )
>          return rc;
>  
> -    if ( (rc = copy_special_pages(cd, d)) )
> +    if ( !skip_special_pages && (rc = copy_special_pages(cd, d)) )
>          return rc;
>  
>      copy_tsc(cd, d);
> @@ -1826,9 +1828,11 @@ static int copy_settings(struct domain *cd, struct domain *d)
>      return rc;
>  }
>  
> -static int fork(struct domain *cd, struct domain *d)
> +static int fork(struct domain *cd, struct domain *d, uint16_t flags)
>  {
>      int rc = -EBUSY;
> +    bool block_interrupts = flags & XENMEM_FORK_BLOCK_INTERRUPTS;
> +    bool skip_special_pages = flags & XENMEM_FORK_SKIP_SPECIAL_PAGES;
>  
>      if ( !cd->controller_pause_count )
>          return rc;
> @@ -1856,7 +1860,13 @@ static int fork(struct domain *cd, struct domain *d)
>      if ( (rc = bring_up_vcpus(cd, d)) )
>          goto done;
>  
> -    rc = copy_settings(cd, d);
> +    if ( !(rc = copy_settings(cd, d, skip_special_pages)) )

Can you set
cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages} earlier
so that you don't need to pass the options around to copy_settings and
copy_vcpu_settings?

> +    {
> +        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
> +        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
> +        /* skip mapping the vAPIC page on unpause if skipping special pages */
> +        cd->creation_finished = skip_special_pages;

I think this is dangerous. While it might be true at the moment that
the arch_domain_creation_finished only maps the vAPIC page, there's no
guarantee it couldn't do other stuff in the future that could be
required for the VM to be started.

Does it add much overhead to map the vAPIC page?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
  2022-03-24 14:50 ` [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork Roger Pau Monné
@ 2022-03-24 15:15   ` Tamas K Lengyel
  2022-03-24 15:51     ` Roger Pau Monné
  0 siblings, 1 reply; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-24 15:15 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, xen-devel, Jan Beulich, Andrew Cooper, Wei Liu,
	George Dunlap, Julien Grall, Stefano Stabellini

On Thu, Mar 24, 2022 at 10:50 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > Add option to the fork memop to skip populating the fork with special pages.
> > These special pages are only necessary when setting up forks to be fully
> > functional with a toolstack. For short-lived forks where no toolstack is active
> > these pages are uneccesary.
>
> I'm not sure those are strictly related to having a toolstack. For
> example the vcpu_info has nothing to do with having a toolstack, and
> is only used by the guest in order to receive events or fetch time
> related data. So while a short-lived fork might not make use of those,
> that has nothing to do with having a toolstack or not.

Fair enough, the point is that the short live fork doesn't use these pages.

> >
> > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> > ---
> >  xen/arch/x86/include/asm/hvm/domain.h |  4 +++-
> >  xen/arch/x86/mm/mem_sharing.c         | 33 +++++++++++++++++----------
> >  xen/include/public/memory.h           |  4 ++--
> >  3 files changed, 26 insertions(+), 15 deletions(-)
> >
> > diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
> > index 698455444e..446cd06411 100644
> > --- a/xen/arch/x86/include/asm/hvm/domain.h
> > +++ b/xen/arch/x86/include/asm/hvm/domain.h
> > @@ -31,7 +31,9 @@
> >  #ifdef CONFIG_MEM_SHARING
> >  struct mem_sharing_domain
> >  {
> > -    bool enabled, block_interrupts;
> > +    bool enabled;
> > +    bool block_interrupts;
> > +    bool skip_special_pages;
> >
> >      /*
> >       * When releasing shared gfn's in a preemptible manner, recall where
> > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > index 15e6a7ed81..84c04ddfa3 100644
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -1643,7 +1643,8 @@ static int bring_up_vcpus(struct domain *cd, struct domain *d)
> >      return 0;
> >  }
> >
> > -static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> > +static int copy_vcpu_settings(struct domain *cd, const struct domain *d,
> > +                              bool skip_special_pages)
> >  {
> >      unsigned int i;
> >      struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > @@ -1660,7 +1661,7 @@ static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> >
> >          /* Copy & map in the vcpu_info page if the guest uses one */
> >          vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> > -        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > +        if ( !skip_special_pages && !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> >          {
> >              mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> >
> > @@ -1807,17 +1808,18 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
> >      return 0;
> >  }
> >
> > -static int copy_settings(struct domain *cd, struct domain *d)
> > +static int copy_settings(struct domain *cd, struct domain *d,
> > +                         bool skip_special_pages)
> >  {
> >      int rc;
> >
> > -    if ( (rc = copy_vcpu_settings(cd, d)) )
> > +    if ( (rc = copy_vcpu_settings(cd, d, skip_special_pages)) )
> >          return rc;
> >
> >      if ( (rc = hvm_copy_context_and_params(cd, d)) )
> >          return rc;
> >
> > -    if ( (rc = copy_special_pages(cd, d)) )
> > +    if ( !skip_special_pages && (rc = copy_special_pages(cd, d)) )
> >          return rc;
> >
> >      copy_tsc(cd, d);
> > @@ -1826,9 +1828,11 @@ static int copy_settings(struct domain *cd, struct domain *d)
> >      return rc;
> >  }
> >
> > -static int fork(struct domain *cd, struct domain *d)
> > +static int fork(struct domain *cd, struct domain *d, uint16_t flags)
> >  {
> >      int rc = -EBUSY;
> > +    bool block_interrupts = flags & XENMEM_FORK_BLOCK_INTERRUPTS;
> > +    bool skip_special_pages = flags & XENMEM_FORK_SKIP_SPECIAL_PAGES;
> >
> >      if ( !cd->controller_pause_count )
> >          return rc;
> > @@ -1856,7 +1860,13 @@ static int fork(struct domain *cd, struct domain *d)
> >      if ( (rc = bring_up_vcpus(cd, d)) )
> >          goto done;
> >
> > -    rc = copy_settings(cd, d);
> > +    if ( !(rc = copy_settings(cd, d, skip_special_pages)) )
>
> Can you set
> cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages} earlier
> so that you don't need to pass the options around to copy_settings and
> copy_vcpu_settings?

Would be possible yes, but then we would have to clear them in case
the forking failed at any point. Setting them only at the end when the
fork finished ensures that those fields are only ever valid if the VM
is a fork. Both are valid approaches and I prefer this over the other.

>
> > +    {
> > +        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
> > +        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
> > +        /* skip mapping the vAPIC page on unpause if skipping special pages */
> > +        cd->creation_finished = skip_special_pages;
>
> I think this is dangerous. While it might be true at the moment that
> the arch_domain_creation_finished only maps the vAPIC page, there's no
> guarantee it couldn't do other stuff in the future that could be
> required for the VM to be started.

I understand this domain_creation_finished route could be expanded in
the future to include other stuff, but IMHO we can evaluate what to do
about that when and if it becomes necessary. This is all experimental
to begin with.

> Does it add much overhead to map the vAPIC page?

I don't have numbers but it does add overhead. When we do a fork reset
we loop through all pages in the physmap to determine what needs to be
removed. So having an extra page means that loop is always larger than
it actually needs to be. Considering we do the reset thousands of
times per second per core, you can imagine it adding up over time.

Tamas


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-22 17:41 ` [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable Tamas K Lengyel
@ 2022-03-24 15:45   ` Roger Pau Monné
  2022-03-24 15:52     ` Tamas K Lengyel
  2022-03-25 13:42   ` Roger Pau Monné
  1 sibling, 1 reply; 26+ messages in thread
From: Roger Pau Monné @ 2022-03-24 15:45 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: xen-devel, Wei Liu, Anthony PERARD, Juergen Gross, Andrew Cooper,
	George Dunlap, Jan Beulich, Julien Grall, Stefano Stabellini,
	Tamas K Lengyel, Alexandru Isaila, Petre Pircalabu

On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
> diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
> index 208d8dcbd9..30ce23c5a7 100644
> --- a/xen/include/public/memory.h
> +++ b/xen/include/public/memory.h
> @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
>                  uint32_t gref;     /* IN: gref to debug         */
>              } u;
>          } debug;
> -        struct mem_sharing_op_fork {      /* OP_FORK */
> +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
>              domid_t parent_domain;        /* IN: parent's domain id */
>  /* These flags only makes sense for short-lived forks */
>  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
>  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
>  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
> +#define XENMEM_FORK_RESET_STATE        (1u << 3)
> +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
>              uint16_t flags;               /* IN: optional settings */
>              uint32_t pad;                 /* Must be set to 0 */
>          } fork;
> diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
> index bb003d21d0..81c2ee28cc 100644
> --- a/xen/include/public/vm_event.h
> +++ b/xen/include/public/vm_event.h
> @@ -127,6 +127,14 @@
>   * Reset the vmtrace buffer (if vmtrace is enabled)
>   */
>  #define VM_EVENT_FLAG_RESET_VMTRACE      (1 << 13)
> +/*
> + * Reset the VM state (if VM is fork)
> + */
> +#define VM_EVENT_FLAG_RESET_FORK_STATE   (1 << 14)
> +/*
> + * Remove unshared entried from physmap (if VM is fork)
> + */
> +#define VM_EVENT_FLAG_RESET_FORK_MEMORY  (1 << 15)

I'm confused about why two different interfaces are added to do this
kind of selective resets, one to vm_event and one to xenmem_fork?

I thin k the natural place for the option to live would be
XENMEM_FORK?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
  2022-03-24 15:15   ` Tamas K Lengyel
@ 2022-03-24 15:51     ` Roger Pau Monné
  2022-03-24 16:27       ` Tamas K Lengyel
  0 siblings, 1 reply; 26+ messages in thread
From: Roger Pau Monné @ 2022-03-24 15:51 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Tamas K Lengyel, xen-devel, Jan Beulich, Andrew Cooper, Wei Liu,
	George Dunlap, Julien Grall, Stefano Stabellini

On Thu, Mar 24, 2022 at 11:15:08AM -0400, Tamas K Lengyel wrote:
> On Thu, Mar 24, 2022 at 10:50 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > Add option to the fork memop to skip populating the fork with special pages.
> > > These special pages are only necessary when setting up forks to be fully
> > > functional with a toolstack. For short-lived forks where no toolstack is active
> > > these pages are uneccesary.
> >
> > I'm not sure those are strictly related to having a toolstack. For
> > example the vcpu_info has nothing to do with having a toolstack, and
> > is only used by the guest in order to receive events or fetch time
> > related data. So while a short-lived fork might not make use of those,
> > that has nothing to do with having a toolstack or not.
> 
> Fair enough, the point is that the short live fork doesn't use these pages.
> 
> > >
> > > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> > > ---
> > >  xen/arch/x86/include/asm/hvm/domain.h |  4 +++-
> > >  xen/arch/x86/mm/mem_sharing.c         | 33 +++++++++++++++++----------
> > >  xen/include/public/memory.h           |  4 ++--
> > >  3 files changed, 26 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
> > > index 698455444e..446cd06411 100644
> > > --- a/xen/arch/x86/include/asm/hvm/domain.h
> > > +++ b/xen/arch/x86/include/asm/hvm/domain.h
> > > @@ -31,7 +31,9 @@
> > >  #ifdef CONFIG_MEM_SHARING
> > >  struct mem_sharing_domain
> > >  {
> > > -    bool enabled, block_interrupts;
> > > +    bool enabled;
> > > +    bool block_interrupts;
> > > +    bool skip_special_pages;
> > >
> > >      /*
> > >       * When releasing shared gfn's in a preemptible manner, recall where
> > > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > > index 15e6a7ed81..84c04ddfa3 100644
> > > --- a/xen/arch/x86/mm/mem_sharing.c
> > > +++ b/xen/arch/x86/mm/mem_sharing.c
> > > @@ -1643,7 +1643,8 @@ static int bring_up_vcpus(struct domain *cd, struct domain *d)
> > >      return 0;
> > >  }
> > >
> > > -static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> > > +static int copy_vcpu_settings(struct domain *cd, const struct domain *d,
> > > +                              bool skip_special_pages)
> > >  {
> > >      unsigned int i;
> > >      struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > > @@ -1660,7 +1661,7 @@ static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> > >
> > >          /* Copy & map in the vcpu_info page if the guest uses one */
> > >          vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> > > -        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > > +        if ( !skip_special_pages && !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > >          {
> > >              mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> > >
> > > @@ -1807,17 +1808,18 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
> > >      return 0;
> > >  }
> > >
> > > -static int copy_settings(struct domain *cd, struct domain *d)
> > > +static int copy_settings(struct domain *cd, struct domain *d,
> > > +                         bool skip_special_pages)
> > >  {
> > >      int rc;
> > >
> > > -    if ( (rc = copy_vcpu_settings(cd, d)) )
> > > +    if ( (rc = copy_vcpu_settings(cd, d, skip_special_pages)) )
> > >          return rc;
> > >
> > >      if ( (rc = hvm_copy_context_and_params(cd, d)) )
> > >          return rc;
> > >
> > > -    if ( (rc = copy_special_pages(cd, d)) )
> > > +    if ( !skip_special_pages && (rc = copy_special_pages(cd, d)) )
> > >          return rc;
> > >
> > >      copy_tsc(cd, d);
> > > @@ -1826,9 +1828,11 @@ static int copy_settings(struct domain *cd, struct domain *d)
> > >      return rc;
> > >  }
> > >
> > > -static int fork(struct domain *cd, struct domain *d)
> > > +static int fork(struct domain *cd, struct domain *d, uint16_t flags)
> > >  {
> > >      int rc = -EBUSY;
> > > +    bool block_interrupts = flags & XENMEM_FORK_BLOCK_INTERRUPTS;
> > > +    bool skip_special_pages = flags & XENMEM_FORK_SKIP_SPECIAL_PAGES;
> > >
> > >      if ( !cd->controller_pause_count )
> > >          return rc;
> > > @@ -1856,7 +1860,13 @@ static int fork(struct domain *cd, struct domain *d)
> > >      if ( (rc = bring_up_vcpus(cd, d)) )
> > >          goto done;
> > >
> > > -    rc = copy_settings(cd, d);
> > > +    if ( !(rc = copy_settings(cd, d, skip_special_pages)) )
> >
> > Can you set
> > cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages} earlier
> > so that you don't need to pass the options around to copy_settings and
> > copy_vcpu_settings?
> 
> Would be possible yes, but then we would have to clear them in case
> the forking failed at any point. Setting them only at the end when the
> fork finished ensures that those fields are only ever valid if the VM
> is a fork. Both are valid approaches and I prefer this over the other.

I think I'm confused, doesn't the fork get destroyed if there's a
failure? In which case the values
cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages}
shouldn't really matter?

> >
> > > +    {
> > > +        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
> > > +        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
> > > +        /* skip mapping the vAPIC page on unpause if skipping special pages */
> > > +        cd->creation_finished = skip_special_pages;
> >
> > I think this is dangerous. While it might be true at the moment that
> > the arch_domain_creation_finished only maps the vAPIC page, there's no
> > guarantee it couldn't do other stuff in the future that could be
> > required for the VM to be started.
> 
> I understand this domain_creation_finished route could be expanded in
> the future to include other stuff, but IMHO we can evaluate what to do
> about that when and if it becomes necessary. This is all experimental
> to begin with.

Maybe you could check the skip_special_pages field from
domain_creation_finished in order to decide whether the vAPIC page
needs to be mapped?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-24 15:45   ` Roger Pau Monné
@ 2022-03-24 15:52     ` Tamas K Lengyel
  2022-03-24 16:03       ` Roger Pau Monné
  0 siblings, 1 reply; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-24 15:52 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, xen-devel, Wei Liu, Anthony PERARD,
	Juergen Gross, Andrew Cooper, George Dunlap, Jan Beulich,
	Julien Grall, Stefano Stabellini, Alexandru Isaila,
	Petre Pircalabu

On Thu, Mar 24, 2022 at 11:46 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
> > diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
> > index 208d8dcbd9..30ce23c5a7 100644
> > --- a/xen/include/public/memory.h
> > +++ b/xen/include/public/memory.h
> > @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
> >                  uint32_t gref;     /* IN: gref to debug         */
> >              } u;
> >          } debug;
> > -        struct mem_sharing_op_fork {      /* OP_FORK */
> > +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
> >              domid_t parent_domain;        /* IN: parent's domain id */
> >  /* These flags only makes sense for short-lived forks */
> >  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
> >  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
> >  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
> > +#define XENMEM_FORK_RESET_STATE        (1u << 3)
> > +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
> >              uint16_t flags;               /* IN: optional settings */
> >              uint32_t pad;                 /* Must be set to 0 */
> >          } fork;
> > diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
> > index bb003d21d0..81c2ee28cc 100644
> > --- a/xen/include/public/vm_event.h
> > +++ b/xen/include/public/vm_event.h
> > @@ -127,6 +127,14 @@
> >   * Reset the vmtrace buffer (if vmtrace is enabled)
> >   */
> >  #define VM_EVENT_FLAG_RESET_VMTRACE      (1 << 13)
> > +/*
> > + * Reset the VM state (if VM is fork)
> > + */
> > +#define VM_EVENT_FLAG_RESET_FORK_STATE   (1 << 14)
> > +/*
> > + * Remove unshared entried from physmap (if VM is fork)
> > + */
> > +#define VM_EVENT_FLAG_RESET_FORK_MEMORY  (1 << 15)
>
> I'm confused about why two different interfaces are added to do this
> kind of selective resets, one to vm_event and one to xenmem_fork?
>
> I thin k the natural place for the option to live would be
> XENMEM_FORK?

Yes, that's the natural place for it. But we are adding it to both for
a reason. In our use-case the reset operation will happen after a
vm_event is received to which we already must send a reply. Setting
the flag on the vm_event reply saves us having to issue an extra memop
hypercall afterwards.

Tamas


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-24 15:52     ` Tamas K Lengyel
@ 2022-03-24 16:03       ` Roger Pau Monné
  2022-03-24 16:22         ` Tamas K Lengyel
  0 siblings, 1 reply; 26+ messages in thread
From: Roger Pau Monné @ 2022-03-24 16:03 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Tamas K Lengyel, xen-devel, Wei Liu, Anthony PERARD,
	Juergen Gross, Andrew Cooper, George Dunlap, Jan Beulich,
	Julien Grall, Stefano Stabellini, Alexandru Isaila,
	Petre Pircalabu

On Thu, Mar 24, 2022 at 11:52:38AM -0400, Tamas K Lengyel wrote:
> On Thu, Mar 24, 2022 at 11:46 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
> > > diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
> > > index 208d8dcbd9..30ce23c5a7 100644
> > > --- a/xen/include/public/memory.h
> > > +++ b/xen/include/public/memory.h
> > > @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
> > >                  uint32_t gref;     /* IN: gref to debug         */
> > >              } u;
> > >          } debug;
> > > -        struct mem_sharing_op_fork {      /* OP_FORK */
> > > +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
> > >              domid_t parent_domain;        /* IN: parent's domain id */
> > >  /* These flags only makes sense for short-lived forks */
> > >  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
> > >  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
> > >  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
> > > +#define XENMEM_FORK_RESET_STATE        (1u << 3)
> > > +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
> > >              uint16_t flags;               /* IN: optional settings */
> > >              uint32_t pad;                 /* Must be set to 0 */
> > >          } fork;
> > > diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
> > > index bb003d21d0..81c2ee28cc 100644
> > > --- a/xen/include/public/vm_event.h
> > > +++ b/xen/include/public/vm_event.h
> > > @@ -127,6 +127,14 @@
> > >   * Reset the vmtrace buffer (if vmtrace is enabled)
> > >   */
> > >  #define VM_EVENT_FLAG_RESET_VMTRACE      (1 << 13)
> > > +/*
> > > + * Reset the VM state (if VM is fork)
> > > + */
> > > +#define VM_EVENT_FLAG_RESET_FORK_STATE   (1 << 14)
> > > +/*
> > > + * Remove unshared entried from physmap (if VM is fork)
> > > + */
> > > +#define VM_EVENT_FLAG_RESET_FORK_MEMORY  (1 << 15)
> >
> > I'm confused about why two different interfaces are added to do this
> > kind of selective resets, one to vm_event and one to xenmem_fork?
> >
> > I thin k the natural place for the option to live would be
> > XENMEM_FORK?
> 
> Yes, that's the natural place for it. But we are adding it to both for
> a reason. In our use-case the reset operation will happen after a
> vm_event is received to which we already must send a reply. Setting
> the flag on the vm_event reply saves us having to issue an extra memop
> hypercall afterwards.

Can you do a multicall and batch both operations in a single
hypercall?

That would seem more natural than adding duplicated interfaces.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-24 16:03       ` Roger Pau Monné
@ 2022-03-24 16:22         ` Tamas K Lengyel
  2022-03-24 16:43           ` Roger Pau Monné
  0 siblings, 1 reply; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-24 16:22 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, xen-devel, Wei Liu, Anthony PERARD,
	Juergen Gross, Andrew Cooper, George Dunlap, Jan Beulich,
	Julien Grall, Stefano Stabellini, Alexandru Isaila,
	Petre Pircalabu

On Thu, Mar 24, 2022 at 12:04 PM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Thu, Mar 24, 2022 at 11:52:38AM -0400, Tamas K Lengyel wrote:
> > On Thu, Mar 24, 2022 at 11:46 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > >
> > > On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
> > > > diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
> > > > index 208d8dcbd9..30ce23c5a7 100644
> > > > --- a/xen/include/public/memory.h
> > > > +++ b/xen/include/public/memory.h
> > > > @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
> > > >                  uint32_t gref;     /* IN: gref to debug         */
> > > >              } u;
> > > >          } debug;
> > > > -        struct mem_sharing_op_fork {      /* OP_FORK */
> > > > +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
> > > >              domid_t parent_domain;        /* IN: parent's domain id */
> > > >  /* These flags only makes sense for short-lived forks */
> > > >  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
> > > >  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
> > > >  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
> > > > +#define XENMEM_FORK_RESET_STATE        (1u << 3)
> > > > +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
> > > >              uint16_t flags;               /* IN: optional settings */
> > > >              uint32_t pad;                 /* Must be set to 0 */
> > > >          } fork;
> > > > diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
> > > > index bb003d21d0..81c2ee28cc 100644
> > > > --- a/xen/include/public/vm_event.h
> > > > +++ b/xen/include/public/vm_event.h
> > > > @@ -127,6 +127,14 @@
> > > >   * Reset the vmtrace buffer (if vmtrace is enabled)
> > > >   */
> > > >  #define VM_EVENT_FLAG_RESET_VMTRACE      (1 << 13)
> > > > +/*
> > > > + * Reset the VM state (if VM is fork)
> > > > + */
> > > > +#define VM_EVENT_FLAG_RESET_FORK_STATE   (1 << 14)
> > > > +/*
> > > > + * Remove unshared entried from physmap (if VM is fork)
> > > > + */
> > > > +#define VM_EVENT_FLAG_RESET_FORK_MEMORY  (1 << 15)
> > >
> > > I'm confused about why two different interfaces are added to do this
> > > kind of selective resets, one to vm_event and one to xenmem_fork?
> > >
> > > I thin k the natural place for the option to live would be
> > > XENMEM_FORK?
> >
> > Yes, that's the natural place for it. But we are adding it to both for
> > a reason. In our use-case the reset operation will happen after a
> > vm_event is received to which we already must send a reply. Setting
> > the flag on the vm_event reply saves us having to issue an extra memop
> > hypercall afterwards.
>
> Can you do a multicall and batch both operations in a single
> hypercall?
>
> That would seem more natural than adding duplicated interfaces.

Not in a straight forward way, no. There is no exposed API in libxc to
do a multicall. Even if that was an option it is still easier for me
to just flip a bit in the response field than having to construct a
whole standalone hypercall structure to be sent as part of a
multicall.

Tamas


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
  2022-03-24 15:51     ` Roger Pau Monné
@ 2022-03-24 16:27       ` Tamas K Lengyel
  2022-03-25 10:25         ` Roger Pau Monné
  0 siblings, 1 reply; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-24 16:27 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, xen-devel, Jan Beulich, Andrew Cooper, Wei Liu,
	George Dunlap, Julien Grall, Stefano Stabellini

On Thu, Mar 24, 2022 at 11:51 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Thu, Mar 24, 2022 at 11:15:08AM -0400, Tamas K Lengyel wrote:
> > On Thu, Mar 24, 2022 at 10:50 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > >
> > > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > > Add option to the fork memop to skip populating the fork with special pages.
> > > > These special pages are only necessary when setting up forks to be fully
> > > > functional with a toolstack. For short-lived forks where no toolstack is active
> > > > these pages are uneccesary.
> > >
> > > I'm not sure those are strictly related to having a toolstack. For
> > > example the vcpu_info has nothing to do with having a toolstack, and
> > > is only used by the guest in order to receive events or fetch time
> > > related data. So while a short-lived fork might not make use of those,
> > > that has nothing to do with having a toolstack or not.
> >
> > Fair enough, the point is that the short live fork doesn't use these pages.
> >
> > > >
> > > > Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
> > > > ---
> > > >  xen/arch/x86/include/asm/hvm/domain.h |  4 +++-
> > > >  xen/arch/x86/mm/mem_sharing.c         | 33 +++++++++++++++++----------
> > > >  xen/include/public/memory.h           |  4 ++--
> > > >  3 files changed, 26 insertions(+), 15 deletions(-)
> > > >
> > > > diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
> > > > index 698455444e..446cd06411 100644
> > > > --- a/xen/arch/x86/include/asm/hvm/domain.h
> > > > +++ b/xen/arch/x86/include/asm/hvm/domain.h
> > > > @@ -31,7 +31,9 @@
> > > >  #ifdef CONFIG_MEM_SHARING
> > > >  struct mem_sharing_domain
> > > >  {
> > > > -    bool enabled, block_interrupts;
> > > > +    bool enabled;
> > > > +    bool block_interrupts;
> > > > +    bool skip_special_pages;
> > > >
> > > >      /*
> > > >       * When releasing shared gfn's in a preemptible manner, recall where
> > > > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > > > index 15e6a7ed81..84c04ddfa3 100644
> > > > --- a/xen/arch/x86/mm/mem_sharing.c
> > > > +++ b/xen/arch/x86/mm/mem_sharing.c
> > > > @@ -1643,7 +1643,8 @@ static int bring_up_vcpus(struct domain *cd, struct domain *d)
> > > >      return 0;
> > > >  }
> > > >
> > > > -static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> > > > +static int copy_vcpu_settings(struct domain *cd, const struct domain *d,
> > > > +                              bool skip_special_pages)
> > > >  {
> > > >      unsigned int i;
> > > >      struct p2m_domain *p2m = p2m_get_hostp2m(cd);
> > > > @@ -1660,7 +1661,7 @@ static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
> > > >
> > > >          /* Copy & map in the vcpu_info page if the guest uses one */
> > > >          vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
> > > > -        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > > > +        if ( !skip_special_pages && !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
> > > >          {
> > > >              mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
> > > >
> > > > @@ -1807,17 +1808,18 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
> > > >      return 0;
> > > >  }
> > > >
> > > > -static int copy_settings(struct domain *cd, struct domain *d)
> > > > +static int copy_settings(struct domain *cd, struct domain *d,
> > > > +                         bool skip_special_pages)
> > > >  {
> > > >      int rc;
> > > >
> > > > -    if ( (rc = copy_vcpu_settings(cd, d)) )
> > > > +    if ( (rc = copy_vcpu_settings(cd, d, skip_special_pages)) )
> > > >          return rc;
> > > >
> > > >      if ( (rc = hvm_copy_context_and_params(cd, d)) )
> > > >          return rc;
> > > >
> > > > -    if ( (rc = copy_special_pages(cd, d)) )
> > > > +    if ( !skip_special_pages && (rc = copy_special_pages(cd, d)) )
> > > >          return rc;
> > > >
> > > >      copy_tsc(cd, d);
> > > > @@ -1826,9 +1828,11 @@ static int copy_settings(struct domain *cd, struct domain *d)
> > > >      return rc;
> > > >  }
> > > >
> > > > -static int fork(struct domain *cd, struct domain *d)
> > > > +static int fork(struct domain *cd, struct domain *d, uint16_t flags)
> > > >  {
> > > >      int rc = -EBUSY;
> > > > +    bool block_interrupts = flags & XENMEM_FORK_BLOCK_INTERRUPTS;
> > > > +    bool skip_special_pages = flags & XENMEM_FORK_SKIP_SPECIAL_PAGES;
> > > >
> > > >      if ( !cd->controller_pause_count )
> > > >          return rc;
> > > > @@ -1856,7 +1860,13 @@ static int fork(struct domain *cd, struct domain *d)
> > > >      if ( (rc = bring_up_vcpus(cd, d)) )
> > > >          goto done;
> > > >
> > > > -    rc = copy_settings(cd, d);
> > > > +    if ( !(rc = copy_settings(cd, d, skip_special_pages)) )
> > >
> > > Can you set
> > > cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages} earlier
> > > so that you don't need to pass the options around to copy_settings and
> > > copy_vcpu_settings?
> >
> > Would be possible yes, but then we would have to clear them in case
> > the forking failed at any point. Setting them only at the end when the
> > fork finished ensures that those fields are only ever valid if the VM
> > is a fork. Both are valid approaches and I prefer this over the other.
>
> I think I'm confused, doesn't the fork get destroyed if there's a
> failure? In which case the values
> cd->arch.hvm.mem_sharing.{block_interrupts,skip_special_pages}
> shouldn't really matter?

No, the domain that will be a fork is just a regular domain until the
fork operation completes. If the fork operation fails the domain
remains.

> > >
> > > > +    {
> > > > +        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
> > > > +        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
> > > > +        /* skip mapping the vAPIC page on unpause if skipping special pages */
> > > > +        cd->creation_finished = skip_special_pages;
> > >
> > > I think this is dangerous. While it might be true at the moment that
> > > the arch_domain_creation_finished only maps the vAPIC page, there's no
> > > guarantee it couldn't do other stuff in the future that could be
> > > required for the VM to be started.
> >
> > I understand this domain_creation_finished route could be expanded in
> > the future to include other stuff, but IMHO we can evaluate what to do
> > about that when and if it becomes necessary. This is all experimental
> > to begin with.
>
> Maybe you could check the skip_special_pages field from
> domain_creation_finished in order to decide whether the vAPIC page
> needs to be mapped?

Could certainly do that but it means adding experimental code in an
#ifdef to the vAPIC mapping logic. Technically nothing wrong with that
but I would prefer keeping all this code in a single-place if
possible.

Tamas


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-24 16:22         ` Tamas K Lengyel
@ 2022-03-24 16:43           ` Roger Pau Monné
  2022-03-24 17:02             ` Tamas K Lengyel
  0 siblings, 1 reply; 26+ messages in thread
From: Roger Pau Monné @ 2022-03-24 16:43 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Tamas K Lengyel, xen-devel, Wei Liu, Anthony PERARD,
	Juergen Gross, Andrew Cooper, George Dunlap, Jan Beulich,
	Julien Grall, Stefano Stabellini, Alexandru Isaila,
	Petre Pircalabu

On Thu, Mar 24, 2022 at 12:22:49PM -0400, Tamas K Lengyel wrote:
> On Thu, Mar 24, 2022 at 12:04 PM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 11:52:38AM -0400, Tamas K Lengyel wrote:
> > > On Thu, Mar 24, 2022 at 11:46 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > > >
> > > > On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
> > > > > diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
> > > > > index 208d8dcbd9..30ce23c5a7 100644
> > > > > --- a/xen/include/public/memory.h
> > > > > +++ b/xen/include/public/memory.h
> > > > > @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
> > > > >                  uint32_t gref;     /* IN: gref to debug         */
> > > > >              } u;
> > > > >          } debug;
> > > > > -        struct mem_sharing_op_fork {      /* OP_FORK */
> > > > > +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
> > > > >              domid_t parent_domain;        /* IN: parent's domain id */
> > > > >  /* These flags only makes sense for short-lived forks */
> > > > >  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
> > > > >  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
> > > > >  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
> > > > > +#define XENMEM_FORK_RESET_STATE        (1u << 3)
> > > > > +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
> > > > >              uint16_t flags;               /* IN: optional settings */
> > > > >              uint32_t pad;                 /* Must be set to 0 */
> > > > >          } fork;
> > > > > diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
> > > > > index bb003d21d0..81c2ee28cc 100644
> > > > > --- a/xen/include/public/vm_event.h
> > > > > +++ b/xen/include/public/vm_event.h
> > > > > @@ -127,6 +127,14 @@
> > > > >   * Reset the vmtrace buffer (if vmtrace is enabled)
> > > > >   */
> > > > >  #define VM_EVENT_FLAG_RESET_VMTRACE      (1 << 13)
> > > > > +/*
> > > > > + * Reset the VM state (if VM is fork)
> > > > > + */
> > > > > +#define VM_EVENT_FLAG_RESET_FORK_STATE   (1 << 14)
> > > > > +/*
> > > > > + * Remove unshared entried from physmap (if VM is fork)
> > > > > + */
> > > > > +#define VM_EVENT_FLAG_RESET_FORK_MEMORY  (1 << 15)
> > > >
> > > > I'm confused about why two different interfaces are added to do this
> > > > kind of selective resets, one to vm_event and one to xenmem_fork?
> > > >
> > > > I thin k the natural place for the option to live would be
> > > > XENMEM_FORK?
> > >
> > > Yes, that's the natural place for it. But we are adding it to both for
> > > a reason. In our use-case the reset operation will happen after a
> > > vm_event is received to which we already must send a reply. Setting
> > > the flag on the vm_event reply saves us having to issue an extra memop
> > > hypercall afterwards.
> >
> > Can you do a multicall and batch both operations in a single
> > hypercall?
> >
> > That would seem more natural than adding duplicated interfaces.
> 
> Not in a straight forward way, no. There is no exposed API in libxc to
> do a multicall. Even if that was an option it is still easier for me
> to just flip a bit in the response field than having to construct a
> whole standalone hypercall structure to be sent as part of a
> multicall.

Right, I can see it being easier, but it seems like a bad choice from
an interface PoV. You are the maintainer of both subsystems, but it
would seem to me it's in your best interest to try to keep the
interfaces separated and clean.

Would it be possible for the reset XENMEM_FORK op to have the side
effect of performing what you would instead do with the vm_event
hypercall?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-24 16:43           ` Roger Pau Monné
@ 2022-03-24 17:02             ` Tamas K Lengyel
  2022-03-25  9:04               ` Jan Beulich
  0 siblings, 1 reply; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-24 17:02 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, xen-devel, Wei Liu, Anthony PERARD,
	Juergen Gross, Andrew Cooper, George Dunlap, Jan Beulich,
	Julien Grall, Stefano Stabellini, Alexandru Isaila,
	Petre Pircalabu

On Thu, Mar 24, 2022 at 12:44 PM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Thu, Mar 24, 2022 at 12:22:49PM -0400, Tamas K Lengyel wrote:
> > On Thu, Mar 24, 2022 at 12:04 PM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > >
> > > On Thu, Mar 24, 2022 at 11:52:38AM -0400, Tamas K Lengyel wrote:
> > > > On Thu, Mar 24, 2022 at 11:46 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > > > >
> > > > > On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
> > > > > > diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
> > > > > > index 208d8dcbd9..30ce23c5a7 100644
> > > > > > --- a/xen/include/public/memory.h
> > > > > > +++ b/xen/include/public/memory.h
> > > > > > @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
> > > > > >                  uint32_t gref;     /* IN: gref to debug         */
> > > > > >              } u;
> > > > > >          } debug;
> > > > > > -        struct mem_sharing_op_fork {      /* OP_FORK */
> > > > > > +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
> > > > > >              domid_t parent_domain;        /* IN: parent's domain id */
> > > > > >  /* These flags only makes sense for short-lived forks */
> > > > > >  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
> > > > > >  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
> > > > > >  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
> > > > > > +#define XENMEM_FORK_RESET_STATE        (1u << 3)
> > > > > > +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
> > > > > >              uint16_t flags;               /* IN: optional settings */
> > > > > >              uint32_t pad;                 /* Must be set to 0 */
> > > > > >          } fork;
> > > > > > diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
> > > > > > index bb003d21d0..81c2ee28cc 100644
> > > > > > --- a/xen/include/public/vm_event.h
> > > > > > +++ b/xen/include/public/vm_event.h
> > > > > > @@ -127,6 +127,14 @@
> > > > > >   * Reset the vmtrace buffer (if vmtrace is enabled)
> > > > > >   */
> > > > > >  #define VM_EVENT_FLAG_RESET_VMTRACE      (1 << 13)
> > > > > > +/*
> > > > > > + * Reset the VM state (if VM is fork)
> > > > > > + */
> > > > > > +#define VM_EVENT_FLAG_RESET_FORK_STATE   (1 << 14)
> > > > > > +/*
> > > > > > + * Remove unshared entried from physmap (if VM is fork)
> > > > > > + */
> > > > > > +#define VM_EVENT_FLAG_RESET_FORK_MEMORY  (1 << 15)
> > > > >
> > > > > I'm confused about why two different interfaces are added to do this
> > > > > kind of selective resets, one to vm_event and one to xenmem_fork?
> > > > >
> > > > > I thin k the natural place for the option to live would be
> > > > > XENMEM_FORK?
> > > >
> > > > Yes, that's the natural place for it. But we are adding it to both for
> > > > a reason. In our use-case the reset operation will happen after a
> > > > vm_event is received to which we already must send a reply. Setting
> > > > the flag on the vm_event reply saves us having to issue an extra memop
> > > > hypercall afterwards.
> > >
> > > Can you do a multicall and batch both operations in a single
> > > hypercall?
> > >
> > > That would seem more natural than adding duplicated interfaces.
> >
> > Not in a straight forward way, no. There is no exposed API in libxc to
> > do a multicall. Even if that was an option it is still easier for me
> > to just flip a bit in the response field than having to construct a
> > whole standalone hypercall structure to be sent as part of a
> > multicall.
>
> Right, I can see it being easier, but it seems like a bad choice from
> an interface PoV. You are the maintainer of both subsystems, but it
> would seem to me it's in your best interest to try to keep the
> interfaces separated and clean.
>
> Would it be possible for the reset XENMEM_FORK op to have the side
> effect of performing what you would instead do with the vm_event
> hypercall?

Yes, the event response is really just an event channel signal to Xen,
so the memop hypercall could similarly encode the "now check the
vm_event response" as an optional field. But why is that any better
than the current event channel route processing the vm_response
encoding the "now do these ops on the fork"?

We already have a bunch of different operations you can encode in the
vm_event response field, so it reduces the complexity on the toolstack
side since I don't have to switch around which hypercall I need to
issue depending on what extra ops I want to put into a single
hypercall.

Tamas


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-24 17:02             ` Tamas K Lengyel
@ 2022-03-25  9:04               ` Jan Beulich
  2022-03-25 10:48                 ` Tamas K Lengyel
  0 siblings, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2022-03-25  9:04 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Tamas K Lengyel, xen-devel, Wei Liu, Anthony PERARD,
	Juergen Gross, Andrew Cooper, George Dunlap, Julien Grall,
	Stefano Stabellini, Alexandru Isaila, Petre Pircalabu,
	Roger Pau Monné

On 24.03.2022 18:02, Tamas K Lengyel wrote:
> On Thu, Mar 24, 2022 at 12:44 PM Roger Pau Monné <roger.pau@citrix.com> wrote:
>>
>> On Thu, Mar 24, 2022 at 12:22:49PM -0400, Tamas K Lengyel wrote:
>>> On Thu, Mar 24, 2022 at 12:04 PM Roger Pau Monné <roger.pau@citrix.com> wrote:
>>>>
>>>> On Thu, Mar 24, 2022 at 11:52:38AM -0400, Tamas K Lengyel wrote:
>>>>> On Thu, Mar 24, 2022 at 11:46 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>>>>>>
>>>>>> On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
>>>>>>> diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
>>>>>>> index 208d8dcbd9..30ce23c5a7 100644
>>>>>>> --- a/xen/include/public/memory.h
>>>>>>> +++ b/xen/include/public/memory.h
>>>>>>> @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
>>>>>>>                  uint32_t gref;     /* IN: gref to debug         */
>>>>>>>              } u;
>>>>>>>          } debug;
>>>>>>> -        struct mem_sharing_op_fork {      /* OP_FORK */
>>>>>>> +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
>>>>>>>              domid_t parent_domain;        /* IN: parent's domain id */
>>>>>>>  /* These flags only makes sense for short-lived forks */
>>>>>>>  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
>>>>>>>  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
>>>>>>>  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
>>>>>>> +#define XENMEM_FORK_RESET_STATE        (1u << 3)
>>>>>>> +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
>>>>>>>              uint16_t flags;               /* IN: optional settings */
>>>>>>>              uint32_t pad;                 /* Must be set to 0 */
>>>>>>>          } fork;
>>>>>>> diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
>>>>>>> index bb003d21d0..81c2ee28cc 100644
>>>>>>> --- a/xen/include/public/vm_event.h
>>>>>>> +++ b/xen/include/public/vm_event.h
>>>>>>> @@ -127,6 +127,14 @@
>>>>>>>   * Reset the vmtrace buffer (if vmtrace is enabled)
>>>>>>>   */
>>>>>>>  #define VM_EVENT_FLAG_RESET_VMTRACE      (1 << 13)
>>>>>>> +/*
>>>>>>> + * Reset the VM state (if VM is fork)
>>>>>>> + */
>>>>>>> +#define VM_EVENT_FLAG_RESET_FORK_STATE   (1 << 14)
>>>>>>> +/*
>>>>>>> + * Remove unshared entried from physmap (if VM is fork)
>>>>>>> + */
>>>>>>> +#define VM_EVENT_FLAG_RESET_FORK_MEMORY  (1 << 15)
>>>>>>
>>>>>> I'm confused about why two different interfaces are added to do this
>>>>>> kind of selective resets, one to vm_event and one to xenmem_fork?
>>>>>>
>>>>>> I thin k the natural place for the option to live would be
>>>>>> XENMEM_FORK?
>>>>>
>>>>> Yes, that's the natural place for it. But we are adding it to both for
>>>>> a reason. In our use-case the reset operation will happen after a
>>>>> vm_event is received to which we already must send a reply. Setting
>>>>> the flag on the vm_event reply saves us having to issue an extra memop
>>>>> hypercall afterwards.
>>>>
>>>> Can you do a multicall and batch both operations in a single
>>>> hypercall?
>>>>
>>>> That would seem more natural than adding duplicated interfaces.
>>>
>>> Not in a straight forward way, no. There is no exposed API in libxc to
>>> do a multicall. Even if that was an option it is still easier for me
>>> to just flip a bit in the response field than having to construct a
>>> whole standalone hypercall structure to be sent as part of a
>>> multicall.
>>
>> Right, I can see it being easier, but it seems like a bad choice from
>> an interface PoV. You are the maintainer of both subsystems, but it
>> would seem to me it's in your best interest to try to keep the
>> interfaces separated and clean.
>>
>> Would it be possible for the reset XENMEM_FORK op to have the side
>> effect of performing what you would instead do with the vm_event
>> hypercall?
> 
> Yes, the event response is really just an event channel signal to Xen,
> so the memop hypercall could similarly encode the "now check the
> vm_event response" as an optional field. But why is that any better
> than the current event channel route processing the vm_response
> encoding the "now do these ops on the fork"?

Well, as Roger said: Less duplication in the interface.

> We already have a bunch of different operations you can encode in the
> vm_event response field, so it reduces the complexity on the toolstack
> side since I don't have to switch around which hypercall I need to
> issue depending on what extra ops I want to put into a single
> hypercall.

The two goals need to be weighed against one another; for the moment
I think I'm with Roger aiming at a clean interface.

Jan



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
  2022-03-24 16:27       ` Tamas K Lengyel
@ 2022-03-25 10:25         ` Roger Pau Monné
  2022-03-25 10:51           ` Tamas K Lengyel
  0 siblings, 1 reply; 26+ messages in thread
From: Roger Pau Monné @ 2022-03-25 10:25 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Tamas K Lengyel, xen-devel, Jan Beulich, Andrew Cooper, Wei Liu,
	George Dunlap, Julien Grall, Stefano Stabellini

On Thu, Mar 24, 2022 at 12:27:02PM -0400, Tamas K Lengyel wrote:
> On Thu, Mar 24, 2022 at 11:51 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 11:15:08AM -0400, Tamas K Lengyel wrote:
> > > On Thu, Mar 24, 2022 at 10:50 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > > >
> > > > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > > > +    {
> > > > > +        cd->arch.hvm.mem_sharing.block_interrupts = block_interrupts;
> > > > > +        cd->arch.hvm.mem_sharing.skip_special_pages = skip_special_pages;
> > > > > +        /* skip mapping the vAPIC page on unpause if skipping special pages */
> > > > > +        cd->creation_finished = skip_special_pages;
> > > >
> > > > I think this is dangerous. While it might be true at the moment that
> > > > the arch_domain_creation_finished only maps the vAPIC page, there's no
> > > > guarantee it couldn't do other stuff in the future that could be
> > > > required for the VM to be started.
> > >
> > > I understand this domain_creation_finished route could be expanded in
> > > the future to include other stuff, but IMHO we can evaluate what to do
> > > about that when and if it becomes necessary. This is all experimental
> > > to begin with.
> >
> > Maybe you could check the skip_special_pages field from
> > domain_creation_finished in order to decide whether the vAPIC page
> > needs to be mapped?
> 
> Could certainly do that but it means adding experimental code in an
> #ifdef to the vAPIC mapping logic. Technically nothing wrong with that
> but I would prefer keeping all this code in a single-place if
> possible.

I see, while I agree it's best to keep the code contained when
possible, I think in this case it's better to modify the hook,
specially because further changes to domain_creation_finished will
easily spot that this path is special cases for VM forks.

While the code is experimental it doesn't mean it shouldn't be
properly integrated with the rest of the code base.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-25  9:04               ` Jan Beulich
@ 2022-03-25 10:48                 ` Tamas K Lengyel
  2022-03-25 12:14                   ` Roger Pau Monné
  0 siblings, 1 reply; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-25 10:48 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Tamas K Lengyel, Xen-devel, Wei Liu, Anthony PERARD,
	Juergen Gross, Andrew Cooper, George Dunlap, Julien Grall,
	Stefano Stabellini, Alexandru Isaila, Petre Pircalabu,
	Roger Pau Monné

[-- Attachment #1: Type: text/plain, Size: 5496 bytes --]

On Fri, Mar 25, 2022, 5:04 AM Jan Beulich <jbeulich@suse.com> wrote:

> On 24.03.2022 18:02, Tamas K Lengyel wrote:
> > On Thu, Mar 24, 2022 at 12:44 PM Roger Pau Monné <roger.pau@citrix.com>
> wrote:
> >>
> >> On Thu, Mar 24, 2022 at 12:22:49PM -0400, Tamas K Lengyel wrote:
> >>> On Thu, Mar 24, 2022 at 12:04 PM Roger Pau Monné <roger.pau@citrix.com>
> wrote:
> >>>>
> >>>> On Thu, Mar 24, 2022 at 11:52:38AM -0400, Tamas K Lengyel wrote:
> >>>>> On Thu, Mar 24, 2022 at 11:46 AM Roger Pau Monné <
> roger.pau@citrix.com> wrote:
> >>>>>>
> >>>>>> On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
> >>>>>>> diff --git a/xen/include/public/memory.h
> b/xen/include/public/memory.h
> >>>>>>> index 208d8dcbd9..30ce23c5a7 100644
> >>>>>>> --- a/xen/include/public/memory.h
> >>>>>>> +++ b/xen/include/public/memory.h
> >>>>>>> @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
> >>>>>>>                  uint32_t gref;     /* IN: gref to debug         */
> >>>>>>>              } u;
> >>>>>>>          } debug;
> >>>>>>> -        struct mem_sharing_op_fork {      /* OP_FORK */
> >>>>>>> +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
> >>>>>>>              domid_t parent_domain;        /* IN: parent's domain
> id */
> >>>>>>>  /* These flags only makes sense for short-lived forks */
> >>>>>>>  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
> >>>>>>>  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
> >>>>>>>  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
> >>>>>>> +#define XENMEM_FORK_RESET_STATE        (1u << 3)
> >>>>>>> +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
> >>>>>>>              uint16_t flags;               /* IN: optional
> settings */
> >>>>>>>              uint32_t pad;                 /* Must be set to 0 */
> >>>>>>>          } fork;
> >>>>>>> diff --git a/xen/include/public/vm_event.h
> b/xen/include/public/vm_event.h
> >>>>>>> index bb003d21d0..81c2ee28cc 100644
> >>>>>>> --- a/xen/include/public/vm_event.h
> >>>>>>> +++ b/xen/include/public/vm_event.h
> >>>>>>> @@ -127,6 +127,14 @@
> >>>>>>>   * Reset the vmtrace buffer (if vmtrace is enabled)
> >>>>>>>   */
> >>>>>>>  #define VM_EVENT_FLAG_RESET_VMTRACE      (1 << 13)
> >>>>>>> +/*
> >>>>>>> + * Reset the VM state (if VM is fork)
> >>>>>>> + */
> >>>>>>> +#define VM_EVENT_FLAG_RESET_FORK_STATE   (1 << 14)
> >>>>>>> +/*
> >>>>>>> + * Remove unshared entried from physmap (if VM is fork)
> >>>>>>> + */
> >>>>>>> +#define VM_EVENT_FLAG_RESET_FORK_MEMORY  (1 << 15)
> >>>>>>
> >>>>>> I'm confused about why two different interfaces are added to do this
> >>>>>> kind of selective resets, one to vm_event and one to xenmem_fork?
> >>>>>>
> >>>>>> I thin k the natural place for the option to live would be
> >>>>>> XENMEM_FORK?
> >>>>>
> >>>>> Yes, that's the natural place for it. But we are adding it to both
> for
> >>>>> a reason. In our use-case the reset operation will happen after a
> >>>>> vm_event is received to which we already must send a reply. Setting
> >>>>> the flag on the vm_event reply saves us having to issue an extra
> memop
> >>>>> hypercall afterwards.
> >>>>
> >>>> Can you do a multicall and batch both operations in a single
> >>>> hypercall?
> >>>>
> >>>> That would seem more natural than adding duplicated interfaces.
> >>>
> >>> Not in a straight forward way, no. There is no exposed API in libxc to
> >>> do a multicall. Even if that was an option it is still easier for me
> >>> to just flip a bit in the response field than having to construct a
> >>> whole standalone hypercall structure to be sent as part of a
> >>> multicall.
> >>
> >> Right, I can see it being easier, but it seems like a bad choice from
> >> an interface PoV. You are the maintainer of both subsystems, but it
> >> would seem to me it's in your best interest to try to keep the
> >> interfaces separated and clean.
> >>
> >> Would it be possible for the reset XENMEM_FORK op to have the side
> >> effect of performing what you would instead do with the vm_event
> >> hypercall?
> >
> > Yes, the event response is really just an event channel signal to Xen,
> > so the memop hypercall could similarly encode the "now check the
> > vm_event response" as an optional field. But why is that any better
> > than the current event channel route processing the vm_response
> > encoding the "now do these ops on the fork"?
>
> Well, as Roger said: Less duplication in the interface.
>

No, you would just duplicate something else instead, ie. the event channel
hypercall.


> > We already have a bunch of different operations you can encode in the
> > vm_event response field, so it reduces the complexity on the toolstack
> > side since I don't have to switch around which hypercall I need to
> > issue depending on what extra ops I want to put into a single
> > hypercall.
>
> The two goals need to be weighed against one another; for the moment
> I think I'm with Roger aiming at a clean interface.
>

It may look like that from the Xen side but from the toolstack side this is
actually the cleanest way to achieve what we need. The vm_event interfaces
are already strongly integrated with both the mem_sharing and mem_paging
subsystems so nothing is gained by now for no reason trying to keep them
separate. So I strongly disagree with this suggestion and I'm going to keep
it as-is. I appreciate the feedback nevertheless.

Tamas

>

[-- Attachment #2: Type: text/html, Size: 8120 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
  2022-03-25 10:25         ` Roger Pau Monné
@ 2022-03-25 10:51           ` Tamas K Lengyel
  0 siblings, 0 replies; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-25 10:51 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, Xen-devel, Jan Beulich, Andrew Cooper, Wei Liu,
	George Dunlap, Julien Grall, Stefano Stabellini

[-- Attachment #1: Type: text/plain, Size: 2284 bytes --]

On Fri, Mar 25, 2022, 6:25 AM Roger Pau Monné <roger.pau@citrix.com> wrote:

> On Thu, Mar 24, 2022 at 12:27:02PM -0400, Tamas K Lengyel wrote:
> > On Thu, Mar 24, 2022 at 11:51 AM Roger Pau Monné <roger.pau@citrix.com>
> wrote:
> > >
> > > On Thu, Mar 24, 2022 at 11:15:08AM -0400, Tamas K Lengyel wrote:
> > > > On Thu, Mar 24, 2022 at 10:50 AM Roger Pau Monné <
> roger.pau@citrix.com> wrote:
> > > > >
> > > > > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > > > > +    {
> > > > > > +        cd->arch.hvm.mem_sharing.block_interrupts =
> block_interrupts;
> > > > > > +        cd->arch.hvm.mem_sharing.skip_special_pages =
> skip_special_pages;
> > > > > > +        /* skip mapping the vAPIC page on unpause if skipping
> special pages */
> > > > > > +        cd->creation_finished = skip_special_pages;
> > > > >
> > > > > I think this is dangerous. While it might be true at the moment
> that
> > > > > the arch_domain_creation_finished only maps the vAPIC page,
> there's no
> > > > > guarantee it couldn't do other stuff in the future that could be
> > > > > required for the VM to be started.
> > > >
> > > > I understand this domain_creation_finished route could be expanded in
> > > > the future to include other stuff, but IMHO we can evaluate what to
> do
> > > > about that when and if it becomes necessary. This is all experimental
> > > > to begin with.
> > >
> > > Maybe you could check the skip_special_pages field from
> > > domain_creation_finished in order to decide whether the vAPIC page
> > > needs to be mapped?
> >
> > Could certainly do that but it means adding experimental code in an
> > #ifdef to the vAPIC mapping logic. Technically nothing wrong with that
> > but I would prefer keeping all this code in a single-place if
> > possible.
>
> I see, while I agree it's best to keep the code contained when
> possible, I think in this case it's better to modify the hook,
> specially because further changes to domain_creation_finished will
> easily spot that this path is special cases for VM forks.
>
> While the code is experimental it doesn't mean it shouldn't be
> properly integrated with the rest of the code base.
>

Sure, I'm fine with moving it there.

Tamas

>

[-- Attachment #2: Type: text/html, Size: 3349 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
  2022-03-22 17:41 [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork Tamas K Lengyel
                   ` (2 preceding siblings ...)
  2022-03-24 14:50 ` [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork Roger Pau Monné
@ 2022-03-25 10:59 ` Roger Pau Monné
  2022-03-25 11:15   ` Tamas K Lengyel
  3 siblings, 1 reply; 26+ messages in thread
From: Roger Pau Monné @ 2022-03-25 10:59 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: xen-devel, Jan Beulich, Andrew Cooper, Wei Liu, George Dunlap,
	Julien Grall, Stefano Stabellini, Tamas K Lengyel

On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> Add option to the fork memop to skip populating the fork with special pages.
> These special pages are only necessary when setting up forks to be fully
> functional with a toolstack. For short-lived forks where no toolstack is active
> these pages are uneccesary.

Replying here because there's no cover letter AFAICT.

For this kind of performance related changes it would be better if you
could provide some figures about the performance impact. It would help
if we knew whether avoiding mapping the vAPIC page means you can
create 0.1% more forks per-minute or 20%.

If you really want to speed up the forking path it might be good to start
by perf sampling Xen in order to find the bottlenecks?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
  2022-03-25 10:59 ` Roger Pau Monné
@ 2022-03-25 11:15   ` Tamas K Lengyel
  2022-03-25 11:31     ` Roger Pau Monné
  0 siblings, 1 reply; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-25 11:15 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, Xen-devel, Jan Beulich, Andrew Cooper, Wei Liu,
	George Dunlap, Julien Grall, Stefano Stabellini

[-- Attachment #1: Type: text/plain, Size: 1757 bytes --]

On Fri, Mar 25, 2022, 6:59 AM Roger Pau Monné <roger.pau@citrix.com> wrote:

> On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > Add option to the fork memop to skip populating the fork with special
> pages.
> > These special pages are only necessary when setting up forks to be fully
> > functional with a toolstack. For short-lived forks where no toolstack is
> active
> > these pages are uneccesary.
>
> Replying here because there's no cover letter AFAICT.
>
> For this kind of performance related changes it would be better if you
> could provide some figures about the performance impact. It would help
> if we knew whether avoiding mapping the vAPIC page means you can
> create 0.1% more forks per-minute or 20%.
>
> If you really want to speed up the forking path it might be good to start
> by perf sampling Xen in order to find the bottlenecks?
>

Sure but for experiment systems I don't think its necessary to collect that
data.

There is also a non-performance reason why we want to keep special pages
from being populated, in cases we really want the forks physmap to start
empty for better control over its state. There was already a case where
having special pages mapped in ended up triggering unexpected Xen behaviors
leading to chain of events not easy to follow. For example if page 0 gets
brought in while the vCPU is being created it ends up as a misconfigured
ept entry if nested virtualization is enabled. That leads to ept
misconfiguration exits instead of epf faults. Simply enforcing no entry in
the physmap until forking is complete eliminates the chance of something
like that happening again and makes reasoning about the VM's behavior from
the start easier.

Tamas

>

[-- Attachment #2: Type: text/html, Size: 2366 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
  2022-03-25 11:15   ` Tamas K Lengyel
@ 2022-03-25 11:31     ` Roger Pau Monné
  2022-03-25 12:13       ` Tamas K Lengyel
  0 siblings, 1 reply; 26+ messages in thread
From: Roger Pau Monné @ 2022-03-25 11:31 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Tamas K Lengyel, Xen-devel, Jan Beulich, Andrew Cooper, Wei Liu,
	George Dunlap, Julien Grall, Stefano Stabellini

On Fri, Mar 25, 2022 at 07:15:59AM -0400, Tamas K Lengyel wrote:
> On Fri, Mar 25, 2022, 6:59 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> 
> > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > Add option to the fork memop to skip populating the fork with special
> > pages.
> > > These special pages are only necessary when setting up forks to be fully
> > > functional with a toolstack. For short-lived forks where no toolstack is
> > active
> > > these pages are uneccesary.
> >
> > Replying here because there's no cover letter AFAICT.
> >
> > For this kind of performance related changes it would be better if you
> > could provide some figures about the performance impact. It would help
> > if we knew whether avoiding mapping the vAPIC page means you can
> > create 0.1% more forks per-minute or 20%.
> >
> > If you really want to speed up the forking path it might be good to start
> > by perf sampling Xen in order to find the bottlenecks?
> >
> 
> Sure but for experiment systems I don't think its necessary to collect that
> data.

It helps weight whether the extra logic is worth the performance
benefit IMO. Here it might not matter that much since you say there's
also a non-performance reason for the change.

> There is also a non-performance reason why we want to keep special pages
> from being populated, in cases we really want the forks physmap to start
> empty for better control over its state. There was already a case where
> having special pages mapped in ended up triggering unexpected Xen behaviors
> leading to chain of events not easy to follow. For example if page 0 gets
> brought in while the vCPU is being created it ends up as a misconfigured
> ept entry if nested virtualization is enabled. That leads to ept
> misconfiguration exits instead of epf faults. Simply enforcing no entry in
> the physmap until forking is complete eliminates the chance of something
> like that happening again and makes reasoning about the VM's behavior from
> the start easier.

Could we have this added to the commit message then, and the option
renamed to 'empty_p2m' or some such. Then you should also ASSERT that
at the end of the fork process the p2m is indeed empty, not sure if
checking d->arch.paging.hap.p2m_pages == 0 would accomplish that?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork
  2022-03-25 11:31     ` Roger Pau Monné
@ 2022-03-25 12:13       ` Tamas K Lengyel
  0 siblings, 0 replies; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-25 12:13 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, Xen-devel, Jan Beulich, Andrew Cooper, Wei Liu,
	George Dunlap, Julien Grall, Stefano Stabellini

[-- Attachment #1: Type: text/plain, Size: 2580 bytes --]

On Fri, Mar 25, 2022, 7:31 AM Roger Pau Monné <roger.pau@citrix.com> wrote:

> On Fri, Mar 25, 2022 at 07:15:59AM -0400, Tamas K Lengyel wrote:
> > On Fri, Mar 25, 2022, 6:59 AM Roger Pau Monné <roger.pau@citrix.com>
> wrote:
> >
> > > On Tue, Mar 22, 2022 at 01:41:37PM -0400, Tamas K Lengyel wrote:
> > > > Add option to the fork memop to skip populating the fork with special
> > > pages.
> > > > These special pages are only necessary when setting up forks to be
> fully
> > > > functional with a toolstack. For short-lived forks where no
> toolstack is
> > > active
> > > > these pages are uneccesary.
> > >
> > > Replying here because there's no cover letter AFAICT.
> > >
> > > For this kind of performance related changes it would be better if you
> > > could provide some figures about the performance impact. It would help
> > > if we knew whether avoiding mapping the vAPIC page means you can
> > > create 0.1% more forks per-minute or 20%.
> > >
> > > If you really want to speed up the forking path it might be good to
> start
> > > by perf sampling Xen in order to find the bottlenecks?
> > >
> >
> > Sure but for experiment systems I don't think its necessary to collect
> that
> > data.
>
> It helps weight whether the extra logic is worth the performance
> benefit IMO. Here it might not matter that much since you say there's
> also a non-performance reason for the change.
>
> > There is also a non-performance reason why we want to keep special pages
> > from being populated, in cases we really want the forks physmap to start
> > empty for better control over its state. There was already a case where
> > having special pages mapped in ended up triggering unexpected Xen
> behaviors
> > leading to chain of events not easy to follow. For example if page 0 gets
> > brought in while the vCPU is being created it ends up as a misconfigured
> > ept entry if nested virtualization is enabled. That leads to ept
> > misconfiguration exits instead of epf faults. Simply enforcing no entry
> in
> > the physmap until forking is complete eliminates the chance of something
> > like that happening again and makes reasoning about the VM's behavior
> from
> > the start easier.
>
> Could we have this added to the commit message then, and the option
> renamed to 'empty_p2m' or some such. Then you should also ASSERT that
> at the end of the fork process the p2m is indeed empty, not sure if
> checking d->arch.paging.hap.p2m_pages == 0 would accomplish that?
>

Sure, I can do that.

Thanks,
Tamas

>

[-- Attachment #2: Type: text/html, Size: 3472 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-25 10:48                 ` Tamas K Lengyel
@ 2022-03-25 12:14                   ` Roger Pau Monné
  0 siblings, 0 replies; 26+ messages in thread
From: Roger Pau Monné @ 2022-03-25 12:14 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Jan Beulich, Tamas K Lengyel, Xen-devel, Wei Liu, Anthony PERARD,
	Juergen Gross, Andrew Cooper, George Dunlap, Julien Grall,
	Stefano Stabellini, Alexandru Isaila, Petre Pircalabu

On Fri, Mar 25, 2022 at 06:48:42AM -0400, Tamas K Lengyel wrote:
> On Fri, Mar 25, 2022, 5:04 AM Jan Beulich <jbeulich@suse.com> wrote:
> 
> > On 24.03.2022 18:02, Tamas K Lengyel wrote:
> > > On Thu, Mar 24, 2022 at 12:44 PM Roger Pau Monné <roger.pau@citrix.com>
> > wrote:
> > >>
> > >> On Thu, Mar 24, 2022 at 12:22:49PM -0400, Tamas K Lengyel wrote:
> > >>> On Thu, Mar 24, 2022 at 12:04 PM Roger Pau Monné <roger.pau@citrix.com>
> > wrote:
> > >>>>
> > >>>> On Thu, Mar 24, 2022 at 11:52:38AM -0400, Tamas K Lengyel wrote:
> > >>>>> On Thu, Mar 24, 2022 at 11:46 AM Roger Pau Monné <
> > roger.pau@citrix.com> wrote:
> > >>>>>>
> > >>>>>> On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
> > >>>>>>> diff --git a/xen/include/public/memory.h
> > b/xen/include/public/memory.h
> > >>>>>>> index 208d8dcbd9..30ce23c5a7 100644
> > >>>>>>> --- a/xen/include/public/memory.h
> > >>>>>>> +++ b/xen/include/public/memory.h
> > >>>>>>> @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
> > >>>>>>>                  uint32_t gref;     /* IN: gref to debug         */
> > >>>>>>>              } u;
> > >>>>>>>          } debug;
> > >>>>>>> -        struct mem_sharing_op_fork {      /* OP_FORK */
> > >>>>>>> +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
> > >>>>>>>              domid_t parent_domain;        /* IN: parent's domain
> > id */
> > >>>>>>>  /* These flags only makes sense for short-lived forks */
> > >>>>>>>  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
> > >>>>>>>  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
> > >>>>>>>  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
> > >>>>>>> +#define XENMEM_FORK_RESET_STATE        (1u << 3)
> > >>>>>>> +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
> > >>>>>>>              uint16_t flags;               /* IN: optional
> > settings */
> > >>>>>>>              uint32_t pad;                 /* Must be set to 0 */
> > >>>>>>>          } fork;
> > >>>>>>> diff --git a/xen/include/public/vm_event.h
> > b/xen/include/public/vm_event.h
> > >>>>>>> index bb003d21d0..81c2ee28cc 100644
> > >>>>>>> --- a/xen/include/public/vm_event.h
> > >>>>>>> +++ b/xen/include/public/vm_event.h
> > >>>>>>> @@ -127,6 +127,14 @@
> > >>>>>>>   * Reset the vmtrace buffer (if vmtrace is enabled)
> > >>>>>>>   */
> > >>>>>>>  #define VM_EVENT_FLAG_RESET_VMTRACE      (1 << 13)
> > >>>>>>> +/*
> > >>>>>>> + * Reset the VM state (if VM is fork)
> > >>>>>>> + */
> > >>>>>>> +#define VM_EVENT_FLAG_RESET_FORK_STATE   (1 << 14)
> > >>>>>>> +/*
> > >>>>>>> + * Remove unshared entried from physmap (if VM is fork)
> > >>>>>>> + */
> > >>>>>>> +#define VM_EVENT_FLAG_RESET_FORK_MEMORY  (1 << 15)
> > >>>>>>
> > >>>>>> I'm confused about why two different interfaces are added to do this
> > >>>>>> kind of selective resets, one to vm_event and one to xenmem_fork?
> > >>>>>>
> > >>>>>> I thin k the natural place for the option to live would be
> > >>>>>> XENMEM_FORK?
> > >>>>>
> > >>>>> Yes, that's the natural place for it. But we are adding it to both
> > for
> > >>>>> a reason. In our use-case the reset operation will happen after a
> > >>>>> vm_event is received to which we already must send a reply. Setting
> > >>>>> the flag on the vm_event reply saves us having to issue an extra
> > memop
> > >>>>> hypercall afterwards.
> > >>>>
> > >>>> Can you do a multicall and batch both operations in a single
> > >>>> hypercall?
> > >>>>
> > >>>> That would seem more natural than adding duplicated interfaces.
> > >>>
> > >>> Not in a straight forward way, no. There is no exposed API in libxc to
> > >>> do a multicall. Even if that was an option it is still easier for me
> > >>> to just flip a bit in the response field than having to construct a
> > >>> whole standalone hypercall structure to be sent as part of a
> > >>> multicall.
> > >>
> > >> Right, I can see it being easier, but it seems like a bad choice from
> > >> an interface PoV. You are the maintainer of both subsystems, but it
> > >> would seem to me it's in your best interest to try to keep the
> > >> interfaces separated and clean.
> > >>
> > >> Would it be possible for the reset XENMEM_FORK op to have the side
> > >> effect of performing what you would instead do with the vm_event
> > >> hypercall?
> > >
> > > Yes, the event response is really just an event channel signal to Xen,
> > > so the memop hypercall could similarly encode the "now check the
> > > vm_event response" as an optional field. But why is that any better
> > > than the current event channel route processing the vm_response
> > > encoding the "now do these ops on the fork"?
> >
> > Well, as Roger said: Less duplication in the interface.
> >
> 
> No, you would just duplicate something else instead, ie. the event channel
> hypercall.
> 
> 
> > > We already have a bunch of different operations you can encode in the
> > > vm_event response field, so it reduces the complexity on the toolstack
> > > side since I don't have to switch around which hypercall I need to
> > > issue depending on what extra ops I want to put into a single
> > > hypercall.
> >
> > The two goals need to be weighed against one another; for the moment
> > I think I'm with Roger aiming at a clean interface.
> >
> 
> It may look like that from the Xen side but from the toolstack side this is
> actually the cleanest way to achieve what we need. The vm_event interfaces
> are already strongly integrated with both the mem_sharing and mem_paging
> subsystems so nothing is gained by now for no reason trying to keep them
> separate. So I strongly disagree with this suggestion and I'm going to keep
> it as-is. I appreciate the feedback nevertheless.

I'm not opposed to it, I just would like to better understand why you
are proposing such interface.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-22 17:41 ` [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable Tamas K Lengyel
  2022-03-24 15:45   ` Roger Pau Monné
@ 2022-03-25 13:42   ` Roger Pau Monné
  2022-03-25 13:53     ` Tamas K Lengyel
  1 sibling, 1 reply; 26+ messages in thread
From: Roger Pau Monné @ 2022-03-25 13:42 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: xen-devel, Wei Liu, Anthony PERARD, Juergen Gross, Andrew Cooper,
	George Dunlap, Jan Beulich, Julien Grall, Stefano Stabellini,
	Tamas K Lengyel, Alexandru Isaila, Petre Pircalabu

On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
> diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> index a21c781452..bfa6082f13 100644
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -1892,15 +1892,19 @@ static int fork(struct domain *cd, struct domain *d, uint16_t flags)
>   * footprints the hypercall continuation should be implemented (or if this
>   * feature needs to be become "stable").
>   */
> -static int mem_sharing_fork_reset(struct domain *d)
> +int mem_sharing_fork_reset(struct domain *d, bool reset_state,
> +                           bool reset_memory)
>  {
> -    int rc;
> +    int rc = 0;
>      struct domain *pd = d->parent;
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
>      struct page_info *page, *tmp;
>  
>      domain_pause(d);

I would assert that at least one of reset_sate or reset_memory is set
here, as callers already do the checks.

>  
> +    if ( !reset_memory )
> +        goto state;

I don't like using labels and goto like this as I think it makes the
code harder to follow, and so more likely to introduce bugs. I would
rather place the memory reset parts inside of an if ( reset_memory ) {
... }, but that's my taste.

> +
>      /* need recursive lock because we will free pages */
>      spin_lock_recursive(&d->page_alloc_lock);
>      page_list_for_each_safe(page, tmp, &d->page_list)
> @@ -1933,7 +1937,9 @@ static int mem_sharing_fork_reset(struct domain *d)
>      }
>      spin_unlock_recursive(&d->page_alloc_lock);
>  
> -    rc = copy_settings(d, pd, d->arch.hvm.mem_sharing.skip_special_pages);
> + state:
> +    if ( reset_state )
> +        rc = copy_settings(d, pd, d->arch.hvm.mem_sharing.skip_special_pages);
>  
>      domain_unpause(d);
>  
> @@ -2239,15 +2245,21 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
>  
>      case XENMEM_sharing_op_fork_reset:
>      {
> +        bool reset_state = mso.u.fork.flags & XENMEM_FORK_RESET_STATE;
> +        bool reset_memory = mso.u.fork.flags & XENMEM_FORK_RESET_MEMORY;
> +
>          rc = -EINVAL;
> -        if ( mso.u.fork.pad || mso.u.fork.flags )
> +        if ( mso.u.fork.pad || (!reset_state && !reset_memory) )
> +            goto out;
> +        if ( mso.u.fork.flags &
> +             ~(XENMEM_FORK_RESET_STATE | XENMEM_FORK_RESET_MEMORY) )
>              goto out;
>  
>          rc = -ENOSYS;
>          if ( !d->parent )
>              goto out;
>  
> -        rc = mem_sharing_fork_reset(d);
> +        rc = mem_sharing_fork_reset(d, reset_state, reset_memory);
>          break;
>      }
>  
> diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
> index 84cf52636b..a7b192be0d 100644
> --- a/xen/common/vm_event.c
> +++ b/xen/common/vm_event.c
> @@ -28,6 +28,11 @@
>  #include <asm/p2m.h>
>  #include <asm/monitor.h>
>  #include <asm/vm_event.h>
> +
> +#ifdef CONFIG_MEM_SHARING
> +#include <asm/mem_sharing.h>
> +#endif
> +
>  #include <xsm/xsm.h>
>  #include <public/hvm/params.h>
>  
> @@ -394,6 +399,15 @@ static int vm_event_resume(struct domain *d, struct vm_event_domain *ved)
>              if ( rsp.reason == VM_EVENT_REASON_MEM_PAGING )
>                  p2m_mem_paging_resume(d, &rsp);
>  #endif
> +#ifdef CONFIG_MEM_SHARING
> +            do {
> +                bool reset_state = rsp.flags & VM_EVENT_FLAG_RESET_FORK_STATE;
> +                bool reset_mem = rsp.flags & VM_EVENT_FLAG_RESET_FORK_MEMORY;
> +
> +                if ( reset_state || reset_mem )
> +                    mem_sharing_fork_reset(d, reset_state, reset_mem);

You seem to drop the error code returned by mem_sharing_fork_reset.

> +            } while(0);
> +#endif

I think you can avoid the do {} while(0); just using the braces will
allow you to define local variables in the inner block.

>              /*
>               * Check emulation flags in the arch-specific handler only, as it
> diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
> index 208d8dcbd9..30ce23c5a7 100644
> --- a/xen/include/public/memory.h
> +++ b/xen/include/public/memory.h
> @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
>                  uint32_t gref;     /* IN: gref to debug         */
>              } u;
>          } debug;
> -        struct mem_sharing_op_fork {      /* OP_FORK */
> +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
>              domid_t parent_domain;        /* IN: parent's domain id */
>  /* These flags only makes sense for short-lived forks */
>  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
>  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
>  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
> +#define XENMEM_FORK_RESET_STATE        (1u << 3)
> +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)

For backward compatibility purposes should the flags be added
backwards, ie:

#define XENMEM_FORK_KEEP_STATE        (1u << 3)
#define XENMEM_FORK_KEEP_MEMORY       (1u << 4)

So that existing callers of XENMEM_sharing_op_fork_reset will continue
working as expected?

Or we don't care about that as the interface is protected with
__XEN_TOOLS__?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable
  2022-03-25 13:42   ` Roger Pau Monné
@ 2022-03-25 13:53     ` Tamas K Lengyel
  0 siblings, 0 replies; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-25 13:53 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Tamas K Lengyel, xen-devel, Wei Liu, Anthony PERARD,
	Juergen Gross, Andrew Cooper, George Dunlap, Jan Beulich,
	Julien Grall, Stefano Stabellini, Alexandru Isaila,
	Petre Pircalabu

On Fri, Mar 25, 2022 at 9:42 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>
> On Tue, Mar 22, 2022 at 01:41:39PM -0400, Tamas K Lengyel wrote:
> > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > index a21c781452..bfa6082f13 100644
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -1892,15 +1892,19 @@ static int fork(struct domain *cd, struct domain *d, uint16_t flags)
> >   * footprints the hypercall continuation should be implemented (or if this
> >   * feature needs to be become "stable").
> >   */
> > -static int mem_sharing_fork_reset(struct domain *d)
> > +int mem_sharing_fork_reset(struct domain *d, bool reset_state,
> > +                           bool reset_memory)
> >  {
> > -    int rc;
> > +    int rc = 0;
> >      struct domain *pd = d->parent;
> >      struct p2m_domain *p2m = p2m_get_hostp2m(d);
> >      struct page_info *page, *tmp;
> >
> >      domain_pause(d);
>
> I would assert that at least one of reset_sate or reset_memory is set
> here, as callers already do the checks.

Sure.

>
> >
> > +    if ( !reset_memory )
> > +        goto state;
>
> I don't like using labels and goto like this as I think it makes the
> code harder to follow, and so more likely to introduce bugs. I would
> rather place the memory reset parts inside of an if ( reset_memory ) {
> ... }, but that's my taste.

I did that first but because it requires shifting everything to the
right it requires odd line breaks.

>
> > +
> >      /* need recursive lock because we will free pages */
> >      spin_lock_recursive(&d->page_alloc_lock);
> >      page_list_for_each_safe(page, tmp, &d->page_list)
> > @@ -1933,7 +1937,9 @@ static int mem_sharing_fork_reset(struct domain *d)
> >      }
> >      spin_unlock_recursive(&d->page_alloc_lock);
> >
> > -    rc = copy_settings(d, pd, d->arch.hvm.mem_sharing.skip_special_pages);
> > + state:
> > +    if ( reset_state )
> > +        rc = copy_settings(d, pd, d->arch.hvm.mem_sharing.skip_special_pages);
> >
> >      domain_unpause(d);
> >
> > @@ -2239,15 +2245,21 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
> >
> >      case XENMEM_sharing_op_fork_reset:
> >      {
> > +        bool reset_state = mso.u.fork.flags & XENMEM_FORK_RESET_STATE;
> > +        bool reset_memory = mso.u.fork.flags & XENMEM_FORK_RESET_MEMORY;
> > +
> >          rc = -EINVAL;
> > -        if ( mso.u.fork.pad || mso.u.fork.flags )
> > +        if ( mso.u.fork.pad || (!reset_state && !reset_memory) )
> > +            goto out;
> > +        if ( mso.u.fork.flags &
> > +             ~(XENMEM_FORK_RESET_STATE | XENMEM_FORK_RESET_MEMORY) )
> >              goto out;
> >
> >          rc = -ENOSYS;
> >          if ( !d->parent )
> >              goto out;
> >
> > -        rc = mem_sharing_fork_reset(d);
> > +        rc = mem_sharing_fork_reset(d, reset_state, reset_memory);
> >          break;
> >      }
> >
> > diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
> > index 84cf52636b..a7b192be0d 100644
> > --- a/xen/common/vm_event.c
> > +++ b/xen/common/vm_event.c
> > @@ -28,6 +28,11 @@
> >  #include <asm/p2m.h>
> >  #include <asm/monitor.h>
> >  #include <asm/vm_event.h>
> > +
> > +#ifdef CONFIG_MEM_SHARING
> > +#include <asm/mem_sharing.h>
> > +#endif
> > +
> >  #include <xsm/xsm.h>
> >  #include <public/hvm/params.h>
> >
> > @@ -394,6 +399,15 @@ static int vm_event_resume(struct domain *d, struct vm_event_domain *ved)
> >              if ( rsp.reason == VM_EVENT_REASON_MEM_PAGING )
> >                  p2m_mem_paging_resume(d, &rsp);
> >  #endif
> > +#ifdef CONFIG_MEM_SHARING
> > +            do {
> > +                bool reset_state = rsp.flags & VM_EVENT_FLAG_RESET_FORK_STATE;
> > +                bool reset_mem = rsp.flags & VM_EVENT_FLAG_RESET_FORK_MEMORY;
> > +
> > +                if ( reset_state || reset_mem )
> > +                    mem_sharing_fork_reset(d, reset_state, reset_mem);
>
> You seem to drop the error code returned by mem_sharing_fork_reset.

Yes, there is no response that could be sent to the toolstack from
here. I could add an ASSERT that rc is 0 though. If the fork() op
successfully finished then fork_reset() couldn't reasonably end up in
a path where it fails.

>
> > +            } while(0);
> > +#endif
>
> I think you can avoid the do {} while(0); just using the braces will
> allow you to define local variables in the inner block.

Sure.

>
> >              /*
> >               * Check emulation flags in the arch-specific handler only, as it
> > diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
> > index 208d8dcbd9..30ce23c5a7 100644
> > --- a/xen/include/public/memory.h
> > +++ b/xen/include/public/memory.h
> > @@ -541,12 +541,14 @@ struct xen_mem_sharing_op {
> >                  uint32_t gref;     /* IN: gref to debug         */
> >              } u;
> >          } debug;
> > -        struct mem_sharing_op_fork {      /* OP_FORK */
> > +        struct mem_sharing_op_fork {      /* OP_FORK/_RESET */
> >              domid_t parent_domain;        /* IN: parent's domain id */
> >  /* These flags only makes sense for short-lived forks */
> >  #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
> >  #define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
> >  #define XENMEM_FORK_SKIP_SPECIAL_PAGES (1u << 2)
> > +#define XENMEM_FORK_RESET_STATE        (1u << 3)
> > +#define XENMEM_FORK_RESET_MEMORY       (1u << 4)
>
> For backward compatibility purposes should the flags be added
> backwards, ie:
>
> #define XENMEM_FORK_KEEP_STATE        (1u << 3)
> #define XENMEM_FORK_KEEP_MEMORY       (1u << 4)
>
> So that existing callers of XENMEM_sharing_op_fork_reset will continue
> working as expected?
>
> Or we don't care about that as the interface is protected with
> __XEN_TOOLS__?

I would say don't care, we are updating the only toolstack that uses
this in lock-step with Xen.

Tamas


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] x86/mem_sharing: use dom_cow as placeholder parent until fork is complete
  2022-03-22 17:41 ` [PATCH 2/3] x86/mem_sharing: use dom_cow as placeholder parent until fork is complete Tamas K Lengyel
@ 2022-03-28 13:32   ` Jan Beulich
  2022-03-28 15:36     ` Tamas K Lengyel
  0 siblings, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2022-03-28 13:32 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: Andrew Cooper, Roger Pau Monné,
	Wei Liu, Tamas K Lengyel, George Dunlap, xen-devel

On 22.03.2022 18:41, Tamas K Lengyel wrote:
> For the duration of the fork memop set dom_cow as a placeholder parent. This
> gets updated to the real parent when the fork operation completes, or to NULL
> in case the fork failed.

I am concerned of this, in particular because the state may last across
perhaps a long series of preemptions. Furthermore ...

> --- a/xen/arch/x86/include/asm/mem_sharing.h
> +++ b/xen/arch/x86/include/asm/mem_sharing.h
> @@ -79,7 +79,7 @@ static inline int mem_sharing_unshare_page(struct domain *d,
>  
>  static inline bool mem_sharing_is_fork(const struct domain *d)
>  {
> -    return d->parent;
> +    return d->parent && d->parent != dom_cow;
>  }

... this now makes the function "lie" (the domain is a fork already
while being constructed). Hence at the very least a comment would want
to appear here explaining why this is wanted despite not really being
correct. This "lying" for example means a partly set up fork would be
skipped by domain_relinquish_resources(), in case the tool stack
decided to kill the domain instead of waiting for its creation to
finish.

> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -1850,7 +1850,9 @@ static int fork(struct domain *cd, struct domain *d, uint16_t flags)
>          *cd->arch.cpuid = *d->arch.cpuid;
>          *cd->arch.msr = *d->arch.msr;
>          cd->vmtrace_size = d->vmtrace_size;
> -        cd->parent = d;
> +
> +        /* use dom_cow as a placeholder until we are all done */

Nit: As per ./CODING_STYLE you want to at least start the comment with
a capital letter.

Jan



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] x86/mem_sharing: use dom_cow as placeholder parent until fork is complete
  2022-03-28 13:32   ` Jan Beulich
@ 2022-03-28 15:36     ` Tamas K Lengyel
  0 siblings, 0 replies; 26+ messages in thread
From: Tamas K Lengyel @ 2022-03-28 15:36 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Tamas K Lengyel, Andrew Cooper, Roger Pau Monné,
	Wei Liu, George Dunlap, xen-devel

On Mon, Mar 28, 2022 at 9:32 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 22.03.2022 18:41, Tamas K Lengyel wrote:
> > For the duration of the fork memop set dom_cow as a placeholder parent. This
> > gets updated to the real parent when the fork operation completes, or to NULL
> > in case the fork failed.
>
> I am concerned of this, in particular because the state may last across
> perhaps a long series of preemptions. Furthermore ...
>
> > --- a/xen/arch/x86/include/asm/mem_sharing.h
> > +++ b/xen/arch/x86/include/asm/mem_sharing.h
> > @@ -79,7 +79,7 @@ static inline int mem_sharing_unshare_page(struct domain *d,
> >
> >  static inline bool mem_sharing_is_fork(const struct domain *d)
> >  {
> > -    return d->parent;
> > +    return d->parent && d->parent != dom_cow;
> >  }
>
> ... this now makes the function "lie" (the domain is a fork already
> while being constructed). Hence at the very least a comment would want
> to appear here explaining why this is wanted despite not really being
> correct. This "lying" for example means a partly set up fork would be
> skipped by domain_relinquish_resources(), in case the tool stack
> decided to kill the domain instead of waiting for its creation to
> finish.

Hm, yea, domain_relinquish_resources really ought to check if there is
any parent at all, while fork_page needs to check whether there is a
parent and it's not dom_cow. I think I just need two separate
mem_sharing_is_fork functions, one would be the current
mem_sharing_is_fork() and a new one mem_sharing_is_fork_complete(),
that would make it a bit clearer on what they do.

>
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -1850,7 +1850,9 @@ static int fork(struct domain *cd, struct domain *d, uint16_t flags)
> >          *cd->arch.cpuid = *d->arch.cpuid;
> >          *cd->arch.msr = *d->arch.msr;
> >          cd->vmtrace_size = d->vmtrace_size;
> > -        cd->parent = d;
> > +
> > +        /* use dom_cow as a placeholder until we are all done */
>
> Nit: As per ./CODING_STYLE you want to at least start the comment with
> a capital letter.

Ack.

Thanks,
Tamas


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2022-03-28 15:37 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-22 17:41 [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork Tamas K Lengyel
2022-03-22 17:41 ` [PATCH 2/3] x86/mem_sharing: use dom_cow as placeholder parent until fork is complete Tamas K Lengyel
2022-03-28 13:32   ` Jan Beulich
2022-03-28 15:36     ` Tamas K Lengyel
2022-03-22 17:41 ` [PATCH 3/3] x86/mem_sharing: make fork_reset more configurable Tamas K Lengyel
2022-03-24 15:45   ` Roger Pau Monné
2022-03-24 15:52     ` Tamas K Lengyel
2022-03-24 16:03       ` Roger Pau Monné
2022-03-24 16:22         ` Tamas K Lengyel
2022-03-24 16:43           ` Roger Pau Monné
2022-03-24 17:02             ` Tamas K Lengyel
2022-03-25  9:04               ` Jan Beulich
2022-03-25 10:48                 ` Tamas K Lengyel
2022-03-25 12:14                   ` Roger Pau Monné
2022-03-25 13:42   ` Roger Pau Monné
2022-03-25 13:53     ` Tamas K Lengyel
2022-03-24 14:50 ` [PATCH 1/3] x86/mem_sharing: option to skip populating special pages during fork Roger Pau Monné
2022-03-24 15:15   ` Tamas K Lengyel
2022-03-24 15:51     ` Roger Pau Monné
2022-03-24 16:27       ` Tamas K Lengyel
2022-03-25 10:25         ` Roger Pau Monné
2022-03-25 10:51           ` Tamas K Lengyel
2022-03-25 10:59 ` Roger Pau Monné
2022-03-25 11:15   ` Tamas K Lengyel
2022-03-25 11:31     ` Roger Pau Monné
2022-03-25 12:13       ` Tamas K Lengyel

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.