* [PATCH RFC 00/10] runstate/time area registration by (guest) physical address
From: Jan Beulich @ 2022-10-19  7:35 UTC
  To: xen-devel
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné

Since it was indicated that introducing specific new vCPU ops may be
beneficial independently of the introduction of a fully physical-
address-based ABI flavor, here we go. There are a number of open
questions throughout the series; resolving them is the main goal of this
early posting.

01: unify update_runstate_area()
02: x86: split populating of struct vcpu_time_info into a separate function
03: domain: GADDR based shared guest area registration alternative - teardown
04: domain: update GADDR based runstate guest area
05: x86: update GADDR based secondary time area
06: x86/mem-sharing: copy GADDR based shared guest areas
07: domain: map/unmap GADDR based shared guest areas
08: domain: introduce GADDR based runstate area registration alternative
09: x86: introduce GADDR based secondary time area registration alternative
10: common: convert vCPU info area registration

These go on top of the previously posted small bug fixes "common:
map_vcpu_info() wants to unshare the underlying page" and "x86: also
zap secondary time area handles during soft reset".

Jan


* [PATCH 01/10] unify update_runstate_area()
From: Jan Beulich @ 2022-10-19  7:38 UTC
  To: xen-devel
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné,
	Bertrand Marquis, Volodymyr Babchuk

The x86 variant is a superset of Arm's, with the CONFIG_COMPAT parts
already properly marked. The only other piece Arm lacks is
update_guest_memory_policy(): For the time being Arm simply gains an
empty struct and a stub inline function.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -281,38 +281,6 @@ static void ctxt_switch_to(struct vcpu *
     WRITE_SYSREG(n->arch.mdcr_el2, MDCR_EL2);
 }
 
-/* Update per-VCPU guest runstate shared memory area (if registered). */
-static void update_runstate_area(struct vcpu *v)
-{
-    void __user *guest_handle = NULL;
-    struct vcpu_runstate_info runstate;
-
-    if ( guest_handle_is_null(runstate_guest(v)) )
-        return;
-
-    memcpy(&runstate, &v->runstate, sizeof(runstate));
-
-    if ( VM_ASSIST(v->domain, runstate_update_flag) )
-    {
-        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
-        guest_handle--;
-        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
-        smp_wmb();
-    }
-
-    __copy_to_guest(runstate_guest(v), &runstate, 1);
-
-    if ( guest_handle )
-    {
-        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
-        smp_wmb();
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
-    }
-}
-
 static void schedule_tail(struct vcpu *prev)
 {
     ASSERT(prev != current);
--- a/xen/arch/arm/include/asm/domain.h
+++ b/xen/arch/arm/include/asm/domain.h
@@ -295,6 +295,11 @@ struct arch_vcpu_io {
     struct instr_details dabt_instr; /* when the instruction is decoded */
 };
 
+struct guest_memory_policy {};
+static inline void update_guest_memory_policy(struct vcpu *v,
+                                              struct guest_memory_policy *gmp)
+{}
+
 #endif /* __ASM_DOMAIN_H__ */
 
 /*
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1852,64 +1852,6 @@ void cf_check paravirt_ctxt_switch_to(st
         wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
 }
 
-/* Update per-VCPU guest runstate shared memory area (if registered). */
-bool update_runstate_area(struct vcpu *v)
-{
-    bool rc;
-    struct guest_memory_policy policy = { .nested_guest_mode = false };
-    void __user *guest_handle = NULL;
-    struct vcpu_runstate_info runstate;
-
-    if ( guest_handle_is_null(runstate_guest(v)) )
-        return true;
-
-    update_guest_memory_policy(v, &policy);
-
-    memcpy(&runstate, &v->runstate, sizeof(runstate));
-
-    if ( VM_ASSIST(v->domain, runstate_update_flag) )
-    {
-#ifdef CONFIG_COMPAT
-        guest_handle = has_32bit_shinfo(v->domain)
-            ? &v->runstate_guest.compat.p->state_entry_time + 1
-            : &v->runstate_guest.native.p->state_entry_time + 1;
-#else
-        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
-#endif
-        guest_handle--;
-        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
-        smp_wmb();
-    }
-
-#ifdef CONFIG_COMPAT
-    if ( has_32bit_shinfo(v->domain) )
-    {
-        struct compat_vcpu_runstate_info info;
-
-        XLAT_vcpu_runstate_info(&info, &runstate);
-        __copy_to_guest(v->runstate_guest.compat, &info, 1);
-        rc = true;
-    }
-    else
-#endif
-        rc = __copy_to_guest(runstate_guest(v), &runstate, 1) !=
-             sizeof(runstate);
-
-    if ( guest_handle )
-    {
-        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
-        smp_wmb();
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
-    }
-
-    update_guest_memory_policy(v, &policy);
-
-    return rc;
-}
-
 static void _update_runstate_area(struct vcpu *v)
 {
     if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -681,7 +681,6 @@ void update_guest_memory_policy(struct v
 
 void domain_cpu_policy_changed(struct domain *d);
 
-bool update_runstate_area(struct vcpu *);
 bool update_secondary_system_time(struct vcpu *,
                                   struct vcpu_time_info *);
 
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1579,6 +1579,64 @@ int default_initialise_vcpu(struct vcpu
     return rc;
 }
 
+/* Update per-VCPU guest runstate shared memory area (if registered). */
+bool update_runstate_area(struct vcpu *v)
+{
+    bool rc;
+    struct guest_memory_policy policy = { };
+    void __user *guest_handle = NULL;
+    struct vcpu_runstate_info runstate;
+
+    if ( guest_handle_is_null(runstate_guest(v)) )
+        return true;
+
+    update_guest_memory_policy(v, &policy);
+
+    memcpy(&runstate, &v->runstate, sizeof(runstate));
+
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
+    {
+#ifdef CONFIG_COMPAT
+        guest_handle = has_32bit_shinfo(v->domain)
+            ? &v->runstate_guest.compat.p->state_entry_time + 1
+            : &v->runstate_guest.native.p->state_entry_time + 1;
+#else
+        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
+#endif
+        guest_handle--;
+        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+        __raw_copy_to_guest(guest_handle,
+                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
+        smp_wmb();
+    }
+
+#ifdef CONFIG_COMPAT
+    if ( has_32bit_shinfo(v->domain) )
+    {
+        struct compat_vcpu_runstate_info info;
+
+        XLAT_vcpu_runstate_info(&info, &runstate);
+        __copy_to_guest(v->runstate_guest.compat, &info, 1);
+        rc = true;
+    }
+    else
+#endif
+        rc = __copy_to_guest(runstate_guest(v), &runstate, 1) !=
+             sizeof(runstate);
+
+    if ( guest_handle )
+    {
+        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        smp_wmb();
+        __raw_copy_to_guest(guest_handle,
+                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
+    }
+
+    update_guest_memory_policy(v, &policy);
+
+    return rc;
+}
+
 long common_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
 {
     long rc = 0;
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -98,6 +98,8 @@ void arch_get_info_guest(struct vcpu *,
 int arch_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg);
 int default_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg);
 
+bool update_runstate_area(struct vcpu *);
+
 int domain_relinquish_resources(struct domain *d);
 
 void dump_pageframe_info(struct domain *d);



* [PATCH 02/10] x86: split populating of struct vcpu_time_info into a separate function
From: Jan Beulich @ 2022-10-19  7:39 UTC
  To: xen-devel; +Cc: Andrew Cooper, George Dunlap, Wei Liu, Roger Pau Monné

This is to facilitate subsequent re-use of this code.

While doing so, add const in a number of places, extending to
gtime_to_gtsc() and, for symmetry, also to its inverse function.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
I was on the verge of also folding the various is_hvm_domain() checks
into a function-scope boolean, but then wasn't really certain that this
wouldn't open up undue speculation opportunities.
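
For reference (not part of the patch), a minimal sketch of how a guest
typically consumes the fields collect_time_info() fills in; the helper
name is made up, and a real consumer would use a 128-bit intermediate
for the multiplication, as scale_delta() does:

static uint64_t guest_system_time_ns(const struct vcpu_time_info *t,
                                     uint64_t tsc_now)
{
    /* TSC ticks elapsed since the hypervisor took its timestamp. */
    uint64_t delta = tsc_now - t->tsc_timestamp;

    /* Pre-scale by the binary shift ... */
    if ( t->tsc_shift >= 0 )
        delta <<= t->tsc_shift;
    else
        delta >>= -t->tsc_shift;

    /* ... then apply the 32.32 fixed-point multiplier to obtain ns. */
    return t->system_time + ((delta * (uint64_t)t->tsc_to_system_mul) >> 32);
}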

--- a/xen/arch/x86/include/asm/time.h
+++ b/xen/arch/x86/include/asm/time.h
@@ -52,8 +52,8 @@ uint64_t cf_check acpi_pm_tick_to_ns(uin
 uint64_t tsc_ticks2ns(uint64_t ticks);
 
 uint64_t pv_soft_rdtsc(const struct vcpu *v, const struct cpu_user_regs *regs);
-u64 gtime_to_gtsc(struct domain *d, u64 time);
-u64 gtsc_to_gtime(struct domain *d, u64 tsc);
+uint64_t gtime_to_gtsc(const struct domain *d, uint64_t time);
+uint64_t gtsc_to_gtime(const struct domain *d, uint64_t tsc);
 
 int tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
                  uint32_t gtsc_khz, uint32_t incarnation);
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -1373,18 +1373,14 @@ uint64_t tsc_ticks2ns(uint64_t ticks)
     return scale_delta(ticks, &t->tsc_scale);
 }
 
-static void __update_vcpu_system_time(struct vcpu *v, int force)
+static void collect_time_info(const struct vcpu *v,
+                              struct vcpu_time_info *u)
 {
-    const struct cpu_time *t;
-    struct vcpu_time_info *u, _u = {};
-    struct domain *d = v->domain;
+    const struct cpu_time *t = &this_cpu(cpu_time);
+    const struct domain *d = v->domain;
     s_time_t tsc_stamp;
 
-    if ( v->vcpu_info == NULL )
-        return;
-
-    t = &this_cpu(cpu_time);
-    u = &vcpu_info(v, time);
+    memset(u, 0, sizeof(*u));
 
     if ( d->arch.vtsc )
     {
@@ -1392,7 +1388,7 @@ static void __update_vcpu_system_time(st
 
         if ( is_hvm_domain(d) )
         {
-            struct pl_time *pl = v->domain->arch.hvm.pl_time;
+            const struct pl_time *pl = d->arch.hvm.pl_time;
 
             stime += pl->stime_offset + v->arch.hvm.stime_offset;
             if ( stime >= 0 )
@@ -1403,27 +1399,27 @@ static void __update_vcpu_system_time(st
         else
             tsc_stamp = gtime_to_gtsc(d, stime);
 
-        _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
-        _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
+        u->tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
+        u->tsc_shift         = d->arch.vtsc_to_ns.shift;
     }
     else
     {
         if ( is_hvm_domain(d) && hvm_tsc_scaling_supported )
         {
             tsc_stamp            = hvm_scale_tsc(d, t->stamp.local_tsc);
-            _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
-            _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
+            u->tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
+            u->tsc_shift         = d->arch.vtsc_to_ns.shift;
         }
         else
         {
             tsc_stamp            = t->stamp.local_tsc;
-            _u.tsc_to_system_mul = t->tsc_scale.mul_frac;
-            _u.tsc_shift         = t->tsc_scale.shift;
+            u->tsc_to_system_mul = t->tsc_scale.mul_frac;
+            u->tsc_shift         = t->tsc_scale.shift;
         }
     }
 
-    _u.tsc_timestamp = tsc_stamp;
-    _u.system_time   = t->stamp.local_stime;
+    u->tsc_timestamp = tsc_stamp;
+    u->system_time   = t->stamp.local_stime;
 
     /*
      * It's expected that domains cope with this bit changing on every
@@ -1431,10 +1427,21 @@ static void __update_vcpu_system_time(st
      * or if it further requires monotonicity checks with other vcpus.
      */
     if ( clocksource_is_tsc() )
-        _u.flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
+        u->flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
 
     if ( is_hvm_domain(d) )
-        _u.tsc_timestamp += v->arch.hvm.cache_tsc_offset;
+        u->tsc_timestamp += v->arch.hvm.cache_tsc_offset;
+}
+
+static void __update_vcpu_system_time(struct vcpu *v, int force)
+{
+    struct vcpu_time_info *u = &vcpu_info(v, time), _u;
+    const struct domain *d = v->domain;
+
+    if ( v->vcpu_info == NULL )
+        return;
+
+    collect_time_info(v, &_u);
 
     /* Don't bother unless timestamp record has changed or we are forced. */
     _u.version = u->version; /* make versions match for memcmp test */
@@ -2476,7 +2483,7 @@ static int __init cf_check tsc_parse(con
 }
 custom_param("tsc", tsc_parse);
 
-u64 gtime_to_gtsc(struct domain *d, u64 time)
+uint64_t gtime_to_gtsc(const struct domain *d, uint64_t time)
 {
     if ( !is_hvm_domain(d) )
     {
@@ -2488,7 +2495,7 @@ u64 gtime_to_gtsc(struct domain *d, u64
     return scale_delta(time, &d->arch.ns_to_vtsc);
 }
 
-u64 gtsc_to_gtime(struct domain *d, u64 tsc)
+uint64_t gtsc_to_gtime(const struct domain *d, uint64_t tsc)
 {
     u64 time = scale_delta(tsc, &d->arch.vtsc_to_ns);
 



* [PATCH RFC 03/10] domain: GADDR based shared guest area registration alternative - teardown
From: Jan Beulich @ 2022-10-19  7:40 UTC
  To: xen-devel
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné

In preparation for the introduction of new vCPU operations allowing the
respective areas (one of the two is x86-specific) to be registered by
guest-physical address, add the necessary domain cleanup hooks.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC: Zapping the areas in pv_shim_shutdown() may not be strictly
     necessary: Aiui unmap_vcpu_info() is called only because the vCPU
     info area cannot be re-registered. Beyond that I guess the
     assumption is that the areas would only be re-registered as they
     were before. If that's not the case I wonder whether the guest
     handles for both areas shouldn't also be zapped.

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1035,7 +1035,10 @@ int arch_domain_soft_reset(struct domain
     }
 
     for_each_vcpu ( d, v )
+    {
         set_xen_guest_handle(v->arch.time_info_guest, NULL);
+        unmap_guest_area(v, &v->arch.time_guest_area);
+    }
 
  exit_put_gfn:
     put_gfn(d, gfn_x(gfn));
@@ -2350,6 +2353,8 @@ int domain_relinquish_resources(struct d
             if ( ret )
                 return ret;
 
+            unmap_guest_area(v, &v->arch.time_guest_area);
+
             vpmu_destroy(v);
         }
 
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -661,6 +661,7 @@ struct arch_vcpu
 
     /* A secondary copy of the vcpu time info. */
     XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest;
+    struct guest_area time_guest_area;
 
     struct arch_vm_event *vm_event;
 
--- a/xen/arch/x86/pv/shim.c
+++ b/xen/arch/x86/pv/shim.c
@@ -394,8 +394,10 @@ int pv_shim_shutdown(uint8_t reason)
 
     for_each_vcpu ( d, v )
     {
-        /* Unmap guest vcpu_info pages. */
+        /* Unmap guest vcpu_info page and runstate/time areas. */
         unmap_vcpu_info(v);
+        unmap_guest_area(v, &v->runstate_guest_area);
+        unmap_guest_area(v, &v->arch.time_guest_area);
 
         /* Reset the periodic timer to the default value. */
         vcpu_set_periodic_timer(v, MILLISECS(10));
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -950,7 +950,10 @@ int domain_kill(struct domain *d)
         if ( cpupool_move_domain(d, cpupool0) )
             return -ERESTART;
         for_each_vcpu ( d, v )
+        {
             unmap_vcpu_info(v);
+            unmap_guest_area(v, &v->runstate_guest_area);
+        }
         d->is_dying = DOMDYING_dead;
         /* Mem event cleanup has to go here because the rings 
          * have to be put before we call put_domain. */
@@ -1404,6 +1407,7 @@ int domain_soft_reset(struct domain *d,
     {
         set_xen_guest_handle(runstate_guest(v), NULL);
         unmap_vcpu_info(v);
+        unmap_guest_area(v, &v->runstate_guest_area);
     }
 
     rc = arch_domain_soft_reset(d);
@@ -1555,6 +1559,15 @@ void unmap_vcpu_info(struct vcpu *v)
     put_page_and_type(mfn_to_page(mfn));
 }
 
+/*
+ * This is only intended to be used for domain cleanup (or more generally only
+ * with at least the respective vCPU, if it's not the current one, reliably
+ * paused).
+ */
+void unmap_guest_area(struct vcpu *v, struct guest_area *area)
+{
+}
+
 int default_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
 {
     struct vcpu_guest_context *ctxt;
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -5,6 +5,12 @@
 #include <xen/types.h>
 
 #include <public/xen.h>
+
+struct guest_area {
+    struct page_info *pg;
+    void *map;
+};
+
 #include <asm/domain.h>
 #include <asm/numa.h>
 
@@ -76,6 +82,11 @@ void arch_vcpu_destroy(struct vcpu *v);
 int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned int offset);
 void unmap_vcpu_info(struct vcpu *v);
 
+int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size,
+                   struct guest_area *area,
+                   void (*populate)(void *dst, struct vcpu *v));
+void unmap_guest_area(struct vcpu *v, struct guest_area *area);
+
 int arch_domain_create(struct domain *d,
                        struct xen_domctl_createdomain *config,
                        unsigned int flags);
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -202,6 +202,7 @@ struct vcpu
         XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
     } runstate_guest; /* guest address */
 #endif
+    struct guest_area runstate_guest_area;
     unsigned int     new_state;
 
     /* Has the FPU been initialised? */



* [PATCH RFC 04/10] domain: update GADDR based runstate guest area
From: Jan Beulich @ 2022-10-19  7:41 UTC
  To: xen-devel
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné

Before adding a new vCPU operation to register the runstate area by
guest-physical address, add code to actually keep such areas up-to-date.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC: Pages aren't marked dirty when written to (matching the handling of
     space mapped by map_vcpu_info() afaict), on the basis that the
     registrations are lost anyway across migration. Plus the contents
     of the areas in question have to be deemed volatile in the first
     place (so saving a "most recent" value is pretty meaningless even
     for e.g. snapshotting).

RFC: Can we perhaps avoid the VM-assist conditionals, assuming the more
     modern behavior to apply uniformly for gaddr-based registrations?

RFC: HVM guests (on x86) can change bitness and hence layout (and size!
     and alignment) of the runstate area. I don't think it is an option
     to require 32-bit code to pass a range such that even the 64-bit
     layout wouldn't cross a page boundary (and be suitably aligned). I
     also don't see any other good solution, so for now a crude approach
     with an extra boolean is used (using has_32bit_shinfo() isn't race
     free and could hence lead to overrunning the mapped space).
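
For context, a rough sketch of the guest-side snapshot loop which the
flag/barrier ordering used here caters for (helper name and the
read_atomic()/smp_rmb() primitives merely stand in for whatever the
guest environment provides):

static void runstate_snapshot(const struct vcpu_runstate_info *area,
                              struct vcpu_runstate_info *snap)
{
    uint64_t t;

    do {
        t = read_atomic(&area->state_entry_time);
        smp_rmb();              /* pairs with the hypervisor's smp_wmb() */
        *snap = *area;
        smp_rmb();
    } while ( (t & XEN_RUNSTATE_UPDATE) ||       /* update in progress? */
              t != read_atomic(&area->state_entry_time) );
}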

--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1599,14 +1599,55 @@ bool update_runstate_area(struct vcpu *v
     struct guest_memory_policy policy = { };
     void __user *guest_handle = NULL;
     struct vcpu_runstate_info runstate;
+    struct vcpu_runstate_info *map = v->runstate_guest_area.map;
+
+    memcpy(&runstate, &v->runstate, sizeof(runstate));
+
+    if ( map )
+    {
+        uint64_t *pset = NULL;
+#ifdef CONFIG_COMPAT
+        struct compat_vcpu_runstate_info *cmap = NULL;
+
+        if ( v->runstate_guest_area_compat )
+            cmap = (void *)map;
+#endif
+
+        if ( VM_ASSIST(v->domain, runstate_update_flag) )
+        {
+#ifdef CONFIG_COMPAT
+            if ( cmap )
+                pset = &cmap->state_entry_time;
+            else
+#endif
+                pset = &map->state_entry_time;
+            runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+            write_atomic(pset, runstate.state_entry_time);
+            smp_wmb();
+        }
+
+#ifdef CONFIG_COMPAT
+        if ( cmap )
+            XLAT_vcpu_runstate_info(cmap, &runstate);
+        else
+#endif
+            *map = runstate;
+
+        if ( pset )
+        {
+            smp_wmb();
+            runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+            write_atomic(pset, runstate.state_entry_time);
+        }
+
+        return true;
+    }
 
     if ( guest_handle_is_null(runstate_guest(v)) )
         return true;
 
     update_guest_memory_policy(v, &policy);
 
-    memcpy(&runstate, &v->runstate, sizeof(runstate));
-
     if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
 #ifdef CONFIG_COMPAT
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -231,6 +231,8 @@ struct vcpu
 #ifdef CONFIG_COMPAT
     /* A hypercall is using the compat ABI? */
     bool             hcall_compat;
+    /* Physical runstate area registered via compat ABI? */
+    bool             runstate_guest_area_compat;
 #endif
 
 #ifdef CONFIG_IOREQ_SERVER



* [PATCH RFC 05/10] x86: update GADDR based secondary time area
From: Jan Beulich @ 2022-10-19  7:41 UTC
  To: xen-devel; +Cc: Andrew Cooper, George Dunlap, Wei Liu, Roger Pau Monné

Before adding a new vCPU operation to register the secondary time area
by guest-physical address, add code to actually keep such areas up-to-
date.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC: Pages aren't marked dirty when written to (matching the handling of
     space mapped by map_vcpu_info() afaict), on the basis that the
     registrations are lost anyway across migration. Plus the contents
     of the areas in question have to be deemed volatile in the first
     place (so saving a "most recent" value is pretty meaningless even
     for e.g. snapshotting).
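
For context, write_time_guest_area() below follows the usual version
counter protocol, which a guest-side reader would pair with roughly like
this (sketch only; helper name and primitives are placeholders):

static void time_info_snapshot(const struct vcpu_time_info *area,
                               struct vcpu_time_info *snap)
{
    uint32_t ver;

    do {
        ver = read_atomic(&area->version);
        smp_rmb();
        *snap = *area;
        smp_rmb();
    } while ( (ver & 1) ||                 /* odd => update in progress */
              ver != read_atomic(&area->version) );
}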

--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -1462,12 +1462,34 @@ static void __update_vcpu_system_time(st
         v->arch.pv.pending_system_time = _u;
 }
 
+static void write_time_guest_area(struct vcpu_time_info *map,
+                                  const struct vcpu_time_info *src)
+{
+    /* 1. Update userspace version. */
+    write_atomic(&map->version, src->version);
+    smp_wmb();
+
+    /* 2. Update all other userspace fields. */
+    *map = *src;
+
+    /* 3. Update userspace version again. */
+    smp_wmb();
+    write_atomic(&map->version, version_update_end(src->version));
+}
+
 bool update_secondary_system_time(struct vcpu *v,
                                   struct vcpu_time_info *u)
 {
     XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v->arch.time_info_guest;
+    struct vcpu_time_info *map = v->arch.time_guest_area.map;
     struct guest_memory_policy policy = { .nested_guest_mode = false };
 
+    if ( map )
+    {
+        write_time_guest_area(map, u);
+        return true;
+    }
+
     if ( guest_handle_is_null(user_u) )
         return true;
 



* [PATCH 06/10] x86/mem-sharing: copy GADDR based shared guest areas
From: Jan Beulich @ 2022-10-19  7:42 UTC
  To: xen-devel
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné,
	Tamas K Lengyel

In preparation for the introduction of new vCPU operations allowing the
respective areas (one of the two is x86-specific) to be registered by
guest-physical address, add the necessary fork handling (with the
backing function yet to be filled in).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1653,6 +1653,65 @@ static void copy_vcpu_nonreg_state(struc
     hvm_set_nonreg_state(cd_vcpu, &nrs);
 }
 
+static int copy_guest_area(struct guest_area *cd_area,
+                           const struct guest_area *d_area,
+                           struct vcpu *cd_vcpu,
+                           const struct domain *d)
+{
+    mfn_t d_mfn, cd_mfn;
+
+    if ( !d_area->pg )
+        return 0;
+
+    d_mfn = page_to_mfn(d_area->pg);
+
+    /* Allocate & map a page for the area if it hasn't been already. */
+    if ( !cd_area->pg )
+    {
+        gfn_t gfn = mfn_to_gfn(d, d_mfn);
+        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
+        p2m_type_t p2mt;
+        p2m_access_t p2ma;
+        unsigned int offset;
+        int ret;
+
+        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
+        if ( mfn_eq(cd_mfn, INVALID_MFN) )
+        {
+            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
+
+            if ( !pg )
+                return -ENOMEM;
+
+            cd_mfn = page_to_mfn(pg);
+            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
+
+            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
+                                 p2m->default_access, -1);
+            if ( ret )
+                return ret;
+        }
+        else if ( p2mt != p2m_ram_rw )
+            return -EBUSY;
+
+        /*
+         * Simply specify the entire range up to the end of the page. All the
+         * function uses it for is a check for not crossing page boundaries.
+         */
+        offset = PAGE_OFFSET(d_area->map);
+        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
+                             PAGE_SIZE - offset, cd_area, NULL);
+        if ( ret )
+            return ret;
+    }
+    else
+        cd_mfn = page_to_mfn(cd_area->pg);
+
+    copy_domain_page(cd_mfn, d_mfn);
+
+    return 0;
+}
+
 static int copy_vpmu(struct vcpu *d_vcpu, struct vcpu *cd_vcpu)
 {
     struct vpmu_struct *d_vpmu = vcpu_vpmu(d_vcpu);
@@ -1745,6 +1804,16 @@ static int copy_vcpu_settings(struct dom
             copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
         }
 
+        /* Same for the (physically registered) runstate and time info areas. */
+        ret = copy_guest_area(&cd_vcpu->runstate_guest_area,
+                              &d_vcpu->runstate_guest_area, cd_vcpu, d);
+        if ( ret )
+            return ret;
+        ret = copy_guest_area(&cd_vcpu->arch.time_guest_area,
+                              &d_vcpu->arch.time_guest_area, cd_vcpu, d);
+        if ( ret )
+            return ret;
+
         ret = copy_vpmu(d_vcpu, cd_vcpu);
         if ( ret )
             return ret;
@@ -1987,7 +2056,10 @@ int mem_sharing_fork_reset(struct domain
 
  state:
     if ( reset_state )
+    {
         rc = copy_settings(d, pd);
+        /* TBD: What to do here with -ERESTART? */
+    }
 
     domain_unpause(d);
 
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1559,6 +1559,13 @@ void unmap_vcpu_info(struct vcpu *v)
     put_page_and_type(mfn_to_page(mfn));
 }
 
+int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size,
+                   struct guest_area *area,
+                   void (*populate)(void *dst, struct vcpu *v))
+{
+    return -EOPNOTSUPP;
+}
+
 /*
  * This is only intended to be used for domain cleanup (or more generally only
  * with at least the respective vCPU, if it's not the current one, reliably



* [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
From: Jan Beulich @ 2022-10-19  7:43 UTC
  To: xen-devel
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné

The registration by virtual/linear address has downsides: At least on
x86 the access is expensive for HVM/PVH domains. Furthermore for 64-bit
PV domains the areas are inaccessible (and hence cannot be updated by
Xen) when in guest-user mode.

In preparation for the introduction of new vCPU operations allowing the
respective areas (one of the two is x86-specific) to be registered by
guest-physical address, flesh out the map/unmap functions.

Noteworthy differences from map_vcpu_info():
- areas can be registered more than once (and de-registered),
- remote vCPU-s are paused rather than checked for being down (which in
  principle can change right after the check),
- the domain lock is taken for a much smaller region.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC: By using global domain page mappings the demand on the underlying
     VA range may increase significantly. I did consider using per-domain
     mappings instead, but they exist for x86 only. Of course we
     could have arch_{,un}map_guest_area() aliasing global domain page
     mapping functions on Arm and using per-domain mappings on x86. Yet
     then again map_vcpu_info() doesn't do so either (albeit that's
     likely to be converted subsequently to use map_vcpu_area() anyway).

RFC: In map_guest_area() I'm not checking the P2M type, instead - just
     like map_vcpu_info() - solely relying on the type ref acquisition.
     Checking for p2m_ram_rw alone would be wrong, as at least
     p2m_ram_logdirty ought to also be okay to use here (and in similar
     cases, e.g. in Argo's find_ring_mfn()). p2m_is_pageable() could be
     used here (like altp2m_vcpu_enable_ve() does) as well as in
     map_vcpu_info(), yet then again the P2M type is stale by the time
     it is being looked at anyway without the P2M lock held.
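
Purely for illustration, the constraints map_guest_area() enforces on
the passed-in address boil down to the following check (hypothetical
guest-side helper; "align" would be alignof(xen_ulong_t) or its compat
equivalent):

static bool guest_area_placement_ok(uint64_t gaddr, size_t size, size_t align)
{
    /* The area must be suitably aligned ... */
    if ( gaddr & (align - 1) )
        return false;

    /* ... and must not cross a page boundary. */
    return (gaddr >> PAGE_SHIFT) == ((gaddr + size - 1) >> PAGE_SHIFT);
}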

--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
                    struct guest_area *area,
                    void (*populate)(void *dst, struct vcpu *v))
 {
-    return -EOPNOTSUPP;
+    struct domain *currd = v->domain;
+    void *map = NULL;
+    struct page_info *pg = NULL;
+    int rc = 0;
+
+    if ( gaddr )
+    {
+        unsigned long gfn = PFN_DOWN(gaddr);
+        unsigned int align;
+        p2m_type_t p2mt;
+
+        if ( gfn != PFN_DOWN(gaddr + size - 1) )
+            return -ENXIO;
+
+#ifdef CONFIG_COMPAT
+        if ( has_32bit_shinfo(currd) )
+            align = alignof(compat_ulong_t);
+        else
+#endif
+            align = alignof(xen_ulong_t);
+        if ( gaddr & (align - 1) )
+            return -ENXIO;
+
+        rc = check_get_page_from_gfn(currd, _gfn(gfn), false, &p2mt, &pg);
+        if ( rc )
+            return rc;
+
+        if ( !get_page_type(pg, PGT_writable_page) )
+        {
+            put_page(pg);
+            return -EACCES;
+        }
+
+        map = __map_domain_page_global(pg);
+        if ( !map )
+        {
+            put_page_and_type(pg);
+            return -ENOMEM;
+        }
+        map += PAGE_OFFSET(gaddr);
+    }
+
+    if ( v != current )
+    {
+        if ( !spin_trylock(&currd->hypercall_deadlock_mutex) )
+        {
+            rc = -ERESTART;
+            goto unmap;
+        }
+
+        vcpu_pause(v);
+
+        spin_unlock(&currd->hypercall_deadlock_mutex);
+    }
+
+    domain_lock(currd);
+
+    if ( map )
+        populate(map, v);
+
+    SWAP(area->pg, pg);
+    SWAP(area->map, map);
+
+    domain_unlock(currd);
+
+    if ( v != current )
+        vcpu_unpause(v);
+
+ unmap:
+    if ( pg )
+    {
+        unmap_domain_page_global(map);
+        put_page_and_type(pg);
+    }
+
+    return rc;
 }
 
 /*
@@ -1573,6 +1648,22 @@ int map_guest_area(struct vcpu *v, paddr
  */
 void unmap_guest_area(struct vcpu *v, struct guest_area *area)
 {
+    struct domain *d = v->domain;
+    void *map;
+    struct page_info *pg;
+
+    domain_lock(d);
+    map = area->map;
+    area->map = NULL;
+    pg = area->pg;
+    area->pg = NULL;
+    domain_unlock(d);
+
+    if ( pg )
+    {
+        unmap_domain_page_global(map);
+        put_page_and_type(pg);
+    }
 }
 
 int default_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)



* [PATCH 08/10] domain: introduce GADDR based runstate area registration alternative
From: Jan Beulich @ 2022-10-19  7:44 UTC
  To: xen-devel
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné

The registration by virtual/linear address has downsides: At least on
x86 the access is expensive for HVM/PVH domains. Furthermore for 64-bit
PV domains the area is inaccessible (and hence cannot be updated by Xen)
when in guest-user mode.

Introduce a new vCPU operation allowing the runstate area to be
registered by guest-physical address.

An at least theoretical downside to using physically registered areas is
that PV then won't see dirty (and perhaps also accessed) bits set in its
respective page table entries.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
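
A minimal sketch of the intended guest-side use, assuming a Linux-style
HYPERVISOR_vcpu_op() wrapper and a virt_to_phys() yielding a
guest-physical (GFN space) address; neither of these is part of this
series:

static int register_runstate_phys(unsigned int vcpu,
                                  struct vcpu_runstate_info *ri)
{
    struct vcpu_register_runstate_memory_area area = {
        .addr.p = virt_to_phys(ri), /* must not cross a page boundary */
    };

    /* Passing an address of zero de-registers the area again. */
    return HYPERVISOR_vcpu_op(VCPUOP_register_runstate_phys_area, vcpu, &area);
}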

--- a/xen/arch/x86/x86_64/domain.c
+++ b/xen/arch/x86/x86_64/domain.c
@@ -12,6 +12,22 @@
 CHECK_vcpu_get_physid;
 #undef xen_vcpu_get_physid
 
+static void cf_check
+runstate_area_populate(void *map, struct vcpu *v)
+{
+    if ( is_pv_vcpu(v) )
+        v->arch.pv.need_update_runstate_area = false;
+
+    v->runstate_guest_area_compat = true;
+
+    if ( v == current )
+    {
+        struct compat_vcpu_runstate_info *info = map;
+
+        XLAT_vcpu_runstate_info(info, &v->runstate);
+    }
+}
+
 int
 compat_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
 {
@@ -57,6 +73,25 @@ compat_vcpu_op(int cmd, unsigned int vcp
 
         break;
     }
+
+    case VCPUOP_register_runstate_phys_area:
+    {
+        struct compat_vcpu_register_runstate_memory_area area;
+
+        rc = -EFAULT;
+        if ( copy_from_guest(&area.addr.p, arg, 1) )
+            break;
+
+        rc = map_guest_area(v, area.addr.p,
+                            sizeof(struct compat_vcpu_runstate_info),
+                            &v->runstate_guest_area,
+                            runstate_area_populate);
+        if ( rc == -ERESTART )
+            rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
+                                               cmd, vcpuid, arg);
+
+        break;
+    }
 
     case VCPUOP_register_vcpu_time_memory_area:
     {
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1789,6 +1789,26 @@ bool update_runstate_area(struct vcpu *v
     return rc;
 }
 
+static void cf_check
+runstate_area_populate(void *map, struct vcpu *v)
+{
+#ifdef CONFIG_PV
+    if ( is_pv_vcpu(v) )
+        v->arch.pv.need_update_runstate_area = false;
+#endif
+
+#ifdef CONFIG_COMPAT
+    v->runstate_guest_area_compat = false;
+#endif
+
+    if ( v == current )
+    {
+        struct vcpu_runstate_info *info = map;
+
+        *info = v->runstate;
+    }
+}
+
 long common_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
 {
     long rc = 0;
@@ -1963,6 +1983,25 @@ long common_vcpu_op(int cmd, struct vcpu
 
         break;
     }
+
+    case VCPUOP_register_runstate_phys_area:
+    {
+        struct vcpu_register_runstate_memory_area area;
+
+        rc = -EFAULT;
+        if ( copy_from_guest(&area.addr.p, arg, 1) )
+            break;
+
+        rc = map_guest_area(v, area.addr.p,
+                            sizeof(struct vcpu_runstate_info),
+                            &v->runstate_guest_area,
+                            runstate_area_populate);
+        if ( rc == -ERESTART )
+            rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
+                                               cmd, vcpuid, arg);
+
+        break;
+    }
 
     default:
         rc = -ENOSYS;
--- a/xen/include/public/vcpu.h
+++ b/xen/include/public/vcpu.h
@@ -235,6 +235,15 @@ struct vcpu_register_time_memory_area {
 typedef struct vcpu_register_time_memory_area vcpu_register_time_memory_area_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_register_time_memory_area_t);
 
+/*
+ * Like the respective VCPUOP_register_*_memory_area, just using the "addr.p"
+ * field of the supplied struct as a guest physical address (i.e. in GFN space).
+ * The respective area may not cross a page boundary.  Pass 0 to unregister an
+ * area.  Note that as long as an area is registered by physical address, the
+ * linear address based area will not be serviced (updated) by the hypervisor.
+ */
+#define VCPUOP_register_runstate_phys_area      14
+
 #endif /* __XEN_PUBLIC_VCPU_H__ */
 
 /*



* [PATCH 09/10] x86: introduce GADDR based secondary time area registration alternative
From: Jan Beulich @ 2022-10-19  7:45 UTC
  To: xen-devel
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné

The registration by virtual/linear address has downsides: The access is
expensive for HVM/PVH domains. Furthermore for 64-bit PV domains the area
is inaccessible (and hence cannot be updated by Xen) when in guest-user
mode.

Introduce a new vCPU operation allowing the secondary time area to be
registered by guest-physical address.

An at least theoretical downside to using physically registered areas is
that PV then won't see dirty (and perhaps also accessed) bits set in its
respective page table entries.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1520,6 +1520,15 @@ int arch_vcpu_reset(struct vcpu *v)
     return 0;
 }
 
+static void cf_check
+time_area_populate(void *map, struct vcpu *v)
+{
+    if ( is_pv_vcpu(v) )
+        v->arch.pv.pending_system_time.version = 0;
+
+    force_update_secondary_system_time(v, map);
+}
+
 long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
 {
     long rc = 0;
@@ -1557,6 +1566,25 @@ long do_vcpu_op(int cmd, unsigned int vc
 
         break;
     }
+
+    case VCPUOP_register_vcpu_time_phys_area:
+    {
+        struct vcpu_register_time_memory_area area;
+
+        rc = -EFAULT;
+        if ( copy_from_guest(&area.addr.p, arg, 1) )
+            break;
+
+        rc = map_guest_area(v, area.addr.p,
+                            sizeof(vcpu_time_info_t),
+                            &v->arch.time_guest_area,
+                            time_area_populate);
+        if ( rc == -ERESTART )
+            rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
+                                               cmd, vcpuid, arg);
+
+        break;
+    }
 
     case VCPUOP_get_physid:
     {
--- a/xen/arch/x86/x86_64/domain.c
+++ b/xen/arch/x86/x86_64/domain.c
@@ -115,6 +115,7 @@ compat_vcpu_op(int cmd, unsigned int vcp
 
     case VCPUOP_send_nmi:
     case VCPUOP_get_physid:
+    case VCPUOP_register_vcpu_time_phys_area:
         rc = do_vcpu_op(cmd, vcpuid, arg);
         break;
 
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -684,6 +684,8 @@ void domain_cpu_policy_changed(struct do
 
 bool update_secondary_system_time(struct vcpu *,
                                   struct vcpu_time_info *);
+void force_update_secondary_system_time(struct vcpu *,
+                                        struct vcpu_time_info *);
 
 void vcpu_show_execution_state(struct vcpu *);
 void vcpu_show_registers(const struct vcpu *);
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -1524,6 +1524,15 @@ void force_update_vcpu_system_time(struc
     __update_vcpu_system_time(v, 1);
 }
 
+void force_update_secondary_system_time(struct vcpu *v,
+                                        struct vcpu_time_info *map)
+{
+    struct vcpu_time_info u;
+
+    collect_time_info(v, &u);
+    write_time_guest_area(map, &u);
+}
+
 static void update_domain_rtc(void)
 {
     struct domain *d;
--- a/xen/include/public/vcpu.h
+++ b/xen/include/public/vcpu.h
@@ -243,6 +243,7 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_register_ti
  * linear address based area will not be serviced (updated) by the hypervisor.
  */
 #define VCPUOP_register_runstate_phys_area      14
+#define VCPUOP_register_vcpu_time_phys_area     15
 
 #endif /* __XEN_PUBLIC_VCPU_H__ */
 



* [PATCH RFC 10/10] common: convert vCPU info area registration
From: Jan Beulich @ 2022-10-19  7:45 UTC
  To: xen-devel
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Roger Pau Monné,
	Tamas K Lengyel

Switch to using map_guest_area(). Noteworthy differences from
map_vcpu_info():
- remote vCPU-s are paused rather than checked for being down (which in
  principle can change right after the check),
- the domain lock is taken for a much smaller region,
- areas could now be registered more than once, if we want this,
- as a corner case registering the area at GFN 0 offset 0 is no longer
  possible (this is considered an invalid de-registration request).

Note that this eliminates a bug in copy_vcpu_settings(): The function
did allocate a new page regardless of the GFN already having a mapping,
thus in particular breaking the case of two vCPU-s having their info
areas on the same page.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC: While the "no re-registration" check is retained, it is now racy.
     If we really think it needs retaining _and_ properly enforcing,
     then locking will be needed, but I don't think we can hold the
     domain lock around a call to map_guest_area(), first and foremost
     because of its use of vcpu_pause().

RFC: Is the GFN 0 offset 0 behavioral change deemed acceptable?

RFC: To have just a single instance of casts to vcpu_info_t *,
     introducing a macro (or inline function if header dependencies
     permit) might be nice. However, the only sensible identifier for it
     would imo be vcpu_info(). Yet we already have a macro of that name.
     With some trickery it might be possible to overload the macro
     (making the "field" argument optional), but this may not be
     desirable for other reasons (it could e.g. be deemed confusing).
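
FWIW, one possible shape of such a helper, using a deliberately
different (purely illustrative) name to sidestep the clash with the
existing vcpu_info() macro:

static inline vcpu_info_t *vcpu_info_ptr(struct vcpu *v)
{
    return v->vcpu_info_area.map;
}

The GET_SET_VCPU() wrappers in asm/shared.h could then dereference
vcpu_info_ptr(v)->native.arch.field / ->compat.arch.field instead of
open-coding the casts.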

--- a/xen/arch/x86/include/asm/shared.h
+++ b/xen/arch/x86/include/asm/shared.h
@@ -27,16 +27,16 @@ static inline void arch_set_##field(stru
 static inline type arch_get_##field(const struct vcpu *v)       \
 {                                                               \
     return !has_32bit_shinfo(v->domain) ?                       \
-           v->vcpu_info->native.arch.field :                    \
-           v->vcpu_info->compat.arch.field;                     \
+           ((const vcpu_info_t *)v->vcpu_info_area.map)->native.arch.field : \
+           ((const vcpu_info_t *)v->vcpu_info_area.map)->compat.arch.field;  \
 }                                                               \
 static inline void arch_set_##field(struct vcpu *v,             \
                                     type val)                   \
 {                                                               \
     if ( !has_32bit_shinfo(v->domain) )                         \
-        v->vcpu_info->native.arch.field = val;                  \
+        ((vcpu_info_t *)v->vcpu_info_area.map)->native.arch.field = val; \
     else                                                        \
-        v->vcpu_info->compat.arch.field = val;                  \
+        ((vcpu_info_t *)v->vcpu_info_area.map)->compat.arch.field = val; \
 }
 
 #else
@@ -57,12 +57,12 @@ static inline void arch_set_##field(stru
 #define GET_SET_VCPU(type, field)                           \
 static inline type arch_get_##field(const struct vcpu *v)   \
 {                                                           \
-    return v->vcpu_info->arch.field;                        \
+    return ((const vcpu_info_t *)v->vcpu_info_area.map)->arch.field; \
 }                                                           \
 static inline void arch_set_##field(struct vcpu *v,         \
                                     type val)               \
 {                                                           \
-    v->vcpu_info->arch.field = val;                         \
+    ((vcpu_info_t *)v->vcpu_info_area.map)->arch.field = val; \
 }
 
 #endif
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1758,53 +1758,24 @@ static int copy_vpmu(struct vcpu *d_vcpu
 static int copy_vcpu_settings(struct domain *cd, const struct domain *d)
 {
     unsigned int i;
-    struct p2m_domain *p2m = p2m_get_hostp2m(cd);
     int ret = -EINVAL;
 
     for ( i = 0; i < cd->max_vcpus; i++ )
     {
         struct vcpu *d_vcpu = d->vcpu[i];
         struct vcpu *cd_vcpu = cd->vcpu[i];
-        mfn_t vcpu_info_mfn;
 
         if ( !d_vcpu || !cd_vcpu )
             continue;
 
-        /* Copy & map in the vcpu_info page if the guest uses one */
-        vcpu_info_mfn = d_vcpu->vcpu_info_mfn;
-        if ( !mfn_eq(vcpu_info_mfn, INVALID_MFN) )
-        {
-            mfn_t new_vcpu_info_mfn = cd_vcpu->vcpu_info_mfn;
-
-            /* Allocate & map the page for it if it hasn't been already */
-            if ( mfn_eq(new_vcpu_info_mfn, INVALID_MFN) )
-            {
-                gfn_t gfn = mfn_to_gfn(d, vcpu_info_mfn);
-                unsigned long gfn_l = gfn_x(gfn);
-                struct page_info *page;
-
-                if ( !(page = alloc_domheap_page(cd, 0)) )
-                    return -ENOMEM;
-
-                new_vcpu_info_mfn = page_to_mfn(page);
-                set_gpfn_from_mfn(mfn_x(new_vcpu_info_mfn), gfn_l);
-
-                ret = p2m->set_entry(p2m, gfn, new_vcpu_info_mfn,
-                                     PAGE_ORDER_4K, p2m_ram_rw,
-                                     p2m->default_access, -1);
-                if ( ret )
-                    return ret;
-
-                ret = map_vcpu_info(cd_vcpu, gfn_l,
-                                    PAGE_OFFSET(d_vcpu->vcpu_info));
-                if ( ret )
-                    return ret;
-            }
-
-            copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
-        }
-
-        /* Same for the (physically registered) runstate and time info areas. */
+        /*
+         * Copy and map the vcpu_info page and the (physically registered)
+         * runstate and time info areas.
+         */
+        ret = copy_guest_area(&cd_vcpu->vcpu_info_area,
+                              &d_vcpu->vcpu_info_area, cd_vcpu, d);
+        if ( ret )
+            return ret;
         ret = copy_guest_area(&cd_vcpu->runstate_guest_area,
                               &d_vcpu->runstate_guest_area, cd_vcpu, d);
         if ( ret )
--- a/xen/arch/x86/pv/shim.c
+++ b/xen/arch/x86/pv/shim.c
@@ -395,7 +395,7 @@ int pv_shim_shutdown(uint8_t reason)
     for_each_vcpu ( d, v )
     {
         /* Unmap guest vcpu_info page and runstate/time areas. */
-        unmap_vcpu_info(v);
+        unmap_guest_area(v, &v->vcpu_info_area);
         unmap_guest_area(v, &v->runstate_guest_area);
         unmap_guest_area(v, &v->arch.time_guest_area);
 
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -1438,7 +1438,7 @@ static void __update_vcpu_system_time(st
     struct vcpu_time_info *u = &vcpu_info(v, time), _u;
     const struct domain *d = v->domain;
 
-    if ( v->vcpu_info == NULL )
+    if ( !v->vcpu_info_area.map )
         return;
 
     collect_time_info(v, &_u);
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -53,7 +53,7 @@ void __dummy__(void)
 
     OFFSET(VCPU_processor, struct vcpu, processor);
     OFFSET(VCPU_domain, struct vcpu, domain);
-    OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info);
+    OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map);
     OFFSET(VCPU_trap_bounce, struct vcpu, arch.pv.trap_bounce);
     OFFSET(VCPU_thread_flags, struct vcpu, arch.flags);
     OFFSET(VCPU_event_addr, struct vcpu, arch.pv.event_callback_eip);
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -97,7 +97,7 @@ static void _show_registers(
     if ( context == CTXT_hypervisor )
         printk(" %pS", _p(regs->rip));
     printk("\nRFLAGS: %016lx   ", regs->rflags);
-    if ( (context == CTXT_pv_guest) && v && v->vcpu_info )
+    if ( (context == CTXT_pv_guest) && v && v->vcpu_info_area.map )
         printk("EM: %d   ", !!vcpu_info(v, evtchn_upcall_mask));
     printk("CONTEXT: %s", context_names[context]);
     if ( v && !is_idle_vcpu(v) )
--- a/xen/common/compat/domain.c
+++ b/xen/common/compat/domain.c
@@ -49,7 +49,7 @@ int compat_common_vcpu_op(int cmd, struc
     {
     case VCPUOP_initialise:
     {
-        if ( v->vcpu_info == &dummy_vcpu_info )
+        if ( v->vcpu_info_area.map == &dummy_vcpu_info )
             return -EINVAL;
 
 #ifdef CONFIG_HVM
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -127,10 +127,10 @@ static void vcpu_info_reset(struct vcpu
 {
     struct domain *d = v->domain;
 
-    v->vcpu_info = ((v->vcpu_id < XEN_LEGACY_MAX_VCPUS)
-                    ? (vcpu_info_t *)&shared_info(d, vcpu_info[v->vcpu_id])
-                    : &dummy_vcpu_info);
-    v->vcpu_info_mfn = INVALID_MFN;
+    v->vcpu_info_area.map =
+        ((v->vcpu_id < XEN_LEGACY_MAX_VCPUS)
+         ? (vcpu_info_t *)&shared_info(d, vcpu_info[v->vcpu_id])
+         : &dummy_vcpu_info);
 }
 
 static void vmtrace_free_buffer(struct vcpu *v)
@@ -951,7 +951,7 @@ int domain_kill(struct domain *d)
             return -ERESTART;
         for_each_vcpu ( d, v )
         {
-            unmap_vcpu_info(v);
+            unmap_guest_area(v, &v->vcpu_info_area);
             unmap_guest_area(v, &v->runstate_guest_area);
         }
         d->is_dying = DOMDYING_dead;
@@ -1406,7 +1406,7 @@ int domain_soft_reset(struct domain *d,
     for_each_vcpu ( d, v )
     {
         set_xen_guest_handle(runstate_guest(v), NULL);
-        unmap_vcpu_info(v);
+        unmap_guest_area(v, &v->vcpu_info_area);
         unmap_guest_area(v, &v->runstate_guest_area);
     }
 
@@ -1454,111 +1454,6 @@ int vcpu_reset(struct vcpu *v)
     return rc;
 }
 
-/*
- * Map a guest page in and point the vcpu_info pointer at it.  This
- * makes sure that the vcpu_info is always pointing at a valid piece
- * of memory, and it sets a pending event to make sure that a pending
- * event doesn't get missed.
- */
-int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned int offset)
-{
-    struct domain *d = v->domain;
-    void *mapping;
-    vcpu_info_t *new_info;
-    struct page_info *page;
-    unsigned int align;
-
-    if ( offset > (PAGE_SIZE - sizeof(*new_info)) )
-        return -ENXIO;
-
-#ifdef CONFIG_COMPAT
-    BUILD_BUG_ON(sizeof(*new_info) != sizeof(new_info->compat));
-    if ( has_32bit_shinfo(d) )
-        align = alignof(new_info->compat);
-    else
-#endif
-        align = alignof(*new_info);
-    if ( offset & (align - 1) )
-        return -ENXIO;
-
-    if ( !mfn_eq(v->vcpu_info_mfn, INVALID_MFN) )
-        return -EINVAL;
-
-    /* Run this command on yourself or on other offline VCPUS. */
-    if ( (v != current) && !(v->pause_flags & VPF_down) )
-        return -EINVAL;
-
-    page = get_page_from_gfn(d, gfn, NULL, P2M_UNSHARE);
-    if ( !page )
-        return -EINVAL;
-
-    if ( !get_page_type(page, PGT_writable_page) )
-    {
-        put_page(page);
-        return -EINVAL;
-    }
-
-    mapping = __map_domain_page_global(page);
-    if ( mapping == NULL )
-    {
-        put_page_and_type(page);
-        return -ENOMEM;
-    }
-
-    new_info = (vcpu_info_t *)(mapping + offset);
-
-    if ( v->vcpu_info == &dummy_vcpu_info )
-    {
-        memset(new_info, 0, sizeof(*new_info));
-#ifdef XEN_HAVE_PV_UPCALL_MASK
-        __vcpu_info(v, new_info, evtchn_upcall_mask) = 1;
-#endif
-    }
-    else
-    {
-        memcpy(new_info, v->vcpu_info, sizeof(*new_info));
-    }
-
-    v->vcpu_info = new_info;
-    v->vcpu_info_mfn = page_to_mfn(page);
-
-    /* Set new vcpu_info pointer /before/ setting pending flags. */
-    smp_wmb();
-
-    /*
-     * Mark everything as being pending just to make sure nothing gets
-     * lost.  The domain will get a spurious event, but it can cope.
-     */
-#ifdef CONFIG_COMPAT
-    if ( !has_32bit_shinfo(d) )
-        write_atomic(&new_info->native.evtchn_pending_sel, ~0);
-    else
-#endif
-        write_atomic(&vcpu_info(v, evtchn_pending_sel), ~0);
-    vcpu_mark_events_pending(v);
-
-    return 0;
-}
-
-/*
- * Unmap the vcpu info page if the guest decided to place it somewhere
- * else. This is used from domain_kill() and domain_soft_reset().
- */
-void unmap_vcpu_info(struct vcpu *v)
-{
-    mfn_t mfn = v->vcpu_info_mfn;
-
-    if ( mfn_eq(mfn, INVALID_MFN) )
-        return;
-
-    unmap_domain_page_global((void *)
-                             ((unsigned long)v->vcpu_info & PAGE_MASK));
-
-    vcpu_info_reset(v); /* NB: Clobbers v->vcpu_info_mfn */
-
-    put_page_and_type(mfn_to_page(mfn));
-}
-
 int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size,
                    struct guest_area *area,
                    void (*populate)(void *dst, struct vcpu *v))
@@ -1628,6 +1523,30 @@ int map_guest_area(struct vcpu *v, paddr
 
     domain_unlock(currd);
 
+    /* Set pending flags /after/ new vcpu_info pointer was set. */
+    if ( area == &v->vcpu_info_area )
+    {
+        /*
+         * Mark everything as being pending just to make sure nothing gets
+         * lost.  The domain will get a spurious event, but it can cope.
+         */
+#ifdef CONFIG_COMPAT
+        if ( !has_32bit_shinfo(currd) )
+        {
+            vcpu_info_t *info = area->map;
+
+            /* For VCPUOP_register_vcpu_info handling in common_vcpu_op(). */
+            BUILD_BUG_ON(sizeof(*info) != sizeof(info->compat));
+            write_atomic(&info->native.evtchn_pending_sel, ~0);
+        }
+        else
+#endif
+            write_atomic(&vcpu_info(v, evtchn_pending_sel), ~0);
+        vcpu_mark_events_pending(v);
+
+        force_update_vcpu_system_time(v);
+    }
+
     if ( v != current )
         vcpu_unpause(v);
 
@@ -1654,7 +1573,10 @@ void unmap_guest_area(struct vcpu *v, st
 
     domain_lock(d);
     map = area->map;
-    area->map = NULL;
+    if ( area == &v->vcpu_info_area )
+        vcpu_info_reset(v);
+    else
+        area->map = NULL;
     pg = area->pg;
     area->pg = NULL;
     domain_unlock(d);
@@ -1789,6 +1711,27 @@ bool update_runstate_area(struct vcpu *v
     return rc;
 }
 
+/*
+ * This makes sure that the vcpu_info is always pointing at a valid piece of
+ * memory, and it sets a pending event to make sure that a pending event
+ * doesn't get missed.
+ */
+static void cf_check
+vcpu_info_populate(void *map, struct vcpu *v)
+{
+    vcpu_info_t *info = map;
+
+    if ( v->vcpu_info_area.map == &dummy_vcpu_info )
+    {
+        memset(info, 0, sizeof(*info));
+#ifdef XEN_HAVE_PV_UPCALL_MASK
+        __vcpu_info(v, info, evtchn_upcall_mask) = 1;
+#endif
+    }
+    else
+        memcpy(info, v->vcpu_info_area.map, sizeof(*info));
+}
+
 static void cf_check
 runstate_area_populate(void *map, struct vcpu *v)
 {
@@ -1818,7 +1761,7 @@ long common_vcpu_op(int cmd, struct vcpu
     switch ( cmd )
     {
     case VCPUOP_initialise:
-        if ( v->vcpu_info == &dummy_vcpu_info )
+        if ( v->vcpu_info_area.map == &dummy_vcpu_info )
             return -EINVAL;
 
         rc = arch_initialise_vcpu(v, arg);
@@ -1942,16 +1885,30 @@ long common_vcpu_op(int cmd, struct vcpu
     case VCPUOP_register_vcpu_info:
     {
         struct vcpu_register_vcpu_info info;
+        paddr_t gaddr;
 
         rc = -EFAULT;
         if ( copy_from_guest(&info, arg, 1) )
             break;
 
-        domain_lock(d);
-        rc = map_vcpu_info(v, info.mfn, info.offset);
-        domain_unlock(d);
+        rc = -EINVAL;
+        gaddr = gfn_to_gaddr(_gfn(info.mfn)) + info.offset;
+        if ( !gaddr ||
+             gfn_x(gaddr_to_gfn(gaddr)) != info.mfn ||
+             /*
+              * Technically re-registration is okay, but it wasn't allowed
+              * originally.  By trying to win a race, a guest might be able
+              * to bypass this restriction.
+              */
+             v->vcpu_info_area.pg )
+            break;
 
-        force_update_vcpu_system_time(v);
+        /* See the BUILD_BUG_ON() in vcpu_info_populate(). */
+        rc = map_guest_area(v, gaddr, sizeof(vcpu_info_t),
+                            &v->vcpu_info_area, vcpu_info_populate);
+        if ( rc == -ERESTART )
+            rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
+                                               cmd, vcpuid, arg);
 
         break;
     }
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -79,9 +79,6 @@ void cf_check free_pirq_struct(void *);
 int  arch_vcpu_create(struct vcpu *v);
 void arch_vcpu_destroy(struct vcpu *v);
 
-int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned int offset);
-void unmap_vcpu_info(struct vcpu *v);
-
 int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size,
                    struct guest_area *area,
                    void (*populate)(void *dst, struct vcpu *v));
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -175,7 +175,7 @@ struct vcpu
 
     int              processor;
 
-    vcpu_info_t     *vcpu_info;
+    struct guest_area vcpu_info_area;
 
     struct domain   *domain;
 
@@ -288,9 +288,6 @@ struct vcpu
 
     struct waitqueue_vcpu *waitqueue_vcpu;
 
-    /* Guest-specified relocation of vcpu_info. */
-    mfn_t            vcpu_info_mfn;
-
     struct evtchn_fifo_vcpu *evtchn_fifo;
 
     /* vPCI per-vCPU area, used to store data for long running operations. */
--- a/xen/include/xen/shared.h
+++ b/xen/include/xen/shared.h
@@ -44,6 +44,7 @@ typedef struct vcpu_info vcpu_info_t;
 extern vcpu_info_t dummy_vcpu_info;
 
 #define shared_info(d, field)      __shared_info(d, (d)->shared_info, field)
-#define vcpu_info(v, field)        __vcpu_info(v, (v)->vcpu_info, field)
+#define vcpu_info(v, field)        \
+        __vcpu_info(v, (vcpu_info_t *)(v)->vcpu_info_area.map, field)
 
 #endif /* __XEN_SHARED_H__ */
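
As a point of reference for the converted VCPUOP_register_vcpu_info handling
above, a stand-alone sketch of how a guest drives this (unchanged) interface;
the hypercall wrapper is a stub so the example compiles, and the constant and
struct layout follow the public vcpu.h interface (treat them as illustrative
only, not as part of this patch):

#include <stdint.h>
#include <stdio.h>

/* Illustrative copies of the public interface pieces (see xen/include/public/vcpu.h). */
#define VCPUOP_register_vcpu_info 10
struct vcpu_register_vcpu_info {
    uint64_t mfn;     /* mfn of page to place vcpu_info */
    uint32_t offset;  /* offset within page */
    uint32_t rsvd;    /* unused */
};

/* Stubbed so the sketch builds; a real guest issues the actual hypercall. */
static long HYPERVISOR_vcpu_op(int cmd, unsigned int vcpu, void *arg)
{
    (void)cmd; (void)vcpu; (void)arg;
    return 0;
}

int main(void)
{
    /* Pretend this is the guest frame chosen to hold the vcpu_info. */
    static unsigned char page[4096] __attribute__((aligned(4096)));
    struct vcpu_register_vcpu_info info = {
        .mfn = (uintptr_t)page >> 12,  /* frame number; Xen rebuilds gaddr from mfn and offset */
        .offset = 0,
    };

    printf("rc=%ld\n", HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, 0, &info));
    return 0;
}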



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 06/10] x86/mem-sharing: copy GADDR based shared guest areas
  2022-10-19  7:42 ` [PATCH 06/10] x86/mem-sharing: copy GADDR based shared guest areas Jan Beulich
@ 2022-10-24 23:04   ` Tamas K Lengyel
  2022-10-25  6:18     ` Jan Beulich
  0 siblings, 1 reply; 47+ messages in thread
From: Tamas K Lengyel @ 2022-10-24 23:04 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Andrew Cooper, George Dunlap, Julien Grall,
	Stefano Stabellini, Wei Liu, Roger Pau Monné

[-- Attachment #1: Type: text/plain, Size: 798 bytes --]

> @@ -1987,7 +2056,10 @@ int mem_sharing_fork_reset(struct domain
>
>   state:
>      if ( reset_state )
> +    {
>          rc = copy_settings(d, pd);
> +        /* TBD: What to do here with -ERESTART? */

Generally speaking the fork reset operation does not support "restarting".
While in the memory op path the error can be propagated back to the
toolstack and have it re-issue it, on the monitor reply path that's not
possible. But the important question is where does the -ERESTART come
from?  What I think would happen here though is that -ERESTART may happen
during the initial fork op and that can fail, but if it succeeded, then
during reset it can't happen since everything would be already allocated
and mapped, the only thing during reset that would be done is the page
copying.
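
For the memory-op path mentioned above, the "re-issue" idea amounts to a
simple retry loop in the caller; a purely illustrative stand-alone model (the
memop itself is a made-up stub, not a real libxc call):

#include <errno.h>
#include <stdio.h>

#ifndef ERESTART
#define ERESTART 85  /* only needed to make the toy build everywhere */
#endif

/* Made-up stand-in for the fork-reset memop: asks to be restarted twice. */
static int fork_reset_stub(void)
{
    static int pending = 2;
    return pending-- > 0 ? -ERESTART : 0;
}

int main(void)
{
    int rc;

    /* Toolstack side: re-issue until the operation no longer asks for a
     * restart.  The monitor-reply path has no place for such a loop, which
     * is what makes -ERESTART problematic there. */
    do
        rc = fork_reset_stub();
    while ( rc == -ERESTART );

    printf("final rc = %d\n", rc);
    return 0;
}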

Tamas


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 06/10] x86/mem-sharing: copy GADDR based shared guest areas
  2022-10-24 23:04   ` Tamas K Lengyel
@ 2022-10-25  6:18     ` Jan Beulich
  0 siblings, 0 replies; 47+ messages in thread
From: Jan Beulich @ 2022-10-25  6:18 UTC (permalink / raw)
  To: Tamas K Lengyel
  Cc: xen-devel, Andrew Cooper, George Dunlap, Julien Grall,
	Stefano Stabellini, Wei Liu, Roger Pau Monné

On 25.10.2022 01:04, Tamas K Lengyel wrote:
>> @@ -1987,7 +2056,10 @@ int mem_sharing_fork_reset(struct domain
>>
>>   state:
>>      if ( reset_state )
>> +    {
>>          rc = copy_settings(d, pd);
>> +        /* TBD: What to do here with -ERESTART? */
> 
> Generally speaking the fork reset operation does not support "restarting".
> While in the memory op path the error can be propagated back to the
> toolstack and have it re-issue it, on the monitor reply path that's not
> possible. But the important question is where does the -ERESTART come
> from?

From map_guest_area() when d's hypercall deadlock mutex is busy. I
guess d is fully paused here, but checking for that to avoid the vCPU
pausing in map_guest_area() would end up fragile, I'm afraid.

Speaking of which - for the use of map_guest_area() here I guess it's
wrong for the function to have a local variable named "currd". I didn't
have this use here in mind when writing that function ...

>  What I think would happen here though is that -ERESTART may happen
> during the initial fork op and that can fail, but if it succeeded, then
> during reset it can't happen since everything would be already allocated
> and mapped, the only thing during reset that would be done is the page
> copying.

As per above I don't think there's any dependency on initial fork vs reset
here.

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 01/10] unify update_runstate_area()
  2022-10-19  7:38 ` [PATCH 01/10] unify update_runstate_area() Jan Beulich
@ 2022-11-24 20:43   ` Julien Grall
  0 siblings, 0 replies; 47+ messages in thread
From: Julien Grall @ 2022-11-24 20:43 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	Bertrand Marquis, Volodymyr Babchuk

Hi Jan,

On 19/10/2022 09:38, Jan Beulich wrote:
> x86'es variant is a superset of Arm's, with CONFIG_COMPAT parts already
> properly marked. The only other missing piece is
> update_guest_memory_policy(): For the time being Arm simply gains an
> empty struct and inline function.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Julien Grall <jgrall@amazon.com>

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2022-10-19  7:43 ` [PATCH RFC 07/10] domain: map/unmap " Jan Beulich
@ 2022-11-24 21:29   ` Julien Grall
  2022-11-28  9:01     ` Jan Beulich
  2023-01-17 22:20     ` Andrew Cooper
  2023-01-17 22:04   ` Andrew Cooper
  1 sibling, 2 replies; 47+ messages in thread
From: Julien Grall @ 2022-11-24 21:29 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné

Hi Jan,

I am still digesting this series and replying with some initial comments.

On 19/10/2022 09:43, Jan Beulich wrote:
> The registration by virtual/linear address has downsides: At least on
> x86 the access is expensive for HVM/PVH domains. Furthermore for 64-bit
> PV domains the areas are inaccessible (and hence cannot be updated by
> Xen) when in guest-user mode.
> 
> In preparation of the introduction of new vCPU operations allowing to
> register the respective areas (one of the two is x86-specific) by
> guest-physical address, flesh out the map/unmap functions.
> 
> Noteworthy differences from map_vcpu_info():
> - areas can be registered more than once (and de-registered),
> - remote vCPU-s are paused rather than checked for being down (which in
>    principle can change right after the check),
> - the domain lock is taken for a much smaller region.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> RFC: By using global domain page mappings the demand on the underlying
>       VA range may increase significantly. I did consider to use per-
>       domain mappings instead, but they exist for x86 only. Of course we
>       could have arch_{,un}map_guest_area() aliasing global domain page
>       mapping functions on Arm and using per-domain mappings on x86. Yet
>       then again map_vcpu_info() doesn't do so either (albeit that's
>       likely to be converted subsequently to use map_vcpu_area() anyway).
> 
> RFC: In map_guest_area() I'm not checking the P2M type, instead - just
>       like map_vcpu_info() - solely relying on the type ref acquisition.
>       Checking for p2m_ram_rw alone would be wrong, as at least
>       p2m_ram_logdirty ought to also be okay to use here (and in similar
>       cases, e.g. in Argo's find_ring_mfn()). p2m_is_pageable() could be
>       used here (like altp2m_vcpu_enable_ve() does) as well as in
>       map_vcpu_info(), yet then again the P2M type is stale by the time
>       it is being looked at anyway without the P2M lock held.
> 
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
>                      struct guest_area *area,
>                      void (*populate)(void *dst, struct vcpu *v))
>   {
> -    return -EOPNOTSUPP;
> +    struct domain *currd = v->domain;
> +    void *map = NULL;
> +    struct page_info *pg = NULL;
> +    int rc = 0;
> +
> +    if ( gaddr )

0 is technically a valid (guest) physical address on Arm.

> +    {
> +        unsigned long gfn = PFN_DOWN(gaddr);

This could be gfn_t for adding some type safety.

> +        unsigned int align;
> +        p2m_type_t p2mt;
> +
> +        if ( gfn != PFN_DOWN(gaddr + size - 1) )
> +            return -ENXIO;
> +
> +#ifdef CONFIG_COMPAT
> +        if ( has_32bit_shinfo(currd) )
> +            align = alignof(compat_ulong_t);
> +        else
> +#endif
> +            align = alignof(xen_ulong_t);
> +        if ( gaddr & (align - 1) )
> +            return -ENXIO;
> +
> +        rc = check_get_page_from_gfn(currd, _gfn(gfn), false, &p2mt, &pg);
> +        if ( rc )
> +            return rc;
> +
> +        if ( !get_page_type(pg, PGT_writable_page) )
> +        {
> +            put_page(pg);
> +            return -EACCES;
> +        }
> +
> +        map = __map_domain_page_global(pg);
> +        if ( !map )
> +        {
> +            put_page_and_type(pg);
> +            return -ENOMEM;
> +        }
> +        map += PAGE_OFFSET(gaddr);
> +    }
> +
> +    if ( v != current )
> +    {
> +        if ( !spin_trylock(&currd->hypercall_deadlock_mutex) )
> +        {
> +            rc = -ERESTART;
> +            goto unmap;
> +        }
> +
> +        vcpu_pause(v);

AFAIU, the goal of vcpu_pause() here is to guarantee that the "area" 
will not be touched by another pCPU. However, looking at the function 
context_switch() we have:

sched_context_switched(prev, next);
_update_runstate_area();

The first function will set v->is_running to false (see 
vcpu_context_saved()). So I think the "area" could be touched even after 
vcpu_pause() has returned.

Therefore, I think we will need _update_runstate_area() (or 
update_runstate_area()) to be called first.

> +
> +        spin_unlock(&currd->hypercall_deadlock_mutex);
> +    }
> +
> +    domain_lock(currd);
> +
> +    if ( map )
> +        populate(map, v);
> +
> +    SWAP(area->pg, pg);
> +    SWAP(area->map, map);
> +
> +    domain_unlock(currd);
> +
> +    if ( v != current )
> +        vcpu_unpause(v);
> +
> + unmap:
> +    if ( pg )
> +    {
> +        unmap_domain_page_global(map);
> +        put_page_and_type(pg);
> +    }
> +
> +    return rc;
>   }
>   
>   /*
> @@ -1573,6 +1648,22 @@ int map_guest_area(struct vcpu *v, paddr
>    */
>   void unmap_guest_area(struct vcpu *v, struct guest_area *area)
>   {
> +    struct domain *d = v->domain;
> +    void *map;
> +    struct page_info *pg;

AFAIU, the assumption is the vCPU should be paused here. Should we add 
an ASSERT()?

> +
> +    domain_lock(d);
> +    map = area->map;
> +    area->map = NULL;
> +    pg = area->pg;
> +    area->pg = NULL;
> +    domain_unlock(d);
> +
> +    if ( pg )
> +    {
> +        unmap_domain_page_global(map);
> +        put_page_and_type(pg);
> +    }
>   }
>   
>   int default_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
> 

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2022-11-24 21:29   ` Julien Grall
@ 2022-11-28  9:01     ` Jan Beulich
  2022-11-29  8:40       ` Julien Grall
  2023-01-17 22:20     ` Andrew Cooper
  1 sibling, 1 reply; 47+ messages in thread
From: Jan Beulich @ 2022-11-28  9:01 UTC (permalink / raw)
  To: Julien Grall
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

On 24.11.2022 22:29, Julien Grall wrote:
> On 19/10/2022 09:43, Jan Beulich wrote:
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
>>                      struct guest_area *area,
>>                      void (*populate)(void *dst, struct vcpu *v))
>>   {
>> -    return -EOPNOTSUPP;
>> +    struct domain *currd = v->domain;
>> +    void *map = NULL;
>> +    struct page_info *pg = NULL;
>> +    int rc = 0;
>> +
>> +    if ( gaddr )
> 
> 0 is technically a valid (guest) physical address on Arm.

I guess it is everywhere; it certainly also is on x86. While perhaps a
little unfortunate in ordering, the public header changes coming only
in the following patches was the best way I could think of to split
this work into reasonable size pieces. There the special meaning of 0
is clearly documented. And I don't really see it as a meaningful
limitation to not allow guests to register such areas at address zero.

>> +    {
>> +        unsigned long gfn = PFN_DOWN(gaddr);
> 
> This could be gfn_t for adding some type safety.

Indeed I did consider doing so, but the resulting code would imo be
less legible. But this difference perhaps isn't significant enough
for me to object to changing, in case you (or others) think the
type safety is really a meaningful gain here.

>> +        unsigned int align;
>> +        p2m_type_t p2mt;
>> +
>> +        if ( gfn != PFN_DOWN(gaddr + size - 1) )
>> +            return -ENXIO;
>> +
>> +#ifdef CONFIG_COMPAT
>> +        if ( has_32bit_shinfo(currd) )
>> +            align = alignof(compat_ulong_t);
>> +        else
>> +#endif
>> +            align = alignof(xen_ulong_t);
>> +        if ( gaddr & (align - 1) )
>> +            return -ENXIO;
>> +
>> +        rc = check_get_page_from_gfn(currd, _gfn(gfn), false, &p2mt, &pg);
>> +        if ( rc )
>> +            return rc;
>> +
>> +        if ( !get_page_type(pg, PGT_writable_page) )
>> +        {
>> +            put_page(pg);
>> +            return -EACCES;
>> +        }
>> +
>> +        map = __map_domain_page_global(pg);
>> +        if ( !map )
>> +        {
>> +            put_page_and_type(pg);
>> +            return -ENOMEM;
>> +        }
>> +        map += PAGE_OFFSET(gaddr);
>> +    }
>> +
>> +    if ( v != current )
>> +    {
>> +        if ( !spin_trylock(&currd->hypercall_deadlock_mutex) )
>> +        {
>> +            rc = -ERESTART;
>> +            goto unmap;
>> +        }
>> +
>> +        vcpu_pause(v);
> 
> AFAIU, the goal of vcpu_pause() here is to guarantee that the "area" 
> will not be touched by another pCPU.

Hmm, I find the way you put it a little confusing, but yes. I'd express
it as "The purpose of the vcpu_pause() is to guarantee that the vCPU in
question won't use its own area while the location thereof is being
updated." This includes updates by Xen as well as guest side consumption
of the data (with focus on the former, yes).

> However, looking at the function context_switch() we have:
> 
> sched_context_switched(prev, next);
> _update_runstate_area();

With this really being

    _update_runstate_area(next);

...

> The first function will set v->is_running to false (see 
>> vcpu_context_saved()). So I think the "area" could be touched even after 
>> vcpu_pause() has returned.
> 
> Therefore, I think we will need _update_runstate_area() (or 
> update_runstate_area()) to be called first.

... I don't see a need for adjustment. The corresponding

    _update_runstate_area(prev);

sits quite a bit earlier in context_switch(). (Arm code is quite a bit
different, but this particular aspect looks to apply there as well.)

>> @@ -1573,6 +1648,22 @@ int map_guest_area(struct vcpu *v, paddr
>>    */
>>   void unmap_guest_area(struct vcpu *v, struct guest_area *area)
>>   {
>> +    struct domain *d = v->domain;
>> +    void *map;
>> +    struct page_info *pg;
> 
> AFAIU, the assumption is the vCPU should be paused here.

Yes, as the comment ahead of the function (introduced by an earlier
patch) says.

> Should we add an ASSERT()?

I was going from unmap_vcpu_info(), which had the same requirement,
while also only recording it by way of a comment. I certainly could
add an ASSERT(), but besides this being questionable as to the rules
set forth in ./CODING_STYLE I also view assertions of "paused" state
as being of limited use - the entity in question may become unpaused
on the clock cycle after the check was done. (But yes, such are no
different from e.g. the fair number of spin_is_locked() checks we
have scattered around, which don't really provide guarantees either.)

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2022-11-28  9:01     ` Jan Beulich
@ 2022-11-29  8:40       ` Julien Grall
  2022-11-29 14:02         ` Jan Beulich
  0 siblings, 1 reply; 47+ messages in thread
From: Julien Grall @ 2022-11-29  8:40 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

Hi Jan,

On 28/11/2022 10:01, Jan Beulich wrote:
> On 24.11.2022 22:29, Julien Grall wrote:
>> On 19/10/2022 09:43, Jan Beulich wrote:
>>> --- a/xen/common/domain.c
>>> +++ b/xen/common/domain.c
>>> @@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
>>>                       struct guest_area *area,
>>>                       void (*populate)(void *dst, struct vcpu *v))
>>>    {
>>> -    return -EOPNOTSUPP;
>>> +    struct domain *currd = v->domain;
>>> +    void *map = NULL;
>>> +    struct page_info *pg = NULL;
>>> +    int rc = 0;
>>> +
>>> +    if ( gaddr )
>>
>> 0 is technically a valid (guest) physical address on Arm.
> 
> I guess it is everywhere; it certainly also is on x86. While perhaps a
> little unfortunate in ordering, the public header changes coming only
> in the following patches was the best way I could think of to split
> this work into reasonable size pieces. There the special meaning of 0
> is clearly documented. And I don't really see it as a meaningful
> limitation to not allow guests to register such areas at address zero.
I would expect an OS to allocate the region using the generic physical 
allocator. This allocator may decide that '0' is a valid address and 
return it.

So I think your approach could potentially complicate the OS 
implementation. I think it would be better to use an all Fs value as 
this cannot be valid for this hypercall (we require at least 4-byte 
alignment).

> 
>>> +    {
>>> +        unsigned long gfn = PFN_DOWN(gaddr);
>>
>> This could be gfn_t for adding some type safety.
> 
> Indeed I did consider doing so, but the resulting code would imo be
> less legible. But this difference perhaps isn't significant enough
> for me to object to changing, in case you (or others) think the
> type safety is really a meaningful gain here.

In general, my preference is to always use the typesafe version because 
it reduces the number of "unsigned long".

[...]

>> The first function will set v->is_running to false (see
>> vcpu_context_saved()). So I think the "area" could be touched even after
>> vcpu_pause() has returned.
>>
>> Therefore, I think we will need _update_runstate_area() (or
>> update_runstate_area()) to be called first.
> 
> ... I don't see a need for adjustment. The corresponding
> 
>      _update_runstate_area(prev);
> 
> sits quite a bit earlier in context_switch(). (Arm code is quite a bit
> different, but this particular aspect looks to apply there as well.)

You are right. Sorry I misread the code.

> 
>>> @@ -1573,6 +1648,22 @@ int map_guest_area(struct vcpu *v, paddr
>>>     */
>>>    void unmap_guest_area(struct vcpu *v, struct guest_area *area)
>>>    {
>>> +    struct domain *d = v->domain;
>>> +    void *map;
>>> +    struct page_info *pg;
>>
>> AFAIU, the assumption is the vCPU should be paused here.
> 
> Yes, as the comment ahead of the function (introduced by an earlier
> patch) says.

Ah, I missed that. Thanks!

> 
>> Should we add an ASSERT()?
> 
> I was going from unmap_vcpu_info(), which had the same requirement,
> while also only recording it by way of a comment. I certainly could
> add an ASSERT(), but besides this being questionable as to the rules
> set forth in ./CODING_STYLE I also view assertions of "paused" state
> as being of limited use - the entity in question may become unpaused
> on the clock cycle after the check was done. 

That's correct. However, that race you mention is unlikely to happen 
*every* time. So there is a very high chance the ASSERT() will hit if 
miscalled.

> (But yes, such are no
> different from e.g. the fair number of spin_is_locked() checks we
> have scattered around, which don't really provide guarantees either.)
And that's fine to not provide the full guarantee. You are still 
probably going to catch 95% (if not more) of the callers that forgot to 
call it with the spin lock held.

At least for me, those ASSERTs() were super helpful during development 
in more than a few cases.

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2022-11-29  8:40       ` Julien Grall
@ 2022-11-29 14:02         ` Jan Beulich
  2022-12-06 18:27           ` Julien Grall
  0 siblings, 1 reply; 47+ messages in thread
From: Jan Beulich @ 2022-11-29 14:02 UTC (permalink / raw)
  To: Julien Grall
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

On 29.11.2022 09:40, Julien Grall wrote:
> On 28/11/2022 10:01, Jan Beulich wrote:
>> On 24.11.2022 22:29, Julien Grall wrote:
>>> On 19/10/2022 09:43, Jan Beulich wrote:
>>>> --- a/xen/common/domain.c
>>>> +++ b/xen/common/domain.c
>>>> @@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
>>>>                       struct guest_area *area,
>>>>                       void (*populate)(void *dst, struct vcpu *v))
>>>>    {
>>>> -    return -EOPNOTSUPP;
>>>> +    struct domain *currd = v->domain;
>>>> +    void *map = NULL;
>>>> +    struct page_info *pg = NULL;
>>>> +    int rc = 0;
>>>> +
>>>> +    if ( gaddr )
>>>
>>> 0 is technically a valid (guest) physical address on Arm.
>>
>> I guess it is everywhere; it certainly also is on x86. While perhaps a
>> little unfortunate in ordering, the public header changes coming only
>> in the following patches was the best way I could think of to split
>> this work into reasonable size pieces. There the special meaning of 0
>> is clearly documented. And I don't really see it as a meaningful
>> limitation to not allow guests to register such areas at address zero.
> I would expect an OS to allocate the region using the generic physical 
> allocator. This allocator may decide that '0' is a valid address and 
> return it.
> 
> So I think your approach could potentially complicate the OS 
> implementation. I think it would be better to use an all Fs value as 
> this cannot be valid for this hypercall (we require at least 4-byte 
> alignment).

Valid point, yet my avoiding of an all-Fs value was specifically with
compat callers in mind - the values would be different for these and
native ones, which would make the check more clumsy (otherwise it
could simply be "if ( ~gaddr )").

>>>> @@ -1573,6 +1648,22 @@ int map_guest_area(struct vcpu *v, paddr
>>>>     */
>>>>    void unmap_guest_area(struct vcpu *v, struct guest_area *area)
>>>>    {
>>>> +    struct domain *d = v->domain;
>>>> +    void *map;
>>>> +    struct page_info *pg;
>>>
>>> AFAIU, the assumption is the vCPU should be paused here.
>>
>> Yes, as the comment ahead of the function (introduced by an earlier
>> patch) says.
> 
> Ah, I missed that. Thanks!
> 
>>
>>> Should we add an ASSERT()?
>>
>> I was going from unmap_vcpu_info(), which had the same requirement,
>> while also only recording it by way of a comment. I certainly could
>> add an ASSERT(), but besides this being questionable as to the rules
>> set forth in ./CODING_STYLE I also view assertions of "paused" state
>> as being of limited use - the entity in question may become unpaused
>> on the clock cycle after the check was done. 
> 
> That's correct. However, that race you mention is unlikely to happen 
> *every* time. So there is a very high chance the ASSERT() will hit if 
> miscalled.
> 
>> (But yes, such are no
>> different from e.g. the fair number of spin_is_locked() checks we
>> have scattered around, which don't really provide guarantees either.)
> And that's fine to not provide the full guarantee. You are still 
> probably going to catch 95% (if not more) of the callers that forgot to 
> call it with the spin lock held.
> 
> At least for me, those ASSERTs() were super helpful during development 
> in more than a few cases.

Okay, I'll add one then, but in the earlier patch, matching the comment
that's introduced there.

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2022-11-29 14:02         ` Jan Beulich
@ 2022-12-06 18:27           ` Julien Grall
  2022-12-07  8:29             ` Jan Beulich
  0 siblings, 1 reply; 47+ messages in thread
From: Julien Grall @ 2022-12-06 18:27 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

Hi Jan,

On 29/11/2022 14:02, Jan Beulich wrote:
> On 29.11.2022 09:40, Julien Grall wrote:
>> On 28/11/2022 10:01, Jan Beulich wrote:
>>> On 24.11.2022 22:29, Julien Grall wrote:
>>>> On 19/10/2022 09:43, Jan Beulich wrote:
>>>>> --- a/xen/common/domain.c
>>>>> +++ b/xen/common/domain.c
>>>>> @@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
>>>>>                        struct guest_area *area,
>>>>>                        void (*populate)(void *dst, struct vcpu *v))
>>>>>     {
>>>>> -    return -EOPNOTSUPP;
>>>>> +    struct domain *currd = v->domain;
>>>>> +    void *map = NULL;
>>>>> +    struct page_info *pg = NULL;
>>>>> +    int rc = 0;
>>>>> +
>>>>> +    if ( gaddr )
>>>>
>>>> 0 is technically a valid (guest) physical address on Arm.
>>>
>>> I guess it is everywhere; it certainly also is on x86. While perhaps a
>>> little unfortunate in ordering, the public header changes coming only
>>> in the following patches was the best way I could think of to split
>>> this work into reasonable size pieces. There the special meaning of 0
>>> is clearly documented. And I don't really see it as a meaningful
>>> limitation to not allow guests to register such areas at address zero.
>> I would expect an OS to allocate the region using the generic physical
>> allocator. This allocator may decide that '0' is a valid address and
>> return it.
>>
>> So I think your approach could potentially complicate the OS
>> implementation. I think it would be better to use an all Fs value as
>> this cannot be valid for this hypercall (we require at least 4-byte
>> alignment).
> 
> Valid point, yet my avoiding of an all-Fs value was specifically with
> compat callers in mind - the values would be different for these and
> native ones, which would make the check more clumsy (otherwise it
> could simply be "if ( ~gaddr )").

Ah I forgot about compat. How about converting the 32-bit Fs to a 64-bit 
Fs in the compat code?

This will avoid adding a restriction in the hypercall interface just 
because of compat.

> 
>>>>> @@ -1573,6 +1648,22 @@ int map_guest_area(struct vcpu *v, paddr
>>>>>      */
>>>>>     void unmap_guest_area(struct vcpu *v, struct guest_area *area)
>>>>>     {
>>>>> +    struct domain *d = v->domain;
>>>>> +    void *map;
>>>>> +    struct page_info *pg;
>>>>
>>>> AFAIU, the assumption is the vCPU should be paused here.
>>>
>>> Yes, as the comment ahead of the function (introduced by an earlier
>>> patch) says.
>>
>> Ah, I missed that. Thanks!
>>
>>>
>>>> Should we add an ASSERT()?
>>>
>>> I was going from unmap_vcpu_info(), which had the same requirement,
>>> while also only recording it by way of a comment. I certainly could
>>> add an ASSERT(), but besides this being questionable as to the rules
>>> set forth in ./CODING_STYLE I also view assertions of "paused" state
>>> as being of limited use - the entity in question may become unpaused
>>> on the clock cycle after the check was done.
>>
>> That's correct. However, that race you mention is unlikely to happen
>> *every* time. So there is a very high chance the ASSERT() will hit if
>> miscalled.
>>
>>> (But yes, such are no
>>> different from e.g. the fair number of spin_is_locked() checks we
>>> have scattered around, which don't really provide guarantees either.)
>> And that's fine to not provide the full guarantee. You are still
>> probably going to catch 95% (if not more) of the callers that forgot to
>> call it with the spin lock held.
>>
>> At least for me, those ASSERTs() were super helpful during development
>> in more than a few cases.
> 
> Okay, I'll add one then, but in the earlier patch, matching the comment
> that's introduced there.

Thanks! I still owe you a review for the rest of the series.

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2022-12-06 18:27           ` Julien Grall
@ 2022-12-07  8:29             ` Jan Beulich
  0 siblings, 0 replies; 47+ messages in thread
From: Jan Beulich @ 2022-12-07  8:29 UTC (permalink / raw)
  To: Julien Grall
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

On 06.12.2022 19:27, Julien Grall wrote:
> On 29/11/2022 14:02, Jan Beulich wrote:
>> On 29.11.2022 09:40, Julien Grall wrote:
>>> On 28/11/2022 10:01, Jan Beulich wrote:
>>>> On 24.11.2022 22:29, Julien Grall wrote:
>>>>> On 19/10/2022 09:43, Jan Beulich wrote:
>>>>>> --- a/xen/common/domain.c
>>>>>> +++ b/xen/common/domain.c
>>>>>> @@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
>>>>>>                        struct guest_area *area,
>>>>>>                        void (*populate)(void *dst, struct vcpu *v))
>>>>>>     {
>>>>>> -    return -EOPNOTSUPP;
>>>>>> +    struct domain *currd = v->domain;
>>>>>> +    void *map = NULL;
>>>>>> +    struct page_info *pg = NULL;
>>>>>> +    int rc = 0;
>>>>>> +
>>>>>> +    if ( gaddr )
>>>>>
>>>>> 0 is technically a valid (guest) physical address on Arm.
>>>>
>>>> I guess it is everywhere; it certainly also is on x86. While perhaps a
>>>> little unfortunate in ordering, the public header changes coming only
>>>> in the following patches was the best way I could think of to split
>>>> this work into reasonable size pieces. There the special meaning of 0
>>>> is clearly documented. And I don't really see it as a meaningful
>>>> limitation to not allow guests to register such areas at address zero.
>>> I would expect an OS to allocate the region using the generic physical
>>> allocator. This allocator may decide that '0' is a valid address and
>>> return it.
>>>
>>> So I think your approach could potentially complicate the OS
>>> implementation. I think it would be better to use an all Fs value as
>>> this cannot be valid for this hypercall (we require at least 4-byte
>>> alignment).
>>
>> Valid point, yet my avoiding of an all-Fs value was specifically with
>> compat callers in mind - the values would be different for these and
>> native ones, which would make the check more clumsy (otherwise it
>> could simply be "if ( ~gaddr )").
> 
> Ah I forgot about compat. How about converting the 32-bit Fs to a 64-bit 
> Fs in the compat code?
> 
> This will avoid adding a restriction in the hypercall interface just 
> because of compat.

Possible, but ugly: Since we're using the uint64_t field of the interface
structure, no translation is presently necessary for
VCPUOP_register_vcpu_time_phys_area (we do have the layer for
VCPUOP_register_runstate_phys_area, because of the size of the pointed
to area being different). Hence what you ask for would mean adding compat
code somewhere. In which case we could as well make the above check
itself depend on whether we're dealing with a compat domain. Which is
what I've named "clumsy" above; still that would seem better to me than
to add actual compat translation code.

Then again, looking at the last patch, switching to what you suggest
would (largely) address one of the RFC remarks there as well. So I guess
I'll make the change to that if() plus associated adjustments elsewhere
(in particular in the last patch there is a similar check which needs to
remain in sync).
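
For illustration, the compat-dependent shape of that check could look roughly
like the stand-alone sketch below (has_32bit_shinfo() is reduced to a stub,
and none of this is the actual patch code); by contrast, widening a 32-bit
all-Fs value in a compat layer would let the check stay a plain comparison
against ~0ULL:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t paddr_t;

/* Stub standing in for the real predicate on the domain. */
static bool has_32bit_shinfo(bool compat_domain)
{
    return compat_domain;
}

/* "All Fs in the caller's width" is the de-registration sentinel. */
static bool deregistering(paddr_t gaddr, bool compat_domain)
{
    paddr_t sentinel = has_32bit_shinfo(compat_domain) ? (uint32_t)~0U
                                                       : ~(paddr_t)0;
    return gaddr == sentinel;
}

int main(void)
{
    printf("compat ~0u -> %d, native ~0ull -> %d, 0 -> %d\n",
           deregistering(UINT32_MAX, true),
           deregistering(~(paddr_t)0, false),
           deregistering(0, false));
    return 0;
}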

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 03/10] domain: GADDR based shared guest area registration alternative - teardown
  2022-10-19  7:40 ` [PATCH RFC 03/10] domain: GADDR based shared guest area registration alternative - teardown Jan Beulich
@ 2022-12-13 21:44   ` Julien Grall
  2022-12-14  9:12     ` Jan Beulich
  2023-01-17 21:17   ` Andrew Cooper
  1 sibling, 1 reply; 47+ messages in thread
From: Julien Grall @ 2022-12-13 21:44 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné

Hi Jan,

On 19/10/2022 08:40, Jan Beulich wrote:
> In preparation of the introduction of new vCPU operations allowing to
> register the respective areas (one of the two is x86-specific) by
> guest-physical address, add the necessary domain cleanup hooks.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> RFC: Zapping the areas in pv_shim_shutdown() may not be strictly
>       necessary: Aiui unmap_vcpu_info() is called only because the vCPU
>       info area cannot be re-registered. Beyond that I guess the
>       assumption is that the areas would only be re-registered as they
>       were before. If that's not the case I wonder whether the guest
>       handles for both areas shouldn't also be zapped.

I don't know the code well enough to be able to answer it.

The code itself looks good to me. With one remark below:

Reviewed-by: Julien Grall <jgrall@amazon.com>

[...]

> @@ -1555,6 +1559,15 @@ void unmap_vcpu_info(struct vcpu *v)
>       put_page_and_type(mfn_to_page(mfn));
>   }
>   
> +/*
> + * This is only intended to be used for domain cleanup (or more generally only
> + * with at least the respective vCPU, if it's not the current one, reliably
> + * paused).
> + */
> +void unmap_guest_area(struct vcpu *v, struct guest_area *area)
> +{

IIUC, you will add the ASSERT() we discussed in patch #7 in this patch. 
I would be fine if you keep my reviewed-by.

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 03/10] domain: GADDR based shared guest area registration alternative - teardown
  2022-12-13 21:44   ` Julien Grall
@ 2022-12-14  9:12     ` Jan Beulich
  0 siblings, 0 replies; 47+ messages in thread
From: Jan Beulich @ 2022-12-14  9:12 UTC (permalink / raw)
  To: Julien Grall
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

On 13.12.2022 22:44, Julien Grall wrote:
> On 19/10/2022 08:40, Jan Beulich wrote:
>> In preparation of the introduction of new vCPU operations allowing to
>> register the respective areas (one of the two is x86-specific) by
>> guest-physical address, add the necessary domain cleanup hooks.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> RFC: Zapping the areas in pv_shim_shutdown() may not be strictly
>>       necessary: Aiui unmap_vcpu_info() is called only because the vCPU
>>       info area cannot be re-registered. Beyond that I guess the
>>       assumption is that the areas would only be re-registered as they
>>       were before. If that's not the case I wonder whether the guest
>>       handles for both areas shouldn't also be zapped.
> 
> I don't know the code well enough to be able to answer it.

Right; I hope the original shim authors will be able to shed some light
on this.

> The code itself looks good to me. With one remark below:
> 
> Reviewed-by: Julien Grall <jgrall@amazon.com>

Thanks.

>> @@ -1555,6 +1559,15 @@ void unmap_vcpu_info(struct vcpu *v)
>>       put_page_and_type(mfn_to_page(mfn));
>>   }
>>   
>> +/*
>> + * This is only intended to be used for domain cleanup (or more generally only
>> + * with at least the respective vCPU, if it's not the current one, reliably
>> + * paused).
>> + */
>> +void unmap_guest_area(struct vcpu *v, struct guest_area *area)
>> +{
> 
> IIUC, you will add the ASSERT() we discussed in patch #7 in this patch. 
> I would be fine if you keep my reviewed-by.

And thanks again. Indeed this is what I have pending for v2:

/*
 * This is only intended to be used for domain cleanup (or more generally only
 * with at least the respective vCPU, if it's not the current one, reliably
 * paused).
 */
void unmap_guest_area(struct vcpu *v, struct guest_area *area)
{
    if ( v != current )
        ASSERT(atomic_read(&v->pause_count) | atomic_read(&v->domain->pause_count));
}

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 04/10] domain: update GADDR based runstate guest area
  2022-10-19  7:41 ` [PATCH RFC 04/10] domain: update GADDR based runstate guest area Jan Beulich
@ 2022-12-16 12:26   ` Julien Grall
  2022-12-19 12:48     ` Jan Beulich
  0 siblings, 1 reply; 47+ messages in thread
From: Julien Grall @ 2022-12-16 12:26 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné

Hi Jan,

On 19/10/2022 08:41, Jan Beulich wrote:
> Before adding a new vCPU operation to register the runstate area by
> guest-physical address, add code to actually keep such areas up-to-date.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> RFC: Pages aren't marked dirty when written to (matching the handling of
>       space mapped by map_vcpu_info() afaict), on the basis that the
>       registrations are lost anyway across migration. 

So I agree for the existing migration. But I wonder whether we would 
need to dirty those pages if we live-migrated a guest without its help 
(IOW the registrations would still be present).

Anyway, nothing to worry about yet as this is not supported upstream.

> Plus the contents
>       of the areas in question have to be deemed volatile in the first
>       place (so saving a "most recent" value is pretty meaningless even
>       for e.g. snapshotting).
> 
> RFC: Can we perhaps avoid the VM-assist conditionals, assuming the more
>       modern behavior to apply uniformly for gaddr-based registrations?

It is not clear why someone would want to use the old behavior with the 
new gaddr-based registrations. So I would say yes.

> 
> RFC: HVM guests (on x86) can change bitness and hence layout (and size!
>       and alignment) of the runstate area. I don't think it is an option
>       to require 32-bit code to pass a range such that even the 64-bit
>       layout wouldn't cross a page boundary (and be suitably aligned). I
>       also don't see any other good solution, so for now a crude approach
>       with an extra boolean is used (using has_32bit_shinfo() isn't race
>       free and could hence lead to overrunning the mapped space).

I think the extra check for 32-bit code to pass the check for 64-bit 
layout would be better.

> 
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -1599,14 +1599,55 @@ bool update_runstate_area(struct vcpu *v
>       struct guest_memory_policy policy = { };
>       void __user *guest_handle = NULL;
>       struct vcpu_runstate_info runstate;
> +    struct vcpu_runstate_info *map = v->runstate_guest_area.map;
> +
> +    memcpy(&runstate, &v->runstate, sizeof(runstate));
> +
> +    if ( map )
> +    {
> +        uint64_t *pset = NULL;
> +#ifdef CONFIG_COMPAT
> +        struct compat_vcpu_runstate_info *cmap = NULL;
> +
> +        if ( v->runstate_guest_area_compat )
> +            cmap = (void *)map;
> +#endif
> +
> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +        {
> +#ifdef CONFIG_COMPAT
> +            if ( cmap )
> +                pset = &cmap->state_entry_time;
> +            else
> +#endif
> +                pset = &map->state_entry_time;
> +            runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> +            write_atomic(pset, runstate.state_entry_time);
> +            smp_wmb();
> +        }
> +
> +#ifdef CONFIG_COMPAT
> +        if ( cmap )
> +            XLAT_vcpu_runstate_info(cmap, &runstate);
> +        else
> +#endif
> +            *map = runstate;
> +
> +        if ( pset )
> +        {
> +            smp_wmb();
> +            runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +            write_atomic(pset, runstate.state_entry_time);
> +        }
> +
> +        return true;
> +    }
>   
>       if ( guest_handle_is_null(runstate_guest(v)) )
>           return true;
>   
>       update_guest_memory_policy(v, &policy);
>   
> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
> -
>       if ( VM_ASSIST(v->domain, runstate_update_flag) )
>       {
>   #ifdef CONFIG_COMPAT
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -231,6 +231,8 @@ struct vcpu
>   #ifdef CONFIG_COMPAT
>       /* A hypercall is using the compat ABI? */
>       bool             hcall_compat;
> +    /* Physical runstate area registered via compat ABI? */
> +    bool             runstate_guest_area_compat;
>   #endif
>   
>   #ifdef CONFIG_IOREQ_SERVER
> 
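
As an aside on the XEN_RUNSTATE_UPDATE handling quoted above: it pairs with a
retry loop on the consumer side. A rough stand-alone model of such a reader
(cut-down structure, stubbed barrier, not actual guest code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define XEN_RUNSTATE_UPDATE (1ULL << 63)

struct runstate {                /* cut-down stand-in for vcpu_runstate_info */
    uint64_t state_entry_time;
    uint64_t time[4];
};

static struct runstate shared;   /* models the Xen-updated mapping */

static void rmb(void)            /* stand-in for the guest's read barrier */
{
    __atomic_thread_fence(__ATOMIC_ACQUIRE);
}

static void runstate_snapshot(struct runstate *out)
{
    uint64_t t1, t2;

    do {
        t1 = shared.state_entry_time;
        rmb();
        memcpy(out, &shared, sizeof(*out));
        rmb();
        t2 = shared.state_entry_time;
        /* Retry while an update is in flight or completed under our feet. */
    } while ( (t1 & XEN_RUNSTATE_UPDATE) || t1 != t2 );
}

int main(void)
{
    struct runstate snap;

    shared.state_entry_time = 42;
    runstate_snapshot(&snap);
    printf("state_entry_time = %llu\n", (unsigned long long)snap.state_entry_time);
    return 0;
}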

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 10/10] common: convert vCPU info area registration
  2022-10-19  7:45 ` [PATCH RFC 10/10] common: convert vCPU info area registration Jan Beulich
@ 2022-12-17 15:53   ` Julien Grall
  2022-12-19 15:01     ` Jan Beulich
  0 siblings, 1 reply; 47+ messages in thread
From: Julien Grall @ 2022-12-17 15:53 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	Tamas K Lengyel

Hi,

On 19/10/2022 08:45, Jan Beulich wrote:
> Switch to using map_guest_area(). Noteworthy differences from
> map_vcpu_info():
> - remote vCPU-s are paused rather than checked for being down (which in
>    principle can change right after the check),
> - the domain lock is taken for a much smaller region,
> - areas could now be registered more than once, if we want this,
> - as a corner case registering the area at GFN 0 offset 0 is no longer
>    possible (this is considered an invalid de-registration request).
> 
> Note that this eliminates a bug in copy_vcpu_settings(): The function
> did allocate a new page regardless of the GFN already having a mapping,
> thus in particular breaking the case of two vCPU-s having their info
> areas on the same page.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> RFC: While the "no re-registration" check is retained, it is now racy.
>       If we really think it needs retaining _and_ properly enforcing,
>       then locking will be needed, but I don't think we can hold the
>       domain lock around a call to map_guest_area(), first and foremost
>       because of its use of vcpu_pause().

A function like evtchn_2l_unmask() may access a vcpu_info that is not the 
current one.

So I believe the check needs to be retained and properly enforced. 
Otherwise, the old structure may still be used for a short time even if 
it has been freed.

> 
> RFC: Is the GFN 0 offset 0 behavioral change deemed acceptable?

I would say no (see the previous discussion for more details).

> 
> RFC: To have just a single instance of casts to vcpu_info_t *,
>       introducing a macro (or inline function if header dependencies
>       permit) might be nice. However, the only sensible identifier for it
>       would imo be vcpu_info(). Yet we already have a macro of that name.
>       With some trickery it might be possible to overload the macro
>       (making the "field" argument optional), but this may not be
>       desirable for other reasons (it could e.g. be deemed confusing).

You might be able to reduce the number of casts by using local variables.

But it is not entirely clear why we can't use the existing vcpu_info() 
in many of the cases. For instance...


> 
> --- a/xen/arch/x86/include/asm/shared.h
> +++ b/xen/arch/x86/include/asm/shared.h
> @@ -27,16 +27,16 @@ static inline void arch_set_##field(stru
>   static inline type arch_get_##field(const struct vcpu *v)       \
>   {                                                               \
>       return !has_32bit_shinfo(v->domain) ?                       \
> -           v->vcpu_info->native.arch.field :                    \
> -           v->vcpu_info->compat.arch.field;                     \
> +           ((const vcpu_info_t *)v->vcpu_info_area.map)->native.arch.field : \
> +           ((const vcpu_info_t *)v->vcpu_info_area.map)->compat.arch.field;  \

... here.

>   }                                                               \
>   static inline void arch_set_##field(struct vcpu *v,             \
>                                       type val)                   \
>   {                                                               \
>       if ( !has_32bit_shinfo(v->domain) )                         \
> -        v->vcpu_info->native.arch.field = val;                  \
> +        ((vcpu_info_t *)v->vcpu_info_area.map)->native.arch.field = val; \
>       else                                                        \
> -        v->vcpu_info->compat.arch.field = val;                  \
> +        ((vcpu_info_t *)v->vcpu_info_area.map)->compat.arch.field = val; \

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 04/10] domain: update GADDR based runstate guest area
  2022-12-16 12:26   ` Julien Grall
@ 2022-12-19 12:48     ` Jan Beulich
  2022-12-20  8:40       ` Julien Grall
  0 siblings, 1 reply; 47+ messages in thread
From: Jan Beulich @ 2022-12-19 12:48 UTC (permalink / raw)
  To: Julien Grall
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

On 16.12.2022 13:26, Julien Grall wrote:
> On 19/10/2022 08:41, Jan Beulich wrote:
>> Before adding a new vCPU operation to register the runstate area by
>> guest-physical address, add code to actually keep such areas up-to-date.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> RFC: Pages aren't marked dirty when written to (matching the handling of
>>       space mapped by map_vcpu_info() afaict), on the basis that the
>>       registrations are lost anyway across migration. 
> 
> So I agree for the existing migration. But I wonder whether we would 
> need to dirty those pages if we live-migrated a guest without its help 
> (IOW the registrations would still be present).

Even then I'd expect the area to be re-populated at the target, so the
page contents would need moving over (perhaps multiple times) only if
any other parts of such a page were written to.

> Anyway, nothing to worry about yet as this is not supported upstream.

I assume you've taken note of this for the transparent migration work.
One question after all is how you'd make handling of the area at the
new target transparent, i.e. without any anomalies in the values the
guest gets to see. It may very well be that every such area needs
special treatment in the course of migrating, such that the most up-
to-date values are reported as part of the migration stream, but
separate from all the pages' contents.

>> Plus the contents
>>       of the areas in question have to be deemed volatile in the first
>>       place (so saving a "most recent" value is pretty meaningless even
>>       for e.g. snapshotting).
>>
>> RFC: Can we perhaps avoid the VM-assist conditionals, assuming the more
>>       modern behavior to apply uniformly for gaddr-based registrations?
> 
> It is not clear why someone would want to use the old behavior with the 
> new gaddr-based registrations. So I would say yes.

Okay, will do.

>> RFC: HVM guests (on x86) can change bitness and hence layout (and size!
>>       and alignment) of the runstate area. I don't think it is an option
>>       to require 32-bit code to pass a range such that even the 64-bit
>>       layout wouldn't cross a page boundary (and be suitably aligned). I
>>       also don't see any other good solution, so for now a crude approach
>>       with an extra boolean is used (using has_32bit_shinfo() isn't race
>>       free and could hence lead to overrunning the mapped space).
> 
> I think the extra check for 32-bit code to pass the check for 64-bit 
> layout would be better.

I'm afraid I can't derive from your reply what it is you actually want.

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 10/10] common: convert vCPU info area registration
  2022-12-17 15:53   ` Julien Grall
@ 2022-12-19 15:01     ` Jan Beulich
  2022-12-20  8:52       ` Julien Grall
  0 siblings, 1 reply; 47+ messages in thread
From: Jan Beulich @ 2022-12-19 15:01 UTC (permalink / raw)
  To: Julien Grall
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	Tamas K Lengyel, xen-devel

On 17.12.2022 16:53, Julien Grall wrote:
> On 19/10/2022 08:45, Jan Beulich wrote:
>> Switch to using map_guest_area(). Noteworthy differences from
>> map_vcpu_info():
>> - remote vCPU-s are paused rather than checked for being down (which in
>>    principle can change right after the check),
>> - the domain lock is taken for a much smaller region,
>> - areas could now be registered more than once, if we want this,
>> - as a corner case registering the area at GFN 0 offset 0 is no longer
>>    possible (this is considered an invalid de-registration request).
>>
>> Note that this eliminates a bug in copy_vcpu_settings(): The function
>> did allocate a new page regardless of the GFN already having a mapping,
>> thus in particular breaking the case of two vCPU-s having their info
>> areas on the same page.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> RFC: While the "no re-registration" check is retained, it is now racy.
>>       If we really think it needs retaining _and_ properly enforcing,
>>       then locking will be needed, but I don't think we can hold the
>>       domain lock around a call to map_guest_area(), first and foremost
>>       because of its use of vcpu_pause().
> 
> A function like evtchn_2l_unmask() may access a vcpu_info that is not the 
> current one.
> 
> So I believe the check needs to be retained and properly enforced. 
> Otherwise, the old structure may still be used for a short time even if 
> it has been freed.

Good point, I'll try to come up with something suitable.

>> RFC: Is the GFN 0 offset 0 behavioral change deemed acceptable?
> 
> I would say no (see the previous discussion for more details).

Of course - with 0 going to be replaced in map_guest_area(), the check
there needs to be adjusted accordingly. And ~0 was already invalid to
pass in here, due to the alignment check.

>> RFC: To have just a single instance of casts to vcpu_info_t *,
>>       introducing a macro (or inline function if header dependencies
>>       permit) might be nice. However, the only sensible identifier for it
>>       would imo be vcpu_info(). Yet we already have a macro of that name.
>>       With some trickery it might be possible to overload the macro
>>       (making the "field" argument optional), but this may not be
>>       desirable for other reasons (it could e.g. be deemed confusing).
> 
> You might be able to reduce the number of casts by using local variables.
> 
> But it is not entirely clear why we can't use the existing vcpu_info() 
> in many of the cases. For instance...
> 
> 
>>
>> --- a/xen/arch/x86/include/asm/shared.h
>> +++ b/xen/arch/x86/include/asm/shared.h
>> @@ -27,16 +27,16 @@ static inline void arch_set_##field(stru
>>   static inline type arch_get_##field(const struct vcpu *v)       \
>>   {                                                               \
>>       return !has_32bit_shinfo(v->domain) ?                       \
>> -           v->vcpu_info->native.arch.field :                    \
>> -           v->vcpu_info->compat.arch.field;                     \
>> +           ((const vcpu_info_t *)v->vcpu_info_area.map)->native.arch.field : \
>> +           ((const vcpu_info_t *)v->vcpu_info_area.map)->compat.arch.field;  \
> 
> ... here.

vcpu_info() has a property which gets in the way of using the macro
here. Since __vcpu_info() wants to return something which can also
be used as lvalue, the 2nd and 3rd operands of the conditional
operator need to resolve to the same pointer type. Hence the smaller
(compat) type is the only one which is safe to use. See the comment
next to __shared_info()'s definition (which is the one __vcpu_info()'s
comment refers to).
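
To make that constraint concrete, here is a standalone sketch (the types
and the macro below are made up for illustration; they are not the actual
Xen definitions):

    #include <stdint.h>

    struct compat_layout { uint32_t field; };
    struct native_layout { uint64_t field; };
    union info { struct native_layout native; struct compat_layout compat; };

    /*
     * Cast the native-layout pointer to the compat field's pointer type so
     * that both arms of ?: have a single type; the resulting lvalue is then
     * only as wide as the compat field, i.e. the "smaller type" constraint
     * mentioned above.
     */
    #define FIELD_LVAL(u, is_native)                        \
        (*((is_native) ? (uint32_t *)&(u)->native.field     \
                       : &(u)->compat.field))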

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 04/10] domain: update GADDR based runstate guest area
  2022-12-19 12:48     ` Jan Beulich
@ 2022-12-20  8:40       ` Julien Grall
  2022-12-20  8:45         ` Jan Beulich
  0 siblings, 1 reply; 47+ messages in thread
From: Julien Grall @ 2022-12-20  8:40 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

Hi,

On 19/12/2022 12:48, Jan Beulich wrote:
> On 16.12.2022 13:26, Julien Grall wrote:
>> On 19/10/2022 08:41, Jan Beulich wrote:
>>> Before adding a new vCPU operation to register the runstate area by
>>> guest-physical address, add code to actually keep such areas up-to-date.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> RFC: Pages aren't marked dirty when written to (matching the handling of
>>>        space mapped by map_vcpu_info() afaict), on the basis that the
>>>        registrations are lost anyway across migration.
>>
>> So I agree for the existing migration. But I wonder whether we would
>> need to dirty those pages if we live-migrated a guest without its help
>> (IOW the registrations would still be present).
> 
> Even then I'd expect the area to be re-populated at the target, so the
> page contents would need moving over (perhaps multiple times) only if
> any other parts of such a page were written to.
You are right. I had another look at how this was implemented and we indeed 
create a record for each vcpu_info/runstate to transfer the content.

> 
>> Anyway, nothing to worry about yet as this is not supported upstream.
> 
> I assume you've taken note of this for the transparent migration work.

Yes.

> One question after all is how you'd make handling of the area at the
> new target transparent, i.e. without any anomalies in the values the
> guest gets to see. It may very well be that every such area needs
> special treatment in the course of migrating, such that the most up-
> to-date values are reported as part of the migration stream, but
> separate from all the pages' contents.

AFAIK, vcpu_info doesn't need special treatment, but the runstate does, 
because it contains a field 'state_entry_time' that is a system time in 
nanoseconds.

This could be solved by never exposing the system time to the guest and 
instead exposing a value relative to when it starts.
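
Roughly along these lines (a sketch of the idea only; 'domain_epoch_ns' is
a made-up parameter, not an existing field):

    /*
     * Sketch: report runstate times relative to a per-domain epoch rather
     * than as absolute Xen system time, so the values stay meaningful after
     * a transparent migration.
     */
    static uint64_t relative_entry_time(uint64_t abs_ns, uint64_t domain_epoch_ns)
    {
        return abs_ns - domain_epoch_ns;
    }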

> 
>>> Plus the contents
>>>        of the areas in question have to be deemed volatile in the first
>>>        place (so saving a "most recent" value is pretty meaningless even
>>>        for e.g. snapshotting).
>>>
>>> RFC: Can we perhaps avoid the VM-assist conditionals, assuming the more
>>>        modern behavior to apply uniformly for gaddr-based registrations?
>>
>> It is not clear why someone would want to use the old behavior with the
>> new gaddr-based registrations. So I would say yes.
> 
> Okay, will do.
> 
>>> RFC: HVM guests (on x86) can change bitness and hence layout (and size!
>>>        and alignment) of the runstate area. I don't think it is an option
>>>        to require 32-bit code to pass a range such that even the 64-bit
>>>        layout wouldn't cross a page boundary (and be suitably aligned). I
>>>        also don't see any other good solution, so for now a crude approach
>>>        with an extra boolean is used (using has_32bit_shinfo() isn't race
>>>        free and could hence lead to overrunning the mapped space).
>>
>> I think the extra check for 32-bit code to pass the check for 64-bit
>> layout would be better.
> 
> I'm afraid I can't derive from your reply what it is you actually want.

I think for a 32-bit call, we also want to check that the address provided 
will also pass the 64-bit check (i.e. if used with the 64-bit layout, the 
area would not cross a page boundary and would be suitably aligned).
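
For illustration, such a check might look roughly like this (a sketch only,
not the actual patch; 8-byte alignment is assumed as "suitably aligned"):

    /*
     * Sketch (not the actual patch): even for a 32-bit caller, require that
     * the 64-bit layout of the runstate area is suitably aligned and does
     * not cross a page boundary, so a later bitness switch cannot overrun
     * the mapped space.
     */
    static bool runstate_range_valid(paddr_t gaddr)
    {
        const unsigned int size = sizeof(struct vcpu_runstate_info); /* 64-bit layout */

        return !(gaddr & 7) && ((gaddr & ~PAGE_MASK) + size <= PAGE_SIZE);
    }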

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 04/10] domain: update GADDR based runstate guest area
  2022-12-20  8:40       ` Julien Grall
@ 2022-12-20  8:45         ` Jan Beulich
  2022-12-20  8:48           ` Julien Grall
  0 siblings, 1 reply; 47+ messages in thread
From: Jan Beulich @ 2022-12-20  8:45 UTC (permalink / raw)
  To: Julien Grall
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

On 20.12.2022 09:40, Julien Grall wrote:
> On 19/12/2022 12:48, Jan Beulich wrote:
>> On 16.12.2022 13:26, Julien Grall wrote:
>>> On 19/10/2022 08:41, Jan Beulich wrote:
>>>> RFC: HVM guests (on x86) can change bitness and hence layout (and size!
>>>>        and alignment) of the runstate area. I don't think it is an option
>>>>        to require 32-bit code to pass a range such that even the 64-bit
>>>>        layout wouldn't cross a page boundary (and be suitably aligned). I
>>>>        also don't see any other good solution, so for now a crude approach
>>>>        with an extra boolean is used (using has_32bit_shinfo() isn't race
>>>>        free and could hence lead to overrunning the mapped space).
>>>
>>> I think the extra check for 32-bit code to pass the check for 64-bit
>>> layout would be better.
>>
>> I'm afraid I can't derive from your reply what it is you actually want.
> 
> I think for 32-bit call, we also want to check the address provide will 
> also pass the 64-bit check (i.e. if used as a 64-bit layout, the area 
> would not cross a page boundary and be suitably aligned).

But that's specifically what I say I don't think is an option. First and
foremost because of the implication on 32-bit callers: They'd need to
use magic to get hold of the size of the 64-bit variant of the struct.

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 04/10] domain: update GADDR based runstate guest area
  2022-12-20  8:45         ` Jan Beulich
@ 2022-12-20  8:48           ` Julien Grall
  2022-12-20  9:59             ` Jan Beulich
  0 siblings, 1 reply; 47+ messages in thread
From: Julien Grall @ 2022-12-20  8:48 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

Hi Jan,

On 20/12/2022 08:45, Jan Beulich wrote:
> On 20.12.2022 09:40, Julien Grall wrote:
>> On 19/12/2022 12:48, Jan Beulich wrote:
>>> On 16.12.2022 13:26, Julien Grall wrote:
>>>> On 19/10/2022 08:41, Jan Beulich wrote:
>>>>> RFC: HVM guests (on x86) can change bitness and hence layout (and size!
>>>>>         and alignment) of the runstate area. I don't think it is an option
>>>>>         to require 32-bit code to pass a range such that even the 64-bit
>>>>>         layout wouldn't cross a page boundary (and be suitably aligned). I
>>>>>         also don't see any other good solution, so for now a crude approach
>>>>>         with an extra boolean is used (using has_32bit_shinfo() isn't race
>>>>>         free and could hence lead to overrunning the mapped space).
>>>>
>>>> I think the extra check for 32-bit code to pass the check for 64-bit
>>>> layout would be better.
>>>
>>> I'm afraid I can't derive from your reply what it is you actually want.
>>
>> I think for 32-bit call, we also want to check the address provide will
>> also pass the 64-bit check (i.e. if used as a 64-bit layout, the area
>> would not cross a page boundary and be suitably aligned).
> 
> But that's specifically what I say I don't think is an option. First and
> foremost because of the implication on 32-bit callers: They're need to
> use magic to get hold of the size of the 64-bit variant of the struct.

I understand that. But I am not aware of any other (simple) approach 
where you could have race-free code.

So between non-race-free code and exposing the restriction to the 
guest, I would choose the latter.

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 10/10] common: convert vCPU info area registration
  2022-12-19 15:01     ` Jan Beulich
@ 2022-12-20  8:52       ` Julien Grall
  0 siblings, 0 replies; 47+ messages in thread
From: Julien Grall @ 2022-12-20  8:52 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	Tamas K Lengyel, xen-devel

Hi Jan,

On 19/12/2022 15:01, Jan Beulich wrote:
> On 17.12.2022 16:53, Julien Grall wrote:
>> On 19/10/2022 08:45, Jan Beulich wrote:
>>> Switch to using map_guest_area(). Noteworthy differences from
>>> map_vcpu_info():
>>> - remote vCPU-s are paused rather than checked for being down (which in
>>>     principle can change right after the check),
>>> - the domain lock is taken for a much smaller region,
>>> - areas could now be registered more than once, if we want this,
>>> - as a corner case registering the area at GFN 0 offset 0 is no longer
>>>     possible (this is considered an invalid de-registration request).
>>>
>>> Note that this eliminates a bug in copy_vcpu_settings(): The function
>>> did allocate a new page regardless of the GFN already having a mapping,
>>> thus in particular breaking the case of two vCPU-s having their info
>>> areas on the same page.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> RFC: While the "no re-registration" check is retained, it is now racy.
>>>        If we really think it needs retaining _and_ properly enforcing,
>>>        then locking will be needed, but I don't think we can hold the
>>>        domain lock around a call to map_guest_area(), first and foremost
>>>        because of its use of vcpu_pause().
>>
>> function like evtchn_2l_unmask() may access vcpu_info that is not the
>> current one.
>>
>> So I believe the check needs to be retained and properly enforced.
>> Otherwise, the old structure may still be used for a short time even if
>> it has been freed.
> 
> Good point, I'll try to come up with something suitable.
> 
>>> RFC: Is the GFN 0 offset 0 behavioral change deemed acceptable?
>>
>> I would say no (see the previous discussion for more details).
> 
> Of course - with 0 going to be replaced in map_guest_area(), the check
> there needs to be adjusted accordingly. And ~0 was already invalid to
> pass in here, due to the alignment check.
> 
>>> RFC: To have just a single instance of casts to vcpu_info_t *,
>>>        introducing a macro (or inline function if header dependencies
>>>        permit) might be nice. However, the only sensible identifier for it
>>>        would imo be vcpu_info(). Yet we already have a macro of that name.
>>>        With some trickery it might be possible to overload the macro
>>>        (making the "field" argument optional), but this may not be
>>>        desirable for other reasons (it could e.g. be deemed confusing).
>>
>> You might be able to reduce the number of cast by using local variables.
>>
>> But it is not entirely clear why we can't use the existing vcpu_info()
>> in many of the cases. For instance...
>>
>>
>>>
>>> --- a/xen/arch/x86/include/asm/shared.h
>>> +++ b/xen/arch/x86/include/asm/shared.h
>>> @@ -27,16 +27,16 @@ static inline void arch_set_##field(stru
>>>    static inline type arch_get_##field(const struct vcpu *v)       \
>>>    {                                                               \
>>>        return !has_32bit_shinfo(v->domain) ?                       \
>>> -           v->vcpu_info->native.arch.field :                    \
>>> -           v->vcpu_info->compat.arch.field;                     \
>>> +           ((const vcpu_info_t *)v->vcpu_info_area.map)->native.arch.field : \
>>> +           ((const vcpu_info_t *)v->vcpu_info_area.map)->compat.arch.field;  \
>>
>> ... here.
> 
> vcpu_info() has a property which gets in the way of using the macro
> here. Since __vcpu_info() wants to return something which can also
> be used as lvalue, the 2nd and 3rd operands of the conditional
> operator need to resolve to the same pointer type. Hence the smaller
> (compat) type is the only one which is safe to use. See the comment
> next to __shared_info()'s definition (which is the one __vcpu_info()'s
> refers to).

Thanks for the pointer! I hadn't noticed the subtlety. Then, the 
open-coded casts (even if I dislike them) are probably the way to go for now.

As I mentioned above, it would be nice if they could be reduced by using 
local variables (this will also reduce the length of the lines).

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 04/10] domain: update GADDR based runstate guest area
  2022-12-20  8:48           ` Julien Grall
@ 2022-12-20  9:59             ` Jan Beulich
  2022-12-20 10:01               ` Julien Grall
  0 siblings, 1 reply; 47+ messages in thread
From: Jan Beulich @ 2022-12-20  9:59 UTC (permalink / raw)
  To: Julien Grall
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel

On 20.12.2022 09:48, Julien Grall wrote:
> Hi Jan,
> 
> On 20/12/2022 08:45, Jan Beulich wrote:
>> On 20.12.2022 09:40, Julien Grall wrote:
>>> On 19/12/2022 12:48, Jan Beulich wrote:
>>>> On 16.12.2022 13:26, Julien Grall wrote:
>>>>> On 19/10/2022 08:41, Jan Beulich wrote:
>>>>>> RFC: HVM guests (on x86) can change bitness and hence layout (and size!
>>>>>>         and alignment) of the runstate area. I don't think it is an option
>>>>>>         to require 32-bit code to pass a range such that even the 64-bit
>>>>>>         layout wouldn't cross a page boundary (and be suitably aligned). I
>>>>>>         also don't see any other good solution, so for now a crude approach
>>>>>>         with an extra boolean is used (using has_32bit_shinfo() isn't race
>>>>>>         free and could hence lead to overrunning the mapped space).
>>>>>
>>>>> I think the extra check for 32-bit code to pass the check for 64-bit
>>>>> layout would be better.
>>>>
>>>> I'm afraid I can't derive from your reply what it is you actually want.
>>>
>>> I think for 32-bit call, we also want to check the address provide will
>>> also pass the 64-bit check (i.e. if used as a 64-bit layout, the area
>>> would not cross a page boundary and be suitably aligned).
>>
>> But that's specifically what I say I don't think is an option. First and
>> foremost because of the implication on 32-bit callers: They're need to
>> use magic to get hold of the size of the 64-bit variant of the struct.
> 
> I understand that. But I am not aware of any other (simple) approach 
> where you could have race free code.

My reference to using has_32bit_shinfo() not being race free was to avoid
suggestions in that direction. There's no use of this function in the
patch here, nor in the one where the new boolean field is actually written
to. The solution as presented is, afaict, both simple and race free. It
merely isn't very nice.
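
In sketch form (illustrative only; the structure and field names below are
made up, not the ones used in the series):

    /*
     * Sketch: latch the guest's layout once, when the area is registered,
     * instead of consulting has_32bit_shinfo() on every update, so a
     * concurrent bitness change cannot lead to overrunning the mapping.
     */
    struct runstate_mapping {
        void *map;
        bool  compat;   /* layout recorded at registration time */
    };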

Jan

> So between a non-race free code and exposing the restriction to the 
> guest, I would chose the latter.
> 
> Cheers,
> 



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 04/10] domain: update GADDR based runstate guest area
  2022-12-20  9:59             ` Jan Beulich
@ 2022-12-20 10:01               ` Julien Grall
  0 siblings, 0 replies; 47+ messages in thread
From: Julien Grall @ 2022-12-20 10:01 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, George Dunlap, Stefano Stabellini, Wei Liu,
	Roger Pau Monné,
	xen-devel



On 20/12/2022 09:59, Jan Beulich wrote:
> On 20.12.2022 09:48, Julien Grall wrote:
>> Hi Jan,
>>
>> On 20/12/2022 08:45, Jan Beulich wrote:
>>> On 20.12.2022 09:40, Julien Grall wrote:
>>>> On 19/12/2022 12:48, Jan Beulich wrote:
>>>>> On 16.12.2022 13:26, Julien Grall wrote:
>>>>>> On 19/10/2022 08:41, Jan Beulich wrote:
>>>>>>> RFC: HVM guests (on x86) can change bitness and hence layout (and size!
>>>>>>>          and alignment) of the runstate area. I don't think it is an option
>>>>>>>          to require 32-bit code to pass a range such that even the 64-bit
>>>>>>>          layout wouldn't cross a page boundary (and be suitably aligned). I
>>>>>>>          also don't see any other good solution, so for now a crude approach
>>>>>>>          with an extra boolean is used (using has_32bit_shinfo() isn't race
>>>>>>>          free and could hence lead to overrunning the mapped space).
>>>>>>
>>>>>> I think the extra check for 32-bit code to pass the check for 64-bit
>>>>>> layout would be better.
>>>>>
>>>>> I'm afraid I can't derive from your reply what it is you actually want.
>>>>
>>>> I think for 32-bit call, we also want to check the address provide will
>>>> also pass the 64-bit check (i.e. if used as a 64-bit layout, the area
>>>> would not cross a page boundary and be suitably aligned).
>>>
>>> But that's specifically what I say I don't think is an option. First and
>>> foremost because of the implication on 32-bit callers: They're need to
>>> use magic to get hold of the size of the 64-bit variant of the struct.
>>
>> I understand that. But I am not aware of any other (simple) approach
>> where you could have race free code.
> 
> My reference to using has_32bit_shinfo() not being race free was to avoid
> suggestions in that direction. There's no use of this function in the
> patch here, nor in the one where the new boolean field is actually written
> to. The solution as presented is, afaict, both simple and race free. It
> merely isn't very nice.

Ah! I misunderstood what you original wrote. Thanks for the clarification.

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 02/10] x86: split populating of struct vcpu_time_info into a separate function
  2022-10-19  7:39 ` [PATCH 02/10] x86: split populating of struct vcpu_time_info into a separate function Jan Beulich
@ 2023-01-17 20:19   ` Andrew Cooper
  2023-01-18  7:32     ` Jan Beulich
  0 siblings, 1 reply; 47+ messages in thread
From: Andrew Cooper @ 2023-01-17 20:19 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Wei Liu, Roger Pau Monne

On 19/10/2022 8:39 am, Jan Beulich wrote:
> This is to facilitate subsequent re-use of this code.
>
> While doing so add const in a number of places, extending to
> gtime_to_gtsc() and then for symmetry also its inverse function.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper@citrix.com>

> ---
> I was on the edge of also folding the various is_hvm_domain() into a
> function scope boolean, but then wasn't really certain that this
> wouldn't open up undue speculation opportunities.

I can't see anything interesting under here speculation wise. 
Commentary inline.

>
> --- a/xen/arch/x86/include/asm/time.h
> +++ b/xen/arch/x86/include/asm/time.h
> @@ -52,8 +52,8 @@ uint64_t cf_check acpi_pm_tick_to_ns(uin
>  uint64_t tsc_ticks2ns(uint64_t ticks);
>  
>  uint64_t pv_soft_rdtsc(const struct vcpu *v, const struct cpu_user_regs *regs);
> -u64 gtime_to_gtsc(struct domain *d, u64 time);
> -u64 gtsc_to_gtime(struct domain *d, u64 tsc);
> +uint64_t gtime_to_gtsc(const struct domain *d, uint64_t time);
> +uint64_t gtsc_to_gtime(const struct domain *d, uint64_t tsc);
>  
>  int tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
>                   uint32_t gtsc_khz, uint32_t incarnation);
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -1373,18 +1373,14 @@ uint64_t tsc_ticks2ns(uint64_t ticks)
>      return scale_delta(ticks, &t->tsc_scale);
>  }
>  
> -static void __update_vcpu_system_time(struct vcpu *v, int force)
> +static void collect_time_info(const struct vcpu *v,
> +                              struct vcpu_time_info *u)
>  {
> -    const struct cpu_time *t;
> -    struct vcpu_time_info *u, _u = {};
> -    struct domain *d = v->domain;
> +    const struct cpu_time *t = &this_cpu(cpu_time);
> +    const struct domain *d = v->domain;
>      s_time_t tsc_stamp;
>  
> -    if ( v->vcpu_info == NULL )
> -        return;
> -
> -    t = &this_cpu(cpu_time);
> -    u = &vcpu_info(v, time);
> +    memset(u, 0, sizeof(*u));
>  
>      if ( d->arch.vtsc )
>      {
> @@ -1392,7 +1388,7 @@ static void __update_vcpu_system_time(st
>  
>          if ( is_hvm_domain(d) )
>          {
> -            struct pl_time *pl = v->domain->arch.hvm.pl_time;
> +            const struct pl_time *pl = d->arch.hvm.pl_time;

A PV guest could in principle use...

>  
>              stime += pl->stime_offset + v->arch.hvm.stime_offset;

... this pl->stime_offset as the second dereference of whatever happens
to sit under d->arch.hvm.pl_time in the pv union.

In the current build of Xen I have to hand, that's
d->arch.pv.mapcache.{epoch,tlbflush_timestamp}, the combination of which
doesn't seem like it can be steered into being a legal pointer into Xen.

>              if ( stime >= 0 )
> @@ -1403,27 +1399,27 @@ static void __update_vcpu_system_time(st
>          else
>              tsc_stamp = gtime_to_gtsc(d, stime);
>  
> -        _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
> -        _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
> +        u->tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
> +        u->tsc_shift         = d->arch.vtsc_to_ns.shift;
>      }
>      else
>      {
>          if ( is_hvm_domain(d) && hvm_tsc_scaling_supported )

On the other hand, this isn't safe.  There's no protection of the &&
calculation, but...

>          {
>              tsc_stamp            = hvm_scale_tsc(d, t->stamp.local_tsc);

... this path is the only path subject to speculative type confusion,
and all it does is read d->arch.hvm.tsc_scaling_ratio, so is
appropriately protected in this instance.

Also, all an attacker could do is encode the scaling ratio alongside
t->stamp.local_tsc (unpredictable) in the calculation for the duration
of the speculative window, with no way I can see to dereference the result.


> -            _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
> -            _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
> +            u->tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
> +            u->tsc_shift         = d->arch.vtsc_to_ns.shift;
>          }
>          else
>          {
>              tsc_stamp            = t->stamp.local_tsc;
> -            _u.tsc_to_system_mul = t->tsc_scale.mul_frac;
> -            _u.tsc_shift         = t->tsc_scale.shift;
> +            u->tsc_to_system_mul = t->tsc_scale.mul_frac;
> +            u->tsc_shift         = t->tsc_scale.shift;
>          }
>      }
>  
> -    _u.tsc_timestamp = tsc_stamp;
> -    _u.system_time   = t->stamp.local_stime;
> +    u->tsc_timestamp = tsc_stamp;
> +    u->system_time   = t->stamp.local_stime;
>  
>      /*
>       * It's expected that domains cope with this bit changing on every
> @@ -1431,10 +1427,21 @@ static void __update_vcpu_system_time(st
>       * or if it further requires monotonicity checks with other vcpus.
>       */
>      if ( clocksource_is_tsc() )
> -        _u.flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
> +        u->flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
>  
>      if ( is_hvm_domain(d) )
> -        _u.tsc_timestamp += v->arch.hvm.cache_tsc_offset;
> +        u->tsc_timestamp += v->arch.hvm.cache_tsc_offset;

This path is subject to type confusion on v->arch.{pv,hvm}, with a PV
guest able to encode the value of v->arch.pv.ctrlreg[5] into the
timestamp.  But again, no way to dereference the result.


I really don't think there's enough flexibility here for even a
perfectly-timed attacker to abuse.

~Andrew

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 05/10] x86: update GADDR based secondary time area
  2022-10-19  7:41 ` [PATCH RFC 05/10] x86: update GADDR based secondary time area Jan Beulich
@ 2023-01-17 20:31   ` Andrew Cooper
  2023-01-18  9:25     ` Jan Beulich
  0 siblings, 1 reply; 47+ messages in thread
From: Andrew Cooper @ 2023-01-17 20:31 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Wei Liu, Roger Pau Monne

On 19/10/2022 8:41 am, Jan Beulich wrote:
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -1462,12 +1462,34 @@ static void __update_vcpu_system_time(st
>          v->arch.pv.pending_system_time = _u;
>  }
>  
> +static void write_time_guest_area(struct vcpu_time_info *map,
> +                                  const struct vcpu_time_info *src)
> +{
> +    /* 1. Update userspace version. */
> +    write_atomic(&map->version, src->version);

version_update_begin()

~Andrew

> +    smp_wmb();
> +
> +    /* 2. Update all other userspace fields. */
> +    *map = *src;
> +
> +    /* 3. Update userspace version again. */
> +    smp_wmb();
> +    write_atomic(&map->version, version_update_end(src->version));
> +}
> +
>  bool update_secondary_system_time(struct vcpu *v,
>                                    struct vcpu_time_info *u)
>  {
>      XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v->arch.time_info_guest;
> +    struct vcpu_time_info *map = v->arch.time_guest_area.map;
>      struct guest_memory_policy policy = { .nested_guest_mode = false };
>  
> +    if ( map )
> +    {
> +        write_time_guest_area(map, u);
> +        return true;
> +    }
> +
>      if ( guest_handle_is_null(user_u) )
>          return true;
>  
>


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 03/10] domain: GADDR based shared guest area registration alternative - teardown
  2022-10-19  7:40 ` [PATCH RFC 03/10] domain: GADDR based shared guest area registration alternative - teardown Jan Beulich
  2022-12-13 21:44   ` Julien Grall
@ 2023-01-17 21:17   ` Andrew Cooper
  2023-01-18  8:59     ` Jan Beulich
  1 sibling, 1 reply; 47+ messages in thread
From: Andrew Cooper @ 2023-01-17 21:17 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: George Dunlap, Julien Grall, Stefano Stabellini, Wei Liu,
	Roger Pau Monne

On 19/10/2022 8:40 am, Jan Beulich wrote:
> In preparation of the introduction of new vCPU operations allowing to
> register the respective areas (one of the two is x86-specific) by
> guest-physical address, add the necessary domain cleanup hooks.

What are the two areas you're discussing here?

I assume you mean VCPUOP_register_vcpu_time_memory_area to be the
x86-specific op, but ARM permits both  VCPUOP_register_vcpu_info and
VCPUOP_register_runstate_memory_area.

So isn't it more 2+1 rather than 1+1?

>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> RFC: Zapping the areas in pv_shim_shutdown() may not be strictly
>      necessary: Aiui unmap_vcpu_info() is called only because the vCPU
>      info area cannot be re-registered. Beyond that I guess the
>      assumption is that the areas would only be re-registered as they
>      were before. If that's not the case I wonder whether the guest
>      handles for both areas shouldn't also be zapped.

At this point in pv_shim_shutdown(), we have already come out of suspend
(successfully) and are preparing to poke the PV guest out of suspend too.

The PV guest needs to not have its subsequent VCPUOP_register_vcpu_info
call fail, but beyond that I can entirely believe that it was coded to
"Linux doesn't crash" rather than "what's everything we ought to reset
here" - recall that we weren't exactly flush with time when trying to
push Shim out of the door.

Whatever does get reregistered will be reregistered at the same
position.  No guest at all is going to have details like that be dynamic
across migrate.


As a tangential observation, I see the periodic timer gets rearmed. 
This is still one of the more insane default properties of a PV guest;
Linux intentionally clobbers it on boot, but I can equivalent logic to
re-clobber after resume.

~Andrew

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2022-10-19  7:43 ` [PATCH RFC 07/10] domain: map/unmap " Jan Beulich
  2022-11-24 21:29   ` Julien Grall
@ 2023-01-17 22:04   ` Andrew Cooper
  2023-01-18  9:55     ` Jan Beulich
  1 sibling, 1 reply; 47+ messages in thread
From: Andrew Cooper @ 2023-01-17 22:04 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: George Dunlap, Julien Grall, Stefano Stabellini, Wei Liu,
	Roger Pau Monne

On 19/10/2022 8:43 am, Jan Beulich wrote:
> The registration by virtual/linear address has downsides: At least on
> x86 the access is expensive for HVM/PVH domains. Furthermore for 64-bit
> PV domains the areas are inaccessible (and hence cannot be updated by
> Xen) when in guest-user mode.

They're also inaccessible in HVM guests (x86 and ARM) when Meltdown
mitigations are in place.

And let's not get started on the multitude of layering violations that is
guest_memory_policy() for nested virt.  In fact, prohibiting any form of
map-by-va is a prerequisite to any rational attempt to make nested virt work.

(In fact, that infrastructure needs purging before any other
architecture picks up stubs too.)

They're also inaccessible in general because no architecture has
hypervisor privilege in a normal user/supervisor split, and you don't
know whether the mapping is over supervisor or user mapping, and
settings like SMAP/PAN can cause the pagewalk to fail even when the
mapping is in place.


There are a lot of good reasons why map-by-va should never have happened.

Even for PV guests, map-by-gfn (and let the guest manage whatever
virtual mappings it wants on its own) would have been closer to the
status quo for how real hardware worked, and critically would have
avoided the restriction that the areas had to live at a globally fixed
position to be useful.





>
> In preparation of the introduction of new vCPU operations allowing to
> register the respective areas (one of the two is x86-specific) by
> guest-physical address, flesh out the map/unmap functions.
>
> Noteworthy differences from map_vcpu_info():
> - areas can be registered more than once (and de-registered),

When registering by GFN is available, there is never a good reason to
register the same area twice.

The guest maps one MMIO-like region, and then constructs all the regular
virtual addresses mapping it (or not) that it wants.

This API is new, so we can enforce sane behaviour from the outset.  In
particular, it will help with ...

> - remote vCPU-s are paused rather than checked for being down (which in
>   principle can change right after the check),
> - the domain lock is taken for a much smaller region.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> RFC: By using global domain page mappings the demand on the underlying
>      VA range may increase significantly. I did consider to use per-
>      domain mappings instead, but they exist for x86 only. Of course we
>      could have arch_{,un}map_guest_area() aliasing global domain page
>      mapping functions on Arm and using per-domain mappings on x86. Yet
>      then again map_vcpu_info() doesn't do so either (albeit that's
>      likely to be converted subsequently to use map_vcpu_area() anyway).

... this by providing a bound on the amount of vmap() space that can be consumed.

>
> RFC: In map_guest_area() I'm not checking the P2M type, instead - just
>      like map_vcpu_info() - solely relying on the type ref acquisition.
>      Checking for p2m_ram_rw alone would be wrong, as at least
>      p2m_ram_logdirty ought to also be okay to use here (and in similar
>      cases, e.g. in Argo's find_ring_mfn()). p2m_is_pageable() could be
>      used here (like altp2m_vcpu_enable_ve() does) as well as in
>      map_vcpu_info(), yet then again the P2M type is stale by the time
>      it is being looked at anyway without the P2M lock held.

Again, another error caused by Xen not knowing the guest physical
address layout.  These mappings should be restricted to just RAM regions
and I think we want to enforce that right from the outset.
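
As a sketch of the kind of restriction meant here (illustrative only, not
the patch; the exact predicate is what the RFC remark above is debating,
and the names follow the x86 side):

    /*
     * Sketch: resolve the GFN and accept only RAM-typed memory, in the
     * spirit of altp2m_vcpu_enable_ve().  As noted above, the type can be
     * stale without the P2M lock held, so this would complement rather
     * than replace the type-count reference.
     */
    static int check_area_gfn(struct domain *d, unsigned long gfn)
    {
        p2m_type_t p2mt;
        struct page_info *page = get_page_from_gfn(d, gfn, &p2mt, P2M_ALLOC);
        int rc = 0;

        if ( !page )
            return -EINVAL;
        if ( !p2m_is_ram(p2mt) )
            rc = -EINVAL;
        put_page(page);

        return rc;
    }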

~Andrew

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2022-11-24 21:29   ` Julien Grall
  2022-11-28  9:01     ` Jan Beulich
@ 2023-01-17 22:20     ` Andrew Cooper
  2023-01-17 22:45       ` Julien Grall
  2023-01-18  9:59       ` Jan Beulich
  1 sibling, 2 replies; 47+ messages in thread
From: Andrew Cooper @ 2023-01-17 22:20 UTC (permalink / raw)
  To: Julien Grall, Jan Beulich, xen-devel
  Cc: George Dunlap, Stefano Stabellini, Wei Liu, Roger Pau Monne

On 24/11/2022 9:29 pm, Julien Grall wrote:
> Hi Jan,
>
> I am still digesting this series and replying with some initial comments.
>
> On 19/10/2022 09:43, Jan Beulich wrote:
>> The registration by virtual/linear address has downsides: At least on
>> x86 the access is expensive for HVM/PVH domains. Furthermore for 64-bit
>> PV domains the areas are inaccessible (and hence cannot be updated by
>> Xen) when in guest-user mode.
>>
>> In preparation of the introduction of new vCPU operations allowing to
>> register the respective areas (one of the two is x86-specific) by
>> guest-physical address, flesh out the map/unmap functions.
>>
>> Noteworthy differences from map_vcpu_info():
>> - areas can be registered more than once (and de-registered),
>> - remote vCPU-s are paused rather than checked for being down (which in
>>    principle can change right after the check),
>> - the domain lock is taken for a much smaller region.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> RFC: By using global domain page mappings the demand on the underlying
>>       VA range may increase significantly. I did consider to use per-
>>       domain mappings instead, but they exist for x86 only. Of course we
>>       could have arch_{,un}map_guest_area() aliasing global domain page
>>       mapping functions on Arm and using per-domain mappings on x86. Yet
>>       then again map_vcpu_info() doesn't do so either (albeit that's
>>       likely to be converted subsequently to use map_vcpu_area()
>> anyway).
>>
>> RFC: In map_guest_area() I'm not checking the P2M type, instead - just
>>       like map_vcpu_info() - solely relying on the type ref acquisition.
>>       Checking for p2m_ram_rw alone would be wrong, as at least
>>       p2m_ram_logdirty ought to also be okay to use here (and in similar
>>       cases, e.g. in Argo's find_ring_mfn()). p2m_is_pageable() could be
>>       used here (like altp2m_vcpu_enable_ve() does) as well as in
>>       map_vcpu_info(), yet then again the P2M type is stale by the time
>>       it is being looked at anyway without the P2M lock held.
>>
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
>>                      struct guest_area *area,
>>                      void (*populate)(void *dst, struct vcpu *v))
>>   {
>> -    return -EOPNOTSUPP;
>> +    struct domain *currd = v->domain;
>> +    void *map = NULL;
>> +    struct page_info *pg = NULL;
>> +    int rc = 0;
>> +
>> +    if ( gaddr )
>
> 0 is technically a valid (guest) physical address on Arm.

It is on x86 too, but that's not why 0 is generally considered an
invalid address.

See the multitude of XSAs, and near-XSAs which have been caused by bad
logic in Xen caused by trying to make a variable held in struct
vcpu/domain have a default value other than 0.

It's not impossible to write such code safely, and in this case I expect
it can be done by the NULLness (or not) of the mapping pointer, rather
than by stashing the gaddr, but history has proved repeatedly that this
is a very fertile source of security bugs.
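
In sketch form (not the patch itself):

    /*
     * Sketch: encode "this area is registered" via the mapping pointer
     * rather than via a stashed guest address, so guest address 0 needs
     * no special casing at all.
     */
    static bool area_registered(const struct guest_area *area)
    {
        return area->map != NULL;
    }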

~Andrew

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2023-01-17 22:20     ` Andrew Cooper
@ 2023-01-17 22:45       ` Julien Grall
  2023-01-18  9:59       ` Jan Beulich
  1 sibling, 0 replies; 47+ messages in thread
From: Julien Grall @ 2023-01-17 22:45 UTC (permalink / raw)
  To: Andrew Cooper, Jan Beulich, xen-devel
  Cc: George Dunlap, Stefano Stabellini, Wei Liu, Roger Pau Monne

Hi Andrew,

On 17/01/2023 22:20, Andrew Cooper wrote:
> On 24/11/2022 9:29 pm, Julien Grall wrote:
>> Hi Jan,
>>
>> I am still digesting this series and replying with some initial comments.
>>
>> On 19/10/2022 09:43, Jan Beulich wrote:
>>> The registration by virtual/linear address has downsides: At least on
>>> x86 the access is expensive for HVM/PVH domains. Furthermore for 64-bit
>>> PV domains the areas are inaccessible (and hence cannot be updated by
>>> Xen) when in guest-user mode.
>>>
>>> In preparation of the introduction of new vCPU operations allowing to
>>> register the respective areas (one of the two is x86-specific) by
>>> guest-physical address, flesh out the map/unmap functions.
>>>
>>> Noteworthy differences from map_vcpu_info():
>>> - areas can be registered more than once (and de-registered),
>>> - remote vCPU-s are paused rather than checked for being down (which in
>>>     principle can change right after the check),
>>> - the domain lock is taken for a much smaller region.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> RFC: By using global domain page mappings the demand on the underlying
>>>        VA range may increase significantly. I did consider to use per-
>>>        domain mappings instead, but they exist for x86 only. Of course we
>>>        could have arch_{,un}map_guest_area() aliasing global domain page
>>>        mapping functions on Arm and using per-domain mappings on x86. Yet
>>>        then again map_vcpu_info() doesn't do so either (albeit that's
>>>        likely to be converted subsequently to use map_vcpu_area()
>>> anyway).
>>>
>>> RFC: In map_guest_area() I'm not checking the P2M type, instead - just
>>>        like map_vcpu_info() - solely relying on the type ref acquisition.
>>>        Checking for p2m_ram_rw alone would be wrong, as at least
>>>        p2m_ram_logdirty ought to also be okay to use here (and in similar
>>>        cases, e.g. in Argo's find_ring_mfn()). p2m_is_pageable() could be
>>>        used here (like altp2m_vcpu_enable_ve() does) as well as in
>>>        map_vcpu_info(), yet then again the P2M type is stale by the time
>>>        it is being looked at anyway without the P2M lock held.
>>>
>>> --- a/xen/common/domain.c
>>> +++ b/xen/common/domain.c
>>> @@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
>>>                       struct guest_area *area,
>>>                       void (*populate)(void *dst, struct vcpu *v))
>>>    {
>>> -    return -EOPNOTSUPP;
>>> +    struct domain *currd = v->domain;
>>> +    void *map = NULL;
>>> +    struct page_info *pg = NULL;
>>> +    int rc = 0;
>>> +
>>> +    if ( gaddr )
>>
>> 0 is technically a valid (guest) physical address on Arm.
> 
> It is on x86 too, but that's not why 0 is generally considered an
> invalid address.
> 
> See the multitude of XSAs, and near-XSAs which have been caused by bad
> logic in Xen caused by trying to make a variable held in struct
> vcpu/domain have a default value other than 0.
> 
> It's not impossible to write such code safely, and in this case I expect
> it can be done by the NULLness (or not) of the mapping pointer, rather
> than by stashing the gaddr, but history has proved repeatedly that this
> is a very fertile source of security bugs.

I understand the security concern. However... you are now imposing some 
constraint on the guest OS which may be more difficult to address.

Furthermore, we are trying to make a sane ABI. It doesn't look sane to 
me to expose our current shortcomings to the guest OS. All the more so if 
we decide to relax it in the future: it would not help an OS, because they 
will still need to support older Xen...

Cheers,

-- 
Julien Grall


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 02/10] x86: split populating of struct vcpu_time_info into a separate function
  2023-01-17 20:19   ` Andrew Cooper
@ 2023-01-18  7:32     ` Jan Beulich
  0 siblings, 0 replies; 47+ messages in thread
From: Jan Beulich @ 2023-01-18  7:32 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, Wei Liu, Roger Pau Monne, xen-devel

On 17.01.2023 21:19, Andrew Cooper wrote:
> On 19/10/2022 8:39 am, Jan Beulich wrote:
>> This is to facilitate subsequent re-use of this code.
>>
>> While doing so add const in a number of places, extending to
>> gtime_to_gtsc() and then for symmetry also its inverse function.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Acked-by: Andrew Cooper <andrew.cooper@citrix.com>

Thanks.

>> ---
>> I was on the edge of also folding the various is_hvm_domain() into a
>> function scope boolean, but then wasn't really certain that this
>> wouldn't open up undue speculation opportunities.
> 
> I can't see anything interesting under here speculation wise. 
> Commentary inline.

My interpretation of those comments is that the suggested conversion
would be okay-ish (as in not making things worse), but since you didn't
provide an explicit answer I thought I'd better ask for confirmation
before possibly making a patch to that effect.

Jan

>> --- a/xen/arch/x86/include/asm/time.h
>> +++ b/xen/arch/x86/include/asm/time.h
>> @@ -52,8 +52,8 @@ uint64_t cf_check acpi_pm_tick_to_ns(uin
>>  uint64_t tsc_ticks2ns(uint64_t ticks);
>>  
>>  uint64_t pv_soft_rdtsc(const struct vcpu *v, const struct cpu_user_regs *regs);
>> -u64 gtime_to_gtsc(struct domain *d, u64 time);
>> -u64 gtsc_to_gtime(struct domain *d, u64 tsc);
>> +uint64_t gtime_to_gtsc(const struct domain *d, uint64_t time);
>> +uint64_t gtsc_to_gtime(const struct domain *d, uint64_t tsc);
>>  
>>  int tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
>>                   uint32_t gtsc_khz, uint32_t incarnation);
>> --- a/xen/arch/x86/time.c
>> +++ b/xen/arch/x86/time.c
>> @@ -1373,18 +1373,14 @@ uint64_t tsc_ticks2ns(uint64_t ticks)
>>      return scale_delta(ticks, &t->tsc_scale);
>>  }
>>  
>> -static void __update_vcpu_system_time(struct vcpu *v, int force)
>> +static void collect_time_info(const struct vcpu *v,
>> +                              struct vcpu_time_info *u)
>>  {
>> -    const struct cpu_time *t;
>> -    struct vcpu_time_info *u, _u = {};
>> -    struct domain *d = v->domain;
>> +    const struct cpu_time *t = &this_cpu(cpu_time);
>> +    const struct domain *d = v->domain;
>>      s_time_t tsc_stamp;
>>  
>> -    if ( v->vcpu_info == NULL )
>> -        return;
>> -
>> -    t = &this_cpu(cpu_time);
>> -    u = &vcpu_info(v, time);
>> +    memset(u, 0, sizeof(*u));
>>  
>>      if ( d->arch.vtsc )
>>      {
>> @@ -1392,7 +1388,7 @@ static void __update_vcpu_system_time(st
>>  
>>          if ( is_hvm_domain(d) )
>>          {
>> -            struct pl_time *pl = v->domain->arch.hvm.pl_time;
>> +            const struct pl_time *pl = d->arch.hvm.pl_time;
> 
> A PV guest could in in principle use...
> 
>>  
>>              stime += pl->stime_offset + v->arch.hvm.stime_offset;
> 
> ... this pl->stime_offset as the second deference of a whatever happens
> to sit under d->arch.hvm.pl_time in the pv union.
> 
> In the current build of Xen I have to hand, that's
> d->arch.pv.mapcache.{epoch,tlbflush_timestamp}, the combination of which
> doesn't seem like it can be steered into being a legal pointer into Xen.
> 
>>              if ( stime >= 0 )
>> @@ -1403,27 +1399,27 @@ static void __update_vcpu_system_time(st
>>          else
>>              tsc_stamp = gtime_to_gtsc(d, stime);
>>  
>> -        _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
>> -        _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
>> +        u->tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
>> +        u->tsc_shift         = d->arch.vtsc_to_ns.shift;
>>      }
>>      else
>>      {
>>          if ( is_hvm_domain(d) && hvm_tsc_scaling_supported )
> 
> On the other hand, this is isn't safe.  There's no protection of the &&
> calculation, but...
> 
>>          {
>>              tsc_stamp            = hvm_scale_tsc(d, t->stamp.local_tsc);
> 
> ... this path is the only path subject to speculative type confusion,
> and all it does is read d->arch.hvm.tsc_scaling_ratio, so is
> appropriately protected in this instance.
> 
> Also, all an attacker could do is encode the scaling ratio alongside
> t->stamp.local_tsc (unpredictable) in the calculation for the duration
> of the speculative window, with no way I can see to dereference the result.
> 
> 
>> -            _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
>> -            _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
>> +            u->tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
>> +            u->tsc_shift         = d->arch.vtsc_to_ns.shift;
>>          }
>>          else
>>          {
>>              tsc_stamp            = t->stamp.local_tsc;
>> -            _u.tsc_to_system_mul = t->tsc_scale.mul_frac;
>> -            _u.tsc_shift         = t->tsc_scale.shift;
>> +            u->tsc_to_system_mul = t->tsc_scale.mul_frac;
>> +            u->tsc_shift         = t->tsc_scale.shift;
>>          }
>>      }
>>  
>> -    _u.tsc_timestamp = tsc_stamp;
>> -    _u.system_time   = t->stamp.local_stime;
>> +    u->tsc_timestamp = tsc_stamp;
>> +    u->system_time   = t->stamp.local_stime;
>>  
>>      /*
>>       * It's expected that domains cope with this bit changing on every
>> @@ -1431,10 +1427,21 @@ static void __update_vcpu_system_time(st
>>       * or if it further requires monotonicity checks with other vcpus.
>>       */
>>      if ( clocksource_is_tsc() )
>> -        _u.flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
>> +        u->flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
>>  
>>      if ( is_hvm_domain(d) )
>> -        _u.tsc_timestamp += v->arch.hvm.cache_tsc_offset;
>> +        u->tsc_timestamp += v->arch.hvm.cache_tsc_offset;
> 
> This path is subject to type confusion on v->arch.{pv,hvm}, with a PV
> guest able to encode the value of v->arch.pv.ctrlreg[5] into the
> timestamp.  But again, no way to dereference the result.
> 
> 
> I really don't think there's enough flexibility here for even a
> perfectly-timed attacker to abuse.
> 
> ~Andrew



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 03/10] domain: GADDR based shared guest area registration alternative - teardown
  2023-01-17 21:17   ` Andrew Cooper
@ 2023-01-18  8:59     ` Jan Beulich
  0 siblings, 0 replies; 47+ messages in thread
From: Jan Beulich @ 2023-01-18  8:59 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: George Dunlap, Julien Grall, Stefano Stabellini, Wei Liu,
	Roger Pau Monne, xen-devel, Juergen Gross, Boris Ostrovsky

On 17.01.2023 22:17, Andrew Cooper wrote:
> On 19/10/2022 8:40 am, Jan Beulich wrote:
>> In preparation of the introduction of new vCPU operations allowing to
>> register the respective areas (one of the two is x86-specific) by
>> guest-physical address, add the necessary domain cleanup hooks.
> 
> What are the two areas you're discussing here?
> 
> I assume you mean VCPUOP_register_vcpu_time_memory_area to be the
> x86-specific op, but ARM permits both  VCPUOP_register_vcpu_info and
> VCPUOP_register_runstate_memory_area.
> 
> So isn't it more 2+1 rather than 1+1?

Not in my view: The vcpu_info registration is already physical address
based, and there's also no new vCPU operation introduced there.

>> ---
>> RFC: Zapping the areas in pv_shim_shutdown() may not be strictly
>>      necessary: Aiui unmap_vcpu_info() is called only because the vCPU
>>      info area cannot be re-registered. Beyond that I guess the
>>      assumption is that the areas would only be re-registered as they
>>      were before. If that's not the case I wonder whether the guest
>>      handles for both areas shouldn't also be zapped.
> 
> At this point in pv_shim_shutdown(), have already come out of suspend
> (successfully) and are preparing to poke the PV guest out of suspend too.
> 
> The PV guest needs to not have its subsequent VCPUOP_register_vcpu_info
> call fail, but beyond that I can entirely believe that it was coded to
> "Linux doesn't crash" rather than "what's everything we ought to reset
> here" - recall that we weren't exactly flush with time when trying to
> push Shim out of the door.
> 
> Whatever does get reregstered will be reregistered at the same
> position.  No guest at all is going to have details like that dynamic
> across migrate.

I read this as "keep code as is, drop RFC remark", but that's not
necessarily the only way to interpret your reply.

> As a tangential observation, i see the periodic timer gets rearmed. 
> This is still one of the more insane default properties of a PV guest;
> Linux intentionally clobbers it on boot, but I can equivalent logic to
> re-clobber after resume.

I guess you meant s/can/can't spot/, in which case let's Cc Linux
folks for awareness.

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 05/10] x86: update GADDR based secondary time area
  2023-01-17 20:31   ` Andrew Cooper
@ 2023-01-18  9:25     ` Jan Beulich
  0 siblings, 0 replies; 47+ messages in thread
From: Jan Beulich @ 2023-01-18  9:25 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, Wei Liu, Roger Pau Monne, xen-devel

On 17.01.2023 21:31, Andrew Cooper wrote:
> On 19/10/2022 8:41 am, Jan Beulich wrote:
>> --- a/xen/arch/x86/time.c
>> +++ b/xen/arch/x86/time.c
>> @@ -1462,12 +1462,34 @@ static void __update_vcpu_system_time(st
>>          v->arch.pv.pending_system_time = _u;
>>  }
>>  
>> +static void write_time_guest_area(struct vcpu_time_info *map,
>> +                                  const struct vcpu_time_info *src)
>> +{
>> +    /* 1. Update userspace version. */
>> +    write_atomic(&map->version, src->version);
> 
> version_update_begin()

Not really, no. src->version was already bumped, and the above is
the equivalent of

    /* 2. Update all other userspace fields. */
    __copy_to_guest(user_u, u, 1);

in pre-existing code (which also doesn't bump).

However, you point out a bug in patch 9: There I need to set the
version to ~0 between collect_time_info() and write_time_guest_area(),
to cover for the subsequent version_update_end(). (Using
version_update_begin() there wouldn't be correct, as
force_update_secondary_system_time() is used to first populate the
area, and we also shouldn't leave version at 2 once done, as that
might get in conflict with subsequent updates mirroring the version
from the "main" area.)
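
For context, the consumer side relies on the usual seqcount-like protocol;
roughly (an illustrative reader loop, not actual guest code, using
Xen-style helper names for brevity):

    static struct vcpu_time_info read_time_info(const struct vcpu_time_info *map)
    {
        struct vcpu_time_info u;
        uint32_t ver;

        /*
         * An odd version means an update is in progress: retry.  Re-check
         * the version after copying to ensure a consistent snapshot.
         */
        do {
            ver = read_atomic(&map->version);
            smp_rmb();
            u = *map;
            smp_rmb();
        } while ( (ver & 1) || read_atomic(&map->version) != ver );

        return u;
    }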

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2023-01-17 22:04   ` Andrew Cooper
@ 2023-01-18  9:55     ` Jan Beulich
  2023-01-20 18:15       ` Andrew Cooper
  0 siblings, 1 reply; 47+ messages in thread
From: Jan Beulich @ 2023-01-18  9:55 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: George Dunlap, Julien Grall, Stefano Stabellini, Wei Liu,
	Roger Pau Monne, xen-devel

On 17.01.2023 23:04, Andrew Cooper wrote:
> On 19/10/2022 8:43 am, Jan Beulich wrote:
>> The registration by virtual/linear address has downsides: At least on
>> x86 the access is expensive for HVM/PVH domains. Furthermore for 64-bit
>> PV domains the areas are inaccessible (and hence cannot be updated by
>> Xen) when in guest-user mode.
> 
> They're also inaccessible in HVM guests (x86 and ARM) when Meltdown
> mitigations are in place.

I've added this explicitly, but ...

> And lets not get started on the multitude of layering violations that is
> guest_memory_policy() for nested virt.  In fact, prohibiting any form of
> map-by-va is a perquisite to any rational attempt to make nested virt work.
> 
> (In fact, that infrastructure needs purging before any other
> architecture pick up stubs too.)
> 
> They're also inaccessible in general because no architecture has
> hypervisor privilege in a normal user/supervisor split, and you don't
> know whether the mapping is over supervisor or user mapping, and
> settings like SMAP/PAN can cause the pagewalk to fail even when the
> mapping is in place.

... I'm now merely saying that there are yet more reasons, rather than
trying to enumerate them all.

>> In preparation of the introduction of new vCPU operations allowing to
>> register the respective areas (one of the two is x86-specific) by
>> guest-physical address, flesh out the map/unmap functions.
>>
>> Noteworthy differences from map_vcpu_info():
>> - areas can be registered more than once (and de-registered),
> 
> When register by GFN is available, there is never a good reason to the
> same area twice.

Why not? Why shouldn't different entities be permitted to register their
areas, one after the other? This at the very least requires a way to
de-register.

> The guest maps one MMIO-like region, and then constructs all the regular
> virtual addresses mapping it (or not) that it wants.
> 
> This API is new, so we can enforce sane behaviour from the outset.  In
> particular, it will help with ...
> 
>> - remote vCPU-s are paused rather than checked for being down (which in
>>   principle can change right after the check),
>> - the domain lock is taken for a much smaller region.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> RFC: By using global domain page mappings the demand on the underlying
>>      VA range may increase significantly. I did consider to use per-
>>      domain mappings instead, but they exist for x86 only. Of course we
>>      could have arch_{,un}map_guest_area() aliasing global domain page
>>      mapping functions on Arm and using per-domain mappings on x86. Yet
>>      then again map_vcpu_info() doesn't do so either (albeit that's
>>      likely to be converted subsequently to use map_vcpu_area() anyway).
> 
> ... this by providing a bound on the amount of vmap() space can be consumed.

I'm afraid I don't understand. When re-registering a different area, the
earlier one will be unmapped. The consumption of vmap space cannot grow
(or else we'd have a resource leak and hence an XSA).

>> RFC: In map_guest_area() I'm not checking the P2M type, instead - just
>>      like map_vcpu_info() - solely relying on the type ref acquisition.
>>      Checking for p2m_ram_rw alone would be wrong, as at least
>>      p2m_ram_logdirty ought to also be okay to use here (and in similar
>>      cases, e.g. in Argo's find_ring_mfn()). p2m_is_pageable() could be
>>      used here (like altp2m_vcpu_enable_ve() does) as well as in
>>      map_vcpu_info(), yet then again the P2M type is stale by the time
>>      it is being looked at anyway without the P2M lock held.
> 
> Again, another error caused by Xen not knowing the guest physical
> address layout.  These mappings should be restricted to just RAM regions
> and I think we want to enforce that right from the outset.

Meaning what exactly in terms of action for me to take? As said, checking
the P2M type is pointless. So without you being more explicit, all I can
take your reply for is merely a comment, with no action on my part (not
even to remove this RFC remark).

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2023-01-17 22:20     ` Andrew Cooper
  2023-01-17 22:45       ` Julien Grall
@ 2023-01-18  9:59       ` Jan Beulich
  2023-01-20 17:44         ` Andrew Cooper
  1 sibling, 1 reply; 47+ messages in thread
From: Jan Beulich @ 2023-01-18  9:59 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: George Dunlap, Stefano Stabellini, Wei Liu, Roger Pau Monne,
	Julien Grall, xen-devel

On 17.01.2023 23:20, Andrew Cooper wrote:
> On 24/11/2022 9:29 pm, Julien Grall wrote:
>> On 19/10/2022 09:43, Jan Beulich wrote:
>>> The registration by virtual/linear address has downsides: At least on
>>> x86 the access is expensive for HVM/PVH domains. Furthermore for 64-bit
>>> PV domains the areas are inaccessible (and hence cannot be updated by
>>> Xen) when in guest-user mode.
>>>
>>> In preparation of the introduction of new vCPU operations allowing to
>>> register the respective areas (one of the two is x86-specific) by
>>> guest-physical address, flesh out the map/unmap functions.
>>>
>>> Noteworthy differences from map_vcpu_info():
>>> - areas can be registered more than once (and de-registered),
>>> - remote vCPU-s are paused rather than checked for being down (which in
>>>    principle can change right after the check),
>>> - the domain lock is taken for a much smaller region.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> RFC: By using global domain page mappings the demand on the underlying
>>>       VA range may increase significantly. I did consider to use per-
>>>       domain mappings instead, but they exist for x86 only. Of course we
>>>       could have arch_{,un}map_guest_area() aliasing global domain page
>>>       mapping functions on Arm and using per-domain mappings on x86. Yet
>>>       then again map_vcpu_info() doesn't do so either (albeit that's
>>>       likely to be converted subsequently to use map_vcpu_area()
>>> anyway).
>>>
>>> RFC: In map_guest_area() I'm not checking the P2M type, instead - just
>>>       like map_vcpu_info() - solely relying on the type ref acquisition.
>>>       Checking for p2m_ram_rw alone would be wrong, as at least
>>>       p2m_ram_logdirty ought to also be okay to use here (and in similar
>>>       cases, e.g. in Argo's find_ring_mfn()). p2m_is_pageable() could be
>>>       used here (like altp2m_vcpu_enable_ve() does) as well as in
>>>       map_vcpu_info(), yet then again the P2M type is stale by the time
>>>       it is being looked at anyway without the P2M lock held.
>>>
>>> --- a/xen/common/domain.c
>>> +++ b/xen/common/domain.c
>>> @@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
>>>                      struct guest_area *area,
>>>                      void (*populate)(void *dst, struct vcpu *v))
>>>   {
>>> -    return -EOPNOTSUPP;
>>> +    struct domain *currd = v->domain;
>>> +    void *map = NULL;
>>> +    struct page_info *pg = NULL;
>>> +    int rc = 0;
>>> +
>>> +    if ( gaddr )
>>
>> 0 is technically a valid (guest) physical address on Arm.
> 
> It is on x86 too, but that's not why 0 is generally considered an
> invalid address.
> 
> See the multitude of XSAs, and near-XSAs which have been caused by bad
> logic in Xen when trying to make a variable held in struct
> vcpu/domain have a default value other than 0.
> 
> It's not impossible to write such code safely, and in this case I expect
> it can be done by the NULLness (or not) of the mapping pointer, rather
> than by stashing the gaddr, but history has proved repeatedly that this
> is a very fertile source of security bugs.

I'm checking a value passed in from the guest here. No checking of internal
state can replace that. The checks on internal state leverage zero-init:

 unmap:
    if ( pg )
    {
        unmap_domain_page_global(map);
        put_page_and_type(pg);
    }

It's also not clear to me whether, as Julien seems to have read it, you
mean to ask that I revert to using 0 as the "invalid" (i.e. request for
unmap) indicator.

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2023-01-18  9:59       ` Jan Beulich
@ 2023-01-20 17:44         ` Andrew Cooper
  0 siblings, 0 replies; 47+ messages in thread
From: Andrew Cooper @ 2023-01-20 17:44 UTC (permalink / raw)
  To: Jan Beulich
  Cc: George Dunlap, Stefano Stabellini, Wei Liu, Roger Pau Monne,
	Julien Grall, xen-devel

On 18/01/2023 9:59 am, Jan Beulich wrote:
> On 17.01.2023 23:20, Andrew Cooper wrote:
>> On 24/11/2022 9:29 pm, Julien Grall wrote:
>>> On 19/10/2022 09:43, Jan Beulich wrote:
>>>> The registration by virtual/linear address has downsides: At least on
>>>> x86 the access is expensive for HVM/PVH domains. Furthermore for 64-bit
>>>> PV domains the areas are inaccessible (and hence cannot be updated by
>>>> Xen) when in guest-user mode.
>>>>
>>>> In preparation of the introduction of new vCPU operations allowing to
>>>> register the respective areas (one of the two is x86-specific) by
>>>> guest-physical address, flesh out the map/unmap functions.
>>>>
>>>> Noteworthy differences from map_vcpu_info():
>>>> - areas can be registered more than once (and de-registered),
>>>> - remote vCPU-s are paused rather than checked for being down (which in
>>>>    principle can change right after the check),
>>>> - the domain lock is taken for a much smaller region.
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> ---
>>>> RFC: By using global domain page mappings the demand on the underlying
>>>>       VA range may increase significantly. I did consider to use per-
>>>>       domain mappings instead, but they exist for x86 only. Of course we
>>>>       could have arch_{,un}map_guest_area() aliasing global domain page
>>>>       mapping functions on Arm and using per-domain mappings on x86. Yet
>>>>       then again map_vcpu_info() doesn't do so either (albeit that's
>>>>       likely to be converted subsequently to use map_vcpu_area()
>>>> anyway).
>>>>
>>>> RFC: In map_guest_area() I'm not checking the P2M type, instead - just
>>>>       like map_vcpu_info() - solely relying on the type ref acquisition.
>>>>       Checking for p2m_ram_rw alone would be wrong, as at least
>>>>       p2m_ram_logdirty ought to also be okay to use here (and in similar
>>>>       cases, e.g. in Argo's find_ring_mfn()). p2m_is_pageable() could be
>>>>       used here (like altp2m_vcpu_enable_ve() does) as well as in
>>>>       map_vcpu_info(), yet then again the P2M type is stale by the time
>>>>       it is being looked at anyway without the P2M lock held.
>>>>
>>>> --- a/xen/common/domain.c
>>>> +++ b/xen/common/domain.c
>>>> @@ -1563,7 +1563,82 @@ int map_guest_area(struct vcpu *v, paddr
>>>>                      struct guest_area *area,
>>>>                      void (*populate)(void *dst, struct vcpu *v))
>>>>   {
>>>> -    return -EOPNOTSUPP;
>>>> +    struct domain *currd = v->domain;
>>>> +    void *map = NULL;
>>>> +    struct page_info *pg = NULL;
>>>> +    int rc = 0;
>>>> +
>>>> +    if ( gaddr )
>>> 0 is technically a valid (guest) physical address on Arm.
>> It is on x86 too, but that's not why 0 is generally considered an
>> invalid address.
>>
>> See the multitude of XSAs, and near-XSAs which have been caused by bad
>> logic in Xen when trying to make a variable held in struct
>> vcpu/domain have a default value other than 0.
>>
>> It's not impossible to write such code safely, and in this case I expect
>> it can be done by the NULLness (or not) of the mapping pointer, rather
>> than by stashing the gaddr, but history has proved repeatedly that this
>> is a very fertile source of security bugs.
> I'm checking a value passed in from the guest here. No checking of internal
> state can replace that. The checks on internal state leverage zero-init:
>
>  unmap:
>     if ( pg )
>     {
>         unmap_domain_page_global(map);
>         put_page_and_type(pg);
>     }
>
> It's also not clear to me whether, as Julien seems to have read it, you
> mean to ask that I revert to using 0 as the "invalid" (i.e. request for
> unmap) indicator.

I'm merely asking you to be extra careful and not add bugs to the error
paths.  And it appears that you have done it based on the NULLness of
the mapping pointer, which is fine.
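
I.e. (sketching with assumed field names, not necessarily the patch's actual
layout) the registered/unregistered state is carried by the pointers alone,
which zero-initialisation gets right without any special "invalid address"
value:

    struct guest_area {
        void *map;              /* non-NULL <=> an area is registered */
        struct page_info *pg;   /* page ref backing the mapping, or NULL */
    };

    /* Teardown stays safe even on half-initialised state, since both
     * fields start out zeroed: */
    if ( area->pg )
    {
        unmap_domain_page_global(area->map);
        put_page_and_type(area->pg);
        area->map = NULL;
        area->pg = NULL;
    }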

~Andrew

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2023-01-18  9:55     ` Jan Beulich
@ 2023-01-20 18:15       ` Andrew Cooper
  2023-01-23  8:23         ` Jan Beulich
  2023-01-23  8:29         ` Jan Beulich
  0 siblings, 2 replies; 47+ messages in thread
From: Andrew Cooper @ 2023-01-20 18:15 UTC (permalink / raw)
  To: Jan Beulich
  Cc: George Dunlap, Julien Grall, Stefano Stabellini, Wei Liu,
	Roger Pau Monne, xen-devel

On 18/01/2023 9:55 am, Jan Beulich wrote:
> On 17.01.2023 23:04, Andrew Cooper wrote:
>> On 19/10/2022 8:43 am, Jan Beulich wrote:
>>> The registration by virtual/linear address has downsides: At least on
>>> x86 the access is expensive for HVM/PVH domains. Furthermore for 64-bit
>>> PV domains the areas are inaccessible (and hence cannot be updated by
>>> Xen) when in guest-user mode.
>> They're also inaccessible in HVM guests (x86 and ARM) when Meltdown
>> mitigations are in place.
> I've added this explicitly, but ...
>
>> And let's not get started on the multitude of layering violations that is
>> guest_memory_policy() for nested virt.  In fact, prohibiting any form of
>> map-by-va is a prerequisite to any rational attempt to make nested virt work.
>>
>> (In fact, that infrastructure needs purging before any other
>> architecture picks up stubs too.)
>>
>> They're also inaccessible in general because no architecture has
>> hypervisor privilege in a normal user/supervisor split, and you don't
>> know whether the mapping is over supervisor or user mapping, and
>> settings like SMAP/PAN can cause the pagewalk to fail even when the
>> mapping is in place.
> ... I'm now merely saying that there are yet more reasons, rather than
> trying to enumerate them all.

That's fine.  I just wanted to point out that it's far more reasons than
were given the first time around.

>>> In preparation of the introduction of new vCPU operations allowing to
>>> register the respective areas (one of the two is x86-specific) by
>>> guest-physical address, flesh out the map/unmap functions.
>>>
>>> Noteworthy differences from map_vcpu_info():
>>> - areas can be registered more than once (and de-registered),
>> When register by GFN is available, there is never a good reason to
>> register the same area twice.
> Why not? Why shouldn't different entities be permitted to register their
> areas, one after the other? This at the very least requires a way to
> de-register.

Because it's useless and extra complexity.  From the point of view of
> any guest, it's an MMIO(ish) window that Xen happens to update the
content of.

You don't get systems where you can ask hardware for e.g. "another copy
of the HPET at mfn $foo please".

>> The guest maps one MMIO-like region, and then constructs all the regular
>> virtual addresses mapping it (or not) that it wants.
>>
>> This API is new, so we can enforce sane behaviour from the outset.  In
>> particular, it will help with ...
>>
>>> - remote vCPU-s are paused rather than checked for being down (which in
>>>   principle can change right after the check),
>>> - the domain lock is taken for a much smaller region.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> RFC: By using global domain page mappings the demand on the underlying
>>>      VA range may increase significantly. I did consider to use per-
>>>      domain mappings instead, but they exist for x86 only. Of course we
>>>      could have arch_{,un}map_guest_area() aliasing global domain page
>>>      mapping functions on Arm and using per-domain mappings on x86. Yet
>>>      then again map_vcpu_info() doesn't do so either (albeit that's
>>>      likely to be converted subsequently to use map_vcpu_area() anyway).
>> ... this by providing a bound on the amount of vmap() space that can be consumed.
> I'm afraid I don't understand. When re-registering a different area, the
> earlier one will be unmapped. The consumption of vmap space cannot grow
> (or else we'd have a resource leak and hence an XSA).

In which case you mean "can be re-registered elsewhere".  More
specifically, the area can be moved, and isn't a singleton operation
like map_vcpu_info was.

The wording as presented firmly suggests the presence of an XSA.

>>> RFC: In map_guest_area() I'm not checking the P2M type, instead - just
>>>      like map_vcpu_info() - solely relying on the type ref acquisition.
>>>      Checking for p2m_ram_rw alone would be wrong, as at least
>>>      p2m_ram_logdirty ought to also be okay to use here (and in similar
>>>      cases, e.g. in Argo's find_ring_mfn()). p2m_is_pageable() could be
>>>      used here (like altp2m_vcpu_enable_ve() does) as well as in
>>>      map_vcpu_info(), yet then again the P2M type is stale by the time
>>>      it is being looked at anyway without the P2M lock held.
>> Again, another error caused by Xen not knowing the guest physical
>> address layout.  These mappings should be restricted to just RAM regions
>> and I think we want to enforce that right from the outset.
> Meaning what exactly in terms of action for me to take? As said, checking
> the P2M type is pointless. So without you being more explicit, all I can
> take your reply for is merely a comment, with no action on my part (not
> even to remove this RFC remark).

There will come a point where it will need to become prohibited to
issue this against something which isn't p2m_type_ram.  If we had a sane
idea of the guest physmap, I'd go as far as saying E820_RAM, but that's
clearly not feasible yet.

Even now, absolutely nothing good can possibly come of e.g. trying to
overlay it on the grant table, or a grant mapping.

ram || logdirty ought to exclude most cases we care about the guest
(not) putting the mapping.
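
Purely as an illustrative sketch of the kind of restriction meant here
(variable names assumed from context; the exact placement, helpers and error
values are left open, and as you note the type can be stale without the P2M
lock held):

    /* Illustrative only: refuse anything that isn't plain RAM or RAM
     * currently in log-dirty mode, before taking the type reference. */
    p2m_type_t t;
    struct page_info *pg = get_page_from_gfn(d, gfn, &t, P2M_ALLOC);

    if ( !pg )
        return -EINVAL;

    if ( t != p2m_ram_rw && t != p2m_ram_logdirty )
    {
        put_page(pg);
        return -EACCES;           /* error value illustrative */
    }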

~Andrew

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2023-01-20 18:15       ` Andrew Cooper
@ 2023-01-23  8:23         ` Jan Beulich
  2023-01-23  8:29         ` Jan Beulich
  1 sibling, 0 replies; 47+ messages in thread
From: Jan Beulich @ 2023-01-23  8:23 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: George Dunlap, Julien Grall, Stefano Stabellini, Wei Liu,
	Roger Pau Monne, xen-devel

On 20.01.2023 19:15, Andrew Cooper wrote:
> On 18/01/2023 9:55 am, Jan Beulich wrote:
>> On 17.01.2023 23:04, Andrew Cooper wrote:
>>> On 19/10/2022 8:43 am, Jan Beulich wrote:
>>>> Noteworthy differences from map_vcpu_info():
>>>> - areas can be registered more than once (and de-registered),
>>> When register by GFN is available, there is never a good reason to
>>> register the same area twice.
>> Why not? Why shouldn't different entities be permitted to register their
>> areas, one after the other? This at the very least requires a way to
>> de-register.
> 
> Because it's useless and extra complexity.  From the point of view of
> any guest, it's an MMIO(ish) window that Xen happens to update the
> content of.
> 
> You don't get systems where you can ask hardware for e.g. "another copy
> of the HPET at mfn $foo please".

I/O ports appear in multiple places on many systems. I think MMIO regions
can, too. And then I don't see why there couldn't be a way to actually
control this (via e.g. some chipset specific register).

>>>> RFC: By using global domain page mappings the demand on the underlying
>>>>      VA range may increase significantly. I did consider to use per-
>>>>      domain mappings instead, but they exist for x86 only. Of course we
>>>>      could have arch_{,un}map_guest_area() aliasing global domain page
>>>>      mapping functions on Arm and using per-domain mappings on x86. Yet
>>>>      then again map_vcpu_info() doesn't do so either (albeit that's
>>>>      likely to be converted subsequently to use map_vcpu_area() anyway).
>>> ... this by providing a bound on the amount of vmap() space that can be consumed.
>> I'm afraid I don't understand. When re-registering a different area, the
>> earlier one will be unmapped. The consumption of vmap space cannot grow
>> (or else we'd have a resource leak and hence an XSA).
> 
> In which case you mean "can be re-registered elsewhere".  More
> specifically, the area can be moved, and isn't a singleton operation
> like map_vcpu_info was.
> 
> The wording as presented firmly suggests the presence of an XSA.

You mean the "map_vcpu_info() doesn't do so either"? That talks about the
function not using per-domain mappings. There's no connection at all that
I can see to a missed unmapping, which at this point is the only thing I
can deduce you might be referring to.

>>>> RFC: In map_guest_area() I'm not checking the P2M type, instead - just
>>>>      like map_vcpu_info() - solely relying on the type ref acquisition.
>>>>      Checking for p2m_ram_rw alone would be wrong, as at least
>>>>      p2m_ram_logdirty ought to also be okay to use here (and in similar
>>>>      cases, e.g. in Argo's find_ring_mfn()). p2m_is_pageable() could be
>>>>      used here (like altp2m_vcpu_enable_ve() does) as well as in
>>>>      map_vcpu_info(), yet then again the P2M type is stale by the time
>>>>      it is being looked at anyway without the P2M lock held.
>>> Again, another error caused by Xen not knowing the guest physical
>>> address layout.  These mappings should be restricted to just RAM regions
>>> and I think we want to enforce that right from the outset.
>> Meaning what exactly in terms of action for me to take? As said, checking
>> the P2M type is pointless. So without you being more explicit, all I can
>> take your reply for is merely a comment, with no action on my part (not
>> even to remove this RFC remark).
> 
> There will come a point where it will need to become prohibited to
> issue this against something which isn't p2m_type_ram.  If we had a sane
> idea of the guest physmap, I'd go as far as saying E820_RAM, but that's
> clearly not feasible yet.
> 
> Even now, absolutely nothing good can possibly come of e.g. trying to
> overlay it on the grant table, or a grant mapping.
> 
> ram || logdirty ought to exclude most cases we care about the guest
> (not) putting the mapping.

It's still not clear to me what you want me to do: If I add the P2M type
check here including log-dirty, then this will be inconsistent with what
we do elsewhere _and_ useless code (for the time being). I hope you're
not making a scope-creeping request for me to "fix" all the other places
(I may not have found all) where such a P2M type check is either missing
or failing to include log-dirty.

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH RFC 07/10] domain: map/unmap GADDR based shared guest areas
  2023-01-20 18:15       ` Andrew Cooper
  2023-01-23  8:23         ` Jan Beulich
@ 2023-01-23  8:29         ` Jan Beulich
  1 sibling, 0 replies; 47+ messages in thread
From: Jan Beulich @ 2023-01-23  8:29 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: George Dunlap, Julien Grall, Stefano Stabellini, Wei Liu,
	Roger Pau Monne, xen-devel

On 20.01.2023 19:15, Andrew Cooper wrote:
> On 18/01/2023 9:55 am, Jan Beulich wrote:
>> On 17.01.2023 23:04, Andrew Cooper wrote:
>>> On 19/10/2022 8:43 am, Jan Beulich wrote:
>>>> In preparation of the introduction of new vCPU operations allowing to
>>>> register the respective areas (one of the two is x86-specific) by
>>>> guest-physical address, flesh out the map/unmap functions.
>>>>
>>>> Noteworthy differences from map_vcpu_info():
>>>> - areas can be registered more than once (and de-registered),
>>> When register by GFN is available, there is never a good reason to
>>> register the same area twice.
>> Why not? Why shouldn't different entities be permitted to register their
>> areas, one after the other? This at the very least requires a way to
>> de-register.
> 
> Because it's useless and extra complexity.

As to this: Looking at the code I think that I would actually add
complexity (just a little - an extra check) to prevent re-registration.
Things come out more naturally, from what I can tell, by allowing it.
This can also be seen in "common: convert vCPU info area registration"
where I'm actually adding such a (conditional) check to maintain the
"no re-registration" property of the sub-op there. Granted there can be
an argument towards making that check unconditional then ...
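
For illustration, the extra check in question would be of roughly this shape
(field name and error value are placeholders, not the actual code):

    /* Hypothetical: reject a second registration attempt outright. */
    if ( area->pg )         /* an area is already registered */
        return -EBUSY;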

Jan


^ permalink raw reply	[flat|nested] 47+ messages in thread

end of thread, other threads:[~2023-01-23  8:30 UTC | newest]

Thread overview: 47+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-19  7:35 [PATCH RFC 00/10] runstate/time area registration by (guest) physical address Jan Beulich
2022-10-19  7:38 ` [PATCH 01/10] unify update_runstate_area() Jan Beulich
2022-11-24 20:43   ` Julien Grall
2022-10-19  7:39 ` [PATCH 02/10] x86: split populating of struct vcpu_time_info into a separate function Jan Beulich
2023-01-17 20:19   ` Andrew Cooper
2023-01-18  7:32     ` Jan Beulich
2022-10-19  7:40 ` [PATCH RFC 03/10] domain: GADDR based shared guest area registration alternative - teardown Jan Beulich
2022-12-13 21:44   ` Julien Grall
2022-12-14  9:12     ` Jan Beulich
2023-01-17 21:17   ` Andrew Cooper
2023-01-18  8:59     ` Jan Beulich
2022-10-19  7:41 ` [PATCH RFC 04/10] domain: update GADDR based runstate guest area Jan Beulich
2022-12-16 12:26   ` Julien Grall
2022-12-19 12:48     ` Jan Beulich
2022-12-20  8:40       ` Julien Grall
2022-12-20  8:45         ` Jan Beulich
2022-12-20  8:48           ` Julien Grall
2022-12-20  9:59             ` Jan Beulich
2022-12-20 10:01               ` Julien Grall
2022-10-19  7:41 ` [PATCH RFC 05/10] x86: update GADDR based secondary time area Jan Beulich
2023-01-17 20:31   ` Andrew Cooper
2023-01-18  9:25     ` Jan Beulich
2022-10-19  7:42 ` [PATCH 06/10] x86/mem-sharing: copy GADDR based shared guest areas Jan Beulich
2022-10-24 23:04   ` Tamas K Lengyel
2022-10-25  6:18     ` Jan Beulich
2022-10-19  7:43 ` [PATCH RFC 07/10] domain: map/unmap " Jan Beulich
2022-11-24 21:29   ` Julien Grall
2022-11-28  9:01     ` Jan Beulich
2022-11-29  8:40       ` Julien Grall
2022-11-29 14:02         ` Jan Beulich
2022-12-06 18:27           ` Julien Grall
2022-12-07  8:29             ` Jan Beulich
2023-01-17 22:20     ` Andrew Cooper
2023-01-17 22:45       ` Julien Grall
2023-01-18  9:59       ` Jan Beulich
2023-01-20 17:44         ` Andrew Cooper
2023-01-17 22:04   ` Andrew Cooper
2023-01-18  9:55     ` Jan Beulich
2023-01-20 18:15       ` Andrew Cooper
2023-01-23  8:23         ` Jan Beulich
2023-01-23  8:29         ` Jan Beulich
2022-10-19  7:44 ` [PATCH 08/10] domain: introduce GADDR based runstate area registration alternative Jan Beulich
2022-10-19  7:45 ` [PATCH 09/10] x86: introduce GADDR based secondary time " Jan Beulich
2022-10-19  7:45 ` [PATCH RFC 10/10] common: convert vCPU info area registration Jan Beulich
2022-12-17 15:53   ` Julien Grall
2022-12-19 15:01     ` Jan Beulich
2022-12-20  8:52       ` Julien Grall
