xen-devel.lists.xenproject.org archive mirror
* [PATCH v2] xen/arm: Convert runstate address during hypcall
@ 2020-07-28 15:52 Bertrand Marquis
  2020-07-28 19:04 ` Stefano Stabellini
  2020-07-28 19:54 ` Jan Beulich
  0 siblings, 2 replies; 14+ messages in thread
From: Bertrand Marquis @ 2020-07-28 15:52 UTC (permalink / raw)
  To: xen-devel
  Cc: Stefano Stabellini, Julien Grall, Wei Liu, Andrew Cooper,
	Ian Jackson, George Dunlap, Jan Beulich, nd, Volodymyr Babchuk,
	Roger Pau Monné

At the moment on Arm, a Linux guest running with KPTI enabled will
cause the following error when a context switch happens in user mode:
(XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0

The error is caused by the virtual address for the runstate area
registered by the guest only being accessible when the guest is running
in kernel space when KPTI is enabled.

To solve this issue, this patch is doing the translation from virtual
address to physical address during the hypercall and mapping the
required pages using vmap. This is removing the conversion from virtual
to physical address during the context switch which is solving the
problem with KPTI.

This is done only on arm architecture, the behaviour on x86 is not
modified by this patch and the address conversion is done as before
during each context switch.

This is introducing several limitations in comparison to the previous
behaviour (on arm only):
- if the guest is remapping the area at a different physical address Xen
will continue to update the area at the previous physical address. As
the area is in kernel space and usually defined as a global variable this
is something which is believed not to happen. If this is required by a
guest, it will have to call the hypercall with the new area (even if it
is at the same virtual address).
- the area needs to be mapped during the hypercall. For the same reasons
as for the previous case, even if the area is registered for a different
vcpu. It is believed that registering an area using a virtual address
unmapped is not something done.

inline functions in headers could not be used as the architecture
domain.h is included before the global domain.h making it impossible
to use the struct vcpu inside the architecture header.
This should not have any performance impact as the hypercall is only
called once per vcpu usually.
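
For reference, a guest typically registers this area from its CPU bring-up
path by passing a kernel virtual address to
VCPUOP_register_runstate_memory_area. The following sketch is illustrative
only (the per-CPU variable and vcpu numbering are assumptions modelled on a
typical Linux guest) and is not part of this patch:

static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);

static void register_runstate_area(unsigned int cpu)
{
    struct vcpu_register_runstate_memory_area area;

    /*
     * A kernel virtual address is handed to Xen. With KPTI this mapping is
     * not visible while the guest runs in user space, which is what breaks
     * the translation done at context-switch time.
     */
    area.addr.v = &per_cpu(xen_runstate, cpu);

    if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area))
        BUG();
}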

Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>

---
  Changes in v2
    - use vmap to map the pages during the hypercall.
    - reintroduce initial copy during hypercall.

---
 xen/arch/arm/domain.c        | 160 +++++++++++++++++++++++++++++++----
 xen/arch/x86/domain.c        |  30 ++++++-
 xen/arch/x86/x86_64/domain.c |   4 +-
 xen/common/domain.c          |  19 ++---
 xen/include/asm-arm/domain.h |   9 ++
 xen/include/asm-x86/domain.h |  16 ++++
 xen/include/xen/domain.h     |   5 ++
 xen/include/xen/sched.h      |  16 +---
 8 files changed, 207 insertions(+), 52 deletions(-)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 31169326b2..c595438bd9 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -19,6 +19,7 @@
 #include <xen/sched.h>
 #include <xen/softirq.h>
 #include <xen/wait.h>
+#include <xen/vmap.h>
 
 #include <asm/alternative.h>
 #include <asm/cpuerrata.h>
@@ -275,36 +276,157 @@ static void ctxt_switch_to(struct vcpu *n)
     virt_timer_restore(n);
 }
 
-/* Update per-VCPU guest runstate shared memory area (if registered). */
-static void update_runstate_area(struct vcpu *v)
+static void cleanup_runstate_vcpu_locked(struct vcpu *v)
+{
+    if ( v->arch.runstate_guest )
+    {
+        vunmap((void *)((unsigned long)v->arch.runstate_guest & PAGE_MASK));
+
+        put_page(v->arch.runstate_guest_page[0]);
+
+        if ( v->arch.runstate_guest_page[1] )
+        {
+            put_page(v->arch.runstate_guest_page[1]);
+        }
+        v->arch.runstate_guest = NULL;
+    }
+}
+
+void arch_vcpu_cleanup_runstate(struct vcpu *v)
 {
-    void __user *guest_handle = NULL;
+    spin_lock(&v->arch.runstate_guest_lock);
+
+    cleanup_runstate_vcpu_locked(v);
+
+    spin_unlock(&v->arch.runstate_guest_lock);
+}
+
+static int setup_runstate_vcpu_locked(struct vcpu *v, vaddr_t vaddr)
+{
+    unsigned int offset;
+    mfn_t mfn[2];
+    struct page_info *page;
+    unsigned int numpages;
     struct vcpu_runstate_info runstate;
+    void *p;
 
-    if ( guest_handle_is_null(runstate_guest(v)) )
-        return;
+    /* user can pass a NULL address to unregister a previous area */
+    if ( vaddr == 0 )
+        return 0;
 
-    memcpy(&runstate, &v->runstate, sizeof(runstate));
+    offset = vaddr & ~PAGE_MASK;
 
-    if ( VM_ASSIST(v->domain, runstate_update_flag) )
+    /* provided address must be aligned to a 64bit */
+    if ( offset % alignof(struct vcpu_runstate_info) )
     {
-        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
-        guest_handle--;
-        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
-        smp_wmb();
+        gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
+                "Invalid offset\n", vaddr);
+        return -EINVAL;
+    }
+
+    page = get_page_from_gva(v, vaddr, GV2M_WRITE);
+    if ( !page )
+    {
+        gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
+                "Page is not mapped\n", vaddr);
+        return -EINVAL;
+    }
+    mfn[0] = page_to_mfn(page);
+    v->arch.runstate_guest_page[0] = page;
+
+    if ( offset > (PAGE_SIZE - sizeof(struct vcpu_runstate_info)) )
+    {
+        /* guest area is crossing pages */
+        page = get_page_from_gva(v, vaddr + PAGE_SIZE, GV2M_WRITE);
+        if ( !page )
+        {
+            put_page(v->arch.runstate_guest_page[0]);
+            gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
+                    "2nd Page is not mapped\n", vaddr);
+            return -EINVAL;
+        }
+        mfn[1] = page_to_mfn(page);
+        v->arch.runstate_guest_page[1] = page;
+        numpages = 2;
+    }
+    else
+    {
+        v->arch.runstate_guest_page[1] = NULL;
+        numpages = 1;
+    }
+
+    p = vmap(mfn, numpages);
+    if ( !p )
+    {
+        put_page(v->arch.runstate_guest_page[0]);
+        if ( numpages == 2 )
+            put_page(v->arch.runstate_guest_page[1]);
+
+        gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
+                "vmap error\n", vaddr);
+        return -EINVAL;
     }
 
-    __copy_to_guest(runstate_guest(v), &runstate, 1);
+    v->arch.runstate_guest = p + offset;
 
-    if ( guest_handle )
+    if (v == current)
     {
-        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        memcpy(v->arch.runstate_guest, &v->runstate, sizeof(v->runstate));
+    }
+    else
+    {
+        vcpu_runstate_get(v, &runstate);
+        memcpy(v->arch.runstate_guest, &runstate, sizeof(v->runstate));
+    }
+
+    return 0;
+}
+
+int arch_vcpu_setup_runstate(struct vcpu *v,
+                             struct vcpu_register_runstate_memory_area area)
+{
+    int rc;
+
+    spin_lock(&v->arch.runstate_guest_lock);
+
+    /* cleanup if we are recalled */
+    cleanup_runstate_vcpu_locked(v);
+
+    rc = setup_runstate_vcpu_locked(v, (vaddr_t)area.addr.v);
+
+    spin_unlock(&v->arch.runstate_guest_lock);
+
+    return rc;
+}
+
+
+/* Update per-VCPU guest runstate shared memory area (if registered). */
+static void update_runstate_area(struct vcpu *v)
+{
+    spin_lock(&v->arch.runstate_guest_lock);
+
+    if ( v->arch.runstate_guest )
+    {
+        if ( VM_ASSIST(v->domain, runstate_update_flag) )
+        {
+            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+            v->arch.runstate_guest->state_entry_time |= XEN_RUNSTATE_UPDATE;
+            smp_wmb();
+        }
+
+        memcpy(v->arch.runstate_guest, &v->runstate, sizeof(v->runstate));
+
+        /* copy must be done before switching the bit */
         smp_wmb();
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
+
+        if ( VM_ASSIST(v->domain, runstate_update_flag) )
+        {
+            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+            v->arch.runstate_guest->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        }
     }
+
+    spin_unlock(&v->arch.runstate_guest_lock);
 }
 
 static void schedule_tail(struct vcpu *prev)
@@ -560,6 +682,8 @@ int arch_vcpu_create(struct vcpu *v)
     v->arch.saved_context.sp = (register_t)v->arch.cpu_info;
     v->arch.saved_context.pc = (register_t)continue_new_vcpu;
 
+    spin_lock_init(&v->arch.runstate_guest_lock);
+
     /* Idle VCPUs don't need the rest of this setup */
     if ( is_idle_vcpu(v) )
         return rc;
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index fee6c3931a..98910b7cf1 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1642,6 +1642,30 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
         wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
 }
 
+int arch_vcpu_setup_runstate(struct vcpu *v,
+                             struct vcpu_register_runstate_memory_area area)
+{
+    struct vcpu_runstate_info runstate;
+
+    runstate_guest(v) = area.addr.h;
+
+    if ( v == current )
+    {
+        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
+    }
+    else
+    {
+        vcpu_runstate_get(v, &runstate);
+        __copy_to_guest(runstate_guest(v), &runstate, 1);
+    }
+    return 0;
+}
+
+void arch_vcpu_cleanup_runstate(struct vcpu *v)
+{
+    set_xen_guest_handle(runstate_guest(v), NULL);
+}
+
 /* Update per-VCPU guest runstate shared memory area (if registered). */
 bool update_runstate_area(struct vcpu *v)
 {
@@ -1660,8 +1684,8 @@ bool update_runstate_area(struct vcpu *v)
     if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
         guest_handle = has_32bit_shinfo(v->domain)
-            ? &v->runstate_guest.compat.p->state_entry_time + 1
-            : &v->runstate_guest.native.p->state_entry_time + 1;
+            ? &v->arch.runstate_guest.compat.p->state_entry_time + 1
+            : &v->arch.runstate_guest.native.p->state_entry_time + 1;
         guest_handle--;
         runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
         __raw_copy_to_guest(guest_handle,
@@ -1674,7 +1698,7 @@ bool update_runstate_area(struct vcpu *v)
         struct compat_vcpu_runstate_info info;
 
         XLAT_vcpu_runstate_info(&info, &runstate);
-        __copy_to_guest(v->runstate_guest.compat, &info, 1);
+        __copy_to_guest(v->arch.runstate_guest.compat, &info, 1);
         rc = true;
     }
     else
diff --git a/xen/arch/x86/x86_64/domain.c b/xen/arch/x86/x86_64/domain.c
index c46dccc25a..b879e8dd2c 100644
--- a/xen/arch/x86/x86_64/domain.c
+++ b/xen/arch/x86/x86_64/domain.c
@@ -36,7 +36,7 @@ arch_compat_vcpu_op(
             break;
 
         rc = 0;
-        guest_from_compat_handle(v->runstate_guest.compat, area.addr.h);
+        guest_from_compat_handle(v->arch.runstate_guest.compat, area.addr.h);
 
         if ( v == current )
         {
@@ -49,7 +49,7 @@ arch_compat_vcpu_op(
             vcpu_runstate_get(v, &runstate);
             XLAT_vcpu_runstate_info(&info, &runstate);
         }
-        __copy_to_guest(v->runstate_guest.compat, &info, 1);
+        __copy_to_guest(v->arch.runstate_guest.compat, &info, 1);
 
         break;
     }
diff --git a/xen/common/domain.c b/xen/common/domain.c
index e9be05f1d0..cd8595b186 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -727,7 +727,10 @@ int domain_kill(struct domain *d)
         if ( cpupool_move_domain(d, cpupool0) )
             return -ERESTART;
         for_each_vcpu ( d, v )
+        {
+            arch_vcpu_cleanup_runstate(v);
             unmap_vcpu_info(v);
+        }
         d->is_dying = DOMDYING_dead;
         /* Mem event cleanup has to go here because the rings 
          * have to be put before we call put_domain. */
@@ -1167,7 +1170,7 @@ int domain_soft_reset(struct domain *d)
 
     for_each_vcpu ( d, v )
     {
-        set_xen_guest_handle(runstate_guest(v), NULL);
+        arch_vcpu_cleanup_runstate(v);
         unmap_vcpu_info(v);
     }
 
@@ -1494,7 +1497,6 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
     case VCPUOP_register_runstate_memory_area:
     {
         struct vcpu_register_runstate_memory_area area;
-        struct vcpu_runstate_info runstate;
 
         rc = -EFAULT;
         if ( copy_from_guest(&area, arg, 1) )
@@ -1503,18 +1505,7 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
         if ( !guest_handle_okay(area.addr.h, 1) )
             break;
 
-        rc = 0;
-        runstate_guest(v) = area.addr.h;
-
-        if ( v == current )
-        {
-            __copy_to_guest(runstate_guest(v), &v->runstate, 1);
-        }
-        else
-        {
-            vcpu_runstate_get(v, &runstate);
-            __copy_to_guest(runstate_guest(v), &runstate, 1);
-        }
+        rc = arch_vcpu_setup_runstate(v, area);
 
         break;
     }
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 4e2f582006..9df4d10abb 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -206,6 +206,15 @@ struct arch_vcpu
      */
     bool need_flush_to_ram;
 
+    /* runstate guest lock */
+    spinlock_t runstate_guest_lock;
+
+    /* runstate guest info */
+    struct vcpu_runstate_info *runstate_guest;
+
+    /* runstate pages mapped for runstate_guest */
+    struct page_info *runstate_guest_page[2];
+
 }  __cacheline_aligned;
 
 void vcpu_show_execution_state(struct vcpu *);
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 6fd94c2e14..c369d22ccc 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -11,6 +11,11 @@
 #include <asm/x86_emulate.h>
 #include <public/vcpu.h>
 #include <public/hvm/hvm_info_table.h>
+#ifdef CONFIG_COMPAT
+#include <compat/vcpu.h>
+DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
+#endif
+
 
 #define has_32bit_shinfo(d)    ((d)->arch.has_32bit_shinfo)
 
@@ -627,6 +632,17 @@ struct arch_vcpu
     struct {
         bool next_interrupt_enabled;
     } monitor;
+
+#ifndef CONFIG_COMPAT
+# define runstate_guest(v) ((v)->arch.runstate_guest)
+    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
+#else
+# define runstate_guest(v) ((v)->arch.runstate_guest.native)
+    union {
+        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
+        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
+    } runstate_guest;
+#endif
 };
 
 struct guest_memory_policy
diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
index 7e51d361de..5e8cbba31d 100644
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -5,6 +5,7 @@
 #include <xen/types.h>
 
 #include <public/xen.h>
+#include <public/vcpu.h>
 #include <asm/domain.h>
 #include <asm/numa.h>
 
@@ -63,6 +64,10 @@ void arch_vcpu_destroy(struct vcpu *v);
 int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset);
 void unmap_vcpu_info(struct vcpu *v);
 
+int arch_vcpu_setup_runstate(struct vcpu *v,
+                             struct vcpu_register_runstate_memory_area area);
+void arch_vcpu_cleanup_runstate(struct vcpu *v);
+
 int arch_domain_create(struct domain *d,
                        struct xen_domctl_createdomain *config);
 
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index ac53519d7f..fac030fb83 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -29,11 +29,6 @@
 #include <public/vcpu.h>
 #include <public/event_channel.h>
 
-#ifdef CONFIG_COMPAT
-#include <compat/vcpu.h>
-DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
-#endif
-
 /*
  * Stats
  *
@@ -166,16 +161,7 @@ struct vcpu
     struct sched_unit *sched_unit;
 
     struct vcpu_runstate_info runstate;
-#ifndef CONFIG_COMPAT
-# define runstate_guest(v) ((v)->runstate_guest)
-    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
-#else
-# define runstate_guest(v) ((v)->runstate_guest.native)
-    union {
-        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
-        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
-    } runstate_guest; /* guest address */
-#endif
+
     unsigned int     new_state;
 
     /* Has the FPU been initialised? */
-- 
2.17.1




* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-28 15:52 [PATCH v2] xen/arm: Convert runstate address during hypcall Bertrand Marquis
@ 2020-07-28 19:04 ` Stefano Stabellini
  2020-07-29  6:47   ` Bertrand Marquis
  2020-07-28 19:54 ` Jan Beulich
  1 sibling, 1 reply; 14+ messages in thread
From: Stefano Stabellini @ 2020-07-28 19:04 UTC (permalink / raw)
  To: Bertrand Marquis
  Cc: Stefano Stabellini, Julien Grall, Wei Liu, Andrew Cooper,
	Ian Jackson, George Dunlap, Jan Beulich, xen-devel, nd,
	Volodymyr Babchuk, Roger Pau Monné

On Tue, 28 Jul 2020, Bertrand Marquis wrote:
> At the moment on Arm, a Linux guest running with KPTI enabled will
> cause the following error when a context switch happens in user mode:
> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
> 
> The error is caused by the virtual address for the runstate area
> registered by the guest only being accessible when the guest is running
> in kernel space when KPTI is enabled.
> 
> To solve this issue, this patch is doing the translation from virtual
> address to physical address during the hypercall and mapping the
> required pages using vmap. This is removing the conversion from virtual
> to physical address during the context switch which is solving the
> problem with KPTI.
> 
> This is done only on arm architecture, the behaviour on x86 is not
> modified by this patch and the address conversion is done as before
> during each context switch.
> 
> This is introducing several limitations in comparison to the previous
> behaviour (on arm only):
> - if the guest is remapping the area at a different physical address Xen
> will continue to update the area at the previous physical address. As
> the area is in kernel space and usually defined as a global variable this
> is something which is believed not to happen. If this is required by a
> guest, it will have to call the hypercall with the new area (even if it
> is at the same virtual address).
> - the area needs to be mapped during the hypercall. For the same reasons
> as for the previous case, even if the area is registered for a different
> vcpu. It is believed that registering an area using a virtual address
> unmapped is not something done.
> 
> inline functions in headers could not be used as the architecture
> domain.h is included before the global domain.h making it impossible
> to use the struct vcpu inside the architecture header.
> This should not have any performance impact as the hypercall is only
> called once per vcpu usually.
> 
> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
> ---
>   Changes in v2
>     - use vmap to map the pages during the hypercall.
>     - reintroduce initial copy during hypercall.
> 
> ---
>  xen/arch/arm/domain.c        | 160 +++++++++++++++++++++++++++++++----
>  xen/arch/x86/domain.c        |  30 ++++++-
>  xen/arch/x86/x86_64/domain.c |   4 +-
>  xen/common/domain.c          |  19 ++---
>  xen/include/asm-arm/domain.h |   9 ++
>  xen/include/asm-x86/domain.h |  16 ++++
>  xen/include/xen/domain.h     |   5 ++
>  xen/include/xen/sched.h      |  16 +---
>  8 files changed, 207 insertions(+), 52 deletions(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 31169326b2..c595438bd9 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -19,6 +19,7 @@
>  #include <xen/sched.h>
>  #include <xen/softirq.h>
>  #include <xen/wait.h>
> +#include <xen/vmap.h>
>  
>  #include <asm/alternative.h>
>  #include <asm/cpuerrata.h>
> @@ -275,36 +276,157 @@ static void ctxt_switch_to(struct vcpu *n)
>      virt_timer_restore(n);
>  }
>  
> -/* Update per-VCPU guest runstate shared memory area (if registered). */
> -static void update_runstate_area(struct vcpu *v)
> +static void cleanup_runstate_vcpu_locked(struct vcpu *v)
> +{
> +    if ( v->arch.runstate_guest )
> +    {
> +        vunmap((void *)((unsigned long)v->arch.runstate_guest & PAGE_MASK));
> +
> +        put_page(v->arch.runstate_guest_page[0]);
> +
> +        if ( v->arch.runstate_guest_page[1] )
> +        {
> +            put_page(v->arch.runstate_guest_page[1]);
> +        }
> +        v->arch.runstate_guest = NULL;
> +    }
> +}
> +
> +void arch_vcpu_cleanup_runstate(struct vcpu *v)
>  {
> -    void __user *guest_handle = NULL;
> +    spin_lock(&v->arch.runstate_guest_lock);
> +
> +    cleanup_runstate_vcpu_locked(v);
> +
> +    spin_unlock(&v->arch.runstate_guest_lock);
> +}
> +
> +static int setup_runstate_vcpu_locked(struct vcpu *v, vaddr_t vaddr)
> +{
> +    unsigned int offset;
> +    mfn_t mfn[2];
> +    struct page_info *page;
> +    unsigned int numpages;
>      struct vcpu_runstate_info runstate;
> +    void *p;
>  
> -    if ( guest_handle_is_null(runstate_guest(v)) )
> -        return;
> +    /* user can pass a NULL address to unregister a previous area */
> +    if ( vaddr == 0 )
> +        return 0;
>  
> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
> +    offset = vaddr & ~PAGE_MASK;
>  
> -    if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +    /* provided address must be aligned to a 64bit */
> +    if ( offset % alignof(struct vcpu_runstate_info) )
>      {
> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
> -        guest_handle--;
> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> -        smp_wmb();
> +        gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
> +                "Invalid offset\n", vaddr);
> +        return -EINVAL;
> +    }
> +
> +    page = get_page_from_gva(v, vaddr, GV2M_WRITE);
> +    if ( !page )
> +    {
> +        gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
> +                "Page is not mapped\n", vaddr);
> +        return -EINVAL;
> +    }
> +    mfn[0] = page_to_mfn(page);
> +    v->arch.runstate_guest_page[0] = page;
> +
> +    if ( offset > (PAGE_SIZE - sizeof(struct vcpu_runstate_info)) )
> +    {
> +        /* guest area is crossing pages */
> +        page = get_page_from_gva(v, vaddr + PAGE_SIZE, GV2M_WRITE);
> +        if ( !page )
> +        {
> +            put_page(v->arch.runstate_guest_page[0]);
> +            gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
> +                    "2nd Page is not mapped\n", vaddr);
> +            return -EINVAL;
> +        }
> +        mfn[1] = page_to_mfn(page);
> +        v->arch.runstate_guest_page[1] = page;
> +        numpages = 2;
> +    }
> +    else
> +    {
> +        v->arch.runstate_guest_page[1] = NULL;
> +        numpages = 1;
> +    }
> +
> +    p = vmap(mfn, numpages);
> +    if ( !p )
> +    {
> +        put_page(v->arch.runstate_guest_page[0]);
> +        if ( numpages == 2 )
> +            put_page(v->arch.runstate_guest_page[1]);
> +
> +        gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
> +                "vmap error\n", vaddr);
> +        return -EINVAL;
>      }
>  
> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> +    v->arch.runstate_guest = p + offset;
>  
> -    if ( guest_handle )
> +    if (v == current)
>      {
> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +        memcpy(v->arch.runstate_guest, &v->runstate, sizeof(v->runstate));
> +    }
> +    else
> +    {
> +        vcpu_runstate_get(v, &runstate);
> +        memcpy(v->arch.runstate_guest, &runstate, sizeof(v->runstate));
> +    }
> +
> +    return 0;
> +}


The arm32 build breaks with:

domain.c: In function 'setup_runstate_vcpu_locked':
domain.c:322:9: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vaddr_t' [-Werror=format=]
         gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
         ^
domain.c:330:9: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vaddr_t' [-Werror=format=]
         gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
         ^
domain.c:344:13: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vaddr_t' [-Werror=format=]
             gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
             ^
domain.c:365:9: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vaddr_t' [-Werror=format=]
         gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
         ^
cc1: all warnings being treated as errors
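
A possible fix, sketched here under the assumption that Xen's PRIvaddr
format macro is used for vaddr_t, would be to let the format string match
the type on both arm32 and arm64, e.g.:

    gprintk(XENLOG_WARNING,
            "Cannot map runstate pointer at 0x%"PRIvaddr": Invalid offset\n",
            vaddr);

Casting vaddr to unsigned long while keeping %lx would be another option.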


> +int arch_vcpu_setup_runstate(struct vcpu *v,
> +                             struct vcpu_register_runstate_memory_area area)
> +{
> +    int rc;
> +
> +    spin_lock(&v->arch.runstate_guest_lock);
> +
> +    /* cleanup if we are recalled */
> +    cleanup_runstate_vcpu_locked(v);
> +
> +    rc = setup_runstate_vcpu_locked(v, (vaddr_t)area.addr.v);
> +
> +    spin_unlock(&v->arch.runstate_guest_lock);
> +
> +    return rc;
> +}
> +
> +
> +/* Update per-VCPU guest runstate shared memory area (if registered). */
> +static void update_runstate_area(struct vcpu *v)
> +{
> +    spin_lock(&v->arch.runstate_guest_lock);
> +
> +    if ( v->arch.runstate_guest )
> +    {
> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +        {
> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> +            v->arch.runstate_guest->state_entry_time |= XEN_RUNSTATE_UPDATE;

Please use write_atomic (as suggested by Julien here:
https://marc.info/?l=xen-devel&m=159225391619240)
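
A sketch of what that could look like, assuming Xen's write_atomic() helper
(the exact form is left to the next revision):

            if ( VM_ASSIST(v->domain, runstate_update_flag) )
            {
                v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
                /* single atomic store of the guest-visible field */
                write_atomic(&v->arch.runstate_guest->state_entry_time,
                             v->runstate.state_entry_time);
                smp_wmb();
            }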


> +            smp_wmb();
> +        }
> +
> +        memcpy(v->arch.runstate_guest, &v->runstate, sizeof(v->runstate));
> +
> +        /* copy must be done before switching the bit */
>          smp_wmb();
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> +
> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
> +        {
> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +            v->arch.runstate_guest->state_entry_time &= ~XEN_RUNSTATE_UPDATE;

and here too

The rest looks OK to me.


> +        }
>      }
> +
> +    spin_unlock(&v->arch.runstate_guest_lock);
>  }
>  
>  static void schedule_tail(struct vcpu *prev)



* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-28 15:52 [PATCH v2] xen/arm: Convert runstate address during hypcall Bertrand Marquis
  2020-07-28 19:04 ` Stefano Stabellini
@ 2020-07-28 19:54 ` Jan Beulich
  2020-07-29  7:08   ` Bertrand Marquis
  1 sibling, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2020-07-28 19:54 UTC (permalink / raw)
  To: Bertrand Marquis
  Cc: Stefano Stabellini, Julien Grall, Wei Liu, Andrew Cooper,
	Ian Jackson, George Dunlap, xen-devel, nd, Volodymyr Babchuk,
	Roger Pau Monné

On 28.07.2020 17:52, Bertrand Marquis wrote:
> At the moment on Arm, a Linux guest running with KPTI enabled will
> cause the following error when a context switch happens in user mode:
> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
> 
> The error is caused by the virtual address for the runstate area
> registered by the guest only being accessible when the guest is running
> in kernel space when KPTI is enabled.
> 
> To solve this issue, this patch is doing the translation from virtual
> address to physical address during the hypercall and mapping the
> required pages using vmap. This is removing the conversion from virtual
> to physical address during the context switch which is solving the
> problem with KPTI.
> 
> This is done only on arm architecture, the behaviour on x86 is not
> modified by this patch and the address conversion is done as before
> during each context switch.
> 
> This is introducing several limitations in comparison to the previous
> behaviour (on arm only):
> - if the guest is remapping the area at a different physical address Xen
> will continue to update the area at the previous physical address. As
> the area is in kernel space and usually defined as a global variable this
> is something which is believed not to happen. If this is required by a
> guest, it will have to call the hypercall with the new area (even if it
> is at the same virtual address).
> - the area needs to be mapped during the hypercall. For the same reasons
> as for the previous case, even if the area is registered for a different
> vcpu. It is believed that registering an area using a virtual address
> unmapped is not something done.

Beside me thinking that an in-use and stable ABI can't be changed like
this, no matter what is "believed" kernel code may or may not do, I
also don't think having arch-es diverge in behavior here is a good
idea. Use of commonly available interfaces shouldn't lead to head
aches or surprises when porting code from one arch to another. I'm
pretty sure it was suggested before: Why don't you simply introduce
a physical address based hypercall (and then also on x86 at the same
time, keeping functional parity)? I even seem to recall giving a
suggestion how to fit this into a future "physical addresses only"
model, as long as we can settle on the basic principles of that
conversion path that we want to go sooner or later anyway (as I
understand).
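
As an illustration only (the hypercall name and the use of the addr.p field
below are assumptions, not an existing ABI), a guest-physical-address based
registration could look roughly like:

    struct vcpu_register_runstate_memory_area area;

    /* Hypothetical: hand Xen a guest physical address, not a virtual one. */
    area.addr.p = virt_to_phys(&per_cpu(xen_runstate, cpu));

    /*
     * VCPUOP_register_runstate_phys_area is an assumed name for such a new
     * op; an older hypervisor would return -ENOSYS and the guest could fall
     * back to the existing virtual-address based variant.
     */
    HYPERVISOR_vcpu_op(VCPUOP_register_runstate_phys_area, cpu, &area);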

> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1642,6 +1642,30 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
>           wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
>   }
>   
> +int arch_vcpu_setup_runstate(struct vcpu *v,
> +                             struct vcpu_register_runstate_memory_area area)
> +{
> +    struct vcpu_runstate_info runstate;
> +
> +    runstate_guest(v) = area.addr.h;
> +
> +    if ( v == current )
> +    {
> +        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
> +    }

Pointless braces (and I think there are more instances).

> +    else
> +    {
> +        vcpu_runstate_get(v, &runstate);
> +        __copy_to_guest(runstate_guest(v), &runstate, 1);
> +    }
> +    return 0;

Missing blank line before main "return".
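
I.e. with both remarks addressed the hunk would read along the lines of:

    if ( v == current )
        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
    else
    {
        vcpu_runstate_get(v, &runstate);
        __copy_to_guest(runstate_guest(v), &runstate, 1);
    }

    return 0;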

Jan



* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-28 19:04 ` Stefano Stabellini
@ 2020-07-29  6:47   ` Bertrand Marquis
  0 siblings, 0 replies; 14+ messages in thread
From: Bertrand Marquis @ 2020-07-29  6:47 UTC (permalink / raw)
  To: Stefano Stabellini
  Cc: Julien Grall, Wei Liu, Andrew Cooper, Ian Jackson, George Dunlap,
	Jan Beulich, Xen-devel, nd, Volodymyr Babchuk,
	Roger Pau Monné



> On 28 Jul 2020, at 21:04, Stefano Stabellini <sstabellini@kernel.org> wrote:
> 
> On Tue, 28 Jul 2020, Bertrand Marquis wrote:
>> At the moment on Arm, a Linux guest running with KPTI enabled will
>> cause the following error when a context switch happens in user mode:
>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>> 
>> The error is caused by the virtual address for the runstate area
>> registered by the guest only being accessible when the guest is running
>> in kernel space when KPTI is enabled.
>> 
>> To solve this issue, this patch is doing the translation from virtual
>> address to physical address during the hypercall and mapping the
>> required pages using vmap. This is removing the conversion from virtual
>> to physical address during the context switch which is solving the
>> problem with KPTI.
>> 
>> This is done only on arm architecture, the behaviour on x86 is not
>> modified by this patch and the address conversion is done as before
>> during each context switch.
>> 
>> This is introducing several limitations in comparison to the previous
>> behaviour (on arm only):
>> - if the guest is remapping the area at a different physical address Xen
>> will continue to update the area at the previous physical address. As
>> the area is in kernel space and usually defined as a global variable this
>> is something which is believed not to happen. If this is required by a
>> guest, it will have to call the hypercall with the new area (even if it
>> is at the same virtual address).
>> - the area needs to be mapped during the hypercall. For the same reasons
>> as for the previous case, even if the area is registered for a different
>> vcpu. It is believed that registering an area using a virtual address
>> unmapped is not something done.
>> 
>> inline functions in headers could not be used as the architecture
>> domain.h is included before the global domain.h making it impossible
>> to use the struct vcpu inside the architecture header.
>> This should not have any performance impact as the hypercall is only
>> called once per vcpu usually.
>> 
>> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
>> ---
>>  Changes in v2
>>    - use vmap to map the pages during the hypercall.
>>    - reintroduce initial copy during hypercall.
>> 
>> ---
>> xen/arch/arm/domain.c        | 160 +++++++++++++++++++++++++++++++----
>> xen/arch/x86/domain.c        |  30 ++++++-
>> xen/arch/x86/x86_64/domain.c |   4 +-
>> xen/common/domain.c          |  19 ++---
>> xen/include/asm-arm/domain.h |   9 ++
>> xen/include/asm-x86/domain.h |  16 ++++
>> xen/include/xen/domain.h     |   5 ++
>> xen/include/xen/sched.h      |  16 +---
>> 8 files changed, 207 insertions(+), 52 deletions(-)
>> 
>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>> index 31169326b2..c595438bd9 100644
>> --- a/xen/arch/arm/domain.c
>> +++ b/xen/arch/arm/domain.c
>> @@ -19,6 +19,7 @@
>> #include <xen/sched.h>
>> #include <xen/softirq.h>
>> #include <xen/wait.h>
>> +#include <xen/vmap.h>
>> 
>> #include <asm/alternative.h>
>> #include <asm/cpuerrata.h>
>> @@ -275,36 +276,157 @@ static void ctxt_switch_to(struct vcpu *n)
>>     virt_timer_restore(n);
>> }
>> 
>> -/* Update per-VCPU guest runstate shared memory area (if registered). */
>> -static void update_runstate_area(struct vcpu *v)
>> +static void cleanup_runstate_vcpu_locked(struct vcpu *v)
>> +{
>> +    if ( v->arch.runstate_guest )
>> +    {
>> +        vunmap((void *)((unsigned long)v->arch.runstate_guest & PAGE_MASK));
>> +
>> +        put_page(v->arch.runstate_guest_page[0]);
>> +
>> +        if ( v->arch.runstate_guest_page[1] )
>> +        {
>> +            put_page(v->arch.runstate_guest_page[1]);
>> +        }
>> +        v->arch.runstate_guest = NULL;
>> +    }
>> +}
>> +
>> +void arch_vcpu_cleanup_runstate(struct vcpu *v)
>> {
>> -    void __user *guest_handle = NULL;
>> +    spin_lock(&v->arch.runstate_guest_lock);
>> +
>> +    cleanup_runstate_vcpu_locked(v);
>> +
>> +    spin_unlock(&v->arch.runstate_guest_lock);
>> +}
>> +
>> +static int setup_runstate_vcpu_locked(struct vcpu *v, vaddr_t vaddr)
>> +{
>> +    unsigned int offset;
>> +    mfn_t mfn[2];
>> +    struct page_info *page;
>> +    unsigned int numpages;
>>     struct vcpu_runstate_info runstate;
>> +    void *p;
>> 
>> -    if ( guest_handle_is_null(runstate_guest(v)) )
>> -        return;
>> +    /* user can pass a NULL address to unregister a previous area */
>> +    if ( vaddr == 0 )
>> +        return 0;
>> 
>> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
>> +    offset = vaddr & ~PAGE_MASK;
>> 
>> -    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +    /* provided address must be aligned to a 64bit */
>> +    if ( offset % alignof(struct vcpu_runstate_info) )
>>     {
>> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>> -        guest_handle--;
>> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> -        smp_wmb();
>> +        gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
>> +                "Invalid offset\n", vaddr);
>> +        return -EINVAL;
>> +    }
>> +
>> +    page = get_page_from_gva(v, vaddr, GV2M_WRITE);
>> +    if ( !page )
>> +    {
>> +        gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
>> +                "Page is not mapped\n", vaddr);
>> +        return -EINVAL;
>> +    }
>> +    mfn[0] = page_to_mfn(page);
>> +    v->arch.runstate_guest_page[0] = page;
>> +
>> +    if ( offset > (PAGE_SIZE - sizeof(struct vcpu_runstate_info)) )
>> +    {
>> +        /* guest area is crossing pages */
>> +        page = get_page_from_gva(v, vaddr + PAGE_SIZE, GV2M_WRITE);
>> +        if ( !page )
>> +        {
>> +            put_page(v->arch.runstate_guest_page[0]);
>> +            gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
>> +                    "2nd Page is not mapped\n", vaddr);
>> +            return -EINVAL;
>> +        }
>> +        mfn[1] = page_to_mfn(page);
>> +        v->arch.runstate_guest_page[1] = page;
>> +        numpages = 2;
>> +    }
>> +    else
>> +    {
>> +        v->arch.runstate_guest_page[1] = NULL;
>> +        numpages = 1;
>> +    }
>> +
>> +    p = vmap(mfn, numpages);
>> +    if ( !p )
>> +    {
>> +        put_page(v->arch.runstate_guest_page[0]);
>> +        if ( numpages == 2 )
>> +            put_page(v->arch.runstate_guest_page[1]);
>> +
>> +        gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
>> +                "vmap error\n", vaddr);
>> +        return -EINVAL;
>>     }
>> 
>> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>> +    v->arch.runstate_guest = p + offset;
>> 
>> -    if ( guest_handle )
>> +    if (v == current)
>>     {
>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +        memcpy(v->arch.runstate_guest, &v->runstate, sizeof(v->runstate));
>> +    }
>> +    else
>> +    {
>> +        vcpu_runstate_get(v, &runstate);
>> +        memcpy(v->arch.runstate_guest, &runstate, sizeof(v->runstate));
>> +    }
>> +
>> +    return 0;
>> +}
> 
> 
> The arm32 build breaks with:
> 
> domain.c: In function 'setup_runstate_vcpu_locked':
> domain.c:322:9: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vaddr_t' [-Werror=format=]
>         gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
>         ^
> domain.c:330:9: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vaddr_t' [-Werror=format=]
>         gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
>         ^
> domain.c:344:13: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vaddr_t' [-Werror=format=]
>             gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
>             ^
> domain.c:365:9: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'vaddr_t' [-Werror=format=]
>         gprintk(XENLOG_WARNING, "Cannot map runstate pointer at 0x%lx: "
>         ^
> cc1: all warnings being treated as errors

My bad. I tested the x86 and arm64 builds but forgot the arm32 one.
I will fix that.

> 
> 
>> +int arch_vcpu_setup_runstate(struct vcpu *v,
>> +                             struct vcpu_register_runstate_memory_area area)
>> +{
>> +    int rc;
>> +
>> +    spin_lock(&v->arch.runstate_guest_lock);
>> +
>> +    /* cleanup if we are recalled */
>> +    cleanup_runstate_vcpu_locked(v);
>> +
>> +    rc = setup_runstate_vcpu_locked(v, (vaddr_t)area.addr.v);
>> +
>> +    spin_unlock(&v->arch.runstate_guest_lock);
>> +
>> +    return rc;
>> +}
>> +
>> +
>> +/* Update per-VCPU guest runstate shared memory area (if registered). */
>> +static void update_runstate_area(struct vcpu *v)
>> +{
>> +    spin_lock(&v->arch.runstate_guest_lock);
>> +
>> +    if ( v->arch.runstate_guest )
>> +    {
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +            v->arch.runstate_guest->state_entry_time |= XEN_RUNSTATE_UPDATE;
> 
> Please use write_atomic (as suggested by Julien here:
> https://marc.info/?l=xen-devel&m=159225391619240)

I will do that.

> 
> 
>> +            smp_wmb();
>> +        }
>> +
>> +        memcpy(v->arch.runstate_guest, &v->runstate, sizeof(v->runstate));
>> +
>> +        /* copy must be done before switching the bit */
>>         smp_wmb();
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> +
>> +        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>> +        {
>> +            v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +            v->arch.runstate_guest->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> 
> and here too
> 
> The rest looks OK to me.

Thanks for the review.
Regards
Bertrand

> 
> 
>> +        }
>>     }
>> +
>> +    spin_unlock(&v->arch.runstate_guest_lock);
>> }
>> 
>> static void schedule_tail(struct vcpu *prev)




* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-28 19:54 ` Jan Beulich
@ 2020-07-29  7:08   ` Bertrand Marquis
  2020-07-29 18:41     ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Bertrand Marquis @ 2020-07-29  7:08 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Stefano Stabellini, Julien Grall, Wei Liu, Andrew Cooper,
	Ian Jackson, George Dunlap, Xen-devel, nd, Volodymyr Babchuk,
	Roger Pau Monné



> On 28 Jul 2020, at 21:54, Jan Beulich <jbeulich@suse.com> wrote:
> 
> On 28.07.2020 17:52, Bertrand Marquis wrote:
>> At the moment on Arm, a Linux guest running with KPTI enabled will
>> cause the following error when a context switch happens in user mode:
>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>> The error is caused by the virtual address for the runstate area
>> registered by the guest only being accessible when the guest is running
>> in kernel space when KPTI is enabled.
>> To solve this issue, this patch is doing the translation from virtual
>> address to physical address during the hypercall and mapping the
>> required pages using vmap. This is removing the conversion from virtual
>> to physical address during the context switch which is solving the
>> problem with KPTI.
>> This is done only on arm architecture, the behaviour on x86 is not
>> modified by this patch and the address conversion is done as before
>> during each context switch.
>> This is introducing several limitations in comparison to the previous
>> behaviour (on arm only):
>> - if the guest is remapping the area at a different physical address Xen
>> will continue to update the area at the previous physical address. As
>> the area is in kernel space and usually defined as a global variable this
>> is something which is believed not to happen. If this is required by a
>> guest, it will have to call the hypercall with the new area (even if it
>> is at the same virtual address).
>> - the area needs to be mapped during the hypercall. For the same reasons
>> as for the previous case, even if the area is registered for a different
>> vcpu. It is believed that registering an area using a virtual address
>> unmapped is not something done.
> 
> Beside me thinking that an in-use and stable ABI can't be changed like
> this, no matter what is "believed" kernel code may or may not do, I
> also don't think having arch-es diverge in behavior here is a good
> idea. Use of commonly available interfaces shouldn't lead to head
> aches or surprises when porting code from one arch to another. I'm
> pretty sure it was suggested before: Why don't you simply introduce
> a physical address based hypercall (and then also on x86 at the same
> time, keeping functional parity)? I even seem to recall giving a
> suggestion how to fit this into a future "physical addresses only"
> model, as long as we can settle on the basic principles of that
> conversion path that we want to go sooner or later anyway (as I
> understand).

I fully agree with the “physical address only” model and I think it must be
done. Introducing a new hypercall taking a physical address as parameter
is the long term solution (and I would even volunteer to do it in a new patchset).
But this would not solve the issue here unless Linux is modified.
So I do see this patch as a “bug fix”.

> 
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -1642,6 +1642,30 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
>>          wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
>>  }
>>  +int arch_vcpu_setup_runstate(struct vcpu *v,
>> +                             struct vcpu_register_runstate_memory_area area)
>> +{
>> +    struct vcpu_runstate_info runstate;
>> +
>> +    runstate_guest(v) = area.addr.h;
>> +
>> +    if ( v == current )
>> +    {
>> +        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
>> +    }
> 
> Pointless braces (and I think there are more instances).

So:
if cond
   instruction
else
{
   xxx
}

is something that should be done in Xen?

Sorry if I make this kind of mistake in the future, as I am more used to a model
where omitting braces is an absolute no-go. I will try to remember this.

> 
>> +    else
>> +    {
>> +        vcpu_runstate_get(v, &runstate);
>> +        __copy_to_guest(runstate_guest(v), &runstate, 1);
>> +    }
>> +    return 0;
> 
> Missing blank line before main "return".

I will fix it.
Bertrand



* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-29  7:08   ` Bertrand Marquis
@ 2020-07-29 18:41     ` Jan Beulich
  2020-07-30  1:30       ` Stefano Stabellini
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2020-07-29 18:41 UTC (permalink / raw)
  To: Bertrand Marquis
  Cc: Stefano Stabellini, Julien Grall, Wei Liu, Andrew Cooper,
	Ian Jackson, George Dunlap, Xen-devel, nd, Volodymyr Babchuk,
	Roger Pau Monné

On 29.07.2020 09:08, Bertrand Marquis wrote:
> 
> 
>> On 28 Jul 2020, at 21:54, Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 28.07.2020 17:52, Bertrand Marquis wrote:
>>> At the moment on Arm, a Linux guest running with KPTI enabled will
>>> cause the following error when a context switch happens in user mode:
>>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>>> The error is caused by the virtual address for the runstate area
>>> registered by the guest only being accessible when the guest is running
>>> in kernel space when KPTI is enabled.
>>> To solve this issue, this patch is doing the translation from virtual
>>> address to physical address during the hypercall and mapping the
>>> required pages using vmap. This is removing the conversion from virtual
>>> to physical address during the context switch which is solving the
>>> problem with KPTI.
>>> This is done only on arm architecture, the behaviour on x86 is not
>>> modified by this patch and the address conversion is done as before
>>> during each context switch.
>>> This is introducing several limitations in comparison to the previous
>>> behaviour (on arm only):
>>> - if the guest is remapping the area at a different physical address Xen
>>> will continue to update the area at the previous physical address. As
>>> the area is in kernel space and usually defined as a global variable this
>>> is something which is believed not to happen. If this is required by a
>>> guest, it will have to call the hypercall with the new area (even if it
>>> is at the same virtual address).
>>> - the area needs to be mapped during the hypercall. For the same reasons
>>> as for the previous case, even if the area is registered for a different
>>> vcpu. It is believed that registering an area using a virtual address
>>> unmapped is not something done.
>>
>> Beside me thinking that an in-use and stable ABI can't be changed like
>> this, no matter what is "believed" kernel code may or may not do, I
>> also don't think having arch-es diverge in behavior here is a good
>> idea. Use of commonly available interfaces shouldn't lead to head
>> aches or surprises when porting code from one arch to another. I'm
>> pretty sure it was suggested before: Why don't you simply introduce
>> a physical address based hypercall (and then also on x86 at the same
>> time, keeping functional parity)? I even seem to recall giving a
>> suggestion how to fit this into a future "physical addresses only"
>> model, as long as we can settle on the basic principles of that
>> conversion path that we want to go sooner or later anyway (as I
>> understand).
> 
> I fully agree with the “physical address only” model and i think it must be
> done. Introducing a new hypercall taking a physical address as parameter
> is the long term solution (and I would even volunteer to do it in a new patchset).
> But this would not solve the issue here unless linux is modified.
> So I do see this patch as a “bug fix”.

Well, it is sort of implied by my previous reply that we won't get away
without an OS side change here. The prereq to get away without would be
that it is okay to change the behavior of a hypercall like you do, and
that it is okay to make the behavior diverge between arch-es. I think
I've made pretty clear that I don't think either is really an option.

>>> --- a/xen/arch/x86/domain.c
>>> +++ b/xen/arch/x86/domain.c
>>> @@ -1642,6 +1642,30 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
>>>           wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
>>>   }
>>>   +int arch_vcpu_setup_runstate(struct vcpu *v,
>>> +                             struct vcpu_register_runstate_memory_area area)
>>> +{
>>> +    struct vcpu_runstate_info runstate;
>>> +
>>> +    runstate_guest(v) = area.addr.h;
>>> +
>>> +    if ( v == current )
>>> +    {
>>> +        __copy_to_guest(runstate_guest(v), &v->runstate, 1);
>>> +    }
>>
>> Pointless braces (and I think there are more instances).
> 
> So:
> if cond
>     instruction
> else
> {
>     xxx
> }
> 
> is something that should be done in Xen?

Yes. Especially in coding styles placing opening braces on their own
lines, this is, afaik, quite common.

Jan



* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-29 18:41     ` Jan Beulich
@ 2020-07-30  1:30       ` Stefano Stabellini
  2020-07-31  6:39         ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Stefano Stabellini @ 2020-07-30  1:30 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Stefano Stabellini, Julien Grall, Wei Liu, Andrew Cooper,
	Ian Jackson, George Dunlap, Bertrand Marquis, Xen-devel, nd,
	Volodymyr Babchuk, Roger Pau Monné


On Wed, 29 Jul 2020, Jan Beulich wrote:
> On 29.07.2020 09:08, Bertrand Marquis wrote:
> > > On 28 Jul 2020, at 21:54, Jan Beulich <jbeulich@suse.com> wrote:
> > > 
> > > On 28.07.2020 17:52, Bertrand Marquis wrote:
> > > > At the moment on Arm, a Linux guest running with KPTI enabled will
> > > > cause the following error when a context switch happens in user mode:
> > > > (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
> > > > The error is caused by the virtual address for the runstate area
> > > > registered by the guest only being accessible when the guest is running
> > > > in kernel space when KPTI is enabled.
> > > > To solve this issue, this patch is doing the translation from virtual
> > > > address to physical address during the hypercall and mapping the
> > > > required pages using vmap. This is removing the conversion from virtual
> > > > to physical address during the context switch which is solving the
> > > > problem with KPTI.
> > > > This is done only on arm architecture, the behaviour on x86 is not
> > > > modified by this patch and the address conversion is done as before
> > > > during each context switch.
> > > > This is introducing several limitations in comparison to the previous
> > > > behaviour (on arm only):
> > > > - if the guest is remapping the area at a different physical address Xen
> > > > will continue to update the area at the previous physical address. As
> > > > the area is in kernel space and usually defined as a global variable
> > > > this
> > > > is something which is believed not to happen. If this is required by a
> > > > guest, it will have to call the hypercall with the new area (even if it
> > > > is at the same virtual address).
> > > > - the area needs to be mapped during the hypercall. For the same reasons
> > > > as for the previous case, even if the area is registered for a different
> > > > vcpu. It is believed that registering an area using a virtual address
> > > > unmapped is not something done.
> > > 
> > > Beside me thinking that an in-use and stable ABI can't be changed like
> > > this, no matter what is "believed" kernel code may or may not do, I
> > > also don't think having arch-es diverge in behavior here is a good
> > > idea. Use of commonly available interfaces shouldn't lead to head
> > > aches or surprises when porting code from one arch to another. I'm
> > > pretty sure it was suggested before: Why don't you simply introduce
> > > a physical address based hypercall (and then also on x86 at the same
> > > time, keeping functional parity)? I even seem to recall giving a
> > > suggestion how to fit this into a future "physical addresses only"
> > > model, as long as we can settle on the basic principles of that
> > > conversion path that we want to go sooner or later anyway (as I
> > > understand).
> > 
> > I fully agree with the “physical address only” model and i think it must be
> > done. Introducing a new hypercall taking a physical address as parameter
> > is the long term solution (and I would even volunteer to do it in a new
> > patchset).
> > But this would not solve the issue here unless linux is modified.
> > So I do see this patch as a “bug fix”.
> 
> Well, it is sort of implied by my previous reply that we won't get away
> without an OS side change here. The prereq to get away without would be
> that it is okay to change the behavior of a hypercall like you do, and
> that it is okay to make the behavior diverge between arch-es. I think
> I've made pretty clear that I don't think either is really an option.

Hi Jan,

This is a difficult problem to solve and the current situation honestly
sucks: there is no way to solve the problem without making compromises.

The new hypercall is good-to-have in any case (it is a better interface)
but it is not a full solution.  If we introduce a new hypercall we fix
new guests but don't fix existing guests. If we change Linux in any way,
we are still going to have problems with all already-released kernel
binaries. Leaving the issue unfixed is not an option either because the
problem can't be ignored.

There is no simple way out of this situation.


After careful considerations and many discussions (most of them on
xen-devel [1][2]) we came to the realization that changing the behavior
for this hypercall on ARM is the best we can do:

1) it solves the problem for existing guests
2) it has no ill effects as far as we know


Are we happy about it? We are not. It is the best we can do given the
constraints. The rule to never change hypercalls is a good one to have
and I fully support it, but in this unique circumstance it is best to
make an exception.

The idea is to minimize the impact to x86 so that's why Bertrand's patch
is introducing a divergence between Arm and x86 on this hypercall.


In summary, please consider this patch together with its context. This
has been the only time to my memory when we had to do this -- it is
certainly a unique situation.



[1] https://marc.info/?l=xen-devel&m=158946660216159
[2] https://marc.info/?l=xen-devel&m=159067967432381


* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-30  1:30       ` Stefano Stabellini
@ 2020-07-31  6:39         ` Jan Beulich
  2020-07-31 10:12           ` Julien Grall
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2020-07-31  6:39 UTC (permalink / raw)
  To: Stefano Stabellini
  Cc: Julien Grall, Wei Liu, Andrew Cooper, Ian Jackson, George Dunlap,
	Bertrand Marquis, Xen-devel, nd, Volodymyr Babchuk,
	Roger Pau Monné

On 30.07.2020 03:30, Stefano Stabellini wrote:
> On Wed, 29 Jul 2020, Jan Beulich wrote:
>> On 29.07.2020 09:08, Bertrand Marquis wrote:
>>>> On 28 Jul 2020, at 21:54, Jan Beulich <jbeulich@suse.com> wrote:
>>>> On 28.07.2020 17:52, Bertrand Marquis wrote:
>>>>> At the moment on Arm, a Linux guest running with KTPI enabled will
>>>>> cause the following error when a context switch happens in user mode:
>>>>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>>>>> The error is caused by the virtual address for the runstate area
>>>>> registered by the guest only being accessible when the guest is running
>>>>> in kernel space when KPTI is enabled.
>>>>> To solve this issue, this patch is doing the translation from virtual
>>>>> address to physical address during the hypercall and mapping the
>>>>> required pages using vmap. This is removing the conversion from virtual
>>>>> to physical address during the context switch which is solving the
>>>>> problem with KPTI.
>>>>> This is done only on arm architecture, the behaviour on x86 is not
>>>>> modified by this patch and the address conversion is done as before
>>>>> during each context switch.
>>>>> This is introducing several limitations in comparison to the previous
>>>>> behaviour (on arm only):
>>>>> - if the guest is remapping the area at a different physical address Xen
>>>>> will continue to update the area at the previous physical address. As
>>>>> the area is in kernel space and usually defined as a global variable
>>>>> this
>>>>> is something which is believed not to happen. If this is required by a
>>>>> guest, it will have to call the hypercall with the new area (even if it
>>>>> is at the same virtual address).
>>>>> - the area needs to be mapped during the hypercall. For the same reasons
>>>>> as for the previous case, even if the area is registered for a different
>>>>> vcpu. It is believed that registering an area using a virtual address
>>>>> unmapped is not something done.
>>>>
>>>> Beside me thinking that an in-use and stable ABI can't be changed like
>>>> this, no matter what is "believed" kernel code may or may not do, I
>>>> also don't think having arch-es diverge in behavior here is a good
>>>> idea. Use of commonly available interfaces shouldn't lead to head
>>>> aches or surprises when porting code from one arch to another. I'm
>>>> pretty sure it was suggested before: Why don't you simply introduce
>>>> a physical address based hypercall (and then also on x86 at the same
>>>> time, keeping functional parity)? I even seem to recall giving a
>>>> suggestion how to fit this into a future "physical addresses only"
>>>> model, as long as we can settle on the basic principles of that
>>>> conversion path that we want to go sooner or later anyway (as I
>>>> understand).
>>>
>>> I fully agree with the “physical address only” model and i think it must be
>>> done. Introducing a new hypercall taking a physical address as parameter
>>> is the long term solution (and I would even volunteer to do it in a new
>>> patchset).
>>> But this would not solve the issue here unless linux is modified.
>>> So I do see this patch as a “bug fix”.
>>
>> Well, it is sort of implied by my previous reply that we won't get away
>> without an OS side change here. The prereq to get away without would be
>> that it is okay to change the behavior of a hypercall like you do, and
>> that it is okay to make the behavior diverge between arch-es. I think
>> I've made pretty clear that I don't think either is really an option.
> 
> This is a difficult problem to solve and the current situation honestly
> sucks: there is no way to solve the problem without making compromises.
> 
> The new hypercall is good-to-have in any case (it is a better interface)
> but it is not a full solution.  If we introduce a new hypercall we fix
> new guests but don't fix existing guests. If we change Linux in any way,
> we are still going to have problems with all already-released kernel
> binaries. Leaving the issue unfixed is not an option either because the
> problem can't be ignored.

We're fixing other issues without breaking the ABI. Where's the
problem of backporting the kernel side change (which I anticipate
to not be overly involved)?

If the plan remains to make an ABI-breaking change, then I
think this will need an explicit vote.

Jan



* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-31  6:39         ` Jan Beulich
@ 2020-07-31 10:12           ` Julien Grall
  2020-07-31 10:18             ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Julien Grall @ 2020-07-31 10:12 UTC (permalink / raw)
  To: Jan Beulich, Stefano Stabellini
  Cc: Wei Liu, Andrew Cooper, Ian Jackson, George Dunlap,
	Bertrand Marquis, Xen-devel, nd, Volodymyr Babchuk,
	Roger Pau Monné

Hi Jan,

On 31/07/2020 07:39, Jan Beulich wrote:
> We're fixing other issues without breaking the ABI. Where's the
> problem of backporting the kernel side change (which I anticipate
> to not be overly involved)?
This means you can't take advantage of the runstate area on existing Linux
without any modification.

> If the plan remains to be to make an ABI breaking change,

From a theoretical PoV, this is an ABI breakage. However, I fail to see
how the added restrictions would affect OSes, at least on Arm.

In particular, you can't change a VA -> PA mapping on Arm without going
through an invalid mapping. So I wouldn't expect this to happen for the
runstate area.

The only part that *may* be an issue is if the guest registers the
runstate area with an initially invalid VA, although I have yet to see that
in practice. Maybe you know of such a case?

>  then I
> think this will need an explicit vote.

I was under the impression that the two Arm maintainers (Stefano and I) 
already agreed with the approach here. Therefore, given the ABI breakage 
is only affecting Arm, why would we need a vote?

Cheers,

-- 
Julien Grall



* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-31 10:12           ` Julien Grall
@ 2020-07-31 10:18             ` Jan Beulich
  2020-07-31 14:36               ` Bertrand Marquis
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2020-07-31 10:18 UTC (permalink / raw)
  To: Julien Grall
  Cc: Stefano Stabellini, Wei Liu, Andrew Cooper, Ian Jackson,
	George Dunlap, Bertrand Marquis, Xen-devel, nd,
	Volodymyr Babchuk, Roger Pau Monné

On 31.07.2020 12:12, Julien Grall wrote:
> On 31/07/2020 07:39, Jan Beulich wrote:
>> We're fixing other issues without breaking the ABI. Where's the
>> problem of backporting the kernel side change (which I anticipate
>> to not be overly involved)?
> This means you can't take advantage of the runstage on existing Linux 
> without any modification.
> 
>> If the plan remains to be to make an ABI breaking change,
> 
> For a theoritical PoV, this is a ABI breakage. However, I fail to see 
> how the restrictions added would affect OSes at least on Arm.

"OSes" covering what? Just Linux?

> In particular, you can't change the VA -> PA on Arm without going 
> through an invalid mapping. So I wouldn't expect this to happen for the 
> runstate.
> 
> The only part that *may* be an issue is if the guest is registering the 
> runstate with an initially invalid VA. Although, I have yet to see that 
> in practice. Maybe you know?

I'm unaware of any such use, but this means close to nothing.

>>  then I
>> think this will need an explicit vote.
> 
> I was under the impression that the two Arm maintainers (Stefano and I) 
> already agreed with the approach here. Therefore, given the ABI breakage 
> is only affecting Arm, why would we need a vote?

The problem here is of conceptual nature: You're planning to
make the behavior of a common hypercall diverge between
architectures, and in a retroactive fashion. Imo that's nothing
we should do even for new hypercalls, if _at all_ avoidable. If
we allow this here, we'll have a precedent that people later
may (and based on my experience will, sooner or later) reference,
to get their own change justified.

Jan



* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-31 10:18             ` Jan Beulich
@ 2020-07-31 14:36               ` Bertrand Marquis
  2020-07-31 23:03                 ` Stefano Stabellini
  0 siblings, 1 reply; 14+ messages in thread
From: Bertrand Marquis @ 2020-07-31 14:36 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Stefano Stabellini, Julien Grall, Wei Liu, Andrew Cooper,
	Ian Jackson, George Dunlap, Xen-devel, nd, Volodymyr Babchuk,
	Roger Pau Monné



> On 31 Jul 2020, at 12:18, Jan Beulich <jbeulich@suse.com> wrote:
> 
> On 31.07.2020 12:12, Julien Grall wrote:
>> On 31/07/2020 07:39, Jan Beulich wrote:
>>> We're fixing other issues without breaking the ABI. Where's the
>>> problem of backporting the kernel side change (which I anticipate
>>> to not be overly involved)?
>> This means you can't take advantage of the runstage on existing Linux 
>> without any modification.
>> 
>>> If the plan remains to be to make an ABI breaking change,
>> 
>> For a theoritical PoV, this is a ABI breakage. However, I fail to see 
>> how the restrictions added would affect OSes at least on Arm.
> 
> "OSes" covering what? Just Linux?
> 
>> In particular, you can't change the VA -> PA on Arm without going 
>> through an invalid mapping. So I wouldn't expect this to happen for the 
>> runstate.
>> 
>> The only part that *may* be an issue is if the guest is registering the 
>> runstate with an initially invalid VA. Although, I have yet to see that 
>> in practice. Maybe you know?
> 
> I'm unaware of any such use, but this means close to nothing.
> 
>>> then I
>>> think this will need an explicit vote.
>> 
>> I was under the impression that the two Arm maintainers (Stefano and I) 
>> already agreed with the approach here. Therefore, given the ABI breakage 
>> is only affecting Arm, why would we need a vote?
> 
> The problem here is of conceptual nature: You're planning to
> make the behavior of a common hypercall diverge between
> architectures, and in a retroactive fashion. Imo that's nothing
> we should do even for new hypercalls, if _at all_ avoidable. If
> we allow this here, we'll have a precedent that people later
> may (and based on my experience will, sooner or later) reference,
> to get their own change justified.

After a discussion with Jan, he is proposing to have a guest config setting to
turn on or off the translation of the address during the hypercall, and to add
a global Xen command line parameter to set the default behaviour.
Once this is done on Arm, the same could be done on x86: the current behaviour
would be kept by default but could be modified through the configuration.
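
To make the idea a bit more concrete, here is a minimal sketch of what this
could look like on the Xen side (all names below -- the command line option,
the per-domain field and the two helpers -- are hypothetical, only meant to
illustrate the mechanism, not an actual implementation):

#include <xen/param.h>
#include <xen/sched.h>

/* Global default, settable on the Xen command line, e.g.
 * "runstate-translate" (hypothetical option name). */
static bool __read_mostly opt_runstate_translate;
boolean_param("runstate-translate", opt_runstate_translate);

/* Per-domain override (hypothetical field), initialised at domain
 * creation from the guest config if present, otherwise from the
 * global default above. */
static bool runstate_translate(const struct domain *d)
{
    return d->runstate_translate;
}

long register_runstate_area(struct vcpu *v, unsigned long va)
{
    if ( runstate_translate(v->domain) )
        /* New behaviour: translate VA -> PA now and map it with vmap(),
         * as done in this patch. */
        return runstate_register_phys(v, va);  /* hypothetical helper */

    /* Old behaviour: record the VA and translate at context switch. */
    return runstate_register_virt(v, va);      /* hypothetical helper */
}

On the toolstack side this would just be one boolean in the guest
configuration, defaulting to the value of the command line option.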

@Jan: please correct me if I said something wrong.
@others: what is your view on this solution?

Bertrand




* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-31 14:36               ` Bertrand Marquis
@ 2020-07-31 23:03                 ` Stefano Stabellini
  2020-08-14  9:25                   ` Bertrand Marquis
  0 siblings, 1 reply; 14+ messages in thread
From: Stefano Stabellini @ 2020-07-31 23:03 UTC (permalink / raw)
  To: Bertrand Marquis
  Cc: Stefano Stabellini, Julien Grall, Wei Liu, Andrew Cooper,
	Ian Jackson, George Dunlap, Jan Beulich, Xen-devel, nd,
	Volodymyr Babchuk, Roger Pau Monné

On Fri, 31 Jul 2020, Bertrand Marquis wrote:
> > On 31 Jul 2020, at 12:18, Jan Beulich <jbeulich@suse.com> wrote:
> > 
> > On 31.07.2020 12:12, Julien Grall wrote:
> >> On 31/07/2020 07:39, Jan Beulich wrote:
> >>> We're fixing other issues without breaking the ABI. Where's the
> >>> problem of backporting the kernel side change (which I anticipate
> >>> to not be overly involved)?
> >> This means you can't take advantage of the runstage on existing Linux 
> >> without any modification.
> >> 
> >>> If the plan remains to be to make an ABI breaking change,
> >> 
> >> For a theoritical PoV, this is a ABI breakage. However, I fail to see 
> >> how the restrictions added would affect OSes at least on Arm.
> > 
> > "OSes" covering what? Just Linux?
> > 
> >> In particular, you can't change the VA -> PA on Arm without going 
> >> through an invalid mapping. So I wouldn't expect this to happen for the 
> >> runstate.
> >> 
> >> The only part that *may* be an issue is if the guest is registering the 
> >> runstate with an initially invalid VA. Although, I have yet to see that 
> >> in practice. Maybe you know?
> > 
> > I'm unaware of any such use, but this means close to nothing.
> > 
> >>> then I
> >>> think this will need an explicit vote.
> >> 
> >> I was under the impression that the two Arm maintainers (Stefano and I) 
> >> already agreed with the approach here. Therefore, given the ABI breakage 
> >> is only affecting Arm, why would we need a vote?
> > 
> > The problem here is of conceptual nature: You're planning to
> > make the behavior of a common hypercall diverge between
> > architectures, and in a retroactive fashion. Imo that's nothing
> > we should do even for new hypercalls, if _at all_ avoidable. If
> > we allow this here, we'll have a precedent that people later
> > may (and based on my experience will, sooner or later) reference,
> > to get their own change justified.

Please let's avoid "slippery slope" arguments
(https://en.wikipedia.org/wiki/Slippery_slope)

We shouldn't consider this instance as the first in a long series of bad
decisions on hypercall compatibility. Each new case, if there will be
any, will have to be considered based on its own merits. Also, let's
keep in mind that there have been no other cases in the last 8 years. (I
would like to repeat my support for hypercall ABI compatibility.)


I would also kindly ask not to put the discussion on a "conceptual"
level: there is no way to fix all guests and also keep compatibility.
From a conceptual point of view, it is already game over :-)


> After a discussion with Jan, he is proposing to have a guest config setting to
> turn on or off the translation of the address during the hypercall and add a
> global Xen command line parameter to set the global default behaviour. 
> With this was done on arm could be done on x86 and the current behaviour
> would be kept by default but possible to modify by configuration.
> 
> @Jan: please correct me if i said something wrong
> @others: what is your view on this solution ?

Having options to turn on or off the new behavior could be good-to-have
if we find a guest that actually requires the old behavior. Today we
don't know of any such cases. We have strong reasons to believe that
there aren't any on ARM (see Julien's explanation regarding the
temporary invalid mappings). In fact, it is one of the factors that led
us to think this patch is the right approach.

That said, I am also OK with adding such a parameter now, but we need to
choose the default value carefully.


We need the new behavior as default on ARM because we need the fix to
work for all guests. I don't think we want to explain how you always
need to set config_foobar otherwise things don't work. It has to work
out of the box.

It would be nice if we had the same default on x86 too, although I
understand if Jan and Andrew don't want to make the same change on x86,
at least initially.




* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-07-31 23:03                 ` Stefano Stabellini
@ 2020-08-14  9:25                   ` Bertrand Marquis
  2020-08-20 10:23                     ` Julien Grall
  0 siblings, 1 reply; 14+ messages in thread
From: Bertrand Marquis @ 2020-08-14  9:25 UTC (permalink / raw)
  To: Stefano Stabellini
  Cc: Jan Beulich, Julien Grall, Xen-devel, nd, Volodymyr Babchuk,
	Andrew Cooper, George Dunlap, Ian Jackson, Wei Liu,
	Roger Pau Monné



> On 1 Aug 2020, at 00:03, Stefano Stabellini <sstabellini@kernel.org> wrote:
> 
> On Fri, 31 Jul 2020, Bertrand Marquis wrote:
>>> On 31 Jul 2020, at 12:18, Jan Beulich <jbeulich@suse.com> wrote:
>>> 
>>> On 31.07.2020 12:12, Julien Grall wrote:
>>>> On 31/07/2020 07:39, Jan Beulich wrote:
>>>>> We're fixing other issues without breaking the ABI. Where's the
>>>>> problem of backporting the kernel side change (which I anticipate
>>>>> to not be overly involved)?
>>>> This means you can't take advantage of the runstage on existing Linux 
>>>> without any modification.
>>>> 
>>>>> If the plan remains to be to make an ABI breaking change,
>>>> 
>>>> For a theoritical PoV, this is a ABI breakage. However, I fail to see 
>>>> how the restrictions added would affect OSes at least on Arm.
>>> 
>>> "OSes" covering what? Just Linux?
>>> 
>>>> In particular, you can't change the VA -> PA on Arm without going 
>>>> through an invalid mapping. So I wouldn't expect this to happen for the 
>>>> runstate.
>>>> 
>>>> The only part that *may* be an issue is if the guest is registering the 
>>>> runstate with an initially invalid VA. Although, I have yet to see that 
>>>> in practice. Maybe you know?
>>> 
>>> I'm unaware of any such use, but this means close to nothing.
>>> 
>>>>> then I
>>>>> think this will need an explicit vote.
>>>> 
>>>> I was under the impression that the two Arm maintainers (Stefano and I) 
>>>> already agreed with the approach here. Therefore, given the ABI breakage 
>>>> is only affecting Arm, why would we need a vote?
>>> 
>>> The problem here is of conceptual nature: You're planning to
>>> make the behavior of a common hypercall diverge between
>>> architectures, and in a retroactive fashion. Imo that's nothing
>>> we should do even for new hypercalls, if _at all_ avoidable. If
>>> we allow this here, we'll have a precedent that people later
>>> may (and based on my experience will, sooner or later) reference,
>>> to get their own change justified.
> 
> Please let's avoid "slippery slope" arguments
> (https://en.wikipedia.org/wiki/Slippery_slope)
> 
> We shouldn't consider this instance as the first in a long series of bad
> decisions on hypercall compatibility. Each new case, if there will be
> any, will have to be considered based on its own merits. Also, let's
> keep in mind that there have been no other cases in the last 8 years. (I
> would like to repeat my support for hypercall ABI compatibility.)
> 
> 
> I would also kindly ask not to put the discussion on a "conceptual"
> level: there is no way to fix all guests and also keep compatibility.
> From a conceptual point of view, it is already game over :-)
> 
> 
>> After a discussion with Jan, he is proposing to have a guest config setting to
>> turn on or off the translation of the address during the hypercall and add a
>> global Xen command line parameter to set the global default behaviour. 
>> With this was done on arm could be done on x86 and the current behaviour
>> would be kept by default but possible to modify by configuration.
>> 
>> @Jan: please correct me if i said something wrong
>> @others: what is your view on this solution ?
> 
> Having options to turn on or off the new behavior could be good-to-have
> if we find a guest that actually requires the old behavior. Today we
> don't know of any such cases. We have strong reasons to believe that
> there aren't any on ARM (see Julien's explanation in regards to the
> temporary invalid mappings.) In fact, it is one of the factors that led
> us to think this patch is the right approach.
> 
> That said, I am also OK with adding such a parameter now, but we need to
> choose the default value carefully.

This would also mean keeping support in the code for both the old and the new
behaviour, which might make the code bigger and more complex.

> 
> 
> We need the new behavior as default on ARM because we need the fix to
> work for all guests. I don't think we want to explain how you always
> need to set config_foobar otherwise things don't work. It has to work
> out of the box.
> 
> It would be nice if we had the same default on x86 too, although I
> understand if Jan and Andrew don't want to make the same change on x86,
> at least initially.

So you mean adding a parameter here, but only on Arm?
Should it be a command line parameter? A configuration parameter? Both?

It seems that with this patch I touched some kind of sensitive area.
Should I just abandon it and later work on adding a new hypercall using
a physical address as a parameter?

Cheers
Bertrand




* Re: [PATCH v2] xen/arm: Convert runstate address during hypcall
  2020-08-14  9:25                   ` Bertrand Marquis
@ 2020-08-20 10:23                     ` Julien Grall
  0 siblings, 0 replies; 14+ messages in thread
From: Julien Grall @ 2020-08-20 10:23 UTC (permalink / raw)
  To: Bertrand Marquis, Stefano Stabellini
  Cc: Jan Beulich, Xen-devel, nd, Volodymyr Babchuk, Andrew Cooper,
	George Dunlap, Ian Jackson, Wei Liu, Roger Pau Monné

Hi,

Sorry for the late answer.

On 14/08/2020 10:25, Bertrand Marquis wrote:
> 
> 
>> On 1 Aug 2020, at 00:03, Stefano Stabellini <sstabellini@kernel.org> wrote:
>>
>> On Fri, 31 Jul 2020, Bertrand Marquis wrote:
>>>> On 31 Jul 2020, at 12:18, Jan Beulich <jbeulich@suse.com> wrote:
>>>>
>>>> On 31.07.2020 12:12, Julien Grall wrote:
>>>>> On 31/07/2020 07:39, Jan Beulich wrote:
>>>>>> We're fixing other issues without breaking the ABI. Where's the
>>>>>> problem of backporting the kernel side change (which I anticipate
>>>>>> to not be overly involved)?
>>>>> This means you can't take advantage of the runstage on existing Linux
>>>>> without any modification.
>>>>>
>>>>>> If the plan remains to be to make an ABI breaking change,
>>>>>
>>>>> For a theoritical PoV, this is a ABI breakage. However, I fail to see
>>>>> how the restrictions added would affect OSes at least on Arm.
>>>>
>>>> "OSes" covering what? Just Linux?
>>>>
>>>>> In particular, you can't change the VA -> PA on Arm without going
>>>>> through an invalid mapping. So I wouldn't expect this to happen for the
>>>>> runstate.
>>>>>
>>>>> The only part that *may* be an issue is if the guest is registering the
>>>>> runstate with an initially invalid VA. Although, I have yet to see that
>>>>> in practice. Maybe you know?
>>>>
>>>> I'm unaware of any such use, but this means close to nothing.
>>>>
>>>>>> then I
>>>>>> think this will need an explicit vote.
>>>>>
>>>>> I was under the impression that the two Arm maintainers (Stefano and I)
>>>>> already agreed with the approach here. Therefore, given the ABI breakage
>>>>> is only affecting Arm, why would we need a vote?
>>>>
>>>> The problem here is of conceptual nature: You're planning to
>>>> make the behavior of a common hypercall diverge between
>>>> architectures, and in a retroactive fashion. Imo that's nothing
>>>> we should do even for new hypercalls, if _at all_ avoidable. If
>>>> we allow this here, we'll have a precedent that people later
>>>> may (and based on my experience will, sooner or later) reference,
>>>> to get their own change justified.
>>
>> Please let's avoid "slippery slope" arguments
>> (https://en.wikipedia.org/wiki/Slippery_slope)
>>
>> We shouldn't consider this instance as the first in a long series of bad
>> decisions on hypercall compatibility. Each new case, if there will be
>> any, will have to be considered based on its own merits. Also, let's
>> keep in mind that there have been no other cases in the last 8 years. (I
>> would like to repeat my support for hypercall ABI compatibility.)
>>
>>
>> I would also kindly ask not to put the discussion on a "conceptual"
>> level: there is no way to fix all guests and also keep compatibility.
>>  From a conceptual point of view, it is already game over :-)
>>
>>
>>> After a discussion with Jan, he is proposing to have a guest config setting to
>>> turn on or off the translation of the address during the hypercall and add a
>>> global Xen command line parameter to set the global default behaviour.
>>> With this was done on arm could be done on x86 and the current behaviour
>>> would be kept by default but possible to modify by configuration.
>>>
>>> @Jan: please correct me if i said something wrong
>>> @others: what is your view on this solution ?
>>
>> Having options to turn on or off the new behavior could be good-to-have
>> if we find a guest that actually requires the old behavior. Today we
>> don't know of any such cases. We have strong reasons to believe that
>> there aren't any on ARM (see Julien's explanation in regards to the
>> temporary invalid mappings.) In fact, it is one of the factors that led
>> us to think this patch is the right approach.
>>
>> That said, I am also OK with adding such a parameter now, but we need to
>> choose the default value carefully.

I agree with that :).

> 
> This would also mean keeping support in the code for old and new behaviour
> which might make the code bigger and more complex.

I am concerned about that as well. However, this concern also applies if we
introduce a hypercall using a physical address as a parameter. Indeed, the
old hypercall will not go away.

If we introduce a second hypercall, you will also have to think about
the interactions between the two. For instance:
     - The firmware may register the runstate area using the old hypercall,
while the OS may register it using the new hypercall.
     - Can an OS use a mix of the two hypercalls?

For more details, you can have a look at the original attempt for a new 
hypercall (see [1]).

The approach you discussed with Jan has the advantage of not requiring any
change in the guest software stack, so it would be my preference over
a new hypercall.
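
For reference, here is a rough guest-side sketch of what the new-hypercall
route could look like (the sub-op, its number, the structure and the helper
below are hypothetical, loosely inspired by [1] and not an agreed interface);
it shows the kind of change every guest OS would need:

#include <linux/percpu.h>
#include <asm/xen/hypercall.h>

/* Hypothetical ABI -- for illustration only, value purely illustrative. */
#define VCPUOP_register_runstate_phys_area  99

struct vcpu_register_runstate_phys_area {
    uint64_t addr;  /* guest physical address of the runstate area */
};

/* Linux-style registration of the per-CPU runstate area by physical
 * address instead of virtual address. */
static void xen_setup_runstate_info_phys(unsigned int cpu)
{
    struct vcpu_register_runstate_phys_area area = {
        .addr = per_cpu_ptr_to_phys(&per_cpu(xen_runstate, cpu)),
    };

    if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_phys_area,
                           cpu, &area))
        BUG();
}

Firmware (or an older kernel) would keep using the existing VA based
sub-op, so Xen would have to define what happens when both are used for
the same vCPU.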

>>
>>
>> We need the new behavior as default on ARM because we need the fix to
>> work for all guests. I don't think we want to explain how you always
>> need to set config_foobar otherwise things don't work. It has to work
>> out of the box.
>>
>> It would be nice if we had the same default on x86 too, although I
>> understand if Jan and Andrew don't want to make the same change on x86,
>> at least initially.
> 
> So you mean here adding a parameter but only on Arm ?
> Should it be a command line parameter ? a configuration parameter ? both ?
> 
> It seems that with this patch i touched some kind of sensible area.
> Should i just abandon it and see later to work on adding a new hypercall using
> a physical address as parameter ?

I would suggest to mention the thread in the next community call.

Cheers,

[1] <1558721577-13958-3-git-send-email-andrii.anisov@gmail.com>


-- 
Julien Grall



end of thread, other threads:[~2020-08-20 10:24 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-28 15:52 [PATCH v2] xen/arm: Convert runstate address during hypcall Bertrand Marquis
2020-07-28 19:04 ` Stefano Stabellini
2020-07-29  6:47   ` Bertrand Marquis
2020-07-28 19:54 ` Jan Beulich
2020-07-29  7:08   ` Bertrand Marquis
2020-07-29 18:41     ` Jan Beulich
2020-07-30  1:30       ` Stefano Stabellini
2020-07-31  6:39         ` Jan Beulich
2020-07-31 10:12           ` Julien Grall
2020-07-31 10:18             ` Jan Beulich
2020-07-31 14:36               ` Bertrand Marquis
2020-07-31 23:03                 ` Stefano Stabellini
2020-08-14  9:25                   ` Bertrand Marquis
2020-08-20 10:23                     ` Julien Grall
