Subject: [RFC] utilizing EPT_MISCONFIG VM exit
From: Jan Beulich @ 2014-02-24 13:05 UTC
  To: Eddie Dong, Jun Nakajima, Yang Z Zhang, Tim Deegan; +Cc: xen-devel

The attached draft patch (still containing debugging code, and
assuming other adjustments have happened beforehand) demonstrates
that we should evaluate whether - not only for dealing with the
memory type adjustments here, but for basically everything currently
done through ->change_entry_type_global() - utilizing the
EPT_MISCONFIG VM exit would be an architecturally clean approach.
The fundamental idea is to defer updates to the page tables until
the respective entries actually get used, instead of iterating
through all page tables at the time the change is requested, thus
- avoiding (here) or eliminating (elsewhere) long-lasting operations
  without having to introduce expensive/fragile preemption handling
- leaving unaffected the sharing of the page tables with the IOMMU
  (since the EPT memory type field is available to the programmer on
  the IOMMU side; we obviously can't use the read/write bits without
  affecting the IOMMU)
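
As a (simplified) two-phase sketch of that idea - pseudo-C reusing
the tree's names, not the patch code itself; compare
ept_invalidate_emt() and ept_handle_misconfig() in the attachment:

    /* Phase 1, on request: stamp only the top-level table's present
     * entries with an invalid memory type; O(one table) instead of a
     * full tree walk. */
    static void invalidate_on_request(ept_entry_t *table)
    {
        unsigned int i;

        for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
        {
            ept_entry_t e = atomic_read_ept_entry(&table[i]);

            if ( !is_epte_present(&e) )
                continue;
            e.emt = MTRR_NUM_TYPES; /* no valid type: forces a fault */
            atomic_write_ept_entry(&table[i], e);
        }
    }

    /* Phase 2, on access: the invalid type triggers an EPT_MISCONFIG
     * VM exit; the handler recomputes the correct type for just the
     * faulting path, pushing the invalid marker one level down at
     * each non-leaf level, and then resumes the guest. */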

The main question obviously is whether it is architecturally safe to
use any particular, presently invalid memory type (right now the
patches use type 7, i.e. the value defined for UC- in the PAT MSR
only), or whether such an invalid type could be determined at run
time.
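
For reference - going by my reading of the SDM, which is of course
exactly the guarantee in question - the EPT memory type field (bits
5:3 of a leaf entry) has only five defined encodings, leaving 2, 3,
and 7 reserved and hence misconfiguration-triggering today:

    /* Defined EPT memory types; any other encoding in a leaf entry
     * currently causes an EPT misconfiguration VM exit. */
    #define EPT_EMT_UC 0
    #define EPT_EMT_WC 1
    /* 2, 3 - reserved */
    #define EPT_EMT_WT 4
    #define EPT_EMT_WP 5
    #define EPT_EMT_WB 6
    /* 7 - reserved; what the patch uses as its marker
     * (MTRR_NUM_TYPES == 7, matching the PAT-only UC- encoding) */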

Obviously, if we can go this route on EPT, the goal ought to be to
eliminate ->change_entry_type_global() altogether (i.e. also from
the generic P2M code) by using on-access adjustments instead of
on-request ones. Quite likely that would involve adding an address
range to ->memory_type_changed().
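
Something along these lines, perhaps (hypothetical signature only,
not part of the attached patch):

    /* Hypothetical: let callers report the affected GFN range, so
     * that a type change over a small region need not force
     * re-evaluation of the whole guest physical address space. */
    void p2m_memory_type_changed_range(struct domain *d,
                                       unsigned long start_gfn,
                                       unsigned long end_gfn);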

Jan


[-- Attachment #2: EPT-sync-mem-types.patch --]
[-- Type: text/plain, Size: 10528 bytes --]

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -252,6 +252,9 @@ int hvm_set_guest_pat(struct vcpu *v, u6
 
     if ( !hvm_funcs.set_guest_pat(v, guest_pat) )
         v->arch.hvm_vcpu.pat_cr = guest_pat;
+
+    memory_type_changed(v->domain);
+
     return 1;
 }
 
--- a/xen/arch/x86/hvm/mtrr.c
+++ b/xen/arch/x86/hvm/mtrr.c
@@ -444,8 +444,12 @@ bool_t mtrr_def_type_msr_set(struct doma
          return 0;
     }
 
-    m->enabled = enabled;
-    m->def_type = def_type;
+    if ( m->enabled != enabled || m->def_type != def_type )
+    {
+        m->enabled = enabled;
+        m->def_type = def_type;
+        memory_type_changed(d);
+    }
 
     return 1;
 }
@@ -465,6 +469,7 @@ bool_t mtrr_fix_range_msr_set(struct dom
                 return 0;
 
         fixed_range_base[row] = msr_content;
+        memory_type_changed(d);
     }
 
     return 1;
@@ -504,6 +509,8 @@ bool_t mtrr_var_range_msr_set(
 
     m->overlapped = is_var_mtrr_overlapped(m);
 
+    memory_type_changed(d);
+
     return 1;
 }
 
@@ -696,6 +703,12 @@ static int hvm_load_mtrr_msr(struct doma
 HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr,
                           1, HVMSR_PER_VCPU);
 
+void memory_type_changed(struct domain *d)
+{
+    if ( iommu_enabled && !iommu_snoop && d->vcpu && d->vcpu[0] )
+        p2m_memory_type_changed(d);
+}
+
 uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn,
                            uint8_t *ipat, bool_t direct_mmio)
 {
@@ -735,8 +748,7 @@ uint8_t epte_get_entry_emt(struct domain
         return MTRR_TYPE_WRBACK;
     }
 
-    gmtrr_mtype = is_hvm_domain(d) && v &&
-                  d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] ?
+    gmtrr_mtype = is_hvm_domain(d) && v ?
                   get_mtrr_type(&v->arch.hvm_vcpu.mtrr, (gfn << PAGE_SHIFT)) :
                   MTRR_TYPE_WRBACK;
 
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3006,6 +3006,16 @@ void vmx_vmexit_handler(struct cpu_user_
         break;
     }
 
+    case EXIT_REASON_EPT_MISCONFIG:
+    {
+        paddr_t gpa;
+
+        __vmread(GUEST_PHYSICAL_ADDRESS, &gpa);
+        if ( !ept_handle_misconfig(gpa) )
+            goto exit_and_crash;
+        break;
+    }
+
     case EXIT_REASON_MONITOR_TRAP_FLAG:
         v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
         vmx_update_cpu_exec_control(v);
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -687,6 +687,168 @@ static void ept_change_entry_type_global
     ept_sync_domain(p2m);
 }
 
+static bool_t ept_invalidate_emt(mfn_t mfn)
+{
+    ept_entry_t *epte = map_domain_page(mfn_x(mfn));
+    unsigned int i;
+    bool_t changed = 0;
+
+    for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+    {
+        ept_entry_t e = atomic_read_ept_entry(&epte[i]);
+
+        if ( !is_epte_valid(&e) || !is_epte_present(&e) ||
+             e.emt == MTRR_NUM_TYPES )
+            continue;
+
+        e.emt = MTRR_NUM_TYPES;
+        atomic_write_ept_entry(&epte[i], e);
+        changed = 1;
+    }
+
+    unmap_domain_page(epte);
+
+    return changed;
+}
+
+static unsigned long invcnt, invthr, hmccnt, hmcthr, hmctot;//temp
+static void ept_memory_type_changed(struct p2m_domain *p2m)
+{
+    unsigned long mfn = ept_get_asr(&p2m->ept);
+
+    if ( !mfn )
+        return;
+if(++invcnt > invthr) {//temp
+ invthr |= invcnt;
+ printk("d%d: invalidate\n", p2m->domain->domain_id);
+ hmctot += hmccnt;
+ hmccnt = hmcthr = 0;
+}
+
+    if ( ept_invalidate_emt(_mfn(mfn)) )
+        ept_sync_domain(p2m);
+}
+
+bool_t ept_handle_misconfig(uint64_t gpa)
+{
+    struct vcpu *curr = current;
+    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+    struct ept_data *ept = &p2m->ept;
+    unsigned int level = ept_get_wl(ept);
+    unsigned long gfn = PFN_DOWN(gpa);
+    unsigned long mfn = ept_get_asr(ept);
+    ept_entry_t *epte;
+    bool_t okay;
+bool_t print = 0;//temp
+static struct {//temp
+ u64 gpa;
+ u16 vcpu;
+ s8 mt[4];
+} log[32];
+static unsigned logidx;//temp
+unsigned l;//temp
+
+    if ( !mfn )
+        return 0;
+
+    p2m_lock(p2m);
+if(++hmccnt > hmcthr) {//temp
+ hmcthr |= hmccnt;
+ print = 1;
+ printk("%pv: miscfg@%09lx [%08lx/%08lx]\n", curr, gfn, hmctot + hmccnt, invcnt);
+}
+l = logidx++ % ARRAY_SIZE(log);//temp
+log[l].gpa = gpa;//temp
+log[l].vcpu = curr->vcpu_id;//temp
+memset(log[l].mt, -1, ARRAY_SIZE(log[l].mt));//temp
+
+    for ( okay = curr->arch.hvm_vmx.ept_spurious_misconfig; ; --level) {
+        ept_entry_t e;
+        unsigned int i;
+
+        epte = map_domain_page(mfn);
+        i = (gfn >> (level * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1);
+        e = atomic_read_ept_entry(&epte[i]);
+log[l].mt[3 - level] = e.emt;//temp
+if(print) printk("L%u[%03x]: %u\n", level, i, e.emt);//temp
+
+        if ( level == 0 || is_epte_superpage(&e) )
+        {
+            uint8_t ipat = 0;
+            struct vcpu *v;
+
+            if ( e.emt != MTRR_NUM_TYPES )
+                break;
+
+            if ( level == 0 )
+            {
+                for ( gfn -= i, i = 0; i < EPT_PAGETABLE_ENTRIES; ++i )
+                {
+                    e = atomic_read_ept_entry(&epte[i]);
+                    if ( e.emt == MTRR_NUM_TYPES )
+                        e.emt = 0;
+                    if ( !is_epte_valid(&e) || !is_epte_present(&e) )
+                        continue;
+                    e.emt = epte_get_entry_emt(p2m->domain, gfn + i,
+                                               _mfn(e.mfn), &ipat,
+                                               e.sa_p2mt == p2m_mmio_direct);
+                    e.ipat = ipat;
+                    atomic_write_ept_entry(&epte[i], e);
+++level;//temp
+                }
+if(print) printk("L0[] -> %u entries adjusted\n", level);//temp
+            }
+            else
+            {
+                e.emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn),
+                                           &ipat,
+                                           e.sa_p2mt == p2m_mmio_direct);
+                e.ipat = ipat;
+                atomic_write_ept_entry(&epte[i], e);
+if(print) printk("L%u[%03x] -> %u:%d (%lx)\n", level, i, e.emt, ipat, (long)e.mfn);//temp
+            }
+
+            for_each_vcpu ( curr->domain, v )
+                v->arch.hvm_vmx.ept_spurious_misconfig = 1;
+            okay = 1;
+            break;
+        }
+
+        if ( e.emt == MTRR_NUM_TYPES )
+        {
+            ASSERT(is_epte_present(&e));
+            e.emt = 0;
+            atomic_write_ept_entry(&epte[i], e);
+            unmap_domain_page(epte);
+
+            ept_invalidate_emt(_mfn(e.mfn));
+            okay = 1;
+        }
+        else if ( is_epte_present(&e) && !e.emt )
+            unmap_domain_page(epte);
+        else
+            break;
+
+        mfn = e.mfn;
+    }
+
+    unmap_domain_page(epte);
+    curr->arch.hvm_vmx.ept_spurious_misconfig = 0;
+if(!okay) {//temp
+ printk("%pv: miscfg@%lx fail@%u\n", curr, gfn, level);
+ for(;; l = (l ?: ARRAY_SIZE(log)) - 1) {
+  printk("v%d %012"PRIx64" %d:%d:%d:%d\n",
+         log[l].vcpu, log[l].gpa, log[l].mt[0], log[l].mt[1], log[l].mt[2], log[l].mt[3]);
+  if(l == (logidx % ARRAY_SIZE(log)))
+   break;
+ }
+}
+    ept_sync_domain(p2m);
+    p2m_unlock(p2m);
+
+    return okay;
+}
+
 static void __ept_sync_domain(void *info)
 {
     struct ept_data *ept = &((struct p2m_domain *)info)->ept;
@@ -724,6 +886,7 @@ int ept_p2m_init(struct p2m_domain *p2m)
     p2m->set_entry = ept_set_entry;
     p2m->get_entry = ept_get_entry;
     p2m->change_entry_type_global = ept_change_entry_type_global;
+    p2m->memory_type_changed = ept_memory_type_changed;
     p2m->audit_p2m = NULL;
 
     /* Set the memory type used when accessing EPT paging structures. */
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -200,6 +200,18 @@ void p2m_change_entry_type_global(struct
     p2m_unlock(p2m);
 }
 
+void p2m_memory_type_changed(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    if ( p2m->memory_type_changed )
+    {
+        p2m_lock(p2m);
+        p2m->memory_type_changed(p2m);
+        p2m_unlock(p2m);
+    }
+}
+
 mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
                     p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
                     unsigned int *page_order, bool_t locked)
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -124,6 +124,9 @@ struct arch_vmx_struct {
 
     unsigned long        host_cr0;
 
+    /* Do we need to tolerate a spurious EPT_MISCONFIG VM exit? */
+    bool_t               ept_spurious_misconfig;
+
     /* Is the guest in real mode? */
     uint8_t              vmx_realmode;
     /* Are we emulating rather than VMENTERing? */
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -520,6 +520,7 @@ int ept_p2m_init(struct p2m_domain *p2m)
 void ept_p2m_uninit(struct p2m_domain *p2m);
 
 void ept_walk_table(struct domain *d, unsigned long gfn);
+bool_t ept_handle_misconfig(uint64_t gpa);
 void setup_ept_dump(void);
 
 void update_guest_eip(void);
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -233,6 +233,7 @@ struct p2m_domain {
     void               (*change_entry_type_global)(struct p2m_domain *p2m,
                                                    p2m_type_t ot,
                                                    p2m_type_t nt);
+    void               (*memory_type_changed)(struct p2m_domain *p2m);
     
     void               (*write_p2m_entry)(struct p2m_domain *p2m,
                                           unsigned long gfn, l1_pgentry_t *p,
@@ -506,6 +507,9 @@ void p2m_change_type_range(struct domain
 p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn,
                            p2m_type_t ot, p2m_type_t nt);
 
+/* Report a change affecting memory types. */
+void p2m_memory_type_changed(struct domain *d);
+
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
 int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);


Subject: Re: [RFC] utilizing EPT_MISCONFIG VM exit
From: Zhang, Yang Z @ 2014-02-27 03:09 UTC
  To: Jan Beulich, Dong, Eddie, Nakajima, Jun, Tim Deegan; +Cc: xen-devel

Jan Beulich wrote on 2014-02-24:
> The attached draft patch (still containing debugging code, and
> assuming other adjustments have happened beforehand) demonstrates
> that we should evaluate whether - not only for dealing with the
> memory type adjustments here, but for basically everything currently
> done through ->change_entry_type_global() - utilizing the
> EPT_MISCONFIG VM exit would be an architecturally clean approach.
> The fundamental idea is to defer updates to the page tables until
> the respective entries actually get used, instead of iterating
> through all page tables at the time the change is requested, thus
> - avoiding (here) or eliminating (elsewhere) long-lasting operations
>   without having to introduce expensive/fragile preemption handling
> - leaving unaffected the sharing of the page tables with the IOMMU
>   (since the EPT memory type field is available to the programmer on
>   the IOMMU side; we obviously can't use the read/write bits without
>   affecting the IOMMU)
>
> The main question obviously is whether it is architecturally safe to
> use any particular, presently invalid memory type (right now the
> patches use type 7, i.e. the value defined for UC- in the PAT MSR
> only), or whether such an invalid type could be determined at run
> time.
> 
> Obviously, if we can go this route on EPT, the goal ought to be to
> eliminate ->change_entry_type_global() altogether (i.e. also from
> the generic P2M code) by using on-access adjustments instead of
> on-request ones. Quite likely that would involve adding an address
> range to ->memory_type_changed().
> 

Nice idea. Yes, the only concern is the reserved memory type encoding in EPT. I will forward this internally to ask for help.

> Jan


Best regards,
Yang


Subject: Re: [RFC] utilizing EPT_MISCONFIG VM exit
From: Tim Deegan @ 2014-02-27 14:45 UTC
  To: Jan Beulich; +Cc: Yang Z Zhang, xen-devel, Eddie Dong, Jun Nakajima

At 13:05 +0000 on 24 Feb (1393243536), Jan Beulich wrote:
> The attached draft patch (still containing debugging code, and
> assuming other adjustments have happened beforehand) demonstrates
> that we should evaluate whether - not only for dealing with the
> memory type adjustments here, but for basically everything currently
> done through ->change_entry_type_global() - utilizing the
> EPT_MISCONFIG VM exit would be an architecturally clean approach.
> The fundamental idea is to defer updates to the page tables until
> the respective entries actually get used, instead of iterating
> through all page tables at the time the change is requested, thus
> - avoiding (here) or eliminating (elsewhere) long-lasting operations
>   without having to introduce expensive/fragile preemption handling
> - leaving unaffected the sharing of the page tables with the IOMMU
>   (since the EPT memory type field is available to the programmer on
>   the IOMMU side; we obviously can't use the read/write bits without
>   affecting the IOMMU)

Looks like a pretty good plan to me. :)

> The main question obviously is whether it is architecturally safe to
> use any particular, presently invalid memory type (right now the
> patches use type 7, i.e. the value defined for UC- in the PAT MSR
> only), or whether such an invalid type could be determined at run
> time.

Another question is whether we can easily do the same on AMD.  The AMD
nested pagetable fault will flag reserved-bit errors in EXITINFO1,
which looks good enough, but the AMD IOMMU spec lists all of the PTE's
reserved bits as reserved in the IOMMU too. :|

> Obviously, if we can go this route on EPT, the goal ought to be to
> eliminate ->change_entry_type_global() altogether (i.e. also from
> the generic P2M code) by using on-access adjustments instead of
> on-request ones. Quite likely that would involve adding an address
> range to ->memory_type_changed().

Sure, that makes sense.

Tim.
