* [PATCH v2 0/6] x86/P2M: reduce time bulk type changes take
@ 2014-04-22 12:22 Jan Beulich
  2014-04-22 12:28 ` [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types Jan Beulich
                   ` (5 more replies)
  0 siblings, 6 replies; 21+ messages in thread
From: Jan Beulich @ 2014-04-22 12:22 UTC (permalink / raw)
  To: xen-devel
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Tim Deegan,
	Eddie Dong, Jun Nakajima, Boris Ostrovsky

1: EPT: don't walk entire page tables when globally changing types
2: EPT: don't walk entire page tables when changing types on a range
3: P2M: simplify write_p2m_entry()
4: NPT: don't walk entire page tables when changing types on a range
5: NPT: don't walk entire page tables when globally changing types
6: P2M: cleanup

Beyond this series it might also be worthwhile to consider making
HVMOP_set_mem_type use p2m_change_type_range(), avoiding the need for
explicit preemption there.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Add overview comments to the more complex of the functions added.


* [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types
  2014-04-22 12:22 [PATCH v2 0/6] x86/P2M: reduce time bulk type changes take Jan Beulich
@ 2014-04-22 12:28 ` Jan Beulich
  2014-04-24 15:41   ` Tim Deegan
  2014-04-25 19:37   ` Konrad Rzeszutek Wilk
  2014-04-22 12:29 ` [PATCH v2 2/6] x86/EPT: don't walk entire page tables when changing types on a range Jan Beulich
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 21+ messages in thread
From: Jan Beulich @ 2014-04-22 12:28 UTC (permalink / raw)
  To: xen-devel
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Tim Deegan,
	Eddie Dong, Jun Nakajima, Boris Ostrovsky

Instead, leverage the EPT_MISCONFIG VM exit by marking just the top
level entries as needing recalculation of their type, propagating the
recalculation state down as necessary such that the actual
recalculation gets done upon access.

For this to work, we have to
- restrict the types between which conversions can be done (right now
  only the two types involved in log dirty tracking need to be taken
  care of)
- remember the ranges that log dirty tracking was requested for as well
  as whether global log dirty tracking is in effect

Signed-off-by: Jan Beulich <jbeulich@suse.com>
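
For illustration, here is a minimal standalone sketch (plain C, not Xen
code; the interval table and the GFN values in it are invented) of the
tri-state classification that p2m_is_logdirty_range() below performs,
together with the superpage mask arithmetic used in
p2m_change_type_range(): a superpage-sized block is either fully
log-dirty (1), entirely outside the tracked ranges (0), or mixed (-1),
the latter forcing a page split.

    #include <stdio.h>
    #include <stdbool.h>

    struct range { unsigned long s, e; };          /* inclusive GFN bounds */

    /* Stand-in for Xen's log-dirty rangeset; the values are made up. */
    static const struct range logdirty[] = { { 0xa0, 0xbf } };
    static const bool global_logdirty = false;

    static int is_logdirty_range(unsigned long start, unsigned long end)
    {
        if ( global_logdirty )
            return 1;
        for ( unsigned int i = 0; i < sizeof(logdirty) / sizeof(*logdirty); ++i )
        {
            if ( start >= logdirty[i].s && end <= logdirty[i].e )
                return 1;                          /* fully contained */
            if ( start <= logdirty[i].e && end >= logdirty[i].s )
                return -1;                         /* partial overlap */
        }
        return 0;                                  /* no overlap at all */
    }

    int main(void)
    {
        unsigned int order = 9;                    /* 2M superpage */
        unsigned long mask = ~0UL << order;

        for ( unsigned long gfn = 0; gfn < 0x400; gfn += 1UL << order )
            printf("[%#lx,%#lx] -> %d\n", gfn & mask, gfn | ~mask,
                   is_logdirty_range(gfn & mask, gfn | ~mask));
        return 0;
    }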

--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -110,11 +110,18 @@ int hap_track_dirty_vram(struct domain *
         if ( begin_pfn != dirty_vram->begin_pfn ||
              begin_pfn + nr != dirty_vram->end_pfn )
         {
+            unsigned long ostart = dirty_vram->begin_pfn;
+            unsigned long oend = dirty_vram->end_pfn;
+
             dirty_vram->begin_pfn = begin_pfn;
             dirty_vram->end_pfn = begin_pfn + nr;
 
             paging_unlock(d);
 
+            if ( oend > ostart )
+                p2m_change_type_range(d, ostart, oend,
+                                      p2m_ram_logdirty, p2m_ram_rw);
+
             /* set l1e entries of range within P2M table to be read-only. */
             p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
                                   p2m_ram_rw, p2m_ram_logdirty);
@@ -150,11 +157,16 @@ int hap_track_dirty_vram(struct domain *
              * If zero pages specified while tracking dirty vram
              * then stop tracking
              */
+            begin_pfn = dirty_vram->begin_pfn;
+            nr = dirty_vram->end_pfn - dirty_vram->begin_pfn;
             xfree(dirty_vram);
             d->arch.hvm_domain.dirty_vram = NULL;
         }
 
         paging_unlock(d);
+        if ( nr )
+            p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
+                                  p2m_ram_logdirty, p2m_ram_rw);
     }
 out:
     if ( dirty_bitmap )
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -116,8 +116,14 @@ static int p2m_init_hostp2m(struct domai
 
     if ( p2m )
     {
-        d->arch.p2m = p2m;
-        return 0;
+        p2m->logdirty_ranges = rangeset_new(d, "log-dirty",
+                                            RANGESETF_prettyprint_hex);
+        if ( p2m->logdirty_ranges )
+        {
+            d->arch.p2m = p2m;
+            return 0;
+        }
+        p2m_free_one(p2m);
     }
     return -ENOMEM;
 }
@@ -129,6 +135,7 @@ static void p2m_teardown_hostp2m(struct 
 
     if ( p2m )
     {
+        rangeset_destroy(p2m->logdirty_ranges);
         p2m_free_one(p2m);
         d->arch.p2m = NULL;
     }
@@ -191,12 +198,25 @@ int p2m_init(struct domain *d)
     return rc;
 }
 
+int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start,
+                          unsigned long end)
+{
+    ASSERT(!p2m_is_nestedp2m(p2m));
+    if ( p2m->global_logdirty ||
+         rangeset_contains_range(p2m->logdirty_ranges, start, end) )
+        return 1;
+    if ( rangeset_overlaps_range(p2m->logdirty_ranges, start, end) )
+        return -1;
+    return 0;
+}
+
 void p2m_change_entry_type_global(struct domain *d,
                                   p2m_type_t ot, p2m_type_t nt)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
     p2m_lock(p2m);
     p2m->change_entry_type_global(p2m, ot, nt);
+    p2m->global_logdirty = (nt == p2m_ram_logdirty);
     p2m_unlock(p2m);
 }
 
@@ -713,6 +733,7 @@ void p2m_change_type_range(struct domain
     unsigned long gfn;
     mfn_t mfn;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    int rc = 0;
 
     BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
 
@@ -726,11 +747,22 @@ void p2m_change_type_range(struct domain
         mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, &order);
         while ( order > PAGE_ORDER_4K )
         {
-            if ( pt != ot )
-                break;
-            if ( !(gfn & ((1UL << order) - 1)) &&
-                 end > (gfn | ((1UL << order) - 1)) )
-                break;
+            unsigned long mask = ~0UL << order;
+
+            /*
+             * Log-dirty ranges starting/ending in the middle of a super page
+             * (with a page split still pending) can't have a consistent type
+             * reported for the full range and hence need the split to be
+             * enforced here.
+             */
+            if ( !p2m_is_changeable(pt) ||
+                 p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 )
+            {
+                if ( pt != ot )
+                    break;
+                if ( !(gfn & ~mask) && end > (gfn | ~mask) )
+                    break;
+            }
             if ( order == PAGE_ORDER_1G )
                 order = PAGE_ORDER_2M;
             else
@@ -744,6 +776,26 @@ void p2m_change_type_range(struct domain
             break;
     }
 
+    switch ( nt )
+    {
+    case p2m_ram_rw:
+        if ( ot == p2m_ram_logdirty )
+            rc = rangeset_remove_range(p2m->logdirty_ranges, start, end - 1);
+        break;
+    case p2m_ram_logdirty:
+        if ( ot == p2m_ram_rw )
+            rc = rangeset_add_range(p2m->logdirty_ranges, start, end - 1);
+        break;
+    default:
+        break;
+    }
+    if ( rc )
+    {
+        printk(XENLOG_G_ERR "Error %d manipulating Dom%d's log-dirty ranges\n",
+               rc, d->domain_id);
+        domain_crash(d);
+    }
+
     p2m->defer_nested_flush = 0;
     if ( nestedhvm_enabled(d) )
         p2m_flush_nestedp2m(d);
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -187,7 +187,6 @@ static int ept_split_super_page(struct p
         epte->mfn += i * trunk;
         epte->snp = (iommu_enabled && iommu_snoop);
         ASSERT(!epte->rsvd1);
-        ASSERT(!epte->avail1);
         ASSERT(!epte->avail3);
 
         ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
@@ -270,7 +269,12 @@ static int ept_next_level(struct p2m_dom
     return GUEST_TABLE_NORMAL_PAGE;
 }
 
-static bool_t ept_invalidate_emt(mfn_t mfn)
+/*
+ * Invalidate (via setting the EMT field to an invalid value) all valid
+ * present entries in the given page table, optionally marking the entries
+ * also for their subtrees needing P2M type re-calculation.
+ */
+static bool_t ept_invalidate_emt(mfn_t mfn, bool_t recalc)
 {
     ept_entry_t *epte = map_domain_page(mfn_x(mfn));
     unsigned int i;
@@ -281,10 +285,12 @@ static bool_t ept_invalidate_emt(mfn_t m
         ept_entry_t e = atomic_read_ept_entry(&epte[i]);
 
         if ( !is_epte_valid(&e) || !is_epte_present(&e) ||
-             e.emt == MTRR_NUM_TYPES )
+             (e.emt == MTRR_NUM_TYPES && (e.recalc || !recalc)) )
             continue;
 
         e.emt = MTRR_NUM_TYPES;
+        if ( recalc )
+            e.recalc = 1;
         atomic_write_ept_entry(&epte[i], e);
         changed = 1;
     }
@@ -294,23 +300,25 @@ static bool_t ept_invalidate_emt(mfn_t m
     return changed;
 }
 
-bool_t ept_handle_misconfig(uint64_t gpa)
+/*
+ * Resolve deliberately mis-configured (EMT field set to an invalid value)
+ * entries in the page table hierarchy for the given GFN:
+ * - calculate the correct value for the EMT field
+ * - if marked so, re-calculate the P2M type
+ * - propagate EMT and re-calculation flag down to the next page table level
+ *   for entries not involved in the translation of the given GFN
+ */
+static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
 {
-    struct vcpu *curr = current;
-    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
     struct ept_data *ept = &p2m->ept;
     unsigned int level = ept_get_wl(ept);
-    unsigned long gfn = PFN_DOWN(gpa);
     unsigned long mfn = ept_get_asr(ept);
     ept_entry_t *epte;
-    int okay;
+    int rc = 0;
 
     if ( !mfn )
         return 0;
 
-    p2m_lock(p2m);
-
-    okay = -curr->arch.hvm_vmx.ept_spurious_misconfig;
     for ( ; ; --level )
     {
         ept_entry_t e;
@@ -340,6 +348,13 @@ bool_t ept_handle_misconfig(uint64_t gpa
                                                _mfn(e.mfn), 0, &ipat,
                                                e.sa_p2mt == p2m_mmio_direct);
                     e.ipat = ipat;
+                    if ( e.recalc && p2m_is_changeable(e.sa_p2mt) )
+                    {
+                         e.sa_p2mt = p2m_is_logdirty_range(p2m, gfn + i, gfn + i)
+                                     ? p2m_ram_logdirty : p2m_ram_rw;
+                         ept_p2m_type_to_flags(&e, e.sa_p2mt, e.access);
+                    }
+                    e.recalc = 0;
                     atomic_write_ept_entry(&epte[i], e);
                 }
             }
@@ -348,6 +363,25 @@ bool_t ept_handle_misconfig(uint64_t gpa
                 int emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn),
                                              level * EPT_TABLE_ORDER, &ipat,
                                              e.sa_p2mt == p2m_mmio_direct);
+
+                if ( e.recalc && p2m_is_changeable(e.sa_p2mt) )
+                {
+                     unsigned long mask = ~0UL << (level * EPT_TABLE_ORDER);
+
+                     switch ( p2m_is_logdirty_range(p2m, gfn & mask,
+                                                    gfn | ~mask) )
+                     {
+                     case 0:
+                          e.sa_p2mt = p2m_ram_rw;
+                          break;
+                     case 1:
+                          e.sa_p2mt = p2m_ram_logdirty;
+                          break;
+                     default: /* Force split. */
+                          emt = -1;
+                          break;
+                     }
+                }
                 if ( unlikely(emt < 0) )
                 {
                     if ( ept_split_super_page(p2m, &e, level, level - 1) )
@@ -357,27 +391,31 @@ bool_t ept_handle_misconfig(uint64_t gpa
                         continue;
                     }
                     ept_free_entry(p2m, &e, level);
-                    okay = 0;
+                    rc = -ENOMEM;
                     break;
                 }
                 e.emt = emt;
                 e.ipat = ipat;
+                if ( e.recalc && p2m_is_changeable(e.sa_p2mt) )
+                    ept_p2m_type_to_flags(&e, e.sa_p2mt, e.access);
+                e.recalc = 0;
                 atomic_write_ept_entry(&epte[i], e);
             }
 
-            okay = 1;
+            rc = 1;
             break;
         }
 
         if ( e.emt == MTRR_NUM_TYPES )
         {
             ASSERT(is_epte_present(&e));
-            ept_invalidate_emt(_mfn(e.mfn));
+            ept_invalidate_emt(_mfn(e.mfn), e.recalc);
             smp_wmb();
             e.emt = 0;
+            e.recalc = 0;
             atomic_write_ept_entry(&epte[i], e);
             unmap_domain_page(epte);
-            okay = 1;
+            rc = 1;
         }
         else if ( is_epte_present(&e) && !e.emt )
             unmap_domain_page(epte);
@@ -388,18 +426,34 @@ bool_t ept_handle_misconfig(uint64_t gpa
     }
 
     unmap_domain_page(epte);
-    if ( okay > 0 )
+    if ( rc )
     {
         struct vcpu *v;
 
-        for_each_vcpu ( curr->domain, v )
+        for_each_vcpu ( p2m->domain, v )
             v->arch.hvm_vmx.ept_spurious_misconfig = 1;
     }
+
+    return rc;
+}
+
+bool_t ept_handle_misconfig(uint64_t gpa)
+{
+    struct vcpu *curr = current;
+    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+    bool_t spurious;
+    int rc;
+
+    p2m_lock(p2m);
+
+    spurious = curr->arch.hvm_vmx.ept_spurious_misconfig;
+    rc = resolve_misconfig(p2m, PFN_DOWN(gpa));
     curr->arch.hvm_vmx.ept_spurious_misconfig = 0;
     ept_sync_domain(p2m);
+
     p2m_unlock(p2m);
 
-    return !!okay;
+    return rc >= !spurious;
 }
 
 /*
@@ -416,12 +470,11 @@ ept_set_entry(struct p2m_domain *p2m, un
     unsigned long gfn_remainder = gfn;
     int i, target = order / EPT_TABLE_ORDER;
     int rc = 0;
-    int ret = 0;
     bool_t direct_mmio = (p2mt == p2m_mmio_direct);
     uint8_t ipat = 0;
     int need_modify_vtd_table = 1;
     int vtd_pte_present = 0;
-    int needs_sync = 1;
+    int ret, needs_sync = -1;
     ept_entry_t old_entry = { .epte = 0 };
     ept_entry_t new_entry = { .epte = 0 };
     struct ept_data *ept = &p2m->ept;
@@ -439,12 +492,23 @@ ept_set_entry(struct p2m_domain *p2m, un
          (order % EPT_TABLE_ORDER) )
         return -EINVAL;
 
+    /* Carry out any possibly pending earlier changes first. */
+    ret = resolve_misconfig(p2m, gfn);
+    if ( ret < 0 )
+    {
+        ept_sync_domain(p2m);
+        return ret;
+    }
+    if ( ret > 0 )
+        needs_sync = 1;
+
     ASSERT((target == 2 && hvm_hap_has_1gb()) ||
            (target == 1 && hvm_hap_has_2mb()) ||
            (target == 0));
 
     table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
 
+    ret = GUEST_TABLE_MAP_FAILED;
     for ( i = ept_get_wl(ept); i > target; i-- )
     {
         ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i);
@@ -478,7 +542,7 @@ ept_set_entry(struct p2m_domain *p2m, un
         /* We reached the target level. */
 
         /* No need to flush if the old entry wasn't valid */
-        if ( !is_epte_present(ept_entry) )
+        if ( needs_sync < 0 && !is_epte_present(ept_entry) )
             needs_sync = 0;
 
         /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
@@ -596,6 +660,7 @@ static mfn_t ept_get_entry(struct p2m_do
     u32 index;
     int i;
     int ret = 0;
+    bool_t recalc = 0;
     mfn_t mfn = _mfn(INVALID_MFN);
     struct ept_data *ept = &p2m->ept;
 
@@ -611,6 +676,8 @@ static mfn_t ept_get_entry(struct p2m_do
     for ( i = ept_get_wl(ept); i > 0; i-- )
     {
     retry:
+        if ( table[gfn_remainder >> (i * EPT_TABLE_ORDER)].recalc )
+            recalc = 1;
         ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
         if ( !ret )
             goto out;
@@ -657,7 +724,12 @@ static mfn_t ept_get_entry(struct p2m_do
 
     if ( is_epte_valid(ept_entry) )
     {
-        *t = ept_entry->sa_p2mt;
+        if ( (recalc || ept_entry->recalc) &&
+             p2m_is_changeable(ept_entry->sa_p2mt) )
+            *t = p2m_is_logdirty_range(p2m, gfn, gfn) ? p2m_ram_logdirty
+                                                      : p2m_ram_rw;
+        else
+            *t = ept_entry->sa_p2mt;
         *a = ept_entry->access;
 
         mfn = _mfn(ept_entry->mfn);
@@ -733,53 +805,18 @@ out:
     return;
 }
 
-/*
- * Walk the whole p2m table, changing any entries of the old type
- * to the new type.  This is used in hardware-assisted paging to
- * quickly enable or diable log-dirty tracking
- */
-static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level,
-                                       p2m_type_t ot, p2m_type_t nt)
-{
-    ept_entry_t e, *epte = map_domain_page(mfn_x(ept_page_mfn));
-
-    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( !is_epte_valid(epte + i) )
-            continue;
-
-        if ( (ept_page_level > 0) && !is_epte_superpage(epte + i) )
-            ept_change_entry_type_page(_mfn(epte[i].mfn),
-                                       ept_page_level - 1, ot, nt);
-        else
-        {
-            e = atomic_read_ept_entry(&epte[i]);
-            if ( e.sa_p2mt != ot )
-                continue;
-
-            e.sa_p2mt = nt;
-            ept_p2m_type_to_flags(&e, nt, e.access);
-            atomic_write_ept_entry(&epte[i], e);
-        }
-    }
-
-    unmap_domain_page(epte);
-}
-
 static void ept_change_entry_type_global(struct p2m_domain *p2m,
                                          p2m_type_t ot, p2m_type_t nt)
 {
-    struct ept_data *ept = &p2m->ept;
-    if ( ept_get_asr(ept) == 0 )
-        return;
+    unsigned long mfn = ept_get_asr(&p2m->ept);
 
-    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
-    BUG_ON(p2m_is_mmio(ot) || p2m_is_mmio(nt));
+    if ( !mfn || ot == nt )
+        return;
 
-    ept_change_entry_type_page(_mfn(ept_get_asr(ept)),
-                               ept_get_wl(ept), ot, nt);
+    BUG_ON(!p2m_is_changeable(ot) || !p2m_is_changeable(nt));
 
-    ept_sync_domain(p2m);
+    if ( ept_invalidate_emt(_mfn(mfn), 1) )
+        ept_sync_domain(p2m);
 }
 
 static void ept_memory_type_changed(struct p2m_domain *p2m)
@@ -789,7 +826,7 @@ static void ept_memory_type_changed(stru
     if ( !mfn )
         return;
 
-    if ( ept_invalidate_emt(_mfn(mfn)) )
+    if ( ept_invalidate_emt(_mfn(mfn), 0) )
         ept_sync_domain(p2m);
 }
 
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -38,7 +38,7 @@ typedef union {
         ipat        :   1,  /* bit 6 - Ignore PAT memory type */
         sp          :   1,  /* bit 7 - Is this a superpage? */
         rsvd1       :   2,  /* bits 9:8 - Reserved for future use */
-        avail1      :   1,  /* bit 10 - Software available 1 */
+        recalc      :   1,  /* bit 10 - Software available 1 */
         snp         :   1,  /* bit 11 - VT-d snoop control in shared
                                EPT/VT-d usage */
         mfn         :   40, /* bits 51:12 - Machine physical frame number */
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -139,6 +139,10 @@ typedef unsigned int p2m_query_t;
                       | p2m_to_mask(p2m_grant_map_ro)   \
                       | p2m_to_mask(p2m_ram_shared) )
 
+/* Types that can be subject to bulk transitions. */
+#define P2M_CHANGEABLE_TYPES (p2m_to_mask(p2m_ram_rw) \
+                              | p2m_to_mask(p2m_ram_logdirty) )
+
 #define P2M_POD_TYPES (p2m_to_mask(p2m_populate_on_demand))
 
 /* Pageable types */
@@ -167,6 +171,7 @@ typedef unsigned int p2m_query_t;
 #define p2m_is_hole(_t) (p2m_to_mask(_t) & P2M_HOLE_TYPES)
 #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
+#define p2m_is_changeable(_t) (p2m_to_mask(_t) & P2M_CHANGEABLE_TYPES)
 #define p2m_is_pod(_t) (p2m_to_mask(_t) & P2M_POD_TYPES)
 #define p2m_is_grant(_t) (p2m_to_mask(_t) & P2M_GRANT_TYPES)
 /* Grant types are *not* considered valid, because they can be
@@ -209,6 +214,11 @@ struct p2m_domain {
      * threaded on in LRU order. */
     struct list_head   np2m_list;
 
+    /* Host p2m: Log-dirty ranges registered for the domain. */
+    struct rangeset   *logdirty_ranges;
+
+    /* Host p2m: Global log-dirty mode enabled for the domain. */
+    bool_t             global_logdirty;
 
     /* Host p2m: when this flag is set, don't flush all the nested-p2m 
      * tables on every host-p2m change.  The setter of this flag 
@@ -510,6 +520,9 @@ p2m_type_t p2m_change_type(struct domain
 /* Report a change affecting memory types. */
 void p2m_memory_type_changed(struct domain *d);
 
+int p2m_is_logdirty_range(struct p2m_domain *, unsigned long start,
+                          unsigned long end);
+
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
 int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

* [PATCH v2 2/6] x86/EPT: don't walk entire page tables when changing types on a range
  2014-04-22 12:22 [PATCH v2 0/6] x86/P2M: reduce time bulk type changes take Jan Beulich
  2014-04-22 12:28 ` [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types Jan Beulich
@ 2014-04-22 12:29 ` Jan Beulich
  2014-04-24 16:00   ` Tim Deegan
  2014-04-22 12:30 ` [PATCH v2 3/6] x86/P2M: simplify write_p2m_entry() Jan Beulich
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 21+ messages in thread
From: Jan Beulich @ 2014-04-22 12:29 UTC (permalink / raw)
  To: xen-devel
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Tim Deegan,
	Eddie Dong, Jun Nakajima, Boris Ostrovsky

This requires a new P2M backend hook and a little bit of extra care on
accounting in the generic function.

Note that even on leaf entries we must not immediately set the new
type (in an attempt to avoid the EPT_MISCONFIG VM exits), since the
global accounting in p2m_change_type_range() gets intentionally done
only after updating page tables (or else the update there would
conflict with the function's own use of p2m_is_logdirty_range()), and
the correct type can only be calculated with that in place.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
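
To illustrate the range decomposition done by the new
ept_change_entry_type_range() hook below, here is a minimal standalone
sketch (plain C, not Xen code; the GFN values are invented): the
unaligned head and tail of [first_gfn, last_gfn] are invalidated at the
current level, then the mask widens by one table level and the now
aligned middle is handled one level up. (A range starting at GFN 0 and
extending past the highest mapped PFN never reaches the hook, as
p2m_change_type_range() diverts it to change_entry_type_global().)

    #include <stdio.h>

    #define TABLE_ORDER 9                     /* 512 entries per EPT level */

    /* Print the (level, sub-range) pieces that would get invalidated. */
    static void decompose(unsigned long first_gfn, unsigned long last_gfn,
                          unsigned int walk_level)
    {
        unsigned long mask = (1UL << TABLE_ORDER) - 1;

        for ( unsigned int level = 0; level <= walk_level; )
        {
            if ( first_gfn & mask )           /* unaligned head */
            {
                unsigned long end_gfn = first_gfn | mask;

                if ( end_gfn > last_gfn )
                    end_gfn = last_gfn;
                printf("level %u: [%#lx,%#lx]\n", level, first_gfn, end_gfn);
                if ( end_gfn >= last_gfn )
                    return;
                first_gfn = end_gfn + 1;
            }
            else if ( (last_gfn & mask) != mask ) /* unaligned tail */
            {
                unsigned long start_gfn = last_gfn & ~mask;

                if ( start_gfn < first_gfn )
                    start_gfn = first_gfn;
                printf("level %u: [%#lx,%#lx]\n", level, start_gfn, last_gfn);
                if ( start_gfn <= first_gfn )
                    return;
                last_gfn = start_gfn - 1;
            }
            else                              /* both ends aligned: go up */
            {
                ++level;
                mask |= mask << TABLE_ORDER;
            }
        }
    }

    int main(void)
    {
        decompose(0xa12, 0x5f345, 3);         /* arbitrary range, 4-level EPT */
        return 0;
    }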

--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -730,17 +730,33 @@ void p2m_change_type_range(struct domain
 {
     p2m_access_t a;
     p2m_type_t pt;
-    unsigned long gfn;
+    unsigned long gfn = start;
     mfn_t mfn;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
     int rc = 0;
 
-    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    ASSERT(ot != nt);
+    ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt));
 
     p2m_lock(p2m);
     p2m->defer_nested_flush = 1;
 
-    for ( gfn = start; gfn < end; )
+    if ( unlikely(end > p2m->max_mapped_pfn) )
+    {
+        if ( !gfn )
+        {
+            p2m->change_entry_type_global(p2m, ot, nt);
+            gfn = end;
+        }
+        end = p2m->max_mapped_pfn + 1;
+    }
+
+    if ( gfn < end && p2m->change_entry_type_range )
+    {
+        rc = p2m->change_entry_type_range(p2m, ot, nt, gfn, end - 1);
+        gfn = end;
+    }
+    while ( !rc && gfn < end )
     {
         unsigned int order;
 
@@ -769,12 +785,18 @@ void p2m_change_type_range(struct domain
                 order = PAGE_ORDER_4K;
         }
         if ( pt == ot )
-            p2m_set_entry(p2m, gfn, mfn, order, nt, a);
+            rc = p2m_set_entry(p2m, gfn, mfn, order, nt, a);
         gfn += 1UL << order;
         gfn &= -1UL << order;
         if ( !gfn )
             break;
     }
+    if ( rc )
+    {
+        printk(XENLOG_G_ERR "Error %d changing Dom%d GFNs [%lx,%lx] from %d to %d\n",
+               rc, d->domain_id, start, end - 1, ot, nt);
+        domain_crash(d);
+    }
 
     switch ( nt )
     {
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -301,6 +301,75 @@ static bool_t ept_invalidate_emt(mfn_t m
 }
 
 /*
+ * Just like ept_invalidate_emt() except that not all entries at the
+ * targeted level will need processing. The passed in range is guaranteed
+ * to not cross a page (table) boundary at the targeted level.
+ */
+static int ept_invalidate_emt_range(struct p2m_domain *p2m,
+                                    unsigned int target,
+                                    unsigned long first_gfn,
+                                    unsigned long last_gfn)
+{
+    ept_entry_t *table;
+    unsigned long gfn_remainder = first_gfn;
+    unsigned int i, index;
+    int rc = 0, ret = GUEST_TABLE_MAP_FAILED;
+
+    table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
+    for ( i = ept_get_wl(&p2m->ept); i > target; --i )
+    {
+        ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
+        if ( ret == GUEST_TABLE_MAP_FAILED )
+            goto out;
+        if ( ret != GUEST_TABLE_NORMAL_PAGE )
+            break;
+    }
+
+    if ( i > target )
+    {
+        /* We need to split the original page. */
+        ept_entry_t split_ept_entry;
+
+        index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+        split_ept_entry = atomic_read_ept_entry(&table[index]);
+        ASSERT(is_epte_superpage(&split_ept_entry));
+        if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
+        {
+            ept_free_entry(p2m, &split_ept_entry, i);
+            rc = -ENOMEM;
+            goto out;
+        }
+        atomic_write_ept_entry(&table[index], split_ept_entry);
+
+        for ( ; i > target; --i )
+            if ( !ept_next_level(p2m, 1, &table, &gfn_remainder, i) )
+                break;
+        ASSERT(i == target);
+    }
+
+    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+    i = (last_gfn >> (i * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1);
+    for ( ; index <= i; ++index )
+    {
+        ept_entry_t e = atomic_read_ept_entry(&table[index]);
+
+        if ( is_epte_valid(&e) && is_epte_present(&e) &&
+             (e.emt != MTRR_NUM_TYPES || !e.recalc) )
+        {
+            e.emt = MTRR_NUM_TYPES;
+            e.recalc = 1;
+            atomic_write_ept_entry(&table[index], e);
+            rc = 1;
+        }
+    }
+
+ out:
+    unmap_domain_page(table);
+
+    return rc;
+}
+
+/*
  * Resolve deliberately mis-configured (EMT field set to an invalid value)
  * entries in the page table hierarchy for the given GFN:
  * - calculate the correct value for the EMT field
@@ -819,6 +888,53 @@ static void ept_change_entry_type_global
         ept_sync_domain(p2m);
 }
 
+static int ept_change_entry_type_range(struct p2m_domain *p2m,
+                                       p2m_type_t ot, p2m_type_t nt,
+                                       unsigned long first_gfn,
+                                       unsigned long last_gfn)
+{
+    unsigned int i, wl = ept_get_wl(&p2m->ept);
+    unsigned long mask = (1 << EPT_TABLE_ORDER) - 1;
+    int rc = 0, sync = 0;
+
+    if ( !ept_get_asr(&p2m->ept) )
+        return -EINVAL;
+
+    for ( i = 0; i <= wl; )
+    {
+        if ( first_gfn & mask )
+        {
+            unsigned long end_gfn = min(first_gfn | mask, last_gfn);
+
+            rc = ept_invalidate_emt_range(p2m, i, first_gfn, end_gfn);
+            sync |= rc;
+            if ( rc < 0 || end_gfn >= last_gfn )
+                break;
+            first_gfn = end_gfn + 1;
+        }
+        else if ( (last_gfn & mask) != mask )
+        {
+            unsigned long start_gfn = max(first_gfn, last_gfn & ~mask);
+
+            rc = ept_invalidate_emt_range(p2m, i, start_gfn, last_gfn);
+            sync |= rc;
+            if ( rc < 0 || start_gfn <= first_gfn )
+                break;
+            last_gfn = start_gfn - 1;
+        }
+        else
+        {
+            ++i;
+            mask |= mask << EPT_TABLE_ORDER;
+        }
+    }
+
+    if ( sync )
+        ept_sync_domain(p2m);
+
+    return rc < 0 ? rc : 0;
+}
+
 static void ept_memory_type_changed(struct p2m_domain *p2m)
 {
     unsigned long mfn = ept_get_asr(&p2m->ept);
@@ -867,6 +983,7 @@ int ept_p2m_init(struct p2m_domain *p2m)
     p2m->set_entry = ept_set_entry;
     p2m->get_entry = ept_get_entry;
     p2m->change_entry_type_global = ept_change_entry_type_global;
+    p2m->change_entry_type_range = ept_change_entry_type_range;
     p2m->memory_type_changed = ept_memory_type_changed;
     p2m->audit_p2m = NULL;
 
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -243,6 +243,10 @@ struct p2m_domain {
     void               (*change_entry_type_global)(struct p2m_domain *p2m,
                                                    p2m_type_t ot,
                                                    p2m_type_t nt);
+    int                (*change_entry_type_range)(struct p2m_domain *p2m,
+                                                  p2m_type_t ot, p2m_type_t nt,
+                                                  unsigned long first_gfn,
+                                                  unsigned long last_gfn);
     void               (*memory_type_changed)(struct p2m_domain *p2m);
     
     void               (*write_p2m_entry)(struct p2m_domain *p2m,



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

* [PATCH v2 3/6] x86/P2M: simplify write_p2m_entry()
  2014-04-22 12:22 [PATCH v2 0/6] x86/P2M: reduce time bulk type changes take Jan Beulich
  2014-04-22 12:28 ` [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types Jan Beulich
  2014-04-22 12:29 ` [PATCH v2 2/6] x86/EPT: don't walk entire page tables when changing types on a range Jan Beulich
@ 2014-04-22 12:30 ` Jan Beulich
  2014-04-24 16:00   ` Tim Deegan
  2014-04-22 12:30 ` [PATCH v2 4/6] x86/NPT: don't walk entire page tables when changing types on a range Jan Beulich
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 21+ messages in thread
From: Jan Beulich @ 2014-04-22 12:30 UTC (permalink / raw)
  To: xen-devel
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Tim Deegan,
	Eddie Dong, Jun Nakajima, Boris Ostrovsky

[-- Attachment #1: Type: text/plain, Size: 15303 bytes --]

The "table_mfn" parameter really isn't needed anywhere, so it gets
dropped.

The "struct vcpu *" one was always bogus (as was being made up by
paging_write_p2m_entry()), and is not commonly used. It can be easily
enough made up in the one place (sh_unshadow_for_p2m_change()) it is
needed, and we can otherwise pass "struct domain *" instead, properly
reflecting that P2M operations are per-domain.
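
As a rough sketch of the resulting callback shape (a simplified example,
not one of the actual implementations, which remain in hap.c and
shadow/common.c), a handler still wanting a vCPU can make one up from the
domain the same way paging_write_p2m_entry() does:

static void example_write_p2m_entry(struct domain *d, unsigned long gfn,
                                    l1_pgentry_t *p, l1_pgentry_t new,
                                    unsigned int level)
{
    struct vcpu *v = current;

    /* Make up a suitable vCPU only where one is actually needed. */
    if ( v->domain != d )
        v = d->vcpu ? d->vcpu[0] : NULL;

    /* ... mode specific processing based on d (and possibly v) ... */
    safe_write_pte(p, new);
}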

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -695,10 +695,9 @@ static void hap_update_paging_modes(stru
 }
 
 static void
-hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
-                    mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
+hap_write_p2m_entry(struct domain *d, unsigned long gfn, l1_pgentry_t *p,
+                    l1_pgentry_t new, unsigned int level)
 {
-    struct domain *d = v->domain;
     uint32_t old_flags;
     bool_t flush_nestedp2m = 0;
 
--- a/xen/arch/x86/mm/hap/nested_hap.c
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -79,7 +79,7 @@
 
 void
 nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
-    l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
+    l1_pgentry_t *p, l1_pgentry_t new, unsigned int level)
 {
     struct domain *d = p2m->domain;
     uint32_t old_flags;
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -156,7 +156,7 @@ static void p2m_add_iommu_flags(l1_pgent
 
 /* Returns: 0 for success, -errno for failure */
 static int
-p2m_next_level(struct p2m_domain *p2m, mfn_t *table_mfn, void **table,
+p2m_next_level(struct p2m_domain *p2m, void **table,
                unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
                u32 max, unsigned long type)
 {
@@ -185,15 +185,15 @@ p2m_next_level(struct p2m_domain *p2m, m
         switch ( type ) {
         case PGT_l3_page_table:
             p2m_add_iommu_flags(&new_entry, 3, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, new_entry, 4);
             break;
         case PGT_l2_page_table:
             p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, new_entry, 3);
             break;
         case PGT_l1_page_table:
             p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, new_entry, 2);
             break;
         default:
             BUG();
@@ -221,14 +221,13 @@ p2m_next_level(struct p2m_domain *p2m, m
         {
             new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
             p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn,
-                l1_entry+i, *table_mfn, new_entry, 2);
+            p2m->write_p2m_entry(p2m, gfn, l1_entry + i, new_entry, 2);
         }
         unmap_domain_page(l1_entry);
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
                                  __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
         p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, new_entry, 3);
     }
 
 
@@ -256,20 +255,17 @@ p2m_next_level(struct p2m_domain *p2m, m
         {
             new_entry = l1e_from_pfn(pfn + i, flags);
             p2m_add_iommu_flags(&new_entry, 0, 0);
-            p2m->write_p2m_entry(p2m, gfn,
-                l1_entry+i, *table_mfn, new_entry, 1);
+            p2m->write_p2m_entry(p2m, gfn, l1_entry + i, new_entry, 1);
         }
         unmap_domain_page(l1_entry);
         
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
                                  __PAGE_HYPERVISOR|_PAGE_USER);
         p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
-        p2m->write_p2m_entry(p2m, gfn,
-            p2m_entry, *table_mfn, new_entry, 2);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, new_entry, 2);
     }
 
-    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
-    next = map_domain_page(mfn_x(*table_mfn));
+    next = map_domain_page(l1e_get_pfn(*p2m_entry));
     unmap_domain_page(*table);
     *table = next;
 
@@ -282,8 +278,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
                  unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
 {
     /* XXX -- this might be able to be faster iff current->domain == d */
-    mfn_t table_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
-    void *table = map_domain_page(mfn_x(table_mfn));
+    void *table;
     unsigned long i, gfn_remainder = gfn;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t entry_content;
@@ -312,7 +307,8 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
     }
 
-    rc = p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+    table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+    rc = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table);
     if ( rc )
@@ -349,7 +345,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
             old_mfn = l1e_get_pfn(*p2m_entry);
         }
 
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 3);
         /* NB: paging_write_p2m_entry() handles tlb flushes properly */
 
         /* Free old intermediate tables if necessary */
@@ -358,8 +354,8 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
     }
     else 
     {
-        rc = p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder,
-                            gfn, L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+        rc = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
+                            L3_PAGETABLE_SHIFT - PAGE_SHIFT,
                             L3_PAGETABLE_ENTRIES, PGT_l2_page_table);
         if ( rc )
             goto out;
@@ -367,7 +363,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
 
     if ( page_order == PAGE_ORDER_4K )
     {
-        rc = p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+        rc = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table);
         if ( rc )
@@ -390,7 +386,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
             old_mfn = l1e_get_pfn(*p2m_entry);
         }
         /* level 1 entry */
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 1);
         /* NB: paging_write_p2m_entry() handles tlb flushes properly */
     }
     else if ( page_order == PAGE_ORDER_2M )
@@ -426,7 +422,7 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
             old_mfn = l1e_get_pfn(*p2m_entry);
         }
 
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, entry_content, 2);
         /* NB: paging_write_p2m_entry() handles tlb flushes properly */
 
         /* Free old intermediate tables if necessary */
@@ -660,7 +656,7 @@ static void p2m_pt_change_entry_type_glo
                 l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
                 p2m->write_p2m_entry(p2m, gfn,
                                      (l1_pgentry_t *)&l3e[i3],
-                                     l3mfn, l1e_content, 3);
+                                     l1e_content, 3);
                 continue;
             }
 
@@ -687,7 +683,7 @@ static void p2m_pt_change_entry_type_glo
                     l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
                     p2m->write_p2m_entry(p2m, gfn,
                                          (l1_pgentry_t *)&l2e[i2],
-                                         l2mfn, l1e_content, 2);
+                                         l1e_content, 2);
                     continue;
                 }
 
@@ -706,7 +702,7 @@ static void p2m_pt_change_entry_type_glo
                     flags = p2m_type_to_flags(nt, _mfn(mfn));
                     l1e_content = p2m_l1e_from_pfn(mfn, flags);
                     p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
-                                         l1mfn, l1e_content, 1);
+                                         l1e_content, 1);
                 }
                 unmap_domain_page(l1e);
             }
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -724,18 +724,15 @@ void paging_update_nestedmode(struct vcp
 }
 
 void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
-                            l1_pgentry_t *p, mfn_t table_mfn,
-                            l1_pgentry_t new, unsigned int level)
+                            l1_pgentry_t *p, l1_pgentry_t new,
+                            unsigned int level)
 {
     struct domain *d = p2m->domain;
     struct vcpu *v = current;
     if ( v->domain != d )
         v = d->vcpu ? d->vcpu[0] : NULL;
     if ( likely(v && paging_mode_enabled(d) && paging_get_hostmode(v) != NULL) )
-    {
-        return paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn,
-                                                       new, level);
-    }
+        paging_get_hostmode(v)->write_p2m_entry(d, gfn, p, new, level);
     else
         safe_write_pte(p, new);
 }
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -3310,11 +3310,14 @@ static int shadow_test_disable(struct do
  * shadow processing jobs.
  */
 
-static void sh_unshadow_for_p2m_change(struct vcpu *v, unsigned long gfn, 
-                                       l1_pgentry_t *p, mfn_t table_mfn, 
-                                       l1_pgentry_t new, unsigned int level)
+static void sh_unshadow_for_p2m_change(struct domain *d, unsigned long gfn,
+                                       l1_pgentry_t *p, l1_pgentry_t new,
+                                       unsigned int level)
 {
-    struct domain *d = v->domain;
+    struct vcpu *v = current;
+
+    if ( v->domain != d )
+        v = d->vcpu ? d->vcpu[0] : NULL;
 
     /* The following assertion is to make sure we don't step on 1GB host
      * page support of HVM guest. */
@@ -3379,18 +3382,16 @@ static void sh_unshadow_for_p2m_change(s
 }
 
 void
-shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn, 
-                       l1_pgentry_t *p, mfn_t table_mfn, 
-                       l1_pgentry_t new, unsigned int level)
+shadow_write_p2m_entry(struct domain *d, unsigned long gfn,
+                       l1_pgentry_t *p, l1_pgentry_t new,
+                       unsigned int level)
 {
-    struct domain *d = v->domain;
-    
     paging_lock(d);
 
     /* If there are any shadows, update them.  But if shadow_teardown()
      * has already been called then it's not safe to try. */ 
     if ( likely(d->arch.paging.shadow.total_pages != 0) )
-         sh_unshadow_for_p2m_change(v, gfn, p, table_mfn, new, level);
+         sh_unshadow_for_p2m_change(d, gfn, p, new, level);
 
     /* Update the entry with new content */
     safe_write_pte(p, new);
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -359,9 +359,9 @@ extern int sh_remove_write_access(struct
                                   unsigned long fault_addr);
 
 /* Functions that atomically write PT/P2M entries and update state */
-void shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn, 
-                            l1_pgentry_t *p, mfn_t table_mfn,
-                            l1_pgentry_t new, unsigned int level);
+void shadow_write_p2m_entry(struct domain *d, unsigned long gfn,
+                            l1_pgentry_t *p, l1_pgentry_t new,
+                            unsigned int level);
 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
                              intpte_t new, mfn_t gmfn);
 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -251,8 +251,7 @@ struct p2m_domain {
     
     void               (*write_p2m_entry)(struct p2m_domain *p2m,
                                           unsigned long gfn, l1_pgentry_t *p,
-                                          mfn_t table_mfn, l1_pgentry_t new,
-                                          unsigned int level);
+                                          l1_pgentry_t new, unsigned int level);
     long               (*audit_p2m)(struct p2m_domain *p2m);
 
     /* Default P2M access type for each page in the the domain: new pages,
@@ -679,7 +678,7 @@ void p2m_flush(struct vcpu *v, struct p2
 void p2m_flush_nestedp2m(struct domain *d);
 
 void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
-    l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level);
+    l1_pgentry_t *p, l1_pgentry_t new, unsigned int level);
 
 #endif /* _XEN_P2M_H */
 
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -111,9 +111,8 @@ struct paging_mode {
                                             unsigned int *page_order);
     void          (*update_cr3            )(struct vcpu *v, int do_locking);
     void          (*update_paging_modes   )(struct vcpu *v);
-    void          (*write_p2m_entry       )(struct vcpu *v, unsigned long gfn,
-                                            l1_pgentry_t *p, mfn_t table_mfn, 
-                                            l1_pgentry_t new, 
+    void          (*write_p2m_entry       )(struct domain *d, unsigned long gfn,
+                                            l1_pgentry_t *p, l1_pgentry_t new,
                                             unsigned int level);
     int           (*write_guest_entry     )(struct vcpu *v, intpte_t *p,
                                             intpte_t new, mfn_t gmfn);
@@ -335,9 +334,9 @@ static inline void safe_write_pte(l1_pge
  * we are writing. */
 struct p2m_domain;
 
-void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, 
-                            l1_pgentry_t *p, mfn_t table_mfn,
-                            l1_pgentry_t new, unsigned int level);
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+                            l1_pgentry_t *p, l1_pgentry_t new,
+                            unsigned int level);
 
 /* Called from the guest to indicate that the a process is being
  * torn down and its pagetables will soon be discarded */



[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 4/6] x86/NPT: don't walk entire page tables when changing types on a range
  2014-04-22 12:22 [PATCH v2 0/6] x86/P2M: reduce time bulk type changes take Jan Beulich
                   ` (2 preceding siblings ...)
  2014-04-22 12:30 ` [PATCH v2 3/6] x86/P2M: simplify write_p2m_entry() Jan Beulich
@ 2014-04-22 12:30 ` Jan Beulich
  2014-04-24 16:25   ` Tim Deegan
  2014-04-22 12:31 ` [PATCH v2 5/6] x86/NPT: don't walk entire page tables when globally changing types Jan Beulich
  2014-04-22 12:32 ` [PATCH v2 6/6] x86/P2M: cleanup Jan Beulich
  5 siblings, 1 reply; 21+ messages in thread
From: Jan Beulich @ 2014-04-22 12:30 UTC (permalink / raw)
  To: xen-devel
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Tim Deegan,
	Eddie Dong, Jun Nakajima, Boris Ostrovsky

[-- Attachment #1: Type: text/plain, Size: 18273 bytes --]

This builds on the fact that in order for no NPF VM exit to occur,
_PAGE_USER must always be set. I.e. by clearing the flag we can force a
VM exit, allowing us to do lazy type changes similar to those on EPT.

That way, the generic entry-wise code can go away, and we could remove
the range restriction enforced on HVMOP_track_dirty_vram for XSA-27.
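
As a rough sketch of the marking step (simplified, with a made-up helper
name; the real logic lives in p2m_pt_set_recalc_range() and do_recalc()
below):

static void mark_entry_for_recalc(struct p2m_domain *p2m, unsigned long gfn,
                                  l1_pgentry_t *pent, unsigned int level)
{
    l1_pgentry_t e = *pent;

    if ( l1e_get_flags(e) & _PAGE_PRESENT )
    {
        /* Clearing U guarantees an NPF VM exit on the next guest access. */
        l1e_remove_flags(e, _PAGE_USER | _PAGE_ACCESSED);
        p2m->write_p2m_entry(p2m, gfn, pent, e, level);
    }
}

The NPF handler then gets to do_recalc() via p2m_npt_fault(), re-deriving
the intended type for the faulting GFN and restoring the flags.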

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -2557,7 +2557,16 @@ void svm_vmexit_handler(struct cpu_user_
         perfc_incra(svmexits, VMEXIT_NPF_PERFC);
         if ( cpu_has_svm_decode )
             v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
-        svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
+        rc = p2m_npt_fault(vmcb->exitinfo2);
+        if ( rc >= 0 )
+            svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
+        else
+        {
+            printk(XENLOG_G_ERR
+                   "%pv: Error %d handling NPF (gpa=%08lx ec=%04lx)\n",
+                   v, rc, vmcb->exitinfo2, vmcb->exitinfo1);
+            domain_crash(v->domain);
+        }
         v->arch.hvm_svm.cached_insn_len = 0;
         break;
 
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -728,10 +728,7 @@ void p2m_change_type_range(struct domain
                            unsigned long start, unsigned long end,
                            p2m_type_t ot, p2m_type_t nt)
 {
-    p2m_access_t a;
-    p2m_type_t pt;
     unsigned long gfn = start;
-    mfn_t mfn;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
     int rc = 0;
 
@@ -750,47 +747,8 @@ void p2m_change_type_range(struct domain
         }
         end = p2m->max_mapped_pfn + 1;
     }
-
-    if ( gfn < end && p2m->change_entry_type_range )
-    {
+    if ( gfn < end )
         rc = p2m->change_entry_type_range(p2m, ot, nt, gfn, end - 1);
-        gfn = end;
-    }
-    while ( !rc && gfn < end )
-    {
-        unsigned int order;
-
-        mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, &order);
-        while ( order > PAGE_ORDER_4K )
-        {
-            unsigned long mask = ~0UL << order;
-
-            /*
-             * Log-dirty ranges starting/ending in the middle of a super page
-             * (with a page split still pending) can't have a consistent type
-             * reported for the full range and hence need the split to be
-             * enforced here.
-             */
-            if ( !p2m_is_changeable(pt) ||
-                 p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 )
-            {
-                if ( pt != ot )
-                    break;
-                if ( !(gfn & ~mask) && end > (gfn | ~mask) )
-                    break;
-            }
-            if ( order == PAGE_ORDER_1G )
-                order = PAGE_ORDER_2M;
-            else
-                order = PAGE_ORDER_4K;
-        }
-        if ( pt == ot )
-            rc = p2m_set_entry(p2m, gfn, mfn, order, nt, a);
-        gfn += 1UL << order;
-        gfn &= -1UL << order;
-        if ( !gfn )
-            break;
-    }
     if ( rc )
     {
         printk(XENLOG_G_ERR "Error %d changing Dom%d GFNs [%lx,%lx] from %d to %d\n",
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -60,6 +60,19 @@
 #define P2M_BASE_FLAGS \
         (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
 
+#define RECALC_FLAGS (_PAGE_USER|_PAGE_ACCESSED)
+#define set_recalc(level, ent) level##e_remove_flags(ent, RECALC_FLAGS)
+#define clear_recalc(level, ent) level##e_add_flags(ent, RECALC_FLAGS)
+#define _needs_recalc(flags) (!((flags) & _PAGE_USER))
+#define needs_recalc(level, ent) _needs_recalc(level##e_get_flags(ent))
+#define valid_recalc(level, ent) (!(level##e_get_flags(ent) & _PAGE_ACCESSED))
+
+static const unsigned long pgt[] = {
+    PGT_l1_page_table,
+    PGT_l2_page_table,
+    PGT_l3_page_table
+};
+
 static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
 {
     unsigned long flags;
@@ -272,6 +285,196 @@ p2m_next_level(struct p2m_domain *p2m, v
     return 0;
 }
 
+/*
+ * Mark (via clearing the U flag) as needing P2M type re-calculation all valid
+ * present entries at the targeted level for the passed in GFN range, which is
+ * guaranteed to not cross a page (table) boundary at that level.
+ */
+static int p2m_pt_set_recalc_range(struct p2m_domain *p2m,
+                                   unsigned int level,
+                                   unsigned long first_gfn,
+                                   unsigned long last_gfn)
+{
+    void *table;
+    unsigned long gfn_remainder = first_gfn, remainder;
+    unsigned int i;
+    l1_pgentry_t *pent, *plast;
+    int err = 0;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+    for ( i = 4; i-- > level; )
+    {
+        remainder = gfn_remainder;
+        pent = p2m_find_entry(table, &remainder, first_gfn,
+                              i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+        if ( !pent )
+        {
+            err = -EINVAL;
+            goto out;
+        }
+
+        if ( !(l1e_get_flags(*pent) & _PAGE_PRESENT) )
+            goto out;
+
+        err = p2m_next_level(p2m, &table, &gfn_remainder, first_gfn,
+                             i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER,
+                             pgt[i - 1]);
+        if ( err )
+            goto out;
+    }
+
+    remainder = gfn_remainder + (last_gfn - first_gfn);
+    pent = p2m_find_entry(table, &gfn_remainder, first_gfn,
+                          i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+    plast = p2m_find_entry(table, &remainder, last_gfn,
+                           i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+    if ( pent && plast )
+        for ( ; pent <= plast; ++pent )
+        {
+            l1_pgentry_t e = *pent;
+
+            if ( (l1e_get_flags(e) & _PAGE_PRESENT) && !needs_recalc(l1, e) )
+            {
+                set_recalc(l1, e);
+                p2m->write_p2m_entry(p2m, first_gfn, pent, e, level);
+            }
+            first_gfn += 1UL << (i * PAGETABLE_ORDER);
+        }
+    else
+        err = -EIO;
+
+ out:
+    unmap_domain_page(table);
+
+    return err;
+}
+
+/*
+ * Handle possibly necessary P2M type re-calculation (U flag clear for a
+ * present entry) for the entries in the page table hierarchy for the given
+ * GFN. Propagate the re-calculation flag down to the next page table level
+ * for entries not involved in the translation of the given GFN.
+ */
+static int do_recalc(struct p2m_domain *p2m, unsigned long gfn)
+{
+    void *table;
+    unsigned long gfn_remainder = gfn;
+    unsigned int level = 4;
+    l1_pgentry_t *pent;
+    int err = 0;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+    while ( --level )
+    {
+        unsigned long remainder = gfn_remainder;
+
+        pent = p2m_find_entry(table, &remainder, gfn,
+                              level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+        if ( !pent || !(l1e_get_flags(*pent) & _PAGE_PRESENT) )
+            goto out;
+
+        if ( l1e_get_flags(*pent) & _PAGE_PSE )
+        {
+            unsigned long mask = ~0UL << (level * PAGETABLE_ORDER);
+
+            if ( !needs_recalc(l1, *pent) ||
+                 !p2m_is_changeable(p2m_flags_to_type(l1e_get_flags(*pent))) ||
+                 p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 )
+                break;
+        }
+
+        err = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
+                             level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER,
+                             pgt[level - 1]);
+        if ( err )
+            goto out;
+
+        if ( needs_recalc(l1, *pent) )
+        {
+            l1_pgentry_t e = *pent, *ptab = table;
+            unsigned int i;
+
+            if ( !valid_recalc(l1, e) )
+                P2M_DEBUG("bogus recalc state at d%d:%lx:%u\n",
+                          p2m->domain->domain_id, gfn, level);
+            remainder = gfn_remainder;
+            for ( i = 0; i < (1 << PAGETABLE_ORDER); ++i )
+            {
+                l1_pgentry_t ent = ptab[i];
+
+                if ( (l1e_get_flags(ent) & _PAGE_PRESENT) &&
+                     !needs_recalc(l1, ent) )
+                {
+                    set_recalc(l1, ent);
+                    p2m->write_p2m_entry(p2m, gfn - remainder, &ptab[i],
+                                         ent, level);
+                }
+                remainder -= 1UL << ((level - 1) * PAGETABLE_ORDER);
+            }
+            smp_wmb();
+            clear_recalc(l1, e);
+            p2m->write_p2m_entry(p2m, gfn, pent, e, level + 1);
+        }
+    }
+
+    pent = p2m_find_entry(table, &gfn_remainder, gfn,
+                          level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+    if ( pent && (l1e_get_flags(*pent) & _PAGE_PRESENT) &&
+         needs_recalc(l1, *pent) )
+    {
+        l1_pgentry_t e = *pent;
+
+        if ( !valid_recalc(l1, e) )
+            P2M_DEBUG("bogus recalc leaf at d%d:%lx:%u\n",
+                      p2m->domain->domain_id, gfn, level);
+        if ( p2m_is_changeable(p2m_flags_to_type(l1e_get_flags(e))) )
+        {
+            unsigned long mask = ~0UL << (level * PAGETABLE_ORDER);
+            p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
+                              ? p2m_ram_logdirty : p2m_ram_rw;
+            unsigned long mfn = l1e_get_pfn(e);
+            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
+
+            if ( level )
+            {
+                if ( flags & _PAGE_PAT )
+                {
+                     BUILD_BUG_ON(_PAGE_PAT != _PAGE_PSE);
+                     mfn |= _PAGE_PSE_PAT >> PAGE_SHIFT;
+                }
+                else
+                     mfn &= ~(_PAGE_PSE_PAT >> PAGE_SHIFT);
+                flags |= _PAGE_PSE;
+            }
+            e = l1e_from_pfn(mfn, flags);
+            p2m_add_iommu_flags(&e, level,
+                                (p2mt == p2m_ram_rw)
+                                ? IOMMUF_readable|IOMMUF_writable : 0);
+            ASSERT(!needs_recalc(l1, e));
+        }
+        else
+            clear_recalc(l1, e);
+        p2m->write_p2m_entry(p2m, gfn, pent, e, level + 1);
+    }
+
+ out:
+    unmap_domain_page(table);
+
+    return err;
+}
+
+int p2m_npt_fault(uint64_t gpa)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(current->domain);
+    int rc;
+
+    p2m_lock(p2m);
+    rc = do_recalc(p2m, PFN_DOWN(gpa));
+    p2m_unlock(p2m);
+
+    return rc;
+}
+
 /* Returns: 0 for success, -errno for failure */
 static int
 p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
@@ -307,6 +510,11 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
     }
 
+    /* Carry out any eventually pending earlier changes first. */
+    rc = do_recalc(p2m, gfn);
+    if ( rc < 0 )
+        return rc;
+
     table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
     rc = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
@@ -459,6 +667,15 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
     return rc;
 }
 
+static inline p2m_type_t recalc_type(bool_t recalc, p2m_type_t t,
+                                     struct p2m_domain *p2m, unsigned long gfn)
+{
+    if ( !recalc || !p2m_is_changeable(t) )
+        return t;
+    return p2m_is_logdirty_range(p2m, gfn, gfn) ? p2m_ram_logdirty
+                                                : p2m_ram_rw;
+}
+
 static mfn_t
 p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
                  p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
@@ -468,8 +685,9 @@ p2m_pt_get_entry(struct p2m_domain *p2m,
     paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
     l2_pgentry_t *l2e;
     l1_pgentry_t *l1e;
-    unsigned long l1e_flags;
+    unsigned int flags;
     p2m_type_t l1t;
+    bool_t recalc;
 
     ASSERT(paging_mode_translate(p2m->domain));
 
@@ -496,15 +714,17 @@ p2m_pt_get_entry(struct p2m_domain *p2m,
             return _mfn(INVALID_MFN);
         }
         mfn = _mfn(l4e_get_pfn(*l4e));
+        recalc = needs_recalc(l4, *l4e);
         unmap_domain_page(l4e);
     }
     {
         l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
         l3e += l3_table_offset(addr);
 pod_retry_l3:
-        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+        flags = l3e_get_flags(*l3e);
+        if ( !(flags & _PAGE_PRESENT) )
         {
-            if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
+            if ( p2m_flags_to_type(flags) == p2m_populate_on_demand )
             {
                 if ( q & P2M_ALLOC )
                 {
@@ -518,12 +738,13 @@ pod_retry_l3:
             unmap_domain_page(l3e);
             return _mfn(INVALID_MFN);
         }
-        else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+        if ( flags & _PAGE_PSE )
         {
             mfn = _mfn(l3e_get_pfn(*l3e) +
                        l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
                        l1_table_offset(addr));
-            *t = p2m_flags_to_type(l3e_get_flags(*l3e));
+            *t = recalc_type(recalc || _needs_recalc(flags),
+                             p2m_flags_to_type(flags), p2m, gfn);
             unmap_domain_page(l3e);
 
             ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
@@ -533,6 +754,8 @@ pod_retry_l3:
         }
 
         mfn = _mfn(l3e_get_pfn(*l3e));
+        if ( _needs_recalc(flags) )
+            recalc = 1;
         unmap_domain_page(l3e);
     }
 
@@ -540,10 +763,11 @@ pod_retry_l3:
     l2e += l2_table_offset(addr);
 
 pod_retry_l2:
-    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+    flags = l2e_get_flags(*l2e);
+    if ( !(flags & _PAGE_PRESENT) )
     {
         /* PoD: Try to populate a 2-meg chunk */
-        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
+        if ( p2m_flags_to_type(flags) == p2m_populate_on_demand )
         {
             if ( q & P2M_ALLOC ) {
                 if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_2M, q) )
@@ -555,10 +779,11 @@ pod_retry_l2:
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
-    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    if ( flags & _PAGE_PSE )
     {
         mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
-        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+        *t = recalc_type(recalc || _needs_recalc(flags),
+                         p2m_flags_to_type(flags), p2m, gfn);
         unmap_domain_page(l2e);
         
         ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
@@ -568,14 +793,16 @@ pod_retry_l2:
     }
 
     mfn = _mfn(l2e_get_pfn(*l2e));
+    if ( needs_recalc(l2, *l2e) )
+        recalc = 1;
     unmap_domain_page(l2e);
 
     l1e = map_domain_page(mfn_x(mfn));
     l1e += l1_table_offset(addr);
 pod_retry_l1:
-    l1e_flags = l1e_get_flags(*l1e);
-    l1t = p2m_flags_to_type(l1e_flags);
-    if ( ((l1e_flags & _PAGE_PRESENT) == 0) && (!p2m_is_paging(l1t)) )
+    flags = l1e_get_flags(*l1e);
+    l1t = p2m_flags_to_type(flags);
+    if ( !(flags & _PAGE_PRESENT) && !p2m_is_paging(l1t) )
     {
         /* PoD: Try to populate */
         if ( l1t == p2m_populate_on_demand )
@@ -591,7 +818,7 @@ pod_retry_l1:
         return _mfn(INVALID_MFN);
     }
     mfn = _mfn(l1e_get_pfn(*l1e));
-    *t = l1t;
+    *t = recalc_type(recalc || _needs_recalc(flags), l1t, p2m, gfn);
     unmap_domain_page(l1e);
 
     ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t) || p2m_is_paging(*t));
@@ -714,6 +941,47 @@ static void p2m_pt_change_entry_type_glo
     unmap_domain_page(l4e);
 }
 
+static int p2m_pt_change_entry_type_range(struct p2m_domain *p2m,
+                                          p2m_type_t ot, p2m_type_t nt,
+                                          unsigned long first_gfn,
+                                          unsigned long last_gfn)
+{
+    unsigned long mask = (1 << PAGETABLE_ORDER) - 1;
+    unsigned int i;
+    int err = 0;
+
+    ASSERT(hap_enabled(p2m->domain));
+
+    for ( i = 1; i <= 4; )
+    {
+        if ( first_gfn & mask )
+        {
+            unsigned long end_gfn = min(first_gfn | mask, last_gfn);
+
+            err = p2m_pt_set_recalc_range(p2m, i, first_gfn, end_gfn);
+            if ( err || end_gfn >= last_gfn )
+                break;
+            first_gfn = end_gfn + 1;
+        }
+        else if ( (last_gfn & mask) != mask )
+        {
+            unsigned long start_gfn = max(first_gfn, last_gfn & ~mask);
+
+            err = p2m_pt_set_recalc_range(p2m, i, start_gfn, last_gfn);
+            if ( err || start_gfn <= first_gfn )
+                break;
+            last_gfn = start_gfn - 1;
+        }
+        else
+        {
+            ++i;
+            mask |= mask << PAGETABLE_ORDER;
+        }
+    }
+
+    return err;
+}
+
 #if P2M_AUDIT
 long p2m_pt_audit_p2m(struct p2m_domain *p2m)
 {
@@ -872,6 +1140,7 @@ void p2m_pt_init(struct p2m_domain *p2m)
     p2m->set_entry = p2m_pt_set_entry;
     p2m->get_entry = p2m_pt_get_entry;
     p2m->change_entry_type_global = p2m_pt_change_entry_type_global;
+    p2m->change_entry_type_range = p2m_pt_change_entry_type_range;
     p2m->write_p2m_entry = paging_write_p2m_entry;
 #if P2M_AUDIT
     p2m->audit_p2m = p2m_pt_audit_p2m;
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -668,6 +668,8 @@ static inline p2m_type_t p2m_flags_to_ty
     return (flags >> 12) & 0x7f;
 }
 
+int p2m_npt_fault(uint64_t gpa);
+
 /*
  * Nested p2m: shadow p2m tables used for nested HVM virtualization 
  */



+
+    return err;
+}
+
+int p2m_npt_fault(uint64_t gpa)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(current->domain);
+    int rc;
+
+    p2m_lock(p2m);
+    rc = do_recalc(p2m, PFN_DOWN(gpa));
+    p2m_unlock(p2m);
+
+    return rc;
+}
+
 /* Returns: 0 for success, -errno for failure */
 static int
 p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
@@ -307,6 +510,11 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
         __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
     }
 
+    /* Carry out any eventually pending earlier changes first. */
+    rc = do_recalc(p2m, gfn);
+    if ( rc < 0 )
+        return rc;
+
     table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
     rc = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
@@ -459,6 +667,15 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
     return rc;
 }
 
+static inline p2m_type_t recalc_type(bool_t recalc, p2m_type_t t,
+                                     struct p2m_domain *p2m, unsigned long gfn)
+{
+    if ( !recalc || !p2m_is_changeable(t) )
+        return t;
+    return p2m_is_logdirty_range(p2m, gfn, gfn) ? p2m_ram_logdirty
+                                                : p2m_ram_rw;
+}
+
 static mfn_t
 p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
                  p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
@@ -468,8 +685,9 @@ p2m_pt_get_entry(struct p2m_domain *p2m,
     paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
     l2_pgentry_t *l2e;
     l1_pgentry_t *l1e;
-    unsigned long l1e_flags;
+    unsigned int flags;
     p2m_type_t l1t;
+    bool_t recalc;
 
     ASSERT(paging_mode_translate(p2m->domain));
 
@@ -496,15 +714,17 @@ p2m_pt_get_entry(struct p2m_domain *p2m,
             return _mfn(INVALID_MFN);
         }
         mfn = _mfn(l4e_get_pfn(*l4e));
+        recalc = needs_recalc(l4, *l4e);
         unmap_domain_page(l4e);
     }
     {
         l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
         l3e += l3_table_offset(addr);
 pod_retry_l3:
-        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+        flags = l3e_get_flags(*l3e);
+        if ( !(flags & _PAGE_PRESENT) )
         {
-            if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
+            if ( p2m_flags_to_type(flags) == p2m_populate_on_demand )
             {
                 if ( q & P2M_ALLOC )
                 {
@@ -518,12 +738,13 @@ pod_retry_l3:
             unmap_domain_page(l3e);
             return _mfn(INVALID_MFN);
         }
-        else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+        if ( flags & _PAGE_PSE )
         {
             mfn = _mfn(l3e_get_pfn(*l3e) +
                        l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
                        l1_table_offset(addr));
-            *t = p2m_flags_to_type(l3e_get_flags(*l3e));
+            *t = recalc_type(recalc || _needs_recalc(flags),
+                             p2m_flags_to_type(flags), p2m, gfn);
             unmap_domain_page(l3e);
 
             ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
@@ -533,6 +754,8 @@ pod_retry_l3:
         }
 
         mfn = _mfn(l3e_get_pfn(*l3e));
+        if ( _needs_recalc(flags) )
+            recalc = 1;
         unmap_domain_page(l3e);
     }
 
@@ -540,10 +763,11 @@ pod_retry_l3:
     l2e += l2_table_offset(addr);
 
 pod_retry_l2:
-    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+    flags = l2e_get_flags(*l2e);
+    if ( !(flags & _PAGE_PRESENT) )
     {
         /* PoD: Try to populate a 2-meg chunk */
-        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
+        if ( p2m_flags_to_type(flags) == p2m_populate_on_demand )
         {
             if ( q & P2M_ALLOC ) {
                 if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_2M, q) )
@@ -555,10 +779,11 @@ pod_retry_l2:
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
-    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    if ( flags & _PAGE_PSE )
     {
         mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
-        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+        *t = recalc_type(recalc || _needs_recalc(flags),
+                         p2m_flags_to_type(flags), p2m, gfn);
         unmap_domain_page(l2e);
         
         ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
@@ -568,14 +793,16 @@ pod_retry_l2:
     }
 
     mfn = _mfn(l2e_get_pfn(*l2e));
+    if ( needs_recalc(l2, *l2e) )
+        recalc = 1;
     unmap_domain_page(l2e);
 
     l1e = map_domain_page(mfn_x(mfn));
     l1e += l1_table_offset(addr);
 pod_retry_l1:
-    l1e_flags = l1e_get_flags(*l1e);
-    l1t = p2m_flags_to_type(l1e_flags);
-    if ( ((l1e_flags & _PAGE_PRESENT) == 0) && (!p2m_is_paging(l1t)) )
+    flags = l1e_get_flags(*l1e);
+    l1t = p2m_flags_to_type(flags);
+    if ( !(flags & _PAGE_PRESENT) && !p2m_is_paging(l1t) )
     {
         /* PoD: Try to populate */
         if ( l1t == p2m_populate_on_demand )
@@ -591,7 +818,7 @@ pod_retry_l1:
         return _mfn(INVALID_MFN);
     }
     mfn = _mfn(l1e_get_pfn(*l1e));
-    *t = l1t;
+    *t = recalc_type(recalc || _needs_recalc(flags), l1t, p2m, gfn);
     unmap_domain_page(l1e);
 
     ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t) || p2m_is_paging(*t));
@@ -714,6 +941,47 @@ static void p2m_pt_change_entry_type_glo
     unmap_domain_page(l4e);
 }
 
+static int p2m_pt_change_entry_type_range(struct p2m_domain *p2m,
+                                          p2m_type_t ot, p2m_type_t nt,
+                                          unsigned long first_gfn,
+                                          unsigned long last_gfn)
+{
+    unsigned long mask = (1 << PAGETABLE_ORDER) - 1;
+    unsigned int i;
+    int err = 0;
+
+    ASSERT(hap_enabled(p2m->domain));
+
+    for ( i = 1; i <= 4; )
+    {
+        if ( first_gfn & mask )
+        {
+            unsigned long end_gfn = min(first_gfn | mask, last_gfn);
+
+            err = p2m_pt_set_recalc_range(p2m, i, first_gfn, end_gfn);
+            if ( err || end_gfn >= last_gfn )
+                break;
+            first_gfn = end_gfn + 1;
+        }
+        else if ( (last_gfn & mask) != mask )
+        {
+            unsigned long start_gfn = max(first_gfn, last_gfn & ~mask);
+
+            err = p2m_pt_set_recalc_range(p2m, i, start_gfn, last_gfn);
+            if ( err || start_gfn <= first_gfn )
+                break;
+            last_gfn = start_gfn - 1;
+        }
+        else
+        {
+            ++i;
+            mask |= mask << PAGETABLE_ORDER;
+        }
+    }
+
+    return err;
+}
+
 #if P2M_AUDIT
 long p2m_pt_audit_p2m(struct p2m_domain *p2m)
 {
@@ -872,6 +1140,7 @@ void p2m_pt_init(struct p2m_domain *p2m)
     p2m->set_entry = p2m_pt_set_entry;
     p2m->get_entry = p2m_pt_get_entry;
     p2m->change_entry_type_global = p2m_pt_change_entry_type_global;
+    p2m->change_entry_type_range = p2m_pt_change_entry_type_range;
     p2m->write_p2m_entry = paging_write_p2m_entry;
 #if P2M_AUDIT
     p2m->audit_p2m = p2m_pt_audit_p2m;
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -668,6 +668,8 @@ static inline p2m_type_t p2m_flags_to_ty
     return (flags >> 12) & 0x7f;
 }
 
+int p2m_npt_fault(uint64_t gpa);
+
 /*
  * Nested p2m: shadow p2m tables used for nested HVM virtualization 
  */

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 5/6] x86/NPT: don't walk entire page tables when globally changing types
  2014-04-22 12:22 [PATCH v2 0/6] x86/P2M: reduce time bulk type changes take Jan Beulich
                   ` (3 preceding siblings ...)
  2014-04-22 12:30 ` [PATCH v2 4/6] x86/NPT: don't walk entire page tables when changing types on a range Jan Beulich
@ 2014-04-22 12:31 ` Jan Beulich
  2014-04-24 16:28   ` Tim Deegan
  2014-04-22 12:32 ` [PATCH v2 6/6] x86/P2M: cleanup Jan Beulich
  5 siblings, 1 reply; 21+ messages in thread
From: Jan Beulich @ 2014-04-22 12:31 UTC (permalink / raw)
  To: xen-devel
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Tim Deegan,
	Eddie Dong, Jun Nakajima, Boris Ostrovsky

[-- Attachment #1: Type: text/plain, Size: 6511 bytes --]

Instead leverage the NPF VM exit enforcement by marking just the top
level entries as needing recalculation of their type, building on the
respective range type change modifications.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -214,6 +214,10 @@ void p2m_change_entry_type_global(struct
                                   p2m_type_t ot, p2m_type_t nt)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    ASSERT(ot != nt);
+    ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt));
+
     p2m_lock(p2m);
     p2m->change_entry_type_global(p2m, ot, nt);
     p2m->global_logdirty = (nt == p2m_ram_logdirty);
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -879,11 +879,9 @@ static void ept_change_entry_type_global
 {
     unsigned long mfn = ept_get_asr(&p2m->ept);
 
-    if ( !mfn || ot == nt )
+    if ( !mfn )
         return;
 
-    BUG_ON(!p2m_is_changeable(ot) || !p2m_is_changeable(nt));
-
     if ( ept_invalidate_emt(_mfn(mfn), 1) )
         ept_sync_domain(p2m);
 }
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -827,118 +827,36 @@ pod_retry_l1:
     return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN);
 }
 
-/* Walk the whole p2m table, changing any entries of the old type
- * to the new type.  This is used in hardware-assisted paging to 
- * quickly enable or diable log-dirty tracking */
 static void p2m_pt_change_entry_type_global(struct p2m_domain *p2m,
                                             p2m_type_t ot, p2m_type_t nt)
 {
-    unsigned long mfn, gfn, flags;
-    l1_pgentry_t l1e_content;
-    l1_pgentry_t *l1e;
-    l2_pgentry_t *l2e;
-    mfn_t l1mfn, l2mfn, l3mfn;
-    unsigned long i1, i2, i3;
-    l3_pgentry_t *l3e;
-    l4_pgentry_t *l4e;
-    unsigned long i4;
-
-    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
-    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
-
-    if ( !paging_mode_translate(p2m->domain) )
-        return;
+    l1_pgentry_t *tab;
+    unsigned long gfn = 0;
+    unsigned int i, changed;
 
     if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) == 0 )
         return;
 
     ASSERT(p2m_locked_by_me(p2m));
 
-    l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-
-    for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+    tab = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+    for ( changed = i = 0; i < (1 << PAGETABLE_ORDER); ++i )
     {
-        if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
-        {
-            continue;
-        }
-        l3mfn = _mfn(l4e_get_pfn(l4e[i4]));
-        l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
-        for ( i3 = 0;
-              i3 < L3_PAGETABLE_ENTRIES;
-              i3++ )
-        {
-            if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
-            {
-                continue;
-            }
-            if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
-            {
-                flags = l3e_get_flags(l3e[i3]);
-                if ( p2m_flags_to_type(flags) != ot )
-                    continue;
-                mfn = l3e_get_pfn(l3e[i3]);
-                gfn = get_gpfn_from_mfn(mfn);
-                flags = p2m_type_to_flags(nt, _mfn(mfn));
-                l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                p2m->write_p2m_entry(p2m, gfn,
-                                     (l1_pgentry_t *)&l3e[i3],
-                                     l1e_content, 3);
-                continue;
-            }
-
-            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
-            l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
-            for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
-            {
-                if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
-                {
-                    continue;
-                }
+        l1_pgentry_t e = tab[i];
 
-                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
-                {
-                    flags = l2e_get_flags(l2e[i2]);
-                    if ( p2m_flags_to_type(flags) != ot )
-                        continue;
-                    mfn = l2e_get_pfn(l2e[i2]);
-                    /* Do not use get_gpfn_from_mfn because it may return 
-                       SHARED_M2P_ENTRY */
-                    gfn = (i2 + (i3 + (i4 * L3_PAGETABLE_ENTRIES))
-                           * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
-                    flags = p2m_type_to_flags(nt, _mfn(mfn));
-                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                    p2m->write_p2m_entry(p2m, gfn,
-                                         (l1_pgentry_t *)&l2e[i2],
-                                         l1e_content, 2);
-                    continue;
-                }
-
-                l1mfn = _mfn(l2e_get_pfn(l2e[i2]));
-                l1e = map_domain_page(mfn_x(l1mfn));
-
-                for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++ )
-                {
-                    flags = l1e_get_flags(l1e[i1]);
-                    if ( p2m_flags_to_type(flags) != ot )
-                        continue;
-                    mfn = l1e_get_pfn(l1e[i1]);
-                    gfn = i1 + (i2 + (i3 + (i4 * L3_PAGETABLE_ENTRIES))
-                                * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
-                    /* create a new 1le entry with the new type */
-                    flags = p2m_type_to_flags(nt, _mfn(mfn));
-                    l1e_content = p2m_l1e_from_pfn(mfn, flags);
-                    p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
-                                         l1e_content, 1);
-                }
-                unmap_domain_page(l1e);
-            }
-            unmap_domain_page(l2e);
+        if ( (l1e_get_flags(e) & _PAGE_PRESENT) &&
+             !needs_recalc(l1, e) )
+        {
+            set_recalc(l1, e);
+            p2m->write_p2m_entry(p2m, gfn, &tab[i], e, 4);
+            ++changed;
         }
-        unmap_domain_page(l3e);
+        gfn += 1UL << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
     }
+    unmap_domain_page(tab);
 
-    unmap_domain_page(l4e);
+    if ( changed )
+         flush_tlb_mask(p2m->domain->domain_dirty_cpumask);
 }
 
 static int p2m_pt_change_entry_type_range(struct p2m_domain *p2m,



[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 6/6] x86/P2M: cleanup
  2014-04-22 12:22 [PATCH v2 0/6] x86/P2M: reduce time bulk type changes take Jan Beulich
                   ` (4 preceding siblings ...)
  2014-04-22 12:31 ` [PATCH v2 5/6] x86/NPT: don't walk entire page tables when globally changing types Jan Beulich
@ 2014-04-22 12:32 ` Jan Beulich
  2014-04-22 12:37   ` Andrew Cooper
  2014-04-24 16:30   ` Tim Deegan
  5 siblings, 2 replies; 21+ messages in thread
From: Jan Beulich @ 2014-04-22 12:32 UTC (permalink / raw)
  To: xen-devel
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Tim Deegan,
	Eddie Dong, Jun Nakajima, Boris Ostrovsky

[-- Attachment #1: Type: text/plain, Size: 3087 bytes --]

- don't abuse __PAGE_HYPERVISOR
- don't use bogus constructs like mfn_x(_mfn())

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -193,7 +193,7 @@ p2m_next_level(struct p2m_domain *p2m, v
             return -ENOMEM;
 
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
-                                 __PAGE_HYPERVISOR | _PAGE_USER);
+                                 P2M_BASE_FLAGS | _PAGE_RW);
 
         switch ( type ) {
         case PGT_l3_page_table:
@@ -229,7 +229,7 @@ p2m_next_level(struct p2m_domain *p2m, v
         flags = l1e_get_flags(*p2m_entry);
         pfn = l1e_get_pfn(*p2m_entry);
 
-        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+        l1_entry = __map_domain_page(pg);
         for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
         {
             new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
@@ -238,7 +238,7 @@ p2m_next_level(struct p2m_domain *p2m, v
         }
         unmap_domain_page(l1_entry);
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
-                                 __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
+                                 P2M_BASE_FLAGS | _PAGE_RW); /* disable PSE */
         p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
         p2m->write_p2m_entry(p2m, gfn, p2m_entry, new_entry, 3);
     }
@@ -273,7 +273,7 @@ p2m_next_level(struct p2m_domain *p2m, v
         unmap_domain_page(l1_entry);
         
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
-                                 __PAGE_HYPERVISOR|_PAGE_USER);
+                                 P2M_BASE_FLAGS | _PAGE_RW);
         p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
         p2m->write_p2m_entry(p2m, gfn, p2m_entry, new_entry, 2);
     }
@@ -929,7 +929,7 @@ long p2m_pt_audit_p2m(struct p2m_domain 
                 gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
                 continue;
             }
-            l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4]))));
+            l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
             for ( i3 = 0;
                   i3 < L3_PAGETABLE_ENTRIES;
                   i3++ )
@@ -964,7 +964,7 @@ long p2m_pt_audit_p2m(struct p2m_domain 
                     }
                 }
 
-                l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
+                l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
                 for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
                 {
                     if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
@@ -1000,7 +1000,7 @@ long p2m_pt_audit_p2m(struct p2m_domain 
                         continue;
                     }
 
-                    l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
+                    l1e = map_domain_page(l2e_get_pfn(l2e[i2]));
 
                     for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
                     {




[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 6/6] x86/P2M: cleanup
  2014-04-22 12:32 ` [PATCH v2 6/6] x86/P2M: cleanup Jan Beulich
@ 2014-04-22 12:37   ` Andrew Cooper
  2014-04-24 16:30   ` Tim Deegan
  1 sibling, 0 replies; 21+ messages in thread
From: Andrew Cooper @ 2014-04-22 12:37 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Keir Fraser, Jun Nakajima, Tim Deegan, Eddie Dong,
	suravee.suthikulpanit, xen-devel, Boris Ostrovsky


[-- Attachment #1.1: Type: text/plain, Size: 3388 bytes --]

On 22/04/14 13:32, Jan Beulich wrote:
> - don't abuse __PAGE_HYPERVISOR
> - don't use bogus constructs like mfn_x(_mfn())
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper<andrew.cooper3@citrix.com>

>
> --- a/xen/arch/x86/mm/p2m-pt.c
> +++ b/xen/arch/x86/mm/p2m-pt.c
> @@ -193,7 +193,7 @@ p2m_next_level(struct p2m_domain *p2m, v
>              return -ENOMEM;
>  
>          new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
> -                                 __PAGE_HYPERVISOR | _PAGE_USER);
> +                                 P2M_BASE_FLAGS | _PAGE_RW);
>  
>          switch ( type ) {
>          case PGT_l3_page_table:
> @@ -229,7 +229,7 @@ p2m_next_level(struct p2m_domain *p2m, v
>          flags = l1e_get_flags(*p2m_entry);
>          pfn = l1e_get_pfn(*p2m_entry);
>  
> -        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
> +        l1_entry = __map_domain_page(pg);
>          for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
>          {
>              new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
> @@ -238,7 +238,7 @@ p2m_next_level(struct p2m_domain *p2m, v
>          }
>          unmap_domain_page(l1_entry);
>          new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
> -                                 __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
> +                                 P2M_BASE_FLAGS | _PAGE_RW); /* disable PSE */
>          p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
>          p2m->write_p2m_entry(p2m, gfn, p2m_entry, new_entry, 3);
>      }
> @@ -273,7 +273,7 @@ p2m_next_level(struct p2m_domain *p2m, v
>          unmap_domain_page(l1_entry);
>          
>          new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
> -                                 __PAGE_HYPERVISOR|_PAGE_USER);
> +                                 P2M_BASE_FLAGS | _PAGE_RW);
>          p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
>          p2m->write_p2m_entry(p2m, gfn, p2m_entry, new_entry, 2);
>      }
> @@ -929,7 +929,7 @@ long p2m_pt_audit_p2m(struct p2m_domain 
>                  gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
>                  continue;
>              }
> -            l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4]))));
> +            l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
>              for ( i3 = 0;
>                    i3 < L3_PAGETABLE_ENTRIES;
>                    i3++ )
> @@ -964,7 +964,7 @@ long p2m_pt_audit_p2m(struct p2m_domain 
>                      }
>                  }
>  
> -                l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
> +                l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
>                  for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
>                  {
>                      if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
> @@ -1000,7 +1000,7 @@ long p2m_pt_audit_p2m(struct p2m_domain 
>                          continue;
>                      }
>  
> -                    l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
> +                    l1e = map_domain_page(l2e_get_pfn(l2e[i2]));
>  
>                      for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
>                      {
>
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


[-- Attachment #1.2: Type: text/html, Size: 4208 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types
  2014-04-22 12:28 ` [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types Jan Beulich
@ 2014-04-24 15:41   ` Tim Deegan
  2014-04-24 16:20     ` Jan Beulich
  2014-04-25 19:37   ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 21+ messages in thread
From: Tim Deegan @ 2014-04-24 15:41 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

At 13:28 +0100 on 22 Apr (1398169701), Jan Beulich wrote:
> Instead leverage the EPT_MISCONFIG VM exit by marking just the top
> level entries as needing recalculation of their type, propagating the
> recalculation state down as necessary such that the actual
> recalculation gets done upon access.
> 
> For this to work, we have to
> - restrict the types between which conversions can be done (right now
>   only the two types involved in log dirty tracking need to be taken
>   care of)
> - remember the ranges that log dirty tracking was requested for as well
>   as whether global log dirty tracking is in effect
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Generally looks good; I think the core idea of reusing the misconfig
mechanism is the right way to go. 

The naming you've chosen implies that it might be extended later to
other lazily-updated types; I'm not sure how well that will scale.
OTOH right now I can't think of any other users for this (and I can 
see why you're not happy with using change_type() to flush out
foreign mappings on teardown after this change).

A few comments on detail below:

> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -110,11 +110,18 @@ int hap_track_dirty_vram(struct domain *
>          if ( begin_pfn != dirty_vram->begin_pfn ||
>               begin_pfn + nr != dirty_vram->end_pfn )
>          {
> +            unsigned long ostart = dirty_vram->begin_pfn;
> +            unsigned long oend = dirty_vram->end_pfn;
> +
>              dirty_vram->begin_pfn = begin_pfn;
>              dirty_vram->end_pfn = begin_pfn + nr;
>  
>              paging_unlock(d);
>  
> +            if ( oend > ostart )
> +                p2m_change_type_range(d, ostart, oend,
> +                                      p2m_ram_logdirty, p2m_ram_rw);
> +

Does this (and the similar change below) not risk losing updates if
we're in full log-dirty mode?  I think it should be fine to leave
those entries as log-dirty, since at worst they'll provoke another
fault-and-fixup.

> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -116,8 +116,14 @@ static int p2m_init_hostp2m(struct domai
>  
>      if ( p2m )
>      {
> -        d->arch.p2m = p2m;
> -        return 0;
> +        p2m->logdirty_ranges = rangeset_new(d, "log-dirty",
> +                                            RANGESETF_prettyprint_hex);

Is there anything the guest can do to cause this rangeset to grow very
large?  Like moving its video RAM around?

> +/*
> + * Resolve deliberately mis-configured (EMT field set to an invalid value)
> + * entries in the page table hierarchy for the given GFN:
> + * - calculate the correct value for the EMT field
> + * - if marked so, re-calculate the P2M type
> + * - propagate EMT and re-calculation flag down to the next page table level
> + *   for entries not involved in the translation of the given GFN
> + */
> +static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
>  {

Please document the different return values here as well.

>  
> -    return !!okay;
> +    return rc >= !spurious;

Please just write out this logic in a human-readable way.  The compiler
will DTRT. :)  
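
Something along these lines would do (just a sketch unfolding the
expression; I'm guessing at what the rc values are meant to convey,
which is exactly why it wants spelling out):

    if ( spurious )
        return rc >= 0;  /* presumably: a spurious fault is fine if no error */
    return rc > 0;       /* presumably: something must actually have been fixed up */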

>  }
>  
>  /*
> @@ -416,12 +470,11 @@ ept_set_entry(struct p2m_domain *p2m, un
>      unsigned long gfn_remainder = gfn;
>      int i, target = order / EPT_TABLE_ORDER;
>      int rc = 0;
> -    int ret = 0;
>      bool_t direct_mmio = (p2mt == p2m_mmio_direct);
>      uint8_t ipat = 0;
>      int need_modify_vtd_table = 1;
>      int vtd_pte_present = 0;
> -    int needs_sync = 1;
> +    int ret, needs_sync = -1;

Hmmm, another tristate.  I find these hard to follow, esp.  without
comments to say which non-zero value is which.  I wonder if we could
add some sort of framework for these to make them clearer.  Ideally
we'd have an enum-like type and rely on the compiler to DTRT about
optimizing the tests.

In this case, where needs_sync isn't being passed as a return value, I
think we'd be fine with two booleans in place of this int.
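
For illustration, something like this (names invented, and I'm only
guessing at what the -1 state is meant to express):

    enum ept_sync_mode {
        sync_none,      /* no flush needed */
        sync_if_old,    /* decide once the old entry has been looked at */
        sync_always,
    } needs_sync = sync_if_old;

That way each test against needs_sync documents itself.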

And more generally, I would like to see at least a comment on every
new instance of this trick before I ack it in x86/mm code.

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 3/6] x86/P2M: simplify write_p2m_entry()
  2014-04-22 12:30 ` [PATCH v2 3/6] x86/P2M: simplify write_p2m_entry() Jan Beulich
@ 2014-04-24 16:00   ` Tim Deegan
  2014-04-24 16:22     ` Jan Beulich
  0 siblings, 1 reply; 21+ messages in thread
From: Tim Deegan @ 2014-04-24 16:00 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

At 13:30 +0100 on 22 Apr (1398169800), Jan Beulich wrote:
> The "table_mfn" parameter really isn't needed anywhere, so it gets
> dropped.
> 
> The "struct vcpu *" one was always bogus (as was being made up by
> paging_write_p2m_entry()), and is not commonly used. It can be easily
> enough made up in the one place (sh_unshadow_for_p2m_change()) it is
> needed, and we can otherwise pass "struct domain *" instead, properly
> reflecting that P2M operations are per-domain.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Tim Deegan <tim@xen.org>

and I think should be applicable independent of the rest of the
series.

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 2/6] x86/EPT: don't walk entire page tables when changing types on a range
  2014-04-22 12:29 ` [PATCH v2 2/6] x86/EPT: don't walk entire page tables when changing types on a range Jan Beulich
@ 2014-04-24 16:00   ` Tim Deegan
  0 siblings, 0 replies; 21+ messages in thread
From: Tim Deegan @ 2014-04-24 16:00 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

At 13:29 +0100 on 22 Apr (1398169764), Jan Beulich wrote:
> This requires a new P2M backend hook and a little bit of extra care on
> accounting in the generic function.
> 
> Note that even on leaf entries we must not immediately set the new
> type (in an attempt to avoid the EPT_MISCONFIG VM exits), since the
> global accounting in p2m_change_type_range() gets intentionally done
> only after updating page tables (or else the update there would
> conflict with the function's own use of p2m_is_logdirty_range()), and
> the correct type can only be calculated with that in place.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Looks good, once 1/6 is ready.

Reviewed-by: Tim Deegan <tim@xen.org>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types
  2014-04-24 15:41   ` Tim Deegan
@ 2014-04-24 16:20     ` Jan Beulich
  2014-04-25  8:44       ` Tim Deegan
  0 siblings, 1 reply; 21+ messages in thread
From: Jan Beulich @ 2014-04-24 16:20 UTC (permalink / raw)
  To: Tim Deegan
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

>>> On 24.04.14 at 17:41, <tim@xen.org> wrote:
> At 13:28 +0100 on 22 Apr (1398169701), Jan Beulich wrote:
>> --- a/xen/arch/x86/mm/hap/hap.c
>> +++ b/xen/arch/x86/mm/hap/hap.c
>> @@ -110,11 +110,18 @@ int hap_track_dirty_vram(struct domain *
>>          if ( begin_pfn != dirty_vram->begin_pfn ||
>>               begin_pfn + nr != dirty_vram->end_pfn )
>>          {
>> +            unsigned long ostart = dirty_vram->begin_pfn;
>> +            unsigned long oend = dirty_vram->end_pfn;
>> +
>>              dirty_vram->begin_pfn = begin_pfn;
>>              dirty_vram->end_pfn = begin_pfn + nr;
>>  
>>              paging_unlock(d);
>>  
>> +            if ( oend > ostart )
>> +                p2m_change_type_range(d, ostart, oend,
>> +                                      p2m_ram_logdirty, p2m_ram_rw);
>> +
> 
> Does this (and the similar change below) not risk losing updates if
> we're in full log-dirty mode?  I think it should be fine to leave
> those entries as log-dirty, since at worst they'll provoke another
> fault-and-fixup.

I don't think any updates can be lost: When their types get re-
calculated, p2m_is_logdirty_range() (taking p2m->global_logdirty
into account) will still cause them to become p2m_ram_logdirty
again.
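
(On the NPT side that's literally what recalc_type() in this series
does for a changeable entry:

    return p2m_is_logdirty_range(p2m, gfn, gfn) ? p2m_ram_logdirty
                                                : p2m_ram_rw;

and the EPT recalculation applies the same rule.)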

And these are needed mainly because ...

>> --- a/xen/arch/x86/mm/p2m.c
>> +++ b/xen/arch/x86/mm/p2m.c
>> @@ -116,8 +116,14 @@ static int p2m_init_hostp2m(struct domai
>>  
>>      if ( p2m )
>>      {
>> -        d->arch.p2m = p2m;
>> -        return 0;
>> +        p2m->logdirty_ranges = rangeset_new(d, "log-dirty",
>> +                                            RANGESETF_prettyprint_hex);
> 
> Is there anything the guest can do to cause this rangeset to grow very
> large?  Like moving its video RAM around?

... we need to avoid this. If there was ever something added that
could cause multiple such regions to be created, we'd need to add
an upper limit on the number of permissible ranges. But there can
only be one VRAM region right now, so we're safe.

>> @@ -416,12 +470,11 @@ ept_set_entry(struct p2m_domain *p2m, un
>>      unsigned long gfn_remainder = gfn;
>>      int i, target = order / EPT_TABLE_ORDER;
>>      int rc = 0;
>> -    int ret = 0;
>>      bool_t direct_mmio = (p2mt == p2m_mmio_direct);
>>      uint8_t ipat = 0;
>>      int need_modify_vtd_table = 1;
>>      int vtd_pte_present = 0;
>> -    int needs_sync = 1;
>> +    int ret, needs_sync = -1;
> 
> Hmmm, another tristate.  I find these hard to follow, esp.  without
> comments to say which non-zero value is which.  I wonder if we could
> add some sort of framework for these to make them clearer.  Ideally
> we'd have an enum-like type and rely on the compiler to DTRT about
> optimizing the tests.
> 
> In this case, where needs_sync isn't being passed as a return value, I
> think we'd be fine with two booleans in place of this int.

I guess I'll make these tristates distinct enums then...

Jan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 3/6] x86/P2M: simplify write_p2m_entry()
  2014-04-24 16:00   ` Tim Deegan
@ 2014-04-24 16:22     ` Jan Beulich
  0 siblings, 0 replies; 21+ messages in thread
From: Jan Beulich @ 2014-04-24 16:22 UTC (permalink / raw)
  To: Tim Deegan
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

>>> On 24.04.14 at 18:00, <tim@xen.org> wrote:
> At 13:30 +0100 on 22 Apr (1398169800), Jan Beulich wrote:
>> The "table_mfn" parameter really isn't needed anywhere, so it gets
>> dropped.
>> 
>> The "struct vcpu *" one was always bogus (as was being made up by
>> paging_write_p2m_entry()), and is not commonly used. It can be easily
>> enough made up in the one place (sh_unshadow_for_p2m_change()) it is
>> needed, and we can otherwise pass "struct domain *" instead, properly
>> reflecting that P2M operations are per-domain.
>> 
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Reviewed-by: Tim Deegan <tim@xen.org>
> 
> and I think should be applicable independent of the rest of the
> series.

Yes indeed, all that is required is that it precedes the NPT patches. I
guess I'll commit it some time tomorrow unless you tell me otherwise.

Jan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 4/6] x86/NPT: don't walk entire page tables when changing types on a range
  2014-04-22 12:30 ` [PATCH v2 4/6] x86/NPT: don't walk entire page tables when changing types on a range Jan Beulich
@ 2014-04-24 16:25   ` Tim Deegan
  2014-04-25  6:13     ` Jan Beulich
  0 siblings, 1 reply; 21+ messages in thread
From: Tim Deegan @ 2014-04-24 16:25 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

At 13:30 +0100 on 22 Apr (1398169846), Jan Beulich wrote:
> This builds on the fact that in order for no NPF VM exit to occur,
> _PAGE_USER must always be set. I.e. by clearing the flag we can force a
> VM exit allowing us to do similar lazy type changes as on EPT.
> 
> That way, the generic entry-wise code can go away, and we could remove
> the range restriction enforced on HVMOP_track_dirty_vram for XSA-27.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -2557,7 +2557,16 @@ void svm_vmexit_handler(struct cpu_user_
>          perfc_incra(svmexits, VMEXIT_NPF_PERFC);
>          if ( cpu_has_svm_decode )
>              v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
> -        svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
> +        rc = p2m_npt_fault(vmcb->exitinfo2);

Can we limit the classes of fault that we need to check for
recalc on?  e.g. if bit 0 of the error code is clear, then the page is
not mapped at all. 
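
Roughly (a sketch against the quoted hunk, assuming the NPF error code
in exitinfo1 follows the usual PFEC layout):

        if ( vmcb->exitinfo1 & PFEC_page_present )
            rc = p2m_npt_fault(vmcb->exitinfo2);

so that non-present faults would keep going straight to
svm_do_nested_pgfault().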

I'm unsure about having two different fixup paths here anyway -- one
for log-dirty and one for everything else (PoD, sharing &c).  This
call should probably go _inside_ svm_do_nested_pgfault() at least.

Also, maybe it wants a more descriptive name than p2m_npt_fault().
p2m_pt_handle_misconfig(), to match the EPT equivalent?

> +static int p2m_pt_change_entry_type_range(struct p2m_domain *p2m,
> +                                          p2m_type_t ot, p2m_type_t nt,
> +                                          unsigned long first_gfn,
> +                                          unsigned long last_gfn)
> +{
> +    unsigned long mask = (1 << PAGETABLE_ORDER) - 1;
> +    unsigned int i;
> +    int err = 0;
> +
> +    ASSERT(hap_enabled(p2m->domain));
> +
> +    for ( i = 1; i <= 4; )
> +    {
> +        if ( first_gfn & mask )
> +        {
> +            unsigned long end_gfn = min(first_gfn | mask, last_gfn);
> +
> +            err = p2m_pt_set_recalc_range(p2m, i, first_gfn, end_gfn);
> +            if ( err || end_gfn >= last_gfn )
> +                break;
> +            first_gfn = end_gfn + 1;
> +        }
> +        else if ( (last_gfn & mask) != mask )
> +        {
> +            unsigned long start_gfn = max(first_gfn, last_gfn & ~mask);
> +
> +            err = p2m_pt_set_recalc_range(p2m, i, start_gfn, last_gfn);
> +            if ( err || start_gfn <= first_gfn )
> +                break;
> +            last_gfn = start_gfn - 1;
> +        }
> +        else
> +        {
> +            ++i;
> +            mask |= mask << PAGETABLE_ORDER;
> +        }
> +    }
> +
> +    return err;
> +}

This looks like it could be merged with the EPT equivalent.  Or would
that be too unwieldy?

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 5/6] x86/NPT: don't walk entire page tables when globally changing types
  2014-04-22 12:31 ` [PATCH v2 5/6] x86/NPT: don't walk entire page tables when globally changing types Jan Beulich
@ 2014-04-24 16:28   ` Tim Deegan
  0 siblings, 0 replies; 21+ messages in thread
From: Tim Deegan @ 2014-04-24 16:28 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

At 13:31 +0100 on 22 Apr (1398169898), Jan Beulich wrote:
> Instead leverage the NPF VM exit enforcement by marking just the top
> level entries as needing recalculation of their type, building on the
> respective range type change modifications.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Tim Deegan <tim@xen.org>

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 6/6] x86/P2M: cleanup
  2014-04-22 12:32 ` [PATCH v2 6/6] x86/P2M: cleanup Jan Beulich
  2014-04-22 12:37   ` Andrew Cooper
@ 2014-04-24 16:30   ` Tim Deegan
  1 sibling, 0 replies; 21+ messages in thread
From: Tim Deegan @ 2014-04-24 16:30 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

At 13:32 +0100 on 22 Apr (1398169932), Jan Beulich wrote:
> - don't abuse __PAGE_HYPERVISOR
> - don't use bogus constructs like mfn_x(_mfn())
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Tim Deegan <tim@xen.org>

Again, suitable to go in independently of the rest.

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 4/6] x86/NPT: don't walk entire page tables when changing types on a range
  2014-04-24 16:25   ` Tim Deegan
@ 2014-04-25  6:13     ` Jan Beulich
  2014-04-25  8:43       ` Tim Deegan
  0 siblings, 1 reply; 21+ messages in thread
From: Jan Beulich @ 2014-04-25  6:13 UTC (permalink / raw)
  To: Tim Deegan
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

>>> On 24.04.14 at 18:25, <tim@xen.org> wrote:
> At 13:30 +0100 on 22 Apr (1398169846), Jan Beulich wrote:
>> This builds on the fact that in order for no NPF VM exit to occur,
>> _PAGE_USER must always be set. I.e. by clearing the flag we can force a
>> VM exit allowing us to do similar lazy type changes as on EPT.
>> 
>> That way, the generic entry-wise code can go away, and we could remove
>> the range restriction in enforced on HVMOP_track_dirty_vram for XSA-27.
>> 
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> 
>> --- a/xen/arch/x86/hvm/svm/svm.c
>> +++ b/xen/arch/x86/hvm/svm/svm.c
>> @@ -2557,7 +2557,16 @@ void svm_vmexit_handler(struct cpu_user_
>>          perfc_incra(svmexits, VMEXIT_NPF_PERFC);
>>          if ( cpu_has_svm_decode )
>>              v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
>> -        svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
>> +        rc = p2m_npt_fault(vmcb->exitinfo2);
> 
> Can we limit the classes of fault that we need to check for
> recalc on?  e.g. if bit 0 of the error code is clear, then the page is
> not mapped at all. 

That's a good suggestion (albeit I'm not sure if there really are
frequent faults with P clear - with paging perhaps, but that's a slow
path anyway). Ideally we would have an indication that the fault
was because of the U bit clear, but sadly that doesn't exist.

> I'm unsure about having two different fixup paths here anyway -- one
> for log-dirty and one for everything else (PoD, sharing &c).  This
> call should probably go _inside_ svm_do_nested_pgfault() at least.

I intentionally kept it separate, so it's not getting farther away than
necessary from how EPT handling is structured.

> Also, maybe it wants a more descriptive name than p2m_npt_fault().
> p2m_pt_hnadle_misconfig(), to match the EPT equivalent?

I first considered naming it that way, but it's not really a mis-
configuration. But if you think that's a better name despite not
describing what it really does, I'm okay changing it...

>> +static int p2m_pt_change_entry_type_range(struct p2m_domain *p2m,
>> +                                          p2m_type_t ot, p2m_type_t nt,
>> +                                          unsigned long first_gfn,
>> +                                          unsigned long last_gfn)
>> +{
>> +    unsigned long mask = (1 << PAGETABLE_ORDER) - 1;
>> +    unsigned int i;
>> +    int err = 0;
>> +
>> +    ASSERT(hap_enabled(p2m->domain));
>> +
>> +    for ( i = 1; i <= 4; )
>> +    {
>> +        if ( first_gfn & mask )
>> +        {
>> +            unsigned long end_gfn = min(first_gfn | mask, last_gfn);
>> +
>> +            err = p2m_pt_set_recalc_range(p2m, i, first_gfn, end_gfn);
>> +            if ( err || end_gfn >= last_gfn )
>> +                break;
>> +            first_gfn = end_gfn + 1;
>> +        }
>> +        else if ( (last_gfn & mask) != mask )
>> +        {
>> +            unsigned long start_gfn = max(first_gfn, last_gfn & ~mask);
>> +
>> +            err = p2m_pt_set_recalc_range(p2m, i, start_gfn, last_gfn);
>> +            if ( err || start_gfn <= first_gfn )
>> +                break;
>> +            last_gfn = start_gfn - 1;
>> +        }
>> +        else
>> +        {
>> +            ++i;
>> +            mask |= mask << PAGETABLE_ORDER;
>> +        }
>> +    }
>> +
>> +    return err;
>> +}
> 
> This looks like it could be merged with the EPT equivalent.  Or would
> that be too unwieldy?

I considered unifying this and/or its helpers, but decided that the
result would be uglier (too many parameters just to tell one
implementation from the other) than the duplication resulting from
this approach.

Jan

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 4/6] x86/NPT: don't walk entire page tables when changing types on a range
  2014-04-25  6:13     ` Jan Beulich
@ 2014-04-25  8:43       ` Tim Deegan
  0 siblings, 0 replies; 21+ messages in thread
From: Tim Deegan @ 2014-04-25  8:43 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

At 07:13 +0100 on 25 Apr (1398406414), Jan Beulich wrote:
> >>> On 24.04.14 at 18:25, <tim@xen.org> wrote:
> > At 13:30 +0100 on 22 Apr (1398169846), Jan Beulich wrote:
> >> This builds on the fact that in order for no NPF VM exit to occur,
> >> _PAGE_USER must always be set. I.e. by clearing the flag we can force a
> >> VM exit allowing us to do similar lazy type changes as on EPT.
> >> 
> >> That way, the generic entry-wise code can go away, and we could remove
> >> the range restriction enforced on HVMOP_track_dirty_vram for XSA-27.
> >> 
> >> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >> 
> >> --- a/xen/arch/x86/hvm/svm/svm.c
> >> +++ b/xen/arch/x86/hvm/svm/svm.c
> >> @@ -2557,7 +2557,16 @@ void svm_vmexit_handler(struct cpu_user_
> >>          perfc_incra(svmexits, VMEXIT_NPF_PERFC);
> >>          if ( cpu_has_svm_decode )
> >>              v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
> >> -        svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
> >> +        rc = p2m_npt_fault(vmcb->exitinfo2);
> > 
> > Can we limit the classes of fault that we need to check for
> > recalc on?  e.g. if bit 0 of the error code is clear, then the page is
> > not mapped at all. 
> 
> That's a good suggestion (albeit I'm not sure if there really are
> frequent faults with P clear - with paging perhaps, but that's a slow
> path anyway). Ideally we would have an indication that the fault
> was because of the U bit clear, but sadly that doesn't exist.

Yeah, that's a shame.

> > I'm unsure about having two different fixup paths here anyway -- one
> > for log-dirty and one for everything else (PoD, sharing &c).  This
> > call should probably go _inside_ svm_do_nested_pgfault() at least.
> 
> I intentionally kept it separate, so it's not getting farther away than
> necessary from how EPT handling is structured.

Hmm.  Well, in that case I guess it's OK here.  The alternative would
be to put it right into the HVM nested fault handler and replumb the
EPT path to go the same way, but I can see that would be inefficient...

> > Also, maybe it wants a more descriptive name than p2m_npt_fault().
> > p2m_pt_handle_misconfig(), to match the EPT equivalent?
> 
> I first considered naming it that way, but it's not really a mis-
> configuration. But if you think that's a better name despite not
> describing what it really does, I'm okay changing it...

I think I do prefer it, but maybe we can find a better name for both
of them: {p2m_pt,ept}_handle_deferred_changes()?

> > This looks like it could be merged with the EPT equivalent.  Or would
> > that be too unwieldy?
> 
> I considered unifying this and/or its helpers, but decided that the
> result would be uglier (too many parameters just to tell one
> implementation from the other) than the duplication resulting from
> this approach.

OK, fair enough.

Tim.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types
  2014-04-24 16:20     ` Jan Beulich
@ 2014-04-25  8:44       ` Tim Deegan
  0 siblings, 0 replies; 21+ messages in thread
From: Tim Deegan @ 2014-04-25  8:44 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Eddie Dong,
	Jun Nakajima, xen-devel, Boris Ostrovsky

At 17:20 +0100 on 24 Apr (1398356409), Jan Beulich wrote:
> >>> On 24.04.14 at 17:41, <tim@xen.org> wrote:
> > At 13:28 +0100 on 22 Apr (1398169701), Jan Beulich wrote:
> >> --- a/xen/arch/x86/mm/hap/hap.c
> >> +++ b/xen/arch/x86/mm/hap/hap.c
> >> @@ -110,11 +110,18 @@ int hap_track_dirty_vram(struct domain *
> >>          if ( begin_pfn != dirty_vram->begin_pfn ||
> >>               begin_pfn + nr != dirty_vram->end_pfn )
> >>          {
> >> +            unsigned long ostart = dirty_vram->begin_pfn;
> >> +            unsigned long oend = dirty_vram->end_pfn;
> >> +
> >>              dirty_vram->begin_pfn = begin_pfn;
> >>              dirty_vram->end_pfn = begin_pfn + nr;
> >>  
> >>              paging_unlock(d);
> >>  
> >> +            if ( oend > ostart )
> >> +                p2m_change_type_range(d, ostart, oend,
> >> +                                      p2m_ram_logdirty, p2m_ram_rw);
> >> +
> > 
> > Does this (and the simlar change below) not risk losing updates if
> > we're in full log-dirty mode?  I think it should be fine to leave
> > those entries as log-dirty, since at worst they'll provoke another
> > fault-and-fixup.
> 
> I don't think any updates can be lost: When their types get re-
> calculated, p2m_is_logdirty_range() (taking p2m->global_logdirty
> into account) will still cause them to become p2m_ram_logdirty
> again.

Right, I see.  That's fine then.
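I.e. for a gfn in the old VRAM range the recalculation in effect does
something like (my own sketch, just to check my reading, using the
p2m_is_logdirty_range() helper this series adds):

    nt = p2m_is_logdirty_range(p2m, gfn, gfn)
         ? p2m_ram_logdirty   /* still tracked globally or by range */
         : p2m_ram_rw;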

> > In this case, where needs_sync isn't being passed as a return value, I
> > think we'd be fine with two booleans in place of this int.
> 
> I guess I'll make these tristates distinct enums then...

Thanks!

Tim.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types
  2014-04-22 12:28 ` [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types Jan Beulich
  2014-04-24 15:41   ` Tim Deegan
@ 2014-04-25 19:37   ` Konrad Rzeszutek Wilk
  2014-04-28  7:19     ` Jan Beulich
  1 sibling, 1 reply; 21+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-04-25 19:37 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Kevin Tian, Keir Fraser, Jun Nakajima, Tim Deegan, Eddie Dong,
	suravee.suthikulpanit, xen-devel, Boris Ostrovsky

> --- a/xen/include/asm-x86/hvm/vmx/vmx.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmx.h
> @@ -38,7 +38,7 @@ typedef union {
>          ipat        :   1,  /* bit 6 - Ignore PAT memory type */
>          sp          :   1,  /* bit 7 - Is this a superpage? */
>          rsvd1       :   2,  /* bits 9:8 - Reserved for future use */
> -        avail1      :   1,  /* bit 10 - Software available 1 */
> +        recalc      :   1,  /* bit 10 - Software available 1 */


The comment probably needs an update.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types
  2014-04-25 19:37   ` Konrad Rzeszutek Wilk
@ 2014-04-28  7:19     ` Jan Beulich
  0 siblings, 0 replies; 21+ messages in thread
From: Jan Beulich @ 2014-04-28  7:19 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Kevin Tian, Keir Fraser, suravee.suthikulpanit, Tim Deegan,
	Eddie Dong, Jun Nakajima, xen-devel, Boris Ostrovsky

>>> On 25.04.14 at 21:37, <konrad.wilk@oracle.com> wrote:
>>  --- a/xen/include/asm-x86/hvm/vmx/vmx.h
>> +++ b/xen/include/asm-x86/hvm/vmx/vmx.h
>> @@ -38,7 +38,7 @@ typedef union {
>>          ipat        :   1,  /* bit 6 - Ignore PAT memory type */
>>          sp          :   1,  /* bit 7 - Is this a superpage? */
>>          rsvd1       :   2,  /* bits 9:8 - Reserved for future use */
>> -        avail1      :   1,  /* bit 10 - Software available 1 */
>> +        recalc      :   1,  /* bit 10 - Software available 1 */
> 
> 
> The comment probably needs an update.

Not sure - I intentionally left it as it was, explaining the meaning it's
being given by the specification, matching e.g. the comment on
sa_p2mt.

Jan

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2014-04-28  7:19 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-04-22 12:22 [PATCH v2 0/6] x86/P2M: reduce time bulk type changes take Jan Beulich
2014-04-22 12:28 ` [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types Jan Beulich
2014-04-24 15:41   ` Tim Deegan
2014-04-24 16:20     ` Jan Beulich
2014-04-25  8:44       ` Tim Deegan
2014-04-25 19:37   ` Konrad Rzeszutek Wilk
2014-04-28  7:19     ` Jan Beulich
2014-04-22 12:29 ` [PATCH v2 2/6] x86/EPT: don't walk entire page tables when changing types on a range Jan Beulich
2014-04-24 16:00   ` Tim Deegan
2014-04-22 12:30 ` [PATCH v2 3/6] x86/P2M: simplify write_p2m_entry() Jan Beulich
2014-04-24 16:00   ` Tim Deegan
2014-04-24 16:22     ` Jan Beulich
2014-04-22 12:30 ` [PATCH v2 4/6] x86/NPT: don't walk entire page tables when changing types on a range Jan Beulich
2014-04-24 16:25   ` Tim Deegan
2014-04-25  6:13     ` Jan Beulich
2014-04-25  8:43       ` Tim Deegan
2014-04-22 12:31 ` [PATCH v2 5/6] x86/NPT: don't walk entire page tables when globally changing types Jan Beulich
2014-04-24 16:28   ` Tim Deegan
2014-04-22 12:32 ` [PATCH v2 6/6] x86/P2M: cleanup Jan Beulich
2014-04-22 12:37   ` Andrew Cooper
2014-04-24 16:30   ` Tim Deegan
