All of lore.kernel.org
 help / color / mirror / Atom feed
From: David Vrabel <david.vrabel@citrix.com>
To: xen-devel@lists.xenproject.org
Cc: Kevin Tian <kevin.tian@intel.com>,
	Jun Nakajima <jun.nakajima@intel.com>,
	George Dunlap <george.dunlap@eu.citrix.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	David Vrabel <david.vrabel@citrix.com>,
	Jan Beulich <jbeulich@suse.com>
Subject: [PATCHv5 2/2] x86/ept: defer the invalidation until the p2m lock is released
Date: Thu, 17 Dec 2015 15:17:06 +0000	[thread overview]
Message-ID: <1450365426-23167-3-git-send-email-david.vrabel@citrix.com> (raw)
In-Reply-To: <1450365426-23167-1-git-send-email-david.vrabel@citrix.com>

Holding the p2m lock while calling ept_sync_domain() is very expensive
since it does a on_selected_cpus() call.  IPIs on many socket machines
can be very slows and on_selected_cpus() is serialized.

It is safe to defer the invalidate until the p2m lock is released
except for two cases:

1. When freeing a page table page (since partial translations may be
   cached).
2. When reclaiming a zero page as part of PoD.

For these cases, add p2m_tlb_flush_sync() calls which will immediately
perform the invalidate before the page is freed or reclaimed.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
v5:
- add p2m_tlb_flush_sync() and call it before freeing pgae table pages
  and reclaiming zeroed pod pages.

v2:
- use per-p2m list for deferred pages.
- update synced_mask while holding write lock.
---
 xen/arch/x86/mm/mm-locks.h | 23 +++++++++++++++--------
 xen/arch/x86/mm/p2m-ept.c  | 44 ++++++++++++++++++++++++++++++++++++--------
 xen/arch/x86/mm/p2m-pod.c  |  2 ++
 xen/arch/x86/mm/p2m.c      | 14 ++++++++++++++
 xen/include/asm-x86/p2m.h  | 10 ++++++++++
 5 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
index 76c7217..b1a92e2 100644
--- a/xen/arch/x86/mm/mm-locks.h
+++ b/xen/arch/x86/mm/mm-locks.h
@@ -263,14 +263,21 @@ declare_mm_lock(altp2mlist)
  */
 
 declare_mm_rwlock(altp2m);
-#define p2m_lock(p)                         \
-{                                           \
-    if ( p2m_is_altp2m(p) )                 \
-        mm_write_lock(altp2m, &(p)->lock);  \
-    else                                    \
-        mm_write_lock(p2m, &(p)->lock);     \
-}
-#define p2m_unlock(p)         mm_write_unlock(&(p)->lock);
+#define p2m_lock(p)                             \
+    do {                                        \
+        if ( p2m_is_altp2m(p) )                 \
+            mm_write_lock(altp2m, &(p)->lock);  \
+        else                                    \
+            mm_write_lock(p2m, &(p)->lock);     \
+        (p)->defer_flush++;                     \
+    } while (0)
+#define p2m_unlock(p)                           \
+    do {                                        \
+        if ( --(p)->defer_flush == 0 )          \
+            p2m_tlb_flush_and_unlock(p);        \
+        else                                    \
+            mm_write_unlock(&(p)->lock);        \
+    } while (0)
 #define gfn_lock(p,g,o)       p2m_lock(p)
 #define gfn_unlock(p,g,o)     p2m_unlock(p)
 #define p2m_read_lock(p)      mm_read_lock(p2m, &(p)->lock)
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 6e0cf89..69fff61 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -843,7 +843,10 @@ out:
        from the iommu tables, so as to avoid a potential
        use-after-free. */
     if ( is_epte_present(&old_entry) )
+    {
+        p2m_tlb_flush_sync(p2m);
         ept_free_entry(p2m, &old_entry, target);
+    }
 
     if ( rc == 0 && p2m_is_hostp2m(p2m) )
         p2m_altp2m_propagate_change(d, _gfn(gfn), mfn, order, p2mt, p2ma);
@@ -1095,24 +1098,48 @@ static void __ept_sync_domain(void *info)
      */
 }
 
-void ept_sync_domain(struct p2m_domain *p2m)
+static void ept_sync_domain_prepare(struct p2m_domain *p2m)
 {
     struct domain *d = p2m->domain;
     struct ept_data *ept = &p2m->ept;
-    /* Only if using EPT and this domain has some VCPUs to dirty. */
-    if ( !paging_mode_hap(d) || !d->vcpu || !d->vcpu[0] )
-        return;
-
-    ASSERT(local_irq_is_enabled());
 
     if ( nestedhvm_enabled(d) && !p2m_is_nestedp2m(p2m) )
         p2m_flush_nestedp2m(d);
 
     /* May need to invalidate on all PCPUs. */
     cpumask_setall(ept->invalidate);
+}
+
+static void ept_sync_domain_mask(struct p2m_domain *p2m, const cpumask_t *mask)
+{
+    on_selected_cpus(mask, __ept_sync_domain, p2m, 1);
+}
 
-    on_selected_cpus(d->domain_dirty_cpumask,
-                     __ept_sync_domain, p2m, 1);
+void ept_sync_domain(struct p2m_domain *p2m)
+{
+    struct domain *d = p2m->domain;
+
+    /* Only if using EPT and this domain has some VCPUs to dirty. */
+    if ( !paging_mode_hap(d) || !d->vcpu || !d->vcpu[0] )
+        return;
+
+    ept_sync_domain_prepare(p2m);
+
+    if ( p2m->defer_flush )
+    {
+        p2m->need_flush = 1;
+        return;
+    }
+
+    ept_sync_domain_mask(p2m, d->domain_dirty_cpumask);
+}
+
+static void ept_flush_and_unlock(struct p2m_domain *p2m, bool_t unlock)
+{
+    p2m->need_flush = 0;
+    if ( unlock )
+        mm_write_unlock(&p2m->lock);
+    ept_sync_domain_mask(p2m, p2m->domain->domain_dirty_cpumask);
 }
 
 static void ept_enable_pml(struct p2m_domain *p2m)
@@ -1163,6 +1190,7 @@ int ept_p2m_init(struct p2m_domain *p2m)
     p2m->change_entry_type_range = ept_change_entry_type_range;
     p2m->memory_type_changed = ept_memory_type_changed;
     p2m->audit_p2m = NULL;
+    p2m->flush_and_unlock = ept_flush_and_unlock;
 
     /* Set the memory type used when accessing EPT paging structures. */
     ept->ept_mt = EPT_DEFAULT_MT;
diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
index ea16d3e..a5d672e 100644
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -886,6 +886,8 @@ p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
         }
     }
 
+    p2m_tlb_flush_sync(p2m);
+
     /* Now check each page for real */
     for ( i=0; i < count; i++ )
     {
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index ed0bbd7..efb15cd 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -324,6 +324,20 @@ void p2m_flush_hardware_cached_dirty(struct domain *d)
     }
 }
 
+void p2m_tlb_flush_sync(struct p2m_domain *p2m)
+{
+    if ( p2m->need_flush )
+        p2m->flush_and_unlock(p2m, 0);
+}
+
+void p2m_tlb_flush_and_unlock(struct p2m_domain *p2m)
+{
+    if ( p2m->need_flush )
+        p2m->flush_and_unlock(p2m, 1);
+    else
+        mm_write_unlock(&p2m->lock);
+}
+
 mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
                     p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
                     unsigned int *page_order, bool_t locked)
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index fa46dd9..9c394c2 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -261,6 +261,10 @@ struct p2m_domain {
                                           unsigned long gfn, l1_pgentry_t *p,
                                           l1_pgentry_t new, unsigned int level);
     long               (*audit_p2m)(struct p2m_domain *p2m);
+    void               (*flush_and_unlock)(struct p2m_domain *p2m, bool_t unlock);
+
+    unsigned int defer_flush;
+    bool_t need_flush;
 
     /* Default P2M access type for each page in the the domain: new pages,
      * swapped in pages, cleared pages, and pages that are ambiguously
@@ -353,6 +357,12 @@ static inline bool_t p2m_is_altp2m(const struct p2m_domain *p2m)
 
 #define p2m_get_pagetable(p2m)  ((p2m)->phys_table)
 
+/*
+ * Ensure any deferred p2m TLB flush has been completed on all VCPUs.
+ */
+void p2m_tlb_flush_sync(struct p2m_domain *p2m);
+void p2m_tlb_flush_and_unlock(struct p2m_domain *p2m);
+
 /**** p2m query accessors. They lock p2m_lock, and thus serialize
  * lookups wrt modifications. They _do not_ release the lock on exit.
  * After calling any of the variants below, caller needs to use
-- 
2.1.4

  parent reply	other threads:[~2015-12-17 15:18 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-12-17 15:17 [PATCHv5 0/2] x86/ept: reduce translation invalidation impact David Vrabel
2015-12-17 15:17 ` [PATCHv5 1/2] x86/ept: invalidate guest physical mappings on VMENTER David Vrabel
2015-12-17 15:56   ` Andrew Cooper
2015-12-18  7:53   ` Tian, Kevin
2015-12-18 10:18     ` David Vrabel
2015-12-18 10:24       ` George Dunlap
2015-12-18 10:30         ` David Vrabel
2015-12-18 10:32       ` Jan Beulich
2015-12-18 11:37         ` David Vrabel
2015-12-17 15:17 ` David Vrabel [this message]
2015-12-18  7:46 ` [PATCHv5 0/2] x86/ept: reduce translation invalidation impact Tian, Kevin
2015-12-18 10:12   ` David Vrabel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1450365426-23167-3-git-send-email-david.vrabel@citrix.com \
    --to=david.vrabel@citrix.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=george.dunlap@eu.citrix.com \
    --cc=jbeulich@suse.com \
    --cc=jun.nakajima@intel.com \
    --cc=kevin.tian@intel.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.