[PATCHv3 3/3] p2m: convert p2m rwlock to percpu rwlock

From: Malcolm Crossley <malcolm.crossley@citrix.com>
To: malcolm.crossley@citrix.com, JBeulich@suse.com,
	ian.campbell@citrix.com, andrew.cooper3@citrix.com,
	Marcos.Matsunaga@oracle.com, keir@xen.org,
	konrad.wilk@oracle.com, george.dunlap@eu.citrix.com
Cc: xen-devel@lists.xenproject.org, dario.faggioli@citrix.com,
	stefano.stabellini@citrix.com
Subject: [PATCHv3 3/3] p2m: convert p2m rwlock to percpu rwlock
Date: Thu, 17 Dec 2015 12:52:27 +0000	[thread overview]
Message-ID: <1450356747-29039-4-git-send-email-malcolm.crossley@citrix.com> (raw)
In-Reply-To: <1450356747-29039-1-git-send-email-malcolm.crossley@citrix.com>

The per domain p2m read lock suffers from significant contention when
performance multi-queue block or network IO due to the parallel
grant map/unmaps/copies occuring on the DomU's p2m.

On multi-socket systems, the contention results in the locked compare swap
operation failing frequently which results in a tight loop of retries of the
compare swap operation. As the coherency fabric can only support a specific
rate of compare swap operations for a particular data location then taking
the read lock itself becomes a bottleneck for p2m operations.

Percpu rwlock p2m performance with the same configuration is approximately
64 gbit/s vs the 48 gbit/s with grant table percpu rwlocks only.

Oprofile was used to determine the initial overhead of the read-write locks
and to confirm the overhead was dramatically reduced by the percpu rwlocks.

Note: altp2m users will not achieve a gain if they take an altp2m read lock
simultaneously with the main p2m lock.

Signed-off-by: Malcolm Crossley <malcolm.crossley@citrix.com>
--
Changes since v2
- Updated local percpu rwlock initialisation
---
 xen/arch/x86/mm/mm-locks.h | 12 +++++++-----
 xen/arch/x86/mm/p2m.c      |  1 +
 xen/include/asm-x86/mm.h   |  2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
index 76c7217..8a40986 100644
--- a/xen/arch/x86/mm/mm-locks.h
+++ b/xen/arch/x86/mm/mm-locks.h
@@ -31,6 +31,8 @@
 DECLARE_PER_CPU(int, mm_lock_level);
 #define __get_lock_level()  (this_cpu(mm_lock_level))
 
+DECLARE_PERCPU_RWLOCK_GLOBAL(p2m_percpu_rwlock);
+
 static inline void mm_lock_init(mm_lock_t *l)
 {
     spin_lock_init(&l->lock);
@@ -99,7 +101,7 @@ static inline void _mm_enforce_order_lock_post(int level, int *unlock_level,
 
 static inline void mm_rwlock_init(mm_rwlock_t *l)
 {
-    rwlock_init(&l->lock);
+    percpu_rwlock_resource_init(&l->lock, p2m_percpu_rwlock);
     l->locker = -1;
     l->locker_function = "nobody";
     l->unlock_level = 0;
@@ -115,7 +117,7 @@ static inline void _mm_write_lock(mm_rwlock_t *l, const char *func, int level)
     if ( !mm_write_locked_by_me(l) )
     {
         __check_lock_level(level);
-        write_lock(&l->lock);
+        percpu_write_lock(p2m_percpu_rwlock, &l->lock);
         l->locker = get_processor_id();
         l->locker_function = func;
         l->unlock_level = __get_lock_level();
@@ -131,20 +133,20 @@ static inline void mm_write_unlock(mm_rwlock_t *l)
     l->locker = -1;
     l->locker_function = "nobody";
     __set_lock_level(l->unlock_level);
-    write_unlock(&l->lock);
+    percpu_write_unlock(p2m_percpu_rwlock, &l->lock);
 }
 
 static inline void _mm_read_lock(mm_rwlock_t *l, int level)
 {
     __check_lock_level(level);
-    read_lock(&l->lock);
+    percpu_read_lock(p2m_percpu_rwlock, &l->lock);
     /* There's nowhere to store the per-CPU unlock level so we can't
      * set the lock level. */
 }
 
 static inline void mm_read_unlock(mm_rwlock_t *l)
 {
-    read_unlock(&l->lock);
+    percpu_read_unlock(p2m_percpu_rwlock, &l->lock);
 }
 
 /* This wrapper uses the line number to express the locking order below */
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index ed0bbd7..a45ee35 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -54,6 +54,7 @@ boolean_param("hap_2mb", opt_hap_2mb);
 #undef page_to_mfn
 #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
 
+DEFINE_PERCPU_RWLOCK_GLOBAL(p2m_percpu_rwlock);
 
 /* Init the datastructures for later use by the p2m code */
 static int p2m_initialise(struct domain *d, struct p2m_domain *p2m)
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 67b34c6..78511dd 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -566,7 +566,7 @@ typedef struct mm_lock {
 } mm_lock_t;
 
 typedef struct mm_rwlock {
-    rwlock_t           lock;
+    percpu_rwlock_t    lock;
     int                unlock_level;
     int                recurse_count;
     int                locker; /* CPU that holds the write lock */
-- 
1.7.12.4