All of lore.kernel.org
 help / color / mirror / Atom feed
From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: michel@lespinasse.org, jglisse@google.com, mhocko@suse.com,
	vbabka@suse.cz, hannes@cmpxchg.org, mgorman@techsingularity.net,
	dave@stgolabs.net, willy@infradead.org, liam.howlett@oracle.com,
	peterz@infradead.org, ldufour@linux.ibm.com, paulmck@kernel.org,
	mingo@redhat.com, will@kernel.org, luto@kernel.org,
	songliubraving@fb.com, peterx@redhat.com, david@redhat.com,
	dhowells@redhat.com, hughd@google.com, bigeasy@linutronix.de,
	kent.overstreet@linux.dev, punit.agrawal@bytedance.com,
	lstoakes@gmail.com, peterjung1337@gmail.com, rientjes@google.com,
	axelrasmussen@google.com, joelaf@google.com, minchan@google.com,
	rppt@kernel.org, jannh@google.com, shakeelb@google.com,
	tatashin@google.com, edumazet@google.com, gthelen@google.com,
	gurua@google.com, arjunroy@google.com, soheil@google.com,
	leewalsh@google.com, posk@google.com, linux-mm@kvack.org,
	linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org, x86@kernel.org,
	linux-kernel@vger.kernel.org, kernel-team@android.com,
	surenb@google.com
Subject: [PATCH v2 33/33] mm: separate vma->lock from vm_area_struct
Date: Fri, 27 Jan 2023 11:41:10 -0800	[thread overview]
Message-ID: <20230127194110.533103-34-surenb@google.com> (raw)
In-Reply-To: <20230127194110.533103-1-surenb@google.com>

vma->lock being part of the vm_area_struct causes performance regression
during page faults because during contention its count and owner fields
are constantly updated and having other parts of vm_area_struct used
during page fault handling next to them causes constant cache line
bouncing. Fix that by moving the lock outside of the vm_area_struct.
All attempts to keep vma->lock inside vm_area_struct in a separate
cache line still produce performance regression especially on NUMA
machines. Smallest regression was achieved when lock is placed in the
fourth cache line but that bloats vm_area_struct to 256 bytes.
Considering performance and memory impact, separate lock looks like
the best option. It increases memory footprint of each VMA but that
can be optimized later if the new size causes issues.
Note that after this change vma_init() does not allocate or
initialize vma->lock anymore. A number of drivers allocate a pseudo
VMA on the stack but they never use the VMA's lock, therefore it does
not need to be allocated. The future drivers which might need the VMA
lock should use vm_area_alloc()/vm_area_free() to allocate the VMA.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h       | 23 ++++++-------
 include/linux/mm_types.h |  6 +++-
 kernel/fork.c            | 73 ++++++++++++++++++++++++++++++++--------
 3 files changed, 74 insertions(+), 28 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c4ddcd6fd84..52e048c31239 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -621,12 +621,6 @@ struct vm_operations_struct {
 };
 
 #ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_init_lock(struct vm_area_struct *vma)
-{
-	init_rwsem(&vma->lock);
-	vma->vm_lock_seq = -1;
-}
-
 /*
  * Try to read-lock a vma. The function is allowed to occasionally yield false
  * locked result to avoid performance overhead, in which case we fall back to
@@ -638,17 +632,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
 		return false;
 
-	if (unlikely(down_read_trylock(&vma->lock) == 0))
+	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
 		return false;
 
 	/*
 	 * Overflow might produce false locked result.
 	 * False unlocked result is impossible because we modify and check
-	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
 	 * modification invalidates all existing locks.
 	 */
 	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
-		up_read(&vma->lock);
+		up_read(&vma->vm_lock->lock);
 		return false;
 	}
 	return true;
@@ -657,7 +651,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
 	rcu_read_lock(); /* keeps vma alive till the end of up_read */
-	up_read(&vma->lock);
+	up_read(&vma->vm_lock->lock);
 	rcu_read_unlock();
 }
 
@@ -675,9 +669,9 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	if (vma->vm_lock_seq == mm_lock_seq)
 		return;
 
-	down_write(&vma->lock);
+	down_write(&vma->vm_lock->lock);
 	vma->vm_lock_seq = mm_lock_seq;
-	up_write(&vma->lock);
+	up_write(&vma->vm_lock->lock);
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -704,6 +698,10 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
 
 #endif /* CONFIG_PER_VMA_LOCK */
 
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -712,7 +710,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	vma_init_lock(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c4c43f10344a..1e97bb98197c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -471,6 +471,10 @@ struct anon_vma_name {
 	char name[];
 };
 
+struct vma_lock {
+	struct rw_semaphore lock;
+};
+
 /*
  * This struct describes a virtual memory area. There is one of these
  * per VM-area/task. A VM area is any part of the process virtual memory
@@ -510,7 +514,7 @@ struct vm_area_struct {
 
 #ifdef CONFIG_PER_VMA_LOCK
 	int vm_lock_seq;
-	struct rw_semaphore lock;
+	struct vma_lock *vm_lock;
 #endif
 
 	/*
diff --git a/kernel/fork.c b/kernel/fork.c
index d0999de82f94..a152804faa14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+	if (!vma->vm_lock)
+		return false;
+
+	init_rwsem(&vma->vm_lock->lock);
+	vma->vm_lock_seq = -1;
+
+	return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
 
 	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-	if (vma)
-		vma_init(vma, mm);
+	if (!vma)
+		return NULL;
+
+	vma_init(vma, mm);
+	if (!vma_lock_alloc(vma)) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return NULL;
+	}
+
 	return vma;
 }
 
@@ -465,24 +501,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
-	if (new) {
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
-		/*
-		 * orig->shared.rb may be modified concurrently, but the clone
-		 * will be reinitialized.
-		 */
-		data_race(memcpy(new, orig, sizeof(*new)));
-		INIT_LIST_HEAD(&new->anon_vma_chain);
-		vma_init_lock(new);
-		dup_anon_vma_name(orig, new);
+	if (!new)
+		return NULL;
+
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+	/*
+	 * orig->shared.rb may be modified concurrently, but the clone
+	 * will be reinitialized.
+	 */
+	data_race(memcpy(new, orig, sizeof(*new)));
+	if (!vma_lock_alloc(new)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return NULL;
 	}
+	INIT_LIST_HEAD(&new->anon_vma_chain);
+	dup_anon_vma_name(orig, new);
+
 	return new;
 }
 
 void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
+	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
@@ -493,7 +535,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
 						  vm_rcu);
 
 	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma);
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
 	__vm_area_free(vma);
 }
 #endif
@@ -3089,6 +3131,9 @@ void __init proc_caches_init(void)
 			NULL);
 
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
 	mmap_init();
 	nsproxy_cache_init();
 }
-- 
2.39.1


WARNING: multiple messages have this Message-ID (diff)
From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: michel@lespinasse.org, joelaf@google.com, songliubraving@fb.com,
	mhocko@suse.com, leewalsh@google.com, david@redhat.com,
	peterz@infradead.org, bigeasy@linutronix.de, peterx@redhat.com,
	dhowells@redhat.com, linux-mm@kvack.org, edumazet@google.com,
	jglisse@google.com, punit.agrawal@bytedance.com, will@kernel.org,
	arjunroy@google.com, dave@stgolabs.net, minchan@google.com,
	x86@kernel.org, hughd@google.com, willy@infradead.org,
	gurua@google.com, mingo@redhat.com,
	linux-arm-kernel@lists.infradead.org, rientjes@google.com,
	axelrasmussen@google.com, kernel-team@android.com,
	soheil@google.com, paulmck@kernel.org, jannh@google.com,
	liam.howlett@oracle.com, shakeelb@google.com, luto@kernel.org,
	gthelen@google.com, ldufour@linux.ibm.com, surenb@google.com,
	vbabka@suse.cz, posk@google.com, lstoakes@gmail.com,
	peterjung1337@gmail.com, linuxppc-dev@lists.ozlabs.org,
	kent.overstreet@linux.dev, linux-kernel@vger.kernel.org,
	hannes@cmpxchg.org, tatashin@google.com,
	mgorman@techsingularity.net, rp pt@kernel.org
Subject: [PATCH v2 33/33] mm: separate vma->lock from vm_area_struct
Date: Fri, 27 Jan 2023 11:41:10 -0800	[thread overview]
Message-ID: <20230127194110.533103-34-surenb@google.com> (raw)
In-Reply-To: <20230127194110.533103-1-surenb@google.com>

vma->lock being part of the vm_area_struct causes performance regression
during page faults because during contention its count and owner fields
are constantly updated and having other parts of vm_area_struct used
during page fault handling next to them causes constant cache line
bouncing. Fix that by moving the lock outside of the vm_area_struct.
All attempts to keep vma->lock inside vm_area_struct in a separate
cache line still produce performance regression especially on NUMA
machines. Smallest regression was achieved when lock is placed in the
fourth cache line but that bloats vm_area_struct to 256 bytes.
Considering performance and memory impact, separate lock looks like
the best option. It increases memory footprint of each VMA but that
can be optimized later if the new size causes issues.
Note that after this change vma_init() does not allocate or
initialize vma->lock anymore. A number of drivers allocate a pseudo
VMA on the stack but they never use the VMA's lock, therefore it does
not need to be allocated. The future drivers which might need the VMA
lock should use vm_area_alloc()/vm_area_free() to allocate the VMA.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h       | 23 ++++++-------
 include/linux/mm_types.h |  6 +++-
 kernel/fork.c            | 73 ++++++++++++++++++++++++++++++++--------
 3 files changed, 74 insertions(+), 28 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c4ddcd6fd84..52e048c31239 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -621,12 +621,6 @@ struct vm_operations_struct {
 };
 
 #ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_init_lock(struct vm_area_struct *vma)
-{
-	init_rwsem(&vma->lock);
-	vma->vm_lock_seq = -1;
-}
-
 /*
  * Try to read-lock a vma. The function is allowed to occasionally yield false
  * locked result to avoid performance overhead, in which case we fall back to
@@ -638,17 +632,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
 		return false;
 
-	if (unlikely(down_read_trylock(&vma->lock) == 0))
+	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
 		return false;
 
 	/*
 	 * Overflow might produce false locked result.
 	 * False unlocked result is impossible because we modify and check
-	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
 	 * modification invalidates all existing locks.
 	 */
 	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
-		up_read(&vma->lock);
+		up_read(&vma->vm_lock->lock);
 		return false;
 	}
 	return true;
@@ -657,7 +651,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
 	rcu_read_lock(); /* keeps vma alive till the end of up_read */
-	up_read(&vma->lock);
+	up_read(&vma->vm_lock->lock);
 	rcu_read_unlock();
 }
 
@@ -675,9 +669,9 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	if (vma->vm_lock_seq == mm_lock_seq)
 		return;
 
-	down_write(&vma->lock);
+	down_write(&vma->vm_lock->lock);
 	vma->vm_lock_seq = mm_lock_seq;
-	up_write(&vma->lock);
+	up_write(&vma->vm_lock->lock);
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -704,6 +698,10 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
 
 #endif /* CONFIG_PER_VMA_LOCK */
 
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -712,7 +710,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	vma_init_lock(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c4c43f10344a..1e97bb98197c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -471,6 +471,10 @@ struct anon_vma_name {
 	char name[];
 };
 
+struct vma_lock {
+	struct rw_semaphore lock;
+};
+
 /*
  * This struct describes a virtual memory area. There is one of these
  * per VM-area/task. A VM area is any part of the process virtual memory
@@ -510,7 +514,7 @@ struct vm_area_struct {
 
 #ifdef CONFIG_PER_VMA_LOCK
 	int vm_lock_seq;
-	struct rw_semaphore lock;
+	struct vma_lock *vm_lock;
 #endif
 
 	/*
diff --git a/kernel/fork.c b/kernel/fork.c
index d0999de82f94..a152804faa14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+	if (!vma->vm_lock)
+		return false;
+
+	init_rwsem(&vma->vm_lock->lock);
+	vma->vm_lock_seq = -1;
+
+	return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
 
 	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-	if (vma)
-		vma_init(vma, mm);
+	if (!vma)
+		return NULL;
+
+	vma_init(vma, mm);
+	if (!vma_lock_alloc(vma)) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return NULL;
+	}
+
 	return vma;
 }
 
@@ -465,24 +501,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
-	if (new) {
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
-		/*
-		 * orig->shared.rb may be modified concurrently, but the clone
-		 * will be reinitialized.
-		 */
-		data_race(memcpy(new, orig, sizeof(*new)));
-		INIT_LIST_HEAD(&new->anon_vma_chain);
-		vma_init_lock(new);
-		dup_anon_vma_name(orig, new);
+	if (!new)
+		return NULL;
+
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+	/*
+	 * orig->shared.rb may be modified concurrently, but the clone
+	 * will be reinitialized.
+	 */
+	data_race(memcpy(new, orig, sizeof(*new)));
+	if (!vma_lock_alloc(new)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return NULL;
 	}
+	INIT_LIST_HEAD(&new->anon_vma_chain);
+	dup_anon_vma_name(orig, new);
+
 	return new;
 }
 
 void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
+	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
@@ -493,7 +535,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
 						  vm_rcu);
 
 	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma);
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
 	__vm_area_free(vma);
 }
 #endif
@@ -3089,6 +3131,9 @@ void __init proc_caches_init(void)
 			NULL);
 
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
 	mmap_init();
 	nsproxy_cache_init();
 }
-- 
2.39.1


WARNING: multiple messages have this Message-ID (diff)
From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: michel@lespinasse.org, jglisse@google.com, mhocko@suse.com,
	vbabka@suse.cz,  hannes@cmpxchg.org, mgorman@techsingularity.net,
	dave@stgolabs.net,  willy@infradead.org, liam.howlett@oracle.com,
	peterz@infradead.org,  ldufour@linux.ibm.com, paulmck@kernel.org,
	mingo@redhat.com, will@kernel.org,  luto@kernel.org,
	songliubraving@fb.com, peterx@redhat.com, david@redhat.com,
	 dhowells@redhat.com, hughd@google.com, bigeasy@linutronix.de,
	 kent.overstreet@linux.dev, punit.agrawal@bytedance.com,
	lstoakes@gmail.com,  peterjung1337@gmail.com,
	rientjes@google.com, axelrasmussen@google.com,
	 joelaf@google.com, minchan@google.com, rppt@kernel.org,
	jannh@google.com,  shakeelb@google.com, tatashin@google.com,
	edumazet@google.com,  gthelen@google.com, gurua@google.com,
	arjunroy@google.com, soheil@google.com,  leewalsh@google.com,
	posk@google.com, linux-mm@kvack.org,
	 linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org,  x86@kernel.org,
	linux-kernel@vger.kernel.org, kernel-team@android.com,
	 surenb@google.com
Subject: [PATCH v2 33/33] mm: separate vma->lock from vm_area_struct
Date: Fri, 27 Jan 2023 11:41:10 -0800	[thread overview]
Message-ID: <20230127194110.533103-34-surenb@google.com> (raw)
In-Reply-To: <20230127194110.533103-1-surenb@google.com>

vma->lock being part of the vm_area_struct causes performance regression
during page faults because during contention its count and owner fields
are constantly updated and having other parts of vm_area_struct used
during page fault handling next to them causes constant cache line
bouncing. Fix that by moving the lock outside of the vm_area_struct.
All attempts to keep vma->lock inside vm_area_struct in a separate
cache line still produce performance regression especially on NUMA
machines. Smallest regression was achieved when lock is placed in the
fourth cache line but that bloats vm_area_struct to 256 bytes.
Considering performance and memory impact, separate lock looks like
the best option. It increases memory footprint of each VMA but that
can be optimized later if the new size causes issues.
Note that after this change vma_init() does not allocate or
initialize vma->lock anymore. A number of drivers allocate a pseudo
VMA on the stack but they never use the VMA's lock, therefore it does
not need to be allocated. The future drivers which might need the VMA
lock should use vm_area_alloc()/vm_area_free() to allocate the VMA.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h       | 23 ++++++-------
 include/linux/mm_types.h |  6 +++-
 kernel/fork.c            | 73 ++++++++++++++++++++++++++++++++--------
 3 files changed, 74 insertions(+), 28 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c4ddcd6fd84..52e048c31239 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -621,12 +621,6 @@ struct vm_operations_struct {
 };
 
 #ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_init_lock(struct vm_area_struct *vma)
-{
-	init_rwsem(&vma->lock);
-	vma->vm_lock_seq = -1;
-}
-
 /*
  * Try to read-lock a vma. The function is allowed to occasionally yield false
  * locked result to avoid performance overhead, in which case we fall back to
@@ -638,17 +632,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
 		return false;
 
-	if (unlikely(down_read_trylock(&vma->lock) == 0))
+	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
 		return false;
 
 	/*
 	 * Overflow might produce false locked result.
 	 * False unlocked result is impossible because we modify and check
-	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
 	 * modification invalidates all existing locks.
 	 */
 	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
-		up_read(&vma->lock);
+		up_read(&vma->vm_lock->lock);
 		return false;
 	}
 	return true;
@@ -657,7 +651,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
 	rcu_read_lock(); /* keeps vma alive till the end of up_read */
-	up_read(&vma->lock);
+	up_read(&vma->vm_lock->lock);
 	rcu_read_unlock();
 }
 
@@ -675,9 +669,9 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	if (vma->vm_lock_seq == mm_lock_seq)
 		return;
 
-	down_write(&vma->lock);
+	down_write(&vma->vm_lock->lock);
 	vma->vm_lock_seq = mm_lock_seq;
-	up_write(&vma->lock);
+	up_write(&vma->vm_lock->lock);
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -704,6 +698,10 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
 
 #endif /* CONFIG_PER_VMA_LOCK */
 
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -712,7 +710,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	vma_init_lock(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c4c43f10344a..1e97bb98197c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -471,6 +471,10 @@ struct anon_vma_name {
 	char name[];
 };
 
+struct vma_lock {
+	struct rw_semaphore lock;
+};
+
 /*
  * This struct describes a virtual memory area. There is one of these
  * per VM-area/task. A VM area is any part of the process virtual memory
@@ -510,7 +514,7 @@ struct vm_area_struct {
 
 #ifdef CONFIG_PER_VMA_LOCK
 	int vm_lock_seq;
-	struct rw_semaphore lock;
+	struct vma_lock *vm_lock;
 #endif
 
 	/*
diff --git a/kernel/fork.c b/kernel/fork.c
index d0999de82f94..a152804faa14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+	if (!vma->vm_lock)
+		return false;
+
+	init_rwsem(&vma->vm_lock->lock);
+	vma->vm_lock_seq = -1;
+
+	return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
 
 	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-	if (vma)
-		vma_init(vma, mm);
+	if (!vma)
+		return NULL;
+
+	vma_init(vma, mm);
+	if (!vma_lock_alloc(vma)) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return NULL;
+	}
+
 	return vma;
 }
 
@@ -465,24 +501,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
-	if (new) {
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
-		/*
-		 * orig->shared.rb may be modified concurrently, but the clone
-		 * will be reinitialized.
-		 */
-		data_race(memcpy(new, orig, sizeof(*new)));
-		INIT_LIST_HEAD(&new->anon_vma_chain);
-		vma_init_lock(new);
-		dup_anon_vma_name(orig, new);
+	if (!new)
+		return NULL;
+
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+	/*
+	 * orig->shared.rb may be modified concurrently, but the clone
+	 * will be reinitialized.
+	 */
+	data_race(memcpy(new, orig, sizeof(*new)));
+	if (!vma_lock_alloc(new)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return NULL;
 	}
+	INIT_LIST_HEAD(&new->anon_vma_chain);
+	dup_anon_vma_name(orig, new);
+
 	return new;
 }
 
 void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
+	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
@@ -493,7 +535,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
 						  vm_rcu);
 
 	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma);
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
 	__vm_area_free(vma);
 }
 #endif
@@ -3089,6 +3131,9 @@ void __init proc_caches_init(void)
 			NULL);
 
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
 	mmap_init();
 	nsproxy_cache_init();
 }
-- 
2.39.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

  parent reply	other threads:[~2023-01-27 19:47 UTC|newest]

Thread overview: 126+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 01/33] maple_tree: Be more cautious about dead nodes Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 02/33] maple_tree: Detect dead nodes in mas_start() Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 03/33] maple_tree: Fix freeing of nodes in rcu mode Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 04/33] maple_tree: remove extra smp_wmb() from mas_dead_leaves() Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 05/33] maple_tree: Fix write memory barrier of nodes once dead for RCU mode Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 06/33] maple_tree: Add smp_rmb() to dead node detection Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 07/33] mm: Enable maple tree RCU mode by default Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 08/33] mm: introduce CONFIG_PER_VMA_LOCK Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 09/33] mm: rcu safe VMA freeing Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 10/33] mm: move mmap_lock assert function definitions Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 11/33] mm: add per-VMA lock and helper functions to control it Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 12/33] mm: mark VMA as being written when changing vm_flags Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 13/33] mm/mmap: move VMA locking before vma_adjust_trans_huge call Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 14/33] mm/khugepaged: write-lock VMA while collapsing a huge page Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 15/33] mm/mmap: write-lock VMAs before merging, splitting or expanding them Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 16/33] mm/mmap: write-lock VMA before shrinking or expanding it Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 17/33] mm/mremap: write-lock VMA while remapping it to a new address range Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 18/33] mm: write-lock VMAs before removing them from VMA tree Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 19/33] mm: conditionally write-lock VMA in free_pgtables Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 20/33] mm/mmap: write-lock adjacent VMAs if they can grow into unmapped area Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 21/33] kernel/fork: assert no VMA readers during its destruction Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 22/33] mm/mmap: prevent pagefault handler from racing with mmu_notifier registration Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 23/33] mm: introduce lock_vma_under_rcu to be used from arch-specific code Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 24/33] mm: fall back to mmap_lock if vma->anon_vma is not yet set Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 25/33] mm: add FAULT_FLAG_VMA_LOCK flag Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 26/33] mm: prevent do_swap_page from handling page faults under VMA lock Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 27/33] mm: prevent userfaults to be handled under per-vma lock Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 28/33] mm: introduce per-VMA lock statistics Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 29/33] x86/mm: try VMA lock-based page fault handling first Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 30/33] arm64/mm: " Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 31/33] powerc/mm: " Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 32/33] mm/mmap: free vm_area_struct without call_rcu in exit_mmap Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` Suren Baghdasaryan [this message]
2023-01-27 19:41   ` [PATCH v2 33/33] mm: separate vma->lock from vm_area_struct Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 22:51 ` [PATCH v2 00/33] Per-VMA locks Andrew Morton
2023-01-27 22:51   ` Andrew Morton
2023-01-27 22:51   ` Andrew Morton
2023-01-27 23:26   ` Matthew Wilcox
2023-01-27 23:26     ` Matthew Wilcox
2023-01-27 23:26     ` Matthew Wilcox
2023-01-28  0:00     ` Suren Baghdasaryan
2023-01-28  0:00       ` Suren Baghdasaryan
2023-01-28  0:00       ` Suren Baghdasaryan
2023-02-14 16:47       ` Suren Baghdasaryan
2023-02-14 16:47         ` Suren Baghdasaryan
2023-02-14 16:47         ` Suren Baghdasaryan
2023-02-15 17:32 ` [External] " Punit Agrawal
2023-02-15 17:32   ` Punit Agrawal
2023-02-15 17:32   ` Punit Agrawal
2023-02-15 17:39   ` Suren Baghdasaryan
2023-02-15 17:39     ` Suren Baghdasaryan
2023-02-15 17:39     ` Suren Baghdasaryan
2023-02-28 12:06   ` Punit Agrawal
2023-02-28 12:06     ` Punit Agrawal
2023-02-28 12:06     ` Punit Agrawal
2023-02-28 18:08     ` Suren Baghdasaryan
2023-02-28 18:08       ` Suren Baghdasaryan
2023-02-28 18:08       ` Suren Baghdasaryan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230127194110.533103-34-surenb@google.com \
    --to=surenb@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=arjunroy@google.com \
    --cc=axelrasmussen@google.com \
    --cc=bigeasy@linutronix.de \
    --cc=dave@stgolabs.net \
    --cc=david@redhat.com \
    --cc=dhowells@redhat.com \
    --cc=edumazet@google.com \
    --cc=gthelen@google.com \
    --cc=gurua@google.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=jannh@google.com \
    --cc=jglisse@google.com \
    --cc=joelaf@google.com \
    --cc=kent.overstreet@linux.dev \
    --cc=kernel-team@android.com \
    --cc=ldufour@linux.ibm.com \
    --cc=leewalsh@google.com \
    --cc=liam.howlett@oracle.com \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=lstoakes@gmail.com \
    --cc=luto@kernel.org \
    --cc=mgorman@techsingularity.net \
    --cc=mhocko@suse.com \
    --cc=michel@lespinasse.org \
    --cc=minchan@google.com \
    --cc=mingo@redhat.com \
    --cc=paulmck@kernel.org \
    --cc=peterjung1337@gmail.com \
    --cc=peterx@redhat.com \
    --cc=peterz@infradead.org \
    --cc=posk@google.com \
    --cc=punit.agrawal@bytedance.com \
    --cc=rientjes@google.com \
    --cc=rppt@kernel.org \
    --cc=shakeelb@google.com \
    --cc=soheil@google.com \
    --cc=songliubraving@fb.com \
    --cc=tatashin@google.com \
    --cc=vbabka@suse.cz \
    --cc=will@kernel.org \
    --cc=willy@infradead.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.