All of lore.kernel.org
 help / color / mirror / Atom feed
From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: michel@lespinasse.org, jglisse@google.com, mhocko@suse.com,
	vbabka@suse.cz, hannes@cmpxchg.org, mgorman@techsingularity.net,
	dave@stgolabs.net, willy@infradead.org, liam.howlett@oracle.com,
	peterz@infradead.org, ldufour@linux.ibm.com,
	laurent.dufour@fr.ibm.com, paulmck@kernel.org, riel@surriel.com,
	luto@kernel.org, songliubraving@fb.com, peterx@redhat.com,
	david@redhat.com, dhowells@redhat.com, hughd@google.com,
	bigeasy@linutronix.de, kent.overstreet@linux.dev,
	rientjes@google.com, axelrasmussen@google.com, joelaf@google.com,
	minchan@google.com, surenb@google.com, kernel-team@android.com,
	linux-mm@kvack.org, linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org, x86@kernel.org,
	linux-kernel@vger.kernel.org
Subject: [RFC PATCH 05/28] mm: add per-VMA lock and helper functions to control it
Date: Mon, 29 Aug 2022 21:25:08 +0000	[thread overview]
Message-ID: <20220829212531.3184856-6-surenb@google.com> (raw)
In-Reply-To: <20220829212531.3184856-1-surenb@google.com>

Introduce a per-VMA rw_semaphore to be used during page fault handling
instead of mmap_lock. Because there are cases when multiple VMAs need
to be exclusively locked during VMA tree modifications, instead of the
usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
exclusively and setting vma->lock_seq to the current mm->lock_seq. When
mmap_write_lock holder is done with all modifications and drops mmap_lock,
it will increment mm->lock_seq, effectively unlocking all VMAs marked as
locked.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h        | 78 +++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h  |  7 ++++
 include/linux/mmap_lock.h | 13 +++++++
 kernel/fork.c             |  4 ++
 mm/init-mm.c              |  3 ++
 5 files changed, 105 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d322a979455..476bf936c5f0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -611,6 +611,83 @@ struct vm_operations_struct {
 					  unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->lock);
+	vma->vm_lock_seq = -1;
+}
+
+static inline void vma_mark_locked(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	if (vma->vm_lock_seq == mm_lock_seq)
+		return;
+
+	down_write(&vma->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->lock);
+}
+
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+{
+	if (unlikely(down_read_trylock(&vma->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result but it's not critical.
+	 * False unlocked result is critical but is impossible because we
+	 * modify and check vma->vm_lock_seq under vma->lock protection and
+	 * mm->mm_lock_seq modification invalidates all existing locks.
+	 */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
+		up_read(&vma->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_read_unlock(struct vm_area_struct *vma)
+{
+	up_read(&vma->lock);
+}
+
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+	lockdep_assert_held(&vma->lock);
+	VM_BUG_ON_VMA(!rwsem_is_locked(&vma->lock), vma);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma, int pos)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline void vma_mark_locked(struct vm_area_struct *vma) {}
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_read_unlock(struct vm_area_struct *vma) {}
+static inline void vma_assert_locked(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma, int pos) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -619,6 +696,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_init_lock(vma);
 }
 
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bed25ef7c994..6a03f59c1e78 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -486,6 +486,10 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_PER_VMA_LOCK
+	struct rw_semaphore lock;
+	int vm_lock_seq;
+#endif
 } __randomize_layout;
 
 struct kioctx_table;
@@ -567,6 +571,9 @@ struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e49ba91bb1f0..a391ae226564 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_mark_unlocked_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_mark_unlocked_all(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_mark_unlocked_all(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_mark_unlocked_all(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 614872438393..bfab31ecd11e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -475,6 +475,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		*new = data_race(*orig);
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		new->vm_next = new->vm_prev = NULL;
 		dup_anon_vma_name(orig, new);
 	}
@@ -1130,6 +1131,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	WRITE_ONCE(mm->mm_lock_seq, 0);
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index fbe7844d0912..8399f90d631c 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
-- 
2.37.2.672.g94769d06f0-goog


WARNING: multiple messages have this Message-ID (diff)
From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: michel@lespinasse.org, jglisse@google.com, mhocko@suse.com,
	vbabka@suse.cz,  hannes@cmpxchg.org, mgorman@techsingularity.net,
	dave@stgolabs.net,  willy@infradead.org, liam.howlett@oracle.com,
	peterz@infradead.org,  ldufour@linux.ibm.com,
	laurent.dufour@fr.ibm.com, paulmck@kernel.org,  riel@surriel.com,
	luto@kernel.org, songliubraving@fb.com, peterx@redhat.com,
	 david@redhat.com, dhowells@redhat.com, hughd@google.com,
	 bigeasy@linutronix.de, kent.overstreet@linux.dev,
	rientjes@google.com,  axelrasmussen@google.com,
	joelaf@google.com, minchan@google.com,  surenb@google.com,
	kernel-team@android.com, linux-mm@kvack.org,
	 linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org,  x86@kernel.org,
	linux-kernel@vger.kernel.org
Subject: [RFC PATCH 05/28] mm: add per-VMA lock and helper functions to control it
Date: Mon, 29 Aug 2022 21:25:08 +0000	[thread overview]
Message-ID: <20220829212531.3184856-6-surenb@google.com> (raw)
In-Reply-To: <20220829212531.3184856-1-surenb@google.com>

Introduce a per-VMA rw_semaphore to be used during page fault handling
instead of mmap_lock. Because there are cases when multiple VMAs need
to be exclusively locked during VMA tree modifications, instead of the
usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
exclusively and setting vma->lock_seq to the current mm->lock_seq. When
mmap_write_lock holder is done with all modifications and drops mmap_lock,
it will increment mm->lock_seq, effectively unlocking all VMAs marked as
locked.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h        | 78 +++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h  |  7 ++++
 include/linux/mmap_lock.h | 13 +++++++
 kernel/fork.c             |  4 ++
 mm/init-mm.c              |  3 ++
 5 files changed, 105 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d322a979455..476bf936c5f0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -611,6 +611,83 @@ struct vm_operations_struct {
 					  unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->lock);
+	vma->vm_lock_seq = -1;
+}
+
+static inline void vma_mark_locked(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	if (vma->vm_lock_seq == mm_lock_seq)
+		return;
+
+	down_write(&vma->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->lock);
+}
+
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+{
+	if (unlikely(down_read_trylock(&vma->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result but it's not critical.
+	 * False unlocked result is critical but is impossible because we
+	 * modify and check vma->vm_lock_seq under vma->lock protection and
+	 * mm->mm_lock_seq modification invalidates all existing locks.
+	 */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
+		up_read(&vma->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_read_unlock(struct vm_area_struct *vma)
+{
+	up_read(&vma->lock);
+}
+
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+	lockdep_assert_held(&vma->lock);
+	VM_BUG_ON_VMA(!rwsem_is_locked(&vma->lock), vma);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma, int pos)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline void vma_mark_locked(struct vm_area_struct *vma) {}
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_read_unlock(struct vm_area_struct *vma) {}
+static inline void vma_assert_locked(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma, int pos) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -619,6 +696,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_init_lock(vma);
 }
 
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bed25ef7c994..6a03f59c1e78 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -486,6 +486,10 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_PER_VMA_LOCK
+	struct rw_semaphore lock;
+	int vm_lock_seq;
+#endif
 } __randomize_layout;
 
 struct kioctx_table;
@@ -567,6 +571,9 @@ struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e49ba91bb1f0..a391ae226564 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_mark_unlocked_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_mark_unlocked_all(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_mark_unlocked_all(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_mark_unlocked_all(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 614872438393..bfab31ecd11e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -475,6 +475,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		*new = data_race(*orig);
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		new->vm_next = new->vm_prev = NULL;
 		dup_anon_vma_name(orig, new);
 	}
@@ -1130,6 +1131,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	WRITE_ONCE(mm->mm_lock_seq, 0);
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index fbe7844d0912..8399f90d631c 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
-- 
2.37.2.672.g94769d06f0-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

WARNING: multiple messages have this Message-ID (diff)
From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: michel@lespinasse.org, joelaf@google.com, songliubraving@fb.com,
	mhocko@suse.com, david@redhat.com, peterz@infradead.org,
	bigeasy@linutronix.de, peterx@redhat.com, dhowells@redhat.com,
	linux-mm@kvack.org, jglisse@google.com, dave@stgolabs.net,
	minchan@google.com, x86@kernel.org, hughd@google.com,
	willy@infradead.org, laurent.dufour@fr.ibm.com,
	linux-arm-kernel@lists.infradead.org, rientjes@google.com,
	axelrasmussen@google.com, kernel-team@android.com,
	paulmck@kernel.org, riel@surriel.com, liam.howlett@oracle.com,
	luto@kernel.org, ldufour@linux.ibm.com, surenb@google.com,
	vbabka@suse.cz, linuxppc-dev@lists.ozlabs.org,
	kent.overstreet@linux.dev, linux-kernel@vger.kernel.org,
	hannes@cmpxchg.org, mgorman@techsingularity.net
Subject: [RFC PATCH 05/28] mm: add per-VMA lock and helper functions to control it
Date: Mon, 29 Aug 2022 21:25:08 +0000	[thread overview]
Message-ID: <20220829212531.3184856-6-surenb@google.com> (raw)
In-Reply-To: <20220829212531.3184856-1-surenb@google.com>

Introduce a per-VMA rw_semaphore to be used during page fault handling
instead of mmap_lock. Because there are cases when multiple VMAs need
to be exclusively locked during VMA tree modifications, instead of the
usual lock/unlock patter we mark a VMA as locked by taking per-VMA lock
exclusively and setting vma->lock_seq to the current mm->lock_seq. When
mmap_write_lock holder is done with all modifications and drops mmap_lock,
it will increment mm->lock_seq, effectively unlocking all VMAs marked as
locked.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h        | 78 +++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h  |  7 ++++
 include/linux/mmap_lock.h | 13 +++++++
 kernel/fork.c             |  4 ++
 mm/init-mm.c              |  3 ++
 5 files changed, 105 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d322a979455..476bf936c5f0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -611,6 +611,83 @@ struct vm_operations_struct {
 					  unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->lock);
+	vma->vm_lock_seq = -1;
+}
+
+static inline void vma_mark_locked(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	if (vma->vm_lock_seq == mm_lock_seq)
+		return;
+
+	down_write(&vma->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->lock);
+}
+
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+{
+	if (unlikely(down_read_trylock(&vma->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result but it's not critical.
+	 * False unlocked result is critical but is impossible because we
+	 * modify and check vma->vm_lock_seq under vma->lock protection and
+	 * mm->mm_lock_seq modification invalidates all existing locks.
+	 */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
+		up_read(&vma->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_read_unlock(struct vm_area_struct *vma)
+{
+	up_read(&vma->lock);
+}
+
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+	lockdep_assert_held(&vma->lock);
+	VM_BUG_ON_VMA(!rwsem_is_locked(&vma->lock), vma);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma, int pos)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline void vma_mark_locked(struct vm_area_struct *vma) {}
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_read_unlock(struct vm_area_struct *vma) {}
+static inline void vma_assert_locked(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma, int pos) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -619,6 +696,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_init_lock(vma);
 }
 
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bed25ef7c994..6a03f59c1e78 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -486,6 +486,10 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_PER_VMA_LOCK
+	struct rw_semaphore lock;
+	int vm_lock_seq;
+#endif
 } __randomize_layout;
 
 struct kioctx_table;
@@ -567,6 +571,9 @@ struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e49ba91bb1f0..a391ae226564 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_mark_unlocked_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_mark_unlocked_all(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_mark_unlocked_all(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_mark_unlocked_all(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 614872438393..bfab31ecd11e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -475,6 +475,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		*new = data_race(*orig);
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		new->vm_next = new->vm_prev = NULL;
 		dup_anon_vma_name(orig, new);
 	}
@@ -1130,6 +1131,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	WRITE_ONCE(mm->mm_lock_seq, 0);
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index fbe7844d0912..8399f90d631c 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
-- 
2.37.2.672.g94769d06f0-goog


  parent reply	other threads:[~2022-08-29 21:26 UTC|newest]

Thread overview: 87+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-29 21:25 [RFC PATCH 00/28] per-VMA locks proposal Suren Baghdasaryan
2022-08-29 21:25 ` Suren Baghdasaryan
2022-08-29 21:25 ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 01/28] mm: introduce CONFIG_PER_VMA_LOCK Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 02/28] mm: rcu safe VMA freeing Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 03/28] mm: introduce __find_vma to be used without mmap_lock protection Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 04/28] mm: move mmap_lock assert function definitions Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` Suren Baghdasaryan [this message]
2022-08-29 21:25   ` [RFC PATCH 05/28] mm: add per-VMA lock and helper functions to control it Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 06/28] mm: mark VMA as locked whenever vma->vm_flags are modified Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 07/28] kernel/fork: mark VMAs as locked before copying pages during fork Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 08/28] mm/khugepaged: mark VMA as locked while collapsing a hugepage Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 09/28] mm/mempolicy: mark VMA as locked when changing protection policy Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 10/28] mm/mmap: mark VMAs as locked in vma_adjust Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 11/28] mm/mmap: mark VMAs as locked before merging or splitting them Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 12/28] mm/mremap: mark VMA as locked while remapping it to a new address range Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 13/28] mm: conditionally mark VMA as locked in free_pgtables and unmap_page_range Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 14/28] mm: mark VMAs as locked before isolating them Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 15/28] mm/mmap: mark adjacent VMAs as locked if they can grow into unmapped area Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 16/28] kernel/fork: assert no VMA readers during its destruction Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 17/28] mm/mmap: prevent pagefault handler from racing with mmu_notifier registration Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 18/28] mm: add FAULT_FLAG_VMA_LOCK flag Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 19/28] mm: disallow do_swap_page to handle page faults under VMA lock Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 20/28] mm: introduce per-VMA lock statistics Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 21/28] mm: introduce find_and_lock_anon_vma to be used from arch-specific code Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 22/28] x86/mm: try VMA lock-based page fault handling first Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 23/28] x86/mm: define ARCH_SUPPORTS_PER_VMA_LOCK Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 24/28] arm64/mm: try VMA lock-based page fault handling first Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 25/28] arm64/mm: define ARCH_SUPPORTS_PER_VMA_LOCK Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 26/28] powerc/mm: try VMA lock-based page fault handling first Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 27/28] powerpc/mm: define ARCH_SUPPORTS_PER_VMA_LOCK Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25 ` [RFC PATCH 28/28] kernel/fork: throttle call_rcu() calls in vm_area_free Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan
2022-08-29 21:25   ` Suren Baghdasaryan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220829212531.3184856-6-surenb@google.com \
    --to=surenb@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=axelrasmussen@google.com \
    --cc=bigeasy@linutronix.de \
    --cc=dave@stgolabs.net \
    --cc=david@redhat.com \
    --cc=dhowells@redhat.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=jglisse@google.com \
    --cc=joelaf@google.com \
    --cc=kent.overstreet@linux.dev \
    --cc=kernel-team@android.com \
    --cc=laurent.dufour@fr.ibm.com \
    --cc=ldufour@linux.ibm.com \
    --cc=liam.howlett@oracle.com \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=luto@kernel.org \
    --cc=mgorman@techsingularity.net \
    --cc=mhocko@suse.com \
    --cc=michel@lespinasse.org \
    --cc=minchan@google.com \
    --cc=paulmck@kernel.org \
    --cc=peterx@redhat.com \
    --cc=peterz@infradead.org \
    --cc=riel@surriel.com \
    --cc=rientjes@google.com \
    --cc=songliubraving@fb.com \
    --cc=vbabka@suse.cz \
    --cc=willy@infradead.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.