All of lore.kernel.org
 help / color / mirror / Atom feed
From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: michel@lespinasse.org, jglisse@google.com, mhocko@suse.com,
	vbabka@suse.cz, hannes@cmpxchg.org, mgorman@techsingularity.net,
	dave@stgolabs.net, willy@infradead.org, liam.howlett@oracle.com,
	peterz@infradead.org, ldufour@linux.ibm.com, paulmck@kernel.org,
	mingo@redhat.com, will@kernel.org, luto@kernel.org,
	songliubraving@fb.com, peterx@redhat.com, david@redhat.com,
	dhowells@redhat.com, hughd@google.com, bigeasy@linutronix.de,
	kent.overstreet@linux.dev, punit.agrawal@bytedance.com,
	lstoakes@gmail.com, peterjung1337@gmail.com, rientjes@google.com,
	axelrasmussen@google.com, joelaf@google.com, minchan@google.com,
	rppt@kernel.org, jannh@google.com, shakeelb@google.com,
	tatashin@google.com, edumazet@google.com, gthelen@google.com,
	gurua@google.com, arjunroy@google.com, soheil@google.com,
	leewalsh@google.com, posk@google.com, linux-mm@kvack.org,
	linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org, x86@kernel.org,
	linux-kernel@vger.kernel.org, kernel-team@android.com,
	surenb@google.com
Subject: [PATCH v2 11/33] mm: add per-VMA lock and helper functions to control it
Date: Fri, 27 Jan 2023 11:40:48 -0800	[thread overview]
Message-ID: <20230127194110.533103-12-surenb@google.com> (raw)
In-Reply-To: <20230127194110.533103-1-surenb@google.com>

Introduce per-VMA locking. The lock implementation relies on
per-vma and per-mm sequence counters to note exclusive locking:
  - read lock - (implemented by vma_start_read) requires the vma
    (vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ.
    If they match then there must be a vma exclusive lock held somewhere.
  - read unlock - (implemented by vma_end_read) is a trivial vma->lock
    unlock.
  - write lock - (vma_start_write) requires the mmap_lock to be held
    exclusively and the current mm counter is assigned to the vma counter.
    This will allow multiple vmas to be locked under a single mmap_lock
    write lock (e.g. during vma merging). The vma counter is modified
    under exclusive vma lock.
  - write unlock - (vma_end_write_all) is a batch release of all vma
    locks held. It doesn't pair with a specific vma_start_write! It is
    done by incrementing the mm sequence counter (mm_lock_seq) before
    the exclusive mmap_lock is released.
  - write downgrade - if the mmap_lock is downgraded to the read lock, all
    vma write locks are released as well (effectively the same as write
    unlock).

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h        | 82 +++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h  |  8 ++++
 include/linux/mmap_lock.h | 13 +++++++
 kernel/fork.c             |  4 ++
 mm/init-mm.c              |  3 ++
 5 files changed, 110 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd295c020e85..fee08e8fdce7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -617,6 +617,87 @@ struct vm_operations_struct {
 					  unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->lock);
+	vma->vm_lock_seq = -1;
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+	/* Check before locking. A race might cause false locked result. */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+		return false;
+
+	if (unlikely(down_read_trylock(&vma->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result.
+	 * False unlocked result is impossible because we modify and check
+	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * modification invalidates all existing locks.
+	 */
+	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+		up_read(&vma->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+	rcu_read_lock(); /* keeps vma alive till the end of up_read */
+	up_read(&vma->lock);
+	rcu_read_unlock();
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	if (vma->vm_lock_seq == mm_lock_seq)
+		return;
+
+	down_write(&vma->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->lock);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -625,6 +706,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_init_lock(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 62e413f84011..88619c6a29a3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -508,6 +508,11 @@ struct vm_area_struct {
 		vm_flags_t __private __vm_flags;
 	};
 
+#ifdef CONFIG_PER_VMA_LOCK
+	int vm_lock_seq;
+	struct rw_semaphore lock;
+#endif
+
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
@@ -633,6 +638,9 @@ struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e49ba91bb1f0..aab8f1b28d26 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_end_write_all(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_end_write_all(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_end_write_all(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 314d51eb91da..9141427a98b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		data_race(memcpy(new, orig, sizeof(*new)));
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		dup_anon_vma_name(orig, new);
 	}
 	return new;
@@ -1147,6 +1148,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	mm->mm_lock_seq = 0;
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..33269314e060 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
-- 
2.39.1


WARNING: multiple messages have this Message-ID (diff)
From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: michel@lespinasse.org, jglisse@google.com, mhocko@suse.com,
	vbabka@suse.cz,  hannes@cmpxchg.org, mgorman@techsingularity.net,
	dave@stgolabs.net,  willy@infradead.org, liam.howlett@oracle.com,
	peterz@infradead.org,  ldufour@linux.ibm.com, paulmck@kernel.org,
	mingo@redhat.com, will@kernel.org,  luto@kernel.org,
	songliubraving@fb.com, peterx@redhat.com, david@redhat.com,
	 dhowells@redhat.com, hughd@google.com, bigeasy@linutronix.de,
	 kent.overstreet@linux.dev, punit.agrawal@bytedance.com,
	lstoakes@gmail.com,  peterjung1337@gmail.com,
	rientjes@google.com, axelrasmussen@google.com,
	 joelaf@google.com, minchan@google.com, rppt@kernel.org,
	jannh@google.com,  shakeelb@google.com, tatashin@google.com,
	edumazet@google.com,  gthelen@google.com, gurua@google.com,
	arjunroy@google.com, soheil@google.com,  leewalsh@google.com,
	posk@google.com, linux-mm@kvack.org,
	 linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org,  x86@kernel.org,
	linux-kernel@vger.kernel.org, kernel-team@android.com,
	 surenb@google.com
Subject: [PATCH v2 11/33] mm: add per-VMA lock and helper functions to control it
Date: Fri, 27 Jan 2023 11:40:48 -0800	[thread overview]
Message-ID: <20230127194110.533103-12-surenb@google.com> (raw)
In-Reply-To: <20230127194110.533103-1-surenb@google.com>

Introduce per-VMA locking. The lock implementation relies on
per-vma and per-mm sequence counters to note exclusive locking:
  - read lock - (implemented by vma_start_read) requires the vma
    (vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ.
    If they match then there must be a vma exclusive lock held somewhere.
  - read unlock - (implemented by vma_end_read) is a trivial vma->lock
    unlock.
  - write lock - (vma_start_write) requires the mmap_lock to be held
    exclusively and the current mm counter is assigned to the vma counter.
    This will allow multiple vmas to be locked under a single mmap_lock
    write lock (e.g. during vma merging). The vma counter is modified
    under exclusive vma lock.
  - write unlock - (vma_end_write_all) is a batch release of all vma
    locks held. It doesn't pair with a specific vma_start_write! It is
    done by incrementing the mm sequence counter (mm_lock_seq) before
    the exclusive mmap_lock is released.
  - write downgrade - if the mmap_lock is downgraded to the read lock, all
    vma write locks are released as well (effectively the same as write
    unlock).

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h        | 82 +++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h  |  8 ++++
 include/linux/mmap_lock.h | 13 +++++++
 kernel/fork.c             |  4 ++
 mm/init-mm.c              |  3 ++
 5 files changed, 110 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd295c020e85..fee08e8fdce7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -617,6 +617,87 @@ struct vm_operations_struct {
 					  unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->lock);
+	vma->vm_lock_seq = -1;
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+	/* Check before locking. A race might cause false locked result. */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+		return false;
+
+	if (unlikely(down_read_trylock(&vma->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result.
+	 * False unlocked result is impossible because we modify and check
+	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * modification invalidates all existing locks.
+	 */
+	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+		up_read(&vma->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+	rcu_read_lock(); /* keeps vma alive till the end of up_read */
+	up_read(&vma->lock);
+	rcu_read_unlock();
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	if (vma->vm_lock_seq == mm_lock_seq)
+		return;
+
+	down_write(&vma->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->lock);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -625,6 +706,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_init_lock(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 62e413f84011..88619c6a29a3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -508,6 +508,11 @@ struct vm_area_struct {
 		vm_flags_t __private __vm_flags;
 	};
 
+#ifdef CONFIG_PER_VMA_LOCK
+	int vm_lock_seq;
+	struct rw_semaphore lock;
+#endif
+
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
@@ -633,6 +638,9 @@ struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e49ba91bb1f0..aab8f1b28d26 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_end_write_all(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_end_write_all(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_end_write_all(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 314d51eb91da..9141427a98b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		data_race(memcpy(new, orig, sizeof(*new)));
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		dup_anon_vma_name(orig, new);
 	}
 	return new;
@@ -1147,6 +1148,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	mm->mm_lock_seq = 0;
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..33269314e060 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
-- 
2.39.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

WARNING: multiple messages have this Message-ID (diff)
From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: michel@lespinasse.org, joelaf@google.com, songliubraving@fb.com,
	mhocko@suse.com, leewalsh@google.com, david@redhat.com,
	peterz@infradead.org, bigeasy@linutronix.de, peterx@redhat.com,
	dhowells@redhat.com, linux-mm@kvack.org, edumazet@google.com,
	jglisse@google.com, punit.agrawal@bytedance.com, will@kernel.org,
	arjunroy@google.com, dave@stgolabs.net, minchan@google.com,
	x86@kernel.org, hughd@google.com, willy@infradead.org,
	gurua@google.com, mingo@redhat.com,
	linux-arm-kernel@lists.infradead.org, rientjes@google.com,
	axelrasmussen@google.com, kernel-team@android.com,
	soheil@google.com, paulmck@kernel.org, jannh@google.com,
	liam.howlett@oracle.com, shakeelb@google.com, luto@kernel.org,
	gthelen@google.com, ldufour@linux.ibm.com, surenb@google.com,
	vbabka@suse.cz, posk@google.com, lstoakes@gmail.com,
	peterjung1337@gmail.com, linuxppc-dev@lists.ozlabs.org,
	kent.overstreet@linux.dev, linux-kernel@vger.kernel.org,
	hannes@cmpxchg.org, tatashin@google.com,
	mgorman@techsingularity.net, rppt@kernel.org
Subject: [PATCH v2 11/33] mm: add per-VMA lock and helper functions to control it
Date: Fri, 27 Jan 2023 11:40:48 -0800	[thread overview]
Message-ID: <20230127194110.533103-12-surenb@google.com> (raw)
In-Reply-To: <20230127194110.533103-1-surenb@google.com>

Introduce per-VMA locking. The lock implementation relies on
per-vma and per-mm sequence counters to note exclusive locking:
  - read lock - (implemented by vma_start_read) requires the vma
    (vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ.
    If they match then there must be a vma exclusive lock held somewhere.
  - read unlock - (implemented by vma_end_read) is a trivial vma->lock
    unlock.
  - write lock - (vma_start_write) requires the mmap_lock to be held
    exclusively and the current mm counter is assigned to the vma counter.
    This will allow multiple vmas to be locked under a single mmap_lock
    write lock (e.g. during vma merging). The vma counter is modified
    under exclusive vma lock.
  - write unlock - (vma_end_write_all) is a batch release of all vma
    locks held. It doesn't pair with a specific vma_start_write! It is
    done by incrementing the mm sequence counter (mm_lock_seq) before
    the exclusive mmap_lock is released.
  - write downgrade - if the mmap_lock is downgraded to the read lock, all
    vma write locks are released as well (effectively the same as write
    unlock).

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h        | 82 +++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h  |  8 ++++
 include/linux/mmap_lock.h | 13 +++++++
 kernel/fork.c             |  4 ++
 mm/init-mm.c              |  3 ++
 5 files changed, 110 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd295c020e85..fee08e8fdce7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -617,6 +617,87 @@ struct vm_operations_struct {
 					  unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->lock);
+	vma->vm_lock_seq = -1;
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+	/* Check before locking. A race might cause false locked result. */
+	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+		return false;
+
+	if (unlikely(down_read_trylock(&vma->lock) == 0))
+		return false;
+
+	/*
+	 * Overflow might produce false locked result.
+	 * False unlocked result is impossible because we modify and check
+	 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+	 * modification invalidates all existing locks.
+	 */
+	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+		up_read(&vma->lock);
+		return false;
+	}
+	return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+	rcu_read_lock(); /* keeps vma alive till the end of up_read */
+	up_read(&vma->lock);
+	rcu_read_unlock();
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+	int mm_lock_seq;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+	if (vma->vm_lock_seq == mm_lock_seq)
+		return;
+
+	down_write(&vma->lock);
+	vma->vm_lock_seq = mm_lock_seq;
+	up_write(&vma->lock);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+	mmap_assert_write_locked(vma->vm_mm);
+	/*
+	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+	 * mm->mm_lock_seq can't be concurrently modified.
+	 */
+	VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+		{ return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
@@ -625,6 +706,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma_init_lock(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 62e413f84011..88619c6a29a3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -508,6 +508,11 @@ struct vm_area_struct {
 		vm_flags_t __private __vm_flags;
 	};
 
+#ifdef CONFIG_PER_VMA_LOCK
+	int vm_lock_seq;
+	struct rw_semaphore lock;
+#endif
+
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
@@ -633,6 +638,9 @@ struct mm_struct {
 					  * init_mm.mmlist, and are protected
 					  * by mmlist_lock
 					  */
+#ifdef CONFIG_PER_VMA_LOCK
+		int mm_lock_seq;
+#endif
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e49ba91bb1f0..aab8f1b28d26 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+	/* No races during update due to exclusive mmap_lock being held */
+	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_end_write_all(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_released(mm, true);
+	vma_end_write_all(mm);
 	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	vma_end_write_all(mm);
 	downgrade_write(&mm->mmap_lock);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 314d51eb91da..9141427a98b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		data_race(memcpy(new, orig, sizeof(*new)));
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		dup_anon_vma_name(orig, new);
 	}
 	return new;
@@ -1147,6 +1148,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	mm->mm_lock_seq = 0;
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..33269314e060 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+	.mm_lock_seq	= 0,
+#endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA
-- 
2.39.1


  parent reply	other threads:[~2023-01-27 19:43 UTC|newest]

Thread overview: 126+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 01/33] maple_tree: Be more cautious about dead nodes Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 02/33] maple_tree: Detect dead nodes in mas_start() Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 03/33] maple_tree: Fix freeing of nodes in rcu mode Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 04/33] maple_tree: remove extra smp_wmb() from mas_dead_leaves() Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 05/33] maple_tree: Fix write memory barrier of nodes once dead for RCU mode Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 06/33] maple_tree: Add smp_rmb() to dead node detection Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 07/33] mm: Enable maple tree RCU mode by default Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 08/33] mm: introduce CONFIG_PER_VMA_LOCK Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 09/33] mm: rcu safe VMA freeing Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 10/33] mm: move mmap_lock assert function definitions Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` Suren Baghdasaryan [this message]
2023-01-27 19:40   ` [PATCH v2 11/33] mm: add per-VMA lock and helper functions to control it Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 12/33] mm: mark VMA as being written when changing vm_flags Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 13/33] mm/mmap: move VMA locking before vma_adjust_trans_huge call Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 14/33] mm/khugepaged: write-lock VMA while collapsing a huge page Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 15/33] mm/mmap: write-lock VMAs before merging, splitting or expanding them Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 16/33] mm/mmap: write-lock VMA before shrinking or expanding it Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 17/33] mm/mremap: write-lock VMA while remapping it to a new address range Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 18/33] mm: write-lock VMAs before removing them from VMA tree Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 19/33] mm: conditionally write-lock VMA in free_pgtables Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 20/33] mm/mmap: write-lock adjacent VMAs if they can grow into unmapped area Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 21/33] kernel/fork: assert no VMA readers during its destruction Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 22/33] mm/mmap: prevent pagefault handler from racing with mmu_notifier registration Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:40   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 23/33] mm: introduce lock_vma_under_rcu to be used from arch-specific code Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 24/33] mm: fall back to mmap_lock if vma->anon_vma is not yet set Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 25/33] mm: add FAULT_FLAG_VMA_LOCK flag Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 26/33] mm: prevent do_swap_page from handling page faults under VMA lock Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 27/33] mm: prevent userfaults to be handled under per-vma lock Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 28/33] mm: introduce per-VMA lock statistics Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 29/33] x86/mm: try VMA lock-based page fault handling first Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 30/33] arm64/mm: " Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 31/33] powerc/mm: " Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 32/33] mm/mmap: free vm_area_struct without call_rcu in exit_mmap Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 33/33] mm: separate vma->lock from vm_area_struct Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 19:41   ` Suren Baghdasaryan
2023-01-27 22:51 ` [PATCH v2 00/33] Per-VMA locks Andrew Morton
2023-01-27 22:51   ` Andrew Morton
2023-01-27 22:51   ` Andrew Morton
2023-01-27 23:26   ` Matthew Wilcox
2023-01-27 23:26     ` Matthew Wilcox
2023-01-27 23:26     ` Matthew Wilcox
2023-01-28  0:00     ` Suren Baghdasaryan
2023-01-28  0:00       ` Suren Baghdasaryan
2023-01-28  0:00       ` Suren Baghdasaryan
2023-02-14 16:47       ` Suren Baghdasaryan
2023-02-14 16:47         ` Suren Baghdasaryan
2023-02-14 16:47         ` Suren Baghdasaryan
2023-02-15 17:32 ` [External] " Punit Agrawal
2023-02-15 17:32   ` Punit Agrawal
2023-02-15 17:32   ` Punit Agrawal
2023-02-15 17:39   ` Suren Baghdasaryan
2023-02-15 17:39     ` Suren Baghdasaryan
2023-02-15 17:39     ` Suren Baghdasaryan
2023-02-28 12:06   ` Punit Agrawal
2023-02-28 12:06     ` Punit Agrawal
2023-02-28 12:06     ` Punit Agrawal
2023-02-28 18:08     ` Suren Baghdasaryan
2023-02-28 18:08       ` Suren Baghdasaryan
2023-02-28 18:08       ` Suren Baghdasaryan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230127194110.533103-12-surenb@google.com \
    --to=surenb@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=arjunroy@google.com \
    --cc=axelrasmussen@google.com \
    --cc=bigeasy@linutronix.de \
    --cc=dave@stgolabs.net \
    --cc=david@redhat.com \
    --cc=dhowells@redhat.com \
    --cc=edumazet@google.com \
    --cc=gthelen@google.com \
    --cc=gurua@google.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=jannh@google.com \
    --cc=jglisse@google.com \
    --cc=joelaf@google.com \
    --cc=kent.overstreet@linux.dev \
    --cc=kernel-team@android.com \
    --cc=ldufour@linux.ibm.com \
    --cc=leewalsh@google.com \
    --cc=liam.howlett@oracle.com \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=lstoakes@gmail.com \
    --cc=luto@kernel.org \
    --cc=mgorman@techsingularity.net \
    --cc=mhocko@suse.com \
    --cc=michel@lespinasse.org \
    --cc=minchan@google.com \
    --cc=mingo@redhat.com \
    --cc=paulmck@kernel.org \
    --cc=peterjung1337@gmail.com \
    --cc=peterx@redhat.com \
    --cc=peterz@infradead.org \
    --cc=posk@google.com \
    --cc=punit.agrawal@bytedance.com \
    --cc=rientjes@google.com \
    --cc=rppt@kernel.org \
    --cc=shakeelb@google.com \
    --cc=soheil@google.com \
    --cc=songliubraving@fb.com \
    --cc=tatashin@google.com \
    --cc=vbabka@suse.cz \
    --cc=will@kernel.org \
    --cc=willy@infradead.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.