* [PATCH -v2 0/7] locking: Percpu-rwsem rewrite
@ 2020-01-31 15:07 Peter Zijlstra
  2020-01-31 15:07 ` [PATCH -v2 1/7] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map Peter Zijlstra
                   ` (11 more replies)
  0 siblings, 12 replies; 29+ messages in thread
From: Peter Zijlstra @ 2020-01-31 15:07 UTC (permalink / raw)
  To: peterz, mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	longman, dave, jack

Hi all,

This is the long awaited report of the percpu-rwsem rework (sorry Juri).

IIRC (I really have trouble keeping up momentum on this series) I've addressed
all previous comments by Oleg and Davidlohr and Waiman and hope we can stick
this in tip/locking/core for inclusion in the next merge.

It has been cooked (thoroughly) in PREEMPT_RT, and not found wanting.

Any objections to me stuffing it in so we can all forget about it properly?




* [PATCH -v2 1/7] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
@ 2020-01-31 15:07 ` Peter Zijlstra
  2020-02-11 12:48   ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
  2020-01-31 15:07 ` [PATCH -v2 2/7] locking/percpu-rwsem: Convert to bool Peter Zijlstra
                   ` (10 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-01-31 15:07 UTC (permalink / raw)
  To: peterz, mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	longman, dave, jack

As preparation for replacing the embedded rwsem, give percpu-rwsem its
own lockdep_map.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
---
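For reference, the net result of the lockdep wiring, condensed from the
hunks below into one place (nothing here is new relative to the diff,
this is only a readability aid):

  struct percpu_rw_semaphore {
          struct rcu_sync         rss;
          unsigned int __percpu   *read_count;
          struct rw_semaphore     rw_sem; /* slowpath */
          struct rcuwait          writer; /* blocked writer */
          int                     readers_block;
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
          struct lockdep_map      dep_map; /* the sem's own map */
  #endif
  };

  int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
                          const char *name, struct lock_class_key *key)
  {
          ...
          rcu_sync_init(&sem->rss);
          init_rwsem(&sem->rw_sem);
          rcuwait_init(&sem->writer);
          sem->readers_block = 0;
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
          debug_check_no_locks_freed((void *)sem, sizeof(*sem));
          lockdep_init_map(&sem->dep_map, name, key, 0);
  #endif
          return 0;
  }

All acquire/release annotations (percpu_down_read(), percpu_up_read(),
percpu_rwsem_{acquire,release}() and the cpu_hotplug_lock helpers) now
point at &sem->dep_map instead of &sem->rw_sem.dep_map.
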
 include/linux/percpu-rwsem.h  |   29 +++++++++++++++++++----------
 kernel/cpu.c                  |    4 ++--
 kernel/locking/percpu-rwsem.c |   16 ++++++++++++----
 kernel/locking/rwsem.c        |    4 ++--
 kernel/locking/rwsem.h        |    2 ++
 5 files changed, 37 insertions(+), 18 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -15,8 +15,17 @@ struct percpu_rw_semaphore {
 	struct rw_semaphore	rw_sem; /* slowpath */
 	struct rcuwait          writer; /* blocked writer */
 	int			readers_block;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
 };
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)	.dep_map = { .name = #lockname },
+#else
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
 #define __DEFINE_PERCPU_RWSEM(name, is_static)				\
 static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);		\
 is_static struct percpu_rw_semaphore name = {				\
@@ -24,7 +33,9 @@ is_static struct percpu_rw_semaphore nam
 	.read_count = &__percpu_rwsem_rc_##name,			\
 	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
 	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
+	__PERCPU_RWSEM_DEP_MAP_INIT(name)				\
 }
+
 #define DEFINE_PERCPU_RWSEM(name)		\
 	__DEFINE_PERCPU_RWSEM(name, /* not static */)
 #define DEFINE_STATIC_PERCPU_RWSEM(name)	\
@@ -37,7 +48,7 @@ static inline void percpu_down_read(stru
 {
 	might_sleep();
 
-	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
 
 	preempt_disable();
 	/*
@@ -76,13 +87,15 @@ static inline int percpu_down_read_trylo
 	 */
 
 	if (ret)
-		rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
 
 	return ret;
 }
 
 static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 {
+	rwsem_release(&sem->dep_map, _RET_IP_);
+
 	preempt_disable();
 	/*
 	 * Same as in percpu_down_read().
@@ -92,8 +105,6 @@ static inline void percpu_up_read(struct
 	else
 		__percpu_up_read(sem); /* Unconditional memory barrier */
 	preempt_enable();
-
-	rwsem_release(&sem->rw_sem.dep_map, _RET_IP_);
 }
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
@@ -110,15 +121,13 @@ extern void percpu_free_rwsem(struct per
 	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
 })
 
-#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
-
-#define percpu_rwsem_assert_held(sem)				\
-	lockdep_assert_held(&(sem)->rw_sem)
+#define percpu_rwsem_is_held(sem)	lockdep_is_held(sem)
+#define percpu_rwsem_assert_held(sem)	lockdep_assert_held(sem)
 
 static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
-	lock_release(&sem->rw_sem.dep_map, ip);
+	lock_release(&sem->dep_map, ip);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	if (!read)
 		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
@@ -128,7 +137,7 @@ static inline void percpu_rwsem_release(
 static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
-	lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
+	lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	if (!read)
 		atomic_long_set(&sem->rw_sem.owner, (long)current);
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void)
 
 static void lockdep_acquire_cpus_lock(void)
 {
-	rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
+	rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
 }
 
 static void lockdep_release_cpus_lock(void)
 {
-	rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_);
+	rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
 }
 
 /*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -11,7 +11,7 @@
 #include "rwsem.h"
 
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
-			const char *name, struct lock_class_key *rwsem_key)
+			const char *name, struct lock_class_key *key)
 {
 	sem->read_count = alloc_percpu(int);
 	if (unlikely(!sem->read_count))
@@ -19,9 +19,13 @@ int __percpu_init_rwsem(struct percpu_rw
 
 	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss);
-	__init_rwsem(&sem->rw_sem, name, rwsem_key);
+	init_rwsem(&sem->rw_sem);
 	rcuwait_init(&sem->writer);
 	sem->readers_block = 0;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+	lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -142,10 +146,12 @@ static bool readers_active_check(struct
 
 void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
+	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
 
-	down_write(&sem->rw_sem);
+	__down_write(&sem->rw_sem);
 
 	/*
 	 * Notify new readers to block; up until now, and thus throughout the
@@ -168,6 +174,8 @@ EXPORT_SYMBOL_GPL(percpu_down_write);
 
 void percpu_up_write(struct percpu_rw_semaphore *sem)
 {
+	rwsem_release(&sem->dep_map, _RET_IP_);
+
 	/*
 	 * Signal the writer is done, no fast path yet.
 	 *
@@ -183,7 +191,7 @@ void percpu_up_write(struct percpu_rw_se
 	/*
 	 * Release the write lock, this will allow readers back in the game.
 	 */
-	up_write(&sem->rw_sem);
+	__up_write(&sem->rw_sem);
 
 	/*
 	 * Once this completes (at least one RCU-sched grace period hence) the
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1383,7 +1383,7 @@ static inline int __down_read_trylock(st
 /*
  * lock for writing
  */
-static inline void __down_write(struct rw_semaphore *sem)
+inline void __down_write(struct rw_semaphore *sem)
 {
 	long tmp = RWSEM_UNLOCKED_VALUE;
 
@@ -1446,7 +1446,7 @@ inline void __up_read(struct rw_semaphor
 /*
  * unlock after writing
  */
-static inline void __up_write(struct rw_semaphore *sem)
+inline void __up_write(struct rw_semaphore *sem)
 {
 	long tmp;
 
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -6,5 +6,7 @@
 
 extern void __down_read(struct rw_semaphore *sem);
 extern void __up_read(struct rw_semaphore *sem);
+extern void __down_write(struct rw_semaphore *sem);
+extern void __up_write(struct rw_semaphore *sem);
 
 #endif /* __INTERNAL_RWSEM_H */




* [PATCH -v2 2/7] locking/percpu-rwsem: Convert to bool
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
  2020-01-31 15:07 ` [PATCH -v2 1/7] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map Peter Zijlstra
@ 2020-01-31 15:07 ` Peter Zijlstra
  2020-02-11 12:48   ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
  2020-01-31 15:07 ` [PATCH -v2 3/7] locking/percpu-rwsem: Move __this_cpu_inc() into the slowpath Peter Zijlstra
                   ` (9 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-01-31 15:07 UTC (permalink / raw)
  To: peterz, mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	longman, dave, jack

Use bool where possible.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
---
 include/linux/percpu-rwsem.h  |    6 +++---
 kernel/locking/percpu-rwsem.c |    8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -41,7 +41,7 @@ is_static struct percpu_rw_semaphore nam
 #define DEFINE_STATIC_PERCPU_RWSEM(name)	\
 	__DEFINE_PERCPU_RWSEM(name, static)
 
-extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
 extern void __percpu_up_read(struct percpu_rw_semaphore *);
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
@@ -69,9 +69,9 @@ static inline void percpu_down_read(stru
 	preempt_enable();
 }
 
-static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 {
-	int ret = 1;
+	bool ret = true;
 
 	preempt_disable();
 	/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ void percpu_free_rwsem(struct percpu_rw_
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 {
 	/*
 	 * Due to having preemption disabled the decrement happens on
@@ -69,7 +69,7 @@ int __percpu_down_read(struct percpu_rw_
 	 * release in percpu_up_write().
 	 */
 	if (likely(!smp_load_acquire(&sem->readers_block)))
-		return 1;
+		return true;
 
 	/*
 	 * Per the above comment; we still have preemption disabled and
@@ -78,7 +78,7 @@ int __percpu_down_read(struct percpu_rw_
 	__percpu_up_read(sem);
 
 	if (try)
-		return 0;
+		return false;
 
 	/*
 	 * We either call schedule() in the wait, or we'll fall through
@@ -94,7 +94,7 @@ int __percpu_down_read(struct percpu_rw_
 	__up_read(&sem->rw_sem);
 
 	preempt_disable();
-	return 1;
+	return true;
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
 




* [PATCH -v2 3/7] locking/percpu-rwsem: Move __this_cpu_inc() into the slowpath
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
  2020-01-31 15:07 ` [PATCH -v2 1/7] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map Peter Zijlstra
  2020-01-31 15:07 ` [PATCH -v2 2/7] locking/percpu-rwsem: Convert to bool Peter Zijlstra
@ 2020-01-31 15:07 ` Peter Zijlstra
  2020-02-11 12:48   ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
  2020-01-31 15:07 ` [PATCH -v2 4/7] locking/percpu-rwsem: Extract __percpu_down_read_trylock() Peter Zijlstra
                   ` (8 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-01-31 15:07 UTC (permalink / raw)
  To: peterz, mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	longman, dave, jack

In preparation for reworking __percpu_down_read(), move the
__this_cpu_inc() into it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
---
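The resulting read-side fast path, condensed from the hunks below (a
sketch for readability, assembled from this and the previous patches):

  static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
  {
          might_sleep();
          rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);

          preempt_disable();
          /*
           * Fast path: no writer pending, only the per-cpu counter is
           * touched.  Slow path: __percpu_down_read() now does its own
           * increment.
           */
          if (likely(rcu_sync_is_idle(&sem->rss)))
                  __this_cpu_inc(*sem->read_count);
          else
                  __percpu_down_read(sem, false); /* Unconditional memory barrier */
          preempt_enable();
  }

Having the slow path own its increment is what the rework in the later
patches builds on.
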
 include/linux/percpu-rwsem.h  |   10 ++++++----
 kernel/locking/percpu-rwsem.c |    2 ++
 2 files changed, 8 insertions(+), 4 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -59,8 +59,9 @@ static inline void percpu_down_read(stru
 	 * and that once the synchronize_rcu() is done, the writer will see
 	 * anything we did within this RCU-sched read-size critical section.
 	 */
-	__this_cpu_inc(*sem->read_count);
-	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+	if (likely(rcu_sync_is_idle(&sem->rss)))
+		__this_cpu_inc(*sem->read_count);
+	else
 		__percpu_down_read(sem, false); /* Unconditional memory barrier */
 	/*
 	 * The preempt_enable() prevents the compiler from
@@ -77,8 +78,9 @@ static inline bool percpu_down_read_tryl
 	/*
 	 * Same as in percpu_down_read().
 	 */
-	__this_cpu_inc(*sem->read_count);
-	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+	if (likely(rcu_sync_is_idle(&sem->rss)))
+		__this_cpu_inc(*sem->read_count);
+	else
 		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
 	preempt_enable();
 	/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -47,6 +47,8 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
 bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 {
+	__this_cpu_inc(*sem->read_count);
+
 	/*
 	 * Due to having preemption disabled the decrement happens on
 	 * the same CPU as the increment, avoiding the




* [PATCH -v2 4/7] locking/percpu-rwsem: Extract __percpu_down_read_trylock()
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
                   ` (2 preceding siblings ...)
  2020-01-31 15:07 ` [PATCH -v2 3/7] locking/percpu-rwsem: Move __this_cpu_inc() into the slowpath Peter Zijlstra
@ 2020-01-31 15:07 ` Peter Zijlstra
  2020-02-11 12:48   ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
  2020-01-31 15:07 ` [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
                   ` (7 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-01-31 15:07 UTC (permalink / raw)
  To: peterz, mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	longman, dave, jack

In preparation for removing the embedded rwsem and building a custom
lock, extract the read-trylock primitive.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
---
 kernel/locking/percpu-rwsem.c |   19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ void percpu_free_rwsem(struct percpu_rw_
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 {
 	__this_cpu_inc(*sem->read_count);
 
@@ -73,11 +73,18 @@ bool __percpu_down_read(struct percpu_rw
 	if (likely(!smp_load_acquire(&sem->readers_block)))
 		return true;
 
-	/*
-	 * Per the above comment; we still have preemption disabled and
-	 * will thus decrement on the same CPU as we incremented.
-	 */
-	__percpu_up_read(sem);
+	__this_cpu_dec(*sem->read_count);
+
+	/* Prod writer to re-evaluate readers_active_check() */
+	rcuwait_wake_up(&sem->writer);
+
+	return false;
+}
+
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+{
+	if (__percpu_down_read_trylock(sem))
+		return true;
 
 	if (try)
 		return false;




* [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
                   ` (3 preceding siblings ...)
  2020-01-31 15:07 ` [PATCH -v2 4/7] locking/percpu-rwsem: Extract __percpu_down_read_trylock() Peter Zijlstra
@ 2020-01-31 15:07 ` Peter Zijlstra
  2020-02-03 11:45   ` Kirill Tkhai
                     ` (2 more replies)
  2020-01-31 15:07 ` [PATCH -v2 6/7] locking/percpu-rwsem: Fold __percpu_up_read() Peter Zijlstra
                   ` (6 subsequent siblings)
  11 siblings, 3 replies; 29+ messages in thread
From: Peter Zijlstra @ 2020-01-31 15:07 UTC (permalink / raw)
  To: peterz, mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	longman, dave, jack

The filesystem freezer uses percpu-rwsem in a way that is effectively
write_non_owner() and achieves this with a few horrible hacks that
rely on the rwsem (!percpu) implementation.

When PREEMPT_RT replaces the rwsem implementation with a PI aware
variant this comes apart.

Remove the embedded rwsem and implement it using a waitqueue and an
atomic_t.

 - make readers_block an atomic, and use it, with the waitqueue
   for a blocking test-and-set write-side.

 - have the read-side wait for the 'lock' state to clear.

Have the waiters use FIFO queueing and mark them (reader/writer) with
a new WQ_FLAG. Use a custom wake_function to wake either a single
writer or all readers until a writer.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
---
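Because the interesting logic is spread over several hunks, here is the
resulting write side in one piece (condensed for readability; the
memory-ordering comments are abbreviated, see the diff for the full
A/B/C/D matching):

  static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
  {
          if (atomic_read(&sem->block))
                  return false;

          return atomic_xchg(&sem->block, 1) == 0; /* writer-writer exclusion */
  }

  void percpu_down_write(struct percpu_rw_semaphore *sem)
  {
          rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

          /* Notify readers to take the slow path. */
          rcu_sync_enter(&sem->rss);

          /* Try to set sem->block; otherwise FIFO-queue on sem->waiters. */
          if (!__percpu_down_write_trylock(sem))
                  percpu_rwsem_wait(sem, /* .reader = */ false);

          /* Wait for all active readers to complete. */
          rcuwait_wait_event(&sem->writer, readers_active_check(sem));
  }

  void percpu_up_write(struct percpu_rw_semaphore *sem)
  {
          rwsem_release(&sem->dep_map, _RET_IP_);

          /* Let readers back in; they stay in the slow path for now. */
          atomic_set_release(&sem->block, 0);

          /* Wake one writer, or readers up to the first queued writer. */
          __wake_up(&sem->waiters, TASK_NORMAL, 1, sem);

          /* A grace period later rcu_sync_exit() restores the fast path. */
          rcu_sync_exit(&sem->rss);
  }

The read-side slow path is symmetric: __percpu_down_read() falls back to
percpu_rwsem_wait(sem, true), and the wake function re-runs the
corresponding trylock on the waiter's behalf before waking it.
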
 include/linux/percpu-rwsem.h  |   19 +----
 include/linux/rwsem.h         |    6 -
 include/linux/wait.h          |    1 
 kernel/locking/percpu-rwsem.c |  153 ++++++++++++++++++++++++++++++------------
 kernel/locking/rwsem.c        |   11 +--
 kernel/locking/rwsem.h        |   12 ---
 6 files changed, 123 insertions(+), 79 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -3,18 +3,18 @@
 #define _LINUX_PERCPU_RWSEM_H
 
 #include <linux/atomic.h>
-#include <linux/rwsem.h>
 #include <linux/percpu.h>
 #include <linux/rcuwait.h>
+#include <linux/wait.h>
 #include <linux/rcu_sync.h>
 #include <linux/lockdep.h>
 
 struct percpu_rw_semaphore {
 	struct rcu_sync		rss;
 	unsigned int __percpu	*read_count;
-	struct rw_semaphore	rw_sem; /* slowpath */
-	struct rcuwait          writer; /* blocked writer */
-	int			readers_block;
+	struct rcuwait		writer;
+	wait_queue_head_t	waiters;
+	atomic_t		block;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
 #endif
@@ -31,8 +31,9 @@ static DEFINE_PER_CPU(unsigned int, __pe
 is_static struct percpu_rw_semaphore name = {				\
 	.rss = __RCU_SYNC_INITIALIZER(name.rss),			\
 	.read_count = &__percpu_rwsem_rc_##name,			\
-	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
 	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
+	.waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),		\
+	.block = ATOMIC_INIT(0),					\
 	__PERCPU_RWSEM_DEP_MAP_INIT(name)				\
 }
 
@@ -130,20 +131,12 @@ static inline void percpu_rwsem_release(
 					bool read, unsigned long ip)
 {
 	lock_release(&sem->dep_map, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	if (!read)
-		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
-#endif
 }
 
 static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
 	lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	if (!read)
-		atomic_long_set(&sem->rw_sem.owner, (long)current);
-#endif
 }
 
 #endif
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -53,12 +53,6 @@ struct rw_semaphore {
 #endif
 };
 
-/*
- * Setting all bits of the owner field except bit 0 will indicate
- * that the rwsem is writer-owned with an unknown owner.
- */
-#define RWSEM_OWNER_UNKNOWN	(-2L)
-
 /* In all implementations count != 0 means locked */
 static inline int rwsem_is_locked(struct rw_semaphore *sem)
 {
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -20,6 +20,7 @@ int default_wake_function(struct wait_qu
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
 #define WQ_FLAG_BOOKMARK	0x04
+#define WQ_FLAG_CUSTOM		0x08
 
 /*
  * A single wait-queue entry structure:
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,15 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/atomic.h>
-#include <linux/rwsem.h>
 #include <linux/percpu.h>
+#include <linux/wait.h>
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/errno.h>
 
-#include "rwsem.h"
-
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *key)
 {
@@ -17,11 +16,10 @@ int __percpu_init_rwsem(struct percpu_rw
 	if (unlikely(!sem->read_count))
 		return -ENOMEM;
 
-	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss);
-	init_rwsem(&sem->rw_sem);
 	rcuwait_init(&sem->writer);
-	sem->readers_block = 0;
+	init_waitqueue_head(&sem->waiters);
+	atomic_set(&sem->block, 0);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
 	lockdep_init_map(&sem->dep_map, name, key, 0);
@@ -54,23 +52,23 @@ static bool __percpu_down_read_trylock(s
 	 * the same CPU as the increment, avoiding the
 	 * increment-on-one-CPU-and-decrement-on-another problem.
 	 *
-	 * If the reader misses the writer's assignment of readers_block, then
-	 * the writer is guaranteed to see the reader's increment.
+	 * If the reader misses the writer's assignment of sem->block, then the
+	 * writer is guaranteed to see the reader's increment.
 	 *
 	 * Conversely, any readers that increment their sem->read_count after
-	 * the writer looks are guaranteed to see the readers_block value,
-	 * which in turn means that they are guaranteed to immediately
-	 * decrement their sem->read_count, so that it doesn't matter that the
-	 * writer missed them.
+	 * the writer looks are guaranteed to see the sem->block value, which
+	 * in turn means that they are guaranteed to immediately decrement
+	 * their sem->read_count, so that it doesn't matter that the writer
+	 * missed them.
 	 */
 
 	smp_mb(); /* A matches D */
 
 	/*
-	 * If !readers_block the critical section starts here, matched by the
+	 * If !sem->block the critical section starts here, matched by the
 	 * release in percpu_up_write().
 	 */
-	if (likely(!smp_load_acquire(&sem->readers_block)))
+	if (likely(!atomic_read_acquire(&sem->block)))
 		return true;
 
 	__this_cpu_dec(*sem->read_count);
@@ -81,6 +79,88 @@ static bool __percpu_down_read_trylock(s
 	return false;
 }
 
+static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
+{
+	if (atomic_read(&sem->block))
+		return false;
+
+	return atomic_xchg(&sem->block, 1) == 0;
+}
+
+static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
+{
+	if (reader) {
+		bool ret;
+
+		preempt_disable();
+		ret = __percpu_down_read_trylock(sem);
+		preempt_enable();
+
+		return ret;
+	}
+	return __percpu_down_write_trylock(sem);
+}
+
+/*
+ * The return value of wait_queue_entry::func means:
+ *
+ *  <0 - error, wakeup is terminated and the error is returned
+ *   0 - no wakeup, a next waiter is tried
+ *  >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
+ *
+ * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
+ * and play games with the return value to allow waking multiple readers.
+ *
+ * Specifically, we wake readers until we've woken a single writer, or until a
+ * trylock fails.
+ */
+static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
+				      unsigned int mode, int wake_flags,
+				      void *key)
+{
+	struct task_struct *p = get_task_struct(wq_entry->private);
+	bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
+	struct percpu_rw_semaphore *sem = key;
+
+	/* concurrent against percpu_down_write(), can get stolen */
+	if (!__percpu_rwsem_trylock(sem, reader))
+		return 1;
+
+	list_del_init(&wq_entry->entry);
+	smp_store_release(&wq_entry->private, NULL);
+
+	wake_up_process(p);
+	put_task_struct(p);
+
+	return !reader; /* wake (readers until) 1 writer */
+}
+
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
+{
+	DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
+	bool wait;
+
+	spin_lock_irq(&sem->waiters.lock);
+	/*
+	 * Serialize against the wakeup in percpu_up_write(), if we fail
+	 * the trylock, the wakeup must see us on the list.
+	 */
+	wait = !__percpu_rwsem_trylock(sem, reader);
+	if (wait) {
+		wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
+		__add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
+	}
+	spin_unlock_irq(&sem->waiters.lock);
+
+	while (wait) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!smp_load_acquire(&wq_entry.private))
+			break;
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+}
+
 bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 {
 	if (__percpu_down_read_trylock(sem))
@@ -89,20 +169,10 @@ bool __percpu_down_read(struct percpu_rw
 	if (try)
 		return false;
 
-	/*
-	 * We either call schedule() in the wait, or we'll fall through
-	 * and reschedule on the preempt_enable() in percpu_down_read().
-	 */
-	preempt_enable_no_resched();
-
-	/*
-	 * Avoid lockdep for the down/up_read() we already have them.
-	 */
-	__down_read(&sem->rw_sem);
-	this_cpu_inc(*sem->read_count);
-	__up_read(&sem->rw_sem);
-
+	preempt_enable();
+	percpu_rwsem_wait(sem, /* .reader = */ true);
 	preempt_disable();
+
 	return true;
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
@@ -117,7 +187,7 @@ void __percpu_up_read(struct percpu_rw_s
 	 */
 	__this_cpu_dec(*sem->read_count);
 
-	/* Prod writer to recheck readers_active */
+	/* Prod writer to re-evaluate readers_active_check() */
 	rcuwait_wake_up(&sem->writer);
 }
 EXPORT_SYMBOL_GPL(__percpu_up_read);
@@ -137,6 +207,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
  * zero.  If this sum is zero, then it is stable due to the fact that if any
  * newly arriving readers increment a given counter, they will immediately
  * decrement that same counter.
+ *
+ * Assumes sem->block is set.
  */
 static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
@@ -160,23 +232,22 @@ void percpu_down_write(struct percpu_rw_
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
 
-	__down_write(&sem->rw_sem);
-
 	/*
-	 * Notify new readers to block; up until now, and thus throughout the
-	 * longish rcu_sync_enter() above, new readers could still come in.
+	 * Try set sem->block; this provides writer-writer exclusion.
+	 * Having sem->block set makes new readers block.
 	 */
-	WRITE_ONCE(sem->readers_block, 1);
+	if (!__percpu_down_write_trylock(sem))
+		percpu_rwsem_wait(sem, /* .reader = */ false);
 
-	smp_mb(); /* D matches A */
+	/* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
 
 	/*
-	 * If they don't see our writer of readers_block, then we are
-	 * guaranteed to see their sem->read_count increment, and therefore
-	 * will wait for them.
+	 * If they don't see our store of sem->block, then we are guaranteed to
+	 * see their sem->read_count increment, and therefore will wait for
+	 * them.
 	 */
 
-	/* Wait for all now active readers to complete. */
+	/* Wait for all active readers to complete. */
 	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
@@ -195,12 +266,12 @@ void percpu_up_write(struct percpu_rw_se
 	 * Therefore we force it through the slow path which guarantees an
 	 * acquire and thereby guarantees the critical section's consistency.
 	 */
-	smp_store_release(&sem->readers_block, 0);
+	atomic_set_release(&sem->block, 0);
 
 	/*
-	 * Release the write lock, this will allow readers back in the game.
+	 * Prod any pending reader/writer to make progress.
 	 */
-	__up_write(&sem->rw_sem);
+	__wake_up(&sem->waiters, TASK_NORMAL, 1, sem);
 
 	/*
 	 * Once this completes (at least one RCU-sched grace period hence) the
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -28,7 +28,6 @@
 #include <linux/rwsem.h>
 #include <linux/atomic.h>
 
-#include "rwsem.h"
 #include "lock_events.h"
 
 /*
@@ -660,8 +659,6 @@ static inline bool rwsem_can_spin_on_own
 	unsigned long flags;
 	bool ret = true;
 
-	BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
-
 	if (need_resched()) {
 		lockevent_inc(rwsem_opt_fail);
 		return false;
@@ -1338,7 +1335,7 @@ static struct rw_semaphore *rwsem_downgr
 /*
  * lock for reading
  */
-inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct rw_semaphore *sem)
 {
 	if (!rwsem_read_trylock(sem)) {
 		rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
@@ -1383,7 +1380,7 @@ static inline int __down_read_trylock(st
 /*
  * lock for writing
  */
-inline void __down_write(struct rw_semaphore *sem)
+static inline void __down_write(struct rw_semaphore *sem)
 {
 	long tmp = RWSEM_UNLOCKED_VALUE;
 
@@ -1426,7 +1423,7 @@ static inline int __down_write_trylock(s
 /*
  * unlock after reading
  */
-inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct rw_semaphore *sem)
 {
 	long tmp;
 
@@ -1446,7 +1443,7 @@ inline void __up_read(struct rw_semaphor
 /*
  * unlock after writing
  */
-inline void __up_write(struct rw_semaphore *sem)
+static inline void __up_write(struct rw_semaphore *sem)
 {
 	long tmp;
 
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __INTERNAL_RWSEM_H
-#define __INTERNAL_RWSEM_H
-#include <linux/rwsem.h>
-
-extern void __down_read(struct rw_semaphore *sem);
-extern void __up_read(struct rw_semaphore *sem);
-extern void __down_write(struct rw_semaphore *sem);
-extern void __up_write(struct rw_semaphore *sem);
-
-#endif /* __INTERNAL_RWSEM_H */




* [PATCH -v2 6/7] locking/percpu-rwsem: Fold __percpu_up_read()
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
                   ` (4 preceding siblings ...)
  2020-01-31 15:07 ` [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
@ 2020-01-31 15:07 ` Peter Zijlstra
  2020-02-11 12:48   ` [tip: locking/core] " tip-bot2 for Davidlohr Bueso
  2020-01-31 15:07 ` [PATCH -v2 7/7] locking/percpu-rwsem: Add might_sleep() for writer locking Peter Zijlstra
                   ` (5 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-01-31 15:07 UTC (permalink / raw)
  To: peterz, mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	longman, dave, jack

From: Davidlohr Bueso <dave@stgolabs.net>

Now that __percpu_up_read() is only ever used from percpu_up_read(),
merge them; it's a small function.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/percpu-rwsem.h  |   19 +++++++++++++++----
 kernel/exit.c                 |    1 +
 kernel/locking/percpu-rwsem.c |   15 ---------------
 3 files changed, 16 insertions(+), 19 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -43,7 +43,6 @@ is_static struct percpu_rw_semaphore nam
 	__DEFINE_PERCPU_RWSEM(name, static)
 
 extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
-extern void __percpu_up_read(struct percpu_rw_semaphore *);
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 {
@@ -103,10 +102,22 @@ static inline void percpu_up_read(struct
 	/*
 	 * Same as in percpu_down_read().
 	 */
-	if (likely(rcu_sync_is_idle(&sem->rss)))
+	if (likely(rcu_sync_is_idle(&sem->rss))) {
 		__this_cpu_dec(*sem->read_count);
-	else
-		__percpu_up_read(sem); /* Unconditional memory barrier */
+	} else {
+		/*
+		 * slowpath; reader will only ever wake a single blocked
+		 * writer.
+		 */
+		smp_mb(); /* B matches C */
+		/*
+		 * In other words, if they see our decrement (presumably to
+		 * aggregate zero, as that is the only time it matters) they
+		 * will also see our critical section.
+		 */
+		__this_cpu_dec(*sem->read_count);
+		rcuwait_wake_up(&sem->writer);
+	}
 	preempt_enable();
 }
 
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -258,6 +258,7 @@ void rcuwait_wake_up(struct rcuwait *w)
 		wake_up_process(task);
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(rcuwait_wake_up);
 
 /*
  * Determine if a process group is "orphaned", according to the POSIX
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -177,21 +177,6 @@ bool __percpu_down_read(struct percpu_rw
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
 
-void __percpu_up_read(struct percpu_rw_semaphore *sem)
-{
-	smp_mb(); /* B matches C */
-	/*
-	 * In other words, if they see our decrement (presumably to aggregate
-	 * zero, as that is the only time it matters) they will also see our
-	 * critical section.
-	 */
-	__this_cpu_dec(*sem->read_count);
-
-	/* Prod writer to re-evaluate readers_active_check() */
-	rcuwait_wake_up(&sem->writer);
-}
-EXPORT_SYMBOL_GPL(__percpu_up_read);
-
 #define per_cpu_sum(var)						\
 ({									\
 	typeof(var) __sum = 0;						\




* [PATCH -v2 7/7] locking/percpu-rwsem: Add might_sleep() for writer locking
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
                   ` (5 preceding siblings ...)
  2020-01-31 15:07 ` [PATCH -v2 6/7] locking/percpu-rwsem: Fold __percpu_up_read() Peter Zijlstra
@ 2020-01-31 15:07 ` Peter Zijlstra
  2020-01-31 19:23 ` [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Waiman Long
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2020-01-31 15:07 UTC (permalink / raw)
  To: peterz, mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	longman, dave, jack, Davidlohr Bueso

From: Davidlohr Bueso <dave@stgolabs.net>

We are missing this annotation in percpu_down_write(). Correct
this.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200108013305.7732-1-dave@stgolabs.net
---
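To illustrate what the annotation buys (a hypothetical caller, not part
of this series): with CONFIG_DEBUG_ATOMIC_SLEEP the misuse below now
complains on every run, even when percpu_down_write() happens not to
block:

  DEFINE_STATIC_PERCPU_RWSEM(demo_sem);
  static DEFINE_SPINLOCK(demo_lock);

  static void broken(void)
  {
          spin_lock(&demo_lock);          /* atomic context from here on */
          percpu_down_write(&demo_sem);   /* might_sleep() splats: sleeping
                                             function called from invalid
                                             context */
          percpu_up_write(&demo_sem);
          spin_unlock(&demo_lock);
  }
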
 kernel/locking/percpu-rwsem.c |    1 +
 1 file changed, 1 insertion(+)

--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -212,6 +212,7 @@ static bool readers_active_check(struct
 
 void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
+	might_sleep();
 	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
 
 	/* Notify readers to take the slow path. */




* Re: [PATCH -v2 0/7] locking: Percpu-rwsem rewrite
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
                   ` (6 preceding siblings ...)
  2020-01-31 15:07 ` [PATCH -v2 7/7] locking/percpu-rwsem: Add might_sleep() for writer locking Peter Zijlstra
@ 2020-01-31 19:23 ` Waiman Long
  2020-02-01 16:23 ` Davidlohr Bueso
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 29+ messages in thread
From: Waiman Long @ 2020-01-31 19:23 UTC (permalink / raw)
  To: Peter Zijlstra, mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	dave, jack

On 1/31/20 10:07 AM, Peter Zijlstra wrote:
> Hi all,
>
> This is the long awaited report of the percpu-rwsem rework (sorry Juri).
>
> IIRC (I really have trouble keeping up momentum on this series) I've addressed
> all previous comments by Oleg and Davidlohr and Waiman and hope we can stick
> this in tip/locking/core for inclusion in the next merge.
>
> It has been cooked (thoroughly) in PREEMPT_RT, and not found wanting.
>
> Any objections to me stuffing it in so we can all forget about it properly?
>
>
The patchset looks good to me.

Acked-by: Waiman Long <longman@redhat.com>

Cheers,
Longman



* Re: [PATCH -v2 0/7] locking: Percpu-rwsem rewrite
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
                   ` (7 preceding siblings ...)
  2020-01-31 19:23 ` [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Waiman Long
@ 2020-02-01 16:23 ` Davidlohr Bueso
  2020-02-03 10:51 ` Sebastian Andrzej Siewior
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 29+ messages in thread
From: Davidlohr Bueso @ 2020-02-01 16:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
	williams, bristot, longman, jack

On Fri, 31 Jan 2020, Peter Zijlstra wrote:

>Hi all,
>
>This is the long awaited report of the percpu-rwsem rework (sorry Juri).
>
>IIRC (I really have trouble keeping up momentum on this series) I've addressed
>all previous comments by Oleg and Davidlohr and Waiman and hope we can stick
>this in tip/locking/core for inclusion in the next merge.
>
>It has been cooked (thoroughly) in PREEMPT_RT, and not found wanting.
>
>Any objections to me stuffing it in so we can all forget about it properly?

Feel free to add my:

Reviewed-by: Davidlohr Bueso <dbueso@suse.de>

Thanks,
Davidlohr


* Re: [PATCH -v2 0/7] locking: Percpu-rwsem rewrite
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
                   ` (8 preceding siblings ...)
  2020-02-01 16:23 ` Davidlohr Bueso
@ 2020-02-03 10:51 ` Sebastian Andrzej Siewior
  2020-02-03 12:25 ` Juri Lelli
  2020-02-03 18:08 ` Will Deacon
  11 siblings, 0 replies; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-02-03 10:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, oleg, tglx, linux-kernel, juri.lelli, williams,
	bristot, longman, dave, jack

On 2020-01-31 16:07:03 [+0100], Peter Zijlstra wrote:
> Hi all,
> 
> This is the long awaited report of the percpu-rwsem rework (sorry Juri).
> 
> IIRC (I really have trouble keeping up momentum on this series) I've addressed
> all previous comments by Oleg and Davidlohr and Waiman and hope we can stick
> this in tip/locking/core for inclusion in the next merge.
> 
> It has been cooked (thoroughly) in PREEMPT_RT, and not found wanting.

I did not suck it into -RT earlier since it looked like work-in-progress
(based on the review). Now if you feel confident, I will suck it in.

Thank you.

> Any objections to me stuffing it in so we can all forget about it properly?

Sebastian


* Re: [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem
  2020-01-31 15:07 ` [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
@ 2020-02-03 11:45   ` Kirill Tkhai
  2020-02-03 13:44     ` Peter Zijlstra
  2020-02-03 14:20   ` Christoph Hellwig
  2020-02-04  9:24   ` [PATCH -v2-mkII 5/7] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
  2 siblings, 1 reply; 29+ messages in thread
From: Kirill Tkhai @ 2020-02-03 11:45 UTC (permalink / raw)
  To: Peter Zijlstra, mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	longman, dave, jack

On 31.01.2020 18:07, Peter Zijlstra wrote:
> The filesystem freezer uses percpu-rwsem in a way that is effectively
> write_non_owner() and achieves this with a few horrible hacks that
> rely on the rwsem (!percpu) implementation.
> 
> When PREEMPT_RT replaces the rwsem implementation with a PI aware
> variant this comes apart.
> 
> Remove the embedded rwsem and implement it using a waitqueue and an
> atomic_t.
> 
>  - make readers_block an atomic, and use it, with the waitqueue
>    for a blocking test-and-set write-side.
> 
>  - have the read-side wait for the 'lock' state to clear.
> 
> Have the waiters use FIFO queueing and mark them (reader/writer) with
> a new WQ_FLAG. Use a custom wake_function to wake either a single
> writer or all readers until a writer.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: Juri Lelli <juri.lelli@redhat.com>
> ---
>  include/linux/percpu-rwsem.h  |   19 +----
>  include/linux/rwsem.h         |    6 -
>  include/linux/wait.h          |    1 
>  kernel/locking/percpu-rwsem.c |  153 ++++++++++++++++++++++++++++++------------
>  kernel/locking/rwsem.c        |   11 +--
>  kernel/locking/rwsem.h        |   12 ---
>  6 files changed, 123 insertions(+), 79 deletions(-)
> 
> --- a/include/linux/percpu-rwsem.h
> +++ b/include/linux/percpu-rwsem.h
> @@ -3,18 +3,18 @@
>  #define _LINUX_PERCPU_RWSEM_H
>  
>  #include <linux/atomic.h>
> -#include <linux/rwsem.h>
>  #include <linux/percpu.h>
>  #include <linux/rcuwait.h>
> +#include <linux/wait.h>
>  #include <linux/rcu_sync.h>
>  #include <linux/lockdep.h>
>  
>  struct percpu_rw_semaphore {
>  	struct rcu_sync		rss;
>  	unsigned int __percpu	*read_count;
> -	struct rw_semaphore	rw_sem; /* slowpath */
> -	struct rcuwait          writer; /* blocked writer */
> -	int			readers_block;
> +	struct rcuwait		writer;
> +	wait_queue_head_t	waiters;
> +	atomic_t		block;
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>  	struct lockdep_map	dep_map;
>  #endif
> @@ -31,8 +31,9 @@ static DEFINE_PER_CPU(unsigned int, __pe
>  is_static struct percpu_rw_semaphore name = {				\
>  	.rss = __RCU_SYNC_INITIALIZER(name.rss),			\
>  	.read_count = &__percpu_rwsem_rc_##name,			\
> -	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
>  	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
> +	.waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),		\
> +	.block = ATOMIC_INIT(0),					\
>  	__PERCPU_RWSEM_DEP_MAP_INIT(name)				\
>  }
>  
> @@ -130,20 +131,12 @@ static inline void percpu_rwsem_release(
>  					bool read, unsigned long ip)
>  {
>  	lock_release(&sem->dep_map, ip);
> -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
> -	if (!read)
> -		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
> -#endif
>  }
>  
>  static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
>  					bool read, unsigned long ip)
>  {
>  	lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
> -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
> -	if (!read)
> -		atomic_long_set(&sem->rw_sem.owner, (long)current);
> -#endif
>  }
>  
>  #endif
> --- a/include/linux/rwsem.h
> +++ b/include/linux/rwsem.h
> @@ -53,12 +53,6 @@ struct rw_semaphore {
>  #endif
>  };
>  
> -/*
> - * Setting all bits of the owner field except bit 0 will indicate
> - * that the rwsem is writer-owned with an unknown owner.
> - */
> -#define RWSEM_OWNER_UNKNOWN	(-2L)
> -
>  /* In all implementations count != 0 means locked */
>  static inline int rwsem_is_locked(struct rw_semaphore *sem)
>  {
> --- a/include/linux/wait.h
> +++ b/include/linux/wait.h
> @@ -20,6 +20,7 @@ int default_wake_function(struct wait_qu
>  #define WQ_FLAG_EXCLUSIVE	0x01
>  #define WQ_FLAG_WOKEN		0x02
>  #define WQ_FLAG_BOOKMARK	0x04
> +#define WQ_FLAG_CUSTOM		0x08
>  
>  /*
>   * A single wait-queue entry structure:
> --- a/kernel/locking/percpu-rwsem.c
> +++ b/kernel/locking/percpu-rwsem.c
> @@ -1,15 +1,14 @@
>  // SPDX-License-Identifier: GPL-2.0-only
>  #include <linux/atomic.h>
> -#include <linux/rwsem.h>
>  #include <linux/percpu.h>
> +#include <linux/wait.h>
>  #include <linux/lockdep.h>
>  #include <linux/percpu-rwsem.h>
>  #include <linux/rcupdate.h>
>  #include <linux/sched.h>
> +#include <linux/sched/task.h>
>  #include <linux/errno.h>
>  
> -#include "rwsem.h"
> -
>  int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
>  			const char *name, struct lock_class_key *key)
>  {
> @@ -17,11 +16,10 @@ int __percpu_init_rwsem(struct percpu_rw
>  	if (unlikely(!sem->read_count))
>  		return -ENOMEM;
>  
> -	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
>  	rcu_sync_init(&sem->rss);
> -	init_rwsem(&sem->rw_sem);
>  	rcuwait_init(&sem->writer);
> -	sem->readers_block = 0;
> +	init_waitqueue_head(&sem->waiters);
> +	atomic_set(&sem->block, 0);
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>  	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
>  	lockdep_init_map(&sem->dep_map, name, key, 0);
> @@ -54,23 +52,23 @@ static bool __percpu_down_read_trylock(s
>  	 * the same CPU as the increment, avoiding the
>  	 * increment-on-one-CPU-and-decrement-on-another problem.
>  	 *
> -	 * If the reader misses the writer's assignment of readers_block, then
> -	 * the writer is guaranteed to see the reader's increment.
> +	 * If the reader misses the writer's assignment of sem->block, then the
> +	 * writer is guaranteed to see the reader's increment.
>  	 *
>  	 * Conversely, any readers that increment their sem->read_count after
> -	 * the writer looks are guaranteed to see the readers_block value,
> -	 * which in turn means that they are guaranteed to immediately
> -	 * decrement their sem->read_count, so that it doesn't matter that the
> -	 * writer missed them.
> +	 * the writer looks are guaranteed to see the sem->block value, which
> +	 * in turn means that they are guaranteed to immediately decrement
> +	 * their sem->read_count, so that it doesn't matter that the writer
> +	 * missed them.
>  	 */
>  
>  	smp_mb(); /* A matches D */
>  
>  	/*
> -	 * If !readers_block the critical section starts here, matched by the
> +	 * If !sem->block the critical section starts here, matched by the
>  	 * release in percpu_up_write().
>  	 */
> -	if (likely(!smp_load_acquire(&sem->readers_block)))
> +	if (likely(!atomic_read_acquire(&sem->block)))
>  		return true;
>  
>  	__this_cpu_dec(*sem->read_count);
> @@ -81,6 +79,88 @@ static bool __percpu_down_read_trylock(s
>  	return false;
>  }
>  
> +static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
> +{
> +	if (atomic_read(&sem->block))
> +		return false;
> +
> +	return atomic_xchg(&sem->block, 1) == 0;
> +}
> +
> +static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
> +{
> +	if (reader) {
> +		bool ret;
> +
> +		preempt_disable();
> +		ret = __percpu_down_read_trylock(sem);
> +		preempt_enable();
> +
> +		return ret;
> +	}
> +	return __percpu_down_write_trylock(sem);
> +}
> +
> +/*
> + * The return value of wait_queue_entry::func means:
> + *
> + *  <0 - error, wakeup is terminated and the error is returned
> + *   0 - no wakeup, a next waiter is tried
> + *  >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
> + *
> + * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
> + * and play games with the return value to allow waking multiple readers.
> + *
> + * Specifically, we wake readers until we've woken a single writer, or until a
> + * trylock fails.
> + */
> +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
> +				      unsigned int mode, int wake_flags,
> +				      void *key)
> +{
> +	struct task_struct *p = get_task_struct(wq_entry->private);
> +	bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
> +	struct percpu_rw_semaphore *sem = key;
> +
> +	/* concurrent against percpu_down_write(), can get stolen */
> +	if (!__percpu_rwsem_trylock(sem, reader))
> +		return 1;
> +
> +	list_del_init(&wq_entry->entry);
> +	smp_store_release(&wq_entry->private, NULL);
> +
> +	wake_up_process(p);
> +	put_task_struct(p);
> +
> +	return !reader; /* wake (readers until) 1 writer */
> +}

Maybe, this is not a subject of this patchset. But since this is a newborn function,
can we introduce it to save one unneeded wake_up of writer? This is a situation,
when writer becomes woken up just to write itself into sem->writer.task.

Something like below:

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index a136677543b4..e4f88bfd43ed 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -9,6 +9,8 @@
 #include <linux/sched/task.h>
 #include <linux/errno.h>
 
+static bool readers_active_check(struct percpu_rw_semaphore *sem);
+
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *key)
 {
@@ -101,6 +103,16 @@ static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
 	return __percpu_down_write_trylock(sem);
 }
 
+static void queue_sem_writer(struct percpu_rw_semaphore *sem, struct task_struct *p)
+{
+	rcu_assign_pointer(sem->writer.task, p);
+	smp_mb();
+	if (readers_active_check(sem)) {
+		WRITE_ONCE(sem->writer.task, NULL);
+		wake_up_process(p);
+	}
+}
+
 /*
  * The return value of wait_queue_entry::func means:
  *
@@ -129,7 +141,11 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
 	list_del_init(&wq_entry->entry);
 	smp_store_release(&wq_entry->private, NULL);
 
-	wake_up_process(p);
+	if (reader || readers_active_check(sem))
+		wake_up_process(p);
+	else
+		queue_sem_writer(sem, p);
+
 	put_task_struct(p);
 
 	return !reader; /* wake (readers until) 1 writer */
@@ -247,8 +263,11 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
 	 * them.
 	 */
 
-	/* Wait for all active readers to complete. */
-	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
+	if (rcu_access_pointer(sem->writer.task))
+		WRITE_ONCE(sem->writer.task, NULL);
+	else
+		/* Wait for all active readers to complete. */
+		rcuwait_wait_event(&sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
 
Just an idea, completely untested.

Kirill


* Re: [PATCH -v2 0/7] locking: Percpu-rwsem rewrite
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
                   ` (9 preceding siblings ...)
  2020-02-03 10:51 ` Sebastian Andrzej Siewior
@ 2020-02-03 12:25 ` Juri Lelli
  2020-02-03 18:08 ` Will Deacon
  11 siblings, 0 replies; 29+ messages in thread
From: Juri Lelli @ 2020-02-03 12:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, williams,
	bristot, longman, dave, jack

Hi,

On 31/01/20 16:07, Peter Zijlstra wrote:
> Hi all,
> 
> This is the long awaited report of the percpu-rwsem rework (sorry Juri).
> 
> IIRC (I really have trouble keeping up momentum on this series) I've addressed
> all previous comments by Oleg and Davidlohr and Waiman and hope we can stick
> this in tip/locking/core for inclusion in the next merge.
> 
> It has been cooked (thoroughly) in PREEMPT_RT, and not found wanting.
> 
> Any objections to me stuffing it in so we can all forget about it properly?

FWIW, backported and tested again on a downstream PREEMPT_RT kernel.
locktorture didn't find any problems and latencies look good.

Tested-by: Juri Lelli <juri.lelli@redhat.com>

Thanks!

Juri



* Re: [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem
  2020-02-03 11:45   ` Kirill Tkhai
@ 2020-02-03 13:44     ` Peter Zijlstra
  2020-02-03 14:33       ` Kirill Tkhai
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-02-03 13:44 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
	williams, bristot, longman, dave, jack

Hi Kirill,

On Mon, Feb 03, 2020 at 02:45:16PM +0300, Kirill Tkhai wrote:

> Maybe, this is not a subject of this patchset. But since this is a newborn function,
> can we introduce it to save one unneeded wake_up of writer? This is a situation,
> when writer becomes woken up just to write itself into sem->writer.task.
> 
> Something like below:
> 
> diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
> index a136677543b4..e4f88bfd43ed 100644
> --- a/kernel/locking/percpu-rwsem.c
> +++ b/kernel/locking/percpu-rwsem.c
> @@ -9,6 +9,8 @@
>  #include <linux/sched/task.h>
>  #include <linux/errno.h>
>  
> +static bool readers_active_check(struct percpu_rw_semaphore *sem);
> +
>  int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
>  			const char *name, struct lock_class_key *key)
>  {
> @@ -101,6 +103,16 @@ static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
>  	return __percpu_down_write_trylock(sem);
>  }
>  
> +static void queue_sem_writer(struct percpu_rw_semaphore *sem, struct task_struct *p)
> +{
> +	rcu_assign_pointer(sem->writer.task, p);
> +	smp_mb();
> +	if (readers_active_check(sem)) {
> +		WRITE_ONCE(sem->writer.task, NULL);
> +		wake_up_process(p);
> +	}
> +}
> +
>  /*
>   * The return value of wait_queue_entry::func means:
>   *
> @@ -129,7 +141,11 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
>  	list_del_init(&wq_entry->entry);
>  	smp_store_release(&wq_entry->private, NULL);
>  
> -	wake_up_process(p);
> +	if (reader || readers_active_check(sem))
> +		wake_up_process(p);
> +	else
> +		queue_sem_writer(sem, p);
> +
>  	put_task_struct(p);
>  
>  	return !reader; /* wake (readers until) 1 writer */
> @@ -247,8 +263,11 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
>  	 * them.
>  	 */
>  
> -	/* Wait for all active readers to complete. */
> -	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
> +	if (rcu_access_pointer(sem->writer.task))
> +		WRITE_ONCE(sem->writer.task, NULL);
> +	else
> +		/* Wait for all active readers to complete. */
> +		rcuwait_wait_event(&sem->writer, readers_active_check(sem));
>  }
>  EXPORT_SYMBOL_GPL(percpu_down_write);
>  
> Just an idea, completely untested.

Hurm,.. I think I see what you're proposing. I also think your immediate
patch is racy, consider for example what happens if your
queue_sem_writer() finds !readers_active_check(), such that we do in
fact need to wait. Then your percpu_down_write() will find
sem->writer.task and clear it -- no waiting.

Also, I'm not going to hold up these patches for this, we can always do
this on top.

Still, let me consider this a little more.
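
Spelled out against the patch quoted above, the interleaving in question
(a sketch):

  /*
   * wake function (waking writer W)        percpu_down_write() (writer W)
   * -------------------------------        ------------------------------
   * __percpu_down_write_trylock()
   *   succeeds on behalf of W
   * wq_entry->private = NULL
   * queue_sem_writer():
   *   sem->writer.task = W
   *   readers_active_check() == false
   *     -> no wakeup, W is supposed to
   *        wait for the remaining readers
   *                                        percpu_rwsem_wait() sees
   *                                        wq_entry.private == NULL and returns
   *                                        rcu_access_pointer(sem->writer.task)
   *                                          -> WRITE_ONCE(writer.task, NULL)
   *                                          -> rcuwait_wait_event() is skipped;
   *                                             W proceeds while readers are
   *                                             still in their critical sections
   */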


* Re: [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem
  2020-01-31 15:07 ` [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
  2020-02-03 11:45   ` Kirill Tkhai
@ 2020-02-03 14:20   ` Christoph Hellwig
  2020-02-03 15:09     ` Peter Zijlstra
  2020-02-04  9:24   ` [PATCH -v2-mkII 5/7] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
  2 siblings, 1 reply; 29+ messages in thread
From: Christoph Hellwig @ 2020-02-03 14:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
	williams, bristot, longman, dave, jack

On Fri, Jan 31, 2020 at 04:07:08PM +0100, Peter Zijlstra wrote:
> @@ -53,12 +53,6 @@ struct rw_semaphore {
>  #endif
>  };
>  
> -/*
> - * Setting all bits of the owner field except bit 0 will indicate
> - * that the rwsem is writer-owned with an unknown owner.
> - */
> -#define RWSEM_OWNER_UNKNOWN	(-2L)

Can you split the removal of the non-owned rwsem support into a separate
patch?  I still think keeping this one and moving aio to that scheme is
a better idea than the current ad-hoc locking scheme that has all kinds
of issues.


* Re: [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem
  2020-02-03 13:44     ` Peter Zijlstra
@ 2020-02-03 14:33       ` Kirill Tkhai
  0 siblings, 0 replies; 29+ messages in thread
From: Kirill Tkhai @ 2020-02-03 14:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
	williams, bristot, longman, dave, jack

On 03.02.2020 16:44, Peter Zijlstra wrote:
> Hi Kirill,
> 
> On Mon, Feb 03, 2020 at 02:45:16PM +0300, Kirill Tkhai wrote:
> 
>> Maybe, this is not a subject of this patchset. But since this is a newborn function,
>> can we introduce it to save one unneeded wake_up of writer? This is a situation,
>> when writer becomes woken up just to write itself into sem->writer.task.
>>
>> Something like below:
>>
>> diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
>> index a136677543b4..e4f88bfd43ed 100644
>> --- a/kernel/locking/percpu-rwsem.c
>> +++ b/kernel/locking/percpu-rwsem.c
>> @@ -9,6 +9,8 @@
>>  #include <linux/sched/task.h>
>>  #include <linux/errno.h>
>>  
>> +static bool readers_active_check(struct percpu_rw_semaphore *sem);
>> +
>>  int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
>>  			const char *name, struct lock_class_key *key)
>>  {
>> @@ -101,6 +103,16 @@ static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
>>  	return __percpu_down_write_trylock(sem);
>>  }
>>  
>> +static void queue_sem_writer(struct percpu_rw_semaphore *sem, struct task_struct *p)
>> +{
>> +	rcu_assign_pointer(sem->writer.task, p);
>> +	smp_mb();
>> +	if (readers_active_check(sem)) {
>> +		WRITE_ONCE(sem->writer.task, NULL);
>> +		wake_up_process(p);
>> +	}
>> +}
>> +
>>  /*
>>   * The return value of wait_queue_entry::func means:
>>   *
>> @@ -129,7 +141,11 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
>>  	list_del_init(&wq_entry->entry);
>>  	smp_store_release(&wq_entry->private, NULL);
>>  
>> -	wake_up_process(p);
>> +	if (reader || readers_active_check(sem))
>> +		wake_up_process(p);
>> +	else
>> +		queue_sem_writer(sem, p);
>> +
>>  	put_task_struct(p);
>>  
>>  	return !reader; /* wake (readers until) 1 writer */
>> @@ -247,8 +263,11 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
>>  	 * them.
>>  	 */
>>  
>> -	/* Wait for all active readers to complete. */
>> -	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
>> +	if (rcu_access_pointer(sem->writer.task))
>> +		WRITE_ONCE(sem->writer.task, NULL);
>> +	else
>> +		/* Wait for all active readers to complete. */
>> +		rcuwait_wait_event(&sem->writer, readers_active_check(sem));
>>  }
>>  EXPORT_SYMBOL_GPL(percpu_down_write);
>>  
>> Just an idea, completely untested.
> 
> Hurm... I think I see what you're proposing. I also think your immediate
> patch is racy; consider, for example, what happens if your
> queue_sem_writer() finds !readers_active_check(), such that we do in
> fact need to wait. Then your percpu_down_write() will find
> sem->writer.task and clear it -- no waiting.

You mean that down_read() wakes up waiters unconditionally, so the
optimization in percpu_down_write() will skip the readers_active_check()
check.

You are right. Then we have to modify this a little and remove the
optimization from percpu_down_write():

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index a136677543b4..90647ab28804 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -9,6 +9,8 @@
 #include <linux/sched/task.h>
 #include <linux/errno.h>
 
+static bool readers_active_check(struct percpu_rw_semaphore *sem);
+
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *key)
 {
@@ -101,6 +103,16 @@ static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
 	return __percpu_down_write_trylock(sem);
 }
 
+static void queue_sem_writer(struct percpu_rw_semaphore *sem, struct task_struct *p)
+{
+	rcu_assign_pointer(sem->writer.task, p);
+	smp_mb();
+	if (readers_active_check(sem)) {
+		WRITE_ONCE(sem->writer.task, NULL);
+		wake_up_process(p);
+	}
+}
+
 /*
  * The return value of wait_queue_entry::func means:
  *
@@ -129,7 +141,11 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
 	list_del_init(&wq_entry->entry);
 	smp_store_release(&wq_entry->private, NULL);
 
-	wake_up_process(p);
+	if (reader || readers_active_check(sem))
+		wake_up_process(p);
+	else
+		queue_sem_writer(sem, p);
+
 	put_task_struct(p);
 
 	return !reader; /* wake (readers until) 1 writer */
@@ -248,6 +264,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
 	 */
 
 	/* Wait for all active readers to complete. */
+	/* sem->writer is NULL or points to current */
 	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);

> Also, I'm not going to hold up these patches for this; we can always do
> this on top.
> 
> Still, let me consider this a little more.

No problem, this is just an idea.

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem
  2020-02-03 14:20   ` Christoph Hellwig
@ 2020-02-03 15:09     ` Peter Zijlstra
  2020-02-03 17:48       ` Christoph Hellwig
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-02-03 15:09 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
	williams, bristot, longman, dave, jack

On Mon, Feb 03, 2020 at 06:20:50AM -0800, Christoph Hellwig wrote:
> On Fri, Jan 31, 2020 at 04:07:08PM +0100, Peter Zijlstra wrote:
> > @@ -53,12 +53,6 @@ struct rw_semaphore {
> >  #endif
> >  };
> >  
> > -/*
> > - * Setting all bits of the owner field except bit 0 will indicate
> > - * that the rwsem is writer-owned with an unknown owner.
> > - */
> > -#define RWSEM_OWNER_UNKNOWN	(-2L)
> 
> Can you split the removal of the non-owned rwsem support into a separate
> patch?  I still think keeping this one and moving aio to that scheme is
> a better idea than the current ad-hoc locking scheme that has all kinds
> of issues.

That's basically 2 lines of code and a comment; surely we can resurrect
that if/when it's needed again?

---
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 00d6054687dd..8a418d9eeb7a 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -53,12 +53,6 @@ struct rw_semaphore {
 #endif
 };
 
-/*
- * Setting all bits of the owner field except bit 0 will indicate
- * that the rwsem is writer-owned with an unknown owner.
- */
-#define RWSEM_OWNER_UNKNOWN	(-2L)
-
 /* In all implementations count != 0 means locked */
 static inline int rwsem_is_locked(struct rw_semaphore *sem)
 {
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 44e68761f432..5449ab33f89e 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -660,8 +659,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
 	unsigned long flags;
 	bool ret = true;
 
-	BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
-
 	if (need_resched()) {
 		lockevent_inc(rwsem_opt_fail);
 		return false;



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem
  2020-02-03 15:09     ` Peter Zijlstra
@ 2020-02-03 17:48       ` Christoph Hellwig
  2020-02-04  8:50         ` Peter Zijlstra
  0 siblings, 1 reply; 29+ messages in thread
From: Christoph Hellwig @ 2020-02-03 17:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Hellwig, mingo, will, oleg, tglx, linux-kernel,
	bigeasy, juri.lelli, williams, bristot, longman, dave, jack

On Mon, Feb 03, 2020 at 04:09:33PM +0100, Peter Zijlstra wrote:
> > Can you split the removal of the non-owned rwsem support into a separate
> > patch?  I still think keeping this one and moving aio to that scheme is
> > a better idea than the current ad-hoc locking scheme that has all kinds
> > of issues.
> 
> That's basically 2 lines of code and a comment; surely we can resurrect
> that if/when it's needed again?

Sure, I could.  But then you'd still need to update your commit log for
this patch explaining why it includes unrelated changes to a different
subsystem.  By splitting it you also document your changes much better.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -v2 0/7] locking: Percpu-rwsem rewrite
  2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
                   ` (10 preceding siblings ...)
  2020-02-03 12:25 ` Juri Lelli
@ 2020-02-03 18:08 ` Will Deacon
  11 siblings, 0 replies; 29+ messages in thread
From: Will Deacon @ 2020-02-03 18:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams,
	bristot, longman, dave, jack

On Fri, Jan 31, 2020 at 04:07:03PM +0100, Peter Zijlstra wrote:
> This is the long awaited report of the percpu-rwsem rework (sorry Juri).
> 
> IIRC (I really have trouble keeping up momentum on this series) I've addressed
> all previous comments by Oleg and Davidlohr and Waiman and hope we can stick
> this in tip/locking/core for inclusion in the next merge.
> 
> It has been cooked (thoroughly) in PREEMPT_RT, and not found wanting.
> 
> Any objections to me stuffing it in so we can all forget about it properly?

Whole series looks fine to me and it also passes my arm64 build tests, so:

Acked-by: Will Deacon <will@kernel.org>

Will

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem
  2020-02-03 17:48       ` Christoph Hellwig
@ 2020-02-04  8:50         ` Peter Zijlstra
  2020-02-04  9:22           ` [PATCH] locking/rwsem: Remove RWSEM_OWNER_UNKNOWN Peter Zijlstra
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-02-04  8:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
	williams, bristot, longman, dave, jack

On Mon, Feb 03, 2020 at 09:48:31AM -0800, Christoph Hellwig wrote:
> On Mon, Feb 03, 2020 at 04:09:33PM +0100, Peter Zijlstra wrote:
> > > Can you split the removal of the non-owned rwsem support into a separate
> > > patch?  I still think keeping this one and moving aio to that scheme is
> > > a better idea than the current ad-hoc locking scheme that has all kinds
> > > of issues.
> > 
> > That's basically 2 lines of code and a comment; surely we can resurrect
> > that if/when it's needed again?
> 
> Sure, I could.  But then you'd still need to update your commit log for
> this patch explaining why it includes unrelated changes to a different
> subsystem.  By splitting it you also document your changes much better.

I really don't see the argument; the diffstat is:

include/linux/percpu-rwsem.h  |   19 +----
include/linux/rwsem.h         |    6 -
include/linux/wait.h          |    1
kernel/locking/percpu-rwsem.c |  153 ++++++++++++++++++++++++++++++------------
kernel/locking/rwsem.c        |   11 +--
kernel/locking/rwsem.h        |   12 ---

There is no unrelated subsystem; it's all locking (well, strictly
speaking wait.h is sched, but that one flag is actually mentioned in the
Changelog).

Also, cleaning up unused bits is quite common without mention.

Anyway, I'll go split, since you seem to care so deeply.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH] locking/rwsem: Remove RWSEM_OWNER_UNKNOWN
  2020-02-04  8:50         ` Peter Zijlstra
@ 2020-02-04  9:22           ` Peter Zijlstra
  2020-02-11 12:48             ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-02-04  9:22 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
	williams, bristot, longman, dave, jack

On Tue, Feb 04, 2020 at 09:50:49AM +0100, Peter Zijlstra wrote:

> Anyway, I'll go split, since you seem to care so deeply.

---

Subject: locking/rwsem: Remove RWSEM_OWNER_UNKNOWN
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue Feb  4 09:34:37 CET 2020

Remove the now unused RWSEM_OWNER_UNKNOWN hack. This hack breaks
PREEMPT_RT and getting rid of it was the entire motivation for
re-writing the percpu rwsem.

The biggest problem is that it is fundamentally incompatible with any
form of Priority Inheritance, any exclusively held lock must have a
distinct owner.

Requested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rwsem.h  |    6 ------
 kernel/locking/rwsem.c |    2 --
 2 files changed, 8 deletions(-)

--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -53,12 +53,6 @@ struct rw_semaphore {
 #endif
 };
 
-/*
- * Setting all bits of the owner field except bit 0 will indicate
- * that the rwsem is writer-owned with an unknown owner.
- */
-#define RWSEM_OWNER_UNKNOWN	(-2L)
-
 /* In all implementations count != 0 means locked */
 static inline int rwsem_is_locked(struct rw_semaphore *sem)
 {
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -659,8 +659,6 @@ static inline bool rwsem_can_spin_on_own
 	unsigned long flags;
 	bool ret = true;
 
-	BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
-
 	if (need_resched()) {
 		lockevent_inc(rwsem_opt_fail);
 		return false;

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH -v2-mkII 5/7] locking/percpu-rwsem: Remove the embedded rwsem
  2020-01-31 15:07 ` [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
  2020-02-03 11:45   ` Kirill Tkhai
  2020-02-03 14:20   ` Christoph Hellwig
@ 2020-02-04  9:24   ` Peter Zijlstra
  2020-02-11 12:48     ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
  2 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-02-04  9:24 UTC (permalink / raw)
  To: mingo, will
  Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
	longman, dave, jack

Subject: locking/percpu-rwsem: Remove the embedded rwsem
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed Oct 30 20:30:41 CET 2019

The filesystem freezer uses percpu-rwsem in a way that is effectively
write_non_owner() and achieves this with a few horrible hacks that
rely on the rwsem (!percpu) implementation.

When PREEMPT_RT replaces the rwsem implementation with a PI aware
variant this comes apart.

Remove the embedded rwsem and implement it using a waitqueue and an
atomic_t.

 - make readers_block an atomic, and use it, with the waitqueue
   for a blocking test-and-set write-side.

 - have the read-side wait for the 'lock' state to clear.

Have the waiters use FIFO queueing and mark them (reader/writer) with
a new WQ_FLAG. Use a custom wake_function to wake either a single
writer or all readers until a writer.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
---

Moved the RWSEM_OWNER_UNKNOWN removal into its own patch.
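
(Aside, since the write-side exclusion below is new: it is nothing more
than a test-test-and-set on sem->block. A rough userspace sketch of the
idiom using C11 atomics -- the names are made up for the illustration,
this is not the kernel code:)

#include <stdatomic.h>
#include <stdbool.h>

/*
 * Model of __percpu_down_write_trylock(): do a plain load first so that
 * contending writers do not keep bouncing the cacheline with failed
 * exchanges; only when the flag looks clear attempt the test-and-set.
 */
static bool write_trylock(atomic_int *block)
{
	if (atomic_load(block))			/* cheap test */
		return false;

	return atomic_exchange(block, 1) == 0;	/* test-and-set */
}

static void write_unlock(atomic_int *block)
{
	/* release; pairs with the acquire in the reader fast path */
	atomic_store_explicit(block, 0, memory_order_release);
}

int main(void)
{
	static atomic_int block;		/* 0 == unlocked */

	if (write_trylock(&block)) {
		/* ... exclusive section ... */
		write_unlock(&block);
	}
	return 0;
}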

 include/linux/percpu-rwsem.h  |   19 +----
 include/linux/wait.h          |    1 
 kernel/locking/percpu-rwsem.c |  153 ++++++++++++++++++++++++++++++------------
 kernel/locking/rwsem.c        |    9 +-
 kernel/locking/rwsem.h        |   12 ---
 5 files changed, 123 insertions(+), 71 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -3,18 +3,18 @@
 #define _LINUX_PERCPU_RWSEM_H
 
 #include <linux/atomic.h>
-#include <linux/rwsem.h>
 #include <linux/percpu.h>
 #include <linux/rcuwait.h>
+#include <linux/wait.h>
 #include <linux/rcu_sync.h>
 #include <linux/lockdep.h>
 
 struct percpu_rw_semaphore {
 	struct rcu_sync		rss;
 	unsigned int __percpu	*read_count;
-	struct rw_semaphore	rw_sem; /* slowpath */
-	struct rcuwait          writer; /* blocked writer */
-	int			readers_block;
+	struct rcuwait		writer;
+	wait_queue_head_t	waiters;
+	atomic_t		block;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
 #endif
@@ -31,8 +31,9 @@ static DEFINE_PER_CPU(unsigned int, __pe
 is_static struct percpu_rw_semaphore name = {				\
 	.rss = __RCU_SYNC_INITIALIZER(name.rss),			\
 	.read_count = &__percpu_rwsem_rc_##name,			\
-	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
 	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
+	.waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),		\
+	.block = ATOMIC_INIT(0),					\
 	__PERCPU_RWSEM_DEP_MAP_INIT(name)				\
 }
 
@@ -130,20 +131,12 @@ static inline void percpu_rwsem_release(
 					bool read, unsigned long ip)
 {
 	lock_release(&sem->dep_map, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	if (!read)
-		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
-#endif
 }
 
 static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
 	lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	if (!read)
-		atomic_long_set(&sem->rw_sem.owner, (long)current);
-#endif
 }
 
 #endif
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -20,6 +20,7 @@ int default_wake_function(struct wait_qu
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
 #define WQ_FLAG_BOOKMARK	0x04
+#define WQ_FLAG_CUSTOM		0x08
 
 /*
  * A single wait-queue entry structure:
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,15 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/atomic.h>
-#include <linux/rwsem.h>
 #include <linux/percpu.h>
+#include <linux/wait.h>
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/errno.h>
 
-#include "rwsem.h"
-
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *key)
 {
@@ -17,11 +16,10 @@ int __percpu_init_rwsem(struct percpu_rw
 	if (unlikely(!sem->read_count))
 		return -ENOMEM;
 
-	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss);
-	init_rwsem(&sem->rw_sem);
 	rcuwait_init(&sem->writer);
-	sem->readers_block = 0;
+	init_waitqueue_head(&sem->waiters);
+	atomic_set(&sem->block, 0);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
 	lockdep_init_map(&sem->dep_map, name, key, 0);
@@ -54,23 +52,23 @@ static bool __percpu_down_read_trylock(s
 	 * the same CPU as the increment, avoiding the
 	 * increment-on-one-CPU-and-decrement-on-another problem.
 	 *
-	 * If the reader misses the writer's assignment of readers_block, then
-	 * the writer is guaranteed to see the reader's increment.
+	 * If the reader misses the writer's assignment of sem->block, then the
+	 * writer is guaranteed to see the reader's increment.
 	 *
 	 * Conversely, any readers that increment their sem->read_count after
-	 * the writer looks are guaranteed to see the readers_block value,
-	 * which in turn means that they are guaranteed to immediately
-	 * decrement their sem->read_count, so that it doesn't matter that the
-	 * writer missed them.
+	 * the writer looks are guaranteed to see the sem->block value, which
+	 * in turn means that they are guaranteed to immediately decrement
+	 * their sem->read_count, so that it doesn't matter that the writer
+	 * missed them.
 	 */
 
 	smp_mb(); /* A matches D */
 
 	/*
-	 * If !readers_block the critical section starts here, matched by the
+	 * If !sem->block the critical section starts here, matched by the
 	 * release in percpu_up_write().
 	 */
-	if (likely(!smp_load_acquire(&sem->readers_block)))
+	if (likely(!atomic_read_acquire(&sem->block)))
 		return true;
 
 	__this_cpu_dec(*sem->read_count);
@@ -81,6 +79,88 @@ static bool __percpu_down_read_trylock(s
 	return false;
 }
 
+static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
+{
+	if (atomic_read(&sem->block))
+		return false;
+
+	return atomic_xchg(&sem->block, 1) == 0;
+}
+
+static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
+{
+	if (reader) {
+		bool ret;
+
+		preempt_disable();
+		ret = __percpu_down_read_trylock(sem);
+		preempt_enable();
+
+		return ret;
+	}
+	return __percpu_down_write_trylock(sem);
+}
+
+/*
+ * The return value of wait_queue_entry::func means:
+ *
+ *  <0 - error, wakeup is terminated and the error is returned
+ *   0 - no wakeup, a next waiter is tried
+ *  >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
+ *
+ * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
+ * and play games with the return value to allow waking multiple readers.
+ *
+ * Specifically, we wake readers until we've woken a single writer, or until a
+ * trylock fails.
+ */
+static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
+				      unsigned int mode, int wake_flags,
+				      void *key)
+{
+	struct task_struct *p = get_task_struct(wq_entry->private);
+	bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
+	struct percpu_rw_semaphore *sem = key;
+
+	/* concurrent against percpu_down_write(), can get stolen */
+	if (!__percpu_rwsem_trylock(sem, reader))
+		return 1;
+
+	list_del_init(&wq_entry->entry);
+	smp_store_release(&wq_entry->private, NULL);
+
+	wake_up_process(p);
+	put_task_struct(p);
+
+	return !reader; /* wake (readers until) 1 writer */
+}
+
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
+{
+	DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
+	bool wait;
+
+	spin_lock_irq(&sem->waiters.lock);
+	/*
+	 * Serialize against the wakeup in percpu_up_write(), if we fail
+	 * the trylock, the wakeup must see us on the list.
+	 */
+	wait = !__percpu_rwsem_trylock(sem, reader);
+	if (wait) {
+		wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
+		__add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
+	}
+	spin_unlock_irq(&sem->waiters.lock);
+
+	while (wait) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!smp_load_acquire(&wq_entry.private))
+			break;
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+}
+
 bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 {
 	if (__percpu_down_read_trylock(sem))
@@ -89,20 +169,10 @@ bool __percpu_down_read(struct percpu_rw
 	if (try)
 		return false;
 
-	/*
-	 * We either call schedule() in the wait, or we'll fall through
-	 * and reschedule on the preempt_enable() in percpu_down_read().
-	 */
-	preempt_enable_no_resched();
-
-	/*
-	 * Avoid lockdep for the down/up_read() we already have them.
-	 */
-	__down_read(&sem->rw_sem);
-	this_cpu_inc(*sem->read_count);
-	__up_read(&sem->rw_sem);
-
+	preempt_enable();
+	percpu_rwsem_wait(sem, /* .reader = */ true);
 	preempt_disable();
+
 	return true;
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
@@ -117,7 +187,7 @@ void __percpu_up_read(struct percpu_rw_s
 	 */
 	__this_cpu_dec(*sem->read_count);
 
-	/* Prod writer to recheck readers_active */
+	/* Prod writer to re-evaluate readers_active_check() */
 	rcuwait_wake_up(&sem->writer);
 }
 EXPORT_SYMBOL_GPL(__percpu_up_read);
@@ -137,6 +207,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
  * zero.  If this sum is zero, then it is stable due to the fact that if any
  * newly arriving readers increment a given counter, they will immediately
  * decrement that same counter.
+ *
+ * Assumes sem->block is set.
  */
 static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
@@ -160,23 +232,22 @@ void percpu_down_write(struct percpu_rw_
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
 
-	__down_write(&sem->rw_sem);
-
 	/*
-	 * Notify new readers to block; up until now, and thus throughout the
-	 * longish rcu_sync_enter() above, new readers could still come in.
+	 * Try set sem->block; this provides writer-writer exclusion.
+	 * Having sem->block set makes new readers block.
 	 */
-	WRITE_ONCE(sem->readers_block, 1);
+	if (!__percpu_down_write_trylock(sem))
+		percpu_rwsem_wait(sem, /* .reader = */ false);
 
-	smp_mb(); /* D matches A */
+	/* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
 
 	/*
-	 * If they don't see our writer of readers_block, then we are
-	 * guaranteed to see their sem->read_count increment, and therefore
-	 * will wait for them.
+	 * If they don't see our store of sem->block, then we are guaranteed to
+	 * see their sem->read_count increment, and therefore will wait for
+	 * them.
 	 */
 
-	/* Wait for all now active readers to complete. */
+	/* Wait for all active readers to complete. */
 	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
@@ -195,12 +266,12 @@ void percpu_up_write(struct percpu_rw_se
 	 * Therefore we force it through the slow path which guarantees an
 	 * acquire and thereby guarantees the critical section's consistency.
 	 */
-	smp_store_release(&sem->readers_block, 0);
+	atomic_set_release(&sem->block, 0);
 
 	/*
-	 * Release the write lock, this will allow readers back in the game.
+	 * Prod any pending reader/writer to make progress.
 	 */
-	__up_write(&sem->rw_sem);
+	__wake_up(&sem->waiters, TASK_NORMAL, 1, sem);
 
 	/*
 	 * Once this completes (at least one RCU-sched grace period hence) the
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -28,7 +28,6 @@
 #include <linux/rwsem.h>
 #include <linux/atomic.h>
 
-#include "rwsem.h"
 #include "lock_events.h"
 
 /*
@@ -1338,7 +1337,7 @@ static struct rw_semaphore *rwsem_downgr
 /*
  * lock for reading
  */
-inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct rw_semaphore *sem)
 {
 	if (!rwsem_read_trylock(sem)) {
 		rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
@@ -1383,7 +1382,7 @@ static inline int __down_read_trylock(st
 /*
  * lock for writing
  */
-inline void __down_write(struct rw_semaphore *sem)
+static inline void __down_write(struct rw_semaphore *sem)
 {
 	long tmp = RWSEM_UNLOCKED_VALUE;
 
@@ -1426,7 +1425,7 @@ static inline int __down_write_trylock(s
 /*
  * unlock after reading
  */
-inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct rw_semaphore *sem)
 {
 	long tmp;
 
@@ -1446,7 +1445,7 @@ inline void __up_read(struct rw_semaphor
 /*
  * unlock after writing
  */
-inline void __up_write(struct rw_semaphore *sem)
+static inline void __up_write(struct rw_semaphore *sem)
 {
 	long tmp;
 
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __INTERNAL_RWSEM_H
-#define __INTERNAL_RWSEM_H
-#include <linux/rwsem.h>
-
-extern void __down_read(struct rw_semaphore *sem);
-extern void __up_read(struct rw_semaphore *sem);
-extern void __down_write(struct rw_semaphore *sem);
-extern void __up_write(struct rw_semaphore *sem);
-
-#endif /* __INTERNAL_RWSEM_H */

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [tip: locking/core] locking/percpu-rwsem: Remove the embedded rwsem
  2020-02-04  9:24   ` [PATCH -v2-mkII 5/7] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
@ 2020-02-11 12:48     ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-02-11 12:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Ingo Molnar, Davidlohr Bueso, Will Deacon, Waiman Long,
	Juri Lelli, x86, LKML

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     7f26482a872c36b2ee87ea95b9dcd96e3d5805df
Gitweb:        https://git.kernel.org/tip/7f26482a872c36b2ee87ea95b9dcd96e3d5805df
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 30 Oct 2019 20:30:41 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 11 Feb 2020 13:10:56 +01:00

locking/percpu-rwsem: Remove the embedded rwsem

The filesystem freezer uses percpu-rwsem in a way that is effectively
write_non_owner() and achieves this with a few horrible hacks that
rely on the rwsem (!percpu) implementation.

When PREEMPT_RT replaces the rwsem implementation with a PI aware
variant this comes apart.

Remove the embedded rwsem and implement it using a waitqueue and an
atomic_t.

 - make readers_block an atomic, and use it, with the waitqueue
   for a blocking test-and-set write-side.

 - have the read-side wait for the 'lock' state to clear.

Have the waiters use FIFO queueing and mark them (reader/writer) with
a new WQ_FLAG. Use a custom wake_function to wake either a single
writer or all readers until a writer.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200204092403.GB14879@hirez.programming.kicks-ass.net
---
 include/linux/percpu-rwsem.h  |  19 +---
 include/linux/wait.h          |   1 +-
 kernel/locking/percpu-rwsem.c | 153 ++++++++++++++++++++++++---------
 kernel/locking/rwsem.c        |   9 +--
 kernel/locking/rwsem.h        |  12 +---
 5 files changed, 123 insertions(+), 71 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index bb5b71c..f5ecf6a 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -3,18 +3,18 @@
 #define _LINUX_PERCPU_RWSEM_H
 
 #include <linux/atomic.h>
-#include <linux/rwsem.h>
 #include <linux/percpu.h>
 #include <linux/rcuwait.h>
+#include <linux/wait.h>
 #include <linux/rcu_sync.h>
 #include <linux/lockdep.h>
 
 struct percpu_rw_semaphore {
 	struct rcu_sync		rss;
 	unsigned int __percpu	*read_count;
-	struct rw_semaphore	rw_sem; /* slowpath */
-	struct rcuwait          writer; /* blocked writer */
-	int			readers_block;
+	struct rcuwait		writer;
+	wait_queue_head_t	waiters;
+	atomic_t		block;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
 #endif
@@ -31,8 +31,9 @@ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);		\
 is_static struct percpu_rw_semaphore name = {				\
 	.rss = __RCU_SYNC_INITIALIZER(name.rss),			\
 	.read_count = &__percpu_rwsem_rc_##name,			\
-	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
 	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
+	.waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),		\
+	.block = ATOMIC_INIT(0),					\
 	__PERCPU_RWSEM_DEP_MAP_INIT(name)				\
 }
 
@@ -130,20 +131,12 @@ static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
 	lock_release(&sem->dep_map, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	if (!read)
-		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
-#endif
 }
 
 static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
 	lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	if (!read)
-		atomic_long_set(&sem->rw_sem.owner, (long)current);
-#endif
 }
 
 #endif
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3283c8d..feeb6be 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -20,6 +20,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int 
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
 #define WQ_FLAG_BOOKMARK	0x04
+#define WQ_FLAG_CUSTOM		0x08
 
 /*
  * A single wait-queue entry structure:
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index b155e8e..a136677 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,15 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/atomic.h>
-#include <linux/rwsem.h>
 #include <linux/percpu.h>
+#include <linux/wait.h>
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/errno.h>
 
-#include "rwsem.h"
-
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *key)
 {
@@ -17,11 +16,10 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 	if (unlikely(!sem->read_count))
 		return -ENOMEM;
 
-	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss);
-	init_rwsem(&sem->rw_sem);
 	rcuwait_init(&sem->writer);
-	sem->readers_block = 0;
+	init_waitqueue_head(&sem->waiters);
+	atomic_set(&sem->block, 0);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
 	lockdep_init_map(&sem->dep_map, name, key, 0);
@@ -54,23 +52,23 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 	 * the same CPU as the increment, avoiding the
 	 * increment-on-one-CPU-and-decrement-on-another problem.
 	 *
-	 * If the reader misses the writer's assignment of readers_block, then
-	 * the writer is guaranteed to see the reader's increment.
+	 * If the reader misses the writer's assignment of sem->block, then the
+	 * writer is guaranteed to see the reader's increment.
 	 *
 	 * Conversely, any readers that increment their sem->read_count after
-	 * the writer looks are guaranteed to see the readers_block value,
-	 * which in turn means that they are guaranteed to immediately
-	 * decrement their sem->read_count, so that it doesn't matter that the
-	 * writer missed them.
+	 * the writer looks are guaranteed to see the sem->block value, which
+	 * in turn means that they are guaranteed to immediately decrement
+	 * their sem->read_count, so that it doesn't matter that the writer
+	 * missed them.
 	 */
 
 	smp_mb(); /* A matches D */
 
 	/*
-	 * If !readers_block the critical section starts here, matched by the
+	 * If !sem->block the critical section starts here, matched by the
 	 * release in percpu_up_write().
 	 */
-	if (likely(!smp_load_acquire(&sem->readers_block)))
+	if (likely(!atomic_read_acquire(&sem->block)))
 		return true;
 
 	__this_cpu_dec(*sem->read_count);
@@ -81,6 +79,88 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 	return false;
 }
 
+static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
+{
+	if (atomic_read(&sem->block))
+		return false;
+
+	return atomic_xchg(&sem->block, 1) == 0;
+}
+
+static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
+{
+	if (reader) {
+		bool ret;
+
+		preempt_disable();
+		ret = __percpu_down_read_trylock(sem);
+		preempt_enable();
+
+		return ret;
+	}
+	return __percpu_down_write_trylock(sem);
+}
+
+/*
+ * The return value of wait_queue_entry::func means:
+ *
+ *  <0 - error, wakeup is terminated and the error is returned
+ *   0 - no wakeup, a next waiter is tried
+ *  >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
+ *
+ * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
+ * and play games with the return value to allow waking multiple readers.
+ *
+ * Specifically, we wake readers until we've woken a single writer, or until a
+ * trylock fails.
+ */
+static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
+				      unsigned int mode, int wake_flags,
+				      void *key)
+{
+	struct task_struct *p = get_task_struct(wq_entry->private);
+	bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
+	struct percpu_rw_semaphore *sem = key;
+
+	/* concurrent against percpu_down_write(), can get stolen */
+	if (!__percpu_rwsem_trylock(sem, reader))
+		return 1;
+
+	list_del_init(&wq_entry->entry);
+	smp_store_release(&wq_entry->private, NULL);
+
+	wake_up_process(p);
+	put_task_struct(p);
+
+	return !reader; /* wake (readers until) 1 writer */
+}
+
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
+{
+	DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
+	bool wait;
+
+	spin_lock_irq(&sem->waiters.lock);
+	/*
+	 * Serialize against the wakeup in percpu_up_write(), if we fail
+	 * the trylock, the wakeup must see us on the list.
+	 */
+	wait = !__percpu_rwsem_trylock(sem, reader);
+	if (wait) {
+		wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
+		__add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
+	}
+	spin_unlock_irq(&sem->waiters.lock);
+
+	while (wait) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!smp_load_acquire(&wq_entry.private))
+			break;
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+}
+
 bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 {
 	if (__percpu_down_read_trylock(sem))
@@ -89,20 +169,10 @@ bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 	if (try)
 		return false;
 
-	/*
-	 * We either call schedule() in the wait, or we'll fall through
-	 * and reschedule on the preempt_enable() in percpu_down_read().
-	 */
-	preempt_enable_no_resched();
-
-	/*
-	 * Avoid lockdep for the down/up_read() we already have them.
-	 */
-	__down_read(&sem->rw_sem);
-	this_cpu_inc(*sem->read_count);
-	__up_read(&sem->rw_sem);
-
+	preempt_enable();
+	percpu_rwsem_wait(sem, /* .reader = */ true);
 	preempt_disable();
+
 	return true;
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
@@ -117,7 +187,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
 	 */
 	__this_cpu_dec(*sem->read_count);
 
-	/* Prod writer to recheck readers_active */
+	/* Prod writer to re-evaluate readers_active_check() */
 	rcuwait_wake_up(&sem->writer);
 }
 EXPORT_SYMBOL_GPL(__percpu_up_read);
@@ -137,6 +207,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
  * zero.  If this sum is zero, then it is stable due to the fact that if any
  * newly arriving readers increment a given counter, they will immediately
  * decrement that same counter.
+ *
+ * Assumes sem->block is set.
  */
 static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
@@ -160,23 +232,22 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
 
-	__down_write(&sem->rw_sem);
-
 	/*
-	 * Notify new readers to block; up until now, and thus throughout the
-	 * longish rcu_sync_enter() above, new readers could still come in.
+	 * Try set sem->block; this provides writer-writer exclusion.
+	 * Having sem->block set makes new readers block.
 	 */
-	WRITE_ONCE(sem->readers_block, 1);
+	if (!__percpu_down_write_trylock(sem))
+		percpu_rwsem_wait(sem, /* .reader = */ false);
 
-	smp_mb(); /* D matches A */
+	/* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
 
 	/*
-	 * If they don't see our writer of readers_block, then we are
-	 * guaranteed to see their sem->read_count increment, and therefore
-	 * will wait for them.
+	 * If they don't see our store of sem->block, then we are guaranteed to
+	 * see their sem->read_count increment, and therefore will wait for
+	 * them.
 	 */
 
-	/* Wait for all now active readers to complete. */
+	/* Wait for all active readers to complete. */
 	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
@@ -195,12 +266,12 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
 	 * Therefore we force it through the slow path which guarantees an
 	 * acquire and thereby guarantees the critical section's consistency.
 	 */
-	smp_store_release(&sem->readers_block, 0);
+	atomic_set_release(&sem->block, 0);
 
 	/*
-	 * Release the write lock, this will allow readers back in the game.
+	 * Prod any pending reader/writer to make progress.
 	 */
-	__up_write(&sem->rw_sem);
+	__wake_up(&sem->waiters, TASK_NORMAL, 1, sem);
 
 	/*
 	 * Once this completes (at least one RCU-sched grace period hence) the
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 30df8df..81c0d75 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -28,7 +28,6 @@
 #include <linux/rwsem.h>
 #include <linux/atomic.h>
 
-#include "rwsem.h"
 #include "lock_events.h"
 
 /*
@@ -1338,7 +1337,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 /*
  * lock for reading
  */
-inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct rw_semaphore *sem)
 {
 	if (!rwsem_read_trylock(sem)) {
 		rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
@@ -1383,7 +1382,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 /*
  * lock for writing
  */
-inline void __down_write(struct rw_semaphore *sem)
+static inline void __down_write(struct rw_semaphore *sem)
 {
 	long tmp = RWSEM_UNLOCKED_VALUE;
 
@@ -1426,7 +1425,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
 /*
  * unlock after reading
  */
-inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct rw_semaphore *sem)
 {
 	long tmp;
 
@@ -1446,7 +1445,7 @@ inline void __up_read(struct rw_semaphore *sem)
 /*
  * unlock after writing
  */
-inline void __up_write(struct rw_semaphore *sem)
+static inline void __up_write(struct rw_semaphore *sem)
 {
 	long tmp;
 
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index d0d33a5..e69de29 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __INTERNAL_RWSEM_H
-#define __INTERNAL_RWSEM_H
-#include <linux/rwsem.h>
-
-extern void __down_read(struct rw_semaphore *sem);
-extern void __up_read(struct rw_semaphore *sem);
-extern void __down_write(struct rw_semaphore *sem);
-extern void __up_write(struct rw_semaphore *sem);
-
-#endif /* __INTERNAL_RWSEM_H */

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [tip: locking/core] locking/percpu-rwsem: Fold __percpu_up_read()
  2020-01-31 15:07 ` [PATCH -v2 6/7] locking/percpu-rwsem: Fold __percpu_up_read() Peter Zijlstra
@ 2020-02-11 12:48   ` tip-bot2 for Davidlohr Bueso
  0 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Davidlohr Bueso @ 2020-02-11 12:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Davidlohr Bueso, Peter Zijlstra (Intel),
	Ingo Molnar, Will Deacon, Waiman Long, x86, LKML

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     ac8dec420970f5cbaf2f6eda39153a60ec5b257b
Gitweb:        https://git.kernel.org/tip/ac8dec420970f5cbaf2f6eda39153a60ec5b257b
Author:        Davidlohr Bueso <dave@stgolabs.net>
AuthorDate:    Mon, 18 Nov 2019 15:19:35 -08:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 11 Feb 2020 13:10:58 +01:00

locking/percpu-rwsem: Fold __percpu_up_read()

Now that __percpu_up_read() is only ever used from percpu_up_read()
merge them, it's a small function.

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lkml.kernel.org/r/20200131151540.212415454@infradead.org
---
 include/linux/percpu-rwsem.h  | 19 +++++++++++++++----
 kernel/exit.c                 |  1 +
 kernel/locking/percpu-rwsem.c | 15 ---------------
 3 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index f5ecf6a..5e033fe 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -43,7 +43,6 @@ is_static struct percpu_rw_semaphore name = {				\
 	__DEFINE_PERCPU_RWSEM(name, static)
 
 extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
-extern void __percpu_up_read(struct percpu_rw_semaphore *);
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 {
@@ -103,10 +102,22 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 	/*
 	 * Same as in percpu_down_read().
 	 */
-	if (likely(rcu_sync_is_idle(&sem->rss)))
+	if (likely(rcu_sync_is_idle(&sem->rss))) {
 		__this_cpu_dec(*sem->read_count);
-	else
-		__percpu_up_read(sem); /* Unconditional memory barrier */
+	} else {
+		/*
+		 * slowpath; reader will only ever wake a single blocked
+		 * writer.
+		 */
+		smp_mb(); /* B matches C */
+		/*
+		 * In other words, if they see our decrement (presumably to
+		 * aggregate zero, as that is the only time it matters) they
+		 * will also see our critical section.
+		 */
+		__this_cpu_dec(*sem->read_count);
+		rcuwait_wake_up(&sem->writer);
+	}
 	preempt_enable();
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 2833ffb..f64a8f9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -258,6 +258,7 @@ void rcuwait_wake_up(struct rcuwait *w)
 		wake_up_process(task);
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(rcuwait_wake_up);
 
 /*
  * Determine if a process group is "orphaned", according to the POSIX
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index a136677..8048a9a 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -177,21 +177,6 @@ bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
 
-void __percpu_up_read(struct percpu_rw_semaphore *sem)
-{
-	smp_mb(); /* B matches C */
-	/*
-	 * In other words, if they see our decrement (presumably to aggregate
-	 * zero, as that is the only time it matters) they will also see our
-	 * critical section.
-	 */
-	__this_cpu_dec(*sem->read_count);
-
-	/* Prod writer to re-evaluate readers_active_check() */
-	rcuwait_wake_up(&sem->writer);
-}
-EXPORT_SYMBOL_GPL(__percpu_up_read);
-
 #define per_cpu_sum(var)						\
 ({									\
 	typeof(var) __sum = 0;						\

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [tip: locking/core] locking/rwsem: Remove RWSEM_OWNER_UNKNOWN
  2020-02-04  9:22           ` [PATCH] locking/rwsem: Remove RWSEM_OWNER_UNKNOWN Peter Zijlstra
@ 2020-02-11 12:48             ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-02-11 12:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Ingo Molnar, Davidlohr Bueso, Will Deacon, Waiman Long,
	Juri Lelli, x86, LKML

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     bcba67cd806800fa8e973ac49dbc7d2d8fb3e55e
Gitweb:        https://git.kernel.org/tip/bcba67cd806800fa8e973ac49dbc7d2d8fb3e55e
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Tue, 04 Feb 2020 09:34:37 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 11 Feb 2020 13:10:57 +01:00

locking/rwsem: Remove RWSEM_OWNER_UNKNOWN

Remove the now unused RWSEM_OWNER_UNKNOWN hack. This hack breaks
PREEMPT_RT and getting rid of it was the entire motivation for
re-writing the percpu rwsem.

The biggest problem is that it is fundamentally incompatible with any
form of Priority Inheritance, any exclusively held lock must have a
distinct owner.

Requested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200204092228.GP14946@hirez.programming.kicks-ass.net
---
 include/linux/rwsem.h  | 6 ------
 kernel/locking/rwsem.c | 2 --
 2 files changed, 8 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 00d6054..8a418d9 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -53,12 +53,6 @@ struct rw_semaphore {
 #endif
 };
 
-/*
- * Setting all bits of the owner field except bit 0 will indicate
- * that the rwsem is writer-owned with an unknown owner.
- */
-#define RWSEM_OWNER_UNKNOWN	(-2L)
-
 /* In all implementations count != 0 means locked */
 static inline int rwsem_is_locked(struct rw_semaphore *sem)
 {
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 81c0d75..e6f437b 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -659,8 +659,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
 	unsigned long flags;
 	bool ret = true;
 
-	BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
-
 	if (need_resched()) {
 		lockevent_inc(rwsem_opt_fail);
 		return false;

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [tip: locking/core] locking/percpu-rwsem: Extract __percpu_down_read_trylock()
  2020-01-31 15:07 ` [PATCH -v2 4/7] locking/percpu-rwsem: Extract __percpu_down_read_trylock() Peter Zijlstra
@ 2020-02-11 12:48   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-02-11 12:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Ingo Molnar, Davidlohr Bueso, Will Deacon, Waiman Long,
	Juri Lelli, x86, LKML

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     75ff64572e497578e238fefbdff221c96f29067a
Gitweb:        https://git.kernel.org/tip/75ff64572e497578e238fefbdff221c96f29067a
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Thu, 31 Oct 2019 12:34:23 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 11 Feb 2020 13:10:55 +01:00

locking/percpu-rwsem: Extract __percpu_down_read_trylock()

In preparation for removing the embedded rwsem and building a custom
lock, extract the read-trylock primitive.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200131151540.098485539@infradead.org
---
 kernel/locking/percpu-rwsem.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index becf925..b155e8e 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 {
 	__this_cpu_inc(*sem->read_count);
 
@@ -73,11 +73,18 @@ bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 	if (likely(!smp_load_acquire(&sem->readers_block)))
 		return true;
 
-	/*
-	 * Per the above comment; we still have preemption disabled and
-	 * will thus decrement on the same CPU as we incremented.
-	 */
-	__percpu_up_read(sem);
+	__this_cpu_dec(*sem->read_count);
+
+	/* Prod writer to re-evaluate readers_active_check() */
+	rcuwait_wake_up(&sem->writer);
+
+	return false;
+}
+
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+{
+	if (__percpu_down_read_trylock(sem))
+		return true;
 
 	if (try)
 		return false;

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [tip: locking/core] locking/percpu-rwsem: Convert to bool
  2020-01-31 15:07 ` [PATCH -v2 2/7] locking/percpu-rwsem: Convert to bool Peter Zijlstra
@ 2020-02-11 12:48   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-02-11 12:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Ingo Molnar, Davidlohr Bueso, Will Deacon, Waiman Long,
	Juri Lelli, x86, LKML

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     206c98ffbeda588dbbd9d272505c42acbc364a30
Gitweb:        https://git.kernel.org/tip/206c98ffbeda588dbbd9d272505c42acbc364a30
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 30 Oct 2019 20:12:37 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 11 Feb 2020 13:10:54 +01:00

locking/percpu-rwsem: Convert to bool

Use bool where possible.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200131151539.984626569@infradead.org
---
 include/linux/percpu-rwsem.h  | 6 +++---
 kernel/locking/percpu-rwsem.c | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index f2c36fb..4ceaa19 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -41,7 +41,7 @@ is_static struct percpu_rw_semaphore name = {				\
 #define DEFINE_STATIC_PERCPU_RWSEM(name)	\
 	__DEFINE_PERCPU_RWSEM(name, static)
 
-extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
 extern void __percpu_up_read(struct percpu_rw_semaphore *);
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
@@ -69,9 +69,9 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 	preempt_enable();
 }
 
-static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 {
-	int ret = 1;
+	bool ret = true;
 
 	preempt_disable();
 	/*
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index aa2b118..969389d 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 {
 	/*
 	 * Due to having preemption disabled the decrement happens on
@@ -69,7 +69,7 @@ int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
 	 * release in percpu_up_write().
 	 */
 	if (likely(!smp_load_acquire(&sem->readers_block)))
-		return 1;
+		return true;
 
 	/*
 	 * Per the above comment; we still have preemption disabled and
@@ -78,7 +78,7 @@ int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
 	__percpu_up_read(sem);
 
 	if (try)
-		return 0;
+		return false;
 
 	/*
 	 * We either call schedule() in the wait, or we'll fall through
@@ -94,7 +94,7 @@ int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
 	__up_read(&sem->rw_sem);
 
 	preempt_disable();
-	return 1;
+	return true;
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
 

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [tip: locking/core] locking/percpu-rwsem: Move __this_cpu_inc() into the slowpath
  2020-01-31 15:07 ` [PATCH -v2 3/7] locking/percpu-rwsem: Move __this_cpu_inc() into the slowpath Peter Zijlstra
@ 2020-02-11 12:48   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-02-11 12:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Ingo Molnar, Davidlohr Bueso, Will Deacon, Waiman Long,
	Juri Lelli, x86, LKML

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     71365d40232110f7b029befc9033ea311d680611
Gitweb:        https://git.kernel.org/tip/71365d40232110f7b029befc9033ea311d680611
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 30 Oct 2019 20:17:51 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 11 Feb 2020 13:10:54 +01:00

locking/percpu-rwsem: Move __this_cpu_inc() into the slowpath

As preparation to rework __percpu_down_read() move the
__this_cpu_inc() into it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200131151540.041600199@infradead.org
---
 include/linux/percpu-rwsem.h  | 10 ++++++----
 kernel/locking/percpu-rwsem.c |  2 ++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 4ceaa19..bb5b71c 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -59,8 +59,9 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 	 * and that once the synchronize_rcu() is done, the writer will see
 	 * anything we did within this RCU-sched read-size critical section.
 	 */
-	__this_cpu_inc(*sem->read_count);
-	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+	if (likely(rcu_sync_is_idle(&sem->rss)))
+		__this_cpu_inc(*sem->read_count);
+	else
 		__percpu_down_read(sem, false); /* Unconditional memory barrier */
 	/*
 	 * The preempt_enable() prevents the compiler from
@@ -77,8 +78,9 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 	/*
 	 * Same as in percpu_down_read().
 	 */
-	__this_cpu_inc(*sem->read_count);
-	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+	if (likely(rcu_sync_is_idle(&sem->rss)))
+		__this_cpu_inc(*sem->read_count);
+	else
 		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
 	preempt_enable();
 	/*
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 969389d..becf925 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -47,6 +47,8 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
 bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 {
+	__this_cpu_inc(*sem->read_count);
+
 	/*
 	 * Due to having preemption disabled the decrement happens on
 	 * the same CPU as the increment, avoiding the

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [tip: locking/core] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map
  2020-01-31 15:07 ` [PATCH -v2 1/7] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map Peter Zijlstra
@ 2020-02-11 12:48   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-02-11 12:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel),
	Ingo Molnar, Davidlohr Bueso, Will Deacon, Waiman Long,
	Juri Lelli, x86, LKML

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     1751060e2527462714359573a39dca10451ffbf8
Gitweb:        https://git.kernel.org/tip/1751060e2527462714359573a39dca10451ffbf8
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Wed, 30 Oct 2019 20:01:26 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 11 Feb 2020 13:10:53 +01:00

locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map

As preparation for replacing the embedded rwsem, give percpu-rwsem its
own lockdep_map.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200131151539.927625541@infradead.org
---
 include/linux/percpu-rwsem.h  | 29 +++++++++++++++++++----------
 kernel/cpu.c                  |  4 ++--
 kernel/locking/percpu-rwsem.c | 16 ++++++++++++----
 kernel/locking/rwsem.c        |  4 ++--
 kernel/locking/rwsem.h        |  2 ++
 5 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index ad2ca2a..f2c36fb 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -15,8 +15,17 @@ struct percpu_rw_semaphore {
 	struct rw_semaphore	rw_sem; /* slowpath */
 	struct rcuwait          writer; /* blocked writer */
 	int			readers_block;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
 };
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)	.dep_map = { .name = #lockname },
+#else
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
 #define __DEFINE_PERCPU_RWSEM(name, is_static)				\
 static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);		\
 is_static struct percpu_rw_semaphore name = {				\
@@ -24,7 +33,9 @@ is_static struct percpu_rw_semaphore name = {				\
 	.read_count = &__percpu_rwsem_rc_##name,			\
 	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
 	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
+	__PERCPU_RWSEM_DEP_MAP_INIT(name)				\
 }
+
 #define DEFINE_PERCPU_RWSEM(name)		\
 	__DEFINE_PERCPU_RWSEM(name, /* not static */)
 #define DEFINE_STATIC_PERCPU_RWSEM(name)	\
@@ -37,7 +48,7 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 {
 	might_sleep();
 
-	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
 
 	preempt_disable();
 	/*
@@ -76,13 +87,15 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 	 */
 
 	if (ret)
-		rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
 
 	return ret;
 }
 
 static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 {
+	rwsem_release(&sem->dep_map, _RET_IP_);
+
 	preempt_disable();
 	/*
 	 * Same as in percpu_down_read().
@@ -92,8 +105,6 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 	else
 		__percpu_up_read(sem); /* Unconditional memory barrier */
 	preempt_enable();
-
-	rwsem_release(&sem->rw_sem.dep_map, _RET_IP_);
 }
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
@@ -110,15 +121,13 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
 })
 
-#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
-
-#define percpu_rwsem_assert_held(sem)				\
-	lockdep_assert_held(&(sem)->rw_sem)
+#define percpu_rwsem_is_held(sem)	lockdep_is_held(sem)
+#define percpu_rwsem_assert_held(sem)	lockdep_assert_held(sem)
 
 static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
-	lock_release(&sem->rw_sem.dep_map, ip);
+	lock_release(&sem->dep_map, ip);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	if (!read)
 		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
@@ -128,7 +137,7 @@ static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
 static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
-	lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
+	lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	if (!read)
 		atomic_long_set(&sem->rw_sem.owner, (long)current);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9c706af..221bf6a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void)
 
 static void lockdep_acquire_cpus_lock(void)
 {
-	rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
+	rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
 }
 
 static void lockdep_release_cpus_lock(void)
 {
-	rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_);
+	rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
 }
 
 /*
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 364d38a..aa2b118 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -11,7 +11,7 @@
 #include "rwsem.h"
 
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
-			const char *name, struct lock_class_key *rwsem_key)
+			const char *name, struct lock_class_key *key)
 {
 	sem->read_count = alloc_percpu(int);
 	if (unlikely(!sem->read_count))
@@ -19,9 +19,13 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 
 	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss);
-	__init_rwsem(&sem->rw_sem, name, rwsem_key);
+	init_rwsem(&sem->rw_sem);
 	rcuwait_init(&sem->writer);
 	sem->readers_block = 0;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+	lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -142,10 +146,12 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
 
 void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
+	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
 
-	down_write(&sem->rw_sem);
+	__down_write(&sem->rw_sem);
 
 	/*
 	 * Notify new readers to block; up until now, and thus throughout the
@@ -168,6 +174,8 @@ EXPORT_SYMBOL_GPL(percpu_down_write);
 
 void percpu_up_write(struct percpu_rw_semaphore *sem)
 {
+	rwsem_release(&sem->dep_map, _RET_IP_);
+
 	/*
 	 * Signal the writer is done, no fast path yet.
 	 *
@@ -183,7 +191,7 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
 	/*
 	 * Release the write lock, this will allow readers back in the game.
 	 */
-	up_write(&sem->rw_sem);
+	__up_write(&sem->rw_sem);
 
 	/*
 	 * Once this completes (at least one RCU-sched grace period hence) the
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 0d9b6be..30df8df 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1383,7 +1383,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 /*
  * lock for writing
  */
-static inline void __down_write(struct rw_semaphore *sem)
+inline void __down_write(struct rw_semaphore *sem)
 {
 	long tmp = RWSEM_UNLOCKED_VALUE;
 
@@ -1446,7 +1446,7 @@ inline void __up_read(struct rw_semaphore *sem)
 /*
  * unlock after writing
  */
-static inline void __up_write(struct rw_semaphore *sem)
+inline void __up_write(struct rw_semaphore *sem)
 {
 	long tmp;
 
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 2534ce4..d0d33a5 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -6,5 +6,7 @@
 
 extern void __down_read(struct rw_semaphore *sem);
 extern void __up_read(struct rw_semaphore *sem);
+extern void __down_write(struct rw_semaphore *sem);
+extern void __up_write(struct rw_semaphore *sem);
 
 #endif /* __INTERNAL_RWSEM_H */

^ permalink raw reply related	[flat|nested] 29+ messages in thread
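
The user-visible effect is that lockdep now classifies the percpu_rw_semaphore
under its own dep_map rather than under the embedded rw_sem's. A minimal,
hypothetical usage sketch follows; example_sem, example_reader() and
example_writer() are invented for illustration, while the percpu-rwsem calls
are the interfaces touched by the patch above:

/* Hypothetical illustration of the annotation change, not part of the patch. */
DEFINE_STATIC_PERCPU_RWSEM(example_sem);

static void example_reader(void)
{
	percpu_down_read(&example_sem);		/* rwsem_acquire_read() on example_sem's own dep_map */
	percpu_rwsem_assert_held(&example_sem);	/* now expands to lockdep_assert_held(&example_sem) */
	/* read-side critical section */
	percpu_up_read(&example_sem);
}

static void example_writer(void)
{
	percpu_down_write(&example_sem);	/* rwsem_acquire() on the same dep_map */
	/* write-side critical section */
	percpu_up_write(&example_sem);
}

Keeping the annotation on the outer structure is what later lets the embedded
rwsem be replaced (patch 5/7) without losing lockdep coverage.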

end of thread, other threads:[~2020-02-11 12:49 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-01-31 15:07 [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Peter Zijlstra
2020-01-31 15:07 ` [PATCH -v2 1/7] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map Peter Zijlstra
2020-02-11 12:48   ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
2020-01-31 15:07 ` [PATCH -v2 2/7] locking/percpu-rwsem: Convert to bool Peter Zijlstra
2020-02-11 12:48   ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
2020-01-31 15:07 ` [PATCH -v2 3/7] locking/percpu-rwsem: Move __this_cpu_inc() into the slowpath Peter Zijlstra
2020-02-11 12:48   ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
2020-01-31 15:07 ` [PATCH -v2 4/7] locking/percpu-rwsem: Extract __percpu_down_read_trylock() Peter Zijlstra
2020-02-11 12:48   ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
2020-01-31 15:07 ` [PATCH -v2 5/7] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
2020-02-03 11:45   ` Kirill Tkhai
2020-02-03 13:44     ` Peter Zijlstra
2020-02-03 14:33       ` Kirill Tkhai
2020-02-03 14:20   ` Christoph Hellwig
2020-02-03 15:09     ` Peter Zijlstra
2020-02-03 17:48       ` Christoph Hellwig
2020-02-04  8:50         ` Peter Zijlstra
2020-02-04  9:22           ` [PATCH] locking/rwsem: Remove RWSEM_OWNER_UNKNOWN Peter Zijlstra
2020-02-11 12:48             ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
2020-02-04  9:24   ` [PATCH -v2-mkII 5/7] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
2020-02-11 12:48     ` [tip: locking/core] " tip-bot2 for Peter Zijlstra
2020-01-31 15:07 ` [PATCH -v2 6/7] locking/percpu-rwsem: Fold __percpu_up_read() Peter Zijlstra
2020-02-11 12:48   ` [tip: locking/core] " tip-bot2 for Davidlohr Bueso
2020-01-31 15:07 ` [PATCH -v2 7/7] locking/percpu-rwsem: Add might_sleep() for writer locking Peter Zijlstra
2020-01-31 19:23 ` [PATCH -v2 0/7] locking: Percpu-rwsem rewrite Waiman Long
2020-02-01 16:23 ` Davidlohr Bueso
2020-02-03 10:51 ` Sebastian Andrzej Siewior
2020-02-03 12:25 ` Juri Lelli
2020-02-03 18:08 ` Will Deacon
