* [PATCH 1/5] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map
2019-11-13 10:21 [PATCH 0/5] locking: Percpu-rwsem rewrite Peter Zijlstra
@ 2019-11-13 10:21 ` Peter Zijlstra
2019-11-15 20:39 ` Davidlohr Bueso
2019-11-13 10:21 ` [PATCH 2/5] locking/percpu-rwsem: Convert to bool Peter Zijlstra
` (4 subsequent siblings)
5 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2019-11-13 10:21 UTC (permalink / raw)
To: peterz, mingo, will
Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
longman, dave, jack
As preparation for replacing the embedded rwsem, give percpu-rwsem its
own lockdep_map.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/percpu-rwsem.h | 29 +++++++++++++++++++----------
kernel/cpu.c | 4 ++--
kernel/locking/percpu-rwsem.c | 14 +++++++++++---
kernel/locking/rwsem.c | 4 ++--
kernel/locking/rwsem.h | 2 ++
5 files changed, 36 insertions(+), 17 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -15,8 +15,17 @@ struct percpu_rw_semaphore {
struct rw_semaphore rw_sem; /* slowpath */
struct rcuwait writer; /* blocked writer */
int readers_block;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
};
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname },
+#else
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
#define __DEFINE_PERCPU_RWSEM(name, is_static) \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \
is_static struct percpu_rw_semaphore name = { \
@@ -24,7 +33,9 @@ is_static struct percpu_rw_semaphore nam
.read_count = &__percpu_rwsem_rc_##name, \
.rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
.writer = __RCUWAIT_INITIALIZER(name.writer), \
+ __PERCPU_RWSEM_DEP_MAP_INIT(name) \
}
+
#define DEFINE_PERCPU_RWSEM(name) \
__DEFINE_PERCPU_RWSEM(name, /* not static */)
#define DEFINE_STATIC_PERCPU_RWSEM(name) \
@@ -37,7 +48,7 @@ static inline void percpu_down_read(stru
{
might_sleep();
- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
preempt_disable();
/*
@@ -76,13 +87,15 @@ static inline int percpu_down_read_trylo
*/
if (ret)
- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+ rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
return ret;
}
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
+ rwsem_release(&sem->dep_map, _RET_IP_);
+
preempt_disable();
/*
* Same as in percpu_down_read().
@@ -92,8 +105,6 @@ static inline void percpu_up_read(struct
else
__percpu_up_read(sem); /* Unconditional memory barrier */
preempt_enable();
-
- rwsem_release(&sem->rw_sem.dep_map, _RET_IP_);
}
extern void percpu_down_write(struct percpu_rw_semaphore *);
@@ -110,15 +121,13 @@ extern void percpu_free_rwsem(struct per
__percpu_init_rwsem(sem, #sem, &rwsem_key); \
})
-#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
-
-#define percpu_rwsem_assert_held(sem) \
- lockdep_assert_held(&(sem)->rw_sem)
+#define percpu_rwsem_is_held(sem) lockdep_is_held(sem)
+#define percpu_rwsem_assert_held(sem) lockdep_assert_held(sem)
static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
bool read, unsigned long ip)
{
- lock_release(&sem->rw_sem.dep_map, ip);
+ lock_release(&sem->dep_map, ip);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
if (!read)
atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
@@ -128,7 +137,7 @@ static inline void percpu_rwsem_release(
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
bool read, unsigned long ip)
{
- lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
+ lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
if (!read)
atomic_long_set(&sem->rw_sem.owner, (long)current);
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void)
static void lockdep_acquire_cpus_lock(void)
{
- rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
+ rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
}
static void lockdep_release_cpus_lock(void)
{
- rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_);
+ rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}
/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -11,7 +11,7 @@
#include "rwsem.h"
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
- const char *name, struct lock_class_key *rwsem_key)
+ const char *name, struct lock_class_key *key)
{
sem->read_count = alloc_percpu(int);
if (unlikely(!sem->read_count))
@@ -22,6 +22,10 @@ int __percpu_init_rwsem(struct percpu_rw
__init_rwsem(&sem->rw_sem, name, rwsem_key);
rcuwait_init(&sem->writer);
sem->readers_block = 0;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -142,10 +146,12 @@ static bool readers_active_check(struct
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
- down_write(&sem->rw_sem);
+ __down_write(&sem->rw_sem);
/*
* Notify new readers to block; up until now, and thus throughout the
@@ -168,6 +174,8 @@ EXPORT_SYMBOL_GPL(percpu_down_write);
void percpu_up_write(struct percpu_rw_semaphore *sem)
{
+ rwsem_release(&sem->dep_map, _RET_IP_);
+
/*
* Signal the writer is done, no fast path yet.
*
@@ -183,7 +191,7 @@ void percpu_up_write(struct percpu_rw_se
/*
* Release the write lock, this will allow readers back in the game.
*/
- up_write(&sem->rw_sem);
+ __up_write(&sem->rw_sem);
/*
* Once this completes (at least one RCU-sched grace period hence) the
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1383,7 +1383,7 @@ static inline int __down_read_trylock(st
/*
* lock for writing
*/
-static inline void __down_write(struct rw_semaphore *sem)
+inline void __down_write(struct rw_semaphore *sem)
{
long tmp = RWSEM_UNLOCKED_VALUE;
@@ -1446,7 +1446,7 @@ inline void __up_read(struct rw_semaphor
/*
* unlock after writing
*/
-static inline void __up_write(struct rw_semaphore *sem)
+inline void __up_write(struct rw_semaphore *sem)
{
long tmp;
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -6,5 +6,7 @@
extern void __down_read(struct rw_semaphore *sem);
extern void __up_read(struct rw_semaphore *sem);
+extern void __down_write(struct rw_semaphore *sem);
+extern void __up_write(struct rw_semaphore *sem);
#endif /* __INTERNAL_RWSEM_H */
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/5] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map
2019-11-13 10:21 ` [PATCH 1/5] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map Peter Zijlstra
@ 2019-11-15 20:39 ` Davidlohr Bueso
2020-01-08 1:33 ` [PATCH] locking/percpu-rwsem: Add might_sleep() for writer locking Davidlohr Bueso
0 siblings, 1 reply; 23+ messages in thread
From: Davidlohr Bueso @ 2019-11-15 20:39 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
williams, bristot, longman, jack
On Wed, 13 Nov 2019, Peter Zijlstra wrote:
>@@ -37,7 +48,7 @@ static inline void percpu_down_read(stru
> {
> might_sleep();
>
>- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
>+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
Unrelated to this patch, but we also need a might_sleep() annotation
in percpu_down_write().
Thanks,
Davidlohr
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH] locking/percpu-rwsem: Add might_sleep() for writer locking
2019-11-15 20:39 ` Davidlohr Bueso
@ 2020-01-08 1:33 ` Davidlohr Bueso
2020-01-08 1:33 ` Davidlohr Bueso
2020-02-11 12:48 ` [tip: locking/core] " tip-bot2 for Davidlohr Bueso
0 siblings, 2 replies; 23+ messages in thread
From: Davidlohr Bueso @ 2020-01-08 1:33 UTC (permalink / raw)
To: dave
Cc: bigeasy, bristot, jack, juri.lelli, linux-kernel, longman, mingo,
oleg, peterz, tglx, will, williams, Davidlohr Bueso
We are missing this annotation in percpu_down_write(). Correct
this.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
Applies against your queue tree.
kernel/locking/percpu-rwsem.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 4cbeeada8d09..08db2abfa00d 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -212,6 +212,7 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
/* Notify readers to take the slow path. */
--
2.16.4
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH] locking/percpu-rwsem: Add might_sleep() for writer locking
2020-01-08 1:33 ` [PATCH] locking/percpu-rwsem: Add might_sleep() for writer locking Davidlohr Bueso
@ 2020-01-08 1:33 ` Davidlohr Bueso
2020-02-11 12:48 ` [tip: locking/core] " tip-bot2 for Davidlohr Bueso
1 sibling, 0 replies; 23+ messages in thread
From: Davidlohr Bueso @ 2020-01-08 1:33 UTC (permalink / raw)
To: dave
Cc: bigeasy, bristot, jack, juri.lelli, linux-kernel, longman, mingo,
oleg, peterz, tglx, will, williams, Davidlohr Bueso
We are missing this annotation in percpu_down_write(). Correct
this.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
Applies against your queue tree.
kernel/locking/percpu-rwsem.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 4cbeeada8d09..08db2abfa00d 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -212,6 +212,7 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
/* Notify readers to take the slow path. */
--
2.16.4
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [tip: locking/core] locking/percpu-rwsem: Add might_sleep() for writer locking
2020-01-08 1:33 ` [PATCH] locking/percpu-rwsem: Add might_sleep() for writer locking Davidlohr Bueso
2020-01-08 1:33 ` Davidlohr Bueso
@ 2020-02-11 12:48 ` tip-bot2 for Davidlohr Bueso
1 sibling, 0 replies; 23+ messages in thread
From: tip-bot2 for Davidlohr Bueso @ 2020-02-11 12:48 UTC (permalink / raw)
To: linux-tip-commits
Cc: Davidlohr Bueso, Peter Zijlstra (Intel),
Ingo Molnar, Will Deacon, Waiman Long, x86, LKML
The following commit has been merged into the locking/core branch of tip:
Commit-ID: 41f0e29190ac9e38099a37abd1a8a4cb1dc21233
Gitweb: https://git.kernel.org/tip/41f0e29190ac9e38099a37abd1a8a4cb1dc21233
Author: Davidlohr Bueso <dave@stgolabs.net>
AuthorDate: Tue, 07 Jan 2020 17:33:04 -08:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 11 Feb 2020 13:11:02 +01:00
locking/percpu-rwsem: Add might_sleep() for writer locking
We are missing this annotation in percpu_down_write(). Correct
this.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lkml.kernel.org/r/20200108013305.7732-1-dave@stgolabs.net
---
kernel/locking/percpu-rwsem.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 8048a9a..183a3aa 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -212,6 +212,7 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
/* Notify readers to take the slow path. */
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 2/5] locking/percpu-rwsem: Convert to bool
2019-11-13 10:21 [PATCH 0/5] locking: Percpu-rwsem rewrite Peter Zijlstra
2019-11-13 10:21 ` [PATCH 1/5] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map Peter Zijlstra
@ 2019-11-13 10:21 ` Peter Zijlstra
2019-11-13 10:21 ` [PATCH 3/5] locking/percpu-rwsem: Move __this_cpu_inc() into the slowpath Peter Zijlstra
` (3 subsequent siblings)
5 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2019-11-13 10:21 UTC (permalink / raw)
To: peterz, mingo, will
Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
longman, dave, jack
Use bool where possible.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/percpu-rwsem.h | 6 +++---
kernel/locking/percpu-rwsem.c | 8 ++++----
2 files changed, 7 insertions(+), 7 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -41,7 +41,7 @@ is_static struct percpu_rw_semaphore nam
#define DEFINE_STATIC_PERCPU_RWSEM(name) \
__DEFINE_PERCPU_RWSEM(name, static)
-extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
extern void __percpu_up_read(struct percpu_rw_semaphore *);
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
@@ -69,9 +69,9 @@ static inline void percpu_down_read(stru
preempt_enable();
}
-static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
- int ret = 1;
+ bool ret = true;
preempt_disable();
/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ void percpu_free_rwsem(struct percpu_rw_
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
/*
* Due to having preemption disabled the decrement happens on
@@ -69,7 +69,7 @@ int __percpu_down_read(struct percpu_rw_
* release in percpu_up_write().
*/
if (likely(!smp_load_acquire(&sem->readers_block)))
- return 1;
+ return true;
/*
* Per the above comment; we still have preemption disabled and
@@ -78,7 +78,7 @@ int __percpu_down_read(struct percpu_rw_
__percpu_up_read(sem);
if (try)
- return 0;
+ return false;
/*
* We either call schedule() in the wait, or we'll fall through
@@ -94,7 +94,7 @@ int __percpu_down_read(struct percpu_rw_
__up_read(&sem->rw_sem);
preempt_disable();
- return 1;
+ return true;
}
EXPORT_SYMBOL_GPL(__percpu_down_read);
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 3/5] locking/percpu-rwsem: Move __this_cpu_inc() into the slowpath
2019-11-13 10:21 [PATCH 0/5] locking: Percpu-rwsem rewrite Peter Zijlstra
2019-11-13 10:21 ` [PATCH 1/5] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its own lockdep_map Peter Zijlstra
2019-11-13 10:21 ` [PATCH 2/5] locking/percpu-rwsem: Convert to bool Peter Zijlstra
@ 2019-11-13 10:21 ` Peter Zijlstra
2019-11-13 10:21 ` [PATCH 4/5] locking/percpu-rwsem: Extract __percpu_down_read_trylock() Peter Zijlstra
` (2 subsequent siblings)
5 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2019-11-13 10:21 UTC (permalink / raw)
To: peterz, mingo, will
Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
longman, dave, jack
As preparation to rework __percpu_down_read() move the
__this_cpu_inc() into it.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/percpu-rwsem.h | 10 ++++++----
kernel/locking/percpu-rwsem.c | 2 ++
2 files changed, 8 insertions(+), 4 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -59,8 +59,9 @@ static inline void percpu_down_read(stru
* and that once the synchronize_rcu() is done, the writer will see
* anything we did within this RCU-sched read-size critical section.
*/
- __this_cpu_inc(*sem->read_count);
- if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+ if (likely(rcu_sync_is_idle(&sem->rss)))
+ __this_cpu_inc(*sem->read_count);
+ else
__percpu_down_read(sem, false); /* Unconditional memory barrier */
/*
* The preempt_enable() prevents the compiler from
@@ -77,8 +78,9 @@ static inline bool percpu_down_read_tryl
/*
* Same as in percpu_down_read().
*/
- __this_cpu_inc(*sem->read_count);
- if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+ if (likely(rcu_sync_is_idle(&sem->rss)))
+ __this_cpu_inc(*sem->read_count);
+ else
ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
preempt_enable();
/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -47,6 +47,8 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);
bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
+ __this_cpu_inc(*sem->read_count);
+
/*
* Due to having preemption disabled the decrement happens on
* the same CPU as the increment, avoiding the
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 4/5] locking/percpu-rwsem: Extract __percpu_down_read_trylock()
2019-11-13 10:21 [PATCH 0/5] locking: Percpu-rwsem rewrite Peter Zijlstra
` (2 preceding siblings ...)
2019-11-13 10:21 ` [PATCH 3/5] locking/percpu-rwsem: Move __this_cpu_inc() into the slowpath Peter Zijlstra
@ 2019-11-13 10:21 ` Peter Zijlstra
2019-11-18 16:28 ` Oleg Nesterov
2019-11-13 10:21 ` [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
2019-11-15 17:14 ` [PATCH 0/5] locking: Percpu-rwsem rewrite Juri Lelli
5 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2019-11-13 10:21 UTC (permalink / raw)
To: peterz, mingo, will
Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
longman, dave, jack
In preparation for removing the embedded rwsem and building a custom
lock, extract the read-trylock primitive.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/locking/percpu-rwsem.c | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ void percpu_free_rwsem(struct percpu_rw_
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
__this_cpu_inc(*sem->read_count);
@@ -70,14 +70,21 @@ bool __percpu_down_read(struct percpu_rw
* If !readers_block the critical section starts here, matched by the
* release in percpu_up_write().
*/
- if (likely(!smp_load_acquire(&sem->readers_block)))
+ if (likely(!atomic_read_acquire(&sem->readers_block)))
return true;
- /*
- * Per the above comment; we still have preemption disabled and
- * will thus decrement on the same CPU as we incremented.
- */
- __percpu_up_read(sem);
+ __this_cpu_dec(*sem->read_count);
+
+ /* Prod writer to re-evaluate readers_active_check() */
+ rcuwait_wake_up(&sem->writer);
+
+ return false;
+}
+
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+{
+ if (__percpu_down_read_trylock(sem))
+ return true;
if (try)
return false;
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 4/5] locking/percpu-rwsem: Extract __percpu_down_read_trylock()
2019-11-13 10:21 ` [PATCH 4/5] locking/percpu-rwsem: Extract __percpu_down_read_trylock() Peter Zijlstra
@ 2019-11-18 16:28 ` Oleg Nesterov
0 siblings, 0 replies; 23+ messages in thread
From: Oleg Nesterov @ 2019-11-18 16:28 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, will, tglx, linux-kernel, bigeasy, juri.lelli, williams,
bristot, longman, dave, jack
Hi Peter, sorry for delay.
I'll re-read this series tomorrow, but everything looks correct at first
glance...
Except one very minor problem in this patch, see below.
On 11/13, Peter Zijlstra wrote:
>
> -bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
> +static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
> {
> __this_cpu_inc(*sem->read_count);
>
> @@ -70,14 +70,21 @@ bool __percpu_down_read(struct percpu_rw
> * If !readers_block the critical section starts here, matched by the
> * release in percpu_up_write().
> */
> - if (likely(!smp_load_acquire(&sem->readers_block)))
> + if (likely(!atomic_read_acquire(&sem->readers_block)))
I don't think this can be compiled ;) ->readers_block is "int" until the next
patch makes it atomic_t and renames to ->block.
And. I think __percpu_down_read_trylock() should do
if (atomic_read(&sem->block))
return false;
at the start, before __this_cpu_inc(read_count).
Suppose that the pending writer sleeps in rcuwait_wait_event(readers_active_check).
If the new reader comes, it is better to not wake up that writer.
Oleg.
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-13 10:21 [PATCH 0/5] locking: Percpu-rwsem rewrite Peter Zijlstra
` (3 preceding siblings ...)
2019-11-13 10:21 ` [PATCH 4/5] locking/percpu-rwsem: Extract __percpu_down_read_trylock() Peter Zijlstra
@ 2019-11-13 10:21 ` Peter Zijlstra
2019-11-18 19:53 ` Davidlohr Bueso
` (2 more replies)
2019-11-15 17:14 ` [PATCH 0/5] locking: Percpu-rwsem rewrite Juri Lelli
5 siblings, 3 replies; 23+ messages in thread
From: Peter Zijlstra @ 2019-11-13 10:21 UTC (permalink / raw)
To: peterz, mingo, will
Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
longman, dave, jack
The filesystem freezer uses percpu-rwsem in a way that is effectively
write_non_owner() and achieves this with a few horrible hacks that
rely on the rwsem (!percpu) implementation.
When PREEMPT_RT replaces the rwsem implementation with a PI aware
variant this comes apart.
Remove the embedded rwsem and implement it using a waitqueue and an
atomic_t.
- make readers_block an atomic, and use it, with the waitqueue
for a blocking test-and-set write-side.
- have the read-side wait for the 'lock' state to clear.
Have the waiters use FIFO queueing and mark them (reader/writer) with
a new WQ_FLAG. Use a custom wake_function to wake either a single
writer or all readers until a writer.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/percpu-rwsem.h | 19 +----
include/linux/wait.h | 1
kernel/locking/percpu-rwsem.c | 140 +++++++++++++++++++++++++++++-------------
kernel/locking/rwsem.c | 9 +-
kernel/locking/rwsem.h | 12 ---
5 files changed, 110 insertions(+), 71 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -3,18 +3,18 @@
#define _LINUX_PERCPU_RWSEM_H
#include <linux/atomic.h>
-#include <linux/rwsem.h>
#include <linux/percpu.h>
#include <linux/rcuwait.h>
+#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>
struct percpu_rw_semaphore {
struct rcu_sync rss;
unsigned int __percpu *read_count;
- struct rw_semaphore rw_sem; /* slowpath */
- struct rcuwait writer; /* blocked writer */
- int readers_block;
+ struct rcuwait writer;
+ wait_queue_head_t waiters;
+ atomic_t block;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -31,8 +31,9 @@ static DEFINE_PER_CPU(unsigned int, __pe
is_static struct percpu_rw_semaphore name = { \
.rss = __RCU_SYNC_INITIALIZER(name.rss), \
.read_count = &__percpu_rwsem_rc_##name, \
- .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
.writer = __RCUWAIT_INITIALIZER(name.writer), \
+ .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters), \
+ .block = ATOMIC_INIT(0), \
__PERCPU_RWSEM_DEP_MAP_INIT(name) \
}
@@ -130,20 +131,12 @@ static inline void percpu_rwsem_release(
bool read, unsigned long ip)
{
lock_release(&sem->dep_map, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
- if (!read)
- atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
-#endif
}
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
bool read, unsigned long ip)
{
lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
- if (!read)
- atomic_long_set(&sem->rw_sem.owner, (long)current);
-#endif
}
#endif
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -20,6 +20,7 @@ int default_wake_function(struct wait_qu
#define WQ_FLAG_EXCLUSIVE 0x01
#define WQ_FLAG_WOKEN 0x02
#define WQ_FLAG_BOOKMARK 0x04
+#define WQ_FLAG_CUSTOM 0x08
/*
* A single wait-queue entry structure:
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,15 +1,14 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
-#include <linux/rwsem.h>
#include <linux/percpu.h>
+#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/errno.h>
-#include "rwsem.h"
-
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *key)
{
@@ -17,11 +16,10 @@ int __percpu_init_rwsem(struct percpu_rw
if (unlikely(!sem->read_count))
return -ENOMEM;
- /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
rcu_sync_init(&sem->rss);
- __init_rwsem(&sem->rw_sem, name, rwsem_key);
rcuwait_init(&sem->writer);
- sem->readers_block = 0;
+ init_waitqueue_head(&sem->waiters);
+ atomic_set(&sem->block, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
@@ -54,23 +52,23 @@ static bool __percpu_down_read_trylock(s
* the same CPU as the increment, avoiding the
* increment-on-one-CPU-and-decrement-on-another problem.
*
- * If the reader misses the writer's assignment of readers_block, then
- * the writer is guaranteed to see the reader's increment.
+ * If the reader misses the writer's assignment of sem->block, then the
+ * writer is guaranteed to see the reader's increment.
*
* Conversely, any readers that increment their sem->read_count after
- * the writer looks are guaranteed to see the readers_block value,
- * which in turn means that they are guaranteed to immediately
- * decrement their sem->read_count, so that it doesn't matter that the
- * writer missed them.
+ * the writer looks are guaranteed to see the sem->block value, which
+ * in turn means that they are guaranteed to immediately decrement
+ * their sem->read_count, so that it doesn't matter that the writer
+ * missed them.
*/
smp_mb(); /* A matches D */
/*
- * If !readers_block the critical section starts here, matched by the
+ * If !sem->block the critical section starts here, matched by the
* release in percpu_up_write().
*/
- if (likely(!atomic_read_acquire(&sem->readers_block)))
+ if (likely(!atomic_read_acquire(&sem->block)))
return true;
__this_cpu_dec(*sem->read_count);
@@ -81,6 +79,75 @@ static bool __percpu_down_read_trylock(s
return false;
}
+static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
+{
+ if (atomic_read(&sem->block))
+ return false;
+
+ return atomic_xchg(&sem->block, 1) == 0;
+}
+
+static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
+{
+ if (reader) {
+ bool ret;
+
+ preempt_disable();
+ ret = __percpu_down_read_trylock(sem);
+ preempt_enable();
+
+ return ret;
+ }
+ return __percpu_down_write_trylock(sem);
+}
+
+static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
+ unsigned int mode, int wake_flags,
+ void *key)
+{
+ struct task_struct *p = get_task_struct(wq_entry->private);
+ bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
+ struct percpu_rw_semaphore *sem = key;
+
+ /* concurrent against percpu_down_write(), can get stolen */
+ if (!__percpu_rwsem_trylock(sem, reader))
+ return 1;
+
+ list_del_init(&wq_entry->entry);
+ smp_store_release(&wq_entry->private, NULL);
+
+ wake_up_process(p);
+ put_task_struct(p);
+
+ return !reader; /* wake 'all' readers and 1 writer */
+}
+
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
+{
+ DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
+ bool wait;
+
+ spin_lock_irq(&sem->waiters.lock);
+ /*
+ * Serialize against the wakeup in percpu_up_write(), if we fail
+ * the trylock, the wakeup must see us on the list.
+ */
+ wait = !__percpu_rwsem_trylock(sem, reader);
+ if (wait) {
+ wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
+ __add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
+ }
+ spin_unlock_irq(&sem->waiters.lock);
+
+ while (wait) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!smp_load_acquire(&wq_entry.private))
+ break;
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
if (__percpu_down_read_trylock(sem))
@@ -89,20 +156,10 @@ bool __percpu_down_read(struct percpu_rw
if (try)
return false;
- /*
- * We either call schedule() in the wait, or we'll fall through
- * and reschedule on the preempt_enable() in percpu_down_read().
- */
- preempt_enable_no_resched();
-
- /*
- * Avoid lockdep for the down/up_read() we already have them.
- */
- __down_read(&sem->rw_sem);
- this_cpu_inc(*sem->read_count);
- __up_read(&sem->rw_sem);
-
+ preempt_enable();
+ percpu_rwsem_wait(sem, /* .reader = */ true );
preempt_disable();
+
return true;
}
EXPORT_SYMBOL_GPL(__percpu_down_read);
@@ -117,7 +174,7 @@ void __percpu_up_read(struct percpu_rw_s
*/
__this_cpu_dec(*sem->read_count);
- /* Prod writer to recheck readers_active */
+ /* Prod writer to re-evaluate readers_active_check() */
rcuwait_wake_up(&sem->writer);
}
EXPORT_SYMBOL_GPL(__percpu_up_read);
@@ -137,6 +194,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
* zero. If this sum is zero, then it is stable due to the fact that if any
* newly arriving readers increment a given counter, they will immediately
* decrement that same counter.
+ *
+ * Assumes sem->block is set.
*/
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
@@ -160,23 +219,22 @@ void percpu_down_write(struct percpu_rw_
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
- __down_write(&sem->rw_sem);
-
/*
- * Notify new readers to block; up until now, and thus throughout the
- * longish rcu_sync_enter() above, new readers could still come in.
+ * Try set sem->block; this provides writer-writer exclusion.
+ * Having sem->block set makes new readers block.
*/
- WRITE_ONCE(sem->readers_block, 1);
+ if (!__percpu_down_write_trylock(sem))
+ percpu_rwsem_wait(sem, /* .reader = */ false);
- smp_mb(); /* D matches A */
+ /* smp_mb() implied by __percpu_down_writer_trylock() on success -- D matches A */
/*
- * If they don't see our writer of readers_block, then we are
- * guaranteed to see their sem->read_count increment, and therefore
- * will wait for them.
+ * If they don't see our store of sem->block, then we are guaranteed to
+ * see their sem->read_count increment, and therefore will wait for
+ * them.
*/
- /* Wait for all now active readers to complete. */
+ /* Wait for all active readers to complete. */
rcuwait_wait_event(&sem->writer, readers_active_check(sem));
}
EXPORT_SYMBOL_GPL(percpu_down_write);
@@ -195,12 +253,12 @@ void percpu_up_write(struct percpu_rw_se
* Therefore we force it through the slow path which guarantees an
* acquire and thereby guarantees the critical section's consistency.
*/
- smp_store_release(&sem->readers_block, 0);
+ atomic_set_release(&sem->block, 0);
/*
- * Release the write lock, this will allow readers back in the game.
+ * Prod any pending reader/writer to make progress.
*/
- __up_write(&sem->rw_sem);
+ __wake_up(&sem->waiters, TASK_NORMAL, 1, sem);
/*
* Once this completes (at least one RCU-sched grace period hence) the
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -28,7 +28,6 @@
#include <linux/rwsem.h>
#include <linux/atomic.h>
-#include "rwsem.h"
#include "lock_events.h"
/*
@@ -1338,7 +1337,7 @@ static struct rw_semaphore *rwsem_downgr
/*
* lock for reading
*/
-inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct rw_semaphore *sem)
{
if (!rwsem_read_trylock(sem)) {
rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
@@ -1383,7 +1382,7 @@ static inline int __down_read_trylock(st
/*
* lock for writing
*/
-inline void __down_write(struct rw_semaphore *sem)
+static inline void __down_write(struct rw_semaphore *sem)
{
long tmp = RWSEM_UNLOCKED_VALUE;
@@ -1426,7 +1425,7 @@ static inline int __down_write_trylock(s
/*
* unlock after reading
*/
-inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct rw_semaphore *sem)
{
long tmp;
@@ -1446,7 +1445,7 @@ inline void __up_read(struct rw_semaphor
/*
* unlock after writing
*/
-inline void __up_write(struct rw_semaphore *sem)
+static inline void __up_write(struct rw_semaphore *sem)
{
long tmp;
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __INTERNAL_RWSEM_H
-#define __INTERNAL_RWSEM_H
-#include <linux/rwsem.h>
-
-extern void __down_read(struct rw_semaphore *sem);
-extern void __up_read(struct rw_semaphore *sem);
-extern void __down_write(struct rw_semaphore *sem);
-extern void __up_write(struct rw_semaphore *sem);
-
-#endif /* __INTERNAL_RWSEM_H */
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-13 10:21 ` [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
@ 2019-11-18 19:53 ` Davidlohr Bueso
2019-11-18 23:19 ` Davidlohr Bueso
2019-12-17 10:35 ` Peter Zijlstra
2019-11-18 21:52 ` Waiman Long
2019-11-19 13:50 ` Waiman Long
2 siblings, 2 replies; 23+ messages in thread
From: Davidlohr Bueso @ 2019-11-18 19:53 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
williams, bristot, longman, jack
On Wed, 13 Nov 2019, Peter Zijlstra wrote:
>@@ -54,23 +52,23 @@ static bool __percpu_down_read_trylock(s
> * the same CPU as the increment, avoiding the
> * increment-on-one-CPU-and-decrement-on-another problem.
Nit: Now that you've made read_count more symmetric, maybe this first
paragraph can be moved down to __percpu_rwsem_trylock() reader side,
as such:
/*
* Due to having preemption disabled the decrement happens on
* the same CPU as the increment, avoiding the
* increment-on-one-CPU-and-decrement-on-another problem.
*/
preempt_disable();
ret = __percpu_down_read_trylock(sem);
preempt_enable();
> *
>- * If the reader misses the writer's assignment of readers_block, then
>- * the writer is guaranteed to see the reader's increment.
>+ * If the reader misses the writer's assignment of sem->block, then the
>+ * writer is guaranteed to see the reader's increment.
...
> bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
> {
> if (__percpu_down_read_trylock(sem))
>@@ -89,20 +156,10 @@ bool __percpu_down_read(struct percpu_rw
> if (try)
> return false;
>
>- /*
>- * We either call schedule() in the wait, or we'll fall through
>- * and reschedule on the preempt_enable() in percpu_down_read().
>- */
>- preempt_enable_no_resched();
>-
>- /*
>- * Avoid lockdep for the down/up_read() we already have them.
>- */
>- __down_read(&sem->rw_sem);
>- this_cpu_inc(*sem->read_count);
>- __up_read(&sem->rw_sem);
>-
>+ preempt_enable();
>+ percpu_rwsem_wait(sem, /* .reader = */ true );
> preempt_disable();
>+
> return true;
> }
> EXPORT_SYMBOL_GPL(__percpu_down_read);
Do we really need to export symbol here? This function is only called
from percpu-rwsem.h.
>@@ -117,7 +174,7 @@ void __percpu_up_read(struct percpu_rw_s
> */
> __this_cpu_dec(*sem->read_count);
>
>- /* Prod writer to recheck readers_active */
>+ /* Prod writer to re-evaluate readers_active_check() */
> rcuwait_wake_up(&sem->writer);
> }
> EXPORT_SYMBOL_GPL(__percpu_up_read);
>@@ -137,6 +194,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
> * zero. If this sum is zero, then it is stable due to the fact that if any
> * newly arriving readers increment a given counter, they will immediately
> * decrement that same counter.
>+ *
>+ * Assumes sem->block is set.
> */
> static bool readers_active_check(struct percpu_rw_semaphore *sem)
> {
>@@ -160,23 +219,22 @@ void percpu_down_write(struct percpu_rw_
> /* Notify readers to take the slow path. */
> rcu_sync_enter(&sem->rss);
>
>- __down_write(&sem->rw_sem);
>-
> /*
>- * Notify new readers to block; up until now, and thus throughout the
>- * longish rcu_sync_enter() above, new readers could still come in.
>+ * Try set sem->block; this provides writer-writer exclusion.
>+ * Having sem->block set makes new readers block.
> */
>- WRITE_ONCE(sem->readers_block, 1);
>+ if (!__percpu_down_write_trylock(sem))
>+ percpu_rwsem_wait(sem, /* .reader = */ false);
>
>- smp_mb(); /* D matches A */
>+ /* smp_mb() implied by __percpu_down_writer_trylock() on success -- D matches A */
^^^
write
...
>--- a/kernel/locking/rwsem.h
>+++ b/kernel/locking/rwsem.h
>@@ -1,12 +0,0 @@
>-/* SPDX-License-Identifier: GPL-2.0 */
>-
>-#ifndef __INTERNAL_RWSEM_H
>-#define __INTERNAL_RWSEM_H
>-#include <linux/rwsem.h>
>-
>-extern void __down_read(struct rw_semaphore *sem);
>-extern void __up_read(struct rw_semaphore *sem);
>-extern void __down_write(struct rw_semaphore *sem);
>-extern void __up_write(struct rw_semaphore *sem);
This is a nice side effect.
Thanks,
Davidlohr
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-18 19:53 ` Davidlohr Bueso
@ 2019-11-18 23:19 ` Davidlohr Bueso
2019-12-17 10:45 ` Peter Zijlstra
2019-12-17 10:35 ` Peter Zijlstra
1 sibling, 1 reply; 23+ messages in thread
From: Davidlohr Bueso @ 2019-11-18 23:19 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
williams, bristot, longman, jack
On Mon, 18 Nov 2019, Davidlohr Bueso wrote:
>On Wed, 13 Nov 2019, Peter Zijlstra wrote:
3>>bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
>>{
>> if (__percpu_down_read_trylock(sem))
>>@@ -89,20 +156,10 @@ bool __percpu_down_read(struct percpu_rw
>> if (try)
>> return false;
>>
>>- /*
>>- * We either call schedule() in the wait, or we'll fall through
>>- * and reschedule on the preempt_enable() in percpu_down_read().
>>- */
>>- preempt_enable_no_resched();
>>-
>>- /*
>>- * Avoid lockdep for the down/up_read() we already have them.
>>- */
>>- __down_read(&sem->rw_sem);
>>- this_cpu_inc(*sem->read_count);
>>- __up_read(&sem->rw_sem);
>>-
>>+ preempt_enable();
>>+ percpu_rwsem_wait(sem, /* .reader = */ true );
>> preempt_disable();
>>+
>> return true;
>>}
>>EXPORT_SYMBOL_GPL(__percpu_down_read);
>
>Do we really need to export symbol here? This function is only called
>from percpu-rwsem.h.
Similarly, afaict we can get rid of __percpu_up_read() and put the
slowpath all into percpu_up_read(). Also explicitly mention the
single task nature of the writer (which is a better comment for
the rcuwait_wake_up()).
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index f5ecf6a8a1dd..eda545f42fb8 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -43,7 +43,6 @@ is_static struct percpu_rw_semaphore name = { \
__DEFINE_PERCPU_RWSEM(name, static)
extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
-extern void __percpu_up_read(struct percpu_rw_semaphore *);
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
@@ -103,10 +102,23 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
/*
* Same as in percpu_down_read().
*/
- if (likely(rcu_sync_is_idle(&sem->rss)))
+ if (likely(rcu_sync_is_idle(&sem->rss))) {
__this_cpu_dec(*sem->read_count);
- else
- __percpu_up_read(sem); /* Unconditional memory barrier */
+ goto done;
+ }
+
+ /*
+ * slowpath; reader will only ever wake a single blocked writer.
+ */
+ smp_mb(); /* B matches C */
+ /*
+ * In other words, if they see our decrement (presumably to
+ * aggregate zero, as that is the only time it matters) they
+ * will also see our critical section.
+ */
+ __this_cpu_dec(*sem->read_count);
+ rcuwait_wake_up(&sem->writer);
+done:
preempt_enable();
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 851038468efb..a5150a876626 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -164,21 +164,6 @@ bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
}
EXPORT_SYMBOL_GPL(__percpu_down_read);
-void __percpu_up_read(struct percpu_rw_semaphore *sem)
-{
- smp_mb(); /* B matches C */
- /*
- * In other words, if they see our decrement (presumably to aggregate
- * zero, as that is the only time it matters) they will also see our
- * critical section.
- */
- __this_cpu_dec(*sem->read_count);
-
- /* Prod writer to re-evaluate readers_active_check() */
- rcuwait_wake_up(&sem->writer);
-}
-EXPORT_SYMBOL_GPL(__percpu_up_read);
-
#define per_cpu_sum(var) \
({ \
typeof(var) __sum = 0; \
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-18 23:19 ` Davidlohr Bueso
@ 2019-12-17 10:45 ` Peter Zijlstra
0 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2019-12-17 10:45 UTC (permalink / raw)
To: Davidlohr Bueso
Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
williams, bristot, longman, jack
On Mon, Nov 18, 2019 at 03:19:35PM -0800, Davidlohr Bueso wrote:
> Similarly, afaict we can get rid of __percpu_up_read() and put the
> slowpath all into percpu_up_read(). Also explicitly mention the
> single task nature of the writer (which is a better comment for
> the rcuwait_wake_up()).
> static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
> {
> @@ -103,10 +102,23 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
> /*
> * Same as in percpu_down_read().
> */
> + if (likely(rcu_sync_is_idle(&sem->rss))) {
> __this_cpu_dec(*sem->read_count);
> + goto done;
> + }
> +
> + /*
> + * slowpath; reader will only ever wake a single blocked writer.
> + */
> + smp_mb(); /* B matches C */
> + /*
> + * In other words, if they see our decrement (presumably to
> + * aggregate zero, as that is the only time it matters) they
> + * will also see our critical section.
> + */
> + __this_cpu_dec(*sem->read_count);
> + rcuwait_wake_up(&sem->writer);
> +done:
> preempt_enable();
> }
Let me write that as a normal if () { } else { }.
But yes, that's small enough I suppose.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-18 19:53 ` Davidlohr Bueso
2019-11-18 23:19 ` Davidlohr Bueso
@ 2019-12-17 10:35 ` Peter Zijlstra
1 sibling, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2019-12-17 10:35 UTC (permalink / raw)
To: Davidlohr Bueso
Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
williams, bristot, longman, jack
On Mon, Nov 18, 2019 at 11:53:04AM -0800, Davidlohr Bueso wrote:
> On Wed, 13 Nov 2019, Peter Zijlstra wrote:
> > @@ -54,23 +52,23 @@ static bool __percpu_down_read_trylock(s
> > * the same CPU as the increment, avoiding the
> > * increment-on-one-CPU-and-decrement-on-another problem.
>
> Nit: Now that you've made read_count more symmetric, maybe this first
> paragraph can be moved down to __percpu_rwsem_trylock() reader side,
> as such:
>
> /*
> * Due to having preemption disabled the decrement happens on
> * the same CPU as the increment, avoiding the
> * increment-on-one-CPU-and-decrement-on-another problem.
> */
> preempt_disable();
> ret = __percpu_down_read_trylock(sem);
> preempt_enable();
There's another callsite for that function too, so I think the current
place still works best.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-13 10:21 ` [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
2019-11-18 19:53 ` Davidlohr Bueso
@ 2019-11-18 21:52 ` Waiman Long
2019-12-17 10:28 ` Peter Zijlstra
2019-11-19 13:50 ` Waiman Long
2 siblings, 1 reply; 23+ messages in thread
From: Waiman Long @ 2019-11-18 21:52 UTC (permalink / raw)
To: Peter Zijlstra, mingo, will
Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
dave, jack
On 11/13/19 5:21 AM, Peter Zijlstra wrote:
> @@ -130,20 +131,12 @@ static inline void percpu_rwsem_release(
> bool read, unsigned long ip)
> {
> lock_release(&sem->dep_map, ip);
> -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
> - if (!read)
> - atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
> -#endif
> }
>
This is the only place that set RWSEM_OWNER_UNKNOWN. So you may as well
remove all references to it:
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 00d6054687dd..8a418d9eeb7a 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -53,12 +53,6 @@ struct rw_semaphore {
#endif
};
-/*
- * Setting all bits of the owner field except bit 0 will indicate
- * that the rwsem is writer-owned with an unknown owner.
- */
-#define RWSEM_OWNER_UNKNOWN (-2L)
-
/* In all implementations count != 0 means locked */
static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index eef04551eae7..622842754c73 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -660,8 +660,6 @@ static inline bool rwsem_can_spin_on_owner(struct
rw_semapho
unsigned long flags;
bool ret = true;
- BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
-
if (need_resched()) {
lockevent_inc(rwsem_opt_fail);
return false;
Cheers,
Longman
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-18 21:52 ` Waiman Long
@ 2019-12-17 10:28 ` Peter Zijlstra
0 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2019-12-17 10:28 UTC (permalink / raw)
To: Waiman Long
Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, juri.lelli,
williams, bristot, dave, jack
On Mon, Nov 18, 2019 at 04:52:23PM -0500, Waiman Long wrote:
> On 11/13/19 5:21 AM, Peter Zijlstra wrote:
> > @@ -130,20 +131,12 @@ static inline void percpu_rwsem_release(
> > bool read, unsigned long ip)
> > {
> > lock_release(&sem->dep_map, ip);
> > -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
> > - if (!read)
> > - atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
> > -#endif
> > }
> >
>
> This is the only place that set RWSEM_OWNER_UNKNOWN. So you may as well
> remove all references to it:
Done, thanks!
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-13 10:21 ` [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
2019-11-18 19:53 ` Davidlohr Bueso
2019-11-18 21:52 ` Waiman Long
@ 2019-11-19 13:50 ` Waiman Long
2019-11-19 15:58 ` Oleg Nesterov
2 siblings, 1 reply; 23+ messages in thread
From: Waiman Long @ 2019-11-19 13:50 UTC (permalink / raw)
To: Peter Zijlstra, mingo, will
Cc: oleg, tglx, linux-kernel, bigeasy, juri.lelli, williams, bristot,
dave, jack
On 11/13/19 5:21 AM, Peter Zijlstra wrote:
> +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
> + unsigned int mode, int wake_flags,
> + void *key)
> +{
> + struct task_struct *p = get_task_struct(wq_entry->private);
> + bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
> + struct percpu_rw_semaphore *sem = key;
> +
> + /* concurrent against percpu_down_write(), can get stolen */
> + if (!__percpu_rwsem_trylock(sem, reader))
> + return 1;
> +
> + list_del_init(&wq_entry->entry);
> + smp_store_release(&wq_entry->private, NULL);
> +
> + wake_up_process(p);
> + put_task_struct(p);
> +
> + return !reader; /* wake 'all' readers and 1 writer */
> +}
> +
> +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
> +{
> + DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
> + bool wait;
> +
> + spin_lock_irq(&sem->waiters.lock);
> + /*
> + * Serialize against the wakeup in percpu_up_write(), if we fail
> + * the trylock, the wakeup must see us on the list.
> + */
> + wait = !__percpu_rwsem_trylock(sem, reader);
> + if (wait) {
> + wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
> + __add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
> + }
> + spin_unlock_irq(&sem->waiters.lock);
> +
> + while (wait) {
> + set_current_state(TASK_UNINTERRUPTIBLE);
> + if (!smp_load_acquire(&wq_entry.private))
> + break;
> + schedule();
> + }
If I read the function correctly, you are setting the WQ_FLAG_EXCLUSIVE
for both readers and writers and __wake_up() is called with an exclusive
count of one. So only one reader or writer is woken up each time.
However, the comment above said we wake 'all' readers and 1 writer. That
doesn't match the actual code, IMO. To match the comments, you should
have set WQ_FLAG_EXCLUSIVE flag only on writer. In this case, you
probably don't need WQ_FLAG_CUSTOM to differentiate between readers and
writers.
Cheers,
Longman
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-19 13:50 ` Waiman Long
@ 2019-11-19 15:58 ` Oleg Nesterov
2019-11-19 16:28 ` Waiman Long
2019-12-17 10:26 ` Peter Zijlstra
0 siblings, 2 replies; 23+ messages in thread
From: Oleg Nesterov @ 2019-11-19 15:58 UTC (permalink / raw)
To: Waiman Long
Cc: Peter Zijlstra, mingo, will, tglx, linux-kernel, bigeasy,
juri.lelli, williams, bristot, dave, jack
On 11/19, Waiman Long wrote:
>
> On 11/13/19 5:21 AM, Peter Zijlstra wrote:
> > +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
> > + unsigned int mode, int wake_flags,
> > + void *key)
> > +{
> > + struct task_struct *p = get_task_struct(wq_entry->private);
> > + bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
> > + struct percpu_rw_semaphore *sem = key;
> > +
> > + /* concurrent against percpu_down_write(), can get stolen */
> > + if (!__percpu_rwsem_trylock(sem, reader))
> > + return 1;
> > +
> > + list_del_init(&wq_entry->entry);
> > + smp_store_release(&wq_entry->private, NULL);
> > +
> > + wake_up_process(p);
> > + put_task_struct(p);
> > +
> > + return !reader; /* wake 'all' readers and 1 writer */
> > +}
> > +
>
> If I read the function correctly, you are setting the WQ_FLAG_EXCLUSIVE
> for both readers and writers and __wake_up() is called with an exclusive
> count of one. So only one reader or writer is woken up each time.
This depends on what percpu_rwsem_wake_function() returns. If it returns 1,
__wake_up_common() stops, exactly because all waiters have WQ_FLAG_EXCLUSIVE.
> However, the comment above said we wake 'all' readers and 1 writer. That
> doesn't match the actual code, IMO.
Well, "'all' readers" probably means "all readers before writer",
> To match the comments, you should
> have set WQ_FLAG_EXCLUSIVE flag only on writer. In this case, you
> probably don't need WQ_FLAG_CUSTOM to differentiate between readers and
> writers.
See above...
note also the
if (!__percpu_rwsem_trylock(sem, reader))
return 1;
at the start of percpu_rwsem_wake_function(). We want to stop wake_up_common()
as soon as percpu_rwsem_trylock() fails. Because we know that if it fails once
it can't succeed later. Although iiuc this can only happen if another (new)
writer races with __wake_up(&sem->waiters).
I guess WQ_FLAG_CUSTOM can be avoided, percpu_rwsem_wait() could do
if (read)
__add_wait_queue_entry_tail(...);
else {
wq_entry.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue(...);
}
but this is "unfair".
Oleg.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-19 15:58 ` Oleg Nesterov
@ 2019-11-19 16:28 ` Waiman Long
2019-12-17 10:26 ` Peter Zijlstra
1 sibling, 0 replies; 23+ messages in thread
From: Waiman Long @ 2019-11-19 16:28 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Peter Zijlstra, mingo, will, tglx, linux-kernel, bigeasy,
juri.lelli, williams, bristot, dave, jack
On 11/19/19 10:58 AM, Oleg Nesterov wrote:
> On 11/19, Waiman Long wrote:
>> On 11/13/19 5:21 AM, Peter Zijlstra wrote:
>>> +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
>>> + unsigned int mode, int wake_flags,
>>> + void *key)
>>> +{
>>> + struct task_struct *p = get_task_struct(wq_entry->private);
>>> + bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
>>> + struct percpu_rw_semaphore *sem = key;
>>> +
>>> + /* concurrent against percpu_down_write(), can get stolen */
>>> + if (!__percpu_rwsem_trylock(sem, reader))
>>> + return 1;
>>> +
>>> + list_del_init(&wq_entry->entry);
>>> + smp_store_release(&wq_entry->private, NULL);
>>> +
>>> + wake_up_process(p);
>>> + put_task_struct(p);
>>> +
>>> + return !reader; /* wake 'all' readers and 1 writer */
>>> +}
>>> +
>> If I read the function correctly, you are setting the WQ_FLAG_EXCLUSIVE
>> for both readers and writers and __wake_up() is called with an exclusive
>> count of one. So only one reader or writer is woken up each time.
> This depends on what percpu_rwsem_wake_function() returns. If it returns 1,
> __wake_up_common() stops, exactly because all waiters have WQ_FLAG_EXCLUSIVE.
>
>> However, the comment above said we wake 'all' readers and 1 writer. That
>> doesn't match the actual code, IMO.
> Well, "'all' readers" probably means "all readers before writer",
>
>> To match the comments, you should
>> have set WQ_FLAG_EXCLUSIVE flag only on writer. In this case, you
>> probably don't need WQ_FLAG_CUSTOM to differentiate between readers and
>> writers.
> See above...
>
> note also the
>
> if (!__percpu_rwsem_trylock(sem, reader))
> return 1;
>
> at the start of percpu_rwsem_wake_function(). We want to stop wake_up_common()
> as soon as percpu_rwsem_trylock() fails. Because we know that if it fails once
> it can't succeed later. Although iiuc this can only happen if another (new)
> writer races with __wake_up(&sem->waiters).
>
>
> I guess WQ_FLAG_CUSTOM can be avoided, percpu_rwsem_wait() could do
>
> if (read)
> __add_wait_queue_entry_tail(...);
> else {
> wq_entry.flags |= WQ_FLAG_EXCLUSIVE;
> __add_wait_queue(...);
> }
>
> but this is "unfair".
Thanks for the explanation. That clarifies my understanding of the patch.
Cheers,
Longman
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-11-19 15:58 ` Oleg Nesterov
2019-11-19 16:28 ` Waiman Long
@ 2019-12-17 10:26 ` Peter Zijlstra
2019-12-17 10:28 ` Peter Zijlstra
1 sibling, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2019-12-17 10:26 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Waiman Long, mingo, will, tglx, linux-kernel, bigeasy,
juri.lelli, williams, bristot, dave, jack
On Tue, Nov 19, 2019 at 04:58:26PM +0100, Oleg Nesterov wrote:
> On 11/19, Waiman Long wrote:
> >
> > On 11/13/19 5:21 AM, Peter Zijlstra wrote:
> > > +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
> > > + unsigned int mode, int wake_flags,
> > > + void *key)
> > > +{
> > > + struct task_struct *p = get_task_struct(wq_entry->private);
> > > + bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
> > > + struct percpu_rw_semaphore *sem = key;
> > > +
> > > + /* concurrent against percpu_down_write(), can get stolen */
> > > + if (!__percpu_rwsem_trylock(sem, reader))
> > > + return 1;
> > > +
> > > + list_del_init(&wq_entry->entry);
> > > + smp_store_release(&wq_entry->private, NULL);
> > > +
> > > + wake_up_process(p);
> > > + put_task_struct(p);
> > > +
> > > + return !reader; /* wake 'all' readers and 1 writer */
> > > +}
> > > +
> >
> > If I read the function correctly, you are setting the WQ_FLAG_EXCLUSIVE
> > for both readers and writers and __wake_up() is called with an exclusive
> > count of one. So only one reader or writer is woken up each time.
>
> This depends on what percpu_rwsem_wake_function() returns. If it returns 1,
> __wake_up_common() stops, exactly because all waiters have WQ_FLAG_EXCLUSIVE.
Indeed, let me see if I can clarify that somehow.
> > However, the comment above said we wake 'all' readers and 1 writer. That
> > doesn't match the actual code, IMO.
>
> Well, "'all' readers" probably means "all readers before writer",
Correct.
> > To match the comments, you should
> > have set WQ_FLAG_EXCLUSIVE flag only on writer. In this case, you
> > probably don't need WQ_FLAG_CUSTOM to differentiate between readers and
> > writers.
>
> See above...
>
> note also the
>
> if (!__percpu_rwsem_trylock(sem, reader))
> return 1;
>
> at the start of percpu_rwsem_wake_function(). We want to stop wake_up_common()
> as soon as percpu_rwsem_trylock() fails. Because we know that if it fails once
> it can't succeed later. Although iiuc this can only happen if another (new)
> writer races with __wake_up(&sem->waiters).
Yes, writer-writer stealing can cause that. I even put a comment in
there :-)
> I guess WQ_FLAG_CUSTOM can be avoided, percpu_rwsem_wait() could do
>
> if (read)
> __add_wait_queue_entry_tail(...);
> else {
> wq_entry.flags |= WQ_FLAG_EXCLUSIVE;
> __add_wait_queue(...);
> }
>
> but this is "unfair".
Yes, I could not make it fair without that extra bit, and I figured we
have plenty bits there to play with so why not.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem
2019-12-17 10:26 ` Peter Zijlstra
@ 2019-12-17 10:28 ` Peter Zijlstra
0 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2019-12-17 10:28 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Waiman Long, mingo, will, tglx, linux-kernel, bigeasy,
juri.lelli, williams, bristot, dave, jack
On Tue, Dec 17, 2019 at 11:26:54AM +0100, Peter Zijlstra wrote:
> On Tue, Nov 19, 2019 at 04:58:26PM +0100, Oleg Nesterov wrote:
> > On 11/19, Waiman Long wrote:
> > >
> > > On 11/13/19 5:21 AM, Peter Zijlstra wrote:
> > > > +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
> > > > + unsigned int mode, int wake_flags,
> > > > + void *key)
> > > > +{
> > > > + struct task_struct *p = get_task_struct(wq_entry->private);
> > > > + bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
> > > > + struct percpu_rw_semaphore *sem = key;
> > > > +
> > > > + /* concurrent against percpu_down_write(), can get stolen */
> > > > + if (!__percpu_rwsem_trylock(sem, reader))
> > > > + return 1;
> > > > +
> > > > + list_del_init(&wq_entry->entry);
> > > > + smp_store_release(&wq_entry->private, NULL);
> > > > +
> > > > + wake_up_process(p);
> > > > + put_task_struct(p);
> > > > +
> > > > + return !reader; /* wake 'all' readers and 1 writer */
> > > > +}
> > > > +
> > >
> > > If I read the function correctly, you are setting the WQ_FLAG_EXCLUSIVE
> > > for both readers and writers and __wake_up() is called with an exclusive
> > > count of one. So only one reader or writer is woken up each time.
> >
> > This depends on what percpu_rwsem_wake_function() returns. If it returns 1,
> > __wake_up_common() stops, exactly because all waiters have WQ_FLAG_EXCLUSIVE.
>
> Indeed, let me see if I can clarify that somehow.
>
> > > However, the comment above said we wake 'all' readers and 1 writer. That
> > > doesn't match the actual code, IMO.
> >
> > Well, "'all' readers" probably means "all readers before writer",
>
> Correct.
Does this clarify?
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -101,6 +101,19 @@ static bool __percpu_rwsem_trylock(struc
return __percpu_down_write_trylock(sem);
}
+/*
+ * The return value of wait_queue_entry::func means:
+ *
+ * <0 - error, wakeup is terminated and the error is returned
+ * 0 - no wakeup, a next waiter is tried
+ * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
+ *
+ * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
+ * and play games with the return value to allow waking multiple readers.
+ *
+ * Specifically, we wake readers until we've woken a single writer, or until a
+ * trylock fails.
+ */
static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
unsigned int mode, int wake_flags,
void *key)
@@ -119,7 +132,7 @@ static int percpu_rwsem_wake_function(st
wake_up_process(p);
put_task_struct(p);
- return !reader; /* wake 'all' readers and 1 writer */
+ return !reader; /* wake (readers until) 1 writer */
}
static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 0/5] locking: Percpu-rwsem rewrite
2019-11-13 10:21 [PATCH 0/5] locking: Percpu-rwsem rewrite Peter Zijlstra
` (4 preceding siblings ...)
2019-11-13 10:21 ` [PATCH 5/5] locking/percpu-rwsem: Remove the embedded rwsem Peter Zijlstra
@ 2019-11-15 17:14 ` Juri Lelli
5 siblings, 0 replies; 23+ messages in thread
From: Juri Lelli @ 2019-11-15 17:14 UTC (permalink / raw)
To: Peter Zijlstra
Cc: mingo, will, oleg, tglx, linux-kernel, bigeasy, williams,
bristot, longman, dave, jack
Hi,
On 13/11/19 11:21, Peter Zijlstra wrote:
> Yet another version of the percpu-rwsem rewrite..
>
> This one (ab)uses the waitqueue in an entirely different and unique way, but no
> longer shares it like it did. It retains the use of rcuwait for the
> writer-waiting-for-readers-to-complete condition.
>
> This one should be FIFO fair with writer-stealing.
>
> It seems to pass locktorture torture_type=percpu_rwsem_lock. But as always,
> this stuff is tricky, please look carefully.
Backported this series to v5.2.21-rt13.
locktorture looks good (running for several hours) and DEBUG_LOCKS splat
[1] not reproducible anymore.
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Thanks!
Juri
1 - https://lore.kernel.org/lkml/20190326093421.GA29508@localhost.localdomain/
^ permalink raw reply [flat|nested] 23+ messages in thread