* [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
@ 2019-08-05 14:02 Peter Zijlstra
  2019-08-05 14:43 ` Boqun Feng
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Peter Zijlstra @ 2019-08-05 14:02 UTC (permalink / raw)
  To: Oleg Nesterov, Will Deacon, Ingo Molnar, Thomas Gleixner
  Cc: linux-kernel, bigeasy, juri.lelli, williams, bristot, longman,
	dave, oleg, jack


The filesystem freezer uses percpu_rwsem in a way that is effectively
write_non_owner() and achieves this with a few horrible hacks that
rely on the rwsem (!percpu) implementation.

When -RT re-implements rwsem, this house of cards comes undone.

Re-implement percpu_rwsem without relying on rwsem.

Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: jack@suse.com
---
 include/linux/percpu-rwsem.h  |   72 +++++++++++++-------------
 include/linux/wait.h          |    3 +
 kernel/cpu.c                  |    4 -
 kernel/locking/percpu-rwsem.c |  116 +++++++++++++++++++++++++-----------------
 4 files changed, 112 insertions(+), 83 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -5,39 +5,49 @@
 #include <linux/atomic.h>
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
-#include <linux/rcuwait.h>
+#include <linux/wait.h>
 #include <linux/rcu_sync.h>
 #include <linux/lockdep.h>
 
 struct percpu_rw_semaphore {
 	struct rcu_sync		rss;
 	unsigned int __percpu	*read_count;
-	struct rw_semaphore	rw_sem; /* slowpath */
-	struct rcuwait          writer; /* blocked writer */
-	int			readers_block;
+	wait_queue_head_t	waiters;
+	atomic_t		block;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
 };
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)	.dep_map = { .name = #lockname },
+#else
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
 #define __DEFINE_PERCPU_RWSEM(name, is_static)				\
 static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);		\
 is_static struct percpu_rw_semaphore name = {				\
 	.rss = __RCU_SYNC_INITIALIZER(name.rss),			\
 	.read_count = &__percpu_rwsem_rc_##name,			\
-	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
-	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
+	.waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),		\
+	.block = ATOMIC_INIT(0),					\
+	__PERCPU_RWSEM_DEP_MAP_INIT(name)				\
 }
+
 #define DEFINE_PERCPU_RWSEM(name)		\
 	__DEFINE_PERCPU_RWSEM(name, /* not static */)
 #define DEFINE_STATIC_PERCPU_RWSEM(name)	\
 	__DEFINE_PERCPU_RWSEM(name, static)
 
-extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
 extern void __percpu_up_read(struct percpu_rw_semaphore *);
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 {
 	might_sleep();
 
-	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
 
 	preempt_disable();
 	/*
@@ -58,42 +68,42 @@ static inline void percpu_down_read(stru
 	preempt_enable();
 }
 
-static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 {
-	int ret = 1;
-
 	preempt_disable();
 	/*
 	 * Same as in percpu_down_read().
 	 */
 	__this_cpu_inc(*sem->read_count);
-	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
-		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
+	if (unlikely(!rcu_sync_is_idle(&sem->rss))) {
+		if (!__percpu_down_read(sem, true)) /* Unconditional memory barrier */
+			return false;
+	}
 	preempt_enable();
 	/*
 	 * The barrier() from preempt_enable() prevents the compiler from
 	 * bleeding the critical section out.
 	 */
 
-	if (ret)
-		rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
-
-	return ret;
+	rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+	return true;
 }
 
 static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 {
+	rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
 	preempt_disable();
 	/*
 	 * Same as in percpu_down_read().
 	 */
-	if (likely(rcu_sync_is_idle(&sem->rss)))
+	if (likely(rcu_sync_is_idle(&sem->rss))) {
 		__this_cpu_dec(*sem->read_count);
-	else
-		__percpu_up_read(sem); /* Unconditional memory barrier */
-	preempt_enable();
+		preempt_enable();
+		return;
+	}
 
-	rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+	__percpu_up_read(sem); /* Unconditional memory barrier */
 }
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
@@ -110,29 +120,19 @@ extern void percpu_free_rwsem(struct per
 	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
 })
 
-#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
-
-#define percpu_rwsem_assert_held(sem)				\
-	lockdep_assert_held(&(sem)->rw_sem)
+#define percpu_rwsem_is_held(sem)	lockdep_is_held(sem)
+#define percpu_rwsem_assert_held(sem)	lockdep_assert_held(sem)
 
 static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
-	lock_release(&sem->rw_sem.dep_map, 1, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	if (!read)
-		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
-#endif
+	lock_release(&sem->dep_map, 1, ip);
 }
 
 static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
-	lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	if (!read)
-		atomic_long_set(&sem->rw_sem.owner, (long)current);
-#endif
+	lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
 }
 
 #endif
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -408,6 +408,9 @@ do {										\
 	__wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2);		\
 } while (0)
 
+#define wait_event_exclusive(wq_head, condition)				\
+	wait_event_exclusive_cmd(wq_head, condition, ,)
+
 #define __wait_event_cmd(wq_head, condition, cmd1, cmd2)			\
 	(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
 			    cmd1; schedule(); cmd2)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void)
 
 static void lockdep_acquire_cpus_lock(void)
 {
-	rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
+	rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
 }
 
 static void lockdep_release_cpus_lock(void)
 {
-	rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
+	rwsem_release(&cpu_hotplug_lock.dep_map, 1, _THIS_IP_);
 }
 
 /*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -2,6 +2,7 @@
 #include <linux/atomic.h>
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
+#include <linux/wait.h>
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/rcupdate.h>
@@ -11,17 +12,19 @@
 #include "rwsem.h"
 
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
-			const char *name, struct lock_class_key *rwsem_key)
+			const char *name, struct lock_class_key *key)
 {
 	sem->read_count = alloc_percpu(int);
 	if (unlikely(!sem->read_count))
 		return -ENOMEM;
 
-	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss);
-	__init_rwsem(&sem->rw_sem, name, rwsem_key);
-	rcuwait_init(&sem->writer);
-	sem->readers_block = 0;
+	init_waitqueue_head(&sem->waiters);
+	atomic_set(&sem->block, 0);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+	lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -41,59 +44,62 @@ void percpu_free_rwsem(struct percpu_rw_
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
+/*
+ * Called with preemption disabled, and returns with preemption disabled,
+ * except when it fails the trylock.
+ */
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 {
 	/*
 	 * Due to having preemption disabled the decrement happens on
 	 * the same CPU as the increment, avoiding the
 	 * increment-on-one-CPU-and-decrement-on-another problem.
 	 *
-	 * If the reader misses the writer's assignment of readers_block, then
-	 * the writer is guaranteed to see the reader's increment.
+	 * If the reader misses the writer's assignment of sem->block, then the
+	 * writer is guaranteed to see the reader's increment.
 	 *
 	 * Conversely, any readers that increment their sem->read_count after
-	 * the writer looks are guaranteed to see the readers_block value,
-	 * which in turn means that they are guaranteed to immediately
-	 * decrement their sem->read_count, so that it doesn't matter that the
-	 * writer missed them.
+	 * the writer looks are guaranteed to see the sem->block value, which
+	 * in turn means that they are guaranteed to immediately decrement
+	 * their sem->read_count, so that it doesn't matter that the writer
+	 * missed them.
 	 */
 
+again:
 	smp_mb(); /* A matches D */
 
 	/*
-	 * If !readers_block the critical section starts here, matched by the
+	 * If !sem->block the critical section starts here, matched by the
 	 * release in percpu_up_write().
 	 */
-	if (likely(!smp_load_acquire(&sem->readers_block)))
-		return 1;
+	if (likely(!atomic_read_acquire(&sem->block)))
+		return true;
 
 	/*
 	 * Per the above comment; we still have preemption disabled and
 	 * will thus decrement on the same CPU as we incremented.
 	 */
-	__percpu_up_read(sem);
+	__percpu_up_read(sem); /* implies preempt_enable() */
 
 	if (try)
-		return 0;
+		return false;
 
-	/*
-	 * We either call schedule() in the wait, or we'll fall through
-	 * and reschedule on the preempt_enable() in percpu_down_read().
-	 */
-	preempt_enable_no_resched();
+	wait_event(sem->waiters, !atomic_read_acquire(&sem->block));
+	preempt_disable();
+	__this_cpu_inc(*sem->read_count);
 
 	/*
-	 * Avoid lockdep for the down/up_read() we already have them.
+	 * percpu_down_write() could've set sem->block right after we've seen
+	 * it 0 but missed our this_cpu_inc(), which is exactly the condition
+	 * we get called for from percpu_down_read().
 	 */
-	__down_read(&sem->rw_sem);
-	this_cpu_inc(*sem->read_count);
-	__up_read(&sem->rw_sem);
-
-	preempt_disable();
-	return 1;
+	goto again;
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
 
+/*
+ * Called with preemption disabled, returns with preemption enabled.
+ */
 void __percpu_up_read(struct percpu_rw_semaphore *sem)
 {
 	smp_mb(); /* B matches C */
@@ -103,9 +109,10 @@ void __percpu_up_read(struct percpu_rw_s
 	 * critical section.
 	 */
 	__this_cpu_dec(*sem->read_count);
+	preempt_enable();
 
-	/* Prod writer to recheck readers_active */
-	rcuwait_wake_up(&sem->writer);
+	/* Prod writer to re-evaluate readers_active_check() */
+	wake_up(&sem->waiters);
 }
 EXPORT_SYMBOL_GPL(__percpu_up_read);
 
@@ -124,6 +131,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
  * zero.  If this sum is zero, then it is stable due to the fact that if any
  * newly arriving readers increment a given counter, they will immediately
  * decrement that same counter.
+ *
+ * Assumes sem->block is set.
  */
 static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
@@ -140,29 +149,37 @@ static bool readers_active_check(struct
 	return true;
 }
 
+static inline bool try_acquire_block(struct percpu_rw_semaphore *sem)
+{
+	if (atomic_read(&sem->block))
+		return false;
+
+	return atomic_xchg(&sem->block, 1) == 0;
+}
+
 void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
+	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
 
-	down_write(&sem->rw_sem);
-
 	/*
-	 * Notify new readers to block; up until now, and thus throughout the
-	 * longish rcu_sync_enter() above, new readers could still come in.
+	 * Try set sem->block; this provides writer-writer exclusion.
+	 * Having sem->block set makes new readers block.
 	 */
-	WRITE_ONCE(sem->readers_block, 1);
+	wait_event_exclusive(sem->waiters, try_acquire_block(sem));
 
-	smp_mb(); /* D matches A */
+	/* smp_mb() implied by try_acquire_block() on success -- D matches A */
 
 	/*
-	 * If they don't see our writer of readers_block, then we are
-	 * guaranteed to see their sem->read_count increment, and therefore
-	 * will wait for them.
+	 * If they don't see our store of sem->block, then we are guaranteed to
+	 * see their sem->read_count increment, and therefore will wait for
+	 * them.
 	 */
 
-	/* Wait for all now active readers to complete. */
-	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
+	/* Wait for all active readers to complete. */
+	wait_event(sem->waiters, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
 
@@ -178,12 +195,19 @@ void percpu_up_write(struct percpu_rw_se
 	 * Therefore we force it through the slow path which guarantees an
 	 * acquire and thereby guarantees the critical section's consistency.
 	 */
-	smp_store_release(&sem->readers_block, 0);
+	atomic_set_release(&sem->block, 0);
 
 	/*
-	 * Release the write lock, this will allow readers back in the game.
+	 * Prod any pending reader/writer to make progress.
+	 *
+	 * While there is no fairness guarantee, readers are waiting !exclusive
+	 * and will thus be on the wait_list head, while writers are waiting
+	 * exclusive and will thus be on the wait_list tail.
+	 *
+	 * Therefore it is more likely a reader will acquire the lock, if there
+	 * are any.
 	 */
-	up_write(&sem->rw_sem);
+	wake_up(&sem->waiters);
 
 	/*
 	 * Once this completes (at least one RCU-sched grace period hence) the
@@ -191,5 +215,7 @@ void percpu_up_write(struct percpu_rw_se
 	 * exclusive write lock because its counting.
 	 */
 	rcu_sync_exit(&sem->rss);
+
+	rwsem_release(&sem->dep_map, 1, _RET_IP_);
 }
 EXPORT_SYMBOL_GPL(percpu_up_write);


* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-08-05 14:02 [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem Peter Zijlstra
@ 2019-08-05 14:43 ` Boqun Feng
  2019-08-05 14:58   ` Boqun Feng
  2019-08-06 16:17 ` Oleg Nesterov
  2019-08-07 14:45 ` Will Deacon
  2 siblings, 1 reply; 19+ messages in thread
From: Boqun Feng @ 2019-08-05 14:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Will Deacon, Ingo Molnar, Thomas Gleixner,
	linux-kernel, bigeasy, juri.lelli, williams, bristot, longman,
	dave, jack


On Mon, Aug 05, 2019 at 04:02:41PM +0200, Peter Zijlstra wrote:
[...]
>  
>  static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
>  {
> +	rwsem_release(&sem->dep_map, 1, _RET_IP_);
> +
>  	preempt_disable();
>  	/*
>  	 * Same as in percpu_down_read().
>  	 */
> -	if (likely(rcu_sync_is_idle(&sem->rss)))
> +	if (likely(rcu_sync_is_idle(&sem->rss))) {
>  		__this_cpu_dec(*sem->read_count);
> -	else
> -		__percpu_up_read(sem); /* Unconditional memory barrier */
> -	preempt_enable();
> +		preempt_enable();
> +		return;
> +	}
>  
> -	rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);

Missing a preempt_enable() here?

Regards,
Boqun

> +	__percpu_up_read(sem); /* Unconditional memory barrier */
>  }
>  
[...]



* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-08-05 14:43 ` Boqun Feng
@ 2019-08-05 14:58   ` Boqun Feng
  2019-08-05 15:43     ` Peter Zijlstra
  0 siblings, 1 reply; 19+ messages in thread
From: Boqun Feng @ 2019-08-05 14:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Will Deacon, Ingo Molnar, Thomas Gleixner,
	linux-kernel, bigeasy, juri.lelli, williams, bristot, longman,
	dave, jack


On Mon, Aug 05, 2019 at 10:43:18PM +0800, Boqun Feng wrote:
> On Mon, Aug 05, 2019 at 04:02:41PM +0200, Peter Zijlstra wrote:
> [...]
> >  
> >  static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
> >  {
> > +	rwsem_release(&sem->dep_map, 1, _RET_IP_);
> > +
> >  	preempt_disable();
> >  	/*
> >  	 * Same as in percpu_down_read().
> >  	 */
> > -	if (likely(rcu_sync_is_idle(&sem->rss)))
> > +	if (likely(rcu_sync_is_idle(&sem->rss))) {
> >  		__this_cpu_dec(*sem->read_count);
> > -	else
> > -		__percpu_up_read(sem); /* Unconditional memory barrier */
> > -	preempt_enable();
> > +		preempt_enable();
> > +		return;
> > +	}
> >  
> > -	rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
> 
> Missing a preempt_enable() here?
> 

Ah.. you modified the semantics of __percpu_up_read() to imply a
preempt_enable(), sorry for the noise...

Regards,
Boqun

> Regards,
> Boqun
> 
> > +	__percpu_up_read(sem); /* Unconditional memory barrier */
> >  }
> >  
> [...]





* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-08-05 14:58   ` Boqun Feng
@ 2019-08-05 15:43     ` Peter Zijlstra
  2019-08-06 14:15       ` Boqun Feng
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2019-08-05 15:43 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Oleg Nesterov, Will Deacon, Ingo Molnar, Thomas Gleixner,
	linux-kernel, bigeasy, juri.lelli, williams, bristot, longman,
	dave, jack

On Mon, Aug 05, 2019 at 10:58:13PM +0800, Boqun Feng wrote:
> On Mon, Aug 05, 2019 at 10:43:18PM +0800, Boqun Feng wrote:
> > On Mon, Aug 05, 2019 at 04:02:41PM +0200, Peter Zijlstra wrote:
> > [...]
> > >  
> > >  static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
> > >  {
> > > +	rwsem_release(&sem->dep_map, 1, _RET_IP_);
> > > +
> > >  	preempt_disable();
> > >  	/*
> > >  	 * Same as in percpu_down_read().
> > >  	 */
> > > -	if (likely(rcu_sync_is_idle(&sem->rss)))
> > > +	if (likely(rcu_sync_is_idle(&sem->rss))) {
> > >  		__this_cpu_dec(*sem->read_count);
> > > -	else
> > > -		__percpu_up_read(sem); /* Unconditional memory barrier */
> > > -	preempt_enable();
> > > +		preempt_enable();
> > > +		return;
> > > +	}
> > >  
> > > -	rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
> > 
> > Missing a preempt_enable() here?
> > 
> 
> Ah.. you modified the semantics of __percpu_up_read() to imply a
> preempt_enable(), sorry for the noise...

Yes indeed; I suppose I should've noted that in the Changelog. The reason
is that waitqueues use spin_lock(), which changes into a sleepable lock on
RT and thus cannot be used with preemption disabled. We also cannot
(easily) switch to swait because we use both exclusive and !exclusive
waits.
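
To make that concrete, this is the shape __percpu_up_read() ends up with in
the patch (same code as the hunk above, only annotated): the preempt_enable()
has to come before the wake_up(), so the wait_queue_head lock is never taken
with preemption disabled.

	void __percpu_up_read(struct percpu_rw_semaphore *sem)
	{
		smp_mb(); /* B matches C */
		/* same CPU as the increment, see percpu_down_read() */
		__this_cpu_dec(*sem->read_count);
		preempt_enable();
		/* safe on RT now; prod writer to re-evaluate readers_active_check() */
		wake_up(&sem->waiters);
	}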


* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-08-05 15:43     ` Peter Zijlstra
@ 2019-08-06 14:15       ` Boqun Feng
  0 siblings, 0 replies; 19+ messages in thread
From: Boqun Feng @ 2019-08-06 14:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Will Deacon, Ingo Molnar, Thomas Gleixner,
	linux-kernel, bigeasy, juri.lelli, williams, bristot, longman,
	dave, jack


On Mon, Aug 05, 2019 at 05:43:28PM +0200, Peter Zijlstra wrote:
> On Mon, Aug 05, 2019 at 10:58:13PM +0800, Boqun Feng wrote:
> > On Mon, Aug 05, 2019 at 10:43:18PM +0800, Boqun Feng wrote:
> > > On Mon, Aug 05, 2019 at 04:02:41PM +0200, Peter Zijlstra wrote:
> > > [...]
> > > >  
> > > >  static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
> > > >  {
> > > > +	rwsem_release(&sem->dep_map, 1, _RET_IP_);
> > > > +
> > > >  	preempt_disable();
> > > >  	/*
> > > >  	 * Same as in percpu_down_read().
> > > >  	 */
> > > > -	if (likely(rcu_sync_is_idle(&sem->rss)))
> > > > +	if (likely(rcu_sync_is_idle(&sem->rss))) {
> > > >  		__this_cpu_dec(*sem->read_count);
> > > > -	else
> > > > -		__percpu_up_read(sem); /* Unconditional memory barrier */
> > > > -	preempt_enable();
> > > > +		preempt_enable();
> > > > +		return;
> > > > +	}
> > > >  
> > > > -	rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
> > > 
> > > Missing a preempt_enable() here?
> > > 
> > 
> > Ah.. you modified the semantics of __percpu_up_read() to imply a
> > preempt_enable(), sorry for the noise...
> 
> Yes indeed; I suppose I should've noted that in the Changlog. The reason
> is that waitqueues use spin_lock() which change into a sleepable lock on
> RT and thus cannot be used with preeption disabled. We also cannot
> (easily) switch to swait because we use both exclusive and !exclusive
> waits.

Thanks for the explanation. I was missing the point that the modification
is mostly for RT, much clearer now ;-)

Regards,
Boqun



* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-08-05 14:02 [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem Peter Zijlstra
  2019-08-05 14:43 ` Boqun Feng
@ 2019-08-06 16:17 ` Oleg Nesterov
  2019-08-06 17:15   ` Peter Zijlstra
  2019-08-07 14:45 ` Will Deacon
  2 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2019-08-06 16:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On 08/05, Peter Zijlstra wrote:
>
> Re-implement percpu_rwsem without relying on rwsem.

looks correct... But,

> +bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
>  {
>  	/*
>  	 * Due to having preemption disabled the decrement happens on
>  	 * the same CPU as the increment, avoiding the
>  	 * increment-on-one-CPU-and-decrement-on-another problem.
>  	 *
> -	 * If the reader misses the writer's assignment of readers_block, then
> -	 * the writer is guaranteed to see the reader's increment.
> +	 * If the reader misses the writer's assignment of sem->block, then the
> +	 * writer is guaranteed to see the reader's increment.
>  	 *
>  	 * Conversely, any readers that increment their sem->read_count after
> -	 * the writer looks are guaranteed to see the readers_block value,
> -	 * which in turn means that they are guaranteed to immediately
> -	 * decrement their sem->read_count, so that it doesn't matter that the
> -	 * writer missed them.
> +	 * the writer looks are guaranteed to see the sem->block value, which
> +	 * in turn means that they are guaranteed to immediately decrement
> +	 * their sem->read_count, so that it doesn't matter that the writer
> +	 * missed them.
>  	 */
>  
> +again:
>  	smp_mb(); /* A matches D */
>  
>  	/*
> -	 * If !readers_block the critical section starts here, matched by the
> +	 * If !sem->block the critical section starts here, matched by the
>  	 * release in percpu_up_write().
>  	 */
> -	if (likely(!smp_load_acquire(&sem->readers_block)))
> -		return 1;
> +	if (likely(!atomic_read_acquire(&sem->block)))
> +		return true;
>  
>  	/*
>  	 * Per the above comment; we still have preemption disabled and
>  	 * will thus decrement on the same CPU as we incremented.
>  	 */
> -	__percpu_up_read(sem);
> +	__percpu_up_read(sem); /* implies preempt_enable() */

...

>  void __percpu_up_read(struct percpu_rw_semaphore *sem)
>  {
>  	smp_mb(); /* B matches C */
> @@ -103,9 +109,10 @@ void __percpu_up_read(struct percpu_rw_s
>  	 * critical section.
>  	 */
>  	__this_cpu_dec(*sem->read_count);
> +	preempt_enable();
>  
> -	/* Prod writer to recheck readers_active */
> -	rcuwait_wake_up(&sem->writer);
> +	/* Prod writer to re-evaluate readers_active_check() */
> +	wake_up(&sem->waiters);

but this will also wake all the pending readers up. Every reader will burn
CPU for no reason and likely delay the writer.

In fact I'm afraid this can lead to live-lock, because every reader in turn
will call __percpu_up_read().

To simplify, let's consider a single-cpu machine.

Suppose we have an active reader which sleeps somewhere and a pending writer
sleeping in wait_event(readers_active_check).

A new reader R1 comes and blocks in wait_event(!sem->block).

Another reader R2 comes and does wake_up(sem->waiters). Quite possibly it will
enter wait_event(!sem->block) before R1 gets CPU.

Then it is quite possible that R1 does __this_cpu_inc() before the writer
passes wait_event(readers_active_check()) or even before it gets CPU.

Now, R1 calls __percpu_up_read(), wakes R2 up, and so on.


How about 2 wait queues?

This way we can also make percpu_up_write() more fair; it can probably check
waitqueue_active(writers) before wake_up(readers).
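
Just to sketch what I mean (completely untested, and assuming the structure
grows separate "writer_waiters" and "reader_waiters" queues; the usual
waitqueue_active() memory-barrier caveat applies):

	static void percpu_up_write_wake(struct percpu_rw_semaphore *sem)
	{
		atomic_set_release(&sem->block, 0);

		/* order the release above against waitqueue_active() */
		smp_mb();
		if (waitqueue_active(&sem->writer_waiters))
			wake_up(&sem->writer_waiters);	/* one exclusive writer */
		else
			wake_up_all(&sem->reader_waiters);
	}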

Oleg.



* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-08-06 16:17 ` Oleg Nesterov
@ 2019-08-06 17:15   ` Peter Zijlstra
  2019-08-07  9:56     ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2019-08-06 17:15 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On Tue, Aug 06, 2019 at 06:17:42PM +0200, Oleg Nesterov wrote:

> but this will also wake all the pending readers up. Every reader will burn
> CPU for no reason and likely delay the writer.
> 
> In fact I'm afraid this can lead to live-lock, because every reader in turn
> will call __percpu_up_read().

I didn't really consider that case important; because of how heavy the
write side is, it should be relatively rare.

> How about 2 wait queues?

That said, I can certainly try that.


* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-08-06 17:15   ` Peter Zijlstra
@ 2019-08-07  9:56     ` Oleg Nesterov
  2019-10-29 18:47       ` Peter Zijlstra
  0 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2019-08-07  9:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On 08/06, Peter Zijlstra wrote:
>
> On Tue, Aug 06, 2019 at 06:17:42PM +0200, Oleg Nesterov wrote:
>
> > but this will also wake all the pending readers up. Every reader will burn
> > CPU for no reason and likely delay the writer.
> >
> > In fact I'm afraid this can lead to live-lock, because every reader in turn
> > will call __percpu_up_read().
>
> I didn't really consider that case important; because of how heavy the
> write side is, it should be relatively rare.

Well yes, but down_read() should not stress the system.

However I was wrong, it is not as bad as I thought; I forgot that the
pending reader won't return from wait_event(sem->block) if another reader
comes.

Still I think we should try to avoid the unnecessary wakeups. See below.

> > How about 2 wait queues?
>
> That said, I can certainly try that.

and either way, with or without 2 queues, what do you think about the code
below?

This way the new reader does wake_up() only in the very unlikely case when
it races with the new writer which sets sem->block = 1 right after
this_cpu_inc().

Oleg.
-------------------------------------------------------------------------------

static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
	might_sleep();
	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);

	preempt_disable();

	if (likely(rcu_sync_is_idle(&sem->rss)))
		__this_cpu_inc(*sem->read_count);
	else
		__percpu_down_read(sem, false);

	preempt_enable();
}

static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
	rwsem_release(&sem->dep_map, 1, _RET_IP_);

	preempt_disable();

	if (likely(rcu_sync_is_idle(&sem->rss)))
		__this_cpu_dec(*sem->read_count);
	else
		__percpu_up_read(sem);

	preempt_enable();
}

// both called and return with preemption disabled

bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{

	if (atomic_read_acquire(&sem->block)) {
again:
		preempt_enable();
		__wait_event(sem->waiters, !atomic_read_acquire(&sem->block));
		preempt_disable();
	}

	__this_cpu_inc(*sem->read_count);

	smp_mb();

	if (likely(!atomic_read_acquire(&sem->block)))
		return true;

	__percpu_up_read(sem);

	if (try)
		return false;

	goto again;
}

void __percpu_up_read(struct percpu_rw_semaphore *sem)
{
	smp_mb();

	__this_cpu_dec(*sem->read_count);

	wake_up(&sem->waiters);
}



* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-08-05 14:02 [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem Peter Zijlstra
  2019-08-05 14:43 ` Boqun Feng
  2019-08-06 16:17 ` Oleg Nesterov
@ 2019-08-07 14:45 ` Will Deacon
  2019-10-29 19:06   ` Peter Zijlstra
  2 siblings, 1 reply; 19+ messages in thread
From: Will Deacon @ 2019-08-07 14:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Will Deacon, Ingo Molnar, Thomas Gleixner,
	linux-kernel, bigeasy, juri.lelli, williams, bristot, longman,
	dave, jack

Hi Peter,

I've mostly been spared the joys of pcpu rwsem, but I took a look anyway.
Comments of questionable quality below.

On Mon, Aug 05, 2019 at 04:02:41PM +0200, Peter Zijlstra wrote:
> The filesystem freezer uses percpu_rwsem in a way that is effectively
> write_non_owner() and achieves this with a few horrible hacks that
> rely on the rwsem (!percpu) implementation.
> 
> When -RT re-implements rwsem this house of cards comes undone.
> 
> Re-implement percpu_rwsem without relying on rwsem.
> 
> Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Reported-by: Juri Lelli <juri.lelli@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Tested-by: Juri Lelli <juri.lelli@redhat.com>
> Cc: Clark Williams <williams@redhat.com>
> Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Waiman Long <longman@redhat.com>
> Cc: Davidlohr Bueso <dave@stgolabs.net>
> Cc: Oleg Nesterov <oleg@redhat.com>
> Cc: jack@suse.com
> ---
>  include/linux/percpu-rwsem.h  |   72 +++++++++++++-------------
>  include/linux/wait.h          |    3 +
>  kernel/cpu.c                  |    4 -
>  kernel/locking/percpu-rwsem.c |  116 +++++++++++++++++++++++++-----------------
>  4 files changed, 112 insertions(+), 83 deletions(-)

[...]

> +/*
> + * Called with preemption disabled, and returns with preemption disabled,
> + * except when it fails the trylock.
> + */
> +bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
>  {
>  	/*
>  	 * Due to having preemption disabled the decrement happens on
>  	 * the same CPU as the increment, avoiding the
>  	 * increment-on-one-CPU-and-decrement-on-another problem.
>  	 *
> -	 * If the reader misses the writer's assignment of readers_block, then
> -	 * the writer is guaranteed to see the reader's increment.
> +	 * If the reader misses the writer's assignment of sem->block, then the
> +	 * writer is guaranteed to see the reader's increment.
>  	 *
>  	 * Conversely, any readers that increment their sem->read_count after
> -	 * the writer looks are guaranteed to see the readers_block value,
> -	 * which in turn means that they are guaranteed to immediately
> -	 * decrement their sem->read_count, so that it doesn't matter that the
> -	 * writer missed them.
> +	 * the writer looks are guaranteed to see the sem->block value, which
> +	 * in turn means that they are guaranteed to immediately decrement
> +	 * their sem->read_count, so that it doesn't matter that the writer
> +	 * missed them.
>  	 */
>  
> +again:
>  	smp_mb(); /* A matches D */
>  
>  	/*
> -	 * If !readers_block the critical section starts here, matched by the
> +	 * If !sem->block the critical section starts here, matched by the
>  	 * release in percpu_up_write().
>  	 */
> -	if (likely(!smp_load_acquire(&sem->readers_block)))
> -		return 1;
> +	if (likely(!atomic_read_acquire(&sem->block)))
> +		return true;
>  
>  	/*
>  	 * Per the above comment; we still have preemption disabled and
>  	 * will thus decrement on the same CPU as we incremented.
>  	 */
> -	__percpu_up_read(sem);
> +	__percpu_up_read(sem); /* implies preempt_enable() */

Irritatingly, it also implies an smp_mb() which I don't think we need here.

>  	if (try)
> -		return 0;
> +		return false;
>  
> -	/*
> -	 * We either call schedule() in the wait, or we'll fall through
> -	 * and reschedule on the preempt_enable() in percpu_down_read().
> -	 */
> -	preempt_enable_no_resched();
> +	wait_event(sem->waiters, !atomic_read_acquire(&sem->block));

Why do you need acquire semantics here? Is the control dependency to the
increment not enough?

Talking of control dependencies, could we replace the smp_mb() in
readers_active_check() with smp_acquire__after_ctrl_dep()? In fact, perhaps
we could remove it altogether, given that our writes will be ordered by
the dependency and I don't think we care about ordering our reads wrt
previous readers. Hmm. Either way, clearly not for this patch.
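
i.e. something like the below (a sketch from memory of readers_active_check(),
so treat the per_cpu_sum() spelling as an assumption):

	static bool readers_active_check(struct percpu_rw_semaphore *sem)
	{
		if (per_cpu_sum(*sem->read_count) != 0)
			return false;

		/*
		 * Was: smp_mb(); -- C matches B. The branch above already gives
		 * us the ctrl-dep; this only upgrades it to ACQUIRE.
		 */
		smp_acquire__after_ctrl_dep();

		return true;
	}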

Anyway, general shape of the patch looks good to me.

Will


* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-08-07  9:56     ` Oleg Nesterov
@ 2019-10-29 18:47       ` Peter Zijlstra
  2019-10-30 14:21         ` Oleg Nesterov
  2019-10-30 17:52         ` Peter Zijlstra
  0 siblings, 2 replies; 19+ messages in thread
From: Peter Zijlstra @ 2019-10-29 18:47 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On Wed, Aug 07, 2019 at 11:56:58AM +0200, Oleg Nesterov wrote:

> and either way, with or without 2 queues, what do you think about the code
> below?

Sorry for being so tardy with this thread.. having once again picked up
the patch, I found your email.

> This way the new reader does wake_up() only in the very unlikely case when
> it races with the new writer which sets sem->block = 1 right after
> this_cpu_inc().

Ah, by waiting early, you avoid spurious wakeups when
__percpu_down_read() happens after a successful percpu_down_write().
Nice!

I've made these changes. Now let me go have a play with that second
waitqueue.

> -------------------------------------------------------------------------------
> 
> static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
> {
> 	might_sleep();
> 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
> 
> 	preempt_disable();
> 
> 	if (likely(rcu_sync_is_idle(&sem->rss)))
> 		__this_cpu_inc(*sem->read_count);
> 	else
> 		__percpu_down_read(sem, false);
> 
> 	preempt_enable();
> }
> 
> static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
> {
> 	rwsem_release(&sem->dep_map, 1, _RET_IP_);
> 
> 	preempt_disable();
> 
> 	if (likely(rcu_sync_is_idle(&sem->rss)))
> 		__this_cpu_dec(*sem->read_count);
> 	else
> 		__percpu_up_read(sem);
> 
> 	preempt_enable();
> }

I like that symmetry, but see below ...

> // both called and return with preemption disabled
> 
> bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
> {
> 
> 	if (atomic_read_acquire(&sem->block)) {
> again:
> 		preempt_enable();
> 		__wait_event(sem->waiters, !atomic_read_acquire(&sem->block));
> 		preempt_disable();
> 	}
> 
> 	__this_cpu_inc(*sem->read_count);
> 
> 	smp_mb();
> 
> 	if (likely(!atomic_read_acquire(&sem->block)))
> 		return true;
> 
> 	__percpu_up_read(sem);
> 
> 	if (try)
> 		return false;
> 
> 	goto again;
> }
> 
> void __percpu_up_read(struct percpu_rw_semaphore *sem)
> {
> 	smp_mb();
> 
> 	__this_cpu_dec(*sem->read_count);
> 
	preempt_enable();
> 	wake_up(&sem->waiters);
	preempt_disable()

and this (sadly) means there's a bunch of back-to-back
preempt_disable()+preempt_enable() calls. Leaving out the
preempt_disable() here makes it ugly again :/

Admittedly, this is PREEMPT_RT only, but given that it is >< close to
mainline we'd better get it right.

> }
> 
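
Spelled out, the slow path of your symmetric percpu_up_read() then becomes
(sketch, with the preempt fiddling from above folded in):

	preempt_disable();			/* percpu_up_read() */
	smp_mb();				/* __percpu_up_read() */
	__this_cpu_dec(*sem->read_count);
	preempt_enable();			/* so wake_up() may take the wq lock on RT */
	wake_up(&sem->waiters);
	preempt_disable();			/* restore what the caller expects */
	preempt_enable();			/* back in percpu_up_read() */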


* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-08-07 14:45 ` Will Deacon
@ 2019-10-29 19:06   ` Peter Zijlstra
  2019-10-30 15:57     ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2019-10-29 19:06 UTC (permalink / raw)
  To: Will Deacon
  Cc: Oleg Nesterov, Ingo Molnar, Thomas Gleixner, linux-kernel,
	bigeasy, juri.lelli, williams, bristot, longman, dave, jack

On Wed, Aug 07, 2019 at 03:45:34PM +0100, Will Deacon wrote:
> Hi Peter,
> 
> I've mostly been spared the joys of pcpu rwsem, but I took a look anyway.
> Comments of questionable quality below.

Thanks for having a look, and sorry for being tardy.

> On Mon, Aug 05, 2019 at 04:02:41PM +0200, Peter Zijlstra wrote:

> > +/*
> > + * Called with preemption disabled, and returns with preemption disabled,
> > + * except when it fails the trylock.
> > + */
> > +bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
> >  {
> >  	/*
> >  	 * Due to having preemption disabled the decrement happens on
> >  	 * the same CPU as the increment, avoiding the
> >  	 * increment-on-one-CPU-and-decrement-on-another problem.
> >  	 *
> > -	 * If the reader misses the writer's assignment of readers_block, then
> > -	 * the writer is guaranteed to see the reader's increment.
> > +	 * If the reader misses the writer's assignment of sem->block, then the
> > +	 * writer is guaranteed to see the reader's increment.
> >  	 *
> >  	 * Conversely, any readers that increment their sem->read_count after
> > -	 * the writer looks are guaranteed to see the readers_block value,
> > -	 * which in turn means that they are guaranteed to immediately
> > -	 * decrement their sem->read_count, so that it doesn't matter that the
> > -	 * writer missed them.
> > +	 * the writer looks are guaranteed to see the sem->block value, which
> > +	 * in turn means that they are guaranteed to immediately decrement
> > +	 * their sem->read_count, so that it doesn't matter that the writer
> > +	 * missed them.
> >  	 */
> >  
> > +again:
> >  	smp_mb(); /* A matches D */
> >  
> >  	/*
> > -	 * If !readers_block the critical section starts here, matched by the
> > +	 * If !sem->block the critical section starts here, matched by the
> >  	 * release in percpu_up_write().
> >  	 */
> > -	if (likely(!smp_load_acquire(&sem->readers_block)))
> > -		return 1;
> > +	if (likely(!atomic_read_acquire(&sem->block)))
> > +		return true;
> >  
> >  	/*
> >  	 * Per the above comment; we still have preemption disabled and
> >  	 * will thus decrement on the same CPU as we incremented.
> >  	 */
> > -	__percpu_up_read(sem);
> > +	__percpu_up_read(sem); /* implies preempt_enable() */
> 
> Irritatingly, it also implies an smp_mb() which I don't think we need here.

Hurm.. yes, I think you're right. We have:

	__this_cpu_inc()
	smp_mb()
	if (!atomic_read_acquire(&sem->block))
		return true;
	__percpu_up_read();

Between that smp_mb() and the ACQUIRE there is no way the
__this_cpu_dec() can get hoisted. That is, except if we rely on that
stronger transitivity guarantee (I forgot what we ended up calling it,
and I can't ever find anything in a hurry in memory-barriers.txt).

That said, with Oleg's suggestion getting here is much reduced and when
we do, the cost of actual wakeups is much higher than that of the memory
barrier, so I'm inclined to leave things as they are.

> >  	if (try)
> > -		return 0;
> > +		return false;
> >  
> > -	/*
> > -	 * We either call schedule() in the wait, or we'll fall through
> > -	 * and reschedule on the preempt_enable() in percpu_down_read().
> > -	 */
> > -	preempt_enable_no_resched();
> > +	wait_event(sem->waiters, !atomic_read_acquire(&sem->block));
> 
> Why do you need acquire semantics here? Is the control dependency to the
> increment not enough?

Uuuhhh... let me think about that. Clearly we want ACQUIRE for the
'return true' case, but I'm not sure why I used it here.

> Talking of control dependencies, could we replace the smp_mb() in
> readers_active_check() with smp_acquire__after_ctrl_dep()? In fact, perhaps
> we could remove it altogether, given that our writes will be ordered by
> the dependency and I don't think we care about ordering our reads wrt
> previous readers. Hmm. Either way, clearly not for this patch.

I think we can, but we've never had the need to optimize the slow path
here. The design of this thing has always been that if you hit the
slow-path, you're doing it wrong.

That said, I think cgroups use a variant of percpu-rwsem that wrecks rss
on purpose and always takes the slowpaths. So it might actually be worth
it.
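
IIRC that variant is simply the boot-time rcu_sync_enter_start() trick,
roughly the below (from memory), which leaves rss permanently !idle and thus
routes every percpu_down_read() through the slow path:

	/* cgroup_init(), approximately */
	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);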


* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-10-29 18:47       ` Peter Zijlstra
@ 2019-10-30 14:21         ` Oleg Nesterov
  2019-10-30 16:09           ` Peter Zijlstra
  2019-10-30 17:52         ` Peter Zijlstra
  1 sibling, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2019-10-30 14:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On 10/29, Peter Zijlstra wrote:
>
> I like that symmetry, but see below ...

...

> > void __percpu_up_read(struct percpu_rw_semaphore *sem)
> > {
> > 	smp_mb();
> >
> > 	__this_cpu_dec(*sem->read_count);
> >
> 	preempt_enable();
> > 	wake_up(&sem->waiters);
> 	preempt_disable()
>
> and this (sadly) means there's a bunch of back-to-back
> preempt_disable()+preempt_enable() calls.

Hmm. Where did these enable+disable come from?

	void __percpu_up_read(struct percpu_rw_semaphore *sem)
	{
		smp_mb();

		__this_cpu_dec(*sem->read_count);

		wake_up(&sem->waiters);
	}

should work just fine?

Oleg.



* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-10-29 19:06   ` Peter Zijlstra
@ 2019-10-30 15:57     ` Oleg Nesterov
  2019-10-30 16:47       ` Peter Zijlstra
  0 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2019-10-30 15:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On 10/29, Peter Zijlstra wrote:
>
> That said, I think cgroups use a variant of percpu-rwsem that wrecks rss
> on purpose and always takes the slowpaths.

I forgot (never understood) why Android needs this.

I am wondering if it makes any sense to add a config/boot or even runtime
knob for cgroup_threadgroup_rwsem.

Oleg.



* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-10-30 14:21         ` Oleg Nesterov
@ 2019-10-30 16:09           ` Peter Zijlstra
  0 siblings, 0 replies; 19+ messages in thread
From: Peter Zijlstra @ 2019-10-30 16:09 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On Wed, Oct 30, 2019 at 03:21:10PM +0100, Oleg Nesterov wrote:
> On 10/29, Peter Zijlstra wrote:
> >
> > I like that symmetry, but see below ...
> 
> ...
> 
> > > void __percpu_up_read(struct percpu_rw_semaphore *sem)
> > > {
> > > 	smp_mb();
> > >
> > > 	__this_cpu_dec(*sem->read_count);
> > >
> > 	preempt_enable();
> > > 	wake_up(&sem->waiters);
> > 	preempt_disable()
> >
> > and this (sadly) means there's a bunch of back-to-back
> > preempt_disable()+preempt_enable() calls.
> 
> Hmm. Where did these enable+disable come from?
> 
> 	void __percpu_up_read(struct percpu_rw_semaphore *sem)
> 	{
> 		smp_mb();
> 
> 		__this_cpu_dec(*sem->read_count);
> 
> 		wake_up(&sem->waiters);
> 	}
> 
> should work just fine?

Not on PREEMPT_RT, because wake_up() will take wait_queue_head::lock,
which is spin_lock_t and turns into a pi_mutex, which we cannot take
with preemption disabled.


* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-10-30 15:57     ` Oleg Nesterov
@ 2019-10-30 16:47       ` Peter Zijlstra
  0 siblings, 0 replies; 19+ messages in thread
From: Peter Zijlstra @ 2019-10-30 16:47 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On Wed, Oct 30, 2019 at 04:57:20PM +0100, Oleg Nesterov wrote:
> On 10/29, Peter Zijlstra wrote:
> >
> > That said, I think cgroups use a variant of percpu-rwsem that wreck rss
> > on purpose and always take the slowpaths.
> 
> I forgot (never understodd) why does Android need this.
> 
> I am wondering if it makes any sense to add a config/boot or even runtime
> knob for cgroup_threadgroup_rwsem.

It isn't just Android, but Android in particular likes to move tasks
between cgroups a lot.

And moving tasks between cgroups is what requires the write-side of the
cgroup percpu-rwsem to be taken. Adding an RCU-GP to every task movement
wasn't making them happy (which I can understand).



* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-10-29 18:47       ` Peter Zijlstra
  2019-10-30 14:21         ` Oleg Nesterov
@ 2019-10-30 17:52         ` Peter Zijlstra
  2019-10-30 18:47           ` Peter Zijlstra
                             ` (2 more replies)
  1 sibling, 3 replies; 19+ messages in thread
From: Peter Zijlstra @ 2019-10-30 17:52 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On Tue, Oct 29, 2019 at 07:47:39PM +0100, Peter Zijlstra wrote:
> I've made these changes. Now let me go have a play with that second
> waitqueue.

What I've ended up with is a 'custom' waitqueue and an rcuwait. The
rcuwait conveniently got around the tedious preempt_enable/disable
around the __percpu_up_read() wakeup.

I realized that up_read will only ever have to wake a (single) blocked
writer, never a series of readers.

Compile tested only; I'll build and boot test once I've had dinner.

---
 include/linux/percpu-rwsem.h  |  64 +++++++++--------
 include/linux/wait.h          |   1 +
 kernel/cpu.c                  |   4 +-
 kernel/locking/percpu-rwsem.c | 156 ++++++++++++++++++++++++++++++------------
 4 files changed, 150 insertions(+), 75 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index ad2ca2a89d5b..806af4bf257e 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -6,38 +6,51 @@
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
 #include <linux/rcuwait.h>
+#include <linux/wait.h>
 #include <linux/rcu_sync.h>
 #include <linux/lockdep.h>
 
 struct percpu_rw_semaphore {
 	struct rcu_sync		rss;
 	unsigned int __percpu	*read_count;
-	struct rw_semaphore	rw_sem; /* slowpath */
-	struct rcuwait          writer; /* blocked writer */
-	int			readers_block;
+	struct rcuwait		writer;
+	wait_queue_head_t	waiters;
+	atomic_t		block;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
 };
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)	.dep_map = { .name = #lockname },
+#else
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
 #define __DEFINE_PERCPU_RWSEM(name, is_static)				\
 static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);		\
 is_static struct percpu_rw_semaphore name = {				\
 	.rss = __RCU_SYNC_INITIALIZER(name.rss),			\
 	.read_count = &__percpu_rwsem_rc_##name,			\
-	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
 	.writer = __RCUWAIT_INITIALIZER(name.writer),			\
+	.waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),		\
+	.block = ATOMIC_INIT(0),					\
+	__PERCPU_RWSEM_DEP_MAP_INIT(name)				\
 }
+
 #define DEFINE_PERCPU_RWSEM(name)		\
 	__DEFINE_PERCPU_RWSEM(name, /* not static */)
 #define DEFINE_STATIC_PERCPU_RWSEM(name)	\
 	__DEFINE_PERCPU_RWSEM(name, static)
 
-extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
 extern void __percpu_up_read(struct percpu_rw_semaphore *);
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 {
 	might_sleep();
 
-	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
 
 	preempt_disable();
 	/*
@@ -48,8 +61,9 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 	 * and that once the synchronize_rcu() is done, the writer will see
 	 * anything we did within this RCU-sched read-size critical section.
 	 */
-	__this_cpu_inc(*sem->read_count);
-	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+	if (likely(rcu_sync_is_idle(&sem->rss)))
+		__this_cpu_inc(*sem->read_count);
+	else
 		__percpu_down_read(sem, false); /* Unconditional memory barrier */
 	/*
 	 * The preempt_enable() prevents the compiler from
@@ -58,16 +72,17 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 	preempt_enable();
 }
 
-static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 {
-	int ret = 1;
+	bool ret = true;
 
 	preempt_disable();
 	/*
 	 * Same as in percpu_down_read().
 	 */
-	__this_cpu_inc(*sem->read_count);
-	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+	if (likely(rcu_sync_is_idle(&sem->rss)))
+		__this_cpu_inc(*sem->read_count);
+	else
 		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
 	preempt_enable();
 	/*
@@ -76,13 +91,15 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 	 */
 
 	if (ret)
-		rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
 
 	return ret;
 }
 
 static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 {
+	rwsem_release(&sem->dep_map, _RET_IP_);
+
 	preempt_disable();
 	/*
 	 * Same as in percpu_down_read().
@@ -91,9 +108,8 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 		__this_cpu_dec(*sem->read_count);
 	else
 		__percpu_up_read(sem); /* Unconditional memory barrier */
-	preempt_enable();
 
-	rwsem_release(&sem->rw_sem.dep_map, _RET_IP_);
+	preempt_enable();
 }
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
@@ -110,29 +126,19 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
 })
 
-#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
-
-#define percpu_rwsem_assert_held(sem)				\
-	lockdep_assert_held(&(sem)->rw_sem)
+#define percpu_rwsem_is_held(sem)	lockdep_is_held(sem)
+#define percpu_rwsem_assert_held(sem)	lockdep_assert_held(sem)
 
 static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
-	lock_release(&sem->rw_sem.dep_map, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	if (!read)
-		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
-#endif
+	lock_release(&sem->dep_map, ip);
 }
 
 static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
 					bool read, unsigned long ip)
 {
-	lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-	if (!read)
-		atomic_long_set(&sem->rw_sem.owner, (long)current);
-#endif
+	lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
 }
 
 #endif
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3eb7cae8206c..94580163a4b1 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -20,6 +20,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
 #define WQ_FLAG_BOOKMARK	0x04
+#define WQ_FLAG_CUSTOM		0x08
 
 /*
  * A single wait-queue entry structure:
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 68f85627d909..7ea0c0225590 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void)
 
 static void lockdep_acquire_cpus_lock(void)
 {
-	rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
+	rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
 }
 
 static void lockdep_release_cpus_lock(void)
 {
-	rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_);
+	rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
 }
 
 /*
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 364d38a0c444..9aa69345cdc8 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -2,6 +2,7 @@
 #include <linux/atomic.h>
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
+#include <linux/wait.h>
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/rcupdate.h>
@@ -11,17 +12,20 @@
 #include "rwsem.h"
 
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
-			const char *name, struct lock_class_key *rwsem_key)
+			const char *name, struct lock_class_key *key)
 {
 	sem->read_count = alloc_percpu(int);
 	if (unlikely(!sem->read_count))
 		return -ENOMEM;
 
-	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss);
-	__init_rwsem(&sem->rw_sem, name, rwsem_key);
 	rcuwait_init(&sem->writer);
-	sem->readers_block = 0;
+	init_waitqueue_head(&sem->waiters);
+	atomic_set(&sem->block, 0);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+	lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -41,31 +45,95 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
+enum wake_state {
+	unknown = -1,
+	writer = 0,
+	reader = 1,
+};
+
+/*
+ * percpu_rwsem_wake_function -- provide FIFO fair reader/writer wakeups
+ *
+ * As per percpu_rwsem_wait() all waiters are queued exclusive (tail/FIFO)
+ * without autoremove to preserve FIFO order.
+ */
+static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
+				      unsigned int mode, int wake_flags,
+				      void *key)
+{
+	enum wake_state state = (wq_entry->flags & WQ_FLAG_CUSTOM) ? reader : writer;
+	enum wake_state *statep = key;
+
+	if (*statep != unknown && (*statep == writer || state == writer))
+		return 1; /* stop; woken 1 writer or exhausted readers */
+
+	if (default_wake_function(wq_entry, mode, wake_flags, NULL))
+		*statep = state;
+
+	return 0; /* continue waking */
+}
+
+#define percpu_rwsem_wait(sem, reader, cond)				\
+do {									\
+	DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);		\
+									\
+	if (reader)							\
+		wq_entry.flags |= WQ_FLAG_CUSTOM;			\
+									\
+	add_wait_queue_exclusive(&(sem)->waiters, &wq_entry);		\
+	for (;;) {							\
+		set_current_state(TASK_UNINTERRUPTIBLE);		\
+		if (cond)						\
+			break;						\
+		schedule();						\
+	}								\
+	__set_current_state(TASK_RUNNING);				\
+	remove_wait_queue(&(sem)->waiters, &wq_entry);			\
+} while (0)
+
+#define percpu_rwsem_wake(sem)						\
+do {									\
+	enum wake_state ____state = unknown;				\
+	__wake_up(&(sem)->waiters, TASK_NORMAL, 1, &____state);		\
+} while (0)
+
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 {
+	if (atomic_read(&sem->block)) {
+again:
+		if (try)
+			return false;
+
+		preempt_enable();
+		percpu_rwsem_wait(sem, reader, !atomic_read(&sem->block));
+		preempt_disable();
+	}
+
 	/*
 	 * Due to having preemption disabled the decrement happens on
 	 * the same CPU as the increment, avoiding the
 	 * increment-on-one-CPU-and-decrement-on-another problem.
 	 *
-	 * If the reader misses the writer's assignment of readers_block, then
-	 * the writer is guaranteed to see the reader's increment.
+	 * If the reader misses the writer's assignment of sem->block, then the
+	 * writer is guaranteed to see the reader's increment.
 	 *
 	 * Conversely, any readers that increment their sem->read_count after
-	 * the writer looks are guaranteed to see the readers_block value,
-	 * which in turn means that they are guaranteed to immediately
-	 * decrement their sem->read_count, so that it doesn't matter that the
-	 * writer missed them.
+	 * the writer looks are guaranteed to see the sem->block value, which
+	 * in turn means that they are guaranteed to immediately decrement
+	 * their sem->read_count, so that it doesn't matter that the writer
+	 * missed them.
 	 */
 
+	__this_cpu_inc(*sem->read_count);
+
 	smp_mb(); /* A matches D */
 
 	/*
-	 * If !readers_block the critical section starts here, matched by the
+	 * If !sem->block the critical section starts here, matched by the
 	 * release in percpu_up_write().
 	 */
-	if (likely(!smp_load_acquire(&sem->readers_block)))
-		return 1;
+	if (likely(!atomic_read_acquire(&sem->block)))
+		return true;
 
 	/*
 	 * Per the above comment; we still have preemption disabled and
@@ -73,24 +141,12 @@ int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
 	 */
 	__percpu_up_read(sem);
 
-	if (try)
-		return 0;
-
-	/*
-	 * We either call schedule() in the wait, or we'll fall through
-	 * and reschedule on the preempt_enable() in percpu_down_read().
-	 */
-	preempt_enable_no_resched();
-
 	/*
-	 * Avoid lockdep for the down/up_read() we already have them.
+	 * percpu_down_write() could've set sem->block right after we've seen
+	 * it 0 but missed our this_cpu_inc(), which is exactly the condition
+	 * we get called for from percpu_down_read().
 	 */
-	__down_read(&sem->rw_sem);
-	this_cpu_inc(*sem->read_count);
-	__up_read(&sem->rw_sem);
-
-	preempt_disable();
-	return 1;
+	goto again;
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
 
@@ -104,7 +160,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
 	 */
 	__this_cpu_dec(*sem->read_count);
 
-	/* Prod writer to recheck readers_active */
+	/* Prod writer to re-evaluate readers_active_check() */
 	rcuwait_wake_up(&sem->writer);
 }
 EXPORT_SYMBOL_GPL(__percpu_up_read);
@@ -124,6 +180,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
  * zero.  If this sum is zero, then it is stable due to the fact that if any
  * newly arriving readers increment a given counter, they will immediately
  * decrement that same counter.
+ *
+ * Assumes sem->block is set.
  */
 static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
@@ -140,28 +198,36 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
 	return true;
 }
 
+static inline bool try_acquire_block(struct percpu_rw_semaphore *sem)
+{
+	if (atomic_read(&sem->block))
+		return false;
+
+	return atomic_xchg(&sem->block, 1) == 0;
+}
+
 void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
+	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
 
-	down_write(&sem->rw_sem);
-
 	/*
-	 * Notify new readers to block; up until now, and thus throughout the
-	 * longish rcu_sync_enter() above, new readers could still come in.
+	 * Try to set sem->block; this provides writer-writer exclusion.
+	 * Having sem->block set makes new readers block.
 	 */
-	WRITE_ONCE(sem->readers_block, 1);
+	percpu_rwsem_wait(sem, writer, try_acquire_block(sem));
 
-	smp_mb(); /* D matches A */
+	/* smp_mb() implied by try_acquire_block() on success -- D matches A */
 
 	/*
-	 * If they don't see our writer of readers_block, then we are
-	 * guaranteed to see their sem->read_count increment, and therefore
-	 * will wait for them.
+	 * If they don't see our store of sem->block, then we are guaranteed to
+	 * see their sem->read_count increment, and therefore will wait for
+	 * them.
 	 */
 
-	/* Wait for all now active readers to complete. */
+	/* Wait for all active readers to complete. */
 	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
@@ -178,12 +244,12 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
 	 * Therefore we force it through the slow path which guarantees an
 	 * acquire and thereby guarantees the critical section's consistency.
 	 */
-	smp_store_release(&sem->readers_block, 0);
+	atomic_set_release(&sem->block, 0);
 
 	/*
-	 * Release the write lock, this will allow readers back in the game.
+	 * Prod any pending reader/writer to make progress.
 	 */
-	up_write(&sem->rw_sem);
+	percpu_rwsem_wake(sem);
 
 	/*
 	 * Once this completes (at least one RCU-sched grace period hence) the
@@ -191,5 +257,7 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
 	 * exclusive write lock because its counting.
 	 */
 	rcu_sync_exit(&sem->rss);
+
+	rwsem_release(&sem->dep_map, _RET_IP_);
 }
 EXPORT_SYMBOL_GPL(percpu_up_write);
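
The diff context above elides the body of readers_active_check(); as a rough
sketch (written as an explicit per-CPU loop rather than whatever helper the
tree actually uses, and with an illustrative name, so treat it as a sketch
only), the check amounts to:

	/* Sketch only -- not part of the patch. */
	static bool readers_active_check_sketch(struct percpu_rw_semaphore *sem)
	{
		unsigned int sum = 0;
		int cpu;

		/* Modular sum of the per-CPU read counts, as described above. */
		for_each_possible_cpu(cpu)
			sum += per_cpu(*sem->read_count, cpu);

		if (sum)
			return false;	/* some reader is still inside its critical section */

		/*
		 * The sum is stable because sem->block is set: any newly arriving
		 * reader immediately decrements the count it just incremented.
		 * A full barrier orders the observed decrements against the
		 * readers' critical sections before the writer proceeds.
		 */
		smp_mb();

		return true;
	}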

* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-10-30 17:52         ` Peter Zijlstra
@ 2019-10-30 18:47           ` Peter Zijlstra
  2019-10-30 19:31           ` Peter Zijlstra
  2019-10-31  6:11           ` Peter Zijlstra
  2 siblings, 0 replies; 19+ messages in thread
From: Peter Zijlstra @ 2019-10-30 18:47 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On Wed, Oct 30, 2019 at 06:52:31PM +0100, Peter Zijlstra wrote:
> On Tue, Oct 29, 2019 at 07:47:39PM +0100, Peter Zijlstra wrote:
> > I've made these changes. Now let me go have a play with that second
> > waitqueue.
> 
> What I've ended up with is a 'custom' waitqueue and an rcuwait. The
> rcuwait conveniently got around the tedious preempt_enable/disable
> around the __percpu_up_read() wakeup.
> 
> I realized that up_read will only ever have to wake a (single) blocked
> writer, never a series of readers.
> 
> Compile tested only, I'll build and boot test once i've had dinner.

It seems to boot and build a kernel, so it must be perfect ;-)

I think I'll go split this into a number of smaller patches:

 - move lockdep_map into percpu_rwsem and stop using the rwsem one
 - use bool
 - move the __this_cpu_{inc,dec} into the slowpath
 - rework __percpu_down_read() as per your earlier suggestion
 - replace rwsem with wait_queue + atomic_t

that might help make all this slightly easier to read.
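
For orientation, a minimal usage sketch of the API whose internals are being
reworked; the names below are made up for illustration and are not part of the
series. Readers stay on the per-CPU fast path while a writer flips them onto
the slow path and waits for them:

	/* Hypothetical example only. */
	#include <linux/percpu-rwsem.h>

	DEFINE_STATIC_PERCPU_RWSEM(example_sem);

	static void example_read_side(void)
	{
		percpu_down_read(&example_sem);		/* fast path: per-CPU increment */
		/* read-side critical section */
		percpu_up_read(&example_sem);		/* fast path: per-CPU decrement */
	}

	static void example_write_side(void)
	{
		percpu_down_write(&example_sem);	/* block new readers, wait for active ones */
		/* exclusive section */
		percpu_up_write(&example_sem);		/* re-enable the reader fast path */
	}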

* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-10-30 17:52         ` Peter Zijlstra
  2019-10-30 18:47           ` Peter Zijlstra
@ 2019-10-30 19:31           ` Peter Zijlstra
  2019-10-31  6:11           ` Peter Zijlstra
  2 siblings, 0 replies; 19+ messages in thread
From: Peter Zijlstra @ 2019-10-30 19:31 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On Wed, Oct 30, 2019 at 06:52:31PM +0100, Peter Zijlstra wrote:
> @@ -58,16 +72,17 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
>  	preempt_enable();
>  }
>  
> -static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
> +static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
>  {
> -	int ret = 1;
> +	bool ret = true;
>  
>  	preempt_disable();
>  	/*
>  	 * Same as in percpu_down_read().
>  	 */
> -	__this_cpu_inc(*sem->read_count);
> -	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
> +	if (likely(!rcu_sync_is_idle(&sem->rss)))

That should obviously also lose the !

> +		__this_cpu_inc(*sem->read_count);
> +	else
>  		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
>  	preempt_enable();
>  	/*
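
With the '!' dropped as noted, the corrected fast-path test in the quoted hunk
would read:

	if (likely(rcu_sync_is_idle(&sem->rss)))
		__this_cpu_inc(*sem->read_count);
	else
		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */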

* Re: [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem
  2019-10-30 17:52         ` Peter Zijlstra
  2019-10-30 18:47           ` Peter Zijlstra
  2019-10-30 19:31           ` Peter Zijlstra
@ 2019-10-31  6:11           ` Peter Zijlstra
  2 siblings, 0 replies; 19+ messages in thread
From: Peter Zijlstra @ 2019-10-31  6:11 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Will Deacon, Ingo Molnar, Thomas Gleixner, linux-kernel, bigeasy,
	juri.lelli, williams, bristot, longman, dave, jack

On Wed, Oct 30, 2019 at 06:52:31PM +0100, Peter Zijlstra wrote:
> +enum wake_state {
> +	unknown = -1,
> +	writer = 0,
> +	reader = 1,
> +};
> +
> +/*
> + * percpu_rwsem_wake_function -- provide FIFO fair reader/writer wakeups
> + *
> + * As per percpu_rwsem_wait() all waiters are queued exclusive (tail/FIFO)
> + * without autoremove to preserve FIFO order.
> + */
> +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
> +				      unsigned int mode, int wake_flags,
> +				      void *key)
> +{
> +	enum wake_state state = (wq_entry->flags & WQ_FLAG_CUSTOM) ? reader : writer;
> +	enum wake_state *statep = key;
> +
> +	if (*statep != unknown && (*statep == writer || state == writer))
> +		return 1; /* stop; woken 1 writer or exhausted readers */
> +
> +	if (default_wake_function(wq_entry, mode, wake_flags, NULL))
> +		*statep = state;
> +
> +	return 0; /* continue waking */
> +}
> +
> +#define percpu_rwsem_wait(sem, reader, cond)				\
> +do {									\
> +	DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);		\
> +									\
> +	if (reader)							\
> +		wq_entry.flags |= WQ_FLAG_CUSTOM;			\
> +									\
> +	add_wait_queue_exclusive(&(sem)->waiters, &wq_entry);		\
> +	for (;;) {							\
> +		set_current_state(TASK_UNINTERRUPTIBLE);		\
> +		if (cond)						\
> +			break;						\
> +		schedule();						\
> +	}								\
> +	__set_current_state(TASK_RUNNING);				\
> +	remove_wait_queue(&(sem)->waiters, &wq_entry);			\
> +} while (0)
> +
> +#define percpu_rwsem_wake(sem)						\
> +do {									\
> +	enum wake_state ____state = unknown;				\
> +	__wake_up(&(sem)->waiters, TASK_NORMAL, 1, &____state);		\
> +} while (0)

That isn't quite right: when it has readers and a pending writer, things
go sideways.

I'm going to have to poke a little more at this.

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2019-10-31  6:11 UTC | newest]

Thread overview: 19+ messages
2019-08-05 14:02 [PATCH] locking/percpu_rwsem: Rewrite to not use rwsem Peter Zijlstra
2019-08-05 14:43 ` Boqun Feng
2019-08-05 14:58   ` Boqun Feng
2019-08-05 15:43     ` Peter Zijlstra
2019-08-06 14:15       ` Boqun Feng
2019-08-06 16:17 ` Oleg Nesterov
2019-08-06 17:15   ` Peter Zijlstra
2019-08-07  9:56     ` Oleg Nesterov
2019-10-29 18:47       ` Peter Zijlstra
2019-10-30 14:21         ` Oleg Nesterov
2019-10-30 16:09           ` Peter Zijlstra
2019-10-30 17:52         ` Peter Zijlstra
2019-10-30 18:47           ` Peter Zijlstra
2019-10-30 19:31           ` Peter Zijlstra
2019-10-31  6:11           ` Peter Zijlstra
2019-08-07 14:45 ` Will Deacon
2019-10-29 19:06   ` Peter Zijlstra
2019-10-30 15:57     ` Oleg Nesterov
2019-10-30 16:47       ` Peter Zijlstra
