* [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
@ 2021-11-16  1:29 Waiman Long
  2021-11-16  2:52 ` Aiqun(Maria) Yu
                   ` (5 more replies)
  0 siblings, 6 replies; 41+ messages in thread
From: Waiman Long @ 2021-11-16  1:29 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon
  Cc: linux-kernel, Davidlohr Bueso, mazhenhua, Hillf Danton, Maria Yu,
	Waiman Long

There are some inconsistencies in the way that the handoff bit is being
handled in readers and writers.

Firstly, when a queue head writer sets the handoff bit, it will clear it
when the writer is killed or interrupted on its way out without
acquiring the lock. That is not the case for a queue head reader. The
handoff bit will simply be inherited by the next waiter.
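
For context, a minimal sketch of the writer-side scenario (assuming a
caller that uses down_write_killable(); illustrative only, not part of
the patch):

	#include <linux/rwsem.h>

	/* Hypothetical caller, to show when the out_nolock path runs. */
	static int do_write_work(struct rw_semaphore *sem)
	{
		/*
		 * A writer queued in the slowpath returns -EINTR when a
		 * fatal signal arrives; per the above, it clears any
		 * handoff bit it has set on its way out. A reader in the
		 * same situation leaves the bit to the next waiter.
		 */
		if (down_write_killable(sem))
			return -EINTR;
		/* ... write-side critical section ... */
		up_write(sem);
		return 0;
	}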

Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
the waiter and handoff bits are cleared if the wait queue becomes empty.
For rwsem_down_write_slowpath(), however, the handoff bit is not checked
and cleared if the wait queue is empty. This can potentially leave the
handoff bit set with an empty wait queue.
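
Condensed from the pre-patch code (see the removed lines in the diff
below), the two out_nolock paths look like this:

	/* reader out_nolock: both flag bits cleared on empty queue */
	list_del(&waiter.list);
	if (list_empty(&sem->wait_list))
		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
				   &sem->count);

	/* writer out_nolock: an empty queue clears only WAITERS; HANDOFF
	 * is cleared only when the locally cached wstate says we set it */
	list_del(&waiter.list);
	if (unlikely(wstate == WRITER_HANDOFF))
		atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
	if (list_empty(&sem->wait_list))
		atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);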

To make the handoff bit handling more consistent and robust, extract
the handoff bit clearing code into a new rwsem_del_waiter() helper
function.  The common function will only use atomic_long_andnot() to
clear bits when the wait queue is empty, to avoid a possible race
condition.  If the first waiter with the handoff bit set is killed or
interrupted and exits the slowpath without acquiring the lock, the next
waiter will inherit the handoff bit.

While at it, simplify the trylock for loop in rwsem_down_write_slowpath()
to make it easier to read.

Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Waiman Long <longman@redhat.com>
---
 kernel/locking/rwsem.c | 171 ++++++++++++++++++++---------------------
 1 file changed, 85 insertions(+), 86 deletions(-)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index c51387a43265..e039cf1605af 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -105,9 +105,9 @@
  * atomic_long_cmpxchg() will be used to obtain writer lock.
  *
  * There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers		-- set, clear
+ * 2) rwsem_try_write_lock() for writers	-- set, clear
+ * 3) rwsem_del_waiter()			-- clear
  *
  * For all the above cases, wait_lock will be held. A writer must also
  * be the first one in the wait_list to be eligible for setting the handoff
@@ -334,6 +334,9 @@ struct rwsem_waiter {
 	struct task_struct *task;
 	enum rwsem_waiter_type type;
 	unsigned long timeout;
+
+	/* Writer only, not initialized in reader */
+	bool handoff_set;
 };
 #define rwsem_first_waiter(sem) \
 	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -344,12 +347,6 @@ enum rwsem_wake_type {
 	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
 };
 
-enum writer_wait_state {
-	WRITER_NOT_FIRST,	/* Writer is not first in wait list */
-	WRITER_FIRST,		/* Writer is first in wait list     */
-	WRITER_HANDOFF		/* Writer is first & handoff needed */
-};
-
 /*
  * The typical HZ value is either 250 or 1000. So set the minimum waiting
  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -365,6 +362,31 @@ enum writer_wait_state {
  */
 #define MAX_READERS_WAKEUP	0x100
 
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_add_tail(&waiter->list, &sem->wait_list);
+	/* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ */
+static inline void
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_del(&waiter->list);
+	if (likely(!list_empty(&sem->wait_list)))
+		return;
+
+	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+}
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -376,6 +398,8 @@ enum writer_wait_state {
  *   preferably when the wait_lock is released
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
  */
 static void rwsem_mark_wake(struct rw_semaphore *sem,
 			    enum rwsem_wake_type wake_type,
@@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 
 	adjustment = woken * RWSEM_READER_BIAS - adjustment;
 	lockevent_cond_inc(rwsem_wake_reader, woken);
+
+	oldcount = atomic_long_read(&sem->count);
 	if (list_empty(&sem->wait_list)) {
-		/* hit end of list above */
+		/*
+		 * Combined with list_move_tail() above, this implies
+		 * rwsem_del_waiter().
+		 */
 		adjustment -= RWSEM_FLAG_WAITERS;
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
+	} else if (woken) {
+		/*
+		 * When we've woken a reader, we no longer need to force
+		 * writers to give up the lock and we can clear HANDOFF.
+		 */
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
 	}
 
-	/*
-	 * When we've woken a reader, we no longer need to force writers
-	 * to give up the lock and we can clear HANDOFF.
-	 */
-	if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
-		adjustment -= RWSEM_FLAG_HANDOFF;
-
 	if (adjustment)
 		atomic_long_add(adjustment, &sem->count);
 
@@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
  * race conditions between checking the rwsem wait list and setting the
  * sem->count accordingly.
  *
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
  */
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
-					enum writer_wait_state wstate)
+					struct rwsem_waiter *waiter)
 {
+	bool first = rwsem_first_waiter(sem) == waiter;
 	long count, new;
 
 	lockdep_assert_held(&sem->wait_lock);
@@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	do {
 		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
 
-		if (has_handoff && wstate == WRITER_NOT_FIRST)
-			return false;
+		if (has_handoff) {
+			if (!first)
+				return false;
+
+			/* First waiter inherits a previously set handoff bit */
+			waiter->handoff_set = true;
+		}
 
 		new = count;
 
 		if (count & RWSEM_LOCK_MASK) {
-			if (has_handoff || (wstate != WRITER_HANDOFF))
+			if (has_handoff || (!rt_task(waiter->task) &&
+					    !time_after(jiffies, waiter->timeout)))
 				return false;
 
 			new |= RWSEM_FLAG_HANDOFF;
@@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	 * We have either acquired the lock with handoff bit cleared or
 	 * set the handoff bit.
 	 */
-	if (new & RWSEM_FLAG_HANDOFF)
+	if (new & RWSEM_FLAG_HANDOFF) {
+		waiter->handoff_set = true;
+		lockevent_inc(rwsem_wlock_handoff);
 		return false;
+	}
 
+	/*
+	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+	 * success.
+	 */
+	list_del(&waiter->list);
 	rwsem_set_owner(sem);
 	return true;
 }
@@ -956,7 +1001,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
 		}
 		adjustment += RWSEM_FLAG_WAITERS;
 	}
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock, but no longer actively locking */
 	count = atomic_long_add_return(adjustment, &sem->count);
@@ -1002,11 +1047,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
 	return sem;
 
 out_nolock:
-	list_del(&waiter.list);
-	if (list_empty(&sem->wait_list)) {
-		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
-				   &sem->count);
-	}
+	rwsem_del_waiter(sem, &waiter);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock_fail);
@@ -1020,9 +1061,7 @@ static struct rw_semaphore *
 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
 	long count;
-	enum writer_wait_state wstate;
 	struct rwsem_waiter waiter;
-	struct rw_semaphore *ret = sem;
 	DEFINE_WAKE_Q(wake_q);
 
 	/* do optimistic spinning and steal lock if possible */
@@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	waiter.task = current;
 	waiter.type = RWSEM_WAITING_FOR_WRITE;
 	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+	waiter.handoff_set = false;
 
 	raw_spin_lock_irq(&sem->wait_lock);
-
-	/* account for this before adding a new element to the list */
-	wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock */
-	if (wstate == WRITER_NOT_FIRST) {
+	if (rwsem_first_waiter(sem) != &waiter) {
 		count = atomic_long_read(&sem->count);
 
 		/*
@@ -1083,13 +1119,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
 	for (;;) {
-		if (rwsem_try_write_lock(sem, wstate)) {
+		if (rwsem_try_write_lock(sem, &waiter)) {
 			/* rwsem_try_write_lock() implies ACQUIRE on success */
 			break;
 		}
 
 		raw_spin_unlock_irq(&sem->wait_lock);
 
+		if (signal_pending_state(state, current))
+			goto out_nolock;
+
 		/*
 		 * After setting the handoff bit and failing to acquire
 		 * the lock, attempt to spin on owner to accelerate lock
@@ -1098,7 +1137,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 		 * In this case, we attempt to acquire the lock again
 		 * without sleeping.
 		 */
-		if (wstate == WRITER_HANDOFF) {
+		if (waiter.handoff_set) {
 			enum owner_state owner_state;
 
 			preempt_disable();
@@ -1109,66 +1148,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 				goto trylock_again;
 		}
 
-		/* Block until there are no active lockers. */
-		for (;;) {
-			if (signal_pending_state(state, current))
-				goto out_nolock;
-
-			schedule();
-			lockevent_inc(rwsem_sleep_writer);
-			set_current_state(state);
-			/*
-			 * If HANDOFF bit is set, unconditionally do
-			 * a trylock.
-			 */
-			if (wstate == WRITER_HANDOFF)
-				break;
-
-			if ((wstate == WRITER_NOT_FIRST) &&
-			    (rwsem_first_waiter(sem) == &waiter))
-				wstate = WRITER_FIRST;
-
-			count = atomic_long_read(&sem->count);
-			if (!(count & RWSEM_LOCK_MASK))
-				break;
-
-			/*
-			 * The setting of the handoff bit is deferred
-			 * until rwsem_try_write_lock() is called.
-			 */
-			if ((wstate == WRITER_FIRST) && (rt_task(current) ||
-			    time_after(jiffies, waiter.timeout))) {
-				wstate = WRITER_HANDOFF;
-				lockevent_inc(rwsem_wlock_handoff);
-				break;
-			}
-		}
+		schedule();
+		lockevent_inc(rwsem_sleep_writer);
+		set_current_state(state);
 trylock_again:
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
 	__set_current_state(TASK_RUNNING);
-	list_del(&waiter.list);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	lockevent_inc(rwsem_wlock);
-
-	return ret;
+	return sem;
 
 out_nolock:
 	__set_current_state(TASK_RUNNING);
 	raw_spin_lock_irq(&sem->wait_lock);
-	list_del(&waiter.list);
-
-	if (unlikely(wstate == WRITER_HANDOFF))
-		atomic_long_add(-RWSEM_FLAG_HANDOFF,  &sem->count);
-
-	if (list_empty(&sem->wait_list))
-		atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
-	else
+	rwsem_del_waiter(sem, &waiter);
+	if (!list_empty(&sem->wait_list))
 		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	wake_up_q(&wake_q);
 	lockevent_inc(rwsem_wlock_fail);
-
 	return ERR_PTR(-EINTR);
 }
 
-- 
2.27.0



* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2021-11-16  1:29 [PATCH v5] locking/rwsem: Make handoff bit handling more consistent Waiman Long
@ 2021-11-16  2:52 ` Aiqun(Maria) Yu
  2021-11-16  9:14   ` Peter Zijlstra
  2021-11-17 13:36 ` Peter Zijlstra
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 41+ messages in thread
From: Aiqun(Maria) Yu @ 2021-11-16  2:52 UTC (permalink / raw)
  To: Waiman Long, Peter Zijlstra, Ingo Molnar, Will Deacon
  Cc: linux-kernel, Davidlohr Bueso, mazhenhua, Hillf Danton

On 11/16/2021 9:29 AM, Waiman Long wrote:
> There are some inconsistencies in the way that the handoff bit is being
> handled in readers and writers.
> 
> Firstly, when a queue head writer sets the handoff bit, it will clear it
> when the writer is killed or interrupted on its way out without
> acquiring the lock. That is not the case for a queue head reader. The
> handoff bit will simply be inherited by the next waiter.
> 
> Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
> the waiter and handoff bits are cleared if the wait queue becomes empty.
> For rwsem_down_write_slowpath(), however, the handoff bit is not checked
> and cleared if the wait queue is empty. This can potentially leave the
> handoff bit set with an empty wait queue.
> 
> To make the handoff bit handling more consistent and robust, extract
> the handoff bit clearing code into a new rwsem_del_waiter() helper
> function.  The common function will only use atomic_long_andnot() to
> clear bits when the wait queue is empty, to avoid a possible race
> condition.
We do have a race condition that needs to be fixed by this change.
> If the first waiter with the handoff bit set is killed or interrupted
> and exits the slowpath without acquiring the lock, the next waiter will
> inherit the handoff bit.
> 
> While at it, simplify the trylock for loop in rwsem_down_write_slowpath()
> to make it easier to read.
> 
> Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>   kernel/locking/rwsem.c | 171 ++++++++++++++++++++---------------------
>   1 file changed, 85 insertions(+), 86 deletions(-)
> 
> diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> index c51387a43265..e039cf1605af 100644
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -105,9 +105,9 @@
>    * atomic_long_cmpxchg() will be used to obtain writer lock.
>    *
>    * There are three places where the lock handoff bit may be set or cleared.
> - * 1) rwsem_mark_wake() for readers.
> - * 2) rwsem_try_write_lock() for writers.
> - * 3) Error path of rwsem_down_write_slowpath().
> + * 1) rwsem_mark_wake() for readers		-- set, clear
> + * 2) rwsem_try_write_lock() for writers	-- set, clear
> + * 3) rwsem_del_waiter()			-- clear
>    *
>    * For all the above cases, wait_lock will be held. A writer must also
>    * be the first one in the wait_list to be eligible for setting the handoff
> @@ -334,6 +334,9 @@ struct rwsem_waiter {
>   	struct task_struct *task;
>   	enum rwsem_waiter_type type;
>   	unsigned long timeout;
> +
> +	/* Writer only, not initialized in reader */
> +	bool handoff_set;
>   };
>   #define rwsem_first_waiter(sem) \
>   	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
> @@ -344,12 +347,6 @@ enum rwsem_wake_type {
>   	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
>   };
>   
> -enum writer_wait_state {
> -	WRITER_NOT_FIRST,	/* Writer is not first in wait list */
> -	WRITER_FIRST,		/* Writer is first in wait list     */
> -	WRITER_HANDOFF		/* Writer is first & handoff needed */
> -};
> -
>   /*
>    * The typical HZ value is either 250 or 1000. So set the minimum waiting
>    * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
> @@ -365,6 +362,31 @@ enum writer_wait_state {
>    */
>   #define MAX_READERS_WAKEUP	0x100
>   
> +static inline void
> +rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
> +{
> +	lockdep_assert_held(&sem->wait_lock);
> +	list_add_tail(&waiter->list, &sem->wait_list);
> +	/* caller will set RWSEM_FLAG_WAITERS */
> +}
> +
> +/*
> + * Remove a waiter from the wait_list and clear flags.
> + *
> + * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
> + * this function. Modify with care.
> + */
> +static inline void
> +rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
> +{
> +	lockdep_assert_held(&sem->wait_lock);
> +	list_del(&waiter->list);
> +	if (likely(!list_empty(&sem->wait_list)))
> +		return;
> +
> +	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
> +}
> +
>   /*
>    * handle the lock release when processes blocked on it that can now run
>    * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
> @@ -376,6 +398,8 @@ enum writer_wait_state {
>    *   preferably when the wait_lock is released
>    * - woken process blocks are discarded from the list after having task zeroed
>    * - writers are only marked woken if downgrading is false
> + *
> + * Implies rwsem_del_waiter() for all woken readers.
>    */
>   static void rwsem_mark_wake(struct rw_semaphore *sem,
>   			    enum rwsem_wake_type wake_type,
> @@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
>   
>   	adjustment = woken * RWSEM_READER_BIAS - adjustment;
>   	lockevent_cond_inc(rwsem_wake_reader, woken);
> +
> +	oldcount = atomic_long_read(&sem->count);
>   	if (list_empty(&sem->wait_list)) {
> -		/* hit end of list above */
> +		/*
> +		 * Combined with list_move_tail() above, this implies
> +		 * rwsem_del_waiter().
> +		 */
>   		adjustment -= RWSEM_FLAG_WAITERS;
> +		if (oldcount & RWSEM_FLAG_HANDOFF)
> +			adjustment -= RWSEM_FLAG_HANDOFF;
> +	} else if (woken) {
> +		/*
> +		 * When we've woken a reader, we no longer need to force
> +		 * writers to give up the lock and we can clear HANDOFF.
> +		 */
> +		if (oldcount & RWSEM_FLAG_HANDOFF)
> +			adjustment -= RWSEM_FLAG_HANDOFF;
>   	}
>   
> -	/*
> -	 * When we've woken a reader, we no longer need to force writers
> -	 * to give up the lock and we can clear HANDOFF.
> -	 */
> -	if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
> -		adjustment -= RWSEM_FLAG_HANDOFF;
> -
>   	if (adjustment)
>   		atomic_long_add(adjustment, &sem->count);
>   
> @@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
>    * race conditions between checking the rwsem wait list and setting the
>    * sem->count accordingly.
>    *
> - * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
> - * bit is set or the lock is acquired with handoff bit cleared.
> + * Implies rwsem_del_waiter() on success.
>    */
>   static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
> -					enum writer_wait_state wstate)
> +					struct rwsem_waiter *waiter)
>   {
> +	bool first = rwsem_first_waiter(sem) == waiter;
>   	long count, new;
>   
>   	lockdep_assert_held(&sem->wait_lock);
> @@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
>   	do {
>   		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
>   
> -		if (has_handoff && wstate == WRITER_NOT_FIRST)
> -			return false;
> +		if (has_handoff) {
> +			if (!first)
> +				return false;
> +
> +			/* First waiter inherits a previously set handoff bit */
> +			waiter->handoff_set = true;
> +		}
>   
>   		new = count;
>   
>   		if (count & RWSEM_LOCK_MASK) {
> -			if (has_handoff || (wstate != WRITER_HANDOFF))
> +			if (has_handoff || (!rt_task(waiter->task) &&
> +					    !time_after(jiffies, waiter->timeout)))
>   				return false;
>   
>   			new |= RWSEM_FLAG_HANDOFF;
> @@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
>   	 * We have either acquired the lock with handoff bit cleared or
>   	 * set the handoff bit.
>   	 */
> -	if (new & RWSEM_FLAG_HANDOFF)
> +	if (new & RWSEM_FLAG_HANDOFF) {
> +		waiter->handoff_set = true;
> +		lockevent_inc(rwsem_wlock_handoff);
>   		return false;
> +	}
>   
> +	/*
> +	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
> +	 * success.
> +	 */
> +	list_del(&waiter->list);
>   	rwsem_set_owner(sem);
>   	return true;
>   }
> @@ -956,7 +1001,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
>   		}
>   		adjustment += RWSEM_FLAG_WAITERS;
>   	}
> -	list_add_tail(&waiter.list, &sem->wait_list);
> +	rwsem_add_waiter(sem, &waiter);
>   
>   	/* we're now waiting on the lock, but no longer actively locking */
>   	count = atomic_long_add_return(adjustment, &sem->count);
> @@ -1002,11 +1047,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
>   	return sem;
>   
>   out_nolock:
> -	list_del(&waiter.list);
> -	if (list_empty(&sem->wait_list)) {
> -		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
> -				   &sem->count);
> -	}
> +	rwsem_del_waiter(sem, &waiter);
>   	raw_spin_unlock_irq(&sem->wait_lock);
>   	__set_current_state(TASK_RUNNING);
>   	lockevent_inc(rwsem_rlock_fail);
> @@ -1020,9 +1061,7 @@ static struct rw_semaphore *
>   rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
>   {
>   	long count;
> -	enum writer_wait_state wstate;
>   	struct rwsem_waiter waiter;
> -	struct rw_semaphore *ret = sem;
>   	DEFINE_WAKE_Q(wake_q);
>   
>   	/* do optimistic spinning and steal lock if possible */
> @@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
>   	waiter.task = current;
>   	waiter.type = RWSEM_WAITING_FOR_WRITE;
>   	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
> +	waiter.handoff_set = false;
>   
>   	raw_spin_lock_irq(&sem->wait_lock);
> -
> -	/* account for this before adding a new element to the list */
> -	wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
> -
> -	list_add_tail(&waiter.list, &sem->wait_list);
> +	rwsem_add_waiter(sem, &waiter);
>   
>   	/* we're now waiting on the lock */
> -	if (wstate == WRITER_NOT_FIRST) {
> +	if (rwsem_first_waiter(sem) != &waiter) {
>   		count = atomic_long_read(&sem->count);
>   
>   		/*
> @@ -1083,13 +1119,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
>   	/* wait until we successfully acquire the lock */
>   	set_current_state(state);
>   	for (;;) {
> -		if (rwsem_try_write_lock(sem, wstate)) {
> +		if (rwsem_try_write_lock(sem, &waiter)) {
>   			/* rwsem_try_write_lock() implies ACQUIRE on success */
>   			break;
>   		}
>   
>   		raw_spin_unlock_irq(&sem->wait_lock);
>   
> +		if (signal_pending_state(state, current))
> +			goto out_nolock;
> +
>   		/*
>   		 * After setting the handoff bit and failing to acquire
>   		 * the lock, attempt to spin on owner to accelerate lock
> @@ -1098,7 +1137,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
>   		 * In this case, we attempt to acquire the lock again
>   		 * without sleeping.
>   		 */
> -		if (wstate == WRITER_HANDOFF) {
> +		if (waiter.handoff_set) {
>   			enum owner_state owner_state;
>   
>   			preempt_disable();
> @@ -1109,66 +1148,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
>   				goto trylock_again;
>   		}
>   
> -		/* Block until there are no active lockers. */
> -		for (;;) {
> -			if (signal_pending_state(state, current))
> -				goto out_nolock;
> -
> -			schedule();
> -			lockevent_inc(rwsem_sleep_writer);
> -			set_current_state(state);
> -			/*
> -			 * If HANDOFF bit is set, unconditionally do
> -			 * a trylock.
> -			 */
> -			if (wstate == WRITER_HANDOFF)
> -				break;
> -
> -			if ((wstate == WRITER_NOT_FIRST) &&
> -			    (rwsem_first_waiter(sem) == &waiter))
> -				wstate = WRITER_FIRST;
> -
> -			count = atomic_long_read(&sem->count);
> -			if (!(count & RWSEM_LOCK_MASK))
> -				break;
> -
> -			/*
> -			 * The setting of the handoff bit is deferred
> -			 * until rwsem_try_write_lock() is called.
> -			 */
> -			if ((wstate == WRITER_FIRST) && (rt_task(current) ||
> -			    time_after(jiffies, waiter.timeout))) {
> -				wstate = WRITER_HANDOFF;
> -				lockevent_inc(rwsem_wlock_handoff);
> -				break;
> -			}
> -		}
> +		schedule();
> +		lockevent_inc(rwsem_sleep_writer);
> +		set_current_state(state);
>   trylock_again:
>   		raw_spin_lock_irq(&sem->wait_lock);
>   	}
>   	__set_current_state(TASK_RUNNING);
> -	list_del(&waiter.list);
>   	raw_spin_unlock_irq(&sem->wait_lock);
>   	lockevent_inc(rwsem_wlock);
> -
> -	return ret;
> +	return sem;
>   
>   out_nolock:
>   	__set_current_state(TASK_RUNNING);
>   	raw_spin_lock_irq(&sem->wait_lock);
> -	list_del(&waiter.list);
> -
> -	if (unlikely(wstate == WRITER_HANDOFF))
> -		atomic_long_add(-RWSEM_FLAG_HANDOFF,  &sem->count);
> -
> -	if (list_empty(&sem->wait_list))
> -		atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
> -	else
> +	rwsem_del_waiter(sem, &waiter);
> +	if (!list_empty(&sem->wait_list))
>   		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
>   	raw_spin_unlock_irq(&sem->wait_lock);
>   	wake_up_q(&wake_q);
>   	lockevent_inc(rwsem_wlock_fail);
> -
>   	return ERR_PTR(-EINTR);
>   }
>   
> 


-- 
Thx and BRs,
Aiqun(Maria) Yu


* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2021-11-16  2:52 ` Aiqun(Maria) Yu
@ 2021-11-16  9:14   ` Peter Zijlstra
  2021-11-16  9:24     ` Peter Zijlstra
  0 siblings, 1 reply; 41+ messages in thread
From: Peter Zijlstra @ 2021-11-16  9:14 UTC (permalink / raw)
  To: Aiqun(Maria) Yu
  Cc: Waiman Long, Ingo Molnar, Will Deacon, linux-kernel,
	Davidlohr Bueso, mazhenhua, Hillf Danton

On Tue, Nov 16, 2021 at 10:52:42AM +0800, Aiqun(Maria) Yu wrote:
> On 11/16/2021 9:29 AM, Waiman Long wrote:
> > There are some inconsistencies in the way that the handoff bit is being
> > handled in readers and writers.
> > 
> > Firstly, when a queue head writer sets the handoff bit, it will clear it
> > when the writer is killed or interrupted on its way out without
> > acquiring the lock. That is not the case for a queue head reader. The
> > handoff bit will simply be inherited by the next waiter.
> > 
> > Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
> > the waiter and handoff bits are cleared if the wait queue becomes empty.
> > For rwsem_down_write_slowpath(), however, the handoff bit is not checked
> > and cleared if the wait queue is empty. This can potentially leave the
> > handoff bit set with an empty wait queue.
> > 
> > To make the handoff bit handling more consistent and robust, extract
> > the handoff bit clearing code into a new rwsem_del_waiter() helper
> > function.  The common function will only use atomic_long_andnot() to
> > clear bits when the wait queue is empty, to avoid a possible race
> > condition.
> We do have a race condition that needs to be fixed by this change.

Indeed, let me edit the changelog to reflect that. Also, I think it
needs a Reported-by:.

> > If the first waiter with the handoff bit set is killed or interrupted
> > and exits the slowpath without acquiring the lock, the next waiter will
> > inherit the handoff bit.
> > 
> > While at it, simplify the trylock for loop in rwsem_down_write_slowpath()
> > to make it easier to read.
> > 
> > Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
> > Suggested-by: Peter Zijlstra <peterz@infradead.org>
> > Signed-off-by: Waiman Long <longman@redhat.com>


* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2021-11-16  9:14   ` Peter Zijlstra
@ 2021-11-16  9:24     ` Peter Zijlstra
  2021-11-16 14:52       ` Waiman Long
  0 siblings, 1 reply; 41+ messages in thread
From: Peter Zijlstra @ 2021-11-16  9:24 UTC (permalink / raw)
  To: Aiqun(Maria) Yu
  Cc: Waiman Long, Ingo Molnar, Will Deacon, linux-kernel,
	Davidlohr Bueso, mazhenhua, Hillf Danton

On Tue, Nov 16, 2021 at 10:14:20AM +0100, Peter Zijlstra wrote:
> On Tue, Nov 16, 2021 at 10:52:42AM +0800, Aiqun(Maria) Yu wrote:
> > On 11/16/2021 9:29 AM, Waiman Long wrote:
> > > There are some inconsistencies in the way that the handoff bit is being
> > > handled in readers and writers.
> > > 
> > > Firstly, when a queue head writer sets the handoff bit, it will clear it
> > > when the writer is killed or interrupted on its way out without
> > > acquiring the lock. That is not the case for a queue head reader. The
> > > handoff bit will simply be inherited by the next waiter.
> > > 
> > > Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
> > > the waiter and handoff bits are cleared if the wait queue becomes empty.
> > > For rwsem_down_write_slowpath(), however, the handoff bit is not checked
> > > and cleared if the wait queue is empty. This can potentially leave the
> > > handoff bit set with an empty wait queue.
> > > 
> > > To make the handoff bit handling more consistent and robust, extract
> > > the handoff bit clearing code into a new rwsem_del_waiter() helper
> > > function.  The common function will only use atomic_long_andnot() to
> > > clear bits when the wait queue is empty, to avoid a possible race
> > > condition.
> > We do have a race condition that needs to be fixed by this change.
> 
> Indeed, let me edit the changelog to reflect that. Also, I think it
> needs a Reported-by:.

How's something like so then?

---
Subject: locking/rwsem: Make handoff bit handling more consistent
From: Waiman Long <longman@redhat.com>
Date: Mon, 15 Nov 2021 20:29:12 -0500

From: Waiman Long <longman@redhat.com>

There are some inconsistencies in the way that the handoff bit is being
handled in readers and writers that lead to a race condition.

Firstly, when a queue head writer sets the handoff bit, it will clear
it when the writer is killed or interrupted on its way out
without acquiring the lock. That is not the case for a queue head
reader. The handoff bit will simply be inherited by the next waiter.

Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
the waiter and handoff bits are cleared if the wait queue becomes
empty.  For rwsem_down_write_slowpath(), however, the handoff bit is
not checked and cleared if the wait queue is empty. This can
potentially leave the handoff bit set with an empty wait queue.

Worse, the situation in rwsem_down_write_slowpath() relies on wstate,
a variable set outside of the critical section containing the ->count
manipulation; this leads to a race condition where RWSEM_FLAG_HANDOFF
can be double-subtracted, corrupting ->count.
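
A condensed sketch of that interleaving (reconstructed from the old
out_nolock code removed below; simplified, the exact interleaving is
subtler):

	writer W (cached wstate == WRITER_HANDOFF)	another path
	------------------------------------------	------------
							clears RWSEM_FLAG_HANDOFF
							in sem->count
	out_nolock:
		/* wstate is stale; the bit is already gone */
		atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
		/* HANDOFF subtracted a second time: ->count corrupted */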

To make the handoff bit handling more consistent and robust, extract
the handoff bit clearing code into a new rwsem_del_waiter() helper
function. Also, completely eradicate wstate; always evaluate
everything inside the same critical section.

The common function will only use atomic_long_andnot() to clear bits
when the wait queue is empty, to avoid a possible race condition.  If
the first waiter with the handoff bit set is killed or interrupted and
exits the slowpath without acquiring the lock, the next waiter will
inherit the handoff bit.

While at it, simplify the trylock for loop in
rwsem_down_write_slowpath() to make it easier to read.

Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
Reported-by: Zhenhua Ma <mazhenhua@xiaomi.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com
---


* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2021-11-16  9:24     ` Peter Zijlstra
@ 2021-11-16 14:52       ` Waiman Long
  0 siblings, 0 replies; 41+ messages in thread
From: Waiman Long @ 2021-11-16 14:52 UTC (permalink / raw)
  To: Peter Zijlstra, Aiqun(Maria) Yu
  Cc: Ingo Molnar, Will Deacon, linux-kernel, Davidlohr Bueso,
	mazhenhua, Hillf Danton


On 11/16/21 04:24, Peter Zijlstra wrote:
> On Tue, Nov 16, 2021 at 10:14:20AM +0100, Peter Zijlstra wrote:
>> On Tue, Nov 16, 2021 at 10:52:42AM +0800, Aiqun(Maria) Yu wrote:
>>> On 11/16/2021 9:29 AM, Waiman Long wrote:
>>>> There are some inconsistencies in the way that the handoff bit is being
>>>> handled in readers and writers.
>>>>
>>>> Firstly, when a queue head writer sets the handoff bit, it will clear it
>>>> when the writer is killed or interrupted on its way out without
>>>> acquiring the lock. That is not the case for a queue head reader. The
>>>> handoff bit will simply be inherited by the next waiter.
>>>>
>>>> Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
>>>> the waiter and handoff bits are cleared if the wait queue becomes empty.
>>>> For rwsem_down_write_slowpath(), however, the handoff bit is not checked
>>>> and cleared if the wait queue is empty. This can potentially leave the
>>>> handoff bit set with an empty wait queue.
>>>>
>>>> To make the handoff bit handling more consistent and robust, extract
>>>> the handoff bit clearing code into a new rwsem_del_waiter() helper
>>>> function.  The common function will only use atomic_long_andnot() to
>>>> clear bits when the wait queue is empty, to avoid a possible race
>>>> condition.
>>> We do have a race condition that needs to be fixed by this change.
>> Indeed, let me edit the changelog to reflect that. Also, I think it
>> needs a Reported-by:.
> How's something like so then?
>
> ---
> Subject: locking/rwsem: Make handoff bit handling more consistent
> From: Waiman Long <longman@redhat.com>
> Date: Mon, 15 Nov 2021 20:29:12 -0500
>
> From: Waiman Long <longman@redhat.com>
>
> There are some inconsistencies in the way that the handoff bit is being
> handled in readers and writers that lead to a race condition.
>
> Firstly, when a queue head writer sets the handoff bit, it will clear
> it when the writer is killed or interrupted on its way out
> without acquiring the lock. That is not the case for a queue head
> reader. The handoff bit will simply be inherited by the next waiter.
>
> Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
> the waiter and handoff bits are cleared if the wait queue becomes
> empty.  For rwsem_down_write_slowpath(), however, the handoff bit is
> not checked and cleared if the wait queue is empty. This can
> potentially leave the handoff bit set with an empty wait queue.
>
> Worse, the situation in rwsem_down_write_slowpath() relies on wstate,
> a variable set outside of the critical section containing the ->count
> manipulation; this leads to a race condition where RWSEM_FLAG_HANDOFF
> can be double-subtracted, corrupting ->count.
>
> To make the handoff bit handling more consistent and robust, extract
> the handoff bit clearing code into a new rwsem_del_waiter() helper
> function. Also, completely eradicate wstate; always evaluate
> everything inside the same critical section.
>
> The common function will only use atomic_long_andnot() to clear bits
> when the wait queue is empty, to avoid a possible race condition.  If
> the first waiter with the handoff bit set is killed or interrupted and
> exits the slowpath without acquiring the lock, the next waiter will
> inherit the handoff bit.
>
> While at it, simplify the trylock for loop in
> rwsem_down_write_slowpath() to make it easier to read.
>
> Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
> Reported-by: Zhenhua Ma <mazhenhua@xiaomi.com>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Waiman Long <longman@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com
> ---
>
Yes, that looks good to me. Thanks for the editing.

Cheers,
Longman



* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2021-11-16  1:29 [PATCH v5] locking/rwsem: Make handoff bit handling more consistent Waiman Long
  2021-11-16  2:52 ` Aiqun(Maria) Yu
@ 2021-11-17 13:36 ` Peter Zijlstra
  2021-11-23  8:53 ` [tip: locking/urgent] " tip-bot2 for Waiman Long
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Peter Zijlstra @ 2021-11-17 13:36 UTC (permalink / raw)
  To: Waiman Long
  Cc: Ingo Molnar, Will Deacon, linux-kernel, Davidlohr Bueso,
	mazhenhua, Hillf Danton, Maria Yu

On Mon, Nov 15, 2021 at 08:29:12PM -0500, Waiman Long wrote:
> There are some inconsistencies in the way that the handoff bit is being
> handled in readers and writers.
> 
> Firstly, when a queue head writer sets the handoff bit, it will clear it
> when the writer is killed or interrupted on its way out without
> acquiring the lock. That is not the case for a queue head reader. The
> handoff bit will simply be inherited by the next waiter.
> 
> Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
> the waiter and handoff bits are cleared if the wait queue becomes empty.
> For rwsem_down_write_slowpath(), however, the handoff bit is not checked
> and cleared if the wait queue is empty. This can potentially leave the
> handoff bit set with an empty wait queue.
> 
> To make the handoff bit handling more consistent and robust, extract
> the handoff bit clearing code into a new rwsem_del_waiter() helper
> function.  The common function will only use atomic_long_andnot() to
> clear bits when the wait queue is empty, to avoid a possible race
> condition.  If the first waiter with the handoff bit set is killed or
> interrupted and exits the slowpath without acquiring the lock, the next
> waiter will inherit the handoff bit.
> 
> While at it, simplify the trylock for loop in rwsem_down_write_slowpath()
> to make it easier to read.
> 
> Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---

Zhenhua Ma, would you be able to confirm this works for you and provide
a Tested-by? 


* [tip: locking/urgent] locking/rwsem: Make handoff bit handling more consistent
  2021-11-16  1:29 [PATCH v5] locking/rwsem: Make handoff bit handling more consistent Waiman Long
  2021-11-16  2:52 ` Aiqun(Maria) Yu
  2021-11-17 13:36 ` Peter Zijlstra
@ 2021-11-23  8:53 ` tip-bot2 for Waiman Long
  2022-02-14 15:47 ` Re:[PATCH v5] " chenguanyou
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: tip-bot2 for Waiman Long @ 2021-11-23  8:53 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Zhenhua Ma, Peter Zijlstra, Waiman Long, x86, linux-kernel

The following commit has been merged into the locking/urgent branch of tip:

Commit-ID:     d257cc8cb8d5355ffc43a96bab94db7b5a324803
Gitweb:        https://git.kernel.org/tip/d257cc8cb8d5355ffc43a96bab94db7b5a324803
Author:        Waiman Long <longman@redhat.com>
AuthorDate:    Mon, 15 Nov 2021 20:29:12 -05:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 23 Nov 2021 09:45:35 +01:00

locking/rwsem: Make handoff bit handling more consistent

There are some inconsistencies in the way that the handoff bit is being
handled in readers and writers that lead to a race condition.

Firstly, when a queue head writer sets the handoff bit, it will clear
it when the writer is killed or interrupted on its way out
without acquiring the lock. That is not the case for a queue head
reader. The handoff bit will simply be inherited by the next waiter.

Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
the waiter and handoff bits are cleared if the wait queue becomes
empty.  For rwsem_down_write_slowpath(), however, the handoff bit is
not checked and cleared if the wait queue is empty. This can
potentially leave the handoff bit set with an empty wait queue.

Worse, the situation in rwsem_down_write_slowpath() relies on wstate,
a variable set outside of the critical section containing the ->count
manipulation; this leads to a race condition where RWSEM_FLAG_HANDOFF
can be double-subtracted, corrupting ->count.

To make the handoff bit handling more consistent and robust, extract
the handoff bit clearing code into a new rwsem_del_waiter() helper
function. Also, completely eradicate wstate; always evaluate
everything inside the same critical section.

The common function will only use atomic_long_andnot() to clear bits
when the wait queue is empty, to avoid a possible race condition.  If
the first waiter with the handoff bit set is killed or interrupted and
exits the slowpath without acquiring the lock, the next waiter will
inherit the handoff bit.

While at it, simplify the trylock for loop in
rwsem_down_write_slowpath() to make it easier to read.

Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
Reported-by: Zhenhua Ma <mazhenhua@xiaomi.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com
---
 kernel/locking/rwsem.c | 171 +++++++++++++++++++---------------------
 1 file changed, 85 insertions(+), 86 deletions(-)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index c51387a..e039cf1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -105,9 +105,9 @@
  * atomic_long_cmpxchg() will be used to obtain writer lock.
  *
  * There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers		-- set, clear
+ * 2) rwsem_try_write_lock() for writers	-- set, clear
+ * 3) rwsem_del_waiter()			-- clear
  *
  * For all the above cases, wait_lock will be held. A writer must also
  * be the first one in the wait_list to be eligible for setting the handoff
@@ -334,6 +334,9 @@ struct rwsem_waiter {
 	struct task_struct *task;
 	enum rwsem_waiter_type type;
 	unsigned long timeout;
+
+	/* Writer only, not initialized in reader */
+	bool handoff_set;
 };
 #define rwsem_first_waiter(sem) \
 	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -344,12 +347,6 @@ enum rwsem_wake_type {
 	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
 };
 
-enum writer_wait_state {
-	WRITER_NOT_FIRST,	/* Writer is not first in wait list */
-	WRITER_FIRST,		/* Writer is first in wait list     */
-	WRITER_HANDOFF		/* Writer is first & handoff needed */
-};
-
 /*
  * The typical HZ value is either 250 or 1000. So set the minimum waiting
  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -365,6 +362,31 @@ enum writer_wait_state {
  */
 #define MAX_READERS_WAKEUP	0x100
 
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_add_tail(&waiter->list, &sem->wait_list);
+	/* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ */
+static inline void
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_del(&waiter->list);
+	if (likely(!list_empty(&sem->wait_list)))
+		return;
+
+	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+}
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -376,6 +398,8 @@ enum writer_wait_state {
  *   preferably when the wait_lock is released
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
  */
 static void rwsem_mark_wake(struct rw_semaphore *sem,
 			    enum rwsem_wake_type wake_type,
@@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 
 	adjustment = woken * RWSEM_READER_BIAS - adjustment;
 	lockevent_cond_inc(rwsem_wake_reader, woken);
+
+	oldcount = atomic_long_read(&sem->count);
 	if (list_empty(&sem->wait_list)) {
-		/* hit end of list above */
+		/*
+		 * Combined with list_move_tail() above, this implies
+		 * rwsem_del_waiter().
+		 */
 		adjustment -= RWSEM_FLAG_WAITERS;
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
+	} else if (woken) {
+		/*
+		 * When we've woken a reader, we no longer need to force
+		 * writers to give up the lock and we can clear HANDOFF.
+		 */
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
 	}
 
-	/*
-	 * When we've woken a reader, we no longer need to force writers
-	 * to give up the lock and we can clear HANDOFF.
-	 */
-	if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
-		adjustment -= RWSEM_FLAG_HANDOFF;
-
 	if (adjustment)
 		atomic_long_add(adjustment, &sem->count);
 
@@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
  * race conditions between checking the rwsem wait list and setting the
  * sem->count accordingly.
  *
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
  */
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
-					enum writer_wait_state wstate)
+					struct rwsem_waiter *waiter)
 {
+	bool first = rwsem_first_waiter(sem) == waiter;
 	long count, new;
 
 	lockdep_assert_held(&sem->wait_lock);
@@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	do {
 		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
 
-		if (has_handoff && wstate == WRITER_NOT_FIRST)
-			return false;
+		if (has_handoff) {
+			if (!first)
+				return false;
+
+			/* First waiter inherits a previously set handoff bit */
+			waiter->handoff_set = true;
+		}
 
 		new = count;
 
 		if (count & RWSEM_LOCK_MASK) {
-			if (has_handoff || (wstate != WRITER_HANDOFF))
+			if (has_handoff || (!rt_task(waiter->task) &&
+					    !time_after(jiffies, waiter->timeout)))
 				return false;
 
 			new |= RWSEM_FLAG_HANDOFF;
@@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	 * We have either acquired the lock with handoff bit cleared or
 	 * set the handoff bit.
 	 */
-	if (new & RWSEM_FLAG_HANDOFF)
+	if (new & RWSEM_FLAG_HANDOFF) {
+		waiter->handoff_set = true;
+		lockevent_inc(rwsem_wlock_handoff);
 		return false;
+	}
 
+	/*
+	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+	 * success.
+	 */
+	list_del(&waiter->list);
 	rwsem_set_owner(sem);
 	return true;
 }
@@ -956,7 +1001,7 @@ queue:
 		}
 		adjustment += RWSEM_FLAG_WAITERS;
 	}
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock, but no longer actively locking */
 	count = atomic_long_add_return(adjustment, &sem->count);
@@ -1002,11 +1047,7 @@ queue:
 	return sem;
 
 out_nolock:
-	list_del(&waiter.list);
-	if (list_empty(&sem->wait_list)) {
-		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
-				   &sem->count);
-	}
+	rwsem_del_waiter(sem, &waiter);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock_fail);
@@ -1020,9 +1061,7 @@ static struct rw_semaphore *
 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
 	long count;
-	enum writer_wait_state wstate;
 	struct rwsem_waiter waiter;
-	struct rw_semaphore *ret = sem;
 	DEFINE_WAKE_Q(wake_q);
 
 	/* do optimistic spinning and steal lock if possible */
@@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	waiter.task = current;
 	waiter.type = RWSEM_WAITING_FOR_WRITE;
 	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+	waiter.handoff_set = false;
 
 	raw_spin_lock_irq(&sem->wait_lock);
-
-	/* account for this before adding a new element to the list */
-	wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock */
-	if (wstate == WRITER_NOT_FIRST) {
+	if (rwsem_first_waiter(sem) != &waiter) {
 		count = atomic_long_read(&sem->count);
 
 		/*
@@ -1083,13 +1119,16 @@ wait:
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
 	for (;;) {
-		if (rwsem_try_write_lock(sem, wstate)) {
+		if (rwsem_try_write_lock(sem, &waiter)) {
 			/* rwsem_try_write_lock() implies ACQUIRE on success */
 			break;
 		}
 
 		raw_spin_unlock_irq(&sem->wait_lock);
 
+		if (signal_pending_state(state, current))
+			goto out_nolock;
+
 		/*
 		 * After setting the handoff bit and failing to acquire
 		 * the lock, attempt to spin on owner to accelerate lock
@@ -1098,7 +1137,7 @@ wait:
 		 * In this case, we attempt to acquire the lock again
 		 * without sleeping.
 		 */
-		if (wstate == WRITER_HANDOFF) {
+		if (waiter.handoff_set) {
 			enum owner_state owner_state;
 
 			preempt_disable();
@@ -1109,66 +1148,26 @@ wait:
 				goto trylock_again;
 		}
 
-		/* Block until there are no active lockers. */
-		for (;;) {
-			if (signal_pending_state(state, current))
-				goto out_nolock;
-
-			schedule();
-			lockevent_inc(rwsem_sleep_writer);
-			set_current_state(state);
-			/*
-			 * If HANDOFF bit is set, unconditionally do
-			 * a trylock.
-			 */
-			if (wstate == WRITER_HANDOFF)
-				break;
-
-			if ((wstate == WRITER_NOT_FIRST) &&
-			    (rwsem_first_waiter(sem) == &waiter))
-				wstate = WRITER_FIRST;
-
-			count = atomic_long_read(&sem->count);
-			if (!(count & RWSEM_LOCK_MASK))
-				break;
-
-			/*
-			 * The setting of the handoff bit is deferred
-			 * until rwsem_try_write_lock() is called.
-			 */
-			if ((wstate == WRITER_FIRST) && (rt_task(current) ||
-			    time_after(jiffies, waiter.timeout))) {
-				wstate = WRITER_HANDOFF;
-				lockevent_inc(rwsem_wlock_handoff);
-				break;
-			}
-		}
+		schedule();
+		lockevent_inc(rwsem_sleep_writer);
+		set_current_state(state);
 trylock_again:
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
 	__set_current_state(TASK_RUNNING);
-	list_del(&waiter.list);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	lockevent_inc(rwsem_wlock);
-
-	return ret;
+	return sem;
 
 out_nolock:
 	__set_current_state(TASK_RUNNING);
 	raw_spin_lock_irq(&sem->wait_lock);
-	list_del(&waiter.list);
-
-	if (unlikely(wstate == WRITER_HANDOFF))
-		atomic_long_add(-RWSEM_FLAG_HANDOFF,  &sem->count);
-
-	if (list_empty(&sem->wait_list))
-		atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
-	else
+	rwsem_del_waiter(sem, &waiter);
+	if (!list_empty(&sem->wait_list))
 		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	wake_up_q(&wake_q);
 	lockevent_inc(rwsem_wlock_fail);
-
 	return ERR_PTR(-EINTR);
 }
 


* Re:[PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2021-11-16  1:29 [PATCH v5] locking/rwsem: Make handoff bit handling more consistent Waiman Long
                   ` (2 preceding siblings ...)
  2021-11-23  8:53 ` [tip: locking/urgent] " tip-bot2 for Waiman Long
@ 2022-02-14 15:47 ` chenguanyou
  2022-02-14 16:01   ` [PATCH " Greg KH
  2022-04-11 18:26   ` john.p.donnelly
  2022-02-14 16:22 ` chenguanyou
  2022-07-19  0:27 ` Doug Anderson
  5 siblings, 2 replies; 41+ messages in thread
From: chenguanyou @ 2022-02-14 15:47 UTC (permalink / raw)
  To: longman, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz, quic_aiquny, will

Hi Waiman, Greg,
This patch has been merged into branch linux-5.16.y.
Can we take it to the linux-5.10.y LTS version?

thanks,


* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-02-14 15:47 ` Re:[PATCH v5] " chenguanyou
@ 2022-02-14 16:01   ` Greg KH
  2022-04-11 18:26   ` john.p.donnelly
  1 sibling, 0 replies; 41+ messages in thread
From: Greg KH @ 2022-02-14 16:01 UTC (permalink / raw)
  To: chenguanyou
  Cc: longman, dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will

On Mon, Feb 14, 2022 at 11:47:41PM +0800, chenguanyou wrote:
> Hi Waiman, Greg,
> This patch has been merged into branch linux-5.16.y.
> Can we take it to the linux-5.10.y LTS version?

What is "this patch"?

To have patches applied to the stable kernel tree, please read:
    https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html

thanks,

greg k-h


* Re:[PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2021-11-16  1:29 [PATCH v5] locking/rwsem: Make handoff bit handling more consistent Waiman Long
                   ` (3 preceding siblings ...)
  2022-02-14 15:47 ` Re:[PATCH v5] " chenguanyou
@ 2022-02-14 16:22 ` chenguanyou
  2022-02-15  7:41   ` [PATCH " Greg KH
  2022-07-19  0:27 ` Doug Anderson
  5 siblings, 1 reply; 41+ messages in thread
From: chenguanyou @ 2022-02-14 16:22 UTC (permalink / raw)
  To: longman, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz, quic_aiquny, will

>> Hi Waiman, Greg,
>> This patch has been merged into branch linux-5.16.y.
>> Can we take it to the linux-5.10.y LTS version?

>What is "this patch"?

commit d257cc8cb8d5355ffc43a96bab94db7b5a324803 ("locking/rwsem: Make handoff bit handling more consistent")

thanks,


* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-02-14 16:22 ` chenguanyou
@ 2022-02-15  7:41   ` Greg KH
  2022-02-16 16:30     ` Waiman Long
  0 siblings, 1 reply; 41+ messages in thread
From: Greg KH @ 2022-02-15  7:41 UTC (permalink / raw)
  To: chenguanyou
  Cc: longman, dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will

On Tue, Feb 15, 2022 at 12:22:18AM +0800, chenguanyou wrote:
> >> Hi Waiman, Greg,
> >> This patch has been merged into branch linux-5.16.y.
> >> Can we take it to the linux-5.10.y LTS version?
> 
> >What is "this patch"?
> 
> commit d257cc8cb8d5355ffc43a96bab94db7b5a324803 ("locking/rwsem: Make handoff bit handling more consistent")

Have you tested it on the 5.10.y branch to verify it actually works
properly for you?

If so, please provide a working backport to the stable list, as it does
not apply cleanly as-is.

thanks,

greg k-h


* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-02-15  7:41   ` [PATCH " Greg KH
@ 2022-02-16 16:30     ` Waiman Long
  2022-02-17 15:41       ` chenguanyou
  0 siblings, 1 reply; 41+ messages in thread
From: Waiman Long @ 2022-02-16 16:30 UTC (permalink / raw)
  To: Greg KH, chenguanyou, Jaegeuk Kim
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz, quic_aiquny, will

[-- Attachment #1: Type: text/plain, Size: 1182 bytes --]

On 2/15/22 02:41, Greg KH wrote:
> On Tue, Feb 15, 2022 at 12:22:18AM +0800, chenguanyou wrote:
>>>> Hi Waiman, Greg,
>>>> This patch has been merged into branch linux-5.16.y.
>>>> Can we take it to the linux-5.10.y LTS version?
>>> What is "this patch"?
>> commit d257cc8cb8d5355ffc43a96bab94db7b5a324803 ("locking/rwsem: Make handoff bit handling more consistent")
> Have you tested it on the 5.10.y branch to verify it actually works
> properly for you?
>
> If so, please provide a working backport to the stable list, as it does
> not apply cleanly as-is.
>
> thanks,
>
> greg k-h
>
I have attached the 5.10.y backport of commit 
d257cc8cb8d5355ffc43a96bab94db7b5a324803 ("locking/rwsem: Make handoff 
bit handling more consistent"). I also include a backport of commit 
2f06f702925b512a95b95dca3855549c047eef58 ("locking/rwsem: Prevent 
potential lock starvation") which I think may help Jaegeuk. I had run 
some sanity tests and the backported patches work fine. However, I don't 
have access to their testing environments to verify if they can fix the 
problems seen by Chen or Jaegeuk. So please test these patches to see if 
they can address your problems.

Cheers,
Longman


[-- Attachment #2: 0001-locking-rwsem-Prevent-potential-lock-starvation.patch --]
[-- Type: text/x-patch, Size: 3133 bytes --]

From 0dbc7a60246afd00640e14f1d872b96e71bd92b3 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 15 Feb 2022 16:36:38 -0500
Subject: [PATCH 1/2] locking/rwsem: Prevent potential lock starvation

commit 2f06f702925b512a95b95dca3855549c047eef58 upstream.

The lock handoff bit is added in commit 4f23dbc1e657 ("locking/rwsem:
Implement lock handoff to prevent lock starvation") to avoid lock
starvation. However, allowing readers to do optimistic spinning does
introduce an unlikely scenario where lock starvation can happen.

The lock handoff bit may only be set when a waiter is being woken up.
In the case of reader unlock, wakeup happens only when the reader count
reaches 0. If there is a continuous stream of incoming readers acquiring
read lock via optimistic spinning, it is possible that the reader count
may never reach 0 and so the handoff bit will never be asserted.

One way to prevent this scenario from happening is to disallow optimistic
spinning if the rwsem is currently owned by readers. If the previous
or current owner is a writer, optimistic spinning will be allowed.

If the previous owner is a reader but the reader count has reached 0
before, a wakeup should have been issued. So the handoff mechanism
will be kicked in to prevent lock starvation. As a result, it should
be OK to do optimistic spinning in this case.
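
Schematically, the check added at the top of rwsem_down_read_slowpath()
boils down to this minimal sketch (using the RWSEM_* definitions from
kernel/locking/rwsem.c; rwsem_reader_may_spin() is only an illustrative
name, the actual patch open-codes the test):

static inline bool rwsem_reader_may_spin(struct rw_semaphore *sem)
{
	long count = atomic_long_read(&sem->count);
	long owner = atomic_long_read(&sem->owner);
	long rcnt  = count >> RWSEM_READER_SHIFT;

	/*
	 * Don't spin if the lock is read-owned by more than one reader
	 * and no writer holds it. A steady stream of spinning readers
	 * could otherwise keep the reader count from ever reaching 0,
	 * so the handoff bit would never be set for a sleeping waiter.
	 */
	return !((owner & RWSEM_READER_OWNED) && rcnt > 1 &&
		 !(count & RWSEM_WRITER_LOCKED));
}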

This patch may have some impact on reader performance as it reduces
reader optimistic spinning, especially if the lock critical sections
are short and the number of contending readers is small.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Link: https://lkml.kernel.org/r/20201121041416.12285-3-longman@redhat.com
---
 kernel/locking/rwsem.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 7bf45b0a1b1d..d4f5a8a473b3 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -998,16 +998,28 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
 static struct rw_semaphore __sched *
 rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
 {
-	long count, adjustment = -RWSEM_READER_BIAS;
+	long count = atomic_long_read(&sem->count);
+	long owner, adjustment = -RWSEM_READER_BIAS;
+	long rcnt = (count >> RWSEM_READER_SHIFT);
 	struct rwsem_waiter waiter;
 	DEFINE_WAKE_Q(wake_q);
 	bool wake = false;
 
+	/*
+	 * To prevent a constant stream of readers from starving a sleeping
+	 * waiter, don't attempt optimistic spinning if the lock is currently
+	 * owned by readers.
+	 */
+	owner = atomic_long_read(&sem->owner);
+	if ((owner & RWSEM_READER_OWNED) && (rcnt > 1) &&
+	   !(count & RWSEM_WRITER_LOCKED))
+		goto queue;
+
 	/*
 	 * Save the current read-owner of rwsem, if available, and the
 	 * reader nonspinnable bit.
 	 */
-	waiter.last_rowner = atomic_long_read(&sem->owner);
+	waiter.last_rowner = owner;
 	if (!(waiter.last_rowner & RWSEM_READER_OWNED))
 		waiter.last_rowner &= RWSEM_RD_NONSPINNABLE;
 
-- 
2.27.0


[-- Attachment #3: 0002-locking-rwsem-Make-handoff-bit-handling-more-consist.patch --]
[-- Type: text/x-patch, Size: 12487 bytes --]

From 52b265ee6918c6641ae9b440e55a7eee6b94039f Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 15 Feb 2022 16:51:04 -0500
Subject: [PATCH 2/2] locking/rwsem: Make handoff bit handling more consistent

commit d257cc8cb8d5355ffc43a96bab94db7b5a324803 upstream.

There are some inconsistencies in the way that the handoff bit is being
handled in readers and writers that lead to a race condition.

Firstly, when a queue head writer sets the handoff bit, it will clear
it when the writer is being killed or interrupted on its way out
without acquiring the lock. That is not the case for a queue head
reader. The handoff bit will simply be inherited by the next waiter.

Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
the waiter and handoff bits are cleared if the wait queue becomes
empty.  For rwsem_down_write_slowpath(), however, the handoff bit is
not checked and cleared if the wait queue is empty. This can
potentially leave the handoff bit set with an empty wait queue.

Worse, the situation in rwsem_down_write_slowpath() relies on wstate,
a variable set outside of the critical section containing the ->count
manipulation; this leads to a race condition where RWSEM_FLAG_HANDOFF
can be double subtracted, corrupting ->count.

To make the handoff bit handling more consistent and robust, extract
out handoff bit clearing code into the new rwsem_del_waiter() helper
function. Also, completely eradicate wstate; always evaluate
everything inside the same critical section.

The common function will only use atomic_long_andnot() to clear bits
when the wait queue is empty to avoid a possible race condition.  If the
first waiter with handoff bit set is killed or interrupted to exit the
slowpath without acquiring the lock, the next waiter will inherit the
handoff bit.

While at it, simplify the trylock for loop in
rwsem_down_write_slowpath() to make it easier to read.

Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
Reported-by: Zhenhua Ma <mazhenhua@xiaomi.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com
---
 kernel/locking/rwsem.c | 171 ++++++++++++++++++++---------------------
 1 file changed, 85 insertions(+), 86 deletions(-)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index d4f5a8a473b3..c3cb8ea1a82b 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -143,9 +143,9 @@
  * atomic_long_cmpxchg() will be used to obtain writer lock.
  *
  * There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers		-- set, clear
+ * 2) rwsem_try_write_lock() for writers	-- set, clear
+ * 3) rwsem_del_waiter()			-- clear
  *
  * For all the above cases, wait_lock will be held. A writer must also
  * be the first one in the wait_list to be eligible for setting the handoff
@@ -361,6 +361,9 @@ struct rwsem_waiter {
 	enum rwsem_waiter_type type;
 	unsigned long timeout;
 	unsigned long last_rowner;
+
+	/* Writer only, not initialized in reader */
+	bool handoff_set;
 };
 #define rwsem_first_waiter(sem) \
 	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -371,12 +374,6 @@ enum rwsem_wake_type {
 	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
 };
 
-enum writer_wait_state {
-	WRITER_NOT_FIRST,	/* Writer is not first in wait list */
-	WRITER_FIRST,		/* Writer is first in wait list     */
-	WRITER_HANDOFF		/* Writer is first & handoff needed */
-};
-
 /*
  * The typical HZ value is either 250 or 1000. So set the minimum waiting
  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -392,6 +389,31 @@ enum writer_wait_state {
  */
 #define MAX_READERS_WAKEUP	0x100
 
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_add_tail(&waiter->list, &sem->wait_list);
+	/* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ */
+static inline void
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_del(&waiter->list);
+	if (likely(!list_empty(&sem->wait_list)))
+		return;
+
+	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+}
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -403,6 +425,8 @@ enum writer_wait_state {
  *   preferably when the wait_lock is released
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
  */
 static void rwsem_mark_wake(struct rw_semaphore *sem,
 			    enum rwsem_wake_type wake_type,
@@ -521,18 +545,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 
 	adjustment = woken * RWSEM_READER_BIAS - adjustment;
 	lockevent_cond_inc(rwsem_wake_reader, woken);
+
+	oldcount = atomic_long_read(&sem->count);
 	if (list_empty(&sem->wait_list)) {
-		/* hit end of list above */
+		/*
+		 * Combined with list_move_tail() above, this implies
+		 * rwsem_del_waiter().
+		 */
 		adjustment -= RWSEM_FLAG_WAITERS;
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
+	} else if (woken) {
+		/*
+		 * When we've woken a reader, we no longer need to force
+		 * writers to give up the lock and we can clear HANDOFF.
+		 */
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
 	}
 
-	/*
-	 * When we've woken a reader, we no longer need to force writers
-	 * to give up the lock and we can clear HANDOFF.
-	 */
-	if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
-		adjustment -= RWSEM_FLAG_HANDOFF;
-
 	if (adjustment)
 		atomic_long_add(adjustment, &sem->count);
 
@@ -563,12 +594,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
  * race conditions between checking the rwsem wait list and setting the
  * sem->count accordingly.
  *
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
  */
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
-					enum writer_wait_state wstate)
+					struct rwsem_waiter *waiter)
 {
+	bool first = rwsem_first_waiter(sem) == waiter;
 	long count, new;
 
 	lockdep_assert_held(&sem->wait_lock);
@@ -577,13 +608,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	do {
 		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
 
-		if (has_handoff && wstate == WRITER_NOT_FIRST)
-			return false;
+		if (has_handoff) {
+			if (!first)
+				return false;
+
+			/* First waiter inherits a previously set handoff bit */
+			waiter->handoff_set = true;
+		}
 
 		new = count;
 
 		if (count & RWSEM_LOCK_MASK) {
-			if (has_handoff || (wstate != WRITER_HANDOFF))
+			if (has_handoff || (!rt_task(waiter->task) &&
+					    !time_after(jiffies, waiter->timeout)))
 				return false;
 
 			new |= RWSEM_FLAG_HANDOFF;
@@ -600,9 +637,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	 * We have either acquired the lock with handoff bit cleared or
 	 * set the handoff bit.
 	 */
-	if (new & RWSEM_FLAG_HANDOFF)
+	if (new & RWSEM_FLAG_HANDOFF) {
+		waiter->handoff_set = true;
+		lockevent_inc(rwsem_wlock_handoff);
 		return false;
+	}
 
+	/*
+	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+	 * success.
+	 */
+	list_del(&waiter->list);
 	rwsem_set_owner(sem);
 	return true;
 }
@@ -1075,7 +1120,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
 		}
 		adjustment += RWSEM_FLAG_WAITERS;
 	}
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock, but no longer actively locking */
 	if (adjustment)
@@ -1124,11 +1169,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
 	return sem;
 
 out_nolock:
-	list_del(&waiter.list);
-	if (list_empty(&sem->wait_list)) {
-		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
-				   &sem->count);
-	}
+	rwsem_del_waiter(sem, &waiter);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock_fail);
@@ -1156,9 +1197,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
 	long count;
 	bool disable_rspin;
-	enum writer_wait_state wstate;
 	struct rwsem_waiter waiter;
-	struct rw_semaphore *ret = sem;
 	DEFINE_WAKE_Q(wake_q);
 
 	/* do optimistic spinning and steal lock if possible */
@@ -1182,16 +1221,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	waiter.task = current;
 	waiter.type = RWSEM_WAITING_FOR_WRITE;
 	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+	waiter.handoff_set = false;
 
 	raw_spin_lock_irq(&sem->wait_lock);
-
-	/* account for this before adding a new element to the list */
-	wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock */
-	if (wstate == WRITER_NOT_FIRST) {
+	if (rwsem_first_waiter(sem) != &waiter) {
 		count = atomic_long_read(&sem->count);
 
 		/*
@@ -1227,13 +1263,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
 	for (;;) {
-		if (rwsem_try_write_lock(sem, wstate)) {
+		if (rwsem_try_write_lock(sem, &waiter)) {
 			/* rwsem_try_write_lock() implies ACQUIRE on success */
 			break;
 		}
 
 		raw_spin_unlock_irq(&sem->wait_lock);
 
+		if (signal_pending_state(state, current))
+			goto out_nolock;
+
 		/*
 		 * After setting the handoff bit and failing to acquire
 		 * the lock, attempt to spin on owner to accelerate lock
@@ -1242,71 +1281,31 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 		 * In this case, we attempt to acquire the lock again
 		 * without sleeping.
 		 */
-		if (wstate == WRITER_HANDOFF &&
+		if (waiter.handoff_set &&
 		    rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL)
 			goto trylock_again;
 
-		/* Block until there are no active lockers. */
-		for (;;) {
-			if (signal_pending_state(state, current))
-				goto out_nolock;
-
-			schedule();
-			lockevent_inc(rwsem_sleep_writer);
-			set_current_state(state);
-			/*
-			 * If HANDOFF bit is set, unconditionally do
-			 * a trylock.
-			 */
-			if (wstate == WRITER_HANDOFF)
-				break;
-
-			if ((wstate == WRITER_NOT_FIRST) &&
-			    (rwsem_first_waiter(sem) == &waiter))
-				wstate = WRITER_FIRST;
-
-			count = atomic_long_read(&sem->count);
-			if (!(count & RWSEM_LOCK_MASK))
-				break;
-
-			/*
-			 * The setting of the handoff bit is deferred
-			 * until rwsem_try_write_lock() is called.
-			 */
-			if ((wstate == WRITER_FIRST) && (rt_task(current) ||
-			    time_after(jiffies, waiter.timeout))) {
-				wstate = WRITER_HANDOFF;
-				lockevent_inc(rwsem_wlock_handoff);
-				break;
-			}
-		}
+		schedule();
+		lockevent_inc(rwsem_sleep_writer);
+		set_current_state(state);
 trylock_again:
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
 	__set_current_state(TASK_RUNNING);
-	list_del(&waiter.list);
 	rwsem_disable_reader_optspin(sem, disable_rspin);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	lockevent_inc(rwsem_wlock);
-
-	return ret;
+	return sem;
 
 out_nolock:
 	__set_current_state(TASK_RUNNING);
 	raw_spin_lock_irq(&sem->wait_lock);
-	list_del(&waiter.list);
-
-	if (unlikely(wstate == WRITER_HANDOFF))
-		atomic_long_add(-RWSEM_FLAG_HANDOFF,  &sem->count);
-
-	if (list_empty(&sem->wait_list))
-		atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
-	else
+	rwsem_del_waiter(sem, &waiter);
+	if (!list_empty(&sem->wait_list))
 		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	wake_up_q(&wake_q);
 	lockevent_inc(rwsem_wlock_fail);
-
 	return ERR_PTR(-EINTR);
 }
 
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re:[PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-02-16 16:30     ` Waiman Long
@ 2022-02-17 15:41       ` chenguanyou
  2022-03-14  8:07         ` [PATCH " Greg KH
  0 siblings, 1 reply; 41+ messages in thread
From: chenguanyou @ 2022-02-17 15:41 UTC (permalink / raw)
  To: longman
  Cc: chenguanyou9338, dave, gregkh, hdanton, jaegeuk, linux-kernel,
	mazhenhua, mingo, peterz, quic_aiquny, will

>>>>> Hi Waiman, Greg,
>>>>> This patch has been merged in branch linux-5.16.y.
>>>>> Can we take it to the linux-5.10.y LTS version?
>>>> What is "this patch"?
>>> commit d257cc8cb8d5355ffc43a96bab94db7b5a324803 ("locking/rwsem: Make handoff bit handling more consistent")
>> Have you tested it on the 5.10.y branch to verify it actually works
>> properly for you?
>>
>> If so, please provide a working backport to the stable list, as it does
>> not apply cleanly as-is.
>>
>> thanks,
>>
>> greg k-h
>>
> I have attached the 5.10.y backport of commit 
> d257cc8cb8d5355ffc43a96bab94db7b5a324803 ("locking/rwsem: Make handoff 
> bit handling more consistent"). I also include a backport of commit 
> 2f06f702925b512a95b95dca3855549c047eef58 ("locking/rwsem: Prevent 
> potential lock starvation") which I think may help Jaegeuk. I had run 
> some sanity tests and the backported patches work fine. However, I don't 
> have access to their testing environments to verify if they can fix the 
> problems seen by Chen or Jaegeuk. So please test these patches to see if 
> they can address your problems.

Hi Longman,

I'll do some stability testing on our 5.10 phone.

thanks,
Guanyou.Chen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-02-17 15:41       ` chenguanyou
@ 2022-03-14  8:07         ` Greg KH
  2022-03-22  2:49           ` chenguanyou
  0 siblings, 1 reply; 41+ messages in thread
From: Greg KH @ 2022-03-14  8:07 UTC (permalink / raw)
  To: chenguanyou
  Cc: longman, dave, hdanton, jaegeuk, linux-kernel, mazhenhua, mingo,
	peterz, quic_aiquny, will

On Thu, Feb 17, 2022 at 11:41:54PM +0800, chenguanyou wrote:
> >>>>> Hi Waiman, Greg,
> >>>>> This patch has been merged in branch linux-5.16.y.
> >>>>> Can we take it to the linux-5.10.y LTS version?
> >>>> What is "this patch"?
> >>> commit d257cc8cb8d5355ffc43a96bab94db7b5a324803 ("locking/rwsem: Make handoff bit handling more consistent")
> >> Have you tested it on the 5.10.y branch to verify it actually works
> >> properly for you?
> >>
> >> If so, please provide a working backport to the stable list, as it does
> >> not apply cleanly as-is.
> >>
> >> thanks,
> >>
> >> greg k-h
> >>
> > I have attached the 5.10.y backport of commit 
> > d257cc8cb8d5355ffc43a96bab94db7b5a324803 ("locking/rwsem: Make handoff 
> > bit handling more consistent"). I also include a backport of commit 
> > 2f06f702925b512a95b95dca3855549c047eef58 ("locking/rwsem: Prevent 
> > potential lock starvation") which I think may help Jaegeuk. I had run 
> > some sanity tests and the backported patches work fine. However, I don't 
> > have access to their testing environments to verify if they can fix the 
> > problems seen by Chen or Jaegeuk. So please test these patches to see if 
> > they can address your problems.
> 
> Hi Longman,
> 
> I'll do some stability testing on our 5.10 phone.

What ever happened with this testing?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re:[PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-03-14  8:07         ` [PATCH " Greg KH
@ 2022-03-22  2:49           ` chenguanyou
  2022-03-24 12:51             ` [PATCH " Greg KH
  0 siblings, 1 reply; 41+ messages in thread
From: chenguanyou @ 2022-03-22  2:49 UTC (permalink / raw)
  To: gregkh
  Cc: chenguanyou9338, dave, hdanton, jaegeuk, linux-kernel, longman,
	mazhenhua, mingo, peterz, quic_aiquny, will

> What ever happened with this testing?

> thanks,

> greg k-h

We updated the Android GKI kernel and ran monkey tests with this patch.
Link: https://cs.android.com/android/kernel/superproject/+/common-android12-5.10-2022-03:common/kernel/locking/rwsem.c;l=1290

thanks,
Guanyou.Chen

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-03-22  2:49           ` chenguanyou
@ 2022-03-24 12:51             ` Greg KH
  0 siblings, 0 replies; 41+ messages in thread
From: Greg KH @ 2022-03-24 12:51 UTC (permalink / raw)
  To: chenguanyou
  Cc: dave, hdanton, jaegeuk, linux-kernel, longman, mazhenhua, mingo,
	peterz, quic_aiquny, will

On Tue, Mar 22, 2022 at 10:49:58AM +0800, chenguanyou wrote:
> > What ever happened with this testing?
> 
> > thanks,
> 
> > greg k-h
> 
> We updated android gki, monkey test mini patch.
> Link: https://cs.android.com/android/kernel/superproject/+/common-android12-5.10-2022-03:common/kernel/locking/rwsem.c;l=1290

I do not understand what this is supposed to tell me to do at all :(

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-02-14 15:47 ` Re:[PATCH v5] " chenguanyou
  2022-02-14 16:01   ` [PATCH " Greg KH
@ 2022-04-11 18:26   ` john.p.donnelly
  2022-04-11 18:40     ` Waiman Long
  1 sibling, 1 reply; 41+ messages in thread
From: john.p.donnelly @ 2022-04-11 18:26 UTC (permalink / raw)
  To: chenguanyou, longman, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will, John Donnelly, Peter Zijlstra (Intel),
	sashal

On 2/14/22 9:47 AM, chenguanyou wrote:
> Hi Waiman, Greg,
> This patch has been merged in branch linux-5.16.y.
> Can we take it to the linux-5.10.y LTS version?
> 
> thanks,

Hi,

As a FYI:

We have observed the following lockup with this commit added to 5.15 LTS:

d257cc8cb8d5 - locking/rwsem: Make handoff bit handling more consistent 
(4 months ago) <Waiman Long>

The "fio" test suit fails with LVM devices composed of four NVME devices 
with these observed lockup, panic.



ext4:

PID: 3682   TASK: ffff8f489ae34bc0  CPU: 2   COMMAND: "dio/dm-0"
  #0 [fffffe0000083e50] crash_nmi_callback at ffffffff828772b3
  #1 [fffffe0000083e58] nmi_handle at ffffffff82840778
  #2 [fffffe0000083ea0] default_do_nmi at ffffffff8337a1e2
  #3 [fffffe0000083ec8] exc_nmi at ffffffff8337a48d
  #4 [fffffe0000083ef0] end_repeat_nmi at ffffffff8340153b
     [exception RIP: _raw_spin_lock_irq+23]
     RIP: ffffffff8338b2e7  RSP: ffff9c4409b47c78  RFLAGS: 00000046
     RAX: 0000000000000000  RBX: ffff8f489ae34bc0  RCX: 0000000000000000
     RDX: 0000000000000001  RSI: 0000000000000000  RDI: ffff8f47f7b90104
     RBP: ffff9c4409b47d20   R8: 0000000000000000   R9: 0000000000000000
     R10: 0000000000000000  R11: 0000000000000000  R12: ffff8f47f7b90104
     R13: ffff9c4409b47cb0  R14: ffff8f47f7b900f0  R15: 0000000000000000
     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
     <NMI exception stack> ---
  #5 [ffff9c4409b47c78] _raw_spin_lock_irq at ffffffff8338b2e7
  #6 [ffff9c4409b47c78] rwsem_down_write_slowpath at ffffffff82925be9
  #7 [ffff9c4409b47d28] ext4_map_blocks at ffffffffc11c26dc [ext4]
  #8 [ffff9c4409b47d98] ext4_convert_unwritten_extents at
ffffffffc11ad9e0 [ext4]
  #9 [ffff9c4409b47df0] ext4_dio_write_end_io at ffffffffc11b22aa [ext4]

xfs:

PID: 3719   TASK: ffff9f81d2d74bc0  CPU: 37  COMMAND: "dio/dm-0"
  #0 [fffffe0000894e50] crash_nmi_callback at ffffffffad6772b3
  #1 [fffffe0000894e58] nmi_handle at ffffffffad640778
  #2 [fffffe0000894ea0] default_do_nmi at ffffffffae17a1e2
  #3 [fffffe0000894ec8] exc_nmi at ffffffffae17a48d
  #4 [fffffe0000894ef0] end_repeat_nmi at ffffffffae20153b
     [exception RIP: _raw_spin_lock_irq+23]
     RIP: ffffffffae18b2e7  RSP: ffffbb7ec9637c48  RFLAGS: 00000046
     RAX: 0000000000000000  RBX: ffff9f81d2d74bc0  RCX: 0000000000000000
     RDX: 0000000000000001  RSI: 0000000000000000  RDI: ffff9f81c04a918c
     RBP: ffffbb7ec9637ce8   R8: 0000000000000000   R9: 0000000000000000
     R10: 0000000000000000  R11: 0000000000000000  R12: ffff9f81c04a918c
     R13: ffffbb7ec9637c80  R14: ffff9f81c04a9178  R15: 0000000000000000
     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
     <NMI exception stack> ---
  #5 [ffffbb7ec9637c48] _raw_spin_lock_irq at ffffffffae18b2e7
  #6 [ffffbb7ec9637c48] rwsem_down_write_slowpath at ffffffffad725be9
  #7 [ffffbb7ec9637cf0] xfs_trans_alloc_inode at ffffffffc074f2bd [xfs]
  #8 [ffffbb7ec9637d50] xfs_iomap_write_unwritten at ffffffffc073ad15
[xfs]
  #9 [ffffbb7ec9637dd0] xfs_dio_write_end_io at ffffffffc072db62 [xfs]


I have reached out to Waiman and he suggested this for our next test pass:


1ee326196c6658 locking/rwsem: Always try to wake waiters in out_nolock path

-- 

Thank you.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-11 18:26   ` john.p.donnelly
@ 2022-04-11 18:40     ` Waiman Long
  2022-04-11 21:03       ` john.p.donnelly
  0 siblings, 1 reply; 41+ messages in thread
From: Waiman Long @ 2022-04-11 18:40 UTC (permalink / raw)
  To: john.p.donnelly, chenguanyou, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will, sashal

On 4/11/22 14:26, john.p.donnelly@oracle.com wrote:
> On 2/14/22 9:47 AM, chenguanyou wrote:
>> Hi Waiman, Greg,
>> This patch has been merged in branch linux-5.16.y.
>> Can we take it to the linux-5.10.y LTS version?
>>
>> thanks,
>
> Hi,
>
> As a FYI:
>
> We have observed the following lockup with this commit added to 
> 5.15 LTS:
>
> d257cc8cb8d5 - locking/rwsem: Make handoff bit handling more 
> consistent (4 months ago) <Waiman Long>
>
> The "fio" test suit fails with LVM devices composed of four NVME 
> devices with these observed lockup, panic.
>
>
>
> ext4:
>
> PID: 3682   TASK: ffff8f489ae34bc0  CPU: 2   COMMAND: "dio/dm-0"
>  #0 [fffffe0000083e50] crash_nmi_callback at ffffffff828772b3
>  #1 [fffffe0000083e58] nmi_handle at ffffffff82840778
>  #2 [fffffe0000083ea0] default_do_nmi at ffffffff8337a1e2
>  #3 [fffffe0000083ec8] exc_nmi at ffffffff8337a48d
>  #4 [fffffe0000083ef0] end_repeat_nmi at ffffffff8340153b
>     [exception RIP: _raw_spin_lock_irq+23]
>     RIP: ffffffff8338b2e7  RSP: ffff9c4409b47c78  RFLAGS: 00000046
>     RAX: 0000000000000000  RBX: ffff8f489ae34bc0  RCX: 0000000000000000
>     RDX: 0000000000000001  RSI: 0000000000000000  RDI: ffff8f47f7b90104
>     RBP: ffff9c4409b47d20   R8: 0000000000000000   R9: 0000000000000000
>     R10: 0000000000000000  R11: 0000000000000000  R12: ffff8f47f7b90104
>     R13: ffff9c4409b47cb0  R14: ffff8f47f7b900f0  R15: 0000000000000000
>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>     <NMI exception stack> ---
>  #5 [ffff9c4409b47c78] _raw_spin_lock_irq at ffffffff8338b2e7
>  #6 [ffff9c4409b47c78] rwsem_down_write_slowpath at ffffffff82925be9
>  #7 [ffff9c4409b47d28] ext4_map_blocks at ffffffffc11c26dc [ext4]
>  #8 [ffff9c4409b47d98] ext4_convert_unwritten_extents at
> ffffffffc11ad9e0 [ext4]
>  #9 [ffff9c4409b47df0] ext4_dio_write_end_io at ffffffffc11b22aa [ext4]
>
> xfs:
>
> PID: 3719   TASK: ffff9f81d2d74bc0  CPU: 37  COMMAND: "dio/dm-0"
>  #0 [fffffe0000894e50] crash_nmi_callback at ffffffffad6772b3
>  #1 [fffffe0000894e58] nmi_handle at ffffffffad640778
>  #2 [fffffe0000894ea0] default_do_nmi at ffffffffae17a1e2
>  #3 [fffffe0000894ec8] exc_nmi at ffffffffae17a48d
>  #4 [fffffe0000894ef0] end_repeat_nmi at ffffffffae20153b
>     [exception RIP: _raw_spin_lock_irq+23]
>     RIP: ffffffffae18b2e7  RSP: ffffbb7ec9637c48  RFLAGS: 00000046
>     RAX: 0000000000000000  RBX: ffff9f81d2d74bc0  RCX: 0000000000000000
>     RDX: 0000000000000001  RSI: 0000000000000000  RDI: ffff9f81c04a918c
>     RBP: ffffbb7ec9637ce8   R8: 0000000000000000   R9: 0000000000000000
>     R10: 0000000000000000  R11: 0000000000000000  R12: ffff9f81c04a918c
>     R13: ffffbb7ec9637c80  R14: ffff9f81c04a9178  R15: 0000000000000000
>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>     <NMI exception stack> ---
>  #5 [ffffbb7ec9637c48] _raw_spin_lock_irq at ffffffffae18b2e7
>  #6 [ffffbb7ec9637c48] rwsem_down_write_slowpath at ffffffffad725be9
>  #7 [ffffbb7ec9637cf0] xfs_trans_alloc_inode at ffffffffc074f2bd [xfs]
>  #8 [ffffbb7ec9637d50] xfs_iomap_write_unwritten at ffffffffc073ad15
> [xfs]
>  #9 [ffffbb7ec9637dd0] xfs_dio_write_end_io at ffffffffc072db62 [xfs]
>
>
> I have reached out to Waiman and he suggested this for our next test 
> pass:
>
>
> 1ee326196c6658 locking/rwsem: Always try to wake waiters in out_nolock 
> path

Does this commit help to avoid the lockup problem?

Commit 1ee326196c6658 fixes a potential missed wakeup problem when a 
reader first in the wait queue is interrupted out without acquiring the 
lock. It is actually not a fix for commit d257cc8cb8d5. However, this 
commit changes the out_nolock path behavior of writers by leaving the 
handoff bit set when the wait queue isn't empty. That likely makes the 
missed wakeup problem easier to reproduce.
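
For reference, the core of that commit can be sketched as a small helper
along these lines (a simplified sketch, not the exact upstream code):

static inline void
rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
		      struct wake_q_head *wake_q)
{
	bool first = rwsem_first_waiter(sem) == waiter;

	wake_q_init(wake_q);

	/*
	 * If the wait_list isn't empty and the waiter to be deleted is
	 * the first waiter, wake up the remaining waiters as they may
	 * be eligible to acquire or spin on the lock.
	 */
	if (rwsem_del_waiter(sem, waiter) && first)
		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
}

Here rwsem_del_waiter() is changed to return whether the wait list is
still non-empty, and both the reader and writer out_nolock paths call
this helper before doing wake_up_q().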

Cheers,
Longman


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-11 18:40     ` Waiman Long
@ 2022-04-11 21:03       ` john.p.donnelly
  2022-04-11 21:07         ` Waiman Long
  0 siblings, 1 reply; 41+ messages in thread
From: john.p.donnelly @ 2022-04-11 21:03 UTC (permalink / raw)
  To: Waiman Long, chenguanyou, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will, sashal, John Donnelly

On 4/11/22 1:40 PM, Waiman Long wrote:
> On 4/11/22 14:26, john.p.donnelly@oracle.com wrote:
>> On 2/14/22 9:47 AM, chenguanyou wrote:
>>> Hi Waiman, Greg,
>>> This patch has been merged in branch linux-5.16.y.
>>> Can we take it to the linux-5.10.y LTS version?
>>>
>>> thanks,
>>
>> Hi,
>>
>> As a FYI:
>>
>> We have observed the following lockup with this commit added to 
>> 5.15 LTS:
>>
>> d257cc8cb8d5 - locking/rwsem: Make handoff bit handling more 
>> consistent (4 months ago) <Waiman Long>
>>
>> The "fio" test suit fails with LVM devices composed of four NVME 
>> devices with these observed lockup, panic.
>>
>>
>>
>> ext4:
>>
>> PID: 3682   TASK: ffff8f489ae34bc0  CPU: 2   COMMAND: "dio/dm-0"
>>  #0 [fffffe0000083e50] crash_nmi_callback at ffffffff828772b3
>>  #1 [fffffe0000083e58] nmi_handle at ffffffff82840778
>>  #2 [fffffe0000083ea0] default_do_nmi at ffffffff8337a1e2
>>  #3 [fffffe0000083ec8] exc_nmi at ffffffff8337a48d
>>  #4 [fffffe0000083ef0] end_repeat_nmi at ffffffff8340153b
>>     [exception RIP: _raw_spin_lock_irq+23]
>>     RIP: ffffffff8338b2e7  RSP: ffff9c4409b47c78  RFLAGS: 00000046
>>     RAX: 0000000000000000  RBX: ffff8f489ae34bc0  RCX: 0000000000000000
>>     RDX: 0000000000000001  RSI: 0000000000000000  RDI: ffff8f47f7b90104
>>     RBP: ffff9c4409b47d20   R8: 0000000000000000   R9: 0000000000000000
>>     R10: 0000000000000000  R11: 0000000000000000  R12: ffff8f47f7b90104
>>     R13: ffff9c4409b47cb0  R14: ffff8f47f7b900f0  R15: 0000000000000000
>>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>>     <NMI exception stack> ---
>>  #5 [ffff9c4409b47c78] _raw_spin_lock_irq at ffffffff8338b2e7
>>  #6 [ffff9c4409b47c78] rwsem_down_write_slowpath at ffffffff82925be9
>>  #7 [ffff9c4409b47d28] ext4_map_blocks at ffffffffc11c26dc [ext4]
>>  #8 [ffff9c4409b47d98] ext4_convert_unwritten_extents at
>> ffffffffc11ad9e0 [ext4]
>>  #9 [ffff9c4409b47df0] ext4_dio_write_end_io at ffffffffc11b22aa [ext4]
>>
>> xfs:
>>
>> PID: 3719   TASK: ffff9f81d2d74bc0  CPU: 37  COMMAND: "dio/dm-0"
>>  #0 [fffffe0000894e50] crash_nmi_callback at ffffffffad6772b3
>>  #1 [fffffe0000894e58] nmi_handle at ffffffffad640778
>>  #2 [fffffe0000894ea0] default_do_nmi at ffffffffae17a1e2
>>  #3 [fffffe0000894ec8] exc_nmi at ffffffffae17a48d
>>  #4 [fffffe0000894ef0] end_repeat_nmi at ffffffffae20153b
>>     [exception RIP: _raw_spin_lock_irq+23]
>>     RIP: ffffffffae18b2e7  RSP: ffffbb7ec9637c48  RFLAGS: 00000046
>>     RAX: 0000000000000000  RBX: ffff9f81d2d74bc0  RCX: 0000000000000000
>>     RDX: 0000000000000001  RSI: 0000000000000000  RDI: ffff9f81c04a918c
>>     RBP: ffffbb7ec9637ce8   R8: 0000000000000000   R9: 0000000000000000
>>     R10: 0000000000000000  R11: 0000000000000000  R12: ffff9f81c04a918c
>>     R13: ffffbb7ec9637c80  R14: ffff9f81c04a9178  R15: 0000000000000000
>>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>>     <NMI exception stack> ---
>>  #5 [ffffbb7ec9637c48] _raw_spin_lock_irq at ffffffffae18b2e7
>>  #6 [ffffbb7ec9637c48] rwsem_down_write_slowpath at ffffffffad725be9
>>  #7 [ffffbb7ec9637cf0] xfs_trans_alloc_inode at ffffffffc074f2bd [xfs]
>>  #8 [ffffbb7ec9637d50] xfs_iomap_write_unwritten at ffffffffc073ad15
>> [xfs]
>>  #9 [ffffbb7ec9637dd0] xfs_dio_write_end_io at ffffffffc072db62 [xfs]
>>
>>
>> I have reached out to Waiman and he suggested this for our next test 
>> pass:
>>
>>
>> 1ee326196c6658 locking/rwsem: Always try to wake waiters in out_nolock 
>> path
> 
> Does this commit help to avoid the lockup problem?
> 
> Commit 1ee326196c6658 fixes a potential missed wakeup problem when a 
> reader first in the wait queue is interrupted out without acquiring the 
> lock. It is actually not a fix for commit d257cc8cb8d5. However, this 
> commit changes the out_nolock path behavior of writers by leaving the 
> handoff bit set when the wait queue isn't empty. That likely makes the 
> missed wakeup problem easier to reproduce.
> 
> Cheers,
> Longman
> 

Hi,


We are testing now

ETA for fio soak test completion is  ~15hr from now.

I wanted to share the stack traces for future reference + occurrences.


Cheers.

JD



...


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-11 21:03       ` john.p.donnelly
@ 2022-04-11 21:07         ` Waiman Long
  2022-04-12 16:28           ` john.p.donnelly
  0 siblings, 1 reply; 41+ messages in thread
From: Waiman Long @ 2022-04-11 21:07 UTC (permalink / raw)
  To: john.p.donnelly, chenguanyou, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will, sashal


On 4/11/22 17:03, john.p.donnelly@oracle.com wrote:
>
>>>
>>> I have reached out to Waiman and he suggested this for our next test 
>>> pass:
>>>
>>>
>>> 1ee326196c6658 locking/rwsem: Always try to wake waiters in 
>>> out_nolock path
>>
>> Does this commit help to avoid the lockup problem?
>>
>> Commit 1ee326196c6658 fixes a potential missed wakeup problem when a 
>> reader first in the wait queue is interrupted out without acquiring 
>> the lock. It is actually not a fix for commit d257cc8cb8d5. However, 
>> this commit changes the out_nolock path behavior of writers by 
>> leaving the handoff bit set when the wait queue isn't empty. That 
>> likely makes the missed wakeup problem easier to reproduce.
>>
>> Cheers,
>> Longman
>>
>
> Hi,
>
>
> We are testing now
>
> ETA for fio soak test completion is  ~15hr from now.
>
> I wanted to share the stack traces for future reference + occurrences.
>
I am looking forward to your testing results tomorrow.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-11 21:07         ` Waiman Long
@ 2022-04-12 16:28           ` john.p.donnelly
  2022-04-12 17:04             ` Waiman Long
  2022-04-20 13:55             ` john.p.donnelly
  0 siblings, 2 replies; 41+ messages in thread
From: john.p.donnelly @ 2022-04-12 16:28 UTC (permalink / raw)
  To: Waiman Long, chenguanyou, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will, sashal

[-- Attachment #1: Type: text/plain, Size: 1473 bytes --]

On 4/11/22 4:07 PM, Waiman Long wrote:
> 
> On 4/11/22 17:03, john.p.donnelly@oracle.com wrote:
>>
>>>>
>>>> I have reached out to Waiman and he suggested this for our next test 
>>>> pass:
>>>>
>>>>
>>>> 1ee326196c6658 locking/rwsem: Always try to wake waiters in 
>>>> out_nolock path
>>>
>>> Does this commit help to avoid the lockup problem?
>>>
>>> Commit 1ee326196c6658 fixes a potential missed wakeup problem when a 
>>> reader first in the wait queue is interrupted out without acquiring 
>>> the lock. It is actually not a fix for commit d257cc8cb8d5. However, 
>>> this commit changes the out_nolock path behavior of writers by 
>>> leaving the handoff bit set when the wait queue isn't empty. That 
>>> likely makes the missed wakeup problem easier to reproduce.
>>>
>>> Cheers,
>>> Longman
>>>
>>
>> Hi,
>>
>>
>> We are testing now
>>
>> ETA for fio soak test completion is  ~15hr from now.
>>
>> I wanted to share the stack traces for future reference + occurrences.
>>
> I am looking forward to your testing results tomorrow.
> 
> Cheers,
> Longman
> 
Hi

  Our 24hr fio soak test with :

  1ee326196c6658 locking/rwsem: Always try to wake waiters in out_nolock 
path


  applied to 5.15.30  passed.

  I suggest you append  1ee326196c6658 with :


  cc: stable

   Fixes: d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling more 
consistent")


I'll leave the implementation details up to the core maintainers how to 
do that ;-)
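
(For reference, the stable-kernel-rules spell that out as trailers at the
end of the commit message, something like:

  Cc: <stable@vger.kernel.org> # 5.10.x
  Fixes: d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling more consistent")

where the "# 5.10.x" version hint is just an illustrative example.)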

...

Thank you

John.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-12 16:28           ` john.p.donnelly
@ 2022-04-12 17:04             ` Waiman Long
  2022-04-14 10:48               ` Greg KH
  2022-04-20 13:55             ` john.p.donnelly
  1 sibling, 1 reply; 41+ messages in thread
From: Waiman Long @ 2022-04-12 17:04 UTC (permalink / raw)
  To: john.p.donnelly, chenguanyou, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will, sashal, stable

On 4/12/22 12:28, john.p.donnelly@oracle.com wrote:
> On 4/11/22 4:07 PM, Waiman Long wrote:
>>
>> On 4/11/22 17:03, john.p.donnelly@oracle.com wrote:
>>>
>>>>>
>>>>> I have reached out to Waiman and he suggested this for our next 
>>>>> test pass:
>>>>>
>>>>>
>>>>> 1ee326196c6658 locking/rwsem: Always try to wake waiters in 
>>>>> out_nolock path
>>>>
>>>> Does this commit help to avoid the lockup problem?
>>>>
>>>> Commit 1ee326196c6658 fixes a potential missed wakeup problem when 
>>>> a reader first in the wait queue is interrupted out without 
>>>> acquiring the lock. It is actually not a fix for commit 
>>>> d257cc8cb8d5. However, this commit changes the out_nolock path 
>>>> behavior of writers by leaving the handoff bit set when the wait 
>>>> queue isn't empty. That likely makes the missed wakeup problem 
>>>> easier to reproduce.
>>>>
>>>> Cheers,
>>>> Longman
>>>>
>>>
>>> Hi,
>>>
>>>
>>> We are testing now
>>>
>>> ETA for fio soak test completion is  ~15hr from now.
>>>
>>> I wanted to share the stack traces for future reference + occurrences.
>>>
>> I am looking forward to your testing results tomorrow.
>>
>> Cheers,
>> Longman
>>
> Hi
>
>  Our 24hr fio soak test with :
>
>  1ee326196c6658 locking/rwsem: Always try to wake waiters in 
> out_nolock path
>
>
>  applied to 5.15.30  passed.
>
>  I suggest you append  1ee326196c6658 with :
>
>
>  cc: stable
>
>   Fixes: d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling more 
> consistent")
>
>
> I'll leave the implementation details up to the core maintainers how 
> to do that ;-)

Thanks for the test.

The patch has already been in the tip tree. It may not be easy to add a 
Fixes tag to it. Anyway, I will encourage the stable tree maintainers to
take it as it does fix a problem, as shown in your test.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-12 17:04             ` Waiman Long
@ 2022-04-14 10:48               ` Greg KH
  2022-04-14 15:18                 ` Waiman Long
  0 siblings, 1 reply; 41+ messages in thread
From: Greg KH @ 2022-04-14 10:48 UTC (permalink / raw)
  To: Waiman Long
  Cc: john.p.donnelly, chenguanyou, dave, hdanton, linux-kernel,
	mazhenhua, mingo, peterz, quic_aiquny, will, sashal, stable

On Tue, Apr 12, 2022 at 01:04:05PM -0400, Waiman Long wrote:
> The patch has already been in the tip tree. It may not be easy to add a
> Fixes tag to it. Anyway, I will encourage stable tree maintainer to take it
> as it does fix a problem as shown in your test.

I have no idea what you want me to do here.  Please be explicit...

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-14 10:48               ` Greg KH
@ 2022-04-14 15:18                 ` Waiman Long
  2022-04-14 15:42                   ` Greg KH
  0 siblings, 1 reply; 41+ messages in thread
From: Waiman Long @ 2022-04-14 15:18 UTC (permalink / raw)
  To: Greg KH
  Cc: john.p.donnelly, chenguanyou, dave, hdanton, linux-kernel,
	mazhenhua, mingo, peterz, quic_aiquny, will, sashal, stable

On 4/14/22 06:48, Greg KH wrote:
> On Tue, Apr 12, 2022 at 01:04:05PM -0400, Waiman Long wrote:
>> The patch has already been in the tip tree. It may not be easy to add a
>> Fixes tag to it. Anyway, I will encourage stable tree maintainer to take it
>> as it does fix a problem as shown in your test.
> I have no idea what you want me to do here.  Please be explicit...
>
I would like the stable trees to include commit 1ee326196c66 
("locking/rwsem: Always try to wake waiters in out_nolock path") after 
it is merged into the mainline in the v5.19 merge window. It should be 
applied to the stable trees that have incorporated the backport of 
commit d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling more 
consistent") since this commit seems to make the missed wakeup problem 
easier to show up.

Cheers,
Longman




^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-14 15:18                 ` Waiman Long
@ 2022-04-14 15:42                   ` Greg KH
  2022-04-14 15:44                     ` Waiman Long
  0 siblings, 1 reply; 41+ messages in thread
From: Greg KH @ 2022-04-14 15:42 UTC (permalink / raw)
  To: Waiman Long
  Cc: john.p.donnelly, chenguanyou, dave, hdanton, linux-kernel,
	mazhenhua, mingo, peterz, quic_aiquny, will, sashal, stable

On Thu, Apr 14, 2022 at 11:18:23AM -0400, Waiman Long wrote:
> On 4/14/22 06:48, Greg KH wrote:
> > On Tue, Apr 12, 2022 at 01:04:05PM -0400, Waiman Long wrote:
> > > The patch has already been in the tip tree. It may not be easy to add a
> > > Fixes tag to it. Anyway, I will encourage stable tree maintainer to take it
> > > as it does fix a problem as shown in your test.
> > I have no idea what you want me to do here.  Please be explicit...
> > 
> I would like the stable trees to include commit 1ee326196c66
> ("locking/rwsem: Always try to wake waiters in out_nolock path") after it is
> merged into the mainline in the v5.19 merge window. It should be applied to
> the stable trees that have incorporated the backport of commit d257cc8cb8d5
> ("locking/rwsem: Make handoff bit handling more consistent") since this
> commit seems to make the missed wakeup problem easier to show up.

Please let us know when this is merged into Linus's tree.  Nothing we
can do until then and I'm not going to remember this in a few months
anyway.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-14 15:42                   ` Greg KH
@ 2022-04-14 15:44                     ` Waiman Long
  0 siblings, 0 replies; 41+ messages in thread
From: Waiman Long @ 2022-04-14 15:44 UTC (permalink / raw)
  To: Greg KH
  Cc: john.p.donnelly, chenguanyou, dave, hdanton, linux-kernel,
	mazhenhua, mingo, peterz, quic_aiquny, will, sashal, stable

On 4/14/22 11:42, Greg KH wrote:
> On Thu, Apr 14, 2022 at 11:18:23AM -0400, Waiman Long wrote:
>> On 4/14/22 06:48, Greg KH wrote:
>>> On Tue, Apr 12, 2022 at 01:04:05PM -0400, Waiman Long wrote:
>>>> The patch has already been in the tip tree. It may not be easy to add a
>>>> Fixes tag to it. Anyway, I will encourage stable tree maintainer to take it
>>>> as it does fix a problem as shown in your test.
>>> I have no idea what you want me to do here.  Please be explicit...
>>>
>> I would like the stable trees to include commit 1ee326196c66
>> ("locking/rwsem: Always try to wake waiters in out_nolock path") after it is
>> merged into the mainline in the v5.19 merge window. It should be applied to
>> the stable trees that have incorporated the backport of commit d257cc8cb8d5
>> ("locking/rwsem: Make handoff bit handling more consistent") since this
>> commit seems to make the missed wakeup problem easier to show up.
> Please let us know when this is merged into Linus's tree.  Nothing we
> can do until then and I'm not going to remember this in a few months
> anyway.

Sure, I will remind you again after the merge happens.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-12 16:28           ` john.p.donnelly
  2022-04-12 17:04             ` Waiman Long
@ 2022-04-20 13:55             ` john.p.donnelly
  2022-04-26 20:21               ` Waiman Long
  1 sibling, 1 reply; 41+ messages in thread
From: john.p.donnelly @ 2022-04-20 13:55 UTC (permalink / raw)
  To: Waiman Long, chenguanyou, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will, sashal

On 4/12/22 11:28 AM, john.p.donnelly@oracle.com wrote:
> On 4/11/22 4:07 PM, Waiman Long wrote:
>>
>> On 4/11/22 17:03, john.p.donnelly@oracle.com wrote:
>>>
>>>>>
>>>>> I have reached out to Waiman and he suggested this for our next 
>>>>> test pass:
>>>>>
>>>>>
>>>>> 1ee326196c6658 locking/rwsem: Always try to wake waiters in 
>>>>> out_nolock path
>>>>
>>>> Does this commit help to avoid the lockup problem?
>>>>
>>>> Commit 1ee326196c6658 fixes a potential missed wakeup problem when a 
>>>> reader first in the wait queue is interrupted out without acquiring 
>>>> the lock. It is actually not a fix for commit d257cc8cb8d5. However, 
>>>> this commit changes the out_nolock path behavior of writers by 
>>>> leaving the handoff bit set when the wait queue isn't empty. That 
>>>> likely makes the missed wakeup problem easier to reproduce.
>>>>
>>>> Cheers,
>>>> Longman
>>>>
>>>
>>> Hi,
>>>
>>>
>>> We are testing now
>>>
>>> ETA for fio soak test completion is  ~15hr from now.
>>>
>>> I wanted to share the stack traces for future reference + occurrences.
>>>
>> I am looking forward to your testing results tomorrow.
>>
>> Cheers,
>> Longman
>>
> Hi
> 
>   Our 24hr fio soak test with :
> 
>   1ee326196c6658 locking/rwsem: Always try to wake waiters in out_nolock 
> path
> 
> 
>   applied to 5.15.30  passed.
> 
>   I suggest you append  1ee326196c6658 with :
> 
> 
>   cc: stable
> 
>    Fixes: d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling more 
> consistent")
> 
> 
> I'll leave the implementation details up to the core maintainers how to 
> do that ;-)
> 
> ...
> 
> Thank you
> 
> John.

Hi,


  We have observed another panic with :

  1ee326196c6658 locking/rwsem: Always try to wake waiters in out_nolock
  path

  Applied to 5.15.30 :


PID: 3789   TASK: ffff900fc409b300  CPU: 29  COMMAND: "dio/dm-0"
  #0 [fffffe00006bce50] crash_nmi_callback at ffffffff97c772c3
  #1 [fffffe00006bce58] nmi_handle at ffffffff97c40778
  #2 [fffffe00006bcea0] default_do_nmi at ffffffff988161e2
  #3 [fffffe00006bcec8] exc_nmi at ffffffff9881648d
  #4 [fffffe00006bcef0] end_repeat_nmi at ffffffff98a0153b
     [exception RIP: _raw_spin_lock_irq+35]
     RIP: ffffffff98827333  RSP: ffffa9320917fc78  RFLAGS: 00000046
     RAX: 0000000000000000  RBX: ffff900fc409b300  RCX: 0000000000000000
     RDX: 0000000000000000  RSI: 0000000000000000  RDI: 0000000000000000
     RBP: ffffa9320917fd20   R8: 0000000000000000   R9: 0000000000000000
     R10: 0000000000000000  R11: 0000000000000000  R12: ffff90006259546c
     R13: ffffa9320917fcb0  R14: ffff900062595458  R15: 0000000000000000
     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
  #5 [ffffa9320917fc78] _raw_spin_lock_irq at ffffffff98827333
  #6 [ffffa9320917fc78] rwsem_down_write_slowpath at ffffffff97d25d49
  #7 [ffffa9320917fd28] ext4_map_blocks at ffffffffc104b6dc [ext4]
  #8 [ffffa9320917fd98] ext4_convert_unwritten_extents at 
ffffffffc10369e0 [ext4]
  #9 [ffffa9320917fdf0] ext4_dio_write_end_io at ffffffffc103b2aa [ext4]
#10 [ffffa9320917fe18] iomap_dio_complete at ffffffff98013f45
#11 [ffffa9320917fe48] iomap_dio_complete_work at ffffffff98014047
#12 [ffffa9320917fe60] process_one_work at ffffffff97cd9191
#13 [ffffa9320917fea8] rescuer_thread at ffffffff97cd991b
#14 [ffffa9320917ff10] kthread at ffffffff97ce11f7
#15 [ffffa9320917ff50] ret_from_fork at ffffffff97c04cf2
crash>


The failure is observed running the "fio" test suite as a 24-hour soak
test on an LVM volume composed of four NVME devices on an Intel 72-core
server. The test cycles through a variety of file-system types.


This kernel has these commits

1ee326196c6658 locking/rwsem: Always try to wake waiters in out_nolock  path

d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling more consistent")

In earlier testing I had reverted d257cc8cb8d5 and did not observe said
panics. I still believe d257cc8cb8d5 is the root cause.


Thank you,
John.






^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-20 13:55             ` john.p.donnelly
@ 2022-04-26 20:21               ` Waiman Long
  2022-04-26 21:22                 ` john.p.donnelly
  0 siblings, 1 reply; 41+ messages in thread
From: Waiman Long @ 2022-04-26 20:21 UTC (permalink / raw)
  To: john.p.donnelly, chenguanyou, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will, sashal

On 4/20/22 09:55, john.p.donnelly@oracle.com wrote:
> On 4/12/22 11:28 AM, john.p.donnelly@oracle.com wrote:
>> On 4/11/22 4:07 PM, Waiman Long wrote:
>>>
>>> On 4/11/22 17:03, john.p.donnelly@oracle.com wrote:
>>>>
>>>>>>
>>>>>> I have reached out to Waiman and he suggested this for our next 
>>>>>> test pass:
>>>>>>
>>>>>>
>>>>>> 1ee326196c6658 locking/rwsem: Always try to wake waiters in 
>>>>>> out_nolock path
>>>>>
>>>>> Does this commit help to avoid the lockup problem?
>>>>>
>>>>> Commit 1ee326196c6658 fixes a potential missed wakeup problem when 
>>>>> a reader first in the wait queue is interrupted out without 
>>>>> acquiring the lock. It is actually not a fix for commit 
>>>>> d257cc8cb8d5. However, this commit changes the out_nolock path 
>>>>> behavior of writers by leaving the handoff bit set when the wait 
>>>>> queue isn't empty. That likely makes the missed wakeup problem 
>>>>> easier to reproduce.
>>>>>
>>>>> Cheers,
>>>>> Longman
>>>>>
>>>>
>>>> Hi,
>>>>
>>>>
>>>> We are testing now
>>>>
>>>> ETA for fio soak test completion is  ~15hr from now.
>>>>
>>>> I wanted to share the stack traces for future reference + occurrences.
>>>>
>>> I am looking forward to your testing results tomorrow.
>>>
>>> Cheers,
>>> Longman
>>>
>> Hi
>>
>>   Our 24hr fio soak test with :
>>
>>   1ee326196c6658 locking/rwsem: Always try to wake waiters in 
>> out_nolock path
>>
>>
>>   applied to 5.15.30  passed.
>>
>>   I suggest you append  1ee326196c6658 with :
>>
>>
>>   cc: stable
>>
>>    Fixes: d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling 
>> more consistent")
>>
>>
>> I'll leave the implementation details up to the core maintainers how 
>> to do that ;-)
>>
>> ...
>>
>> Thank you
>>
>> John.
>
> Hi ,
>
>
>  We have observed another panic with :
>
>  1ee326196c6658 locking/rwsem: Always try to wake waiters in out_nolock
>  path
>
>  Applied to 5.15.30 :
>
>
Sorry for the late reply as I was busy with other important tasks.

When you said panic, did you mean a system hang rather than an actual panic?


> PID: 3789   TASK: ffff900fc409b300  CPU: 29  COMMAND: "dio/dm-0"
>  #0 [fffffe00006bce50] crash_nmi_callback at ffffffff97c772c3
>  #1 [fffffe00006bce58] nmi_handle at ffffffff97c40778
>  #2 [fffffe00006bcea0] default_do_nmi at ffffffff988161e2
>  #3 [fffffe00006bcec8] exc_nmi at ffffffff9881648d
>  #4 [fffffe00006bcef0] end_repeat_nmi at ffffffff98a0153b
>     [exception RIP: _raw_spin_lock_irq+35]
>     RIP: ffffffff98827333  RSP: ffffa9320917fc78  RFLAGS: 00000046
>     RAX: 0000000000000000  RBX: ffff900fc409b300  RCX: 0000000000000000
>     RDX: 0000000000000000  RSI: 0000000000000000  RDI: 0000000000000000
>     RBP: ffffa9320917fd20   R8: 0000000000000000   R9: 0000000000000000
>     R10: 0000000000000000  R11: 0000000000000000  R12: ffff90006259546c
>     R13: ffffa9320917fcb0  R14: ffff900062595458  R15: 0000000000000000
>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
> --- <NMI exception stack> ---
>  #5 [ffffa9320917fc78] _raw_spin_lock_irq at ffffffff98827333
>  #6 [ffffa9320917fc78] rwsem_down_write_slowpath at ffffffff97d25d49
>  #7 [ffffa9320917fd28] ext4_map_blocks at ffffffffc104b6dc [ext4]
>  #8 [ffffa9320917fd98] ext4_convert_unwritten_extents at 
> ffffffffc10369e0 [ext4]
>  #9 [ffffa9320917fdf0] ext4_dio_write_end_io at ffffffffc103b2aa [ext4]
> #10 [ffffa9320917fe18] iomap_dio_complete at ffffffff98013f45
> #11 [ffffa9320917fe48] iomap_dio_complete_work at ffffffff98014047
> #12 [ffffa9320917fe60] process_one_work at ffffffff97cd9191
> #13 [ffffa9320917fea8] rescuer_thread at ffffffff97cd991b
> #14 [ffffa9320917ff10] kthread at ffffffff97ce11f7
> #15 [ffffa9320917ff50] ret_from_fork at ffffffff97c04cf2
> crash>
>
>
> The failure is observed running "fio test suite"  as a 24 hour soak 
> test  on an LVM composed of four NVME devices, Intel 72 core server. 
> The test cycles through a variety of file-system types.
>
>
> This kernel has these commits
>
> 1ee326196c6658 locking/rwsem: Always try to wake waiters in 
> out_nolock  path
>
> d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling more consistent")
>
> In earlier testing I had reverted d257cc8cb8d5 and did not observe 
> said panics.  I still feel d257cc8cb8d5 is  still the root cause.

So it is possible that 1ee326196c6658 does not completely eliminate the 
missed wakeup situation.

Regards,
Longman


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-04-26 20:21               ` Waiman Long
@ 2022-04-26 21:22                 ` john.p.donnelly
  0 siblings, 0 replies; 41+ messages in thread
From: john.p.donnelly @ 2022-04-26 21:22 UTC (permalink / raw)
  To: Waiman Long, chenguanyou, gregkh
  Cc: dave, hdanton, linux-kernel, mazhenhua, mingo, peterz,
	quic_aiquny, will, sashal

On 4/26/22 3:21 PM, Waiman Long wrote:
> On 4/20/22 09:55, john.p.donnelly@oracle.com wrote:
>> On 4/12/22 11:28 AM, john.p.donnelly@oracle.com wrote:
>>> On 4/11/22 4:07 PM, Waiman Long wrote:
>>>>
>>>> On 4/11/22 17:03, john.p.donnelly@oracle.com wrote:
>>>>>
>>>>>>>
>>>>>>> I have reached out to Waiman and he suggested this for our next 
>>>>>>> test pass:
>>>>>>>
>>>>>>>
>>>>>>> 1ee326196c6658 locking/rwsem: Always try to wake waiters in 
>>>>>>> out_nolock path
>>>>>>
>>>>>> Does this commit help to avoid the lockup problem?
>>>>>>
>>>>>> Commit 1ee326196c6658 fixes a potential missed wakeup problem when 
>>>>>> a reader first in the wait queue is interrupted out without 
>>>>>> acquiring the lock. It is actually not a fix for commit 
>>>>>> d257cc8cb8d5. However, this commit changes the out_nolock path 
>>>>>> behavior of writers by leaving the handoff bit set when the wait 
>>>>>> queue isn't empty. That likely makes the missed wakeup problem 
>>>>>> easier to reproduce.
>>>>>>
>>>>>> Cheers,
>>>>>> Longman
>>>>>>
>>>>>
>>>>> Hi,
>>>>>
>>>>>
>>>>> We are testing now
>>>>>
>>>>> ETA for fio soak test completion is  ~15hr from now.
>>>>>
>>>>> I wanted to share the stack traces for future reference + occurrences.
>>>>>
>>>> I am looking forward to your testing results tomorrow.
>>>>
>>>> Cheers,
>>>> Longman
>>>>
>>> Hi
>>>
>>>   Our 24hr fio soak test with :
>>>
>>>   1ee326196c6658 locking/rwsem: Always try to wake waiters in 
>>> out_nolock path
>>>
>>>
>>>   applied to 5.15.30  passed.
>>>
>>>   I suggest you append  1ee326196c6658 with :
>>>
>>>
>>>   cc: stable
>>>
>>>    Fixes: d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling 
>>> more consistent")
>>>
>>>
>>> I'll leave the implementation details up to the core maintainers how 
>>> to do that ;-)
>>>
>>> ...
>>>
>>> Thank you
>>>
>>> John.
>>
>> Hi ,
>>
>>
>>  We have observed another panic with :
>>
>>  1ee326196c6658 locking/rwsem: Always try to wake waiters in out_nolock
>>  path
>>
>>  Applied to 5.15.30 :
>>
>>
> Sorry for the late reply as I was busy with other important tasks.
> 
> When you said panic, you mean a system hang, not an actual panic. Right?

Hi,

Our setups turn on all of the panic-on-hung-task, panic-on-oops, and
similar features:

./sys/kernel/hardlockup_panic
./sys/kernel/hung_task_panic
./sys/kernel/max_rcu_stall_to_panic
./sys/kernel/panic
./sys/kernel/panic_on_io_nmi
./sys/kernel/panic_on_oops
./sys/kernel/panic_on_rcu_stall
./sys/kernel/panic_on_unrecovered_nmi
./sys/kernel/panic_on_warn
./sys/kernel/panic_print
./sys/kernel/softlockup_panic
./sys/kernel/unknown_nmi_panic


The machine is unusable when this occurs.
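
For anyone trying to reproduce the same fail-fast behavior, the
corresponding knobs can also be set at runtime through sysctl, e.g. via
a file in /etc/sysctl.d/ (the values below are illustrative, not the
exact ones from our setup):

kernel.hardlockup_panic = 1
kernel.hung_task_panic = 1
kernel.panic_on_oops = 1
kernel.panic_on_rcu_stall = 1
kernel.softlockup_panic = 1
kernel.panic = 10    # reboot 10 seconds after a panic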


> 
> 
>> PID: 3789   TASK: ffff900fc409b300  CPU: 29  COMMAND: "dio/dm-0"
>>  #0 [fffffe00006bce50] crash_nmi_callback at ffffffff97c772c3
>>  #1 [fffffe00006bce58] nmi_handle at ffffffff97c40778
>>  #2 [fffffe00006bcea0] default_do_nmi at ffffffff988161e2
>>  #3 [fffffe00006bcec8] exc_nmi at ffffffff9881648d
>>  #4 [fffffe00006bcef0] end_repeat_nmi at ffffffff98a0153b
>>     [exception RIP: _raw_spin_lock_irq+35]
>>     RIP: ffffffff98827333  RSP: ffffa9320917fc78  RFLAGS: 00000046
>>     RAX: 0000000000000000  RBX: ffff900fc409b300  RCX: 0000000000000000
>>     RDX: 0000000000000000  RSI: 0000000000000000  RDI: 0000000000000000
>>     RBP: ffffa9320917fd20   R8: 0000000000000000   R9: 0000000000000000
>>     R10: 0000000000000000  R11: 0000000000000000  R12: ffff90006259546c
>>     R13: ffffa9320917fcb0  R14: ffff900062595458  R15: 0000000000000000
>>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>> --- <NMI exception stack> ---
>>  #5 [ffffa9320917fc78] _raw_spin_lock_irq at ffffffff98827333
>>  #6 [ffffa9320917fc78] rwsem_down_write_slowpath at ffffffff97d25d49
>>  #7 [ffffa9320917fd28] ext4_map_blocks at ffffffffc104b6dc [ext4]
>>  #8 [ffffa9320917fd98] ext4_convert_unwritten_extents at 
>> ffffffffc10369e0 [ext4]
>>  #9 [ffffa9320917fdf0] ext4_dio_write_end_io at ffffffffc103b2aa [ext4]
>> #10 [ffffa9320917fe18] iomap_dio_complete at ffffffff98013f45
>> #11 [ffffa9320917fe48] iomap_dio_complete_work at ffffffff98014047
>> #12 [ffffa9320917fe60] process_one_work at ffffffff97cd9191
>> #13 [ffffa9320917fea8] rescuer_thread at ffffffff97cd991b
>> #14 [ffffa9320917ff10] kthread at ffffffff97ce11f7
>> #15 [ffffa9320917ff50] ret_from_fork at ffffffff97c04cf2
>> crash>
>>
>>
>> The failure is observed running "fio test suite"  as a 24 hour soak 
>> test  on an LVM composed of four NVME devices, Intel 72 core server. 
>> The test cycles through a variety of file-system types.
>>
>>
>> This kernel has these commits
>>
>> 1ee326196c6658 locking/rwsem: Always try to wake waiters in 
>> out_nolock  path
>>
>> d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling more consistent")
>>
>> In earlier testing I had reverted d257cc8cb8d5 and did not observe 
>> said panics.  I still feel d257cc8cb8d5 is  still the root cause.
> 
> So it is possible that 1ee326196c6658 does not completely eliminate the 
> missed wakeup situation.
> 
> Regards,
> Longman
> 


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2021-11-16  1:29 [PATCH v5] locking/rwsem: Make handoff bit handling more consistent Waiman Long
                   ` (4 preceding siblings ...)
  2022-02-14 16:22 ` chenguanyou
@ 2022-07-19  0:27 ` Doug Anderson
  2022-07-19 10:41   ` Hillf Danton
  5 siblings, 1 reply; 41+ messages in thread
From: Doug Anderson @ 2022-07-19  0:27 UTC (permalink / raw)
  To: Waiman Long
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, LKML, Davidlohr Bueso,
	mazhenhua, Hillf Danton, Maria Yu

Hi,

On Mon, Nov 15, 2021 at 5:38 PM Waiman Long <longman@redhat.com> wrote:
>
> There are some inconsistency in the way that the handoff bit is being
> handled in readers and writers.
>
> Firstly, when a queue head writer set the handoff bit, it will clear it
> when the writer is being killed or interrupted on its way out without
> acquiring the lock. That is not the case for a queue head reader. The
> handoff bit will simply be inherited by the next waiter.
>
> Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both
> the waiter and handoff bits are cleared if the wait queue becomes empty.
> For rwsem_down_write_slowpath(), however, the handoff bit is not checked
> and cleared if the wait queue is empty. This can potentially make the
> handoff bit set with empty wait queue.
>
> To make the handoff bit handling more consistent and robust, extract
> out handoff bit clearing code into the new rwsem_del_waiter() helper
> function.  The common function will only use atomic_long_andnot() to
> clear bits when the wait queue is empty to avoid possible race condition.
> If the first waiter with handoff bit set is killed or interrupted to
> exit the slowpath without acquiring the lock, the next waiter will
> inherit the handoff bit.
>
> While at it, simplify the trylock for loop in rwsem_down_write_slowpath()
> to make it easier to read.
>
> Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  kernel/locking/rwsem.c | 171 ++++++++++++++++++++---------------------
>  1 file changed, 85 insertions(+), 86 deletions(-)

I've been tracking down an occasional hang at reboot on my system and
I've ended up at this as the first bad commit. I will not pretend to
understand the intricacies of the rwsem implementation, but I can
describe what I saw. I have also produced a fairly small test case
that reproduces the problem rather quickly.

First, what I saw:

My system failed to fully boot up and eventually the "hung task"
detection kicked in. Many tasks in my system were hung all waiting on
the "kernfs_rwsem". No tasks actually had the semaphore--it only had
tasks waiting.

Of the tasks waiting, 3 of them were doing a down_write(). The rest
were all waiting on down_read().

2 of the tasks waiting on the down_write() were locked to CPU0. One of
these tasks was a bound kworker. Another of these tasks was a threaded
IRQ handler. The threaded IRQ handler was set to "real time" priority
because in setup_irq_thread() you can see the call to
sched_set_fifo().

At the time the hung task detector kicked in, the real time task was
actually active on a CPU. Specifically it was running in the for (;;)
loop in rwsem_down_write_slowpath(). rwsem_try_write_lock() had
clearly just returned false which meant we didn't get the lock.
Everything else was sitting in schedule().

I managed to get the real time task into kgdb and I could analyze its
state as well as the state of "sem". The real time task was _not_ the
first waiter. The kworker was the first waiter. The
"waiter.handoff_set" was set to "true" for the real time task. The
rwsem owner was OWNER_NULL.

Looking through the code and watching what was happening:

1. The function rwsem_try_write_lock() was instantly returning false
since `handoff` is set and we're not first.
2. After we get back into rwsem_down_write_slowpath() we'll see the
handoff set and we'll try to spin on the owner. There is no owner, so
this is a noop.
3. Since there's no owner, we'll go right back to the start of the loop.

So basically the real time thread (the threaded IRQ handler) was
locked to CPU0 and spinning as fast as possible. The "first waiter"
for the semaphore was blocked from running because it could only run
on CPU0 but was _not_ real time priority.
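
To make the cycle concrete, here is a tiny stand-alone model of the
livelock. Everything in it is an illustrative stub with made-up names,
not the actual rwsem code:

#include <stdbool.h>
#include <stddef.h>

static bool handoff_bit = true;    /* HANDOFF is set on the sem */
static bool we_are_first = false;  /* the RT waiter is NOT the first waiter */
static void *owner = NULL;         /* rwsem owner is OWNER_NULL */

static bool try_write_lock(void)
{
	/* Step 1: fails instantly because handoff is set and we're not first. */
	return !handoff_bit || we_are_first;
}

static void spin_on_owner(void)
{
	/* Step 2: there is no owner to spin on, so this returns immediately. */
	while (owner != NULL)
		;
}

int main(void)
{
	for (;;) {
		if (try_write_lock())
			break;
		spin_on_owner();
		/*
		 * Step 3: loop again without ever sleeping. A SCHED_FIFO
		 * task doing this starves the non-RT first waiter that is
		 * pinned to the same CPU, so nothing ever changes.
		 */
	}
	return 0;
}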

-

So all the analysis above was done on the Chrome OS 5.15 kernel
branch, which has ${SUBJECT} patch from the stable tree. The code
looks reasonably the same on mainline.

...and also, I coded up a test case that can reproduce this on
mainline. It's ugly/hacky but it gets the job done. This reproduces
the problem at the top of mainline as of commit 80e19f34c288 ("Merge
tag 'hte/for-5.19' of
git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux").

For me, I was only able to reproduce this without "lockdep" enabled.
My lockdep configs were:

CONFIG_DEBUG_RT_MUTEXES=y
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_MUTEXES=y
CONFIG_PROVE_RCU=y
CONFIG_PROVE_LOCKING=y
CONFIG_DEBUG_ATOMIC_SLEEP=y

I don't know for sure if lockdep is actually required to reproduce.

-

OK, so here's my hacky test case. In my case, I put a call to this
test function in a convenient debugfs "show" function to make it easy
to trigger. You can put it wherever.

struct test_data {
        struct rw_semaphore *rwsem;
        int i;
        bool should_sleep;
};

static int test_thread_fn(void *data)
{
        struct test_data *test_data = data;
        struct rw_semaphore *rwsem = test_data->rwsem;
        ktime_t start;

        trace_printk("Starting\n");
        start = ktime_get();
        while (ktime_to_ms(ktime_sub(ktime_get(), start)) < 60000) {
                trace_printk("About to grab\n");
                down_write(rwsem);
                trace_printk("Grabbed write %d\n", test_data->i);
                schedule();
                up_write(rwsem);
                trace_printk("Released write %d\n", test_data->i);
                if (test_data->should_sleep)
                        msleep(1);
        }
        trace_printk("Done\n");

        return 0;
}

static void test(void)
{
        static struct task_struct *t[10];
        static struct test_data test_data[10];
        static DECLARE_RWSEM(rwsem);
        int i;

        trace_printk("About to create threads\n");

        for (i = 0; i < ARRAY_SIZE(t); i++) {
                test_data[i].rwsem = &rwsem;
                test_data[i].i = i;

                if (i == ARRAY_SIZE(t) - 1) {
                        /*
                         * Last thread will be bound to CPU0 and realtime.
                         * Have it sleep to give other threads a chance to
                         * run and contend.
                         */
                        test_data[i].should_sleep = true;
                        t[i] = kthread_create_on_cpu(test_thread_fn,
                                                     &test_data[i], 0,
                                                     "test0 FIFO-%u");
                        sched_set_fifo(t[i]);
                } else if (i == ARRAY_SIZE(t) - 2) {
                        /* 2nd to last thread will be bound to CPU0 */
                        t[i] = kthread_create_on_cpu(test_thread_fn,
                                                     &test_data[i], 0,
                                                     "test0-%u");
                } else {
                        /* All other threads are just normal */
                        t[i] = kthread_create(test_thread_fn,
                                              &test_data[i], "test");
                }
                wake_up_process(t[i]);
                msleep(10);
        }
}
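
For completeness, here is one way to wire test() up so it can be
triggered from userspace. This is a hypothetical module sketch; the
debugfs directory and file names are made up for illustration:

#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static struct dentry *test_dir;

static int rwsem_test_show(struct seq_file *s, void *unused)
{
	test();		/* kick off the reproducer threads */
	seq_puts(s, "rwsem test started\n");
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(rwsem_test);

static int __init rwsem_test_init(void)
{
	test_dir = debugfs_create_dir("rwsem_test", NULL);
	debugfs_create_file("run", 0444, test_dir, NULL, &rwsem_test_fops);
	return 0;
}
module_init(rwsem_test_init);

static void __exit rwsem_test_exit(void)
{
	debugfs_remove_recursive(test_dir);
}
module_exit(rwsem_test_exit);

MODULE_LICENSE("GPL");

With that in place, "cat /sys/kernel/debug/rwsem_test/run" starts the
ten threads.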

-

With the reproducer above, I was able to:

1. Validate that on chromeos-5.15 I could revert ${SUBJECT} patch and
the problem went away.

2. I could go to mainline at exactly the commit hash of ${SUBJECT}
patch, see the problem, then revert ${SUBJECT} patch and see the
problem go away.

Thus I'm fairly confident that the problem is related to ${SUBJECT} patch.

-

I'm hoping that someone on this thread can propose a fix. I'm happy to
test, but I was hoping not to have to become an expert on the rwsem
implementation to try to figure out the proper fix.

Thanks!

-Doug

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-07-19  0:27 ` Doug Anderson
@ 2022-07-19 10:41   ` Hillf Danton
  2022-07-19 15:30     ` Doug Anderson
  0 siblings, 1 reply; 41+ messages in thread
From: Hillf Danton @ 2022-07-19 10:41 UTC (permalink / raw)
  To: Doug Anderson
  Cc: Waiman Long, Peter Zijlstra, Will Deacon, Davidlohr Bueso,
	linux-mm, LKML

On Mon, 18 Jul 2022 17:27:28 -0700 Doug Anderson <dianders@chromium.org> wrote:
> 
> I've been tracking down an occasional hang at reboot on my system and
> I've ended up at this as the first bad commit. I will not pretend to
> understand the intricacies of the rwsem implementation, but I can
> describe what I saw. I have also produced a fairly small test case
> that reproduces the problem rather quickly.
> 
> First, what I saw:
> 
> My system failed to fully boot up and eventually the "hung task"
> detection kicked in. Many tasks in my system were hung all waiting on
> the "kernfs_rwsem". No tasks actually had the semaphore--it only had
> tasks waiting.
> 
> Of the tasks waiting, 3 of them were doing a down_write(). The rest
> were all waiting on down_read().
> 
> 2 of the tasks waiting on the down_write() were locked to CPU0. One of
> these tasks was a bound kworker. Another of these tasks was a threaded
> IRQ handler. The threaded IRQ handler was set to "real time" priority
> because in setup_irq_thread() you can see the call to
> sched_set_fifo().
> 
> At the time the hung task detector kicked in, the real time task was
> actually active on a CPU. Specifically it was running in the for (;;)
> loop in rwsem_down_write_slowpath(). rwsem_try_write_lock() had
> clearly just returned false which meant we didn't get the lock.
> Everything else was sitting in schedule().
> 
> I managed to get the real time task into kgdb and I could analyze its
> state as well as the state of "sem". The real time task was _not_ the
> first waiter. The kworker was the first waiter. The
> "waiter.handoff_set" was set to "true" for the real time task. The
> rwsem owner was OWNER_NULL.
> 
> Looking through the code and watching what was happening.
> 
> 1. The function rwsem_try_write_lock() was instantly returning false
> since `handoff` is set and we're not first.
> 2. After we get back into rwsem_down_write_slowpath() we'll see the
> handoff set and we'll try to spin on the owner. There is no owner, so
> this is a noop.
> 3. Since there's no owner, we'll go right back to the start of the loop.
> 
> So basically the real time thread (the threaded IRQ handler) was
> locked to CPU0 and spinning as fast as possible. The "first waiter"
> for the semaphore was blocked from running because it could only run
> on CPU0 but was _not_ real time priority.
> 
> -
> 
> So all the analysis above was done on the Chrome OS 5.15 kernel
> branch, which has ${SUBJECT} patch from the stable tree. The code
> looks reasonably the same on mainline.
> 
> ...and also, I coded up a test case that can reproduce this on
> mainline. It's ugly/hacky but it gets the job done. This reproduces
> the problem at the top of mainline as of commit 80e19f34c288 ("Merge
> tag 'hte/for-5.19' of
> git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux").
> 
> For me, I was only able to reproduce this without "lockdep" enabled.
> My lockdep configs were:
> 
> CONFIG_DEBUG_RT_MUTEXES=y
> CONFIG_DEBUG_SPINLOCK=y
> CONFIG_DEBUG_MUTEXES=y
> CONFIG_PROVE_RCU=y
> CONFIG_PROVE_LOCKING=y
> CONFIG_DEBUG_ATOMIC_SLEEP=y
> 
> I don't know for sure if lockdep is actually required to reproduce.
> 
> -
> 
> OK, so here's my hacky test case. In my case, I put a call to this
> test function in a convenient debugfs "show" function to make it easy
> to trigger. You can put it wherever.
> 
> struct test_data {
>         struct rw_semaphore *rwsem;
>         int i;
>         bool should_sleep;
> };
> 
> static int test_thread_fn(void *data)
> {
>         struct test_data *test_data = data;
>         struct rw_semaphore *rwsem = test_data->rwsem;
>         ktime_t start;
> 
>         trace_printk("Starting\n");
>         start = ktime_get();
>         while (ktime_to_ms(ktime_sub(ktime_get(), start)) < 60000) {
>                 trace_printk("About to grab\n");
>                 down_write(rwsem);
>                 trace_printk("Grabbed write %d\n", test_data->i);
>                 schedule();
>                 up_write(rwsem);
>                 trace_printk("Released write %d\n", test_data->i);
>                 if (test_data->should_sleep)
>                         msleep(1);
>         }
>         trace_printk("Done\n");
> 
>         return 0;
> }
> 
> static void test(void)
> {
>         static struct task_struct *t[10];
>         static struct test_data test_data[10];
>         static DECLARE_RWSEM(rwsem);
>         int i;
> 
>         trace_printk("About to create threads\n");
> 
>         for (i = 0; i < ARRAY_SIZE(t); i++) {
>                 test_data[i].rwsem = &rwsem;
>                 test_data[i].i = i;
> 
>                 if (i == ARRAY_SIZE(t) - 1) {
>                         /*
>                          * Last thread will be bound to CPU0 and realtime.
>                          * Have it sleep to give other threads a chance to
>                          * run and contend.
>                          */
>                         test_data[i].should_sleep = true;
>                         t[i] = kthread_create_on_cpu(test_thread_fn,
>                                                      &test_data[i], 0,
>                                                      "test0 FIFO-%u");
>                         sched_set_fifo(t[i]);
>                 } else if (i == ARRAY_SIZE(t) - 2) {
>                         /* 2nd to last thread will be bound to CPU0 */
>                         t[i] = kthread_create_on_cpu(test_thread_fn,
>                                                      &test_data[i], 0,
>                                                      "test0-%u");
>                 } else {
>                         /* All other threads are just normal */
>                         t[i] = kthread_create(test_thread_fn,
>                                               &test_data[i], "test");
>                 }
>                 wake_up_process(t[i]);
>                 msleep(10);
>         }
> }
> 
> -
> 
> With the reproducer above, I was able to:
> 
> 1. Validate that on chromeos-5.15 I could revert ${SUBJECT} patch and
> the problem went away.
> 
> 2. I could go to mainline at exactly the commit hash of ${SUBJECT}
> patch, see the problem, then revert ${SUBJECT} patch and see the
> problem go away.
> 
> Thus I'm fairly confident that the problem is related to ${SUBJECT} patch.
> 
> -
> 
> I'm hoping that someone on this thread can propose a fix. I'm happy to
> test, but I was hoping not to have to become an expert on the rwsem
> implementation to try to figure out the proper fix.
> 

See if it makes sense to only allow the first waiter to spin on owner.

Hillf

--- mainline/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -337,7 +337,7 @@ struct rwsem_waiter {
 	unsigned long timeout;
 
 	/* Writer only, not initialized in reader */
-	bool handoff_set;
+	bool handoff_set, first;
 };
 #define rwsem_first_waiter(sem) \
 	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -604,6 +604,7 @@ static inline bool rwsem_try_write_lock(
 
 	lockdep_assert_held(&sem->wait_lock);
 
+	waiter->first = first;
 	count = atomic_long_read(&sem->count);
 	do {
 		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
@@ -1114,6 +1115,7 @@ rwsem_down_write_slowpath(struct rw_sema
 	waiter.type = RWSEM_WAITING_FOR_WRITE;
 	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
 	waiter.handoff_set = false;
+	waiter.first = false;
 
 	raw_spin_lock_irq(&sem->wait_lock);
 	rwsem_add_waiter(sem, &waiter);
@@ -1158,7 +1160,7 @@ rwsem_down_write_slowpath(struct rw_sema
 		 * In this case, we attempt to acquire the lock again
 		 * without sleeping.
 		 */
-		if (waiter.handoff_set) {
+		if (waiter.handoff_set && waiter.first) {
 			enum owner_state owner_state;
 
 			preempt_disable();


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-07-19 10:41   ` Hillf Danton
@ 2022-07-19 15:30     ` Doug Anderson
  2022-07-22 11:55       ` Hillf Danton
  0 siblings, 1 reply; 41+ messages in thread
From: Doug Anderson @ 2022-07-19 15:30 UTC (permalink / raw)
  To: Hillf Danton
  Cc: Waiman Long, Peter Zijlstra, Will Deacon, Davidlohr Bueso,
	Linux Memory Management List, LKML

Hi,

On Tue, Jul 19, 2022 at 3:41 AM Hillf Danton <hdanton@sina.com> wrote:
>
> On Mon, 18 Jul 2022 17:27:28 -0700 Doug Anderson <dianders@chromium.org> wrote:
> >
> > I've been tracking down an occasional hang at reboot on my system and
> > I've ended up at this as the first bad commit. I will not pretend to
> > understand the intricacies of the rwsem implementation, but I can
> > describe what I saw. I have also produced a fairly small test case
> > that reproduces the problem rather quickly.
> >
> > First, what I saw:
> >
> > My system failed to fully boot up and eventually the "hung task"
> > detection kicked in. Many tasks in my system were hung all waiting on
> > the "kernfs_rwsem". No tasks actually had the semaphore--it only had
> > tasks waiting.
> >
> > Of the tasks waiting, 3 of them were doing a down_write(). The rest
> > were all waiting on down_read().
> >
> > 2 of the tasks waiting on the down_write() were locked to CPU0. One of
> > these tasks was a bound kworker. Another of these tasks was a threaded
> > IRQ handler. The threaded IRQ handler was set to "real time" priority
> > because in setup_irq_thread() you can see the call to
> > sched_set_fifo().
> >
> > At the time the hung task detector kicked in, the real time task was
> > actually active on a CPU. Specifically it was running in the for (;;)
> > loop in rwsem_down_write_slowpath(). rwsem_try_write_lock() had
> > clearly just returned false which meant we didn't get the lock.
> > Everything else was sitting in schedule().
> >
> > I managed to get the real time task into kgdb and I could analyze its
> > state as well as the state of "sem". The real time task was _not_ the
> > first waiter. The kworker was the first waiter. The
> > "waiter.handoff_set" was set to "true" for the real time task. The
> > rwsem owner was OWNER_NULL.
> >
> > Looking through the code and watching what was happening.
> >
> > 1. The function rwsem_try_write_lock() was instantly returning false
> > since `handoff` is set and we're not first.
> > 2. After we get back into rwsem_down_write_slowpath() we'll see the
> > handoff set and we'll try to spin on the owner. There is no owner, so
> > this is a noop.
> > 3. Since there's no owner, we'll go right back to the start of the loop.
> >
> > So basically the real time thread (the threaded IRQ handler) was
> > locked to CPU0 and spinning as fast as possible. The "first waiter"
> > for the semaphore was blocked from running because it could only run
> > on CPU0 but was _not_ real time priority.
> >
> > -
> >
> > So all the analysis above was done on the Chrome OS 5.15 kernel
> > branch, which has ${SUBJECT} patch from the stable tree. The code
> > looks reasonably the same on mainline.
> >
> > ...and also, I coded up a test case that can reproduce this on
> > mainline. It's ugly/hacky but it gets the job done. This reproduces
> > the problem at the top of mainline as of commit 80e19f34c288 ("Merge
> > tag 'hte/for-5.19' of
> > git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux").
> >
> > For me, I was only able to reproduce this without "lockdep" enabled.
> > My lockdep configs were:
> >
> > CONFIG_DEBUG_RT_MUTEXES=y
> > CONFIG_DEBUG_SPINLOCK=y
> > CONFIG_DEBUG_MUTEXES=y
> > CONFIG_PROVE_RCU=y
> > CONFIG_PROVE_LOCKING=y
> > CONFIG_DEBUG_ATOMIC_SLEEP=y
> >
> > I don't know for sure if lockdep is actually required to reproduce.
> >
> > -
> >
> > OK, so here's my hacky test case. In my case, I put a call to this
> > test function in a convenient debugfs "show" function to make it easy
> > to trigger. You can put it wherever.
> >
> > struct test_data {
> >         struct rw_semaphore *rwsem;
> >         int i;
> >         bool should_sleep;
> > };
> >
> > static int test_thread_fn(void *data)
> > {
> >         struct test_data *test_data = data;
> >         struct rw_semaphore *rwsem = test_data->rwsem;
> >         ktime_t start;
> >
> >         trace_printk("Starting\n");
> >         start = ktime_get();
> >         while (ktime_to_ms(ktime_sub(ktime_get(), start)) < 60000) {
> >                 trace_printk("About to grab\n");
> >                 down_write(rwsem);
> >                 trace_printk("Grabbed write %d\n", test_data->i);
> >                 schedule();
> >                 up_write(rwsem);
> >                 trace_printk("Released write %d\n", test_data->i);
> >                 if (test_data->should_sleep)
> >                         msleep(1);
> >         }
> >         trace_printk("Done\n");
> >
> >         return 0;
> > }
> >
> > static void test(void)
> > {
> >         static struct task_struct *t[10];
> >         static struct test_data test_data[10];
> >         static DECLARE_RWSEM(rwsem);
> >         int i;
> >
> >         trace_printk("About to create threads\n");
> >
> >         for (i = 0; i < ARRAY_SIZE(t); i++) {
> >                 test_data[i].rwsem = &rwsem;
> >                 test_data[i].i = i;
> >
> >                 if (i == ARRAY_SIZE(t) - 1) {
> >                         /*
> >                          * Last thread will be bound to CPU0 and realtime.
> >                          * Have it sleep to give other threads a chance to
> >                          * run and contend.
> >                          */
> >                         test_data[i].should_sleep = true;
> >                         t[i] = kthread_create_on_cpu(test_thread_fn,
> >                                                      &test_data[i], 0,
> >                                                      "test0 FIFO-%u");
> >                         sched_set_fifo(t[i]);
> >                 } else if (i == ARRAY_SIZE(t) - 2) {
> >                         /* 2nd to last thread will be bound to CPU0 */
> >                         t[i] = kthread_create_on_cpu(test_thread_fn,
> >                                                      &test_data[i], 0,
> >                                                      "test0-%u");
> >                 } else {
> >                         /* All other threads are just normal */
> >                         t[i] = kthread_create(test_thread_fn,
> >                                               &test_data[i], "test");
> >                 }
> >                 wake_up_process(t[i]);
> >                 msleep(10);
> >         }
> > }
> >
> > -
> >
> > With the reproducer above, I was able to:
> >
> > 1. Validate that on chromeos-5.15 I could revert ${SUBJECT} patch and
> > the problem went away.
> >
> > 2. I could go to mainline at exactly the commit hash of ${SUBJECT}
> > patch, see the problem, then revert ${SUBJECT} patch and see the
> > problem go away.
> >
> > Thus I'm fairly confident that the problem is related to ${SUBJECT} patch.
> >
> > -
> >
> > I'm hoping that someone on this thread can propose a fix. I'm happy to
> > test, but I was hoping not to have to become an expert on the rwsem
> > implementation to try to figure out the proper fix.
> >
>
> See if it makes sense to only allow the first waiter to spin on owner.
>
> Hillf
>
> --- mainline/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -337,7 +337,7 @@ struct rwsem_waiter {
>         unsigned long timeout;
>
>         /* Writer only, not initialized in reader */
> -       bool handoff_set;
> +       bool handoff_set, first;
>  };
>  #define rwsem_first_waiter(sem) \
>         list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
> @@ -604,6 +604,7 @@ static inline bool rwsem_try_write_lock(
>
>         lockdep_assert_held(&sem->wait_lock);
>
> +       waiter->first = first;
>         count = atomic_long_read(&sem->count);
>         do {
>                 bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
> @@ -1114,6 +1115,7 @@ rwsem_down_write_slowpath(struct rw_sema
>         waiter.type = RWSEM_WAITING_FOR_WRITE;
>         waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
>         waiter.handoff_set = false;
> +       waiter.first = false;
>
>         raw_spin_lock_irq(&sem->wait_lock);
>         rwsem_add_waiter(sem, &waiter);
> @@ -1158,7 +1160,7 @@ rwsem_down_write_slowpath(struct rw_sema
>                  * In this case, we attempt to acquire the lock again
>                  * without sleeping.
>                  */
> -               if (waiter.handoff_set) {
> +               if (waiter.handoff_set && waiter.first) {

Your patch does fix my test case, so FWIW:

Tested-by: Douglas Anderson <dianders@chromium.org>

I haven't done any stress testing other than my test case, though, so
I can't speak to whether there might be any other unintended issues.


-Doug

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-07-19 15:30     ` Doug Anderson
@ 2022-07-22 11:55       ` Hillf Danton
  2022-07-22 14:02         ` Doug Anderson
  0 siblings, 1 reply; 41+ messages in thread
From: Hillf Danton @ 2022-07-22 11:55 UTC (permalink / raw)
  To: Doug Anderson
  Cc: Waiman Long, Peter Zijlstra, Will Deacon, Davidlohr Bueso, MM, LKML

On Tue, 19 Jul 2022 08:30:02 -0700 Doug Anderson wrote:
> 
> I haven't done any stress testing other than my test case, though, so
> I can't speak to whether there might be any other unintended issues.

The diff below is prepared for any regressions I can imagine in stress
tests, adding changes to both the read and write acquirer slow paths.

On the read side, make lock stealing more aggressive; on the other hand,
write acquirers try to set HANDOFF after a RWSEM_WAIT_TIMEOUT nap to
force the reader acquirers to take the slow path.

Hillf

--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -992,13 +992,7 @@ rwsem_down_read_slowpath(struct rw_semap
 	struct rwsem_waiter waiter;
 	DEFINE_WAKE_Q(wake_q);
 
-	/*
-	 * To prevent a constant stream of readers from starving a sleeping
-	 * waiter, don't attempt optimistic lock stealing if the lock is
-	 * currently owned by readers.
-	 */
-	if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
-	    (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
+	if (WARN_ON_ONCE(count & RWSEM_FLAG_READFAIL))
 		goto queue;
 
 	/*
@@ -1169,7 +1163,11 @@ rwsem_down_write_slowpath(struct rw_sema
 				goto trylock_again;
 		}
 
-		schedule();
+		if (RWSEM_FLAG_HANDOFF & atomic_long_read(&sem->count))
+			schedule();
+		else
+			schedule_timeout(1 + RWSEM_WAIT_TIMEOUT);
+
 		lockevent_inc(rwsem_sleep_writer);
 		set_current_state(state);
 trylock_again:
--


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-07-22 11:55       ` Hillf Danton
@ 2022-07-22 14:02         ` Doug Anderson
  2022-07-23  0:17           ` Hillf Danton
  0 siblings, 1 reply; 41+ messages in thread
From: Doug Anderson @ 2022-07-22 14:02 UTC (permalink / raw)
  To: Hillf Danton
  Cc: Waiman Long, Peter Zijlstra, Will Deacon, Davidlohr Bueso, MM, LKML

Hi,

On Fri, Jul 22, 2022 at 4:55 AM Hillf Danton <hdanton@sina.com> wrote:
>
> On Tue, 19 Jul 2022 08:30:02 -0700 Doug Anderson wrote:
> >
> > I haven't done any stress testing other than my test case, though, so
> > I can't speak to whether there might be any other unintended issues.
>
> The diff below is prepared for any regressions I can imagine in stress
> tests by adding changes to both read and write acquirer slow pathes.
>
> On the read side, make lock stealing more aggressive; on the other hand,
> write acquirers try to set HANDOFF after a RWSEM_WAIT_TIMEOUT nap to
> force the reader acquirers to take the slow path.
>
> Hillf
>
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -992,13 +992,7 @@ rwsem_down_read_slowpath(struct rw_semap
>         struct rwsem_waiter waiter;
>         DEFINE_WAKE_Q(wake_q);
>
> -       /*
> -        * To prevent a constant stream of readers from starving a sleeping
> -        * waiter, don't attempt optimistic lock stealing if the lock is
> -        * currently owned by readers.
> -        */
> -       if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
> -           (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
> +       if (WARN_ON_ONCE(count & RWSEM_FLAG_READFAIL))
>                 goto queue;
>
>         /*
> @@ -1169,7 +1163,11 @@ rwsem_down_write_slowpath(struct rw_sema
>                                 goto trylock_again;
>                 }
>
> -               schedule();
> +               if (RWSEM_FLAG_HANDOFF & atomic_long_read(&sem->count))
> +                       schedule();
> +               else
> +                       schedule_timeout(1 + RWSEM_WAIT_TIMEOUT);
> +
>                 lockevent_inc(rwsem_sleep_writer);
>                 set_current_state(state);
>  trylock_again:
> --

Thanks! I added this diff to your previous diff and my simple test
still passes and I don't see your WARN_ON triggered.

How do we move forward? Are you going to officially submit a patch
with both of your diffs squashed together? Are we waiting for
additional review from someone?

-Doug

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-07-22 14:02         ` Doug Anderson
@ 2022-07-23  0:17           ` Hillf Danton
  2022-07-23  1:27             ` Hillf Danton
  2022-08-05 17:14             ` Doug Anderson
  0 siblings, 2 replies; 41+ messages in thread
From: Hillf Danton @ 2022-07-23  0:17 UTC (permalink / raw)
  To: Doug Anderson
  Cc: Waiman Long, Peter Zijlstra, Will Deacon, Davidlohr Bueso, MM, LKML

On Fri, 22 Jul 2022 07:02:42 -0700 Doug Anderson wrote:
> 
> Thanks! I added this diff to your previous diff and my simple test
> still passes and I don't see your WARN_ON triggered.

Thanks!
> 
> How do we move forward? Are you going to officially submit a patch
> with both of your diffs squashed together? Are we waiting for
> additional review from someone?

Given it is not unusual for us to miss something important, let's take
a RWSEM_WAIT_TIMEOUT nap or two now.

Hillf


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-07-23  0:17           ` Hillf Danton
@ 2022-07-23  1:27             ` Hillf Danton
  2022-08-05 17:14             ` Doug Anderson
  1 sibling, 0 replies; 41+ messages in thread
From: Hillf Danton @ 2022-07-23  1:27 UTC (permalink / raw)
  To: Hillf Danton
  Cc: Doug Anderson, Waiman Long, MM, Peter Zijlstra, Will Deacon,
	Davidlohr Bueso, Dmitry Vyukov, Eric Dumazet, Andrew Morton,
	LKML

On Sat, 23 Jul 2022 08:17:13 +0800 Hillf Danton wrote:
> On Fri, 22 Jul 2022 07:02:42 -0700 Doug Anderson wrote:
> > 
> > Thanks! I added this diff to your previous diff and my simple test
> > still passes and I don't see your WARN_ON triggered.
> 
> Thanks!
> > 
> > How do we move forward? Are you going to officially submit a patch
> > with both of your diffs squashed together? Are we waiting for
> > additional review from someone?
> 
> Given it is not unusual for us to miss anything important, lets take
> a RWSEM_WAIT_TIMEOUT nap now or two.

Meanwhile, based on what you discovered in rwsem, I feel good about
nominating Doug Anderson for the 2022 Google OSPB Award, as one of the
winners in 2020.

> 
> Hillf


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-07-23  0:17           ` Hillf Danton
  2022-07-23  1:27             ` Hillf Danton
@ 2022-08-05 17:14             ` Doug Anderson
  2022-08-05 19:02               ` Waiman Long
  1 sibling, 1 reply; 41+ messages in thread
From: Doug Anderson @ 2022-08-05 17:14 UTC (permalink / raw)
  To: Hillf Danton
  Cc: Waiman Long, Peter Zijlstra, Will Deacon, Davidlohr Bueso, MM, LKML

Hi,

On Fri, Jul 22, 2022 at 5:17 PM Hillf Danton <hdanton@sina.com> wrote:
>
> On Fri, 22 Jul 2022 07:02:42 -0700 Doug Anderson wrote:
> >
> > Thanks! I added this diff to your previous diff and my simple test
> > still passes and I don't see your WARN_ON triggered.
>
> Thanks!
> >
> > How do we move forward? Are you going to officially submit a patch
> > with both of your diffs squashed together? Are we waiting for
> > additional review from someone?
>
> Given it is not unusual for us to miss anything important, lets take
> a RWSEM_WAIT_TIMEOUT nap now or two.

It appears that another fix has landed in the meantime. Commit
6eebd5fb2083 ("locking/rwsem: Allow slowpath writer to ignore handoff
bit if not set by first waiter").

...unfortunately, with that patch my test case still hangs. :(

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-08-05 17:14             ` Doug Anderson
@ 2022-08-05 19:02               ` Waiman Long
  2022-08-05 19:16                 ` Doug Anderson
  0 siblings, 1 reply; 41+ messages in thread
From: Waiman Long @ 2022-08-05 19:02 UTC (permalink / raw)
  To: Doug Anderson, Hillf Danton
  Cc: Peter Zijlstra, Will Deacon, Davidlohr Bueso, MM, LKML


On 8/5/22 13:14, Doug Anderson wrote:
> Hi,
>
> On Fri, Jul 22, 2022 at 5:17 PM Hillf Danton <hdanton@sina.com> wrote:
>> On Fri, 22 Jul 2022 07:02:42 -0700 Doug Anderson wrote:
>>> Thanks! I added this diff to your previous diff and my simple test
>>> still passes and I don't see your WARN_ON triggered.
>> Thanks!
>>> How do we move forward? Are you going to officially submit a patch
>>> with both of your diffs squashed together? Are we waiting for
>>> additional review from someone?
>> Given it is not unusual for us to miss anything important, lets take
>> a RWSEM_WAIT_TIMEOUT nap now or two.
> It appears that another fix has landed in the meantime. Commit
> 6eebd5fb2083 ("locking/rwsem: Allow slowpath writer to ignore handoff
> bit if not set by first waiter").
>
> ...unfortunately with that patch my test cases still hangs. :(

The aim of commit 6eebd5fb2083 ("locking/rwsem: Allow slowpath writer to 
ignore handoff bit if not set by first waiter") is to restore slowpath 
writer behavior to be the same as before commit d257cc8cb8d5 
("locking/rwsem: Make handoff bit handling more consistent").

If the hang still exists, there may be another cause for it. Could you
share more information about what the test case is doing and any kernel
splat that you have?

Thanks,
Longman



^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-08-05 19:02               ` Waiman Long
@ 2022-08-05 19:16                 ` Doug Anderson
  2022-08-30 16:18                   ` Doug Anderson
  0 siblings, 1 reply; 41+ messages in thread
From: Doug Anderson @ 2022-08-05 19:16 UTC (permalink / raw)
  To: Waiman Long
  Cc: Hillf Danton, Peter Zijlstra, Will Deacon, Davidlohr Bueso, MM, LKML

Hi,

On Fri, Aug 5, 2022 at 12:02 PM Waiman Long <longman@redhat.com> wrote:
>
>
> On 8/5/22 13:14, Doug Anderson wrote:
> > Hi,
> >
> > On Fri, Jul 22, 2022 at 5:17 PM Hillf Danton <hdanton@sina.com> wrote:
> >> On Fri, 22 Jul 2022 07:02:42 -0700 Doug Anderson wrote:
> >>> Thanks! I added this diff to your previous diff and my simple test
> >>> still passes and I don't see your WARN_ON triggered.
> >> Thanks!
> >>> How do we move forward? Are you going to officially submit a patch
> >>> with both of your diffs squashed together? Are we waiting for
> >>> additional review from someone?
> >> Given it is not unusual for us to miss anything important, lets take
> >> a RWSEM_WAIT_TIMEOUT nap now or two.
> > It appears that another fix has landed in the meantime. Commit
> > 6eebd5fb2083 ("locking/rwsem: Allow slowpath writer to ignore handoff
> > bit if not set by first waiter").
> >
> > ...unfortunately with that patch my test cases still hangs. :(
>
> The aim of commit 6eebd5fb2083 ("locking/rwsem: Allow slowpath writer to
> ignore handoff bit if not set by first waiter") is to restore slowpath
> writer behavior to be the same as before commit d257cc8cb8d5
> ("locking/rwsem: Make handoff bit handling more consistent").

Ah, OK. I just saw another fix to the same commit and assumed that
perhaps it was intended to address the same issue.


> If the hang still exists, there may be other cause for it. Could you
> share more information about what the test case is doing and any kernel
> splat that you have?

It's all described in my earlier reply including my full test case:

https://lore.kernel.org/r/CAD=FV=URCo5xv3k3jWbxV1uRkUU5k6bcnuB1puZhxayEyVc6-A@mail.gmail.com

Previously I tested Hillf's patches and they fixed it for me.

-Doug

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-08-05 19:16                 ` Doug Anderson
@ 2022-08-30 16:18                   ` Doug Anderson
  2022-08-31 11:08                     ` Hillf Danton
  0 siblings, 1 reply; 41+ messages in thread
From: Doug Anderson @ 2022-08-30 16:18 UTC (permalink / raw)
  To: Waiman Long, Hillf Danton
  Cc: Peter Zijlstra, Will Deacon, Davidlohr Bueso, MM, LKML

Hi,

On Fri, Aug 5, 2022 at 12:16 PM Doug Anderson <dianders@chromium.org> wrote:
>
> Hi,
>
> On Fri, Aug 5, 2022 at 12:02 PM Waiman Long <longman@redhat.com> wrote:
> >
> >
> > On 8/5/22 13:14, Doug Anderson wrote:
> > > Hi,
> > >
> > > On Fri, Jul 22, 2022 at 5:17 PM Hillf Danton <hdanton@sina.com> wrote:
> > >> On Fri, 22 Jul 2022 07:02:42 -0700 Doug Anderson wrote:
> > >>> Thanks! I added this diff to your previous diff and my simple test
> > >>> still passes and I don't see your WARN_ON triggered.
> > >> Thanks!
> > >>> How do we move forward? Are you going to officially submit a patch
> > >>> with both of your diffs squashed together? Are we waiting for
> > >>> additional review from someone?
> > >> Given it is not unusual for us to miss anything important, lets take
> > >> a RWSEM_WAIT_TIMEOUT nap now or two.
> > > It appears that another fix has landed in the meantime. Commit
> > > 6eebd5fb2083 ("locking/rwsem: Allow slowpath writer to ignore handoff
> > > bit if not set by first waiter").
> > >
> > > ...unfortunately with that patch my test cases still hangs. :(
> >
> > The aim of commit 6eebd5fb2083 ("locking/rwsem: Allow slowpath writer to
> > ignore handoff bit if not set by first waiter") is to restore slowpath
> > writer behavior to be the same as before commit d257cc8cb8d5
> > ("locking/rwsem: Make handoff bit handling more consistent").
>
> Ah, OK. I just saw another fix to the same commit and assumed that
> perhaps it was intended to address the same issue.
>
>
> > If the hang still exists, there may be other cause for it. Could you
> > share more information about what the test case is doing and any kernel
> > splat that you have?
>
> It's all described in my earlier reply including my full test case:
>
> https://lore.kernel.org/r/CAD=FV=URCo5xv3k3jWbxV1uRkUU5k6bcnuB1puZhxayEyVc6-A@mail.gmail.com
>
> Previously I tested Hillf's patches and they fixed it for me.

Hillf: do you have any plan here for your patches?

I spent some time re-testing this today atop mainline, specifically
atop commit dcf8e5633e2e ("tracing: Define the is_signed_type() macro
once"). Some notes:

1. I can confirm that my test case still reproduces a hang on
mainline, though it seems a bit harder to reproduce (sometimes I have
to run for a few minutes). I didn't spend lots of time confirming that
the hang is exactly the same, but the same testcase reproduces it so
it probably is. If it's important I can drop into kgdb and dig around
to confirm.

2. Blindly applying the first of your proposed patches (resolving the
trivial merge conflict), or both of them, no longer fixes the hang on
mainline.

3. Reverting Waiman's commit 6eebd5fb2083 ("locking/rwsem: Allow
slowpath writer to ignore handoff bit if not set by first waiter") and
then applying your two fixes _does_ still fix the hang on mainline. I
ran for 20 minutes w/ no reproduction.

So it seems like Waiman's recent commit interacts with your fix in a bad way. :(

-Doug

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v5] locking/rwsem: Make handoff bit handling more consistent
  2022-08-30 16:18                   ` Doug Anderson
@ 2022-08-31 11:08                     ` Hillf Danton
  0 siblings, 0 replies; 41+ messages in thread
From: Hillf Danton @ 2022-08-31 11:08 UTC (permalink / raw)
  To: Doug Anderson
  Cc: Waiman Long, Peter Zijlstra, Will Deacon, Davidlohr Bueso, MM, LKML

On Tue, 30 Aug 2022 09:18:09 -0700 Doug Anderson <dianders@chromium.org> wrote:
> On Fri, Aug 5, 2022 at 12:16 PM Doug Anderson <dianders@chromium.org> wrote:
> > On Fri, Aug 5, 2022 at 12:02 PM Waiman Long <longman@redhat.com> wrote:
> > > On 8/5/22 13:14, Doug Anderson wrote:
> > > > Hi,
> > > >
> > > > On Fri, Jul 22, 2022 at 5:17 PM Hillf Danton <hdanton@sina.com> wrote:
> > > >> On Fri, 22 Jul 2022 07:02:42 -0700 Doug Anderson wrote:
> > > >>> Thanks! I added this diff to your previous diff and my simple test
> > > >>> still passes and I don't see your WARN_ON triggered.
> > > >> Thanks!
> > > >>> How do we move forward? Are you going to officially submit a patch
> > > >>> with both of your diffs squashed together? Are we waiting for
> > > >>> additional review from someone?
> > > >> Given it is not unusual for us to miss anything important, lets take
> > > >> a RWSEM_WAIT_TIMEOUT nap now or two.
> > > > It appears that another fix has landed in the meantime. Commit
> > > > 6eebd5fb2083 ("locking/rwsem: Allow slowpath writer to ignore handoff
> > > > bit if not set by first waiter").
> > > >
> > > > ...unfortunately with that patch my test cases still hangs. :(
> > >
> > > The aim of commit 6eebd5fb2083 ("locking/rwsem: Allow slowpath writer to
> > > ignore handoff bit if not set by first waiter") is to restore slowpath
> > > writer behavior to be the same as before commit d257cc8cb8d5
> > > ("locking/rwsem: Make handoff bit handling more consistent").
> >
> > Ah, OK. I just saw another fix to the same commit and assumed that
> > perhaps it was intended to address the same issue.
> >
> >
> > > If the hang still exists, there may be other cause for it. Could you
> > > share more information about what the test case is doing and any kernel
> > > splat that you have?
> >
> > It's all described in my earlier reply including my full test case:
> >
> > https://lore.kernel.org/r/CAD=FV=URCo5xv3k3jWbxV1uRkUU5k6bcnuB1puZhxayEyVc6-A@mail.gmail.com
> >
> > Previously I tested Hillf's patches and they fixed it for me.
> 
> Hillf: do you have any plan here for your patches?

Fixing the hang would take more patience, in addition to the ticks and
minutes that are always good at fixing things, because those in charge
of locking know the code better and more deeply, and always have more
issues to fix on a tighter time budget.

The option I can imagine is to wait for a fix from the locking folks
after putting my $0.02 next to your $2 on the table.

Hillf
> 
> I spent some time re-testing this today atop mainline, specifically
> atop commit dcf8e5633e2e ("tracing: Define the is_signed_type() macro
> once"). Some notes:
> 
> 1. I can confirm that my test case still reproduces a hang on
> mainline, though it seems a bit harder to reproduce (sometimes I have
> to run for a few minutes). I didn't spend lots of time confirming that
> the hang is exactly the same, but the same testcase reproduces it so
> it probably is. If it's important I can drop into kgdb and dig around
> to confirm.
> 
> 2. Blindly applying the first (and resolving the trivial merge
> conflict) or both of your proposed patches no longer fixes the hang on
> mainline.
> 
> 3. Reverting Waiman's commit 6eebd5fb2083 ("locking/rwsem: Allow
> slowpath writer to ignore handoff bit if not set by first waiter") and
> then applying your two fixes _does_ still fix the patch on mainline. I
> ran for 20 minutes w/ no reproduction.
> 
> So it seems like Waiman's recent commit interacts with your fix in a bad way. :(
> 
> -Doug


^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2022-08-31 11:08 UTC | newest]

Thread overview: 41+ messages
-- links below jump to the message on this page --
2021-11-16  1:29 [PATCH v5] locking/rwsem: Make handoff bit handling more consistent Waiman Long
2021-11-16  2:52 ` Aiqun(Maria) Yu
2021-11-16  9:14   ` Peter Zijlstra
2021-11-16  9:24     ` Peter Zijlstra
2021-11-16 14:52       ` Waiman Long
2021-11-17 13:36 ` Peter Zijlstra
2021-11-23  8:53 ` [tip: locking/urgent] " tip-bot2 for Waiman Long
2022-02-14 15:47 ` Re:[PATCH v5] " chenguanyou
2022-02-14 16:01   ` [PATCH " Greg KH
2022-04-11 18:26   ` john.p.donnelly
2022-04-11 18:40     ` Waiman Long
2022-04-11 21:03       ` john.p.donnelly
2022-04-11 21:07         ` Waiman Long
2022-04-12 16:28           ` john.p.donnelly
2022-04-12 17:04             ` Waiman Long
2022-04-14 10:48               ` Greg KH
2022-04-14 15:18                 ` Waiman Long
2022-04-14 15:42                   ` Greg KH
2022-04-14 15:44                     ` Waiman Long
2022-04-20 13:55             ` john.p.donnelly
2022-04-26 20:21               ` Waiman Long
2022-04-26 21:22                 ` john.p.donnelly
2022-02-14 16:22 ` chenguanyou
2022-02-15  7:41   ` [PATCH " Greg KH
2022-02-16 16:30     ` Waiman Long
2022-02-17 15:41       ` chenguanyou
2022-03-14  8:07         ` [PATCH " Greg KH
2022-03-22  2:49           ` chenguanyou
2022-03-24 12:51             ` [PATCH " Greg KH
2022-07-19  0:27 ` Doug Anderson
2022-07-19 10:41   ` Hillf Danton
2022-07-19 15:30     ` Doug Anderson
2022-07-22 11:55       ` Hillf Danton
2022-07-22 14:02         ` Doug Anderson
2022-07-23  0:17           ` Hillf Danton
2022-07-23  1:27             ` Hillf Danton
2022-08-05 17:14             ` Doug Anderson
2022-08-05 19:02               ` Waiman Long
2022-08-05 19:16                 ` Doug Anderson
2022-08-30 16:18                   ` Doug Anderson
2022-08-31 11:08                     ` Hillf Danton
