* [PATCH 1/6] locking/rwsem: Minor code refactoring in rwsem_mark_wake()
2023-02-23 12:26 [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff Peter Zijlstra
@ 2023-02-23 12:26 ` Peter Zijlstra
2023-02-23 12:26 ` [PATCH 2/6] locking/rwsem: Enforce queueing when HANDOFF Peter Zijlstra
` (5 subsequent siblings)
6 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-23 12:26 UTC (permalink / raw)
To: longman, mingo, will; +Cc: linux-kernel, peterz, boqun.feng
From: Waiman Long <longman@redhat.com>
Rename "oldcount" to "count" as it is not always old count value.
Also make some minor code refactoring to reduce indentation. There
is no functional change.
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230216210933.1169097-2-longman@redhat.com
---
kernel/locking/rwsem.c | 44 ++++++++++++++++++++++----------------------
1 file changed, 22 insertions(+), 22 deletions(-)
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -40,7 +40,7 @@
*
* When the rwsem is reader-owned and a spinning writer has timed out,
* the nonspinnable bit will be set to disable optimistic spinning.
-
+ *
* When a writer acquires a rwsem, it puts its task_struct pointer
* into the owner field. It is cleared after an unlock.
*
@@ -413,7 +413,7 @@ static void rwsem_mark_wake(struct rw_se
struct wake_q_head *wake_q)
{
struct rwsem_waiter *waiter, *tmp;
- long oldcount, woken = 0, adjustment = 0;
+ long count, woken = 0, adjustment = 0;
struct list_head wlist;
lockdep_assert_held(&sem->wait_lock);
@@ -424,22 +424,23 @@ static void rwsem_mark_wake(struct rw_se
*/
waiter = rwsem_first_waiter(sem);
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY) {
- /*
- * Mark writer at the front of the queue for wakeup.
- * Until the task is actually later awoken later by
- * the caller, other writers are able to steal it.
- * Readers, on the other hand, will block as they
- * will notice the queued writer.
- */
- wake_q_add(wake_q, waiter->task);
- lockevent_inc(rwsem_wake_writer);
- }
+ if (waiter->type != RWSEM_WAITING_FOR_WRITE)
+ goto wake_readers;
- return;
+ if (wake_type == RWSEM_WAKE_ANY) {
+ /*
+ * Mark writer at the front of the queue for wakeup.
+ * Until the task is actually later awoken later by
+ * the caller, other writers are able to steal it.
+ * Readers, on the other hand, will block as they
+ * will notice the queued writer.
+ */
+ wake_q_add(wake_q, waiter->task);
+ lockevent_inc(rwsem_wake_writer);
}
+ return;
+wake_readers:
/*
* No reader wakeup if there are too many of them already.
*/
@@ -455,15 +456,15 @@ static void rwsem_mark_wake(struct rw_se
struct task_struct *owner;
adjustment = RWSEM_READER_BIAS;
- oldcount = atomic_long_fetch_add(adjustment, &sem->count);
- if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
+ count = atomic_long_fetch_add(adjustment, &sem->count);
+ if (unlikely(count & RWSEM_WRITER_MASK)) {
/*
* When we've been waiting "too" long (for writers
* to give up the lock), request a HANDOFF to
* force the issue.
*/
if (time_after(jiffies, waiter->timeout)) {
- if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
+ if (!(count & RWSEM_FLAG_HANDOFF)) {
adjustment -= RWSEM_FLAG_HANDOFF;
lockevent_inc(rwsem_rlock_handoff);
}
@@ -524,21 +525,21 @@ static void rwsem_mark_wake(struct rw_se
adjustment = woken * RWSEM_READER_BIAS - adjustment;
lockevent_cond_inc(rwsem_wake_reader, woken);
- oldcount = atomic_long_read(&sem->count);
+ count = atomic_long_read(&sem->count);
if (list_empty(&sem->wait_list)) {
/*
* Combined with list_move_tail() above, this implies
* rwsem_del_waiter().
*/
adjustment -= RWSEM_FLAG_WAITERS;
- if (oldcount & RWSEM_FLAG_HANDOFF)
+ if (count & RWSEM_FLAG_HANDOFF)
adjustment -= RWSEM_FLAG_HANDOFF;
} else if (woken) {
/*
* When we've woken a reader, we no longer need to force
* writers to give up the lock and we can clear HANDOFF.
*/
- if (oldcount & RWSEM_FLAG_HANDOFF)
+ if (count & RWSEM_FLAG_HANDOFF)
adjustment -= RWSEM_FLAG_HANDOFF;
}
@@ -844,7 +845,6 @@ static bool rwsem_optimistic_spin(struct
* Try to acquire the lock
*/
taken = rwsem_try_write_lock_unqueued(sem);
-
if (taken)
break;
* [PATCH 2/6] locking/rwsem: Enforce queueing when HANDOFF
2023-02-23 12:26 [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff Peter Zijlstra
2023-02-23 12:26 ` [PATCH 1/6] locking/rwsem: Minor code refactoring in rwsem_mark_wake() Peter Zijlstra
@ 2023-02-23 12:26 ` Peter Zijlstra
2023-02-23 12:26 ` [PATCH 3/6] locking/rwsem: Rework writer wakeup Peter Zijlstra
` (4 subsequent siblings)
6 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-23 12:26 UTC (permalink / raw)
To: longman, mingo, will; +Cc: linux-kernel, peterz, boqun.feng
Ensure that HANDOFF disables all spinning and stealing.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/locking/rwsem.c | 9 +++++++++
1 file changed, 9 insertions(+)
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -468,7 +468,12 @@ static void rwsem_mark_wake(struct rw_se
adjustment -= RWSEM_FLAG_HANDOFF;
lockevent_inc(rwsem_rlock_handoff);
}
+ /*
+ * With HANDOFF set for reader, we must
+ * terminate all spinning.
+ */
waiter->handoff_set = true;
+ rwsem_set_nonspinnable(sem);
}
atomic_long_add(-adjustment, &sem->count);
@@ -755,6 +760,10 @@ rwsem_spin_on_owner(struct rw_semaphore
owner = rwsem_owner_flags(sem, &flags);
state = rwsem_owner_state(owner, flags);
+
+ if (owner == current)
+ return OWNER_NONSPINNABLE; /* Handoff granted */
+
if (state != OWNER_WRITER)
return state;
* [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-23 12:26 [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff Peter Zijlstra
2023-02-23 12:26 ` [PATCH 1/6] locking/rwsem: Minor code refactoring in rwsem_mark_wake() Peter Zijlstra
2023-02-23 12:26 ` [PATCH 2/6] locking/rwsem: Enforce queueing when HANDOFF Peter Zijlstra
@ 2023-02-23 12:26 ` Peter Zijlstra
2023-02-23 21:38 ` Waiman Long
` (2 more replies)
2023-02-23 12:26 ` [PATCH 4/6] locking/rwsem: Split out rwsem_reader_wake() Peter Zijlstra
` (3 subsequent siblings)
6 siblings, 3 replies; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-23 12:26 UTC (permalink / raw)
To: longman, mingo, will; +Cc: linux-kernel, peterz, boqun.feng
Currently readers and writers have distinctly different wait/wake
methods. For readers the ->count adjustment happens on the wakeup
side, while for writers the ->count adjustment happens on the wait
side.
This asymmetry is unfortunate since the wake side has an additional
guarantee -- specifically, the wake side has observed the unlocked
state, and thus it can know that speculative READER_BIAS perturbations
on ->count are just that, they will be undone.
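(For context: the perturbations in question come from the reader fast
path, which speculatively adds READER_BIAS and backs it out again in the
slowpath when the lock turns out to be writer-held. A simplified sketch
of that pattern, not the exact rwsem.c code:

        /* reader fast path: speculatively add the reader bias */
        count = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
        if (!(count & RWSEM_READ_FAILED_MASK))
                return;         /* read lock acquired */

        /* reader slow path: undo the speculative bias before queueing */
        atomic_long_add(-RWSEM_READER_BIAS, &sem->count);

So once the wake side has observed the unlocked state, any READER_BIAS
still visible in ->count is known to be transient.)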
Additionally, unifying the wait/wake methods allows sharing code.
As such, do a straight-forward transform of the writer wakeup into the
wake side.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/locking/rwsem.c | 253 ++++++++++++++++++++++---------------------------
1 file changed, 115 insertions(+), 138 deletions(-)
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -107,7 +107,7 @@
*
* There are three places where the lock handoff bit may be set or cleared.
* 1) rwsem_mark_wake() for readers -- set, clear
- * 2) rwsem_try_write_lock() for writers -- set, clear
+ * 2) rwsem_writer_wake() for writers -- set, clear
* 3) rwsem_del_waiter() -- clear
*
* For all the above cases, wait_lock will be held. A writer must also
@@ -377,7 +377,7 @@ rwsem_add_waiter(struct rw_semaphore *se
/*
* Remove a waiter from the wait_list and clear flags.
*
- * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * Both rwsem_mark_wake() and rwsem_writer_wake() contain a full 'copy' of
* this function. Modify with care.
*
* Return: true if wait_list isn't empty and false otherwise
@@ -394,6 +394,100 @@ rwsem_del_waiter(struct rw_semaphore *se
return false;
}
+static inline void
+rwsem_waiter_wake(struct rwsem_waiter *waiter, struct wake_q_head *wake_q)
+{
+ struct task_struct *tsk;
+
+ tsk = waiter->task;
+ get_task_struct(tsk);
+
+ /*
+ * Ensure calling get_task_struct() before setting the reader
+ * waiter to nil such that rwsem_down_read_slowpath() cannot
+ * race with do_exit() by always holding a reference count
+ * to the task to wakeup.
+ */
+ smp_store_release(&waiter->task, NULL);
+ /*
+ * Ensure issuing the wakeup (either by us or someone else)
+ * after setting the reader waiter to nil.
+ */
+ wake_q_add_safe(wake_q, tsk);
+}
+
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ *
+ * Implies rwsem_del_waiter() on success.
+ */
+static void rwsem_writer_wake(struct rw_semaphore *sem,
+ struct rwsem_waiter *waiter,
+ struct wake_q_head *wake_q)
+{
+ struct rwsem_waiter *first = rwsem_first_waiter(sem);
+ long count, new;
+
+ lockdep_assert_held(&sem->wait_lock);
+
+ count = atomic_long_read(&sem->count);
+ do {
+ bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
+
+ if (has_handoff) {
+ /*
+ * Honor handoff bit and yield only when the first
+ * waiter is the one that set it. Otherwisee, we
+ * still try to acquire the rwsem.
+ */
+ if (first->handoff_set && (waiter != first))
+ return;
+ }
+
+ new = count;
+
+ if (count & RWSEM_LOCK_MASK) {
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
+ if (has_handoff || (!rt_task(waiter->task) &&
+ !time_after(jiffies, waiter->timeout)))
+ return;
+
+ new |= RWSEM_FLAG_HANDOFF;
+ } else {
+ new |= RWSEM_WRITER_LOCKED;
+ new &= ~RWSEM_FLAG_HANDOFF;
+
+ if (list_is_singular(&sem->wait_list))
+ new &= ~RWSEM_FLAG_WAITERS;
+ }
+ } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
+
+ /*
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
+ */
+ if (new & RWSEM_FLAG_HANDOFF) {
+ first->handoff_set = true;
+ lockevent_inc(rwsem_wlock_handoff);
+ return;
+ }
+
+ /*
+ * Have rwsem_writer_wake() fully imply rwsem_del_waiter() on
+ * success.
+ */
+ list_del(&waiter->list);
+ rwsem_set_owner(sem);
+ rwsem_waiter_wake(waiter, wake_q);
+}
+
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -424,23 +518,12 @@ static void rwsem_mark_wake(struct rw_se
*/
waiter = rwsem_first_waiter(sem);
- if (waiter->type != RWSEM_WAITING_FOR_WRITE)
- goto wake_readers;
-
- if (wake_type == RWSEM_WAKE_ANY) {
- /*
- * Mark writer at the front of the queue for wakeup.
- * Until the task is actually later awoken later by
- * the caller, other writers are able to steal it.
- * Readers, on the other hand, will block as they
- * will notice the queued writer.
- */
- wake_q_add(wake_q, waiter->task);
- lockevent_inc(rwsem_wake_writer);
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wake_type == RWSEM_WAKE_ANY)
+ rwsem_writer_wake(sem, waiter, wake_q);
+ return;
}
- return;
-wake_readers:
/*
* No reader wakeup if there are too many of them already.
*/
@@ -547,25 +630,8 @@ static void rwsem_mark_wake(struct rw_se
atomic_long_add(adjustment, &sem->count);
/* 2nd pass */
- list_for_each_entry_safe(waiter, tmp, &wlist, list) {
- struct task_struct *tsk;
-
- tsk = waiter->task;
- get_task_struct(tsk);
-
- /*
- * Ensure calling get_task_struct() before setting the reader
- * waiter to nil such that rwsem_down_read_slowpath() cannot
- * race with do_exit() by always holding a reference count
- * to the task to wakeup.
- */
- smp_store_release(&waiter->task, NULL);
- /*
- * Ensure issuing the wakeup (either by us or someone else)
- * after setting the reader waiter to nil.
- */
- wake_q_add_safe(wake_q, tsk);
- }
+ list_for_each_entry_safe(waiter, tmp, &wlist, list)
+ rwsem_waiter_wake(waiter, wake_q);
}
/*
@@ -596,77 +662,6 @@ rwsem_del_wake_waiter(struct rw_semaphor
}
/*
- * This function must be called with the sem->wait_lock held to prevent
- * race conditions between checking the rwsem wait list and setting the
- * sem->count accordingly.
- *
- * Implies rwsem_del_waiter() on success.
- */
-static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
- struct rwsem_waiter *waiter)
-{
- struct rwsem_waiter *first = rwsem_first_waiter(sem);
- long count, new;
-
- lockdep_assert_held(&sem->wait_lock);
-
- count = atomic_long_read(&sem->count);
- do {
- bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
-
- if (has_handoff) {
- /*
- * Honor handoff bit and yield only when the first
- * waiter is the one that set it. Otherwisee, we
- * still try to acquire the rwsem.
- */
- if (first->handoff_set && (waiter != first))
- return false;
- }
-
- new = count;
-
- if (count & RWSEM_LOCK_MASK) {
- /*
- * A waiter (first or not) can set the handoff bit
- * if it is an RT task or wait in the wait queue
- * for too long.
- */
- if (has_handoff || (!rt_task(waiter->task) &&
- !time_after(jiffies, waiter->timeout)))
- return false;
-
- new |= RWSEM_FLAG_HANDOFF;
- } else {
- new |= RWSEM_WRITER_LOCKED;
- new &= ~RWSEM_FLAG_HANDOFF;
-
- if (list_is_singular(&sem->wait_list))
- new &= ~RWSEM_FLAG_WAITERS;
- }
- } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
-
- /*
- * We have either acquired the lock with handoff bit cleared or set
- * the handoff bit. Only the first waiter can have its handoff_set
- * set here to enable optimistic spinning in slowpath loop.
- */
- if (new & RWSEM_FLAG_HANDOFF) {
- first->handoff_set = true;
- lockevent_inc(rwsem_wlock_handoff);
- return false;
- }
-
- /*
- * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
- * success.
- */
- list_del(&waiter->list);
- rwsem_set_owner(sem);
- return true;
-}
-
-/*
* The rwsem_spin_on_owner() function returns the following 4 values
* depending on the lock owner state.
* OWNER_NULL : owner is currently NULL
@@ -1072,7 +1067,7 @@ rwsem_down_read_slowpath(struct rw_semap
for (;;) {
set_current_state(state);
if (!smp_load_acquire(&waiter.task)) {
- /* Matches rwsem_mark_wake()'s smp_store_release(). */
+ /* Matches rwsem_waiter_wake()'s smp_store_release(). */
break;
}
if (signal_pending_state(state, current)) {
@@ -1143,54 +1138,36 @@ rwsem_down_write_slowpath(struct rw_sema
} else {
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
}
+ raw_spin_unlock_irq(&sem->wait_lock);
/* wait until we successfully acquire the lock */
- set_current_state(state);
trace_contention_begin(sem, LCB_F_WRITE);
for (;;) {
- if (rwsem_try_write_lock(sem, &waiter)) {
- /* rwsem_try_write_lock() implies ACQUIRE on success */
+ set_current_state(state);
+ if (!smp_load_acquire(&waiter.task)) {
+ /* Matches rwsem_waiter_wake()'s smp_store_release(). */
break;
}
-
- raw_spin_unlock_irq(&sem->wait_lock);
-
- if (signal_pending_state(state, current))
- goto out_nolock;
-
- /*
- * After setting the handoff bit and failing to acquire
- * the lock, attempt to spin on owner to accelerate lock
- * transfer. If the previous owner is a on-cpu writer and it
- * has just released the lock, OWNER_NULL will be returned.
- * In this case, we attempt to acquire the lock again
- * without sleeping.
- */
- if (waiter.handoff_set) {
- enum owner_state owner_state;
-
- owner_state = rwsem_spin_on_owner(sem);
- if (owner_state == OWNER_NULL)
- goto trylock_again;
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
+ break;
}
-
schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_writer);
- set_current_state(state);
-trylock_again:
- raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
- raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
trace_contention_end(sem, 0);
return sem;
out_nolock:
- __set_current_state(TASK_RUNNING);
- raw_spin_lock_irq(&sem->wait_lock);
rwsem_del_wake_waiter(sem, &waiter, &wake_q);
+ __set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_wlock_fail);
trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-23 12:26 ` [PATCH 3/6] locking/rwsem: Rework writer wakeup Peter Zijlstra
@ 2023-02-23 21:38 ` Waiman Long
2023-02-26 11:58 ` Peter Zijlstra
2023-02-26 12:00 ` Peter Zijlstra
2023-02-26 11:59 ` Peter Zijlstra
2023-02-26 15:04 ` Peter Zijlstra
2 siblings, 2 replies; 27+ messages in thread
From: Waiman Long @ 2023-02-23 21:38 UTC (permalink / raw)
To: Peter Zijlstra, mingo, will; +Cc: linux-kernel, boqun.feng
On 2/23/23 07:26, Peter Zijlstra wrote:
> Currently readers and writers have distinctly different wait/wake
> methods. For readers the ->count adjustment happens on the wakeup
> side, while for writers the ->count adjustment happens on the wait
> side.
>
> This asymmetry is unfortunate since the wake side has an additional
> guarantee -- specifically, the wake side has observed the unlocked
> state, and thus it can know that speculative READER_BIAS perturbations
> on ->count are just that, they will be undone.
>
> Additionally, unifying the wait/wake methods allows sharing code.
>
> As such, do a straight-forward transform of the writer wakeup into the
> wake side.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/locking/rwsem.c | 253 ++++++++++++++++++++++---------------------------
> 1 file changed, 115 insertions(+), 138 deletions(-)
>
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -107,7 +107,7 @@
> *
> * There are three places where the lock handoff bit may be set or cleared.
> * 1) rwsem_mark_wake() for readers -- set, clear
> - * 2) rwsem_try_write_lock() for writers -- set, clear
> + * 2) rwsem_writer_wake() for writers -- set, clear
> * 3) rwsem_del_waiter() -- clear
> *
> * For all the above cases, wait_lock will be held. A writer must also
> @@ -377,7 +377,7 @@ rwsem_add_waiter(struct rw_semaphore *se
> /*
> * Remove a waiter from the wait_list and clear flags.
> *
> - * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
> + * Both rwsem_mark_wake() and rwsem_writer_wake() contain a full 'copy' of
> * this function. Modify with care.
> *
> * Return: true if wait_list isn't empty and false otherwise
> @@ -394,6 +394,100 @@ rwsem_del_waiter(struct rw_semaphore *se
> return false;
> }
>
> +static inline void
> +rwsem_waiter_wake(struct rwsem_waiter *waiter, struct wake_q_head *wake_q)
> +{
> + struct task_struct *tsk;
> +
> + tsk = waiter->task;
> + get_task_struct(tsk);
> +
> + /*
> + * Ensure calling get_task_struct() before setting the reader
> + * waiter to nil such that rwsem_down_read_slowpath() cannot
> + * race with do_exit() by always holding a reference count
> + * to the task to wakeup.
> + */
> + smp_store_release(&waiter->task, NULL);
> + /*
> + * Ensure issuing the wakeup (either by us or someone else)
> + * after setting the reader waiter to nil.
> + */
> + wake_q_add_safe(wake_q, tsk);
> +}
> +
> +/*
> + * This function must be called with the sem->wait_lock held to prevent
> + * race conditions between checking the rwsem wait list and setting the
> + * sem->count accordingly.
> + *
> + * Implies rwsem_del_waiter() on success.
> + */
> +static void rwsem_writer_wake(struct rw_semaphore *sem,
> + struct rwsem_waiter *waiter,
> + struct wake_q_head *wake_q)
> +{
> + struct rwsem_waiter *first = rwsem_first_waiter(sem);
> + long count, new;
> +
> + lockdep_assert_held(&sem->wait_lock);
> +
> + count = atomic_long_read(&sem->count);
> + do {
> + bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
> +
> + if (has_handoff) {
> + /*
> + * Honor handoff bit and yield only when the first
> + * waiter is the one that set it. Otherwisee, we
> + * still try to acquire the rwsem.
> + */
> + if (first->handoff_set && (waiter != first))
> + return;
> + }
This "if" statement if for a non-first waiter that somehow got woken up
to have a chance to steal the lock. Now the handoff is done in the wake
side for the first waiter, this "if" statement is not applicable and can
be removed.
> +
> + new = count;
> +
> + if (count & RWSEM_LOCK_MASK) {
> + /*
> + * A waiter (first or not) can set the handoff bit
> + * if it is an RT task or wait in the wait queue
> + * for too long.
> + */
> + if (has_handoff || (!rt_task(waiter->task) &&
> + !time_after(jiffies, waiter->timeout)))
> + return;
> +
> + new |= RWSEM_FLAG_HANDOFF;
> + } else {
> + new |= RWSEM_WRITER_LOCKED;
> + new &= ~RWSEM_FLAG_HANDOFF;
> +
> + if (list_is_singular(&sem->wait_list))
> + new &= ~RWSEM_FLAG_WAITERS;
> + }
> + } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
> +
> + /*
> + * We have either acquired the lock with handoff bit cleared or set
> + * the handoff bit. Only the first waiter can have its handoff_set
> + * set here to enable optimistic spinning in slowpath loop.
> + */
> + if (new & RWSEM_FLAG_HANDOFF) {
> + first->handoff_set = true;
> + lockevent_inc(rwsem_wlock_handoff);
> + return;
> + }
> +
> + /*
> + * Have rwsem_writer_wake() fully imply rwsem_del_waiter() on
> + * success.
> + */
> + list_del(&waiter->list);
> + rwsem_set_owner(sem);
> + rwsem_waiter_wake(waiter, wake_q);
> +}
> +
> /*
> * handle the lock release when processes blocked on it that can now run
> * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
> @@ -424,23 +518,12 @@ static void rwsem_mark_wake(struct rw_se
> */
> waiter = rwsem_first_waiter(sem);
>
> - if (waiter->type != RWSEM_WAITING_FOR_WRITE)
> - goto wake_readers;
> -
> - if (wake_type == RWSEM_WAKE_ANY) {
> - /*
> - * Mark writer at the front of the queue for wakeup.
> - * Until the task is actually later awoken later by
> - * the caller, other writers are able to steal it.
> - * Readers, on the other hand, will block as they
> - * will notice the queued writer.
> - */
> - wake_q_add(wake_q, waiter->task);
> - lockevent_inc(rwsem_wake_writer);
> + if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
> + if (wake_type == RWSEM_WAKE_ANY)
> + rwsem_writer_wake(sem, waiter, wake_q);
> + return;
> }
> - return;
>
> -wake_readers:
> /*
> * No reader wakeup if there are too many of them already.
> */
> @@ -547,25 +630,8 @@ static void rwsem_mark_wake(struct rw_se
> atomic_long_add(adjustment, &sem->count);
>
> /* 2nd pass */
> - list_for_each_entry_safe(waiter, tmp, &wlist, list) {
> - struct task_struct *tsk;
> -
> - tsk = waiter->task;
> - get_task_struct(tsk);
> -
> - /*
> - * Ensure calling get_task_struct() before setting the reader
> - * waiter to nil such that rwsem_down_read_slowpath() cannot
> - * race with do_exit() by always holding a reference count
> - * to the task to wakeup.
> - */
> - smp_store_release(&waiter->task, NULL);
> - /*
> - * Ensure issuing the wakeup (either by us or someone else)
> - * after setting the reader waiter to nil.
> - */
> - wake_q_add_safe(wake_q, tsk);
> - }
> + list_for_each_entry_safe(waiter, tmp, &wlist, list)
> + rwsem_waiter_wake(waiter, wake_q);
> }
>
> /*
> @@ -596,77 +662,6 @@ rwsem_del_wake_waiter(struct rw_semaphor
> }
>
> /*
> - * This function must be called with the sem->wait_lock held to prevent
> - * race conditions between checking the rwsem wait list and setting the
> - * sem->count accordingly.
> - *
> - * Implies rwsem_del_waiter() on success.
> - */
> -static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
> - struct rwsem_waiter *waiter)
> -{
> - struct rwsem_waiter *first = rwsem_first_waiter(sem);
> - long count, new;
> -
> - lockdep_assert_held(&sem->wait_lock);
> -
> - count = atomic_long_read(&sem->count);
> - do {
> - bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
> -
> - if (has_handoff) {
> - /*
> - * Honor handoff bit and yield only when the first
> - * waiter is the one that set it. Otherwisee, we
> - * still try to acquire the rwsem.
> - */
> - if (first->handoff_set && (waiter != first))
> - return false;
> - }
> -
> - new = count;
> -
> - if (count & RWSEM_LOCK_MASK) {
> - /*
> - * A waiter (first or not) can set the handoff bit
> - * if it is an RT task or wait in the wait queue
> - * for too long.
> - */
> - if (has_handoff || (!rt_task(waiter->task) &&
> - !time_after(jiffies, waiter->timeout)))
> - return false;
> -
> - new |= RWSEM_FLAG_HANDOFF;
> - } else {
> - new |= RWSEM_WRITER_LOCKED;
> - new &= ~RWSEM_FLAG_HANDOFF;
> -
> - if (list_is_singular(&sem->wait_list))
> - new &= ~RWSEM_FLAG_WAITERS;
> - }
> - } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
> -
> - /*
> - * We have either acquired the lock with handoff bit cleared or set
> - * the handoff bit. Only the first waiter can have its handoff_set
> - * set here to enable optimistic spinning in slowpath loop.
> - */
> - if (new & RWSEM_FLAG_HANDOFF) {
> - first->handoff_set = true;
> - lockevent_inc(rwsem_wlock_handoff);
> - return false;
> - }
> -
> - /*
> - * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
> - * success.
> - */
> - list_del(&waiter->list);
> - rwsem_set_owner(sem);
> - return true;
> -}
> -
> -/*
> * The rwsem_spin_on_owner() function returns the following 4 values
> * depending on the lock owner state.
> * OWNER_NULL : owner is currently NULL
> @@ -1072,7 +1067,7 @@ rwsem_down_read_slowpath(struct rw_semap
> for (;;) {
> set_current_state(state);
> if (!smp_load_acquire(&waiter.task)) {
> - /* Matches rwsem_mark_wake()'s smp_store_release(). */
> + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> break;
> }
> if (signal_pending_state(state, current)) {
> @@ -1143,54 +1138,36 @@ rwsem_down_write_slowpath(struct rw_sema
> } else {
> atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
> }
> + raw_spin_unlock_irq(&sem->wait_lock);
>
> /* wait until we successfully acquire the lock */
> - set_current_state(state);
> trace_contention_begin(sem, LCB_F_WRITE);
>
> for (;;) {
> - if (rwsem_try_write_lock(sem, &waiter)) {
> - /* rwsem_try_write_lock() implies ACQUIRE on success */
> + set_current_state(state);
> + if (!smp_load_acquire(&waiter.task)) {
> + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> break;
> }
> -
> - raw_spin_unlock_irq(&sem->wait_lock);
> -
> - if (signal_pending_state(state, current))
> - goto out_nolock;
> -
> - /*
> - * After setting the handoff bit and failing to acquire
> - * the lock, attempt to spin on owner to accelerate lock
> - * transfer. If the previous owner is a on-cpu writer and it
> - * has just released the lock, OWNER_NULL will be returned.
> - * In this case, we attempt to acquire the lock again
> - * without sleeping.
> - */
> - if (waiter.handoff_set) {
> - enum owner_state owner_state;
> -
> - owner_state = rwsem_spin_on_owner(sem);
> - if (owner_state == OWNER_NULL)
> - goto trylock_again;
> + if (signal_pending_state(state, current)) {
> + raw_spin_lock_irq(&sem->wait_lock);
> + if (waiter.task)
> + goto out_nolock;
> + raw_spin_unlock_irq(&sem->wait_lock);
> + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> + break;
> }
> -
> schedule_preempt_disabled();
> lockevent_inc(rwsem_sleep_writer);
> - set_current_state(state);
> -trylock_again:
> - raw_spin_lock_irq(&sem->wait_lock);
> }
> __set_current_state(TASK_RUNNING);
> - raw_spin_unlock_irq(&sem->wait_lock);
> lockevent_inc(rwsem_wlock);
> trace_contention_end(sem, 0);
> return sem;
>
> out_nolock:
> - __set_current_state(TASK_RUNNING);
> - raw_spin_lock_irq(&sem->wait_lock);
> rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> + __set_current_state(TASK_RUNNING);
> lockevent_inc(rwsem_wlock_fail);
> trace_contention_end(sem, -EINTR);
> return ERR_PTR(-EINTR);
I believe it is better to change state inside the wait_lock critical
section to provide a release barrier for free.
Cheers,
Longman
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-23 21:38 ` Waiman Long
@ 2023-02-26 11:58 ` Peter Zijlstra
2023-02-26 12:00 ` Peter Zijlstra
1 sibling, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-26 11:58 UTC (permalink / raw)
To: Waiman Long; +Cc: mingo, will, linux-kernel, boqun.feng
On Thu, Feb 23, 2023 at 04:38:08PM -0500, Waiman Long wrote:
> > +static void rwsem_writer_wake(struct rw_semaphore *sem,
> > + struct rwsem_waiter *waiter,
> > + struct wake_q_head *wake_q)
> > +{
> > + struct rwsem_waiter *first = rwsem_first_waiter(sem);
> > + long count, new;
> > +
> > + lockdep_assert_held(&sem->wait_lock);
> > +
> > + count = atomic_long_read(&sem->count);
> > + do {
> > + bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
> > +
> > + if (has_handoff) {
> > + /*
> > + * Honor handoff bit and yield only when the first
> > + * waiter is the one that set it. Otherwisee, we
> > + * still try to acquire the rwsem.
> > + */
> > + if (first->handoff_set && (waiter != first))
> > + return;
> > + }
> This "if" statement if for a non-first waiter that somehow got woken up to
> have a chance to steal the lock. Now the handoff is done in the wake side
> for the first waiter, this "if" statement is not applicable and can be
> removed.
Yeah, that can be cleaned up, something like the below. But that doesn't
appear to be the cause of issues.
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -427,25 +427,12 @@ static void rwsem_writer_wake(struct rw_
struct rwsem_waiter *waiter,
struct wake_q_head *wake_q)
{
- struct rwsem_waiter *first = rwsem_first_waiter(sem);
long count, new;
lockdep_assert_held(&sem->wait_lock);
count = atomic_long_read(&sem->count);
do {
- bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
-
- if (has_handoff) {
- /*
- * Honor handoff bit and yield only when the first
- * waiter is the one that set it. Otherwisee, we
- * still try to acquire the rwsem.
- */
- if (first->handoff_set && (waiter != first))
- return;
- }
-
new = count;
if (count & RWSEM_LOCK_MASK) {
@@ -454,8 +441,9 @@ static void rwsem_writer_wake(struct rw_
* if it is an RT task or wait in the wait queue
* for too long.
*/
- if (has_handoff || (!rt_task(waiter->task) &&
- !time_after(jiffies, waiter->timeout)))
+ if ((count & RWSEM_FLAG_HANDOFF) ||
+ (!rt_task(waiter->task) &&
+ !time_after(jiffies, waiter->timeout)))
return;
new |= RWSEM_FLAG_HANDOFF;
@@ -474,7 +462,7 @@ static void rwsem_writer_wake(struct rw_
* set here to enable optimistic spinning in slowpath loop.
*/
if (new & RWSEM_FLAG_HANDOFF) {
- first->handoff_set = true;
+ waiter->handoff_set = true;
lockevent_inc(rwsem_wlock_handoff);
return;
}
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-23 21:38 ` Waiman Long
2023-02-26 11:58 ` Peter Zijlstra
@ 2023-02-26 12:00 ` Peter Zijlstra
2023-02-26 21:31 ` Waiman Long
1 sibling, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-26 12:00 UTC (permalink / raw)
To: Waiman Long; +Cc: mingo, will, linux-kernel, boqun.feng
On Thu, Feb 23, 2023 at 04:38:08PM -0500, Waiman Long wrote:
> > @@ -1143,54 +1138,36 @@ rwsem_down_write_slowpath(struct rw_sema
> > } else {
> > atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
> > }
> > + raw_spin_unlock_irq(&sem->wait_lock);
> > /* wait until we successfully acquire the lock */
> > - set_current_state(state);
> > trace_contention_begin(sem, LCB_F_WRITE);
> > for (;;) {
> > - if (rwsem_try_write_lock(sem, &waiter)) {
> > - /* rwsem_try_write_lock() implies ACQUIRE on success */
> > + set_current_state(state);
> > + if (!smp_load_acquire(&waiter.task)) {
> > + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > break;
> > }
> > -
> > - raw_spin_unlock_irq(&sem->wait_lock);
> > -
> > - if (signal_pending_state(state, current))
> > - goto out_nolock;
> > -
> > - /*
> > - * After setting the handoff bit and failing to acquire
> > - * the lock, attempt to spin on owner to accelerate lock
> > - * transfer. If the previous owner is a on-cpu writer and it
> > - * has just released the lock, OWNER_NULL will be returned.
> > - * In this case, we attempt to acquire the lock again
> > - * without sleeping.
> > - */
> > - if (waiter.handoff_set) {
> > - enum owner_state owner_state;
> > -
> > - owner_state = rwsem_spin_on_owner(sem);
> > - if (owner_state == OWNER_NULL)
> > - goto trylock_again;
> > + if (signal_pending_state(state, current)) {
> > + raw_spin_lock_irq(&sem->wait_lock);
> > + if (waiter.task)
> > + goto out_nolock;
> > + raw_spin_unlock_irq(&sem->wait_lock);
> > + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> > + break;
> > }
> > -
> > schedule_preempt_disabled();
> > lockevent_inc(rwsem_sleep_writer);
> > - set_current_state(state);
> > -trylock_again:
> > - raw_spin_lock_irq(&sem->wait_lock);
> > }
> > __set_current_state(TASK_RUNNING);
> > - raw_spin_unlock_irq(&sem->wait_lock);
> > lockevent_inc(rwsem_wlock);
> > trace_contention_end(sem, 0);
> > return sem;
> > out_nolock:
> > - __set_current_state(TASK_RUNNING);
> > - raw_spin_lock_irq(&sem->wait_lock);
> > rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> > + __set_current_state(TASK_RUNNING);
> > lockevent_inc(rwsem_wlock_fail);
> > trace_contention_end(sem, -EINTR);
> > return ERR_PTR(-EINTR);
>
> I believe it is better to change state inside the wait_lock critical section
> to provide a release barrier for free.
I can't follow... a release for what? Note that the reader slowpath has
this exact form already.
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-26 12:00 ` Peter Zijlstra
@ 2023-02-26 21:31 ` Waiman Long
0 siblings, 0 replies; 27+ messages in thread
From: Waiman Long @ 2023-02-26 21:31 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: mingo, will, linux-kernel, boqun.feng
On 2/26/23 07:00, Peter Zijlstra wrote:
> On Thu, Feb 23, 2023 at 04:38:08PM -0500, Waiman Long wrote:
>
>>> @@ -1143,54 +1138,36 @@ rwsem_down_write_slowpath(struct rw_sema
>>> } else {
>>> atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
>>> }
>>> + raw_spin_unlock_irq(&sem->wait_lock);
>>> /* wait until we successfully acquire the lock */
>>> - set_current_state(state);
>>> trace_contention_begin(sem, LCB_F_WRITE);
>>> for (;;) {
>>> - if (rwsem_try_write_lock(sem, &waiter)) {
>>> - /* rwsem_try_write_lock() implies ACQUIRE on success */
>>> + set_current_state(state);
>>> + if (!smp_load_acquire(&waiter.task)) {
>>> + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
>>> break;
>>> }
>>> -
>>> - raw_spin_unlock_irq(&sem->wait_lock);
>>> -
>>> - if (signal_pending_state(state, current))
>>> - goto out_nolock;
>>> -
>>> - /*
>>> - * After setting the handoff bit and failing to acquire
>>> - * the lock, attempt to spin on owner to accelerate lock
>>> - * transfer. If the previous owner is a on-cpu writer and it
>>> - * has just released the lock, OWNER_NULL will be returned.
>>> - * In this case, we attempt to acquire the lock again
>>> - * without sleeping.
>>> - */
>>> - if (waiter.handoff_set) {
>>> - enum owner_state owner_state;
>>> -
>>> - owner_state = rwsem_spin_on_owner(sem);
>>> - if (owner_state == OWNER_NULL)
>>> - goto trylock_again;
>>> + if (signal_pending_state(state, current)) {
>>> + raw_spin_lock_irq(&sem->wait_lock);
>>> + if (waiter.task)
>>> + goto out_nolock;
>>> + raw_spin_unlock_irq(&sem->wait_lock);
>>> + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
>>> + break;
>>> }
>>> -
>>> schedule_preempt_disabled();
>>> lockevent_inc(rwsem_sleep_writer);
>>> - set_current_state(state);
>>> -trylock_again:
>>> - raw_spin_lock_irq(&sem->wait_lock);
>>> }
>>> __set_current_state(TASK_RUNNING);
>>> - raw_spin_unlock_irq(&sem->wait_lock);
>>> lockevent_inc(rwsem_wlock);
>>> trace_contention_end(sem, 0);
>>> return sem;
>>> out_nolock:
>>> - __set_current_state(TASK_RUNNING);
>>> - raw_spin_lock_irq(&sem->wait_lock);
>>> rwsem_del_wake_waiter(sem, &waiter, &wake_q);
>>> + __set_current_state(TASK_RUNNING);
>>> lockevent_inc(rwsem_wlock_fail);
>>> trace_contention_end(sem, -EINTR);
>>> return ERR_PTR(-EINTR);
>> I believe it is better to change state inside the wait_lock critical section
>> to provide a release barrier for free.
> I can't follow... a release for what? Note that the reader slowpath has
> this exact form already.
You are right. I forgot that we don't need synchronization when setting
state to TASK_RUNNING.
Cheers,
Longman
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-23 12:26 ` [PATCH 3/6] locking/rwsem: Rework writer wakeup Peter Zijlstra
2023-02-23 21:38 ` Waiman Long
@ 2023-02-26 11:59 ` Peter Zijlstra
2023-02-26 15:04 ` Peter Zijlstra
2 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-26 11:59 UTC (permalink / raw)
To: longman, mingo, will; +Cc: linux-kernel, boqun.feng
On Thu, Feb 23, 2023 at 01:26:45PM +0100, Peter Zijlstra wrote:
> +/*
> + * This function must be called with the sem->wait_lock held to prevent
> + * race conditions between checking the rwsem wait list and setting the
> + * sem->count accordingly.
> + *
> + * Implies rwsem_del_waiter() on success.
> + */
> +static void rwsem_writer_wake(struct rw_semaphore *sem,
> + struct rwsem_waiter *waiter,
> + struct wake_q_head *wake_q)
> +{
> + struct rwsem_waiter *first = rwsem_first_waiter(sem);
> + long count, new;
> +
> + lockdep_assert_held(&sem->wait_lock);
> +
> + count = atomic_long_read(&sem->count);
> + do {
> + bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
> +
> + if (has_handoff) {
> + /*
> + * Honor handoff bit and yield only when the first
> + * waiter is the one that set it. Otherwisee, we
> + * still try to acquire the rwsem.
> + */
> + if (first->handoff_set && (waiter != first))
> + return;
> + }
> +
> + new = count;
> +
> + if (count & RWSEM_LOCK_MASK) {
> + /*
> + * A waiter (first or not) can set the handoff bit
> + * if it is an RT task or wait in the wait queue
> + * for too long.
> + */
> + if (has_handoff || (!rt_task(waiter->task) &&
> + !time_after(jiffies, waiter->timeout)))
> + return;
> +
> + new |= RWSEM_FLAG_HANDOFF;
> + } else {
> + new |= RWSEM_WRITER_LOCKED;
> + new &= ~RWSEM_FLAG_HANDOFF;
> +
> + if (list_is_singular(&sem->wait_list))
> + new &= ~RWSEM_FLAG_WAITERS;
> + }
> + } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
> +
> + /*
> + * We have either acquired the lock with handoff bit cleared or set
> + * the handoff bit. Only the first waiter can have its handoff_set
> + * set here to enable optimistic spinning in slowpath loop.
> + */
> + if (new & RWSEM_FLAG_HANDOFF) {
> + first->handoff_set = true;
> + lockevent_inc(rwsem_wlock_handoff);
> + return;
> + }
> +
> + /*
> + * Have rwsem_writer_wake() fully imply rwsem_del_waiter() on
> + * success.
> + */
> + list_del(&waiter->list);
> + rwsem_set_owner(sem);
At the very least this needs to be:
atomic_long_set(&sem->owner, (long)waiter->task);
> + rwsem_waiter_wake(waiter, wake_q);
> +}
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-23 12:26 ` [PATCH 3/6] locking/rwsem: Rework writer wakeup Peter Zijlstra
2023-02-23 21:38 ` Waiman Long
2023-02-26 11:59 ` Peter Zijlstra
@ 2023-02-26 15:04 ` Peter Zijlstra
2023-02-26 16:51 ` Peter Zijlstra
2 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-26 15:04 UTC (permalink / raw)
To: longman, mingo, will; +Cc: linux-kernel, boqun.feng
On Thu, Feb 23, 2023 at 01:26:45PM +0100, Peter Zijlstra wrote:
> @@ -1072,7 +1067,7 @@ rwsem_down_read_slowpath(struct rw_semap
> for (;;) {
> set_current_state(state);
> if (!smp_load_acquire(&waiter.task)) {
> - /* Matches rwsem_mark_wake()'s smp_store_release(). */
> + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> break;
> }
> if (signal_pending_state(state, current)) {
> @@ -1143,54 +1138,36 @@ rwsem_down_write_slowpath(struct rw_sema
> } else {
> atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
Found it; if we remove the try_write_lock below, then at least this
new-waiter path needs to still do a trylock.
Let me go test the other patches on top of all this and push out a fresh
set if that all still works.
> }
> + raw_spin_unlock_irq(&sem->wait_lock);
>
> /* wait until we successfully acquire the lock */
> - set_current_state(state);
> trace_contention_begin(sem, LCB_F_WRITE);
>
> for (;;) {
> - if (rwsem_try_write_lock(sem, &waiter)) {
> - /* rwsem_try_write_lock() implies ACQUIRE on success */
> + set_current_state(state);
> + if (!smp_load_acquire(&waiter.task)) {
> + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> break;
> }
> -
> - raw_spin_unlock_irq(&sem->wait_lock);
> -
> - if (signal_pending_state(state, current))
> - goto out_nolock;
> -
> - /*
> - * After setting the handoff bit and failing to acquire
> - * the lock, attempt to spin on owner to accelerate lock
> - * transfer. If the previous owner is a on-cpu writer and it
> - * has just released the lock, OWNER_NULL will be returned.
> - * In this case, we attempt to acquire the lock again
> - * without sleeping.
> - */
> - if (waiter.handoff_set) {
> - enum owner_state owner_state;
> -
> - owner_state = rwsem_spin_on_owner(sem);
> - if (owner_state == OWNER_NULL)
> - goto trylock_again;
> + if (signal_pending_state(state, current)) {
> + raw_spin_lock_irq(&sem->wait_lock);
> + if (waiter.task)
> + goto out_nolock;
> + raw_spin_unlock_irq(&sem->wait_lock);
> + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> + break;
> }
> -
> schedule_preempt_disabled();
> lockevent_inc(rwsem_sleep_writer);
> - set_current_state(state);
> -trylock_again:
> - raw_spin_lock_irq(&sem->wait_lock);
> }
> __set_current_state(TASK_RUNNING);
> - raw_spin_unlock_irq(&sem->wait_lock);
> lockevent_inc(rwsem_wlock);
> trace_contention_end(sem, 0);
> return sem;
>
> out_nolock:
> - __set_current_state(TASK_RUNNING);
> - raw_spin_lock_irq(&sem->wait_lock);
> rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> + __set_current_state(TASK_RUNNING);
> lockevent_inc(rwsem_wlock_fail);
> trace_contention_end(sem, -EINTR);
> return ERR_PTR(-EINTR);
>
>
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-26 15:04 ` Peter Zijlstra
@ 2023-02-26 16:51 ` Peter Zijlstra
2023-02-27 0:22 ` Waiman Long
0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-26 16:51 UTC (permalink / raw)
To: longman, mingo, will; +Cc: linux-kernel, boqun.feng
On Sun, Feb 26, 2023 at 04:04:35PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 23, 2023 at 01:26:45PM +0100, Peter Zijlstra wrote:
> > @@ -1072,7 +1067,7 @@ rwsem_down_read_slowpath(struct rw_semap
> > for (;;) {
> > set_current_state(state);
> > if (!smp_load_acquire(&waiter.task)) {
> > - /* Matches rwsem_mark_wake()'s smp_store_release(). */
> > + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > break;
> > }
> > if (signal_pending_state(state, current)) {
> > @@ -1143,54 +1138,36 @@ rwsem_down_write_slowpath(struct rw_sema
> > } else {
> > atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
>
> Found it; if we remove the try_write_lock below, then at least this
> new-waiter path needs to still do a trylock.
>
> Let me go test the other patches on top of all this and push out a fresh
> set if that all still works.
queue.git locking/core
We'll see what the robots make of it.
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-26 16:51 ` Peter Zijlstra
@ 2023-02-27 0:22 ` Waiman Long
2023-02-27 10:31 ` Peter Zijlstra
0 siblings, 1 reply; 27+ messages in thread
From: Waiman Long @ 2023-02-27 0:22 UTC (permalink / raw)
To: Peter Zijlstra, mingo, will; +Cc: linux-kernel, boqun.feng
On 2/26/23 11:51, Peter Zijlstra wrote:
> On Sun, Feb 26, 2023 at 04:04:35PM +0100, Peter Zijlstra wrote:
>> On Thu, Feb 23, 2023 at 01:26:45PM +0100, Peter Zijlstra wrote:
>>> @@ -1072,7 +1067,7 @@ rwsem_down_read_slowpath(struct rw_semap
>>> for (;;) {
>>> set_current_state(state);
>>> if (!smp_load_acquire(&waiter.task)) {
>>> - /* Matches rwsem_mark_wake()'s smp_store_release(). */
>>> + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
>>> break;
>>> }
>>> if (signal_pending_state(state, current)) {
>>> @@ -1143,54 +1138,36 @@ rwsem_down_write_slowpath(struct rw_sema
>>> } else {
>>> atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
>> Found it; if we remove the try_write_lock below, then at least this
>> new-waiter path needs to still do a trylock.
>>
>> Let me go test the other patches on top of all this and push out a fresh
>> set if that all still works.
> queue.git locking/core
>
> We'll see what the robots make of it.
From your new patch 3:
@@ -1151,55 +1154,39 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
}
} else {
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
+ if (rwsem_try_write_lock(sem, &waiter))
+ waiter.task = NULL;
}
+ raw_spin_unlock_irq(&sem->wait_lock);
/* wait until we successfully acquire the lock */
- set_current_state(state);
trace_contention_begin(sem, LCB_F_WRITE);
for (;;) {
- if (rwsem_try_write_lock(sem, &waiter)) {
- /* rwsem_try_write_lock() implies ACQUIRE on success */
+ set_current_state(state);
+ if (!smp_load_acquire(&waiter.task)) {
+ /* Matches rwsem_waiter_wake()'s smp_store_release(). */
break;
}
-
The additional rwsem_try_write_lock() call seems to address the missed
wakeup problem AFAICT.
I do have some concern that early lock transfer to a lock owner that has
not been woken up yet may suppress writer lock stealing from optimistic
spinning causing some performance regression in some cases. Let's see if
the test robot report anything.
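For reference, the stealing path I am worried about is essentially this
(simplified from rwsem_try_write_lock_unqueued(); not the exact code):

        long count = atomic_long_read(&sem->count);

        /* stealing is only possible while no lock/handoff bits are set */
        while (!(count & (RWSEM_LOCK_MASK | RWSEM_FLAG_HANDOFF))) {
                if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
                                                    count | RWSEM_WRITER_LOCKED))
                        return true;    /* spinner stole the lock */
        }
        return false;

With the lock handed to a still-sleeping waiter under wait_lock, the
window in which ->count shows no lock bits becomes very small, so this
path will rarely win.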
Cheers,
Longman
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-27 0:22 ` Waiman Long
@ 2023-02-27 10:31 ` Peter Zijlstra
2023-02-27 20:16 ` Waiman Long
0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-27 10:31 UTC (permalink / raw)
To: Waiman Long; +Cc: mingo, will, linux-kernel, boqun.feng
On Sun, Feb 26, 2023 at 07:22:47PM -0500, Waiman Long wrote:
> @@ -1151,55 +1154,39 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
> }
> } else {
> atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
> + if (rwsem_try_write_lock(sem, &waiter))
> + waiter.task = NULL;
> }
> + raw_spin_unlock_irq(&sem->wait_lock);
>
> /* wait until we successfully acquire the lock */
> - set_current_state(state);
> trace_contention_begin(sem, LCB_F_WRITE);
>
> for (;;) {
> - if (rwsem_try_write_lock(sem, &waiter)) {
> - /* rwsem_try_write_lock() implies ACQUIRE on success
> */
> + set_current_state(state);
> + if (!smp_load_acquire(&waiter.task)) {
> + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> break;
> }
> -
>
> The additional rwsem_try_write_lock() call seems to address the missed
> wakeup problem AFAICT.
Indeed, prior to this I could readily reproduce the lockup.
So when thinking about missing wakeups I noticed this race on WAITERS.
If we queue but the unlock does not yet observe WAITERS, the unlock does
not go into the slow path and the wakeup gets lost.
The reader side fixes this with rwsem_cond_wake_waiter(), but I could not
convince myself that it is correct for the writer side -- perhaps it is,
I will need to think more on that.
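Roughly, the window is (unlock side paraphrased from __up_write(); not
the exact code):

        unlocker:  old = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED,
                                                        &sem->count);
                   if (old & RWSEM_FLAG_WAITERS)   /* not set yet */
                           rwsem_wake(sem);        /* skipped */

        waiter:    atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
                   /* lock is already free, but nobody is left to wake us */
                   schedule();

Which is why the new-waiter path now does a rwsem_try_write_lock() right
after the atomic_long_or().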
> I do have some concern that early lock transfer to a lock owner that has not
> been woken up yet may suppress writer lock stealing from optimistic spinning
> causing some performance regression in some cases. Let's see if the test
> robot report anything.
Ah yes, I suppose that is indeed a possibility. Given this is all under
wait_lock and the spinner is not, I was hoping it would still have
sufficient time to win. But yes, robots will tell us.
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-27 10:31 ` Peter Zijlstra
@ 2023-02-27 20:16 ` Waiman Long
2023-03-20 8:12 ` Peter Zijlstra
0 siblings, 1 reply; 27+ messages in thread
From: Waiman Long @ 2023-02-27 20:16 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: mingo, will, linux-kernel, boqun.feng
On 2/27/23 05:31, Peter Zijlstra wrote:
>> I do have some concern that early lock transfer to a lock owner that has not
>> been woken up yet may suppress writer lock stealing from optimistic spinning
>> causing some performance regression in some cases. Let's see if the test
>> robot report anything.
> Ah yes, I suppose that is indeed a possibility. Given this is all under
> wait_lock and the spinner is not, I was hoping it would still have
> sufficient time to win. But yes, robots will tell us.
>
I run my rwsem locking microbenchmark on a 2-socket 96-thread x86-64
system with lock event turned on for 15 secs.
Before this patchset:
Running locktest with rwsem [runtime = 15s, r% = 50%, load = 100]
Threads = 96, Min/Mean/Max = 74,506/91,260/112,409
Threads = 96, Total Rate = 584,091 op/s; Percpu Rate = 6,084 op/s
rwsem_opt_fail=127305
rwsem_opt_lock=4252147
rwsem_opt_nospin=28920
rwsem_rlock=2713129
rwsem_rlock_fail=0
rwsem_rlock_fast=5
rwsem_rlock_handoff=280
rwsem_rlock_steal=1486617
rwsem_sleep_reader=2713085
rwsem_sleep_writer=4313369
rwsem_wake_reader=29876
rwsem_wake_writer=5829160
rwsem_wlock=127305
rwsem_wlock_fail=0
rwsem_wlock_handoff=2515
After this patchset:
Running locktest with rwsem [runtime = 15s, r% = 50%, load = 100]
Threads = 96, Min/Mean/Max = 26,573/26,749/26,833
Threads = 96, Total Rate = 171,184 op/s; Percpu Rate = 1,783 op/s
rwsem_opt_fail=1265481
rwsem_opt_lock=17939
rwsem_rlock=1266157
rwsem_rlock_fail=0
rwsem_rlock_fast=0
rwsem_rlock_handoff=0
rwsem_rlock_steal=551
rwsem_sleep_reader=1266157
rwsem_sleep_writer=1265481
rwsem_wake_reader=26612
rwsem_wake_writer=0
rwsem_wlock=1265481
rwsem_wlock_ehandoff=94
rwsem_wlock_fail=0
rwsem_wlock_handoff=94
So the locking rate is reduced to just 29.3% of the original. Looking at
the number of successful writer lock stealings from optimistic spinning
(rwsem_opt_lock), it is reduced from 4252147 to 17939. It is just about
0.4% of the original.
So for workloads that have a lot of writer contention, there will be
performance regressions. Do you mind if we try to keep the original
logic of my patchset to allow write lock acquisition in writer slow
path, but transfer the lock ownership in the wakeup path when handoff
is required. We can do this with some minor code changes on top of your
current patchset.
Regards,
Longman
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-02-27 20:16 ` Waiman Long
@ 2023-03-20 8:12 ` Peter Zijlstra
2023-03-20 17:36 ` Waiman Long
0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2023-03-20 8:12 UTC (permalink / raw)
To: Waiman Long; +Cc: mingo, will, linux-kernel, boqun.feng
On Mon, Feb 27, 2023 at 03:16:25PM -0500, Waiman Long wrote:
> On 2/27/23 05:31, Peter Zijlstra wrote:
> > > I do have some concern that early lock transfer to a lock owner that has not
> > > been woken up yet may suppress writer lock stealing from optimistic spinning
> > > causing some performance regression in some cases. Let's see if the test
> > > robot report anything.
> > Ah yes, I suppose that is indeed a possibility. Given this is all under
> > wait_lock and the spinner is not, I was hoping it would still have
> > sufficient time to win. But yes, robots will tell us.
> >
> I run my rwsem locking microbenchmark on a 2-socket 96-thread x86-64
> system with lock event turned on for 15 secs.
>
> Before this patchset:
>
> Running locktest with rwsem [runtime = 15s, r% = 50%, load = 100]
> Threads = 96, Min/Mean/Max = 74,506/91,260/112,409
> Threads = 96, Total Rate = 584,091 op/s; Percpu Rate = 6,084 op/s
>
> rwsem_opt_fail=127305
> rwsem_opt_lock=4252147
> rwsem_opt_nospin=28920
> rwsem_rlock=2713129
> rwsem_rlock_fail=0
> rwsem_rlock_fast=5
> rwsem_rlock_handoff=280
> rwsem_rlock_steal=1486617
> rwsem_sleep_reader=2713085
> rwsem_sleep_writer=4313369
> rwsem_wake_reader=29876
> rwsem_wake_writer=5829160
> rwsem_wlock=127305
> rwsem_wlock_fail=0
> rwsem_wlock_handoff=2515
>
> After this patchset:
>
> Running locktest with rwsem [runtime = 15s, r% = 50%, load = 100]
> Threads = 96, Min/Mean/Max = 26,573/26,749/26,833
> Threads = 96, Total Rate = 171,184 op/s; Percpu Rate = 1,783 op/s
>
> rwsem_opt_fail=1265481
> rwsem_opt_lock=17939
> rwsem_rlock=1266157
> rwsem_rlock_fail=0
> rwsem_rlock_fast=0
> rwsem_rlock_handoff=0
> rwsem_rlock_steal=551
> rwsem_sleep_reader=1266157
> rwsem_sleep_writer=1265481
> rwsem_wake_reader=26612
> rwsem_wake_writer=0
> rwsem_wlock=1265481
> rwsem_wlock_ehandoff=94
> rwsem_wlock_fail=0
> rwsem_wlock_handoff=94
>
> So the locking rate is reduced to just 29.3% of the original. Looking at
> the number of successful writer lock stealings from optimistic spinning
> (rwsem_opt_lock), it is reduced from 4252147 to 17939. It is just about
> 0.4% of the original.
>
> So for workloads that have a lot of writer contention, there will be
> performance regressions. Do you mind if we try to keep the original
> logic of my patchset to allow write lock acquisition in writer slow
> path, but transfer the lock ownership in the wakeup path when handoff
> is required. We can do this with some minor code changes on top of your
> current patchset.
Urgh, sorry, I seem to have lost sight of this... those results,..
sadness :/
Yeah, I suppose there's nothing for it but to live with that mess,
but be very sure to add comments elucidating, for any future poor sod
reading it, why the code is the way it is.
* Re: [PATCH 3/6] locking/rwsem: Rework writer wakeup
2023-03-20 8:12 ` Peter Zijlstra
@ 2023-03-20 17:36 ` Waiman Long
0 siblings, 0 replies; 27+ messages in thread
From: Waiman Long @ 2023-03-20 17:36 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: mingo, will, linux-kernel, boqun.feng
On 3/20/23 04:12, Peter Zijlstra wrote:
> On Mon, Feb 27, 2023 at 03:16:25PM -0500, Waiman Long wrote:
>> On 2/27/23 05:31, Peter Zijlstra wrote:
>>>> I do have some concern that early lock transfer to a lock owner that has not
>>>> been woken up yet may suppress writer lock stealing from optimistic spinning
>>>> causing some performance regression in some cases. Let's see if the test
>>>> robot report anything.
>>> Ah yes, I suppose that is indeed a possibility. Given this is all under
>>> wait_lock and the spinner is not, I was hoping it would still have
>>> sufficient time to win. But yes, robots will tell us.
>>>
>> I run my rwsem locking microbenchmark on a 2-socket 96-thread x86-64
>> system with lock event turned on for 15 secs.
>>
>> Before this patchset:
>>
>> Running locktest with rwsem [runtime = 15s, r% = 50%, load = 100]
>> Threads = 96, Min/Mean/Max = 74,506/91,260/112,409
>> Threads = 96, Total Rate = 584,091 op/s; Percpu Rate = 6,084 op/s
>>
>> rwsem_opt_fail=127305
>> rwsem_opt_lock=4252147
>> rwsem_opt_nospin=28920
>> rwsem_rlock=2713129
>> rwsem_rlock_fail=0
>> rwsem_rlock_fast=5
>> rwsem_rlock_handoff=280
>> rwsem_rlock_steal=1486617
>> rwsem_sleep_reader=2713085
>> rwsem_sleep_writer=4313369
>> rwsem_wake_reader=29876
>> rwsem_wake_writer=5829160
>> rwsem_wlock=127305
>> rwsem_wlock_fail=0
>> rwsem_wlock_handoff=2515
>>
>> After this patchset:
>>
>> Running locktest with rwsem [runtime = 15s, r% = 50%, load = 100]
>> Threads = 96, Min/Mean/Max = 26,573/26,749/26,833
>> Threads = 96, Total Rate = 171,184 op/s; Percpu Rate = 1,783 op/s
>>
>> rwsem_opt_fail=1265481
>> rwsem_opt_lock=17939
>> rwsem_rlock=1266157
>> rwsem_rlock_fail=0
>> rwsem_rlock_fast=0
>> rwsem_rlock_handoff=0
>> rwsem_rlock_steal=551
>> rwsem_sleep_reader=1266157
>> rwsem_sleep_writer=1265481
>> rwsem_wake_reader=26612
>> rwsem_wake_writer=0
>> rwsem_wlock=1265481
>> rwsem_wlock_ehandoff=94
>> rwsem_wlock_fail=0
>> rwsem_wlock_handoff=94
>>
>> So the locking rate is reduced to just 29.3% of the original. Looking at
>> the number of successful writer lock stealings from optimistic spinning
>> (rwsem_opt_lock), it is reduced from 4252147 to 17939. It is just about
>> 0.4% of the original.
>>
>> So for workloads that have a lot of writer contention, there will be
>> performance regressions. Do you mind if we try to keep the original
>> logic of my patchset to allow write lock acquisition in writer slow
>> path, but transfer the lock ownership in the wakeup path when handoff
>> is required. We can do this with some minor code changes on top of your
>> current patchset.
> Urgh, sorry, I seem to have lost sight of this... those results,..
> sadness :/
>
> Yeah, I suppose there's nothing for it but to have to live with that mess,
> be very sure to add comments elucidating for any future poor sod reading it
> why the code is the way it is.
OK, I will add additional patches on top of your series to remediate the
performance degradation. I am hoping to get that done either by the end of
the week or early next week.
Thanks,
Longman
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 4/6] locking/rwsem: Split out rwsem_reader_wake()
2023-02-23 12:26 [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff Peter Zijlstra
` (2 preceding siblings ...)
2023-02-23 12:26 ` [PATCH 3/6] locking/rwsem: Rework writer wakeup Peter Zijlstra
@ 2023-02-23 12:26 ` Peter Zijlstra
2023-02-23 12:26 ` [PATCH 5/6] locking/rwsem: Unify wait loop Peter Zijlstra
` (2 subsequent siblings)
6 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-23 12:26 UTC (permalink / raw)
To: longman, mingo, will; +Cc: linux-kernel, peterz, boqun.feng
To provide symmetry with rwsem_writer_wake().
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/locking/rwsem.c | 84 +++++++++++++++++++++++++++----------------------
1 file changed, 47 insertions(+), 37 deletions(-)
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -106,9 +106,9 @@
* atomic_long_cmpxchg() will be used to obtain writer lock.
*
* There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers -- set, clear
+ * 1) rwsem_reader_wake() for readers -- set, clear
* 2) rwsem_writer_wake() for writers -- set, clear
- * 3) rwsem_del_waiter() -- clear
+ * 3) rwsem_del_waiter() -- clear
*
* For all the above cases, wait_lock will be held. A writer must also
* be the first one in the wait_list to be eligible for setting the handoff
@@ -377,8 +377,8 @@ rwsem_add_waiter(struct rw_semaphore *se
/*
* Remove a waiter from the wait_list and clear flags.
*
- * Both rwsem_mark_wake() and rwsem_writer_wake() contain a full 'copy' of
- * this function. Modify with care.
+ * Both rwsem_{reader,writer}_wake() contain a full 'copy' of this function.
+ * Modify with care.
*
* Return: true if wait_list isn't empty and false otherwise
*/
@@ -488,42 +488,15 @@ static void rwsem_writer_wake(struct rw_
rwsem_waiter_wake(waiter, wake_q);
}
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
- * have been set.
- * - there must be someone on the queue
- * - the wait_lock must be held by the caller
- * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
- * to actually wakeup the blocked task(s) and drop the reference count,
- * preferably when the wait_lock is released
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only marked woken if downgrading is false
- *
- * Implies rwsem_del_waiter() for all woken readers.
- */
-static void rwsem_mark_wake(struct rw_semaphore *sem,
- enum rwsem_wake_type wake_type,
- struct wake_q_head *wake_q)
+static void rwsem_reader_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type,
+ struct rwsem_waiter *waiter,
+ struct wake_q_head *wake_q)
{
- struct rwsem_waiter *waiter, *tmp;
long count, woken = 0, adjustment = 0;
+ struct rwsem_waiter *tmp;
struct list_head wlist;
- lockdep_assert_held(&sem->wait_lock);
-
- /*
- * Take a peek at the queue head waiter such that we can determine
- * the wakeup(s) to perform.
- */
- waiter = rwsem_first_waiter(sem);
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY)
- rwsem_writer_wake(sem, waiter, wake_q);
- return;
- }
-
/*
* No reader wakeup if there are too many of them already.
*/
@@ -635,6 +608,42 @@ static void rwsem_mark_wake(struct rw_se
}
/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
+ * have been set.
+ * - there must be someone on the queue
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ * to actually wakeup the blocked task(s) and drop the reference count,
+ * preferably when the wait_lock is released
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken waiters.
+ */
+static void rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type,
+ struct wake_q_head *wake_q)
+{
+ struct rwsem_waiter *waiter;
+
+ lockdep_assert_held(&sem->wait_lock);
+
+ /*
+ * Take a peek at the queue head waiter such that we can determine
+ * the wakeup(s) to perform.
+ */
+ waiter = rwsem_first_waiter(sem);
+
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wake_type == RWSEM_WAKE_ANY)
+ rwsem_writer_wake(sem, waiter, wake_q);
+ } else {
+ rwsem_reader_wake(sem, wake_type, waiter, wake_q);
+ }
+}
+
+/*
* Remove a waiter and try to wake up other waiters in the wait queue
* This function is called from the out_nolock path of both the reader and
* writer slowpaths with wait_lock held. It releases the wait_lock and
@@ -1017,9 +1026,10 @@ rwsem_down_read_slowpath(struct rw_semap
*/
if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
raw_spin_lock_irq(&sem->wait_lock);
- if (!list_empty(&sem->wait_list))
+ if (!list_empty(&sem->wait_list)) {
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
&wake_q);
+ }
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
}
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 5/6] locking/rwsem: Unify wait loop
2023-02-23 12:26 [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff Peter Zijlstra
` (3 preceding siblings ...)
2023-02-23 12:26 ` [PATCH 4/6] locking/rwsem: Split out rwsem_reader_wake() Peter Zijlstra
@ 2023-02-23 12:26 ` Peter Zijlstra
2023-02-23 19:31 ` Boqun Feng
2023-02-23 22:45 ` Waiman Long
2023-02-23 12:26 ` [PATCH 6/6] locking/rwsem: Use the force Peter Zijlstra
2023-02-24 1:19 ` [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff Waiman Long
6 siblings, 2 replies; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-23 12:26 UTC (permalink / raw)
To: longman, mingo, will; +Cc: linux-kernel, peterz, boqun.feng
Now that the reader and writer wait loops are identical, share the
code.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/locking/rwsem.c | 117 +++++++++++++++++++------------------------------
1 file changed, 47 insertions(+), 70 deletions(-)
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -650,13 +650,11 @@ static void rwsem_mark_wake(struct rw_se
* optionally wake up waiters before it returns.
*/
static inline void
-rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
- struct wake_q_head *wake_q)
+rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
__releases(&sem->wait_lock)
{
bool first = rwsem_first_waiter(sem) == waiter;
-
- wake_q_init(wake_q);
+ DEFINE_WAKE_Q(wake_q);
/*
* If the wait_list isn't empty and the waiter to be deleted is
@@ -664,10 +662,10 @@ rwsem_del_wake_waiter(struct rw_semaphor
* be eligible to acquire or spin on the lock.
*/
if (rwsem_del_waiter(sem, waiter) && first)
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
- if (!wake_q_empty(wake_q))
- wake_up_q(wake_q);
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
}
/*
@@ -993,6 +991,46 @@ static inline void rwsem_cond_wake_waite
rwsem_mark_wake(sem, wake_type, wake_q);
}
+#define waiter_type(_waiter, _r, _w) \
+ ((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
+
+static __always_inline struct rw_semaphore *
+rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
+{
+ trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
+
+ /* wait to be given the lock */
+ for (;;) {
+ set_current_state(state);
+ if (!smp_load_acquire(&waiter->task)) {
+ /* Matches rwsem_waiter_wake()'s smp_store_release(). */
+ break;
+ }
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter->task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
+ break;
+ }
+ schedule_preempt_disabled();
+ lockevent_inc(waiter_type(waiter, rwsem_sleep_reader, rwsem_sleep_writer));
+ }
+
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
+ trace_contention_end(sem, 0);
+ return sem;
+
+out_nolock:
+ rwsem_del_wake_waiter(sem, waiter);
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
+ trace_contention_end(sem, -EINTR);
+ return ERR_PTR(-EINTR);
+}
+
/*
* Wait for the read lock to be granted
*/
@@ -1071,38 +1109,7 @@ rwsem_down_read_slowpath(struct rw_semap
if (!wake_q_empty(&wake_q))
wake_up_q(&wake_q);
- trace_contention_begin(sem, LCB_F_READ);
-
- /* wait to be given the lock */
- for (;;) {
- set_current_state(state);
- if (!smp_load_acquire(&waiter.task)) {
- /* Matches rwsem_waiter_wake()'s smp_store_release(). */
- break;
- }
- if (signal_pending_state(state, current)) {
- raw_spin_lock_irq(&sem->wait_lock);
- if (waiter.task)
- goto out_nolock;
- raw_spin_unlock_irq(&sem->wait_lock);
- /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
- break;
- }
- schedule_preempt_disabled();
- lockevent_inc(rwsem_sleep_reader);
- }
-
- __set_current_state(TASK_RUNNING);
- lockevent_inc(rwsem_rlock);
- trace_contention_end(sem, 0);
- return sem;
-
-out_nolock:
- rwsem_del_wake_waiter(sem, &waiter, &wake_q);
- __set_current_state(TASK_RUNNING);
- lockevent_inc(rwsem_rlock_fail);
- trace_contention_end(sem, -EINTR);
- return ERR_PTR(-EINTR);
+ return rwsem_waiter_wait(sem, &waiter, state);
}
/*
@@ -1150,37 +1157,7 @@ rwsem_down_write_slowpath(struct rw_sema
}
raw_spin_unlock_irq(&sem->wait_lock);
- /* wait until we successfully acquire the lock */
- trace_contention_begin(sem, LCB_F_WRITE);
-
- for (;;) {
- set_current_state(state);
- if (!smp_load_acquire(&waiter.task)) {
- /* Matches rwsem_waiter_wake()'s smp_store_release(). */
- break;
- }
- if (signal_pending_state(state, current)) {
- raw_spin_lock_irq(&sem->wait_lock);
- if (waiter.task)
- goto out_nolock;
- raw_spin_unlock_irq(&sem->wait_lock);
- /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
- break;
- }
- schedule_preempt_disabled();
- lockevent_inc(rwsem_sleep_writer);
- }
- __set_current_state(TASK_RUNNING);
- lockevent_inc(rwsem_wlock);
- trace_contention_end(sem, 0);
- return sem;
-
-out_nolock:
- rwsem_del_wake_waiter(sem, &waiter, &wake_q);
- __set_current_state(TASK_RUNNING);
- lockevent_inc(rwsem_wlock_fail);
- trace_contention_end(sem, -EINTR);
- return ERR_PTR(-EINTR);
+ return rwsem_waiter_wait(sem, &waiter, state);
}
/*
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 5/6] locking/rwsem: Unify wait loop
2023-02-23 12:26 ` [PATCH 5/6] locking/rwsem: Unify wait loop Peter Zijlstra
@ 2023-02-23 19:31 ` Boqun Feng
2023-02-24 1:33 ` Boqun Feng
2023-02-23 22:45 ` Waiman Long
1 sibling, 1 reply; 27+ messages in thread
From: Boqun Feng @ 2023-02-23 19:31 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: longman, mingo, will, linux-kernel
On Thu, Feb 23, 2023 at 01:26:47PM +0100, Peter Zijlstra wrote:
> Now that the reader and writer wait loops are identical, share the
> code.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/locking/rwsem.c | 117 +++++++++++++++++++------------------------------
> 1 file changed, 47 insertions(+), 70 deletions(-)
>
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -650,13 +650,11 @@ static void rwsem_mark_wake(struct rw_se
> * optionally wake up waiters before it returns.
> */
> static inline void
> -rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
> - struct wake_q_head *wake_q)
> +rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
> __releases(&sem->wait_lock)
> {
> bool first = rwsem_first_waiter(sem) == waiter;
> -
> - wake_q_init(wake_q);
> + DEFINE_WAKE_Q(wake_q);
>
> /*
> * If the wait_list isn't empty and the waiter to be deleted is
> @@ -664,10 +662,10 @@ rwsem_del_wake_waiter(struct rw_semaphor
> * be eligible to acquire or spin on the lock.
> */
> if (rwsem_del_waiter(sem, waiter) && first)
> - rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
> + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
> raw_spin_unlock_irq(&sem->wait_lock);
> - if (!wake_q_empty(wake_q))
> - wake_up_q(wake_q);
> + if (!wake_q_empty(&wake_q))
> + wake_up_q(&wake_q);
> }
>
> /*
> @@ -993,6 +991,46 @@ static inline void rwsem_cond_wake_waite
> rwsem_mark_wake(sem, wake_type, wake_q);
> }
>
> +#define waiter_type(_waiter, _r, _w) \
> + ((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
> +
> +static __always_inline struct rw_semaphore *
> +rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
> +{
> + trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
> +
> + /* wait to be given the lock */
> + for (;;) {
> + set_current_state(state);
> + if (!smp_load_acquire(&waiter->task)) {
> + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> + break;
> + }
> + if (signal_pending_state(state, current)) {
> + raw_spin_lock_irq(&sem->wait_lock);
Move the below __set_current_state(TASK_RUNNING)s up here? I think we
need the preemption protection when changing the task state here.
> + if (waiter->task)
> + goto out_nolock;
I originally wanted to suggest renaming the label to "out_locked", but I
think we can just move the labeled code up here? And even open-code
rwsem_del_wake_waiter() since it only has one usage.
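For illustration, a rough sketch of what that open-coded path could look
like, reusing the rwsem_del_wake_waiter() body shown earlier in this patch
(a hypothetical shape only, not something that was posted):

	out_nolock:
		__set_current_state(TASK_RUNNING);
		{
			DEFINE_WAKE_Q(wake_q);
			bool first = rwsem_first_waiter(sem) == waiter;

			/* Inlined body of rwsem_del_wake_waiter(). */
			if (rwsem_del_waiter(sem, waiter) && first)
				rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
			raw_spin_unlock_irq(&sem->wait_lock);
			if (!wake_q_empty(&wake_q))
				wake_up_q(&wake_q);
		}
		lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
		trace_contention_end(sem, -EINTR);
		return ERR_PTR(-EINTR);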
Regards,
Boqun
> + raw_spin_unlock_irq(&sem->wait_lock);
> + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> + break;
> + }
> + schedule_preempt_disabled();
> + lockevent_inc(waiter_type(waiter, rwsem_sleep_reader, rwsem_sleep_writer));
> + }
> +
> + __set_current_state(TASK_RUNNING);
> + lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
> + trace_contention_end(sem, 0);
> + return sem;
> +
> +out_nolock:
> + rwsem_del_wake_waiter(sem, waiter);
> + __set_current_state(TASK_RUNNING);
> + lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
> + trace_contention_end(sem, -EINTR);
> + return ERR_PTR(-EINTR);
> +}
> +
> /*
> * Wait for the read lock to be granted
> */
> @@ -1071,38 +1109,7 @@ rwsem_down_read_slowpath(struct rw_semap
> if (!wake_q_empty(&wake_q))
> wake_up_q(&wake_q);
>
> - trace_contention_begin(sem, LCB_F_READ);
> -
> - /* wait to be given the lock */
> - for (;;) {
> - set_current_state(state);
> - if (!smp_load_acquire(&waiter.task)) {
> - /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> - break;
> - }
> - if (signal_pending_state(state, current)) {
> - raw_spin_lock_irq(&sem->wait_lock);
> - if (waiter.task)
> - goto out_nolock;
> - raw_spin_unlock_irq(&sem->wait_lock);
> - /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> - break;
> - }
> - schedule_preempt_disabled();
> - lockevent_inc(rwsem_sleep_reader);
> - }
> -
> - __set_current_state(TASK_RUNNING);
> - lockevent_inc(rwsem_rlock);
> - trace_contention_end(sem, 0);
> - return sem;
> -
> -out_nolock:
> - rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> - __set_current_state(TASK_RUNNING);
> - lockevent_inc(rwsem_rlock_fail);
> - trace_contention_end(sem, -EINTR);
> - return ERR_PTR(-EINTR);
> + return rwsem_waiter_wait(sem, &waiter, state);
> }
>
> /*
> @@ -1150,37 +1157,7 @@ rwsem_down_write_slowpath(struct rw_sema
> }
> raw_spin_unlock_irq(&sem->wait_lock);
>
> - /* wait until we successfully acquire the lock */
> - trace_contention_begin(sem, LCB_F_WRITE);
> -
> - for (;;) {
> - set_current_state(state);
> - if (!smp_load_acquire(&waiter.task)) {
> - /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> - break;
> - }
> - if (signal_pending_state(state, current)) {
> - raw_spin_lock_irq(&sem->wait_lock);
> - if (waiter.task)
> - goto out_nolock;
> - raw_spin_unlock_irq(&sem->wait_lock);
> - /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> - break;
> - }
> - schedule_preempt_disabled();
> - lockevent_inc(rwsem_sleep_writer);
> - }
> - __set_current_state(TASK_RUNNING);
> - lockevent_inc(rwsem_wlock);
> - trace_contention_end(sem, 0);
> - return sem;
> -
> -out_nolock:
> - rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> - __set_current_state(TASK_RUNNING);
> - lockevent_inc(rwsem_wlock_fail);
> - trace_contention_end(sem, -EINTR);
> - return ERR_PTR(-EINTR);
> + return rwsem_waiter_wait(sem, &waiter, state);
> }
>
> /*
>
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 5/6] locking/rwsem: Unify wait loop
2023-02-23 19:31 ` Boqun Feng
@ 2023-02-24 1:33 ` Boqun Feng
2023-02-26 12:01 ` Peter Zijlstra
0 siblings, 1 reply; 27+ messages in thread
From: Boqun Feng @ 2023-02-24 1:33 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: longman, mingo, will, linux-kernel
On Thu, Feb 23, 2023 at 11:31:47AM -0800, Boqun Feng wrote:
[..]
> > +#define waiter_type(_waiter, _r, _w) \
> > + ((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
> > +
> > +static __always_inline struct rw_semaphore *
> > +rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
> > +{
> > + trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
> > +
> > + /* wait to be given the lock */
> > + for (;;) {
> > + set_current_state(state);
> > + if (!smp_load_acquire(&waiter->task)) {
> > + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > + break;
> > + }
> > + if (signal_pending_state(state, current)) {
> > + raw_spin_lock_irq(&sem->wait_lock);
>
> Move the below __set_current_state(TASK_RUNNING)s up here? I think we
> need the preemption protection when changing the task state here.
>
Nevermind since we have the preemption protection for the whole
function... but merging two __set_current_state()s into one still looks
good.
Regards,
Boqun
> > + if (waiter->task)
> > + goto out_nolock;
>
[...]
> > +
> > + __set_current_state(TASK_RUNNING);
> > + lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
> > + trace_contention_end(sem, 0);
> > + return sem;
> > +
> > +out_nolock:
> > + rwsem_del_wake_waiter(sem, waiter);
> > + __set_current_state(TASK_RUNNING);
> > + lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
> > + trace_contention_end(sem, -EINTR);
> > + return ERR_PTR(-EINTR);
> > +}
> > +
> > /*
> > * Wait for the read lock to be granted
> > */
> > @@ -1071,38 +1109,7 @@ rwsem_down_read_slowpath(struct rw_semap
> > if (!wake_q_empty(&wake_q))
> > wake_up_q(&wake_q);
> >
> > - trace_contention_begin(sem, LCB_F_READ);
> > -
> > - /* wait to be given the lock */
> > - for (;;) {
> > - set_current_state(state);
> > - if (!smp_load_acquire(&waiter.task)) {
> > - /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > - break;
> > - }
> > - if (signal_pending_state(state, current)) {
> > - raw_spin_lock_irq(&sem->wait_lock);
> > - if (waiter.task)
> > - goto out_nolock;
> > - raw_spin_unlock_irq(&sem->wait_lock);
> > - /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> > - break;
> > - }
> > - schedule_preempt_disabled();
> > - lockevent_inc(rwsem_sleep_reader);
> > - }
> > -
> > - __set_current_state(TASK_RUNNING);
> > - lockevent_inc(rwsem_rlock);
> > - trace_contention_end(sem, 0);
> > - return sem;
> > -
> > -out_nolock:
> > - rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> > - __set_current_state(TASK_RUNNING);
> > - lockevent_inc(rwsem_rlock_fail);
> > - trace_contention_end(sem, -EINTR);
> > - return ERR_PTR(-EINTR);
> > + return rwsem_waiter_wait(sem, &waiter, state);
> > }
> >
> > /*
> > @@ -1150,37 +1157,7 @@ rwsem_down_write_slowpath(struct rw_sema
> > }
> > raw_spin_unlock_irq(&sem->wait_lock);
> >
> > - /* wait until we successfully acquire the lock */
> > - trace_contention_begin(sem, LCB_F_WRITE);
> > -
> > - for (;;) {
> > - set_current_state(state);
> > - if (!smp_load_acquire(&waiter.task)) {
> > - /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > - break;
> > - }
> > - if (signal_pending_state(state, current)) {
> > - raw_spin_lock_irq(&sem->wait_lock);
> > - if (waiter.task)
> > - goto out_nolock;
> > - raw_spin_unlock_irq(&sem->wait_lock);
> > - /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> > - break;
> > - }
> > - schedule_preempt_disabled();
> > - lockevent_inc(rwsem_sleep_writer);
> > - }
> > - __set_current_state(TASK_RUNNING);
> > - lockevent_inc(rwsem_wlock);
> > - trace_contention_end(sem, 0);
> > - return sem;
> > -
> > -out_nolock:
> > - rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> > - __set_current_state(TASK_RUNNING);
> > - lockevent_inc(rwsem_wlock_fail);
> > - trace_contention_end(sem, -EINTR);
> > - return ERR_PTR(-EINTR);
> > + return rwsem_waiter_wait(sem, &waiter, state);
> > }
> >
> > /*
> >
> >
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 5/6] locking/rwsem: Unify wait loop
2023-02-24 1:33 ` Boqun Feng
@ 2023-02-26 12:01 ` Peter Zijlstra
2023-02-26 18:22 ` Boqun Feng
0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-26 12:01 UTC (permalink / raw)
To: Boqun Feng; +Cc: longman, mingo, will, linux-kernel
On Thu, Feb 23, 2023 at 05:33:53PM -0800, Boqun Feng wrote:
> On Thu, Feb 23, 2023 at 11:31:47AM -0800, Boqun Feng wrote:
> [..]
> > > +#define waiter_type(_waiter, _r, _w) \
> > > + ((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
> > > +
> > > +static __always_inline struct rw_semaphore *
> > > +rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
> > > +{
> > > + trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
> > > +
> > > + /* wait to be given the lock */
> > > + for (;;) {
> > > + set_current_state(state);
> > > + if (!smp_load_acquire(&waiter->task)) {
> > > + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > > + break;
> > > + }
> > > + if (signal_pending_state(state, current)) {
> > > + raw_spin_lock_irq(&sem->wait_lock);
> >
> > Move the below __set_current_state(TASK_RUNNING)s up here? I think we
> > need the preemption protection when changing the task state here.
> >
>
> Nevermind since we have the preemption protection for the whole
> function... but merging two __set_current_state()s into one still looks
> good.
Even if it were not; I still don't understand the concern. Preemption
ignores task state.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 5/6] locking/rwsem: Unify wait loop
2023-02-26 12:01 ` Peter Zijlstra
@ 2023-02-26 18:22 ` Boqun Feng
0 siblings, 0 replies; 27+ messages in thread
From: Boqun Feng @ 2023-02-26 18:22 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: longman, mingo, will, linux-kernel
On Sun, Feb 26, 2023 at 01:01:10PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 23, 2023 at 05:33:53PM -0800, Boqun Feng wrote:
> > On Thu, Feb 23, 2023 at 11:31:47AM -0800, Boqun Feng wrote:
> > [..]
> > > > +#define waiter_type(_waiter, _r, _w) \
> > > > + ((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
> > > > +
> > > > +static __always_inline struct rw_semaphore *
> > > > +rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
> > > > +{
> > > > + trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
> > > > +
> > > > + /* wait to be given the lock */
> > > > + for (;;) {
> > > > + set_current_state(state);
> > > > + if (!smp_load_acquire(&waiter->task)) {
> > > > + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> > > > + break;
> > > > + }
> > > > + if (signal_pending_state(state, current)) {
> > > > + raw_spin_lock_irq(&sem->wait_lock);
> > >
> > > Move the below __set_current_state(TASK_RUNNING)s up here? I think we
> > > need the preemption protection when changing the task state here.
> > >
> >
> > Nevermind since we have the preemption protection for the whole
> > function... but merging two __set_current_state()s into one still looks
> > good.
>
> Even if it were not; I still don't understand the concern. Preemption
> ignores task state.
Because I missed the exact thing you just mentioned... ;-)
I was worried about the following case:
    ttwu();
    set_current_state(TASK_UNINTERRUPTIBLE);
    ....
    <preemption enable>
    <preempted>
      preempt_schedule_irq():
        __schedule(...):
          deactivate_task(); // Wakeup missed.
However, this is not true, since the __schedule() in preempt_schedule_irq()
is an SM_PREEMPT one.
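Concretely, a paraphrased sketch of the relevant check in __schedule()
(kernel/sched/core.c, around v6.2) -- only a voluntary schedule looks at
the task state at all:

	if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
		if (signal_pending_state(prev_state, prev))
			WRITE_ONCE(prev->__state, TASK_RUNNING);
		else
			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
	}
	/* SM_PREEMPT (preempt_schedule_irq()) skips this block entirely,
	 * so a preempted task stays on the runqueue whatever its state. */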
Sorry for the noise then. But it's good for me to revisit this stuff ;-)
Regards,
Boqun
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 5/6] locking/rwsem: Unify wait loop
2023-02-23 12:26 ` [PATCH 5/6] locking/rwsem: Unify wait loop Peter Zijlstra
2023-02-23 19:31 ` Boqun Feng
@ 2023-02-23 22:45 ` Waiman Long
2023-02-26 16:15 ` Peter Zijlstra
1 sibling, 1 reply; 27+ messages in thread
From: Waiman Long @ 2023-02-23 22:45 UTC (permalink / raw)
To: Peter Zijlstra, mingo, will; +Cc: linux-kernel, boqun.feng
[-- Attachment #1: Type: text/plain, Size: 5211 bytes --]
On 2/23/23 07:26, Peter Zijlstra wrote:
> Now that the reader and writer wait loops are identical, share the
> code.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/locking/rwsem.c | 117 +++++++++++++++++++------------------------------
> 1 file changed, 47 insertions(+), 70 deletions(-)
>
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -650,13 +650,11 @@ static void rwsem_mark_wake(struct rw_se
> * optionally wake up waiters before it returns.
> */
> static inline void
> -rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
> - struct wake_q_head *wake_q)
> +rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
> __releases(&sem->wait_lock)
> {
> bool first = rwsem_first_waiter(sem) == waiter;
> -
> - wake_q_init(wake_q);
> + DEFINE_WAKE_Q(wake_q);
>
> /*
> * If the wait_list isn't empty and the waiter to be deleted is
> @@ -664,10 +662,10 @@ rwsem_del_wake_waiter(struct rw_semaphor
> * be eligible to acquire or spin on the lock.
> */
> if (rwsem_del_waiter(sem, waiter) && first)
> - rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
> + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
> raw_spin_unlock_irq(&sem->wait_lock);
> - if (!wake_q_empty(wake_q))
> - wake_up_q(wake_q);
> + if (!wake_q_empty(&wake_q))
> + wake_up_q(&wake_q);
> }
>
> /*
> @@ -993,6 +991,46 @@ static inline void rwsem_cond_wake_waite
> rwsem_mark_wake(sem, wake_type, wake_q);
> }
>
> +#define waiter_type(_waiter, _r, _w) \
> + ((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
> +
> +static __always_inline struct rw_semaphore *
> +rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
> +{
> + trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
> +
> + /* wait to be given the lock */
> + for (;;) {
> + set_current_state(state);
> + if (!smp_load_acquire(&waiter->task)) {
> + /* Matches rwsem_waiter_wake()'s smp_store_release(). */
> + break;
> + }
> + if (signal_pending_state(state, current)) {
> + raw_spin_lock_irq(&sem->wait_lock);
> + if (waiter->task)
> + goto out_nolock;
> + raw_spin_unlock_irq(&sem->wait_lock);
> + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> + break;
> + }
> + schedule_preempt_disabled();
> + lockevent_inc(waiter_type(waiter, rwsem_sleep_reader, rwsem_sleep_writer));
> + }
> +
> + __set_current_state(TASK_RUNNING);
> + lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
> + trace_contention_end(sem, 0);
> + return sem;
> +
> +out_nolock:
> + rwsem_del_wake_waiter(sem, waiter);
> + __set_current_state(TASK_RUNNING);
Similar to Boqun's comment, we should move __set_current_state() before
rwsem_del_wake_waiter().
Unfortunately, lockevent_inc() doesn't work with waiter_type() like that
as the compilation will fail if CONFIG_LOCK_EVENT_COUNTS is enabled.
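The failure mode is just token pasting: going by the lockevent_inc()
definition in kernel/locking/lock_events.h, the event name has to be a
literal identifier, e.g.:

	#define lockevent_inc(ev)  __lockevent_inc(LOCKEVENT_ ##ev, true)

	/* so */
	lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
	/* pastes into the non-existent enumerator */
	__lockevent_inc(LOCKEVENT_waiter_type(waiter, rwsem_rlock, rwsem_wlock), true);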
Could you include the attached patch in your series and make the
following changes?
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index deb0d016a6ce..5b14b0d076fd 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1021,13 +1021,14 @@ static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
rwsem_mark_wake(sem, wake_type, wake_q);
}
-#define waiter_type(_waiter, _r, _w) \
- ((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
+#define waiter_type(_reader, _r, _w) ((_reader) ? (_r) : (_w))
static __always_inline struct rw_semaphore *
rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
{
- trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
+ bool reader = waiter->type == RWSEM_WAITING_FOR_READ;
+
+ trace_contention_begin(sem, waiter_type(reader, LCB_F_READ, LCB_F_WRITE));
/* wait to be given the lock */
for (;;) {
@@ -1045,18 +1046,18 @@ rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int sta
break;
}
schedule_preempt_disabled();
- lockevent_inc(waiter_type(waiter, rwsem_sleep_reader, rwsem_sleep_writer));
+ lockevent_cond_inc2(reader, rwsem_sleep_reader, rwsem_sleep_writer);
}
__set_current_state(TASK_RUNNING);
- lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
+ lockevent_cond_inc2(reader, rwsem_rlock, rwsem_wlock);
trace_contention_end(sem, 0);
return sem;
out_nolock:
rwsem_del_wake_waiter(sem, waiter);
__set_current_state(TASK_RUNNING);
- lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
+ lockevent_cond_inc2(reader, rwsem_rlock_fail, rwsem_wlock_fail);
trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
Thanks,
Longman
[-- Attachment #2: 0001-locking-lock_events-Add-a-new-lockevent_cond_inc2-he.patch --]
[-- Type: text/x-patch, Size: 1724 bytes --]
From 490fb153006941ec7b576c9e89cb220a0739a95c Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 23 Feb 2023 17:32:06 -0500
Subject: [PATCH] locking/lock_events: Add a new lockevent_cond_inc2() helper
Add a new lockevent_cond_inc2(cond, true_event, false_event) helper
to conditionally increment one of the 2 given events.
Signed-off-by: Waiman Long <longman@redhat.com>
---
kernel/locking/lock_events.h | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index 8c7e7d25f09c..668c2f1397f6 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -43,6 +43,21 @@ static inline void __lockevent_inc(enum lock_events event, bool cond)
#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c)
+/*
+ * Increment either the tevent (cond true) or fevent (cond false)
+ */
+static inline void __lockevent_cond_inc2(bool cond, enum lock_events tevent,
+ enum lock_events fevent)
+{
+ if (cond)
+ raw_cpu_inc(lockevents[tevent]);
+ else
+ raw_cpu_inc(lockevents[fevent]);
+
+}
+#define lockevent_cond_inc2(c, tev, fev) \
+ __lockevent_cond_inc2(c, LOCKEVENT_ ##tev, LOCKEVENT_ ##fev)
+
static inline void __lockevent_add(enum lock_events event, int inc)
{
raw_cpu_add(lockevents[event], inc);
@@ -55,6 +70,7 @@ static inline void __lockevent_add(enum lock_events event, int inc)
#define lockevent_inc(ev)
#define lockevent_add(ev, c)
#define lockevent_cond_inc(ev, c)
+#define lockevent_cond_inc2(c, tev, fev)
#endif /* CONFIG_LOCK_EVENT_COUNTS */
#endif /* __LOCKING_LOCK_EVENTS_H */
--
2.31.1
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 5/6] locking/rwsem: Unify wait loop
2023-02-23 22:45 ` Waiman Long
@ 2023-02-26 16:15 ` Peter Zijlstra
0 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-26 16:15 UTC (permalink / raw)
To: Waiman Long; +Cc: mingo, will, linux-kernel, boqun.feng
On Thu, Feb 23, 2023 at 05:45:56PM -0500, Waiman Long wrote:
> Unfortunately, lockevent_inc() doesn't work with waiter_type() like that as
> the compilation will fail if CONFIG_LOCK_EVENT_COUNTS is enabled. Could you
> include the attached patch in your series and make the following changes?
Yeah, robot told me; fixed it like so:
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -995,13 +995,16 @@ static inline void rwsem_cond_wake_waite
rwsem_mark_wake(sem, wake_type, wake_q);
}
-#define waiter_type(_waiter, _r, _w) \
- ((_waiter)->type == RWSEM_WAITING_FOR_READ ? (_r) : (_w))
+#define lockevent_rw_inc(rd, evr, evw) do { \
+ lockevent_cond_inc(evr, (rd)); \
+ lockevent_cond_inc(evw, !(rd)); \
+} while (0)
static __always_inline struct rw_semaphore *
-rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter, int state)
+rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
+ int state, bool reader)
{
- trace_contention_begin(sem, waiter_type(waiter, LCB_F_READ, LCB_F_WRITE));
+ trace_contention_begin(sem, reader ? LCB_F_READ : LCB_F_WRITE);
/* wait to be given the lock */
for (;;) {
@@ -1019,18 +1022,19 @@ rwsem_waiter_wait(struct rw_semaphore *s
break;
}
schedule_preempt_disabled();
- lockevent_inc(waiter_type(waiter, rwsem_sleep_reader, rwsem_sleep_writer));
+ lockevent_rw_inc(reader, rwsem_sleep_reader, rwsem_sleep_writer);
}
__set_current_state(TASK_RUNNING);
- lockevent_inc(waiter_type(waiter, rwsem_rlock, rwsem_wlock));
+
+ lockevent_rw_inc(reader, rwsem_rlock, rwsem_wlock);
trace_contention_end(sem, 0);
return sem;
out_nolock:
rwsem_del_wake_waiter(sem, waiter);
__set_current_state(TASK_RUNNING);
- lockevent_inc(waiter_type(waiter, rwsem_rlock_fail, rwsem_wlock_fail));
+ lockevent_rw_inc(reader, rwsem_rlock_fail, rwsem_wlock_fail);
trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
@@ -1112,7 +1116,7 @@ rwsem_down_read_slowpath(struct rw_semap
if (!wake_q_empty(&wake_q))
wake_up_q(&wake_q);
- return rwsem_waiter_wait(sem, &waiter, state);
+ return rwsem_waiter_wait(sem, &waiter, state, true);
}
/*
@@ -1162,7 +1166,7 @@ rwsem_down_write_slowpath(struct rw_sema
}
raw_spin_unlock_irq(&sem->wait_lock);
- return rwsem_waiter_wait(sem, &waiter, state);
+ return rwsem_waiter_wait(sem, &waiter, state, false);
}
/*
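For the record, a quick sketch of why this form compiles where waiter_type()
did not: each lockevent_cond_inc() now receives a literal event name, so the
LOCKEVENT_ token paste yields a real enumerator and the condition only
selects which counter is bumped:

	lockevent_rw_inc(reader, rwsem_rlock, rwsem_wlock);

	/* expands (conceptually) to */
	lockevent_cond_inc(rwsem_rlock, (reader));
	lockevent_cond_inc(rwsem_wlock, !(reader));

	/* and hence to */
	__lockevent_inc(LOCKEVENT_rwsem_rlock, (reader));
	__lockevent_inc(LOCKEVENT_rwsem_wlock, !(reader));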
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 6/6] locking/rwsem: Use the force
2023-02-23 12:26 [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff Peter Zijlstra
` (4 preceding siblings ...)
2023-02-23 12:26 ` [PATCH 5/6] locking/rwsem: Unify wait loop Peter Zijlstra
@ 2023-02-23 12:26 ` Peter Zijlstra
2023-02-24 1:19 ` [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff Waiman Long
6 siblings, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2023-02-23 12:26 UTC (permalink / raw)
To: longman, mingo, will; +Cc: linux-kernel, peterz, boqun.feng
Now that the writer adjustment is done from the wakeup side and
HANDOFF guarantees spinning/stealing is disabled, use the combined
guarantee to ignore spurious READER_BIAS and directly claim the lock.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/locking/lock_events_list.h | 1 +
kernel/locking/rwsem.c | 21 +++++++++++++++++++++
2 files changed, 22 insertions(+)
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -67,3 +67,4 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of
LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
+LOCK_EVENT(rwsem_wlock_ehandoff) /* # of write lock early handoffs */
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -433,6 +433,26 @@ static void rwsem_writer_wake(struct rw_
lockdep_assert_held(&sem->wait_lock);
count = atomic_long_read(&sem->count);
+
+ /*
+ * Since rwsem_mark_wake() is only called (with WAKE_ANY) when
+ * the lock is unlocked, and the HANDOFF bit guarantees that
+ * all spinning / stealing is disabled, it is possible to
+ * unconditionally claim the lock -- any READER_BIAS will be
+ * temporary.
+ */
+ if (count & RWSEM_FLAG_HANDOFF) {
+ unsigned long adjustment = RWSEM_WRITER_LOCKED - RWSEM_FLAG_HANDOFF;
+
+ if (list_is_singular(&sem->wait_list))
+ adjustment -= RWSEM_FLAG_WAITERS;
+
+ atomic_long_set(&sem->owner, (long)waiter->task);
+ atomic_long_add(adjustment, &sem->count);
+ lockevent_inc(rwsem_wlock_ehandoff);
+ goto success;
+ }
+
do {
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
@@ -479,6 +499,7 @@ static void rwsem_writer_wake(struct rw_
return;
}
+success:
/*
* Have rwsem_writer_wake() fully imply rwsem_del_waiter() on
* success.
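As a reading aid (assuming the usual rwsem count layout, where these are
distinct bits in sem->count): the unconditional atomic_long_add() above
applies, in one operation,

	RWSEM_WRITER_LOCKED - RWSEM_FLAG_HANDOFF [- RWSEM_FLAG_WAITERS]

i.e. it sets the writer-locked bit, clears the handoff bit, and drops the
waiters bit when this was the only waiter. Readers that raced in and added
a READER_BIAS will see the writer bit and back out in their slow path,
which is what makes the spurious READER_BIAS "temporary".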
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff
2023-02-23 12:26 [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff Peter Zijlstra
` (5 preceding siblings ...)
2023-02-23 12:26 ` [PATCH 6/6] locking/rwsem: Use the force Peter Zijlstra
@ 2023-02-24 1:19 ` Waiman Long
2023-02-24 11:55 ` Jiri Wiesner
6 siblings, 1 reply; 27+ messages in thread
From: Waiman Long @ 2023-02-24 1:19 UTC (permalink / raw)
To: Peter Zijlstra, mingo, will; +Cc: linux-kernel, boqun.feng
On 2/23/23 07:26, Peter Zijlstra wrote:
> Hi,
>
> these here few patches boot but are otherwise very much untested. Please test.
I like the unification that you have done with this series.
However, I got the following hung task message when doing a kernel build:
[ 2215.893058] </TASK>
[ 2215.895252] INFO: task pahole:65220 blocked for more than 123 seconds.
[ 2215.901776] Tainted: G S OE 6.2.0-test+ #1
[ 2215.907520] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 2215.915347] task:pahole state:D stack:0 pid:65220 ppid:65065 flags:0x00000000
[ 2215.923690] Call Trace:
[ 2215.926146] <TASK>
[ 2215.928250] __schedule+0x367/0x950
[ 2215.931741] schedule+0x50/0xc0
[ 2215.934888] schedule_preempt_disabled+0x11/0x20
[ 2215.939507] rwsem_down_read_slowpath+0x28c/0x520
[ 2215.944215] down_read+0x98/0xc0
[ 2215.947446] do_user_addr_fault+0x410/0x700
[ 2215.951633] exc_page_fault+0x64/0x140
[ 2215.955385] asm_exc_page_fault+0x22/0x30
[ 2215.959394] RIP: 0033:0x7f37f8808dd2
[ 2215.962974] RSP: 002b:00007ffe7bc89e80 EFLAGS: 00010246
[ 2215.968199] RAX: 00007f377d7fb000 RBX: 00007f377dffb700 RCX: 00007f37f8439a1b
[ 2215.975332] RDX: 0000000000000003 RSI: 0000000000800000 RDI: 00007f377d7fc000
[ 2215.982465] RBP: 00007ffe7bc89f50 R08: 00000000ffffffff R09: 0000000000000000
[ 2215.989594] R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000001000
[ 2215.996728] R13: 00007ffe7bc89ee0 R14: 0000000000000000 R15: 0000000000801000
[ 2216.003862] </TASK>
I am a bit tired now. I will look at the series again tomorrow to see if
there is something missing.
Cheers,
Longman
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff
2023-02-24 1:19 ` [PATCH 0/6] locking/rwsem: Rework writer wakeup and handoff Waiman Long
@ 2023-02-24 11:55 ` Jiri Wiesner
0 siblings, 0 replies; 27+ messages in thread
From: Jiri Wiesner @ 2023-02-24 11:55 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Waiman Long, mingo, will, linux-kernel, boqun.feng
On Thu, Feb 23, 2023 at 08:19:46PM -0500, Waiman Long wrote:
> On 2/23/23 07:26, Peter Zijlstra wrote:
> > Hi,
> >
> > these here few patches boot but are otherwise very much untested. Please test.
>
> However, I got the following task hanging message when doing a kernel build:
> [ 2215.895252] INFO: task pahole:65220 blocked for more than 123 seconds.
I was running locktorture and ran into the same problem as Waiman. It's unrelated to the locktorture workload, though:
[ 1482.886856] INFO: task btrfs-transacti:1073 blocked for more than 491 seconds.
[ 1482.895755] Tainted: G E 6.2.0-pz1 #1
[ 1482.912382] task:btrfs-transacti state:D stack:0 pid:1073 ppid:2 flags:0x00004000
[ 1482.922544] Call Trace:
[ 1482.926074] <TASK>
[ 1482.929211] __schedule+0x3c0/0x1360
[ 1482.954121] schedule+0x5c/0xc0
[ 1482.958428] schedule_preempt_disabled+0x11/0x20
[ 1482.964396] rwsem_down_write_slowpath+0x17c/0x580
[ 1482.975260] down_write+0x57/0x60
[ 1482.979763] __btrfs_tree_lock+0x17/0x90 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1482.989779] btrfs_lock_root_node+0x3b/0x90 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.000082] btrfs_search_slot+0x2b7/0xc70 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.010267] btrfs_lookup_file_extent+0x4a/0x70 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.020949] btrfs_drop_extents+0x12e/0xf20 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.031252] insert_reserved_file_extent+0xec/0x2e0 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.053705] insert_prealloc_file_extent+0xb9/0x1b0 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.064774] __btrfs_prealloc_file_range+0x12c/0x420 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.075941] cache_save_setup+0x26d/0x3d0 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.086052] btrfs_setup_space_cache+0x9c/0xc0 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.096648] commit_cowonly_roots+0xd9/0x279 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.107046] btrfs_commit_transaction+0x8e2/0xe70 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.128221] transaction_kthread+0x14e/0x1b0 [btrfs a6f0f85f39d8ec2ab376bf2ae3a09f935847037e]
[ 1483.149588] kthread+0xd7/0x100
[ 1483.158891] ret_from_fork+0x29/0x50
[ 1483.163689] </TASK>
--
Jiri Wiesner
SUSE Labs
^ permalink raw reply [flat|nested] 27+ messages in thread