linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section
@ 2022-02-10 18:10 Waiman Long
  0 siblings, 0 replies; 10+ messages in thread
From: Waiman Long @ 2022-02-10 18:10 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng
  Cc: linux-kernel, Waiman Long

The following lockdep splat was observed:

[ 9776.459819] ======================================================
[ 9776.459820] WARNING: possible circular locking dependency detected
[ 9776.459821] 5.14.0-0.rc4.35.el9.x86_64+debug #1 Not tainted
[ 9776.459823] ------------------------------------------------------
[ 9776.459824] stress-ng/117708 is trying to acquire lock:
[ 9776.459825] ffffffff892d41d8 ((console_sem).lock){-...}-{2:2}, at: down_trylock+0x13/0x70

[ 9776.459831] but task is already holding lock:
[ 9776.459832] ffff888e005f6d18 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x27/0x130

[ 9776.459837] which lock already depends on the new lock.
      :
[ 9776.459857] -> #1 (&p->pi_lock){-.-.}-{2:2}:
[ 9776.459860]        __lock_acquire+0xb72/0x1870
[ 9776.459861]        lock_acquire+0x1ca/0x570
[ 9776.459862]        _raw_spin_lock_irqsave+0x40/0x90
[ 9776.459863]        try_to_wake_up+0x9d/0x1210
[ 9776.459864]        up+0x7a/0xb0
[ 9776.459864]        __up_console_sem+0x33/0x70
[ 9776.459865]        console_unlock+0x3a1/0x5f0
[ 9776.459866]        vprintk_emit+0x23b/0x2b0
[ 9776.459867]        devkmsg_emit.constprop.0+0xab/0xdc
[ 9776.459868]        devkmsg_write.cold+0x4e/0x78
[ 9776.459869]        do_iter_readv_writev+0x343/0x690
[ 9776.459870]        do_iter_write+0x123/0x340
[ 9776.459871]        vfs_writev+0x19d/0x520
[ 9776.459871]        do_writev+0x110/0x290
[ 9776.459872]        do_syscall_64+0x3b/0x90
[ 9776.459873]        entry_SYSCALL_64_after_hwframe+0x44/0xae
      :
[ 9776.459905] Chain exists of:
[ 9776.459906]   (console_sem).lock --> &p->pi_lock --> &rq->__lock

[ 9776.459911]  Possible unsafe locking scenario:

[ 9776.459913]        CPU0                    CPU1
[ 9776.459914]        ----                    ----
[ 9776.459914]   lock(&rq->__lock);
[ 9776.459917]                                lock(&p->pi_lock);
[ 9776.459919]                                lock(&rq->__lock);
[ 9776.459921]   lock((console_sem).lock);

[ 9776.459923]  *** DEADLOCK ***
[ 9776.459925] 2 locks held by stress-ng/117708:
[ 9776.459925]  #0: ffffffff89403960 (&cpuset_rwsem){++++}-{0:0}, at: __sched_setscheduler+0xe2f/0x2c80
[ 9776.459930]  #1: ffff888e005f6d18 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x27/0x130

[ 9776.459935] stack backtrace:
[ 9776.459936] CPU: 95 PID: 117708 Comm: stress-ng Kdump: loaded Not tainted 5.14.0-0.rc4.35.el9.x86_64+debug #1
[ 9776.459938] Hardware name: FUJITSU PRIMEQUEST 2800E3/D3752, BIOS PRIMEQUEST 2000 Series BIOS Version 01.51 06/29/2020
[ 9776.459939] Call Trace:
[ 9776.459940]  <IRQ>
[ 9776.459940]  dump_stack_lvl+0x57/0x7d
[ 9776.459941]  check_noncircular+0x26a/0x310
[ 9776.459945]  check_prev_add+0x15e/0x20f0
[ 9776.459946]  validate_chain+0xaba/0xde0
[ 9776.459948]  __lock_acquire+0xb72/0x1870
[ 9776.459949]  lock_acquire+0x1ca/0x570
[ 9776.459952]  _raw_spin_lock_irqsave+0x40/0x90
[ 9776.459954]  down_trylock+0x13/0x70
[ 9776.459955]  __down_trylock_console_sem+0x2a/0xb0
[ 9776.459956]  console_trylock_spinning+0x13/0x1f0
[ 9776.459957]  vprintk_emit+0x1e6/0x2b0
[ 9776.459958]  printk+0xb2/0xe3
[ 9776.459960]  __warn_printk+0x9b/0xf3
[ 9776.459964]  update_rq_clock+0x3c2/0x780
[ 9776.459966]  do_sched_rt_period_timer+0x19e/0x9a0
[ 9776.459968]  sched_rt_period_timer+0x6b/0x150
[ 9776.459969]  __run_hrtimer+0x27a/0xb20
[ 9776.459970]  __hrtimer_run_queues+0x159/0x260
[ 9776.459974]  hrtimer_interrupt+0x2cb/0x8f0
[ 9776.459976]  __sysvec_apic_timer_interrupt+0x13e/0x540
[ 9776.459977]  sysvec_apic_timer_interrupt+0x6a/0x90
[ 9776.459977]  </IRQ>

The problematic locking sequence ((console_sem).lock --> &p->pi_lock)
was caused by the fact the semaphore up() function is calling
wake_up_process() while holding the semaphore raw spinlock.

The (&rq->__lock --> (console_sem).lock) locking sequence seems to be
caused by a SCHED_WARN_ON() call in update_rq_clock(). To work around
this problematic locking sequence, we may have to ban all WARN*() calls
when the rq lock is held, which may be too restrictive, or we may have
to add a WARN_DEFERRED() call which can be quite a lot of work.

On the other hand, by moving the wake_up_processs() call out of the
raw spinlock critical section using wake_q, it will break the first
problematic locking sequence as well as reducing raw spinlock hold time.
This is easier and cleaner.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 kernel/locking/semaphore.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9ee381e4d2a4..a26c915430ba 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
 #include <linux/ftrace.h>
@@ -37,7 +38,7 @@ static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
 static noinline int __down_killable(struct semaphore *sem);
 static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
 
 /**
  * down - acquire the semaphore
@@ -182,13 +183,16 @@ EXPORT_SYMBOL(down_timeout);
 void up(struct semaphore *sem)
 {
 	unsigned long flags;
+	DEFINE_WAKE_Q(wake_q);
 
 	raw_spin_lock_irqsave(&sem->lock, flags);
 	if (likely(list_empty(&sem->wait_list)))
 		sem->count++;
 	else
-		__up(sem);
+		__up(sem, &wake_q);
 	raw_spin_unlock_irqrestore(&sem->lock, flags);
+	if (!wake_q_empty(&wake_q))
+		wake_up_q(&wake_q);
 }
 EXPORT_SYMBOL(up);
 
@@ -256,11 +260,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
 	return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
 }
 
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+				  struct wake_q_head *wake_q)
 {
 	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
 						struct semaphore_waiter, list);
 	list_del(&waiter->list);
 	waiter->up = true;
-	wake_up_process(waiter->task);
+	wake_q_add(wake_q, waiter->task);
 }
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section
  2023-09-22 19:47     ` Peter Zijlstra
@ 2023-09-23  0:29       ` Waiman Long
  0 siblings, 0 replies; 10+ messages in thread
From: Waiman Long @ 2023-09-23  0:29 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, Will Deacon, Boqun Feng, linux-kernel


On 9/22/23 15:47, Peter Zijlstra wrote:
> On Fri, Sep 22, 2023 at 02:45:04PM -0400, Waiman Long wrote:
>
>> I believe early_printk should only be used in the non-SMP boot process. The
>> use of printk() is frequently used for debugging purpose and the insertion
>> of printk at some lock critical section can cause the lockdep splat to come
>> out obscuring the debugging process.
> By default early_printk is disabled somewhere early, but it has a keep
> argument to keep it around.
>
> Anyway, printk() as it exists today is wholly unsuited for debugging.
> There are too many contexts where it will flat out not work. When you
> use early_print with keep then you can use early_printk() instead of
> printk() to debug.
>
> Also, see the patches I pointed John at. Perf would not be what it is
> without those patches.
>
> Serial lines and early printk are not optional. That is, I flat out
> refuse to develop on machines without them.

Thanks for the debugging tip.

BTW, it is not just printk() that can be problematic in some contexts. I 
believe the various WARN*() calls cause the same kind of lockdep problem 
even though these WARN() calls shouldn't be triggered that often

Cheers,
Longman


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section
  2023-09-22 18:45   ` Waiman Long
@ 2023-09-22 19:47     ` Peter Zijlstra
  2023-09-23  0:29       ` Waiman Long
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2023-09-22 19:47 UTC (permalink / raw)
  To: Waiman Long; +Cc: Ingo Molnar, Will Deacon, Boqun Feng, linux-kernel

On Fri, Sep 22, 2023 at 02:45:04PM -0400, Waiman Long wrote:

> I believe early_printk should only be used in the non-SMP boot process. The
> use of printk() is frequently used for debugging purpose and the insertion
> of printk at some lock critical section can cause the lockdep splat to come
> out obscuring the debugging process.

By default early_printk is disabled somewhere early, but it has a keep
argument to keep it around.

Anyway, printk() as it exists today is wholly unsuited for debugging.
There are too many contexts where it will flat out not work. When you
use early_print with keep then you can use early_printk() instead of
printk() to debug.

Also, see the patches I pointed John at. Perf would not be what it is
without those patches.

Serial lines and early printk are not optional. That is, I flat out
refuse to develop on machines without them.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section
  2023-09-21  7:42 ` Peter Zijlstra
@ 2023-09-22 18:45   ` Waiman Long
  2023-09-22 19:47     ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Waiman Long @ 2023-09-22 18:45 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, Will Deacon, Boqun Feng, linux-kernel

On 9/21/23 03:42, Peter Zijlstra wrote:
> On Fri, Sep 09, 2022 at 03:28:48PM -0400, Waiman Long wrote:
>> It was found that a circular lock dependency can happen with the
>> following locking sequence:
>>
>>     +--> (console_sem).lock --> &p->pi_lock --> &rq->__lock --+
>>     |                                                         |
>>     +---------------------------------------------------------+
>>
>> The &p->pi_lock --> &rq->__lock sequence is very common in all the
>> task_rq_lock() calls.
>>
>> The &rq->__lock --> (console_sem).lock sequence happens when the
>> scheduler code calling printk() or more likely the various WARN*()
>> macros while holding the rq lock. The (console_sem).lock is actually
>> a raw spinlock guarding the semaphore. In the particular lockdep splat
>> that I saw, it was caused by SCHED_WARN_ON() call in update_rq_clock().
>> To work around this locking sequence, we may have to ban all WARN*()
>> calls when the rq lock is held, which may be too restrictive, or we
>> may have to add a WARN_DEFERRED() call and modify all the call sites
>> to use it.
> No, this is all because printk() is pure garbage -- but I believe it's
> being worked on.
>
> And I despise that whole deferred thing -- that's just worse garbage.
>
> If you map printk to early_printk none of this is a problem (and this is
> what i've been doing for something close to a decade).
>
> Printk should not do synchronous, or in-context, printing to non-atomic
> consoles. Doubly so when atomic console are actually available.
>
> As long as it does this printk is fundamentally unreliable and any of
> these hacks are just that.

Thanks for the explanation.

I believe early_printk should only be used in the non-SMP boot process. 
The use of printk() is frequently used for debugging purpose and the 
insertion of printk at some lock critical section can cause the lockdep 
splat to come out obscuring the debugging process.

>
>> Even then, a deferred printk or WARN function may still call
>> console_trylock() which may, in turn, calls up_console_sem() leading
>> to this locking sequence.
>>
>> The other ((console_sem).lock --> &p->pi_lock) locking sequence
>> was caused by the fact that the semaphore up() function is calling
>> wake_up_process() while holding the semaphore raw spinlock. This lockiing
>> sequence can be easily eliminated by moving the wake_up_processs()
>> call out of the raw spinlock critical section using wake_q which is
>> what this patch implements. That is the easiest and the most certain
>> way to break this circular locking sequence.
> So I don't mind the patch, but I hate everything about your
> justification for it.
>
OK, I can reword the commit log and see you are OK with that.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section
  2023-09-21  0:05 ` John Stultz
  2023-09-21  0:37   ` Waiman Long
@ 2023-09-21  7:45   ` Peter Zijlstra
  1 sibling, 0 replies; 10+ messages in thread
From: Peter Zijlstra @ 2023-09-21  7:45 UTC (permalink / raw)
  To: John Stultz
  Cc: Waiman Long, Ingo Molnar, Will Deacon, Boqun Feng, linux-kernel

On Wed, Sep 20, 2023 at 05:05:56PM -0700, John Stultz wrote:
> On Fri, Sep 9, 2022 at 12:28 PM Waiman Long <longman@redhat.com> wrote:
> >
> > It was found that a circular lock dependency can happen with the
> > following locking sequence:
> >
> >    +--> (console_sem).lock --> &p->pi_lock --> &rq->__lock --+
> >    |                                                         |
> >    +---------------------------------------------------------+
> >
> > The &p->pi_lock --> &rq->__lock sequence is very common in all the
> > task_rq_lock() calls.
> 
> Thanks for sending this out! I've been hitting these lockdep warningns
> a lot recently, particularly if I have any debug printks/WARN_ONs in
> the scheduler that trip, so I'm eager to get a fix for this!

https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=debug/experimental

and use with: earlyprintk=serial,ttyS0,115200 force_early_printk, or
somesuch.

I could not have done perf or much of the sched patches without it.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section
  2022-09-09 19:28 Waiman Long
  2022-09-19 19:49 ` Waiman Long
  2023-09-21  0:05 ` John Stultz
@ 2023-09-21  7:42 ` Peter Zijlstra
  2023-09-22 18:45   ` Waiman Long
  2 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2023-09-21  7:42 UTC (permalink / raw)
  To: Waiman Long; +Cc: Ingo Molnar, Will Deacon, Boqun Feng, linux-kernel

On Fri, Sep 09, 2022 at 03:28:48PM -0400, Waiman Long wrote:
> It was found that a circular lock dependency can happen with the
> following locking sequence:
> 
>    +--> (console_sem).lock --> &p->pi_lock --> &rq->__lock --+
>    |                                                         |
>    +---------------------------------------------------------+
> 
> The &p->pi_lock --> &rq->__lock sequence is very common in all the
> task_rq_lock() calls.
> 
> The &rq->__lock --> (console_sem).lock sequence happens when the
> scheduler code calling printk() or more likely the various WARN*()
> macros while holding the rq lock. The (console_sem).lock is actually
> a raw spinlock guarding the semaphore. In the particular lockdep splat
> that I saw, it was caused by SCHED_WARN_ON() call in update_rq_clock().
> To work around this locking sequence, we may have to ban all WARN*()
> calls when the rq lock is held, which may be too restrictive, or we
> may have to add a WARN_DEFERRED() call and modify all the call sites
> to use it.

No, this is all because printk() is pure garbage -- but I believe it's
being worked on.

And I despise that whole deferred thing -- that's just worse garbage.

If you map printk to early_printk none of this is a problem (and this is
what i've been doing for something close to a decade).

Printk should not do synchronous, or in-context, printing to non-atomic
consoles. Doubly so when atomic console are actually available.

As long as it does this printk is fundamentally unreliable and any of
these hacks are just that.

> Even then, a deferred printk or WARN function may still call
> console_trylock() which may, in turn, calls up_console_sem() leading
> to this locking sequence.
> 
> The other ((console_sem).lock --> &p->pi_lock) locking sequence
> was caused by the fact that the semaphore up() function is calling
> wake_up_process() while holding the semaphore raw spinlock. This lockiing
> sequence can be easily eliminated by moving the wake_up_processs()
> call out of the raw spinlock critical section using wake_q which is
> what this patch implements. That is the easiest and the most certain
> way to break this circular locking sequence.

So I don't mind the patch, but I hate everything about your
justification for it.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section
  2023-09-21  0:05 ` John Stultz
@ 2023-09-21  0:37   ` Waiman Long
  2023-09-21  7:45   ` Peter Zijlstra
  1 sibling, 0 replies; 10+ messages in thread
From: Waiman Long @ 2023-09-21  0:37 UTC (permalink / raw)
  To: John Stultz
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, linux-kernel

On 9/20/23 20:05, John Stultz wrote:
> On Fri, Sep 9, 2022 at 12:28 PM Waiman Long <longman@redhat.com> wrote:
>> It was found that a circular lock dependency can happen with the
>> following locking sequence:
>>
>>     +--> (console_sem).lock --> &p->pi_lock --> &rq->__lock --+
>>     |                                                         |
>>     +---------------------------------------------------------+
>>
>> The &p->pi_lock --> &rq->__lock sequence is very common in all the
>> task_rq_lock() calls.
> Thanks for sending this out! I've been hitting these lockdep warningns
> a lot recently, particularly if I have any debug printks/WARN_ONs in
> the scheduler that trip, so I'm eager to get a fix for this!
>
> That said, running with your patch, I'm seeing bootup hang pretty
> close to where init starts when I've had a fair amount of debug
> printks go off. It's odd because the lockup detectors don't seem to
> fire.
> I'll try to debug further, but wanted to give you a heads up. Let me
> know if you have any suggestions.

Thanks for testing this patch. This was not merged because Peter thought 
the merging of atomic console would probably make this not an issue at 
all. We are close to getting the atomic console merged. Let's see if 
this is really the case.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section
  2022-09-09 19:28 Waiman Long
  2022-09-19 19:49 ` Waiman Long
@ 2023-09-21  0:05 ` John Stultz
  2023-09-21  0:37   ` Waiman Long
  2023-09-21  7:45   ` Peter Zijlstra
  2023-09-21  7:42 ` Peter Zijlstra
  2 siblings, 2 replies; 10+ messages in thread
From: John Stultz @ 2023-09-21  0:05 UTC (permalink / raw)
  To: Waiman Long
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, linux-kernel

On Fri, Sep 9, 2022 at 12:28 PM Waiman Long <longman@redhat.com> wrote:
>
> It was found that a circular lock dependency can happen with the
> following locking sequence:
>
>    +--> (console_sem).lock --> &p->pi_lock --> &rq->__lock --+
>    |                                                         |
>    +---------------------------------------------------------+
>
> The &p->pi_lock --> &rq->__lock sequence is very common in all the
> task_rq_lock() calls.

Thanks for sending this out! I've been hitting these lockdep warningns
a lot recently, particularly if I have any debug printks/WARN_ONs in
the scheduler that trip, so I'm eager to get a fix for this!

That said, running with your patch, I'm seeing bootup hang pretty
close to where init starts when I've had a fair amount of debug
printks go off. It's odd because the lockup detectors don't seem to
fire.
I'll try to debug further, but wanted to give you a heads up. Let me
know if you have any suggestions.

thanks
-john

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section
  2022-09-09 19:28 Waiman Long
@ 2022-09-19 19:49 ` Waiman Long
  2023-09-21  0:05 ` John Stultz
  2023-09-21  7:42 ` Peter Zijlstra
  2 siblings, 0 replies; 10+ messages in thread
From: Waiman Long @ 2022-09-19 19:49 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng; +Cc: linux-kernel

On 9/9/22 15:28, Waiman Long wrote:
> It was found that a circular lock dependency can happen with the
> following locking sequence:
>
>     +--> (console_sem).lock --> &p->pi_lock --> &rq->__lock --+
>     |                                                         |
>     +---------------------------------------------------------+
>
> The &p->pi_lock --> &rq->__lock sequence is very common in all the
> task_rq_lock() calls.
>
> The &rq->__lock --> (console_sem).lock sequence happens when the
> scheduler code calling printk() or more likely the various WARN*()
> macros while holding the rq lock. The (console_sem).lock is actually
> a raw spinlock guarding the semaphore. In the particular lockdep splat
> that I saw, it was caused by SCHED_WARN_ON() call in update_rq_clock().
> To work around this locking sequence, we may have to ban all WARN*()
> calls when the rq lock is held, which may be too restrictive, or we
> may have to add a WARN_DEFERRED() call and modify all the call sites
> to use it.
>
> Even then, a deferred printk or WARN function may still call
> console_trylock() which may, in turn, calls up_console_sem() leading
> to this locking sequence.
>
> The other ((console_sem).lock --> &p->pi_lock) locking sequence
> was caused by the fact that the semaphore up() function is calling
> wake_up_process() while holding the semaphore raw spinlock. This lockiing
> sequence can be easily eliminated by moving the wake_up_processs()
> call out of the raw spinlock critical section using wake_q which is
> what this patch implements. That is the easiest and the most certain
> way to break this circular locking sequence.
>
> v1: https://lore.kernel.org/lkml/20220118153254.358748-1-longman@redhat.com/
>
> Signed-off-by: Waiman Long <longman@redhat.com>

Ping!

Note that the current printk_deferred() code path may also hit this 
problem as an up() call of console_sem may be issued.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section
@ 2022-09-09 19:28 Waiman Long
  2022-09-19 19:49 ` Waiman Long
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Waiman Long @ 2022-09-09 19:28 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng
  Cc: linux-kernel, Waiman Long

It was found that a circular lock dependency can happen with the
following locking sequence:

   +--> (console_sem).lock --> &p->pi_lock --> &rq->__lock --+
   |                                                         |
   +---------------------------------------------------------+

The &p->pi_lock --> &rq->__lock sequence is very common in all the
task_rq_lock() calls.

The &rq->__lock --> (console_sem).lock sequence happens when the
scheduler code calling printk() or more likely the various WARN*()
macros while holding the rq lock. The (console_sem).lock is actually
a raw spinlock guarding the semaphore. In the particular lockdep splat
that I saw, it was caused by SCHED_WARN_ON() call in update_rq_clock().
To work around this locking sequence, we may have to ban all WARN*()
calls when the rq lock is held, which may be too restrictive, or we
may have to add a WARN_DEFERRED() call and modify all the call sites
to use it.

Even then, a deferred printk or WARN function may still call
console_trylock() which may, in turn, calls up_console_sem() leading
to this locking sequence.

The other ((console_sem).lock --> &p->pi_lock) locking sequence
was caused by the fact that the semaphore up() function is calling
wake_up_process() while holding the semaphore raw spinlock. This lockiing
sequence can be easily eliminated by moving the wake_up_processs()
call out of the raw spinlock critical section using wake_q which is
what this patch implements. That is the easiest and the most certain
way to break this circular locking sequence.

v1: https://lore.kernel.org/lkml/20220118153254.358748-1-longman@redhat.com/

Signed-off-by: Waiman Long <longman@redhat.com>
---
 kernel/locking/semaphore.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index f2654d2fe43a..b4b817451dd7 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
 #include <linux/ftrace.h>
@@ -38,7 +39,7 @@ static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
 static noinline int __down_killable(struct semaphore *sem);
 static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
 
 /**
  * down - acquire the semaphore
@@ -183,13 +184,16 @@ EXPORT_SYMBOL(down_timeout);
 void up(struct semaphore *sem)
 {
 	unsigned long flags;
+	DEFINE_WAKE_Q(wake_q);
 
 	raw_spin_lock_irqsave(&sem->lock, flags);
 	if (likely(list_empty(&sem->wait_list)))
 		sem->count++;
 	else
-		__up(sem);
+		__up(sem, &wake_q);
 	raw_spin_unlock_irqrestore(&sem->lock, flags);
+	if (!wake_q_empty(&wake_q))
+		wake_up_q(&wake_q);
 }
 EXPORT_SYMBOL(up);
 
@@ -269,11 +273,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
 	return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
 }
 
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+				  struct wake_q_head *wake_q)
 {
 	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
 						struct semaphore_waiter, list);
 	list_del(&waiter->list);
 	waiter->up = true;
-	wake_up_process(waiter->task);
+	wake_q_add(wake_q, waiter->task);
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2023-09-23  0:31 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-10 18:10 [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section Waiman Long
2022-09-09 19:28 Waiman Long
2022-09-19 19:49 ` Waiman Long
2023-09-21  0:05 ` John Stultz
2023-09-21  0:37   ` Waiman Long
2023-09-21  7:45   ` Peter Zijlstra
2023-09-21  7:42 ` Peter Zijlstra
2023-09-22 18:45   ` Waiman Long
2023-09-22 19:47     ` Peter Zijlstra
2023-09-23  0:29       ` Waiman Long

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).