Re: WARNING: possible circular locking dependency detected

From: Peter Zijlstra <peterz@infradead.org>
To: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>,
	Byungchul Park <byungchul.park@lge.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	lkml <linux-kernel@vger.kernel.org>
Subject: Re: WARNING: possible circular locking dependency detected
Date: Mon, 28 Aug 2017 09:41:38 +0200	[thread overview]
Message-ID: <20170828074138.6zeceyffasjmaazx@hirez.programming.kicks-ass.net> (raw)
In-Reply-To: <20170825144755.ms2h2j2xe6gznnqi@linutronix.de>

On Fri, Aug 25, 2017 at 04:47:55PM +0200, Sebastian Andrzej Siewior wrote:
> On 2017-08-25 12:03:04 [+0200], Borislav Petkov wrote:
> > ======================================================
> > WARNING: possible circular locking dependency detected
> > 4.13.0-rc6+ #1 Not tainted
> > ------------------------------------------------------
> 
> While looking at this, I stumbled upon another one also enabled by
> "completion annotation" in the TIP:
> 
> | ======================================================
> | WARNING: possible circular locking dependency detected
> | 4.13.0-rc6-00758-gd80d4177391f-dirty #112 Not tainted
> | ------------------------------------------------------
> | cpu-off.sh/426 is trying to acquire lock:
> |  ((complete)&st->done){+.+.}, at: [<ffffffff810cb344>] takedown_cpu+0x84/0xf0
> |
> | but task is already holding lock:
> |  (sparse_irq_lock){+.+.}, at: [<ffffffff811220f2>] irq_lock_sparse+0x12/0x20
> |
> | which lock already depends on the new lock.
> |
> | the existing dependency chain (in reverse order) is:
> |
> | -> #1 (sparse_irq_lock){+.+.}:
> |        __mutex_lock+0x88/0x9a0
> |        mutex_lock_nested+0x16/0x20
> |        irq_lock_sparse+0x12/0x20
> |        irq_affinity_online_cpu+0x13/0xd0
> |        cpuhp_invoke_callback+0x4a/0x130
> |
> | -> #0 ((complete)&st->done){+.+.}:
> |        check_prev_add+0x351/0x700
> |        __lock_acquire+0x114a/0x1220
> |        lock_acquire+0x47/0x70
> |        wait_for_completion+0x5c/0x180
> |        takedown_cpu+0x84/0xf0
> |        cpuhp_invoke_callback+0x4a/0x130
> |        cpuhp_down_callbacks+0x3d/0x80
> …
> |
> | other info that might help us debug this:
> |
> |  Possible unsafe locking scenario:
> |        CPU0                    CPU1
> |        ----                    ----
> |   lock(sparse_irq_lock);
> |                                lock((complete)&st->done);
> |                                lock(sparse_irq_lock);
> |   lock((complete)&st->done);
> |
> |  *** DEADLOCK ***
> 
> We hold the sparse_irq_lock lock while waiting for the completion in the
> CPU-down case and in the CPU-up case we acquire the sparse_irq_lock lock
> while the other CPU is waiting for the completion.
> This is not an issue if my interpretation of lockdep here is correct.
> 
> How do we annotate this?

Does something like so work?

---
 include/linux/completion.h | 15 ++++++++++++---
 kernel/kthread.c           | 14 +++++++++++++-
 kernel/sched/completion.c  | 18 +++++++++++++-----
 3 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/include/linux/completion.h b/include/linux/completion.h
index 791f053f28b7..0eccd2d44c85 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -34,9 +34,9 @@ struct completion {
 };
 
 #ifdef CONFIG_LOCKDEP_COMPLETIONS
-static inline void complete_acquire(struct completion *x)
+static inline void complete_acquire(struct completion *x, int subclass)
 {
-	lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_);
+	lock_acquire_exclusive((struct lockdep_map *)&x->map, subclass, 0, NULL, _RET_IP_);
 }
 
 static inline void complete_release(struct completion *x)
@@ -59,7 +59,7 @@ do {									\
 } while (0)
 #else
 #define init_completion(x) __init_completion(x)
-static inline void complete_acquire(struct completion *x) {}
+static inline void complete_acquire(struct completion *x, int subclass) {}
 static inline void complete_release(struct completion *x) {}
 static inline void complete_release_commit(struct completion *x) {}
 #endif
@@ -132,6 +132,15 @@ static inline void reinit_completion(struct completion *x)
 }
 
 extern void wait_for_completion(struct completion *);
+
+#ifndef CONFIG_LOCKDEP
+static inline void
+wait_for_completion_nested(struct completion *x, int subclass)
+{
+	wait_for_completion(x);
+}
+#endif
+
 extern void wait_for_completion_io(struct completion *);
 extern int wait_for_completion_interruptible(struct completion *x);
 extern int wait_for_completion_killable(struct completion *x);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 26db528c1d88..6092702dd908 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -485,7 +485,19 @@ int kthread_park(struct task_struct *k)
 		set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
 		if (k != current) {
 			wake_up_process(k);
-			wait_for_completion(&kthread->parked);
+			/*
+			 * CPU-UP		CPU-DOWN
+			 *
+			 * cpu_hotplug_lock
+			 * wait_for_completion()
+			 * 			cpu_hotplug_lock
+			 * 			complete()
+			 *
+			 * Which normally spells deadlock, except of course
+			 * that up and down are globally serialized so the
+			 * above cannot in fact happen concurrently.
+			 */
+			wait_for_completion_nested(&kthread->parked, 1);
 		}
 	}
 
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index cc873075c3bd..18ca9b7ef677 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -101,11 +101,11 @@ do_wait_for_common(struct completion *x,
 
 static inline long __sched
 __wait_for_common(struct completion *x,
-		  long (*action)(long), long timeout, int state)
+		  long (*action)(long), long timeout, int state, int subclass)
 {
 	might_sleep();
 
-	complete_acquire(x);
+	complete_acquire(x, subclass);
 
 	spin_lock_irq(&x->wait.lock);
 	timeout = do_wait_for_common(x, action, timeout, state);
@@ -117,9 +117,9 @@ __wait_for_common(struct completion *x,
 }
 
 static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
+wait_for_common(struct completion *x, long timeout, int state, int subclass)
 {
-	return __wait_for_common(x, schedule_timeout, timeout, state);
+	return __wait_for_common(x, schedule_timeout, timeout, state, subclass);
 }
 
 static long __sched
@@ -140,10 +140,18 @@ wait_for_common_io(struct completion *x, long timeout, int state)
  */
 void __sched wait_for_completion(struct completion *x)
 {
-	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE, 0);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+#ifdef CONFIG_LOCKDEP
+void __sched wait_for_completion_nested(struct completion *x, int subclass)
+{
+	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE, subclass);
+}
+EXPORT_SYMBOL(wait_for_completion);
+#endif
+
 /**
  * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
  * @x:  holds the state of this particular completion