[PATCH 2/2] workqueue: implement lockup detector

From: Tejun Heo <tj@kernel.org>
To: Ulrich Obergfell <uobergfe@redhat.com>,
	Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org, kernel-team@fb.com
Subject: [PATCH 2/2] workqueue: implement lockup detector
Date: Wed, 2 Dec 2015 19:28:39 -0500	[thread overview]
Message-ID: <20151203002839.GK19878@mtj.duckdns.org> (raw)
In-Reply-To: <20151203002810.GJ19878@mtj.duckdns.org>

Workqueue stalls can happen from a variety of usage bugs such as
missing WQ_MEM_RECLAIM flag or concurrency managed work item
indefinitely staying RUNNING.  These stalls can be extremely difficult
to hunt down because the usual warning mechanisms can't detect
workqueue stalls and the internal state is pretty opaque.

To alleviate the situation, this patch implements workqueue lockup
detector.  It periodically monitors all worker_pools periodically and,
if any pool failed to make forward progress longer than the threshold
duration, triggers warning and dumps workqueue state as follows.

 BUG: workqueue lockup - pool cpus=0 node=0 flags=0x0 nice=0 stuck for 31s!
 Showing busy workqueues and worker pools:
 workqueue events: flags=0x0
   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=17/256
     pending: monkey_wrench_fn, e1000_watchdog, cache_reap, vmstat_shepherd, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, release_one_tty, cgroup_release_agent
 workqueue events_power_efficient: flags=0x80
   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256
     pending: check_lifetime, neigh_periodic_work
 workqueue cgroup_pidlist_destroy: flags=0x0
   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1
     pending: cgroup_pidlist_destroy_work_fn
 ...

The detection mechanism is enabled/disabled together with scheduler
softlockup watchdog and uses the same threshold value; however, it
currently doesn't trigger panic.  We can do that later once the
detection mechanism proves to be reliable.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Chris Mason <clm@fb.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched.h     |    1 
 include/linux/workqueue.h |    6 +
 kernel/watchdog.c         |   15 +++-
 kernel/workqueue.c        |  146 +++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 162 insertions(+), 6 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -381,6 +381,7 @@ extern void touch_softlockup_watchdog_sc
 extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
+extern int get_softlockup_thresh(void);
 extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
 				  void __user *buffer,
 				  size_t *lenp, loff_t *ppos);
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -618,4 +618,10 @@ static inline int workqueue_sysfs_regist
 { return 0; }
 #endif	/* CONFIG_SYSFS */
 
+#ifdef CONFIG_LOCKUP_DETECTOR
+void enable_workqueue_watchdog(void);
+void disable_workqueue_watchdog(void);
+void touch_workqueue_watchdog(int cpu);
+#endif	/* CONFIG_LOCKUP_DETECTOR */
+
 #endif
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -20,6 +20,7 @@
 #include <linux/smpboot.h>
 #include <linux/sched/rt.h>
 #include <linux/tick.h>
+#include <linux/workqueue.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
@@ -192,7 +193,7 @@ __setup("hardlockup_all_cpu_backtrace=",
  * the thresholds with a factor: we make the soft threshold twice the amount of
  * time the hard threshold is.
  */
-static int get_softlockup_thresh(void)
+int get_softlockup_thresh(void)
 {
 	return watchdog_thresh * 2;
 }
@@ -245,6 +246,7 @@ void touch_softlockup_watchdog_sched(voi
 void touch_softlockup_watchdog(void)
 {
 	touch_softlockup_watchdog_sched();
+	touch_workqueue_watchdog(raw_smp_processor_id());
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
@@ -259,6 +261,7 @@ void touch_all_softlockup_watchdogs(void
 	 */
 	for_each_watchdog_cpu(cpu)
 		per_cpu(watchdog_touch_ts, cpu) = 0;
+	touch_workqueue_watchdog(-1);
 }
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -795,13 +798,18 @@ static int watchdog_enable_all_cpus(void
 {
 	int err = 0;
 
+	disable_workqueue_watchdog();
+
 	if (!watchdog_running) {
 		err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
 							     &watchdog_cpumask);
-		if (err)
+		if (err) {
 			pr_err("Failed to create watchdog threads, disabled\n");
-		else
+		} else {
+			if (watchdog_enabled & SOFT_WATCHDOG_ENABLED)
+				enable_workqueue_watchdog();
 			watchdog_running = 1;
+		}
 	} else {
 		/*
 		 * Enable/disable the lockup detectors or
@@ -826,6 +834,7 @@ static void watchdog_disable_all_cpus(vo
 	if (watchdog_running) {
 		watchdog_running = 0;
 		smpboot_unregister_percpu_thread(&watchdog_threads);
+		disable_workqueue_watchdog();
 	}
 }
 
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -148,6 +148,8 @@ struct worker_pool {
 	int			id;		/* I: pool ID */
 	unsigned int		flags;		/* X: flags */
 
+	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
+
 	struct list_head	worklist;	/* L: list of pending works */
 	int			nr_workers;	/* L: total number of workers */
 
@@ -1083,6 +1085,8 @@ static void pwq_activate_delayed_work(st
 	struct pool_workqueue *pwq = get_work_pwq(work);
 
 	trace_workqueue_activate_work(work);
+	if (list_empty(&pwq->pool->worklist))
+		pwq->pool->watchdog_ts = jiffies;
 	move_linked_works(work, &pwq->pool->worklist, NULL);
 	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
 	pwq->nr_active++;
@@ -1385,6 +1389,8 @@ retry:
 		trace_workqueue_activate_work(work);
 		pwq->nr_active++;
 		worklist = &pwq->pool->worklist;
+		if (list_empty(worklist))
+			pwq->pool->watchdog_ts = jiffies;
 	} else {
 		work_flags |= WORK_STRUCT_DELAYED;
 		worklist = &pwq->delayed_works;
@@ -2157,6 +2163,8 @@ recheck:
 			list_first_entry(&pool->worklist,
 					 struct work_struct, entry);
 
+		pool->watchdog_ts = jiffies;
+
 		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
 			/* optimization path, not strictly necessary */
 			process_one_work(worker, work);
@@ -2240,6 +2248,7 @@ repeat:
 					struct pool_workqueue, mayday_node);
 		struct worker_pool *pool = pwq->pool;
 		struct work_struct *work, *n;
+		bool first = true;
 
 		__set_current_state(TASK_RUNNING);
 		list_del_init(&pwq->mayday_node);
@@ -2256,9 +2265,14 @@ repeat:
 		 * process'em.
 		 */
 		WARN_ON_ONCE(!list_empty(scheduled));
-		list_for_each_entry_safe(work, n, &pool->worklist, entry)
-			if (get_work_pwq(work) == pwq)
+		list_for_each_entry_safe(work, n, &pool->worklist, entry) {
+			if (get_work_pwq(work) == pwq) {
+				if (first)
+					pool->watchdog_ts = jiffies;
 				move_linked_works(work, scheduled, &n);
+			}
+			first = false;
+		}
 
 		if (!list_empty(scheduled)) {
 			process_scheduled_works(rescuer);
@@ -3069,6 +3083,7 @@ static int init_worker_pool(struct worke
 	pool->cpu = -1;
 	pool->node = NUMA_NO_NODE;
 	pool->flags |= POOL_DISASSOCIATED;
+	pool->watchdog_ts = jiffies;
 	INIT_LIST_HEAD(&pool->worklist);
 	INIT_LIST_HEAD(&pool->idle_list);
 	hash_init(pool->busy_hash);
@@ -4308,7 +4323,9 @@ void show_workqueue_state(void)
 
 		pr_info("pool %d:", pool->id);
 		pr_cont_pool_info(pool);
-		pr_cont(" workers=%d", pool->nr_workers);
+		pr_cont(" hung=%us workers=%d",
+			jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
+			pool->nr_workers);
 		if (pool->manager)
 			pr_cont(" manager: %d",
 				task_pid_nr(pool->manager->task));
@@ -5167,6 +5184,129 @@ static void workqueue_sysfs_unregister(s
 static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
 #endif	/* CONFIG_SYSFS */
 
+/*
+ * Workqueue watchdog.
+ *
+ * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
+ * flush dependency, a concurrency managed work item which stays RUNNING
+ * indefinitely.  Workqueue stalls can be very difficult to debug as the
+ * usual warning mechanisms don't trigger and internal workqueue state is
+ * largely opaque.
+ *
+ * Workqueue watchdog monitors all worker pools periodically and dumps
+ * state if some pools failed to make forward progress for a while where
+ * forward progress is defined as the first item on ->worklist changing.
+ *
+ * The mechanism is enabled and disabled together with softlockup watchdog
+ * and uses the same threshold duration; however, it currently doesn't
+ * cause panic even if softlockup_panic is set.  Also, workqueue watchdog
+ * assumes that the usual jiffies and timer mechanisms are working as there
+ * isn't much point in warning about workqueue stalls when timer is broken.
+ */
+#ifdef CONFIG_LOCKUP_DETECTOR
+
+static void wq_watchdog_timer_fn(unsigned long data);
+
+static unsigned long wq_watchdog_thresh;
+static struct timer_list wq_watchdog_timer =
+	TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+
+static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+
+static void wq_watchdog_reset_touched(void)
+{
+	int cpu;
+
+	wq_watchdog_touched = jiffies;
+	for_each_possible_cpu(cpu)
+		per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+}
+
+static void wq_watchdog_timer_fn(unsigned long data)
+{
+	unsigned long thresh = wq_watchdog_thresh;
+	bool lockup_detected = false;
+	struct worker_pool *pool;
+	int pi;
+
+	if (!thresh)
+		return;
+
+	rcu_read_lock();
+
+	for_each_pool(pool, pi) {
+		unsigned long pool_ts, touched, ts;
+
+		if (list_empty(&pool->worklist))
+			continue;
+
+		/* get the latest of pool and touched timestamps */
+		pool_ts = READ_ONCE(pool->watchdog_ts);
+		touched = READ_ONCE(wq_watchdog_touched);
+
+		if (time_after(pool_ts, touched))
+			ts = pool_ts;
+		else
+			ts = touched;
+
+		if (pool->cpu >= 0) {
+			unsigned long cpu_touched =
+				READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
+						  pool->cpu));
+			if (time_after(cpu_touched, ts))
+				ts = cpu_touched;
+		}
+
+		/* did we stall? */
+		if (time_after(jiffies, ts + thresh)) {
+			lockup_detected = true;
+			pr_emerg("BUG: workqueue lockup - pool");
+			pr_cont_pool_info(pool);
+			pr_cont(" stuck for %us!\n",
+				jiffies_to_msecs(jiffies - pool_ts) / 1000);
+		}
+	}
+
+	rcu_read_unlock();
+
+	if (lockup_detected)
+		show_workqueue_state();
+
+	wq_watchdog_reset_touched();
+	mod_timer(&wq_watchdog_timer, jiffies + thresh);
+}
+
+void enable_workqueue_watchdog(void)
+{
+	wq_watchdog_thresh = get_softlockup_thresh() * HZ;
+
+	wq_watchdog_reset_touched();
+	mod_timer(&wq_watchdog_timer, jiffies + wq_watchdog_thresh);
+}
+
+void disable_workqueue_watchdog(void)
+{
+	wq_watchdog_thresh = 0;
+	del_timer_sync(&wq_watchdog_timer);
+}
+
+void touch_workqueue_watchdog(int cpu)
+{
+	/*
+	 * If not explicitly touched, these stamps are never updated, which
+	 * means that they may affect stall detection if jiffies wraps;
+	 * however, it's highly unlikely and the worst that can happen is
+	 * delaying stall detection by upto one threshold duration.
+	 */
+	if (cpu >= 0)
+		per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+	else
+		wq_watchdog_touched = jiffies;
+}
+
+#endif	/* CONFIG_LOCKUP_DETECTOR */
+
 static void __init wq_numa_init(void)
 {
 	cpumask_var_t *tbl;