All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Jones <davej@redhat.com>, Oleg Nesterov <oleg@redhat.com>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Linux Kernel <linux-kernel@vger.kernel.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrey Vagin <avagin@openvz.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	Jens Axboe <axboe@kernel.dk>
Subject: Re: frequent softlockups with 3.10rc6.
Date: Wed, 26 Jun 2013 19:32:45 -0700	[thread overview]
Message-ID: <20130627023245.GI4536@htj.dyndns.org> (raw)
In-Reply-To: <87ppv82wgq.fsf@xmission.com>

Hello,

On Wed, Jun 26, 2013 at 06:06:45PM -0700, Eric W. Biederman wrote:
> Just based on the last trace and your observation that it seems to be
> vfs/block layer related I am going to mildly suggest that Jens and Tejun
> might have a clue.  Tejun made a transformation of the threads used for
> writeback from a custom thread pool to the generic mechanism.  So it
> seems worth asking the question could it have been in Jens block merge
> of 4de13d7aa8f4d02f4dc99d4609575659f92b3c5a.

If workqueue was suffering deadlock or starvation, it'd lock up in a
lot more boring places, unless it's somehow completely messing up blk
softirq completion logic.

Given the wide variety of places, it's likely something lower level.
Maybe something is raising an interrupt and then its softirq raises it
again right away so that it evades the screaming irq detection but
still live-locks the CPU?  It'd definitely be interesting to watch the
interrupt and softirq counters.  Something like the following may
help.

Thanks.

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 05039e3..21d4e82 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -261,6 +261,49 @@ static void watchdog_interrupt_count(void)
 static int watchdog_nmi_enable(unsigned int cpu);
 static void watchdog_nmi_disable(unsigned int cpu);
 
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/kernel_stat.h>
+static void dump_irqs(void)
+{
+	static u64 hirq_sums[NR_IRQS];
+	static u64 sirq_sums[NR_SOFTIRQS];
+	int i, cpu;
+
+	for (i = 0; i < nr_irqs; i++) {
+		struct irq_desc *desc = irq_to_desc(i);
+		u64 sum = 0;
+
+		if (!desc)
+			continue;
+
+		for_each_online_cpu(cpu)
+			sum += kstat_irqs_cpu(i, cpu);
+		if (sum == hirq_sums[i])
+			continue;
+
+		printk("XXX HIRQ %2d %8s : sum_delta=%8llu on_this_cpu=%8u\n",
+		       i, desc->action ? desc->action->name : "",
+		       sum - hirq_sums[i],
+		       kstat_irqs_cpu(i, smp_processor_id()));
+		hirq_sums[i] = sum;
+	}
+
+	for (i = 0; i < NR_SOFTIRQS; i++) {
+		u64 sum = 0;
+
+		for_each_online_cpu(cpu)
+			sum += kstat_softirqs_cpu(i, cpu);
+		if (sum == sirq_sums[i])
+			continue;
+
+		printk("XXX SIRQ %2d %8s : sum_delta=%8llu on_this_cpu=%8u\n",
+		       i, softirq_to_name[i], sum - sirq_sums[i],
+		       kstat_softirqs_cpu(i, smp_processor_id()));
+		sirq_sums[i] = sum;
+	}
+}
+
 /* watchdog kicker functions */
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 {
@@ -301,6 +344,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	 */
 	duration = is_softlockup(touch_ts);
 	if (unlikely(duration)) {
+		static int warned_on_cpu = -1;
+
 		/*
 		 * If a virtual machine is stopped by the host it can look to
 		 * the watchdog like a soft lockup, check to see if the host
@@ -309,10 +354,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		if (kvm_check_and_clear_guest_paused())
 			return HRTIMER_RESTART;
 
-		/* only warn once */
-		if (__this_cpu_read(soft_watchdog_warn) == true)
+		/* let's follow the first cpu */
+		if (warned_on_cpu >= 0 && warned_on_cpu != smp_processor_id())
 			return HRTIMER_RESTART;
 
+		/* only warn once */
+		//if (__this_cpu_read(soft_watchdog_warn) == true)
+		//	return HRTIMER_RESTART;
+
 		printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 			smp_processor_id(), duration,
 			current->comm, task_pid_nr(current));
@@ -323,6 +372,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		else
 			dump_stack();
 
+		dump_irqs();
+
 		if (softlockup_panic)
 			panic("softlockup: hung tasks");
 		__this_cpu_write(soft_watchdog_warn, true);

  reply	other threads:[~2013-06-27  2:32 UTC|newest]

Thread overview: 93+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-06-19 16:45 frequent softlockups with 3.10rc6 Dave Jones
2013-06-19 17:53 ` Dave Jones
2013-06-19 18:13   ` Paul E. McKenney
2013-06-19 18:42     ` Dave Jones
2013-06-20  0:12     ` Dave Jones
2013-06-20 16:16       ` Paul E. McKenney
2013-06-20 16:27         ` Dave Jones
2013-06-21 15:11         ` Dave Jones
2013-06-21 19:59           ` Oleg Nesterov
2013-06-22  1:37             ` Dave Jones
2013-06-22 17:31               ` Oleg Nesterov
2013-06-22 21:59                 ` Dave Jones
2013-06-23  5:00                   ` Andrew Vagin
2013-06-23 14:36                   ` Oleg Nesterov
2013-06-23 15:06                     ` Dave Jones
2013-06-23 16:04                       ` Oleg Nesterov
2013-06-24  0:21                         ` Dave Jones
2013-06-24  2:00                         ` Dave Jones
2013-06-24 14:39                           ` Oleg Nesterov
2013-06-24 14:52                             ` Steven Rostedt
2013-06-24 16:00                               ` Dave Jones
2013-06-24 16:24                                 ` Steven Rostedt
2013-06-24 16:51                                   ` Dave Jones
2013-06-24 17:04                                     ` Steven Rostedt
2013-06-25 16:55                                       ` Dave Jones
2013-06-25 17:21                                         ` Steven Rostedt
2013-06-25 17:23                                           ` Steven Rostedt
2013-06-25 17:26                                           ` Dave Jones
2013-06-25 17:31                                             ` Steven Rostedt
2013-06-25 17:32                                             ` Steven Rostedt
2013-06-25 17:29                                           ` Steven Rostedt
2013-06-25 17:34                                             ` Dave Jones
2013-06-24 16:37                                 ` Oleg Nesterov
2013-06-24 16:49                                   ` Dave Jones
2013-06-24 15:57                         ` Dave Jones
2013-06-24 17:35                           ` Oleg Nesterov
2013-06-24 17:44                             ` Dave Jones
2013-06-24 17:53                             ` Steven Rostedt
2013-06-24 18:00                               ` Dave Jones
2013-06-25 15:35                             ` Dave Jones
2013-06-25 16:23                               ` Steven Rostedt
2013-06-26  5:23                                 ` Dave Jones
2013-06-26 19:52                                   ` Steven Rostedt
2013-06-26 20:00                                     ` Dave Jones
2013-06-27  3:01                                       ` Steven Rostedt
2013-06-26  5:48                                 ` Dave Jones
2013-06-26 19:18                               ` Oleg Nesterov
2013-06-26 19:40                                 ` Dave Jones
2013-06-27  0:22                                 ` Dave Jones
2013-06-27  1:06                                   ` Eric W. Biederman
2013-06-27  2:32                                     ` Tejun Heo [this message]
2013-06-27  7:55                                   ` Dave Chinner
2013-06-27 10:06                                     ` Dave Chinner
2013-06-27 12:52                                       ` Dave Chinner
2013-06-27 15:21                                         ` Dave Jones
2013-06-28  1:13                                           ` Dave Chinner
2013-06-28  3:58                                             ` Dave Chinner
2013-06-28 10:28                                               ` Jan Kara
2013-06-29  3:39                                                 ` Dave Chinner
2013-07-01 12:00                                                   ` Jan Kara
2013-07-02  6:29                                                     ` Dave Chinner
2013-07-02  8:19                                                       ` Jan Kara
2013-07-02 12:38                                                         ` Dave Chinner
2013-07-02 14:05                                                           ` Jan Kara
2013-07-02 16:13                                                             ` Linus Torvalds
2013-07-02 16:57                                                               ` Jan Kara
2013-07-02 17:38                                                                 ` Linus Torvalds
2013-07-03  3:07                                                                   ` Dave Chinner
2013-07-03  3:28                                                                     ` Linus Torvalds
2013-07-03  4:49                                                                       ` Dave Chinner
2013-07-04  7:19                                                                         ` Andrew Morton
2013-06-29 20:13                                               ` Dave Jones
2013-06-29 22:23                                                 ` Linus Torvalds
2013-06-29 23:44                                                   ` Dave Jones
2013-06-30  0:21                                                     ` Steven Rostedt
2013-07-01 12:49                                                     ` Pavel Machek
2013-06-30  0:17                                                   ` Steven Rostedt
2013-06-30  2:05                                                   ` Dave Chinner
2013-06-30  2:34                                                     ` Dave Chinner
2013-06-27 14:30                                     ` Dave Jones
2013-06-28  1:18                                       ` Dave Chinner
2013-06-28  2:54                                         ` Linus Torvalds
2013-06-28  3:54                                           ` Dave Chinner
2013-06-28  5:59                                             ` Linus Torvalds
2013-06-28  7:21                                               ` Dave Chinner
2013-06-28  8:22                                                 ` Linus Torvalds
2013-06-28  8:32                                                   ` Al Viro
2013-06-28  8:22                                               ` Al Viro
2013-06-28  9:49                                               ` Jan Kara
2013-07-01 17:57                                             ` block layer softlockup Dave Jones
2013-07-02  2:07                                               ` Dave Chinner
2013-07-02  6:01                                                 ` Dave Jones
2013-07-02  7:30                                                   ` Dave Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130627023245.GI4536@htj.dyndns.org \
    --to=tj@kernel.org \
    --cc=avagin@openvz.org \
    --cc=axboe@kernel.dk \
    --cc=davej@redhat.com \
    --cc=ebiederm@xmission.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=oleg@redhat.com \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=rostedt@goodmis.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.