* watchdog: print stolen time increment at softlockup detection
From: Marcelo Tosatti @ 2013-06-28  2:57 UTC
  To: kvm-devel, linux-kernel
  Cc: Karen Noel, Rik van Riel, Don Zickus, Prarit Bhargava, Thomas Gleixner


One possibility for a softlockup report in a Linux VM is that the host
system is overcommitted to the point where the watchdog task is unable
to make progress (unable to touch the watchdog).

Maintain the increment in stolen time over the softlockup detection
period (20 seconds by default), and report this increment in the
softlockup message.

Overcommitment is then indicated by a large stolen time increment,
accounting for more than, or for a significant percentage of, the
softlockup threshold.
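
As an illustration: with the default watchdog_thresh of 10 the
detection window is 20 seconds, so a reported stolen time of 15000ms
would mean roughly 75% of that window was stolen by the host, which
points at overcommitment rather than a guest-side bug.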

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 05039e3..ed09d58 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -34,6 +34,8 @@ int __read_mostly watchdog_thresh = 10;
 static int __read_mostly watchdog_disabled;
 static u64 __read_mostly sample_period;
 
+#define SOFT_INTRS_PER_PERIOD 5
+
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
@@ -127,9 +129,51 @@ static void set_sample_period(void)
 	 * and hard thresholds) to increment before the
 	 * hardlockup detector generates a warning
 	 */
-	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+	sample_period = get_softlockup_thresh() *
+			((u64)NSEC_PER_SEC / SOFT_INTRS_PER_PERIOD);
 }
 
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+struct steal_clock_record {
+	u64 prev_stolen_time;
+	u64 stolen_time[SOFT_INTRS_PER_PERIOD];
+	int idx;
+};
+
+static DEFINE_PER_CPU(struct steal_clock_record, steal_record);
+static void record_steal_time(void)
+{
+	struct steal_clock_record *r;
+	int cpu = smp_processor_id();
+	u64 steal_time;
+	r = &per_cpu(steal_record, cpu);
+
+	steal_time = paravirt_steal_clock(cpu);
+	r->stolen_time[r->idx] = steal_time - r->prev_stolen_time;
+	r->idx++;
+	if (r->idx == SOFT_INTRS_PER_PERIOD)
+		r->idx = 0;
+	r->prev_stolen_time = steal_time;
+}
+
+static unsigned int get_accumulated_steal(int cpu)
+{
+	int idx;
+	u64 t = 0;
+	struct steal_clock_record *r = &per_cpu(steal_record, cpu);
+
+	for (idx = 0; idx < SOFT_INTRS_PER_PERIOD; idx++)
+		t += r->stolen_time[idx];
+
+	do_div(t, 1000000);
+
+	return t;
+}
+
+#else
+static void record_steal_time(void) { return; }
+#endif
+
 /* Commands for resetting the watchdog */
 static void __touch_watchdog(void)
 {
@@ -271,6 +315,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	/* kick the hardlockup detector */
 	watchdog_interrupt_count();
 
+	/* record steal time */
+	record_steal_time();
+
 	/* kick the softlockup detector */
 	wake_up_process(__this_cpu_read(softlockup_watchdog));
 
@@ -316,6 +363,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 			smp_processor_id(), duration,
 			current->comm, task_pid_nr(current));
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+		printk(KERN_EMERG "soft lockup stolen time = %ums\n",
+			get_accumulated_steal(smp_processor_id()));
+#endif
 		print_modules();
 		print_irqtrace_events(current);
 		if (regs)


* Re: watchdog: print stolen time increment at softlockup detection
From: Paolo Bonzini @ 2013-06-28  8:34 UTC
  To: Marcelo Tosatti
  Cc: kvm-devel, linux-kernel, Karen Noel, Rik van Riel, Don Zickus,
	Prarit Bhargava, Thomas Gleixner

On 28/06/2013 04:57, Marcelo Tosatti wrote:
> 
> One possibility for a softlockup report in a Linux VM, is that the host
> system is overcommitted to the point where the watchdog task is unable
> to make progress (unable to touch the watchdog).
> 
> Maintain the increment in stolen time for the period of 
> softlockup threshold detection (20 seconds by the default), 
> and report this increment in the softlockup message.
> 
> Overcommitment is then indicated by a large stolen time increment,
> accounting for more than, or for a significant percentage of the
> softlockup threshold.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 05039e3..ed09d58 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -34,6 +34,8 @@ int __read_mostly watchdog_thresh = 10;
>  static int __read_mostly watchdog_disabled;
>  static u64 __read_mostly sample_period;
>  
> +#define SOFT_INTRS_PER_PERIOD 5
> +
>  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
>  static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
>  static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
> @@ -127,9 +129,51 @@ static void set_sample_period(void)
>  	 * and hard thresholds) to increment before the
>  	 * hardlockup detector generates a warning
>  	 */
> -	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
> +	sample_period = get_softlockup_thresh() *
> +			((u64)NSEC_PER_SEC / SOFT_INTRS_PER_PERIOD);
>  }
>  
> +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> +struct steal_clock_record {
> +	u64 prev_stolen_time;
> +	u64 stolen_time[SOFT_INTRS_PER_PERIOD];
> +	int idx;
> +};
> +
> +static DEFINE_PER_CPU(struct steal_clock_record, steal_record);
> +static void record_steal_time(void)
> +{
> +	struct steal_clock_record *r;
> +	int cpu = smp_processor_id();
> +	u64 steal_time;
> +	r = &per_cpu(steal_record, cpu);
> +
> +	steal_time = paravirt_steal_clock(cpu);
> +	r->stolen_time[r->idx] = steal_time - r->prev_stolen_time;
> +	r->idx++;
> +	if (r->idx == SOFT_INTRS_PER_PERIOD)
> +		r->idx = 0;
> +	r->prev_stolen_time = steal_time;
> +}
> +
> +static unsigned int get_accumulated_steal(int cpu)
> +{
> +	int idx;
> +	u64 t = 0;
> +	struct steal_clock_record *r = &per_cpu(steal_record, cpu);
> +
> +	for (idx = 0; idx < SOFT_INTRS_PER_PERIOD; idx++)
> +		t += r->stolen_time[idx];
> +
> +	do_div(t, 1000000);
> +
> +	return t;
> +}
> +
> +#else
> +static void record_steal_time(void) { return; }
> +#endif
> +
>  /* Commands for resetting the watchdog */
>  static void __touch_watchdog(void)
>  {
> @@ -271,6 +315,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
>  	/* kick the hardlockup detector */
>  	watchdog_interrupt_count();
>  
> +	/* record steal time */
> +	record_steal_time();
> +
>  	/* kick the softlockup detector */
>  	wake_up_process(__this_cpu_read(softlockup_watchdog));
>  
> @@ -316,6 +363,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
>  		printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
>  			smp_processor_id(), duration,
>  			current->comm, task_pid_nr(current));
> +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> +		printk(KERN_EMERG "soft lockup stolen time = %ums\n",
> +			get_accumulated_steal(smp_processor_id()));

Perhaps print this only if it is nonzero (so that nothing is printed on
bare metal)?  Then you can remove the #ifdef too; it will be optimized
away by the compiler for !CONFIG_PARAVIRT_TIME_ACCOUNTING.
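
A rough sketch of that suggestion (this assumes a
!CONFIG_PARAVIRT_TIME_ACCOUNTING stub of get_accumulated_steal() that
simply returns 0 is added next to the existing record_steal_time()
stub):

	unsigned int steal_ms = get_accumulated_steal(smp_processor_id());

	if (steal_ms)	/* stays 0 on bare metal, so nothing extra is printed there */
		printk(KERN_EMERG "soft lockup stolen time = %ums\n",
			steal_ms);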

Paolo

> +#endif
>  		print_modules();
>  		print_irqtrace_events(current);
>  		if (regs)
> 



* Re: watchdog: print stolen time increment at softlockup detection
From: Don Zickus @ 2013-06-28 14:12 UTC
  To: Marcelo Tosatti
  Cc: kvm-devel, linux-kernel, Karen Noel, Rik van Riel,
	Prarit Bhargava, Thomas Gleixner

On Thu, Jun 27, 2013 at 11:57:23PM -0300, Marcelo Tosatti wrote:
> 
> One possibility for a softlockup report in a Linux VM, is that the host
> system is overcommitted to the point where the watchdog task is unable
> to make progress (unable to touch the watchdog).

I think I am confused about the VM/host stuff.  How does an overcommitted
host prevent a high-priority task like the watchdog from running?

Or is it the watchdog task on the VM that is being blocked from running
because the host is overcommitted and can't run the VM frequently enough?

The latter would make sense, though I thought you solved that with the
other kvm splat in the watchdog code a while ago.  So I would be
interested in understanding why the previous solution isn't working.

Second, I am still curious how this problem differs from, say, kgdb or
suspend-hibernate/resume.  Don't both of those scenarios deal with a
clock that suddenly jumps forward without the watchdog task running?



For some reason I had the impression that when a VM starts running again,
one of the first things it does is sync up its clock again (which leads to
a softlockup shortly thereafter in the case of paused/overcommitted VMs)?
At that time I would have thought that the code could detect a large jump
in time and call touch_softlockup_watchdog_sync() or something to delay the
check until the next cycle.

That would make the watchdog code a lot less messy than having custom
kvm/paravirt splat all over it.  Generic solutions are always nice. :-)

Or feel free to correct any of my above assumptions as I am kinda ignorant
on how all this time code works for VMs.

Cheers,
Don

> 
> Maintain the increment in stolen time for the period of 
> softlockup threshold detection (20 seconds by the default), 
> and report this increment in the softlockup message.
> 
> Overcommitment is then indicated by a large stolen time increment,
> accounting for more than, or for a significant percentage of the
> softlockup threshold.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 05039e3..ed09d58 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -34,6 +34,8 @@ int __read_mostly watchdog_thresh = 10;
>  static int __read_mostly watchdog_disabled;
>  static u64 __read_mostly sample_period;
>  
> +#define SOFT_INTRS_PER_PERIOD 5
> +
>  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
>  static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
>  static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
> @@ -127,9 +129,51 @@ static void set_sample_period(void)
>  	 * and hard thresholds) to increment before the
>  	 * hardlockup detector generates a warning
>  	 */
> -	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
> +	sample_period = get_softlockup_thresh() *
> +			((u64)NSEC_PER_SEC / SOFT_INTRS_PER_PERIOD);
>  }
>  
> +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> +struct steal_clock_record {
> +	u64 prev_stolen_time;
> +	u64 stolen_time[SOFT_INTRS_PER_PERIOD];
> +	int idx;
> +};
> +
> +static DEFINE_PER_CPU(struct steal_clock_record, steal_record);
> +static void record_steal_time(void)
> +{
> +	struct steal_clock_record *r;
> +	int cpu = smp_processor_id();
> +	u64 steal_time;
> +	r = &per_cpu(steal_record, cpu);
> +
> +	steal_time = paravirt_steal_clock(cpu);
> +	r->stolen_time[r->idx] = steal_time - r->prev_stolen_time;
> +	r->idx++;
> +	if (r->idx == SOFT_INTRS_PER_PERIOD)
> +		r->idx = 0;
> +	r->prev_stolen_time = steal_time;
> +}
> +
> +static unsigned int get_accumulated_steal(int cpu)
> +{
> +	int idx;
> +	u64 t = 0;
> +	struct steal_clock_record *r = &per_cpu(steal_record, cpu);
> +
> +	for (idx = 0; idx < SOFT_INTRS_PER_PERIOD; idx++)
> +		t += r->stolen_time[idx];
> +
> +	do_div(t, 1000000);
> +
> +	return t;
> +}
> +
> +#else
> +static void record_steal_time(void) { return; }
> +#endif
> +
>  /* Commands for resetting the watchdog */
>  static void __touch_watchdog(void)
>  {
> @@ -271,6 +315,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
>  	/* kick the hardlockup detector */
>  	watchdog_interrupt_count();
>  
> +	/* record steal time */
> +	record_steal_time();
> +
>  	/* kick the softlockup detector */
>  	wake_up_process(__this_cpu_read(softlockup_watchdog));
>  
> @@ -316,6 +363,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
>  		printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
>  			smp_processor_id(), duration,
>  			current->comm, task_pid_nr(current));
> +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> +		printk(KERN_EMERG "soft lockup stolen time = %ums\n",
> +			get_accumulated_steal(smp_processor_id()));
> +#endif
>  		print_modules();
>  		print_irqtrace_events(current);
>  		if (regs)


* Re: watchdog: print stolen time increment at softlockup detection
From: Marcelo Tosatti @ 2013-06-28 20:37 UTC
  To: Don Zickus
  Cc: kvm-devel, linux-kernel, Karen Noel, Rik van Riel,
	Prarit Bhargava, Thomas Gleixner

On Fri, Jun 28, 2013 at 10:12:15AM -0400, Don Zickus wrote:
> On Thu, Jun 27, 2013 at 11:57:23PM -0300, Marcelo Tosatti wrote:
> > 
> > One possibility for a softlockup report in a Linux VM, is that the host
> > system is overcommitted to the point where the watchdog task is unable
> > to make progress (unable to touch the watchdog).
> 
> I think I am confused on the VM/host stuff.  How does an overcommitted
> host prevent a high priority task like the watchdog from running?
> 
> Or is it the watchdog task on the VM that is being blocked from running
> because the host is overcommitted and can't run the VM frequent enough?

Yes, that's the case.

> The latter would make sense, though I thought you solved that with the
> other kvm splat in the watchdog code a while ago.  So I would be
> interested in understanding why the previous solution isn't working.

That functionality is for a notification so that the guest ignores the time
jump induced by a vm pause. This problem is similar to the kgdb case.

> Second, I am still curious how this problem differs from say kgdb or
> suspend-hibernate/resume.  Doesn't both of those scenarios deal with a
> clock that suddenly jumps forward without the watchdog task running?

The difference is this:

The present functionality in watchdog.c allows the hypervisor to notify
the guest that it should ignore the large delta seen via clock reads
(at the watchdog timer interrupt).
This notification is used for the case where the vm has been paused for
a period of time.

Are you suggesting the host should silence the guest watchdog, also in
the overcommitment case? Issues I see with that:

1) The host is not aware of the variable softlockup threshold in
the guest.

2) Whatever threshold of overcommitment is chosen for sending the
ignore-softlockup notification to the guest, genuine softlockup
detections in the guest could be silenced, given the right conditions.

And why is overcommitment not a valid reason to generate a softlockup in
the first place?

> For some reason I had the impression that when a VM starts running again,
> one of the first things it does it sync up its clock again (which leads to
> a softlockup shortly thereafter in the case of paused/overcommitted VMs)?

Sort of: kvmclock counts while the VM is running (whether it is
overcommitted or not).

> At that time I would have thought that the code could detect a large jump
> in time and touch_softlockup_watchdog_sync() or something to delay the
> check until the next cycle.

But this would silence any softlockups that are due to delays in the
host preventing the watchdog task from making progress (e.g.
https://lkml.org/lkml/2013/6/20/633; in that case, if one operation took
longer than expected, your suggestion would silence the report).

> That would make the watchdog code alot less messier than having custom
> kvm/paravirt splat all over it.  Generic solutions are always nice. :-)

Can you give more detail on what the suggestion is and how you can deal
with points 1 and 2 above?

> Or feel free to correct any of my above assumptions as I am kinda ignorant
> on how all this time code works for VMs.

I am afraid that silencing the softlockup watchdog can inhibit genuine 
softlockup cases from being reported.



* Re: watchdog: print stolen time increment at softlockup detection
From: Don Zickus @ 2013-07-03 16:44 UTC
  To: Marcelo Tosatti
  Cc: kvm-devel, linux-kernel, Karen Noel, Rik van Riel,
	Prarit Bhargava, Thomas Gleixner

On Fri, Jun 28, 2013 at 05:37:39PM -0300, Marcelo Tosatti wrote:
> On Fri, Jun 28, 2013 at 10:12:15AM -0400, Don Zickus wrote:
> > On Thu, Jun 27, 2013 at 11:57:23PM -0300, Marcelo Tosatti wrote:
> > > 
> > > One possibility for a softlockup report in a Linux VM, is that the host
> > > system is overcommitted to the point where the watchdog task is unable
> > > to make progress (unable to touch the watchdog).
> > 
> > I think I am confused on the VM/host stuff.  How does an overcommitted
> > host prevent a high priority task like the watchdog from running?
> > 
> > Or is it the watchdog task on the VM that is being blocked from running
> > because the host is overcommitted and can't run the VM frequent enough?
> 
> Yes, thats the case.
> 
> > The latter would make sense, though I thought you solved that with the
> > other kvm splat in the watchdog code a while ago.  So I would be
> > interested in understanding why the previous solution isn't working.
> 
> That functionality is for a notification so the guest ignores the time
> jump induced by a vm pause. This problem is similar to the kgdb case.
> 
> > Second, I am still curious how this problem differs from say kgdb or
> > suspend-hibernate/resume.  Doesn't both of those scenarios deal with a
> > clock that suddenly jumps forward without the watchdog task running?
> 
> The difference is this:
> 
> The present functionality in watchdog.c allows the hypervisor to notify
> the guest that it should ignore the large delta seen via clock reads
> (at the watchdog timer interrupt).
> This notification is used for the case where the vm has been paused for
> a period of time.

But why do this at the watchdog timer interrupt?  I thought this would be
done at a lower layer, like in sched_clock() or something.

> 
> Are you suggesting the host should silence the guest watchdog, also in
> the overcommitment case? Issues i see with that:
> 
> 1) The host is not aware of the variable softlockup threshold in
> the guest.
> 
> 2) Whatever the threshold of overcommitment for sending the ignore
> softlockup notification to the guest, genuine softlockup detections in
> the guest could be silenced, given proper conditioning.

No.  That would be difficult, as you described.  What I am trying to get at
is, doesn't the guest /know/ time jumped when it schedules again?  And
can't it determine, based on this jump, that something unreasonable
happened, like a long pause or an overcommit?

> 
> And why overcommitment is not a valid reason to generate a softlockup in
> the first place ?

For the guest I don't believe it is.  It isn't the guest's fault it
couldn't run processes.  A warning should be generated on the host that it
couldn't run a process for a very long time.

> 
> > For some reason I had the impression that when a VM starts running again,
> > one of the first things it does it sync up its clock again (which leads to
> > a softlockup shortly thereafter in the case of paused/overcommitted VMs)?
> 
> Sort of, the kvmclock counts while the VM is running (whether is
> overcommitted or not).

Does comparing the kvmclock with the current clock indicate that a long
pause or an overcommit occurred?

> 
> > At that time I would have thought that the code could detect a large jump
> > in time and touch_softlockup_watchdog_sync() or something to delay the
> > check until the next cycle.
> 
> But this would silence any softlockups that are due to delays
> in the host causing the watchdog task to make progress (eg:
> https://lkml.org/lkml/2013/6/20/633, in that case if 1 operation took
> longer than expected your suggestion would silence the report).

Ok.  I don't fully understand that problem; the changelog was a little
vague.

> 
> > That would make the watchdog code alot less messier than having custom
> > kvm/paravirt splat all over it.  Generic solutions are always nice. :-)
> 
> Can you give more detail on what the suggestion is and how can you deal
> with points 1 and 2 above?

I don't have a good suggestion, just a lot of questions really.  The thing
is, there are lots of watchdogs in the system (i.e. the clock watchdog,
filesystem watchdog, rcu stalls, etc.).  Solving this problem just for the
lockup watchdog doesn't seem right, because if the lockup timeout were
longer, you would probably hit the other watchdogs too.

So my suggestion (based on my ignorance of how the clock code works) is
that some sort of generic mechanism be applied to all the watchdogs, much
like how kgdb touches all of them at once when it handles an exception.

For example, unpausing a guest could be a good time to touch all the
watchdogs as you have no idea how long the pause was.  I can't think of
any hook for an overcommit though.

Cheers,
Don


* Re: watchdog: print stolen time increment at softlockup detection
From: Marcelo Tosatti @ 2013-07-04  2:15 UTC
  To: Don Zickus
  Cc: kvm-devel, linux-kernel, Karen Noel, Rik van Riel,
	Prarit Bhargava, Thomas Gleixner

On Wed, Jul 03, 2013 at 12:44:01PM -0400, Don Zickus wrote:
> On Fri, Jun 28, 2013 at 05:37:39PM -0300, Marcelo Tosatti wrote:
> > On Fri, Jun 28, 2013 at 10:12:15AM -0400, Don Zickus wrote:
> > > On Thu, Jun 27, 2013 at 11:57:23PM -0300, Marcelo Tosatti wrote:
> > > > 
> > > > One possibility for a softlockup report in a Linux VM, is that the host
> > > > system is overcommitted to the point where the watchdog task is unable
> > > > to make progress (unable to touch the watchdog).
> > > 
> > > I think I am confused on the VM/host stuff.  How does an overcommitted
> > > host prevent a high priority task like the watchdog from running?
> > > 
> > > Or is it the watchdog task on the VM that is being blocked from running
> > > because the host is overcommitted and can't run the VM frequent enough?
> > 
> > Yes, thats the case.
> > 
> > > The latter would make sense, though I thought you solved that with the
> > > other kvm splat in the watchdog code a while ago.  So I would be
> > > interested in understanding why the previous solution isn't working.
> > 
> > That functionality is for a notification so the guest ignores the time
> > jump induced by a vm pause. This problem is similar to the kgdb case.
> > 
> > > Second, I am still curious how this problem differs from say kgdb or
> > > suspend-hibernate/resume.  Doesn't both of those scenarios deal with a
> > > clock that suddenly jumps forward without the watchdog task running?
> > 
> > The difference is this:
> > 
> > The present functionality in watchdog.c allows the hypervisor to notify
> > the guest that it should ignore the large delta seen via clock reads
> > (at the watchdog timer interrupt).
> > This notification is used for the case where the vm has been paused for
> > a period of time.
> 
> But why do this at the watchdog timer interrupt?  I thought this would be
> done at the lower layer like in sched_clock() or something.
> 
> > 
> > Are you suggesting the host should silence the guest watchdog, also in
> > the overcommitment case? Issues i see with that:
> > 
> > 1) The host is not aware of the variable softlockup threshold in
> > the guest.
> > 
> > 2) Whatever the threshold of overcommitment for sending the ignore
> > softlockup notification to the guest, genuine softlockup detections in
> > the guest could be silenced, given proper conditioning.
> 
> No.  That would be difficult as you described.  What I am trying to get at
> is, doesn't the guest /know/ time jumped when it schedules again?  And
> can't it determine based on this jump that something unreasonable
> happened like a long pause or and overcommit?

A large jump alone is not enough information to reset the watchdog(s).

For example, take this large jump scenario:

1. A guest instruction exits to the host for emulation.
2. The emulation completes after 10 minutes, and execution resumes at
the next instruction.
3. The watchdog detects the jump and prints a warning.

If the jump is due to inefficiency or incorrect emulation, the message
should be printed.
If the jump is due to a vm pause, the message should not be printed.

> > And why overcommitment is not a valid reason to generate a softlockup in
> > the first place ?
> 
> For the guest I don't believe it is.  It isn't the guest's fault it
> couldn't run processes.  A warning should be scheduled on the host that it
> couldn't run a process in a very long time.
>
> > > For some reason I had the impression that when a VM starts running again,
> > > one of the first things it does it sync up its clock again (which leads to
> > > a softlockup shortly thereafter in the case of paused/overcommitted VMs)?
> > 
> > Sort of, the kvmclock counts while the VM is running (whether is
> > overcommitted or not).
> 
> Does comparing the kvmclock with the current clock indicate that a long
> pause or an overcommit occurred?

By current clock you mean the system clock? sched_clock() reads from
kvmclock.

> > > At that time I would have thought that the code could detect a large jump
> > > in time and touch_softlockup_watchdog_sync() or something to delay the
> > > check until the next cycle.
> > 
> > But this would silence any softlockups that are due to delays
> > in the host causing the watchdog task to make progress (eg:
> > https://lkml.org/lkml/2013/6/20/633, in that case if 1 operation took
> > longer than expected your suggestion would silence the report).
> 
> Ok.  I don't fully understand that problem, the changelog was a little
> vague.

That problem is the large jump scenario with a guest instruction exiting
for emulation, described at the beginning of this message.

> > > That would make the watchdog code alot less messier than having custom
> > > kvm/paravirt splat all over it.  Generic solutions are always nice. :-)
> > 
> > Can you give more detail on what the suggestion is and how can you deal
> > with points 1 and 2 above?
> 
> I don't have a good suggestion, just a lot of questions really.  The thing
> is there are lots of watchdogs in the system (ie clock watchdog,
> filesystem watchdog, rcu stalls, etc).  Solving this problem just for the lockup
> watchdog doesn't seem right because if the lockup timeout was longer, you
> would probably hit the other watchdogs too.

Agreed. However, I can't see a way around "having custom kvm/paravirt
splat all over" for watchdogs that do the following:

1. Check for watchdog resets.
2. Read the time via sched_clock or xtime.
3. Based on 2, decide whether there has been a longer delay than is
acceptable.

This is the case for the softlockup timer interrupt. So the splat there
is necessary (otherwise a vm-pause notification noticed at step 2 might
be missed, because the reset is checked at step 1).
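
Roughly, that ordering looks like the following (an illustrative,
simplified sketch of the softlockup timer path, not the exact
watchdog.c code; the helper names and call placement are
approximations):

	static void softlockup_check(void)
	{
		/* 1. check for watchdog resets (last touch timestamp) */
		unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
		/* 2. read the time (sched_clock/kvmclock based) */
		unsigned long now = get_timestamp(smp_processor_id());

		/*
		 * The vm-pause notification must be consumed here, after the
		 * time read and before deciding; if it were only looked at in
		 * step 1, a pause observed via the time read would be missed.
		 */
		if (kvm_check_and_clear_guest_paused())
			return;

		/* 3. decide whether the delay is longer than acceptable */
		if (now - touch_ts > get_softlockup_thresh())
			printk(KERN_EMERG "BUG: soft lockup ...\n");
	}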

For watchdogs that measure time based on interrupt events (such as the
hung task and rcu_cpu_stall detectors), checking for the notification
at sched_clock or lower is fine.

> So my suggestion (based on my ignorance of how the clock code works) is
> that some sort of generic mechanism be applied to all the watchdogs.  Much
> like how kgdb touches all of them at once when it handles an exception.
> 
> For example, unpausing a guest could be a good time to touch all the
> watchdogs as you have no idea how long the pause was.  I can't think of
> any hook for an overcommit though.

It's a good suggestion; I will write a patch to touch the watchdogs at
kvmclock read time.
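
For illustration, such a hook might look roughly like the sketch below
(the placement in the kvmclock read path and the exact set of watchdogs
touched are assumptions here, not the final patch):

	/*
	 * Illustrative only: called when the pvclock read path observes
	 * the hypervisor's "guest was stopped" indication.
	 */
	static void touch_watchdogs_on_guest_resume(void)
	{
		touch_softlockup_watchdog_sync();	/* softlockup detector */
		clocksource_touch_watchdog();		/* clocksource watchdog */
		rcu_cpu_stall_reset();			/* RCU stall detector */
	}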

Thanks!



* Re: watchdog: print stolen time increment at softlockup detection
From: Marcelo Tosatti @ 2013-07-04  2:32 UTC
  To: Don Zickus
  Cc: kvm-devel, linux-kernel, Karen Noel, Rik van Riel,
	Prarit Bhargava, Thomas Gleixner

On Wed, Jul 03, 2013 at 12:44:01PM -0400, Don Zickus wrote:
> > And why overcommitment is not a valid reason to generate a softlockup in
> > the first place ?
> 
> For the guest I don't believe it is.  It isn't the guest's fault it
> couldn't run processes.  A warning should be scheduled on the host that it
> couldn't run a process in a very long time.

An interesting viewpoint is this: it does not matter who is at fault,
whether the host or the guest. What matters is that an application in
the guest is being starved of the minimum resources it needs for proper
functioning (as specified by the watchdog threshold).

The responsibility for verifying that the guest application gets the
minimum resources it needs to function should not rest entirely with
the host.


