linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] irq_work: allow certain work in hard irq context
@ 2014-01-31 14:34 Sebastian Andrzej Siewior
  2014-01-31 14:34 ` [PATCH 2/2] timer: really raise softirq if there is irq_work to do Sebastian Andrzej Siewior
  2014-02-02  4:22 ` [PATCH 1/2] irq_work: allow certain work in hard irq context Mike Galbraith
  0 siblings, 2 replies; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2014-01-31 14:34 UTC (permalink / raw)
  To: linux-rt-users; +Cc: linux-kernel, rostedt, tglx, Sebastian Andrzej Siewior

irq_work is processed in softirq context on -RT because we want to avoid
long latencies which might arise from processing lots of perf events.
The noHZ-full mode requires its callback to be called from real hardirq
context (commit 76c24fb ("nohz: New APIs to re-evaluate the tick on full
dynticks CPUs")). If it is called from a thread context we might get
wrong results for checks like "is_idle_task(current)".
This patch introduces a second list (hirq_work_list) which will be used
if irq_work_run() has been invoked from hardirq context and process only
work items marked with IRQ_WORK_HARD_IRQ.

This patch also removes arch_irq_work_raise() from sparc & powerpc like
it is already done for x86. At least for powerpc it is somewhat
superfluous because it is called from the timer interrupt which should
invoke update_process_times().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 arch/powerpc/kernel/time.c |  2 +-
 arch/sparc/kernel/pcr.c    |  2 ++
 include/linux/irq_work.h   |  1 +
 kernel/irq_work.c          | 23 ++++++++++++++++++++---
 kernel/time/tick-sched.c   |  1 +
 kernel/timer.c             |  2 +-
 6 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index b3b1441..5ac241b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -423,7 +423,7 @@ unsigned long profile_pc(struct pt_regs *regs)
 EXPORT_SYMBOL(profile_pc);
 #endif
 
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL)
 
 /*
  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c
index 269af58..dbb51a6 100644
--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -43,10 +43,12 @@ void __irq_entry deferred_pcr_work_irq(int irq, struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
+#ifndef CONFIG_PREEMPT_RT_FULL
 void arch_irq_work_raise(void)
 {
 	set_softint(1 << PIL_DEFERRED_PCR_WORK);
 }
+#endif
 
 const struct pcr_ops *pcr_ops;
 EXPORT_SYMBOL_GPL(pcr_ops);
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 6601702..60c19ee 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -16,6 +16,7 @@
 #define IRQ_WORK_BUSY		2UL
 #define IRQ_WORK_FLAGS		3UL
 #define IRQ_WORK_LAZY		4UL /* Doesn't want IPI, wait for tick */
+#define IRQ_WORK_HARD_IRQ	8UL /* Run hard IRQ context, even on RT */
 
 struct irq_work {
 	unsigned long flags;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f6e4377..3dc166d 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -20,6 +20,9 @@
 
 
 static DEFINE_PER_CPU(struct llist_head, irq_work_list);
+#ifdef CONFIG_PREEMPT_RT_FULL
+static DEFINE_PER_CPU(struct llist_head, hirq_work_list);
+#endif
 static DEFINE_PER_CPU(int, irq_work_raised);
 
 /*
@@ -48,7 +51,11 @@ static bool irq_work_claim(struct irq_work *work)
 	return true;
 }
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+void arch_irq_work_raise(void)
+#else
 void __weak arch_irq_work_raise(void)
+#endif
 {
 	/*
 	 * Lame architectures will get the timer tick callback
@@ -70,8 +77,12 @@ void irq_work_queue(struct irq_work *work)
 	/* Queue the entry and raise the IPI if needed. */
 	preempt_disable();
 
-	llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
-
+#ifdef CONFIG_PREEMPT_RT_FULL
+	if (work->flags & IRQ_WORK_HARD_IRQ)
+		llist_add(&work->llnode, &__get_cpu_var(hirq_work_list));
+	else
+#endif
+		llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
 	/*
 	 * If the work is not "lazy" or the tick is stopped, raise the irq
 	 * work interrupt (if supported by the arch), otherwise, just wait
@@ -115,7 +126,13 @@ static void __irq_work_run(void)
 	__this_cpu_write(irq_work_raised, 0);
 	barrier();
 
-	this_list = &__get_cpu_var(irq_work_list);
+	pr_err("%s(%d) %d\n", __func__, __LINE__, in_irq() ? 1 : 0);
+#ifdef CONFIG_PREEMPT_RT_FULL
+	if (in_irq())
+		this_list = &__get_cpu_var(hirq_work_list);
+	else
+#endif
+		this_list = &__get_cpu_var(irq_work_list);
 	if (llist_empty(this_list))
 		return;
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4675aab..a236875 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -221,6 +221,7 @@ static void nohz_full_kick_work_func(struct irq_work *work)
 
 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
 	.func = nohz_full_kick_work_func,
+	.flags = IRQ_WORK_HARD_IRQ,
 };
 
 /*
diff --git a/kernel/timer.c b/kernel/timer.c
index 106968f..6b3c172 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1425,7 +1425,7 @@ void update_process_times(int user_tick)
 	scheduler_tick();
 	run_local_timers();
 	rcu_check_callbacks(cpu, user_tick);
-#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL)
+#if defined(CONFIG_IRQ_WORK)
 	if (in_irq())
 		irq_work_run();
 #endif
-- 
1.8.5.3


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 14:34 [PATCH 1/2] irq_work: allow certain work in hard irq context Sebastian Andrzej Siewior
@ 2014-01-31 14:34 ` Sebastian Andrzej Siewior
  2014-01-31 17:07   ` Steven Rostedt
  2014-02-02  4:22 ` [PATCH 1/2] irq_work: allow certain work in hard irq context Mike Galbraith
  1 sibling, 1 reply; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2014-01-31 14:34 UTC (permalink / raw)
  To: linux-rt-users; +Cc: linux-kernel, rostedt, tglx, Sebastian Andrzej Siewior

From looking at the code, it seems that the softirq is only raised (in
the !base->active_timers case) if we also have an expired timer
(time_before_eq() is true). This patch ensures that the timer softirq is
also raised in the !base->active_timers && no-timer-expired case.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/timer.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/timer.c b/kernel/timer.c
index 6b3c172..b04c879 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1453,6 +1453,7 @@ static void run_timer_softirq(struct softirq_action *h)
 void run_local_timers(void)
 {
 	struct tvec_base *base = __this_cpu_read(tvec_bases);
+	bool need_raise = false;
 
 	hrtimer_run_queues();
 	/*
@@ -1470,12 +1471,17 @@ void run_local_timers(void)
 #ifdef CONFIG_PREEMPT_RT_FULL
 		/* On RT, irq work runs from softirq */
 		if (!irq_work_needs_cpu())
-#endif
 			goto out;
+		need_raise = true;
+#else
+		goto out;
+#endif
 	}
 
 	/* Check whether the next pending timer has expired */
 	if (time_before_eq(base->next_timer, jiffies))
+		need_raise = true;
+	if (need_raise)
 		raise_softirq(TIMER_SOFTIRQ);
 out:
 #ifdef CONFIG_PREEMPT_RT_FULL
-- 
1.8.5.3


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 14:34 ` [PATCH 2/2] timer: really raise softirq if there is irq_work to do Sebastian Andrzej Siewior
@ 2014-01-31 17:07   ` Steven Rostedt
  2014-01-31 17:11     ` Steven Rostedt
                       ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: Steven Rostedt @ 2014-01-31 17:07 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, tglx, Paul E. McKenney, Clark Williams

On Fri, 31 Jan 2014 15:34:05 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> from looking at the code, it seems that the softirq is only raised (in
> the !base->active_timers case) if we have also an expired timer
> (time_before_eq() is true). This patch ensures that the timer softirq is
> also raised in the !base->active_timers && no timer expired.

A couple of things. If there are no active timers, we do not need to
check the expired timers. That may contain a deferred timer that does
not need to be raised if the system is idle. This will just
re-introduce the problems that other people have been seeing.

The bug that I found is that if there *are* active timers, but they
have not expired yet. Why is this a problem? Because in that case we do
not check if there is irq_work to be done. That means the irq_work will
have to wait till the timer expires, and since RCU depends on this,
that can take a while. I've had a synchronize_sched() take up to 5
seconds to complete due to this!


The real fix is the following:

timer/rt: Always raise the softirq if there's irq_work to be done

It was previously discovered that some systems would hang on boot up
with a previous version of 3.12-rt. This was due to RCU using irq_work,
and RT defers the irq_work to a softirq. But if there's no active
timers, the softirq will not be raised, and RCU work will not get done,
causing the system to hang.  The fix was to check that if there was no
active timers but irq_work to be done, then we should raise the softirq.

But this fix was not 100% correct. It left out the case that there were
active timers that were not expired yet. This would have the softirq
not get raised even if there was irq work to be done.

If there is irq_work to be done, then we must raise the timer softirq
regardless of if there is active timers or whether they are expired or
not. The softirq can handle those cases. But we can never ignore
irq_work.

As it is only PREEMPT_RT_FULL that requires irq_work to be done in the
softirq, we can pull out the check in the active_timers condition, and
make the code a bit cleaner by having the irq_work check separate, and
put the code in with the other #ifdef PREEMPT_RT. If there is irq_work
to be done, there's no need to check the active timers or if they are
expired. Just raise the timer softirq and be done with it. Otherwise, we
can do the timer checks just like we do with non -rt.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/timer.c b/kernel/timer.c
index 106968f..426d114 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1461,18 +1461,20 @@ void run_local_timers(void)
 	 * the timer softirq.
 	 */
 #ifdef CONFIG_PREEMPT_RT_FULL
+	/* On RT, irq work runs from softirq */
+	if (irq_work_needs_cpu()) {
+		raise_softirq(TIMER_SOFTIRQ);
+		return;
+	}
+
 	if (!spin_do_trylock(&base->lock)) {
 		raise_softirq(TIMER_SOFTIRQ);
 		return;
 	}
 #endif
-	if (!base->active_timers) {
-#ifdef CONFIG_PREEMPT_RT_FULL
-		/* On RT, irq work runs from softirq */
-		if (!irq_work_needs_cpu())
-#endif
-			goto out;
-	}
+
+	if (!base->active_timers)
+		goto out;
 
 	/* Check whether the next pending timer has expired */
 	if (time_before_eq(base->next_timer, jiffies))

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 17:07   ` Steven Rostedt
@ 2014-01-31 17:11     ` Steven Rostedt
  2014-01-31 17:42     ` Paul E. McKenney
  2014-01-31 19:06     ` Sebastian Andrzej Siewior
  2 siblings, 0 replies; 22+ messages in thread
From: Steven Rostedt @ 2014-01-31 17:11 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Sebastian Andrzej Siewior, linux-rt-users, linux-kernel, tglx,
	Paul E. McKenney, Clark Williams

On Fri, 31 Jan 2014 12:07:57 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
 
> diff --git a/kernel/timer.c b/kernel/timer.c
> index 106968f..426d114 100644
> --- a/kernel/timer.c
> +++ b/kernel/timer.c
> @@ -1461,18 +1461,20 @@ void run_local_timers(void)
>  	 * the timer softirq.
>  	 */
>  #ifdef CONFIG_PREEMPT_RT_FULL
> +	/* On RT, irq work runs from softirq */
> +	if (irq_work_needs_cpu()) {
> +		raise_softirq(TIMER_SOFTIRQ);
> +		return;
> +	}
> +
>  	if (!spin_do_trylock(&base->lock)) {
>  		raise_softirq(TIMER_SOFTIRQ);
>  		return;
>  	}
>  #endif

Note, I debated about doing:

	if (irq_work_needs_cpu() ||
	    !spin_do_trylock(&base->lock)) {
		raise_softirq(TIMER_SOFTIRQ);
		return;
	}

instead, which is pretty much the same code. But I find it rather ugly,
and does not read as well. I haven't looked at the disassembly, but I
would hope that gcc would make my original version have the same result
as this one.

-- Steve

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 17:07   ` Steven Rostedt
  2014-01-31 17:11     ` Steven Rostedt
@ 2014-01-31 17:42     ` Paul E. McKenney
  2014-01-31 17:57       ` Steven Rostedt
  2014-01-31 19:06     ` Sebastian Andrzej Siewior
  2 siblings, 1 reply; 22+ messages in thread
From: Paul E. McKenney @ 2014-01-31 17:42 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Sebastian Andrzej Siewior, linux-rt-users, linux-kernel, tglx,
	Clark Williams

On Fri, Jan 31, 2014 at 12:07:57PM -0500, Steven Rostedt wrote:
> On Fri, 31 Jan 2014 15:34:05 +0100
> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> > from looking at the code, it seems that the softirq is only raised (in
> > the !base->active_timers case) if we have also an expired timer
> > (time_before_eq() is true). This patch ensures that the timer softirq is
> > also raised in the !base->active_timers && no timer expired.
> 
> A couple of things. If there is no active timers, we do not need to
> check the expired timers. That may contain a deferred timer that does
> not need to be raised if the system is idle. This will just
> re-introduce the problems that other people have been seeing.
> 
> The bug that I found is that if there *are* active timers, but they
> have not expired yet. Why is this a problem? Because in that case we do
> not check if there is irq_work to be done. That means the irq_work will
> have to wait till the timer expires, and since RCU depends on this,
> that can take a while. I've had a synchronize_sched() take up to 5
> seconds to complete due to this!
> 
> 
> The real fix is the following:
> 
> timer/rt: Always raise the softirq if there's irq_work to be done
> 
> It was previously discovered that some systems would hang on boot up
> with a previous version of 3.12-rt. This was due to RCU using irq_work,
> and RT defers the irq_work to a softirq. But if there's no active
> timers, the softirq will not be raised, and RCU work will not get done,
> causing the system to hang.  The fix was to check that if there was no
> active timers but irq_work to be done, then we should raise the softirq.
> 
> But this fix was not 100% correct. It left out the case that there were
> active timers that were not expired yet. This would have the softirq
> not get raised even if there was irq work to be done.
> 
> If there is irq_work to be done, then we must raise the timer softirq
> regardless of if there is active timers or whether they are expired or
> not. The softirq can handle those cases. But we can never ignore
> irq_work.
> 
> As it is only PREEMPT_RT_FULL that requires irq_work to be done in the
> softirq, we can pull out the check in the active_timers condition, and
> make the code a bit cleaner by having the irq_work check separate, and
> put the code in with the other #ifdef PREEMPT_RT. If there is irq_work
> to be done, there's no need to check the active timers or if they are
> expired. Just raise the time softirq and be done with it. Otherwise, we
> can do the timer checks just like we do with non -rt.
> 
> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
> 
> diff --git a/kernel/timer.c b/kernel/timer.c
> index 106968f..426d114 100644
> --- a/kernel/timer.c
> +++ b/kernel/timer.c
> @@ -1461,18 +1461,20 @@ void run_local_timers(void)
>  	 * the timer softirq.
>  	 */
>  #ifdef CONFIG_PREEMPT_RT_FULL
> +	/* On RT, irq work runs from softirq */
> +	if (irq_work_needs_cpu()) {
> +		raise_softirq(TIMER_SOFTIRQ);

OK, I'll bite...  What if the IRQ work that needs doing is something
other than TIMER_SOFTIRQ?

							Thanx, Paul

> +		return;
> +	}
> +
>  	if (!spin_do_trylock(&base->lock)) {
>  		raise_softirq(TIMER_SOFTIRQ);
>  		return;
>  	}
>  #endif
> -	if (!base->active_timers) {
> -#ifdef CONFIG_PREEMPT_RT_FULL
> -		/* On RT, irq work runs from softirq */
> -		if (!irq_work_needs_cpu())
> -#endif
> -			goto out;
> -	}
> +
> +	if (!base->active_timers)
> +		goto out;
> 
>  	/* Check whether the next pending timer has expired */
>  	if (time_before_eq(base->next_timer, jiffies))
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 17:42     ` Paul E. McKenney
@ 2014-01-31 17:57       ` Steven Rostedt
  2014-01-31 19:03         ` Paul E. McKenney
  2014-01-31 19:26         ` Sebastian Andrzej Siewior
  0 siblings, 2 replies; 22+ messages in thread
From: Steven Rostedt @ 2014-01-31 17:57 UTC (permalink / raw)
  To: paulmck
  Cc: Sebastian Andrzej Siewior, linux-rt-users, linux-kernel, tglx,
	Clark Williams

On Fri, 31 Jan 2014 09:42:27 -0800
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:

> On Fri, Jan 31, 2014 at 12:07:57PM -0500, Steven Rostedt wrote:
> > On Fri, 31 Jan 2014 15:34:05 +0100
> > Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> > 
> > > from looking at the code, it seems that the softirq is only raised (in
> > > the !base->active_timers case) if we have also an expired timer
> > > (time_before_eq() is true). This patch ensures that the timer softirq is
> > > also raised in the !base->active_timers && no timer expired.
> > 
> > A couple of things. If there is no active timers, we do not need to
> > check the expired timers. That may contain a deferred timer that does
> > not need to be raised if the system is idle. This will just
> > re-introduce the problems that other people have been seeing.
> > 
> > The bug that I found is that if there *are* active timers, but they
> > have not expired yet. Why is this a problem? Because in that case we do
> > not check if there is irq_work to be done. That means the irq_work will
> > have to wait till the timer expires, and since RCU depends on this,
> > that can take a while. I've had a synchronize_sched() take up to 5
> > seconds to complete due to this!
> > 
> > 
> > The real fix is the following:
> > 
> > timer/rt: Always raise the softirq if there's irq_work to be done
> > 
> > It was previously discovered that some systems would hang on boot up
> > with a previous version of 3.12-rt. This was due to RCU using irq_work,
> > and RT defers the irq_work to a softirq. But if there's no active
> > timers, the softirq will not be raised, and RCU work will not get done,
> > causing the system to hang.  The fix was to check that if there was no
> > active timers but irq_work to be done, then we should raise the softirq.
> > 
> > But this fix was not 100% correct. It left out the case that there were
> > active timers that were not expired yet. This would have the softirq
> > not get raised even if there was irq work to be done.
> > 
> > If there is irq_work to be done, then we must raise the timer softirq
> > regardless of if there is active timers or whether they are expired or
> > not. The softirq can handle those cases. But we can never ignore
> > irq_work.
> > 
> > As it is only PREEMPT_RT_FULL that requires irq_work to be done in the
> > softirq, we can pull out the check in the active_timers condition, and
> > make the code a bit cleaner by having the irq_work check separate, and
> > put the code in with the other #ifdef PREEMPT_RT. If there is irq_work
> > to be done, there's no need to check the active timers or if they are
> > expired. Just raise the time softirq and be done with it. Otherwise, we
> > can do the timer checks just like we do with non -rt.
> > 
> > Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
> > 
> > diff --git a/kernel/timer.c b/kernel/timer.c
> > index 106968f..426d114 100644
> > --- a/kernel/timer.c
> > +++ b/kernel/timer.c
> > @@ -1461,18 +1461,20 @@ void run_local_timers(void)
> >  	 * the timer softirq.
> >  	 */
> >  #ifdef CONFIG_PREEMPT_RT_FULL
> > +	/* On RT, irq work runs from softirq */
> > +	if (irq_work_needs_cpu()) {
> > +		raise_softirq(TIMER_SOFTIRQ);
> 
> OK, I'll bite...  What if the IRQ work that needs doing is something
> other than TIMER_SOFTIRQ?

Heh, don't let the timer part confuse you. The only reason that softirq
is relevant to irq_work is that is the softirq that we placed the
irq_work to be done. If you look at the code that is called for that
softirq (in -rt) you'll see:

static void run_timer_softirq(struct softirq_action *h)
{
	struct tvec_base *base = __this_cpu_read(tvec_bases);

#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
	irq_work_run();
#endif

	if (time_after_eq(jiffies, base->timer_jiffies))
		__run_timers(base);
}

And we also have:

void update_process_times(int user_tick)
{
	struct task_struct *p = current;
	int cpu = smp_processor_id();

	/* Note: this timer irq context must be accounted for as well. */
	account_process_tick(p, user_tick);
	scheduler_tick();
	run_local_timers();
	rcu_check_callbacks(cpu, user_tick);
#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL)
	if (in_irq())
		irq_work_run();
#endif
	run_posix_cpu_timers(p);
}


In vanilla Linux, irq_work_run() is called from update_process_times()
when it is called from the timer interrupt. In -rt, there's reasons we
can't do the irq work from hard irq, so we push it off to the timer
softirq, and run it there.

That means if we have *any* irq work to do, we raise the timer softirq,
even if the work to be done has nothing to do with timers. As you can
see from the softirq timer code, in -rt, irq_work_run() is always
called, without having to look at any timers.

-- Steve



> 
> 							Thanx, Paul
> 
> > +		return;
> > +	}
> > +
> >  	if (!spin_do_trylock(&base->lock)) {

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 17:57       ` Steven Rostedt
@ 2014-01-31 19:03         ` Paul E. McKenney
  2014-01-31 19:26         ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 22+ messages in thread
From: Paul E. McKenney @ 2014-01-31 19:03 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Sebastian Andrzej Siewior, linux-rt-users, linux-kernel, tglx,
	Clark Williams

On Fri, Jan 31, 2014 at 12:57:19PM -0500, Steven Rostedt wrote:
> On Fri, 31 Jan 2014 09:42:27 -0800
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> 
> > On Fri, Jan 31, 2014 at 12:07:57PM -0500, Steven Rostedt wrote:
> > > On Fri, 31 Jan 2014 15:34:05 +0100
> > > Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> > > 
> > > > from looking at the code, it seems that the softirq is only raised (in
> > > > the !base->active_timers case) if we have also an expired timer
> > > > (time_before_eq() is true). This patch ensures that the timer softirq is
> > > > also raised in the !base->active_timers && no timer expired.
> > > 
> > > A couple of things. If there is no active timers, we do not need to
> > > check the expired timers. That may contain a deferred timer that does
> > > not need to be raised if the system is idle. This will just
> > > re-introduce the problems that other people have been seeing.
> > > 
> > > The bug that I found is that if there *are* active timers, but they
> > > have not expired yet. Why is this a problem? Because in that case we do
> > > not check if there is irq_work to be done. That means the irq_work will
> > > have to wait till the timer expires, and since RCU depends on this,
> > > that can take a while. I've had a synchronize_sched() take up to 5
> > > seconds to complete due to this!
> > > 
> > > 
> > > The real fix is the following:
> > > 
> > > timer/rt: Always raise the softirq if there's irq_work to be done
> > > 
> > > It was previously discovered that some systems would hang on boot up
> > > with a previous version of 3.12-rt. This was due to RCU using irq_work,
> > > and RT defers the irq_work to a softirq. But if there's no active
> > > timers, the softirq will not be raised, and RCU work will not get done,
> > > causing the system to hang.  The fix was to check that if there was no
> > > active timers but irq_work to be done, then we should raise the softirq.
> > > 
> > > But this fix was not 100% correct. It left out the case that there were
> > > active timers that were not expired yet. This would have the softirq
> > > not get raised even if there was irq work to be done.
> > > 
> > > If there is irq_work to be done, then we must raise the timer softirq
> > > regardless of if there is active timers or whether they are expired or
> > > not. The softirq can handle those cases. But we can never ignore
> > > irq_work.
> > > 
> > > As it is only PREEMPT_RT_FULL that requires irq_work to be done in the
> > > softirq, we can pull out the check in the active_timers condition, and
> > > make the code a bit cleaner by having the irq_work check separate, and
> > > put the code in with the other #ifdef PREEMPT_RT. If there is irq_work
> > > to be done, there's no need to check the active timers or if they are
> > > expired. Just raise the time softirq and be done with it. Otherwise, we
> > > can do the timer checks just like we do with non -rt.
> > > 
> > > Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
> > > 
> > > diff --git a/kernel/timer.c b/kernel/timer.c
> > > index 106968f..426d114 100644
> > > --- a/kernel/timer.c
> > > +++ b/kernel/timer.c
> > > @@ -1461,18 +1461,20 @@ void run_local_timers(void)
> > >  	 * the timer softirq.
> > >  	 */
> > >  #ifdef CONFIG_PREEMPT_RT_FULL
> > > +	/* On RT, irq work runs from softirq */
> > > +	if (irq_work_needs_cpu()) {
> > > +		raise_softirq(TIMER_SOFTIRQ);
> > 
> > OK, I'll bite...  What if the IRQ work that needs doing is something
> > other than TIMER_SOFTIRQ?
> 
> Heh, don't let the timer part confuse you. The only reason that softirq
> is relevant to irq_work is that is the softirq that we placed the
> irq_work to be done. If you look at the code that is called for that
> softirq (in -rt) you'll see:
> 
> static void run_timer_softirq(struct softirq_action *h)
> {
> 	struct tvec_base *base = __this_cpu_read(tvec_bases);
> 
> #if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
> 	irq_work_run();
> #endif
> 
> 	if (time_after_eq(jiffies, base->timer_jiffies))
> 		__run_timers(base);
> }
> 
> And we also have:
> 
> void update_process_times(int user_tick)
> {
> 	struct task_struct *p = current;
> 	int cpu = smp_processor_id();
> 
> 	/* Note: this timer irq context must be accounted for as well. */
> 	account_process_tick(p, user_tick);
> 	scheduler_tick();
> 	run_local_timers();
> 	rcu_check_callbacks(cpu, user_tick);
> #if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL)
> 	if (in_irq())
> 		irq_work_run();
> #endif
> 	run_posix_cpu_timers(p);
> }
> 
> 
> In vanilla Linux, irq_work_run() is called from update_process_times()
> when it is called from the timer interrupt. In -rt, there's reasons we
> can't do the irq work from hard irq, so we push it off to the timer
> softirq, and run it there.
> 
> That means if we have *any* irq work to do, we raise the timer softirq,
> even if the work to be done has nothing to do with timers. As you can
> see from the softirq timer code, in -rt, irq_work_run() is always
> called, without having to look at any timers.

OK, got it!  Thank you for the tutorial.  ;-)

							Thanx, Paul


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 17:07   ` Steven Rostedt
  2014-01-31 17:11     ` Steven Rostedt
  2014-01-31 17:42     ` Paul E. McKenney
@ 2014-01-31 19:06     ` Sebastian Andrzej Siewior
  2 siblings, 0 replies; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2014-01-31 19:06 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-rt-users, linux-kernel, tglx, Paul E. McKenney, Clark Williams

On 01/31/2014 06:07 PM, Steven Rostedt wrote:
> The bug that I found is that if there *are* active timers, but they
> have not expired yet. Why is this a problem? Because in that case we do

Argh, right. But your patch also looks way better. After I made it, I
was not too happy about the amount of ifdefs and wanted to redo it later.
What you just posted is a way better solution.

Sebastian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 17:57       ` Steven Rostedt
  2014-01-31 19:03         ` Paul E. McKenney
@ 2014-01-31 19:26         ` Sebastian Andrzej Siewior
  2014-01-31 19:34           ` Steven Rostedt
  1 sibling, 1 reply; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2014-01-31 19:26 UTC (permalink / raw)
  To: Steven Rostedt, peterz, paulmck
  Cc: linux-rt-users, linux-kernel, tglx, Clark Williams

On 01/31/2014 06:57 PM, Steven Rostedt wrote:

> In vanilla Linux, irq_work_run() is called from update_process_times()
> when it is called from the timer interrupt. In -rt, there's reasons we

and in vanilla Linux some architectures (like x86 or sparc to name just
a few) overwrite arch_irq_work_raise() which means they provide
their "own" interrupt like callback. That means on those architectures
irq_work_run() gets invoked twice: once via update_process_times() and
once via the custom interface.
So my question to the original inventor of this code: Peter, do we
really need that arch specific callback? Wouldn't one be enough? Is it
that critical that it can't wait to the next timer tick?

Sebastian


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 19:26         ` Sebastian Andrzej Siewior
@ 2014-01-31 19:34           ` Steven Rostedt
  2014-01-31 19:48             ` Sebastian Andrzej Siewior
  2014-01-31 19:54             ` Peter Zijlstra
  0 siblings, 2 replies; 22+ messages in thread
From: Steven Rostedt @ 2014-01-31 19:34 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: peterz, paulmck, linux-rt-users, linux-kernel, tglx, Clark Williams

On Fri, 31 Jan 2014 20:26:54 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> On 01/31/2014 06:57 PM, Steven Rostedt wrote:
> 
> > In vanilla Linux, irq_work_run() is called from update_process_times()
> > when it is called from the timer interrupt. In -rt, there's reasons we
> 
> and in vanilla Linux some architectures (like x86 or sparc to name just
> a few) overwrite arch_irq_work_raise() which means they provide
> their "own" interrupt like callback. That means on those architectures
> irq_work_run() gets invoked twice: once via update_process_times() and
> via and once the custom interface.
> So my question to the original inventor of this code: Peter, do we
> really need that arch specific callback? Wouldn't one be enough? Is it
> that critical that it can't wait to the next timer tick?

There's flags that determine when the next call should be invoked. The
irq_work_run() should return immediately if it was already done by the
arch specific call. The work wont be called twice.

As I have worked on code that uses irq_work() I can say that we want
the arch specific interrupts. For those architectures that don't have
it will experience larger latencies for the work required. It's
basically, a "too bad" for them.

But to answer your question, no we want the immediate response.

-- Steve

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 19:34           ` Steven Rostedt
@ 2014-01-31 19:48             ` Sebastian Andrzej Siewior
  2014-01-31 19:56               ` Steven Rostedt
  2014-01-31 20:05               ` Peter Zijlstra
  2014-01-31 19:54             ` Peter Zijlstra
  1 sibling, 2 replies; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2014-01-31 19:48 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: peterz, paulmck, linux-rt-users, linux-kernel, tglx, Clark Williams

On 01/31/2014 08:34 PM, Steven Rostedt wrote:
> There's flags that determine when the next call should be invoked. The
> irq_work_run() should return immediately if it was already done by the
> arch specific call. The work wont be called twice.

Well, it is called twice. It just does nothing because the list is
empty & returns.

> As I have worked on code that uses irq_work() I can say that we want
> the arch specific interrupts. For those architectures that don't have
> it will experience larger latencies for the work required. It's
> basically, a "too bad" for them.

How "bad" is it? Is this something generic or just not getting
perf events fast enough out? Most users don't seem to require small
latencies.

> But to answer your question, no we want the immediate response.
> 
> -- Steve

Sebastian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 19:34           ` Steven Rostedt
  2014-01-31 19:48             ` Sebastian Andrzej Siewior
@ 2014-01-31 19:54             ` Peter Zijlstra
  1 sibling, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2014-01-31 19:54 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Sebastian Andrzej Siewior, paulmck, linux-rt-users, linux-kernel,
	tglx, Clark Williams

On Fri, Jan 31, 2014 at 02:34:41PM -0500, Steven Rostedt wrote:
> On Fri, 31 Jan 2014 20:26:54 +0100
> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> > On 01/31/2014 06:57 PM, Steven Rostedt wrote:
> > 
> > > In vanilla Linux, irq_work_run() is called from update_process_times()
> > > when it is called from the timer interrupt. In -rt, there's reasons we
> > 
> > and in vanilla Linux some architectures (like x86 or sparc to name just
> > a few) overwrite arch_irq_work_raise() which means they provide
> > their "own" interrupt like callback. That means on those architectures
> > irq_work_run() gets invoked twice: once via update_process_times() and
> > via and once the custom interface.
> > So my question to the original inventor of this code: Peter, do we
> > really need that arch specific callback? Wouldn't one be enough? Is it
> > that critical that it can't wait to the next timer tick?
> 
> There's flags that determine when the next call should be invoked. The
> irq_work_run() should return immediately if it was already done by the
> arch specific call. The work wont be called twice.
> 
> As I have worked on code that uses irq_work() I can say that we want
> the arch specific interrupts. For those architectures that don't have
> it will experience larger latencies for the work required. It's
> basically, a "too bad" for them.
> 
> But to answer your question, no we want the immediate response.

Yah, what Steve said. That fallback is really a you suck to have to use
this.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 19:48             ` Sebastian Andrzej Siewior
@ 2014-01-31 19:56               ` Steven Rostedt
  2014-01-31 20:05               ` Peter Zijlstra
  1 sibling, 0 replies; 22+ messages in thread
From: Steven Rostedt @ 2014-01-31 19:56 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: peterz, paulmck, linux-rt-users, linux-kernel, tglx, Clark Williams

On Fri, 31 Jan 2014 20:48:45 +0100
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> > As I have worked on code that uses irq_work() I can say that we want
> > the arch specific interrupts. For those architectures that don't have
> > it will experience larger latencies for the work required. It's
> > basically, a "too bad" for them.
> 
> How "bad" is it? Is this something generic or just not getting
> perf events fast enough out? Most users don't seem to require small
> latencies.

I use it for waking up trace-cmd, and if it is too long, then we can
miss lots of events. Same goes for perf.

Remember, irq_work can be triggered from NMI context. That means, if
the CPU is idle, it may be several seconds before that work happens.
That is WAY too long to wait.

Anyway, your suggestion to get rid of the arch code doesn't help. We
call the irq_work_run() from interrupt context whether there is work or
not, with or without removing the arch code. The only thing your
suggestion will do is to push the work from happening immediately to
happening on the timer tick (which may be several seconds later). I
don't see the savings.

-- Steve


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 19:48             ` Sebastian Andrzej Siewior
  2014-01-31 19:56               ` Steven Rostedt
@ 2014-01-31 20:05               ` Peter Zijlstra
  2014-01-31 20:23                 ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2014-01-31 20:05 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Steven Rostedt, paulmck, linux-rt-users, linux-kernel, tglx,
	Clark Williams

On Fri, Jan 31, 2014 at 08:48:45PM +0100, Sebastian Andrzej Siewior wrote:
> 
> How "bad" is it? Is this something generic or just not getting
> perf events fast enough out? Most users don't seem to require small
> latencies.

I have vague memories of there being an actual perf problem if there's a
hole between the NMI/IRQ triggering the irq_work and the interrupt
running the work.

I should have some notes on it somewhere and a todo entry to plug the
hole.

But note that the MCE code also uses irq_work, they really _need_ to be
fast because the system might be crumbling under their feet.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 20:05               ` Peter Zijlstra
@ 2014-01-31 20:23                 ` Sebastian Andrzej Siewior
  2014-01-31 20:29                   ` Peter Zijlstra
  0 siblings, 1 reply; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2014-01-31 20:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, paulmck, linux-rt-users, linux-kernel, tglx,
	Clark Williams

On 01/31/2014 09:05 PM, Peter Zijlstra wrote:
> On Fri, Jan 31, 2014 at 08:48:45PM +0100, Sebastian Andrzej Siewior wrote:
>>
>> How "bad" is it? Is this something generic or just not getting
>> perf events fast enough out? Most users don't seem to require small
>> latencies.
> 
> I have vague memories of there being an actual perf problem if there's a
> hole between the NMI/IRQ triggering the irq_work and the interrupt
> running the work.
> 
> I should have some notes on it somewhere and an todo entry to plug the
> hole.
> 
> But note that the MCE code also uses irq_work, they really _need_ to be
> fast because the system might be crumbling under their feet.

Okay, this makes sense. What looked odd was the powerpc implementation
where they let the timer interrupt expire and call the irq_work from
the timer interrupt just before the clockevents callback is executed
(which would invoke the irq_work callback as well).

Would it be a win if we would remove the arch specific code and instead
raise the timer interrupt asap? It sure won't be a win or change for
-RT but it would allow all architectures to get irq_work done as soon
as possible in IRQ context (and out of NMI context if I understand
correct) without an arch specific implementation.

Sebastian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/2] timer: really raise softirq if there is irq_work to do
  2014-01-31 20:23                 ` Sebastian Andrzej Siewior
@ 2014-01-31 20:29                   ` Peter Zijlstra
  0 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2014-01-31 20:29 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Steven Rostedt, paulmck, linux-rt-users, linux-kernel, tglx,
	Clark Williams

On Fri, Jan 31, 2014 at 09:23:52PM +0100, Sebastian Andrzej Siewior wrote:
> Okay, this makes sense. What looked odd was the powerpc implementation
> where they let the timer interrupt expire and call the irq_work from
> the timer interrupt just before the clockevents callback is executed
> (which would invoke the irq_work callback as well).

Ah, so what I remember Paul Mackerras saying was that that was the
easiest way to trigger an interrupt on the local CPU.

ARM runs the things from every IRQ tail IIRC (or maybe only the PMI, I
forget).

> Would it be a win if we would remove the arch specific code and instead
> raise the timer interrupt asap? It sure won't be a win or change for
> -RT but it would allow all architectures to get irq_work done as soon
> as possible in IRQ context (and out of NMI context if I understand
> correct) without an arch specific implementation.

No, because poking at timer hardware on x86 is stupid expensive, whereas
sending a self-IPI through the APIC is tons cheaper.

Also, I don't really want to poke at the fragile x86 timer hardware from
NMI context while we might already be poking at it from another context.



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/2] irq_work: allow certain work in hard irq context
  2014-01-31 14:34 [PATCH 1/2] irq_work: allow certain work in hard irq context Sebastian Andrzej Siewior
  2014-01-31 14:34 ` [PATCH 2/2] timer: really raise softirq if there is irq_work to do Sebastian Andrzej Siewior
@ 2014-02-02  4:22 ` Mike Galbraith
  2014-02-02 20:10   ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2014-02-02  4:22 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior; +Cc: linux-rt-users, linux-kernel, rostedt, tglx

On Fri, 2014-01-31 at 15:34 +0100, Sebastian Andrzej Siewior wrote: 
> irq_work is processed in softirq context on -RT because we want to avoid
> long latencies which might arise from processing lots of perf events.
> The noHZ-full mode requires its callback to be called from real hardirq
> context (commit 76c24fb ("nohz: New APIs to re-evaluate the tick on full
> dynticks CPUs")). If it is called from a thread context we might get
> wrong results for checks like "is_idle_task(current)".
> This patch introduces a second list (hirq_work_list) which will be used
> if irq_work_run() has been invoked from hardirq context and process only
> work items marked with IRQ_WORK_HARD_IRQ.

This patch (w. too noisy to live pr_err whacked) reliably kills my 64
core test box, but only in _virgin_ 3.12-rt11.  Add my local patches,
and it runs and runs, happy as a clam.  Odd.  But whatever, box with
virgin source running says it's busted.

Killing what was killable in this run before box had a chance to turn
into a brick, the two tasks below were left, burning 100% CPU until 5
minute RCU deadline expired.  All other cores were idle.

[  705.465667] INFO: rcu_preempt detected stalls on CPUs/tasks:
[  705.465674] 	5: (714 GPs behind) idle=b03/1/0 softirq=1/1 
[  705.465681] 	(detected by 0, t=300002 jiffies, g=14203, c=14202, q=0)
[  705.465681] sending NMI to all CPUs:
[  705.465685] NMI backtrace for cpu 0
[  705.465688] CPU: 0 PID: 0 Comm: swapper/0 Tainted: GF            3.12.9-rt11 #376
[  705.465689] Hardware name: Hewlett-Packard ProLiant DL980 G7, BIOS P66 07/07/2010
[  705.465691] task: ffffffff81a14460 ti: ffffffff81a00000 task.ti: ffffffff81a00000
[  705.465701] RIP: 0010:[<ffffffff8104155a>]  [<ffffffff8104155a>] native_write_msr_safe+0xa/0x10
[  705.465702] RSP: 0000:ffff880276e03c48  EFLAGS: 00000046
[  705.465703] RAX: 0000000000000400 RBX: 000000000000b084 RCX: 0000000000000830
[  705.465704] RDX: 0000000000000002 RSI: 0000000000000400 RDI: 0000000000000830
[  705.465705] RBP: ffff880276e03c48 R08: 0000000000000100 R09: ffffffff81ab74a0
[  705.465705] R10: 0000000000000502 R11: 0000000000000028 R12: ffffffff81ab74a0
[  705.465706] R13: 0000000000080000 R14: 0000000000000002 R15: 0000000000000002
[  705.465708] FS:  0000000000000000(0000) GS:ffff880276e00000(0000) knlGS:0000000000000000
[  705.465709] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  705.465710] CR2: 00007ff8086cbed0 CR3: 000000026347c000 CR4: 00000000000007f0
[  705.465710] Stack:
[  705.465712]  ffff880276e03cb8 ffffffff8103aab9 0000000000000001 0000000000000001
[  705.465714]  ffff880276e03cc8 ffffffff815d1810 0000000000000000 0000000000000092
[  705.465715]  ffff880276e03c98 0000000000000000 ffffffff81a42e00 ffffffff81ab7480
[  705.465716] Call Trace:
[  705.465718]  <IRQ> 
[  705.465722]  [<ffffffff8103aab9>] __x2apic_send_IPI_mask+0xa9/0xe0
[  705.465727]  [<ffffffff815d1810>] ? printk+0x54/0x78
[  705.465729]  [<ffffffff8103ab09>] x2apic_send_IPI_all+0x19/0x20
[  705.465731]  [<ffffffff81036533>] arch_trigger_all_cpu_backtrace+0x73/0xb0
[  705.465734]  [<ffffffff81103df9>] print_other_cpu_stall+0x259/0x360
[  705.465739]  [<ffffffff8100a8d0>] ? native_sched_clock+0x20/0xa0
[  705.465740]  [<ffffffff81103f88>] __rcu_pending+0x88/0x1f0
[  705.465742]  [<ffffffff811042e5>] rcu_check_callbacks+0x1f5/0x300
[  705.465745]  [<ffffffff81068346>] update_process_times+0x46/0x80
[  705.465749]  [<ffffffff810c4f02>] tick_sched_handle+0x32/0x70
[  705.465751]  [<ffffffff810c51d0>] tick_sched_timer+0x40/0x70
[  705.465755]  [<ffffffff81084b8d>] __run_hrtimer+0x14d/0x280
[  705.465757]  [<ffffffff810c5190>] ? tick_nohz_handler+0xa0/0xa0
[  705.465758]  [<ffffffff81084dea>] hrtimer_interrupt+0x12a/0x310
[  705.465762]  [<ffffffff815d94ef>] ? __atomic_notifier_call_chain+0x4f/0x70
[  705.465764]  [<ffffffff81034af6>] local_apic_timer_interrupt+0x36/0x60
[  705.465766]  [<ffffffff810359fe>] smp_apic_timer_interrupt+0x3e/0x60
[  705.465768]  [<ffffffff815ddcdd>] apic_timer_interrupt+0x6d/0x80
[  705.465770]  <EOI> 
[  705.465771]  [<ffffffff81041696>] ? native_safe_halt+0x6/0x10
[  705.465774]  [<ffffffff8100c8d3>] default_idle+0x83/0x120
[  705.465776]  [<ffffffff8100bfa6>] arch_cpu_idle+0x26/0x30
[  705.465778]  [<ffffffff810b341d>] cpu_idle_loop+0x28d/0x2e0
[  705.465779]  [<ffffffff810b34bc>] cpu_startup_entry+0x4c/0x50
[  705.465781]  [<ffffffff815c8fd3>] rest_init+0x83/0x90
[  705.465785]  [<ffffffff81ad5175>] start_kernel+0x3fc/0x4a3
[  705.465787]  [<ffffffff81ad4b66>] ? repair_env_string+0x58/0x58
[  705.465789]  [<ffffffff81ad451f>] x86_64_start_reservations+0x1b/0x32
[  705.465791]  [<ffffffff81ad46a5>] x86_64_start_kernel+0x16f/0x17e
[  705.465792]  [<ffffffff81ad4120>] ? early_idt_handlers+0x120/0x120
[  705.465805] Code: 00 55 89 f9 48 89 e5 0f 32 31 c9 89 c7 48 89 d0 89 0e 48 c1 e0 20 89 fa 48 09 d0 c9 c3 0f 1f 40 00 55 89 f9 89 f0 48 89 e5 0f 30 <31> c0 c9 c3 66 90 55 89 f9 48 89 e5 0f 33 89 c1 48 89 d0 48 c1 
[  705.466006] NMI backtrace for cpu 5
[  705.466009] CPU: 5 PID: 21792 Comm: cc1 Tainted: GF            3.12.9-rt11 #376
[  705.466010] Hardware name: Hewlett-Packard ProLiant DL980 G7, BIOS P66 07/07/2010
[  705.466011] task: ffff88026e9ebdb0 ti: ffff880037b62000 task.ti: ffff880037b62000
[  705.466015] RIP: 0010:[<ffffffff815d5450>]  [<ffffffff815d5450>] _raw_spin_unlock_irq+0x40/0x40
[  705.466016] RSP: 0000:ffff880276ea3d00  EFLAGS: 00000002
[  705.466017] RAX: ffff880276eadcc0 RBX: 00000000ffffffff RCX: 0000000000000086
[  705.466018] RDX: 0000000000000002 RSI: 0000000000000086 RDI: ffff880276eadc40
[  705.466019] RBP: ffff880276ea3d38 R08: 00000000000008ad R09: 00000000000000a2
[  705.466020] R10: 0000000000000005 R11: ffff880276eb41a0 R12: ffff880276eae4e0
[  705.466020] R13: ffff880276eadcc0 R14: 0000000000000000 R15: ffff880276eadcc0
[  705.466022] FS:  00002b5fa3f5c600(0000) GS:ffff880276ea0000(0000) knlGS:0000000000000000
[  705.466023] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  705.466023] CR2: 00002b5fa4c92000 CR3: 0000000078766000 CR4: 00000000000007e0
[  705.466024] Stack:
[  705.466026]  ffffffff81085074 ffff880276ea3d28 ffff88026e9ebe20 0000000000000086
[  705.466027]  ffff880276eae4e0 ffff880276eae4e0 000000000000000a ffff880276ea3d58
[  705.466028]  ffffffff81085160 ffff880276ea3d68 0000005e2f828d7f ffff880276ea3d78
[  705.466029] Call Trace:
[  705.466030]  <IRQ> 
[  705.466033]  [<ffffffff81085074>] ? hrtimer_try_to_cancel+0x44/0x110
[  705.466035]  [<ffffffff81085160>] hrtimer_cancel+0x20/0x30
[  705.466037]  [<ffffffff810c52b2>] tick_nohz_restart+0x12/0x90
[  705.466039]  [<ffffffff810c56da>] tick_nohz_restart_sched_tick+0x4a/0x60
[  705.466041]  [<ffffffff810c5e99>] __tick_nohz_full_check+0x89/0x90
[  705.466043]  [<ffffffff810c5ea9>] nohz_full_kick_work_func+0x9/0x10
[  705.466047]  [<ffffffff81129e89>] __irq_work_run+0x79/0xb0
[  705.466049]  [<ffffffff81129ec9>] irq_work_run+0x9/0x10
[  705.466051]  [<ffffffff81068362>] update_process_times+0x62/0x80
[  705.466053]  [<ffffffff810c4f02>] tick_sched_handle+0x32/0x70
[  705.466055]  [<ffffffff810c51d0>] tick_sched_timer+0x40/0x70
[  705.466057]  [<ffffffff81084b8d>] __run_hrtimer+0x14d/0x280
[  705.466059]  [<ffffffff810c5190>] ? tick_nohz_handler+0xa0/0xa0
[  705.466060]  [<ffffffff81084dea>] hrtimer_interrupt+0x12a/0x310
[  705.466065]  [<ffffffff81096e4c>] ? vtime_account_user+0x6c/0x100
[  705.466067]  [<ffffffff81034af6>] local_apic_timer_interrupt+0x36/0x60
[  705.466069]  [<ffffffff8103a8c4>] ? native_apic_msr_eoi_write+0x14/0x20
[  705.466071]  [<ffffffff810359fe>] smp_apic_timer_interrupt+0x3e/0x60
[  705.466074]  [<ffffffff815ddcdd>] apic_timer_interrupt+0x6d/0x80
[  705.466075]  <EOI> 
[  705.466088] Code: b9 00 00 83 aa 44 e0 ff ff 01 48 8b 82 38 e0 ff ff a8 08 75 0c 48 8b 82 38 e0 ff ff f6 c4 02 74 05 e8 45 dc ff ff c9 c3 0f 1f 00 <55> 48 89 e5 66 83 07 01 48 89 f7 57 9d 66 66 90 66 90 65 48 8b 
[  705.468619] NMI backtrace for cpu 52
[  705.468622] CPU: 52 PID: 23285 Comm: objdump Tainted: GF            3.12.9-rt11 #376
[  705.468623] Hardware name: Hewlett-Packard ProLiant DL980 G7, BIOS P66 07/07/2010
[  705.468625] task: ffff8802640c5820 ti: ffff8801e8b0c000 task.ti: ffff8801e8b0c000
[  705.468634] RIP: 0010:[<ffffffff81085083>]  [<ffffffff81085083>] hrtimer_try_to_cancel+0x53/0x110
[  705.468635] RSP: 0000:ffff880277483d40  EFLAGS: 00000046
[  705.468636] RAX: 00000000ffffffff RBX: ffff88027748e4e0 RCX: 0000000000000086
[  705.468637] RDX: ffff8801e8b0dfd8 RSI: 0000000000000086 RDI: 0000000000000086
[  705.468638] RBP: ffff880277483d58 R08: 000000000000013e R09: 000000000000012f
[  705.468639] R10: 0000000000000005 R11: ffff8802774941a0 R12: ffff88027748e4e0
[  705.468640] R13: 000000000000000a R14: 0000000000000000 R15: ffff88027748dcc0
[  705.468642] FS:  00002ab0cef7d100(0000) GS:ffff880277480000(0000) knlGS:0000000000000000
[  705.468643] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  705.468644] CR2: 00002ab0cff9bed0 CR3: 0000000265bbb000 CR4: 00000000000007e0
[  705.468645] Stack:
[  705.468647]  ffffffff81085160 ffff880277483d68 0000005ec8c10810 ffff880277483d78
[  705.468648]  ffffffff810c52b2 0000005ec8c10810 ffff88027748e4e0 ffff880277483d98
[  705.468649]  ffffffff810c56da ffff88027748e4e0 0000000000000008 ffff880277483db8
[  705.468650] Call Trace:
[  705.468651]  <IRQ> 
[  705.468653]  [<ffffffff81085160>] ? hrtimer_cancel+0x20/0x30
[  705.468660]  [<ffffffff810c52b2>] tick_nohz_restart+0x12/0x90
[  705.468662]  [<ffffffff810c56da>] tick_nohz_restart_sched_tick+0x4a/0x60
[  705.468665]  [<ffffffff810c5e99>] __tick_nohz_full_check+0x89/0x90
[  705.468667]  [<ffffffff810c5ea9>] nohz_full_kick_work_func+0x9/0x10
[  705.468674]  [<ffffffff81129e89>] __irq_work_run+0x79/0xb0
[  705.468676]  [<ffffffff81129ec9>] irq_work_run+0x9/0x10
[  705.468681]  [<ffffffff81068362>] update_process_times+0x62/0x80
[  705.468683]  [<ffffffff810c4f02>] tick_sched_handle+0x32/0x70
[  705.468685]  [<ffffffff810c51d0>] tick_sched_timer+0x40/0x70
[  705.468687]  [<ffffffff81084b8d>] __run_hrtimer+0x14d/0x280
[  705.468689]  [<ffffffff810c5190>] ? tick_nohz_handler+0xa0/0xa0
[  705.468691]  [<ffffffff81084dea>] hrtimer_interrupt+0x12a/0x310
[  705.468700]  [<ffffffff81096c22>] ? vtime_account_system+0x52/0xe0
[  705.468703]  [<ffffffff81034af6>] local_apic_timer_interrupt+0x36/0x60
[  705.468708]  [<ffffffff8103a8c4>] ? native_apic_msr_eoi_write+0x14/0x20
[  705.468710]  [<ffffffff810359fe>] smp_apic_timer_interrupt+0x3e/0x60
[  705.468721]  [<ffffffff815ddcdd>] apic_timer_interrupt+0x6d/0x80
[  705.468722]  <EOI> 
[  705.468733]  [<ffffffff8105ae13>] ? pin_current_cpu+0x63/0x180
[  705.468742]  [<ffffffff81090505>] migrate_disable+0x95/0x100
[  705.468746]  [<ffffffff81168d21>] __do_fault+0x181/0x590
[  705.468748]  [<ffffffff811691c3>] handle_pte_fault+0x93/0x250
[  705.468750]  [<ffffffff811694b7>] __handle_mm_fault+0x137/0x1e0
[  705.468752]  [<ffffffff81169653>] handle_mm_fault+0xf3/0x1a0
[  705.468755]  [<ffffffff815d90f1>] __do_page_fault+0x291/0x550
[  705.468758]  [<ffffffff8100a8d0>] ? native_sched_clock+0x20/0xa0
[  705.468766]  [<ffffffff81108547>] ? acct_account_cputime+0x17/0x20
[  705.468768]  [<ffffffff81096dc2>] ? account_user_time+0xd2/0xf0
[  705.468770]  [<ffffffff81096e4c>] ? vtime_account_user+0x6c/0x100
[  705.468772]  [<ffffffff815d93f0>] do_page_fault+0x40/0x70
[  705.468774]  [<ffffffff815d5d48>] page_fault+0x28/0x30
[  705.468787] Code: 24 38 49 89 c5 89 d0 a8 02 74 25 49 8b 44 24 30 48 8b 75 e0 48 8b 38 e8 dc 03 55 00 89 d8 4c 8b 65 f0 48 8b 5d e8 4c 8b 6d f8 c9 <c3> 0f 1f 40 00 31 db a8 01 74 d5 8b 05 74 1f a2 00 85 c0 74 5d 



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/2] irq_work: allow certain work in hard irq context
  2014-02-02  4:22 ` [PATCH 1/2] irq_work: allow certain work in hard irq context Mike Galbraith
@ 2014-02-02 20:10   ` Sebastian Andrzej Siewior
  2014-02-03  2:43     ` Mike Galbraith
  2014-02-03  4:00     ` Mike Galbraith
  0 siblings, 2 replies; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2014-02-02 20:10 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: linux-rt-users, linux-kernel, rostedt, tglx

On 02/02/2014 05:22 AM, Mike Galbraith wrote:
> This patch (w. too noisy to live pr_err whacked) reliable kills my 64
> core test box, but only in _virgin_ 3.12-rt11.  Add my local patches,
> and it runs and runs, happy as a clam.  Odd.  But whatever, box with
> virgin source running says it's busted.

Sorry for that, I removed that line from the patch in my queue but the
sent version still had it…

> Killing what was killable in this run before box had a chance to turn
> into a brick, the two tasks below were left, burning 100% CPU until 5
> minute RCU deadline expired.  All other cores were idle.
> 
> [  705.466006] NMI backtrace for cpu 5
> [  705.466009] CPU: 5 PID: 21792 Comm: cc1 Tainted: GF            3.12.9-rt11 #376
> [  705.466015] RIP: 0010:[<ffffffff815d5450>]  [<ffffffff815d5450>] _raw_spin_unlock_irq+0x40/0x40
> [  705.466030]  <IRQ> 
> [  705.466033]  [<ffffffff81085074>] ? hrtimer_try_to_cancel+0x44/0x110
> [  705.466035]  [<ffffffff81085160>] hrtimer_cancel+0x20/0x30
> [  705.466037]  [<ffffffff810c52b2>] tick_nohz_restart+0x12/0x90
> [  705.466039]  [<ffffffff810c56da>] tick_nohz_restart_sched_tick+0x4a/0x60
> [  705.466041]  [<ffffffff810c5e99>] __tick_nohz_full_check+0x89/0x90
> [  705.466043]  [<ffffffff810c5ea9>] nohz_full_kick_work_func+0x9/0x10
> [  705.466047]  [<ffffffff81129e89>] __irq_work_run+0x79/0xb0
> [  705.466049]  [<ffffffff81129ec9>] irq_work_run+0x9/0x10
> [  705.466051]  [<ffffffff81068362>] update_process_times+0x62/0x80
> [  705.466053]  [<ffffffff810c4f02>] tick_sched_handle+0x32/0x70
> [  705.466055]  [<ffffffff810c51d0>] tick_sched_timer+0x40/0x70
> [  705.466057]  [<ffffffff81084b8d>] __run_hrtimer+0x14d/0x280
> [  705.466059]  [<ffffffff810c5190>] ? tick_nohz_handler+0xa0/0xa0
> [  705.466060]  [<ffffffff81084dea>] hrtimer_interrupt+0x12a/0x310
> [  705.466065]  [<ffffffff81096e4c>] ? vtime_account_user+0x6c/0x100
> [  705.466067]  [<ffffffff81034af6>] local_apic_timer_interrupt+0x36/0x60
> [  705.466069]  [<ffffffff8103a8c4>] ? native_apic_msr_eoi_write+0x14/0x20
> [  705.466071]  [<ffffffff810359fe>] smp_apic_timer_interrupt+0x3e/0x60
> [  705.466074]  [<ffffffff815ddcdd>] apic_timer_interrupt+0x6d/0x80
> [  705.466075]  <EOI> 
> [  705.468619] NMI backtrace for cpu 52
> [  705.468622] CPU: 52 PID: 23285 Comm: objdump Tainted: GF            3.12.9-rt11 #376
> [  705.468634] RIP: 0010:[<ffffffff81085083>]  [<ffffffff81085083>] hrtimer_try_to_cancel+0x53/0x110
> [  705.468650] Call Trace:
> [  705.468651]  <IRQ> 
> [  705.468653]  [<ffffffff81085160>] ? hrtimer_cancel+0x20/0x30
> [  705.468660]  [<ffffffff810c52b2>] tick_nohz_restart+0x12/0x90
> [  705.468662]  [<ffffffff810c56da>] tick_nohz_restart_sched_tick+0x4a/0x60
> [  705.468665]  [<ffffffff810c5e99>] __tick_nohz_full_check+0x89/0x90
> [  705.468667]  [<ffffffff810c5ea9>] nohz_full_kick_work_func+0x9/0x10
> [  705.468674]  [<ffffffff81129e89>] __irq_work_run+0x79/0xb0
> [  705.468676]  [<ffffffff81129ec9>] irq_work_run+0x9/0x10
> [  705.468681]  [<ffffffff81068362>] update_process_times+0x62/0x80
> [  705.468683]  [<ffffffff810c4f02>] tick_sched_handle+0x32/0x70
> [  705.468685]  [<ffffffff810c51d0>] tick_sched_timer+0x40/0x70
> [  705.468687]  [<ffffffff81084b8d>] __run_hrtimer+0x14d/0x280
> [  705.468689]  [<ffffffff810c5190>] ? tick_nohz_handler+0xa0/0xa0
> [  705.468691]  [<ffffffff81084dea>] hrtimer_interrupt+0x12a/0x310
> [  705.468700]  [<ffffffff81096c22>] ? vtime_account_system+0x52/0xe0
> [  705.468703]  [<ffffffff81034af6>] local_apic_timer_interrupt+0x36/0x60
> [  705.468708]  [<ffffffff8103a8c4>] ? native_apic_msr_eoi_write+0x14/0x20
> [  705.468710]  [<ffffffff810359fe>] smp_apic_timer_interrupt+0x3e/0x60
> [  705.468721]  [<ffffffff815ddcdd>] apic_timer_interrupt+0x6d/0x80
> [  705.468722]  <EOI> 
> [  705.468733]  [<ffffffff8105ae13>] ? pin_current_cpu+0x63/0x180
> [  705.468742]  [<ffffffff81090505>] migrate_disable+0x95/0x100
> [  705.468746]  [<ffffffff81168d21>] __do_fault+0x181/0x590
> [  705.468748]  [<ffffffff811691c3>] handle_pte_fault+0x93/0x250
> [  705.468750]  [<ffffffff811694b7>] __handle_mm_fault+0x137/0x1e0
> [  705.468752]  [<ffffffff81169653>] handle_mm_fault+0xf3/0x1a0
> [  705.468755]  [<ffffffff815d90f1>] __do_page_fault+0x291/0x550
> [  705.468758]  [<ffffffff8100a8d0>] ? native_sched_clock+0x20/0xa0
> [  705.468766]  [<ffffffff81108547>] ? acct_account_cputime+0x17/0x20
> [  705.468768]  [<ffffffff81096dc2>] ? account_user_time+0xd2/0xf0
> [  705.468770]  [<ffffffff81096e4c>] ? vtime_account_user+0x6c/0x100
> [  705.468772]  [<ffffffff815d93f0>] do_page_fault+0x40/0x70
> [  705.468774]  [<ffffffff815d5d48>] page_fault+0x28/0x30

So CPU5 & CPU52 were eating 100% CPU doing "nothing" instead of running
cc1 & objdump right?
According to the backtrace both of them are trying to access the
per-cpu hrtimer (sched_timer) in order to cancel but they seem to fail
to get the timer lock here. They shouldn't spin there for minutes, I
have no idea why they did so…
I guess this problem does not occur without -RT and before that patch
you saw only that one warning from can_stop_full_tick()?

Sebastian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/2] irq_work: allow certain work in hard irq context
  2014-02-02 20:10   ` Sebastian Andrzej Siewior
@ 2014-02-03  2:43     ` Mike Galbraith
  2014-02-03  4:00     ` Mike Galbraith
  1 sibling, 0 replies; 22+ messages in thread
From: Mike Galbraith @ 2014-02-03  2:43 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior; +Cc: linux-rt-users, linux-kernel, rostedt, tglx

On Sun, 2014-02-02 at 21:10 +0100, Sebastian Andrzej Siewior wrote:

> So CPU5 & CPU52 were eating 100% CPU doing "nothing" instead of running
> cc1 & objdump right?

Yeah.

> According to the backtrace both of them are trying to access the
> per-cpu hrtimer (sched_timer) in order to cancel but they seem to fail
> to get the timer lock here. They shouldn't spin there for minutes, I
> have no idea why they did so…

I dumped it for later-guy.. but he tends to get busy doing other crap,
and just whacks my carefully saved data ;-)

> I guess this problem does not occur without -RT and before that patch
> you saw only that one warning from can_stop_full_tick()?

I didn't try it without -RT, and yes, without, you just get the warning.

-Mike


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/2] irq_work: allow certain work in hard irq context
  2014-02-02 20:10   ` Sebastian Andrzej Siewior
  2014-02-03  2:43     ` Mike Galbraith
@ 2014-02-03  4:00     ` Mike Galbraith
  2014-02-03  8:31       ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 22+ messages in thread
From: Mike Galbraith @ 2014-02-03  4:00 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior; +Cc: linux-rt-users, linux-kernel, rostedt, tglx

On Sun, 2014-02-02 at 21:10 +0100, Sebastian Andrzej Siewior wrote:

> According to the backtrace both of them are trying to access the
> per-cpu hrtimer (sched_timer) in order to cancel but they seem to fail
> to get the timer lock here. They shouldn't spin there for minutes, I
> have no idea why they did so…

Hm. per-cpu...

I've been chasing an rt hotplug heisenbug that is pointing to per-cpu
oddness.  During sched domain re-construction while running Steven's
stress script on 64 core box, we hit a freshly constructed domain with
_no span_, build_sched_groups()->get_group() explodes when we meeting
it.  But if you try to watch the thing appear... it just doesn't.

static int build_sched_domains(const struct cpumask *cpu_map,
                               struct sched_domain_attr *attr)
{
        enum s_alloc alloc_state;
        struct sched_domain *sd;
        struct s_data d;
        int i, ret = -ENOMEM;

        alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
        if (alloc_state != sa_rootdomain)
                goto error;

        /* Set up domains for cpus specified by the cpu_map. */
        for_each_cpu(i, cpu_map) {
                struct sched_domain_topology_level *tl;

                sd = NULL;
                for_each_sd_topology(tl) {
                        sd = build_sched_domain(tl, cpu_map, attr, sd, i);
BUG_ON(sd == spanless-alien) here..
                        if (tl == sched_domain_topology)
                                *per_cpu_ptr(d.sd, i) = sd;
                        if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
                                sd->flags |= SD_OVERLAP;
                        if (cpumask_equal(cpu_map, sched_domain_span(sd)))
                                break;
                }
        }

        /* Build the groups for the domains */
        for_each_cpu(i, cpu_map) {
                for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
                        sd->span_weight = cpumask_weight(sched_domain_span(sd));
                        if (sd->flags & SD_OVERLAP) {
                                if (build_overlap_sched_groups(sd, i))
                                        goto error;
                        } else {
                                if (build_sched_groups(sd, i))
..prevents meeting that alien here.. while hotplug locked.

static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
        struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
        struct sched_domain *child = sd->child;
        if (child)
                cpu = cpumask_first(sched_domain_span(child));
                ^^^nr_cpus
        if (sg) {
                *sg = *per_cpu_ptr(sdd->sg, cpu); BOOM



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/2] irq_work: allow certain work in hard irq context
  2014-02-03  4:00     ` Mike Galbraith
@ 2014-02-03  8:31       ` Sebastian Andrzej Siewior
  2014-02-03  9:26         ` Mike Galbraith
  0 siblings, 1 reply; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2014-02-03  8:31 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: linux-rt-users, linux-kernel, rostedt, tglx

On 02/03/2014 05:00 AM, Mike Galbraith wrote:
> On Sun, 2014-02-02 at 21:10 +0100, Sebastian Andrzej Siewior wrote:
> 
>> According to the backtrace both of them are trying to access the
>> per-cpu hrtimer (sched_timer) in order to cancel but they seem to fail
>> to get the timer lock here. They shouldn't spin there for minutes, I
>> have no idea why they did so…
> 
> Hm. per-cpu...
> 
> I've been chasing an rt hotplug heisenbug that is pointing to per-cpu
> oddness.  During sched domain re-construction while running Steven's
> stress script on 64 core box, we hit a freshly constructed domain with
> _no span_, build_sched_groups()->get_group() explodes when we meet
> it.  But if you try to watch the thing appear... it just doesn't.
> 
> static int build_sched_domains(const struct cpumask *cpu_map,
>                                struct sched_domain_attr *attr)
> {
>         enum s_alloc alloc_state;
>         struct sched_domain *sd;
>         struct s_data d;
>         int i, ret = -ENOMEM;
> 
>         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
>         if (alloc_state != sa_rootdomain)
>                 goto error;
> 
>         /* Set up domains for cpus specified by the cpu_map. */
>         for_each_cpu(i, cpu_map) {
>                 struct sched_domain_topology_level *tl;
> 
>                 sd = NULL;
>                 for_each_sd_topology(tl) {
>                         sd = build_sched_domain(tl, cpu_map, attr, sd, i);
> BUG_ON(sd == spanless-alien) here..

spanless-alien is?
BUG_ON() is actually _very_ cheap. It shouldn't even create any kind of
compiler barrier which would reload variables / registers. It should
evaluate sd and "spanless-alien", do the compare and then go on.

>                         if (tl == sched_domain_topology)
>                                 *per_cpu_ptr(d.sd, i) = sd;
>                         if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
>                                 sd->flags |= SD_OVERLAP;
>                         if (cpumask_equal(cpu_map, sched_domain_span(sd)))
>                                 break;
>                 }
>         }
> 
>         /* Build the groups for the domains */
>         for_each_cpu(i, cpu_map) {
>                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
>                         sd->span_weight = cpumask_weight(sched_domain_span(sd));
>                         if (sd->flags & SD_OVERLAP) {
>                                 if (build_overlap_sched_groups(sd, i))
>                                         goto error;
>                         } else {
>                                 if (build_sched_groups(sd, i))
> ..prevents meeting that alien here.. while hotplug locked.

my copy of build_sched_groups() always returns 0 so it never goes to
the error marker. Did you consider a compiler bug? I could try to
rebuild your source + config on two different compilers just to see if
it makes a difference.

Sebastian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/2] irq_work: allow certain work in hard irq context
  2014-02-03  8:31       ` Sebastian Andrzej Siewior
@ 2014-02-03  9:26         ` Mike Galbraith
  0 siblings, 0 replies; 22+ messages in thread
From: Mike Galbraith @ 2014-02-03  9:26 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior; +Cc: linux-rt-users, linux-kernel, rostedt, tglx

On Mon, 2014-02-03 at 09:31 +0100, Sebastian Andrzej Siewior wrote: 
> On 02/03/2014 05:00 AM, Mike Galbraith wrote:
> > On Sun, 2014-02-02 at 21:10 +0100, Sebastian Andrzej Siewior wrote:
> > 
> >> According to the backtrace both of them are trying to access the
> >> per-cpu hrtimer (sched_timer) in order to cancel but they seem to fail
> >> to get the timer lock here. They shouldn't spin there for minutes, I
> >> have no idea why they did so…
> > 
> > Hm. per-cpu...
> > 
> > I've been chasing an rt hotplug heisenbug that is pointing to per-cpu
> > oddness.  During sched domain re-construction while running Steven's
> > stress script on 64 core box, we hit a freshly constructed domain with
> > _no span_, build_sched_groups()->get_group() explodes when we meet
> > it.  But if you try to watch the thing appear... it just doesn't.
> > 
> > static int build_sched_domains(const struct cpumask *cpu_map,
> >                                struct sched_domain_attr *attr)
> > {
> >         enum s_alloc alloc_state;
> >         struct sched_domain *sd;
> >         struct s_data d;
> >         int i, ret = -ENOMEM;
> > 
> >         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
> >         if (alloc_state != sa_rootdomain)
> >                 goto error;
> > 
> >         /* Set up domains for cpus specified by the cpu_map. */
> >         for_each_cpu(i, cpu_map) {
> >                 struct sched_domain_topology_level *tl;
> > 
> >                 sd = NULL;
> >                 for_each_sd_topology(tl) {
> >                         sd = build_sched_domain(tl, cpu_map, attr, sd, i);
> > BUG_ON(sd == spanless-alien) here..
> 
> spanless-alien is?

An sd spanning zero CPUs, cpumask is empty.

> BUG_ON() is actually _very_ cheap. It shouldn't even create any kind of
> compiler barrier which would reload variables / registers. It should
> evaluate sd and "spanless-alien", do the compare and then go on.

Nonetheless, the bad thing then refuses to happen.

> >                         if (tl == sched_domain_topology)
> >                                 *per_cpu_ptr(d.sd, i) = sd;
> >                         if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
> >                                 sd->flags |= SD_OVERLAP;
> >                         if (cpumask_equal(cpu_map, sched_domain_span(sd)))
> >                                 break;
> >                 }
> >         }
> > 
> >         /* Build the groups for the domains */
> >         for_each_cpu(i, cpu_map) {
> >                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
> >                         sd->span_weight = cpumask_weight(sched_domain_span(sd));
> >                         if (sd->flags & SD_OVERLAP) {
> >                                 if (build_overlap_sched_groups(sd, i))
> >                                         goto error;
> >                         } else {
> >                                 if (build_sched_groups(sd, i))
> > ..prevents meeting that alien here.. while hotplug locked.
> 
> my copy of build_sched_groups() always returns 0 so it never goes to
> the error marker.

It explodes before return, in the call to get_group(), trying to write
to lala land.

> Did you consider a compiler bug? I could try to
> rebuild your source + config on two different compilers just to see if
> it makes a difference.

Possible, but seems very unlikely.  I suspect it's something to do with
the rather unusual topology of this box.  8 sockets, but 1 NUMA node
with a whole 8GB ram.  Lots of CPU, no way to use it well.  I suppose I
should try beating up a less crippled big box, see if it's reproducible
elsewhere.  It takes a while to explode on this box, and may not ever
explode in a box that doesn't have 64 CPUs all poking the same couple
sticks of ram.

-Mike


^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2014-02-03  9:26 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-31 14:34 [PATCH 1/2] irq_work: allow certain work in hard irq context Sebastian Andrzej Siewior
2014-01-31 14:34 ` [PATCH 2/2] timer: really raise softirq if there is irq_work to do Sebastian Andrzej Siewior
2014-01-31 17:07   ` Steven Rostedt
2014-01-31 17:11     ` Steven Rostedt
2014-01-31 17:42     ` Paul E. McKenney
2014-01-31 17:57       ` Steven Rostedt
2014-01-31 19:03         ` Paul E. McKenney
2014-01-31 19:26         ` Sebastian Andrzej Siewior
2014-01-31 19:34           ` Steven Rostedt
2014-01-31 19:48             ` Sebastian Andrzej Siewior
2014-01-31 19:56               ` Steven Rostedt
2014-01-31 20:05               ` Peter Zijlstra
2014-01-31 20:23                 ` Sebastian Andrzej Siewior
2014-01-31 20:29                   ` Peter Zijlstra
2014-01-31 19:54             ` Peter Zijlstra
2014-01-31 19:06     ` Sebastian Andrzej Siewior
2014-02-02  4:22 ` [PATCH 1/2] irq_work: allow certain work in hard irq context Mike Galbraith
2014-02-02 20:10   ` Sebastian Andrzej Siewior
2014-02-03  2:43     ` Mike Galbraith
2014-02-03  4:00     ` Mike Galbraith
2014-02-03  8:31       ` Sebastian Andrzej Siewior
2014-02-03  9:26         ` Mike Galbraith

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).