* [RFC] per-cpu preempt_count
@ 2013-08-12 11:51 Peter Zijlstra
  2013-08-12 17:35 ` Linus Torvalds
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2013-08-12 11:51 UTC (permalink / raw)
  To: Linus Torvalds, mingo, tglx; +Cc: bitbucket, hpa, ak, linux-kernel

Hi,

new thread since someone forgot to CC scheduler maintainers on actual
scheduler patches and I can't be arsed to look up the original thread.

The below boots to wanting to mount a root filesystem with
CONFIG_PREEMPT=y using kvm -smp 4.

I suppose we might want to move TIF_NEED_RESCHED into the preempt_count
just as we might want to move PREEMPT_ACTIVE out of it.

Adding TIF_NEED_RESCHED into the preempt count would allow a single test
in preempt_check_resched() instead of still needing the TI. Removing
PREEMPT_ACTIVE from preempt count should allow us to get rid of
ti::preempt_count altogether.

The only problem with TIF_NEED_RESCHED is that it's cross-cpu which would
make the entire thing atomic which would suck donkey balls so maybe we
need two separate per-cpu variables? 
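
For illustration, if it were folded in, preempt_check_resched() would
reduce to a single test against the per-cpu word; a hypothetical sketch
only (PREEMPT_NEED_RESCHED is a made-up bit here, not in the patch
below):

#define preempt_check_resched()					\
do {								\
	if (unlikely(preempt_count() == PREEMPT_NEED_RESCHED))	\
		preempt_schedule();				\
} while (0)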

---
 arch/x86/kernel/entry_64.S |  2 +-
 include/linux/preempt.h    |  9 ++++++---
 kernel/context_tracking.c  |  3 +--
 kernel/sched/core.c        | 20 +++++++++++++++-----
 lib/smp_processor_id.c     |  3 +--
 5 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1b69951..5ea77d2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1118,7 +1118,7 @@ ENTRY(native_iret)
 	/* Returning to kernel space. Check if we need preemption */
 	/* rcx:	 threadinfo. interrupts off. */
 ENTRY(retint_kernel)
-	cmpl $0,TI_preempt_count(%rcx)
+	cmpl $0,PER_CPU_VAR(__preempt_count_var)
 	jnz  retint_restore_args
 	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
 	jnc  retint_restore_args
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index f5d4723..2ca9c8ff 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -6,7 +6,7 @@
  * preempt_count (used for kernel preemption, interrupt count, etc.)
  */
 
-#include <linux/thread_info.h>
+#include <asm/percpu.h>
 #include <linux/linkage.h>
 #include <linux/list.h>
 
@@ -21,7 +21,9 @@
 #define inc_preempt_count() add_preempt_count(1)
 #define dec_preempt_count() sub_preempt_count(1)
 
-#define preempt_count()	(current_thread_info()->preempt_count)
+DECLARE_PER_CPU(int, __preempt_count_var);
+
+#define preempt_count() __raw_get_cpu_var(__preempt_count_var)
 
 #ifdef CONFIG_PREEMPT
 
@@ -29,7 +31,8 @@ asmlinkage void preempt_schedule(void);
 
 #define preempt_check_resched() \
 do { \
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+	if (unlikely(preempt_count() == 0 && \
+	             test_thread_flag(TIF_NEED_RESCHED))) \
 		preempt_schedule(); \
 } while (0)
 
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f823..6d113d8 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -87,10 +87,9 @@ void user_enter(void)
  */
 void __sched notrace preempt_schedule_context(void)
 {
-	struct thread_info *ti = current_thread_info();
 	enum ctx_state prev_ctx;
 
-	if (likely(ti->preempt_count || irqs_disabled()))
+	if (likely(preempt_count() || irqs_disabled()))
 		return;
 
 	/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 54957a6..59d0b6e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -89,6 +89,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
+DEFINE_PER_CPU(int, __preempt_count_var) = INIT_PREEMPT_COUNT;
+
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	unsigned long delta;
@@ -2013,6 +2015,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 #endif
 
+#ifdef CONFIG_PREEMPT_COUNT
+	/*
+	 * If it weren't for PREEMPT_ACTIVE we could guarantee that the
+	 * preempt_count() of all tasks was equal here and this wouldn't be
+	 * needed at all -- try and move PREEMPT_ACTIVE into TI_flags?
+	 */
+	task_thread_info(prev)->preempt_count = preempt_count();
+	preempt_count() = task_thread_info(next)->preempt_count;
+#endif
+
 	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
@@ -2515,13 +2527,11 @@ void __sched schedule_preempt_disabled(void)
  */
 asmlinkage void __sched notrace preempt_schedule(void)
 {
-	struct thread_info *ti = current_thread_info();
-
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
 	 */
-	if (likely(ti->preempt_count || irqs_disabled()))
+	if (likely(preempt_count() || irqs_disabled()))
 		return;
 
 	do {
@@ -2546,11 +2556,10 @@ EXPORT_SYMBOL(preempt_schedule);
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
-	struct thread_info *ti = current_thread_info();
 	enum ctx_state prev_state;
 
 	/* Catch callers which need to be fixed */
-	BUG_ON(ti->preempt_count || !irqs_disabled());
+	BUG_ON(preempt_count() || !irqs_disabled());
 
 	prev_state = exception_enter();
 
@@ -4218,6 +4227,7 @@ void init_idle(struct task_struct *idle, int cpu)
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	task_thread_info(idle)->preempt_count = 0;
+	per_cpu(__preempt_count_var, cpu) = 0;
 
 	/*
 	 * The idle tasks have their own, simple scheduling class:
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 4c0d0e5..04abe53 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -9,10 +9,9 @@
 
 notrace unsigned int debug_smp_processor_id(void)
 {
-	unsigned long preempt_count = preempt_count();
 	int this_cpu = raw_smp_processor_id();
 
-	if (likely(preempt_count))
+	if (likely(preempt_count()))
 		goto out;
 
 	if (irqs_disabled())


* Re: [RFC] per-cpu preempt_count
  2013-08-12 11:51 [RFC] per-cpu preempt_count Peter Zijlstra
@ 2013-08-12 17:35 ` Linus Torvalds
  2013-08-12 17:51   ` H. Peter Anvin
                     ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Linus Torvalds @ 2013-08-12 17:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, Mike Galbraith, Peter Anvin,
	Andi Kleen, Linux Kernel Mailing List

On Mon, Aug 12, 2013 at 4:51 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> The below boots to wanting to mount a root filesystem with
> CONFIG_PREEMPT=y using kvm -smp 4.

But doesn't work in general? Or you just never tested?

I think that "thread_info->preempt_count" variable would need to be
renamed to "saved_preempt_count" or similar to make sure we catch any
users. But the patch certainly looks simple otherwise.

I'm pretty sure I had a discussion about this with Paul McKenney some
time ago (because the RCU readlock is the most noticeable user of the
preempt count - the others tend to be hidden inside the out-of-line
spinlock functions etc), and I thought he had tried this and had some
problems. Maybe we've fixed things since, or maybe he missed some
case..

But if the patch really is this simple, then we should just do it. Of
course, we should double-check that the percpu preempt count is in a
cacheline that is already accessed (preferably already dirtied) by the
context switching code.  And I think this should be an
architecture-specific thing, because using a percpu variable might be
good on some architectures but not others. So I get the feeling that
it should be in the x86 __switch_to(), rather than in the generic
code. I think it would fit very well with the per-cpu "old_rsp" and
"current_task" updates that we already do.

> Adding TIF_NEED_RESCHED into the preempt count would allow a single test
> in preempt_check_resched() instead of still needing the TI. Removing
> PREEMPT_ACTIVE from preempt count should allow us to get rid of
> ti::preempt_count altogether.
>
> The only problem with TIF_NEED_RESCHED is that it's cross-cpu which would
> make the entire thing atomic which would suck donkey balls so maybe we
> need two separate per-cpu variables?

Agreed. Making it atomic would suck, and cancel all advantages of the
better code generation to access it. Good point.

And yeah, it could be two variables in the same cacheline or something.

                 Linus


* Re: [RFC] per-cpu preempt_count
  2013-08-12 17:35 ` Linus Torvalds
@ 2013-08-12 17:51   ` H. Peter Anvin
  2013-08-12 18:53     ` Linus Torvalds
  2013-08-12 17:58   ` Ingo Molnar
  2013-08-18 17:57   ` Paul E. McKenney
  2 siblings, 1 reply; 17+ messages in thread
From: H. Peter Anvin @ 2013-08-12 17:51 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Andi Kleen, Linux Kernel Mailing List

On 08/12/2013 10:35 AM, Linus Torvalds wrote:
> 
> Agreed. Making it atomic would suck, and cancel all advantages of the
> better code generation to access it. Good point.
> 
> And yeah, it could be two variables in the same cacheline or something.
> 

So we would have code looking something like:

	decl %fs:preempt_count
	jnz 1f
	cmpb $0,%fs:need_resched
	je 1f
	call __preempt_schedule
1:

It's a nontrivial amount of code, but would seem a fair bit better than
what we have now, at least.
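
In C terms that sequence is roughly the following preempt_enable() --
a sketch only, with preempt_count and need_resched being the two
hypothetical per-cpu variables from the assembly above and
__preempt_schedule the out-of-line call:

#define preempt_enable()						\
do {									\
	barrier();							\
	if (unlikely(__this_cpu_dec_return(preempt_count) == 0 &&	\
		     __this_cpu_read(need_resched)))			\
		__preempt_schedule();					\
} while (0)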

	-hpa



* Re: [RFC] per-cpu preempt_count
  2013-08-12 17:35 ` Linus Torvalds
  2013-08-12 17:51   ` H. Peter Anvin
@ 2013-08-12 17:58   ` Ingo Molnar
  2013-08-12 19:00     ` Linus Torvalds
  2013-08-18 17:57   ` Paul E. McKenney
  2 siblings, 1 reply; 17+ messages in thread
From: Ingo Molnar @ 2013-08-12 17:58 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Thomas Gleixner, Mike Galbraith, Peter Anvin,
	Andi Kleen, Linux Kernel Mailing List


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, Aug 12, 2013 at 4:51 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > The below boots to wanting to mount a root filesystem with
> > CONFIG_PREEMPT=y using kvm -smp 4.
> 
> But doesn't work in general? Or you just never tested?

(I think Peter never tested it on real hw - this is an RFC patch to show 
the concept.)

> > Adding TIF_NEED_RESCHED into the preempt count would allow a single 
> > test in preempt_check_resched() instead of still needing the TI. 
> > Removing PREEMPT_ACTIVE from preempt count should allow us to get rid 
> > of ti::preempt_count altogether.
> >
> > The only problem with TIF_NEED_RESCHED is that it's cross-cpu which
> > would make the entire thing atomic which would suck donkey balls so 
> > maybe we need two separate per-cpu variables?
> 
> Agreed. Making it atomic would suck, and cancel all advantages of the 
> better code generation to access it. Good point.

We could still have the advantages of NEED_RESCHED in preempt_count() by 
realizing that we only rarely actually set/clear need_resched and mostly 
read it from the highest freq user, the preempt_enable() check.

So we could have it atomic, but do atomic_read() in the preempt_enable() 
hotpath which wouldn't suck donkey balls, right?

That would allow a really sweet preempt_enable() fastpath, on x86 at 
least.

Thanks,

	Ingo


* Re: [RFC] per-cpu preempt_count
  2013-08-12 17:51   ` H. Peter Anvin
@ 2013-08-12 18:53     ` Linus Torvalds
  2013-08-13  8:39       ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Linus Torvalds @ 2013-08-12 18:53 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Peter Zijlstra, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Andi Kleen, Linux Kernel Mailing List

On Mon, Aug 12, 2013 at 10:51 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>
> So we would have code looking something like:
>
>         decl %fs:preempt_count
>         jnz 1f
>         cmpb $0,%fs:need_resched
>         je 1f
>         call __preempt_schedule
> 1:
>
> It's a nontrivial amount of code, but would seem a fair bit better than
> what we have now, at least.

Well, we currently don't even bother checking the preempt count at
all, and we just optimistically assume that we don't nest in the
common case. The preempt count is then re-checked in
__preempt_schedule, I think.

Which sounds like a fair approach.

So the code would be simplified to just

         decl %fs:preempt_count
         cmpb $0,%fs:need_resched
         jne .. unlikely branch that calls __preempt_schedule

which is not horrible. Not *quite* as nice as just doing a single
"decl+js", but hey, certainly better than what we have now.

             Linus


* Re: [RFC] per-cpu preempt_count
  2013-08-12 17:58   ` Ingo Molnar
@ 2013-08-12 19:00     ` Linus Torvalds
  2013-08-12 20:44       ` H. Peter Anvin
  2013-08-13 10:30       ` Ingo Molnar
  0 siblings, 2 replies; 17+ messages in thread
From: Linus Torvalds @ 2013-08-12 19:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Thomas Gleixner, Mike Galbraith, Peter Anvin,
	Andi Kleen, Linux Kernel Mailing List

On Mon, Aug 12, 2013 at 10:58 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> We could still have the advantages of NEED_RESCHED in preempt_count() by
> realizing that we only rarely actually set/clear need_resched and mostly
> read it from the highest freq user, the preempt_enable() check.
>
> So we could have it atomic, but do atomic_read() in the preempt_enable()
> hotpath which wouldn't suck donkey balls, right?

Wrong. The thing is, the common case for preempt is to increment and
decrement the count, not testing it. Exactly because we do this for
spinlocks and for rcu read-locked regions.

Now, what we *could* do is to say:

 - we will use the high bit of the preempt count for NEED_RESCHED

 - when we set/clear that high bit, we *always* use atomic sequences,
and we never change any of the other bits.

 - we will increment/decrement the other counters, we *only* do so on
the local CPU, and we don't use atomic accesses.

Now, the downside of that is that *because* we don't use atomic
accesses for the inc/dec parts, the updates to the high bit can get
lost. But because the high bit updates are done with atomics, we know
that they won't mess up the actual counting bits, so at least the
count is never corrupted.
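
A sketch of that split, just to make the races explicit (all names and
the exact primitives here are illustrative, not a proposal):

#define PREEMPT_NEED_RESCHED_BIT	(BITS_PER_LONG - 1)	/* "the high bit" */

DECLARE_PER_CPU(unsigned long, __preempt_count);

/* cross-CPU path: atomic RMW on just this bit, so it can never lose or
 * corrupt the counting bits that the owning CPU is updating */
static inline void set_preempt_need_resched(int cpu)
{
	set_bit(PREEMPT_NEED_RESCHED_BIT, per_cpu_ptr(&__preempt_count, cpu));
}

/* local path: plain, non-atomic update of the counting bits; a flag
 * set concurrently by another CPU can be overwritten (lost) here,
 * which is exactly the trade-off described above */
#define preempt_disable_sketch()			\
do {							\
	__this_cpu_inc(__preempt_count);		\
	barrier();					\
} while (0)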

And the NEED_RESCHED bit getting lost would be very unusual. That
clearly would *not* be acceptable for RT, but it might be
acceptable for "in the unusual case where we want to preempt a thread
that was not preemptible, *and* we ended up having the extra unusual
case that preemption enable ended up missing the preempt bit, we don't
get preempted in a timely manner". It's probably impossible to ever
see in practice, and considering that for non-RT use the PREEMPT bit
is a "strong hint" rather than anything else, it sounds like it might
be acceptable.

It is obviously *not* going to be acceptable for the RT people,
though, but since they do different code sequences _anyway_, that's
not really much of an issue.

             Linus


* Re: [RFC] per-cpu preempt_count
  2013-08-12 19:00     ` Linus Torvalds
@ 2013-08-12 20:44       ` H. Peter Anvin
  2013-08-13 10:30       ` Ingo Molnar
  1 sibling, 0 replies; 17+ messages in thread
From: H. Peter Anvin @ 2013-08-12 20:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Peter Zijlstra, Thomas Gleixner, Mike Galbraith,
	Andi Kleen, Linux Kernel Mailing List

On 08/12/2013 12:00 PM, Linus Torvalds wrote:
> 
> Wrong. The thing is, the common case for preempt is to increment and
> decrement the count, not testing it. Exactly because we do this for
> spinlocks and for rcu read-locked regions.
> 
> Now, what we *could* do is to say:
> 
>  - we will use the high bit of the preempt count for NEED_RESCHED
> 
>  - when we set/clear that high bit, we *always* use atomic sequences,
> and we never change any of the other bits.
> 
>  - we will increment/decrement the other counters, we *only* do so on
> the local CPU, and we don't use atomic accesses.
> 
> Now, the downside of that is that *because* we don't use atomic
> accesses for the inc/dec parts, the updates to the high bit can get
> lost. But because the high bit updates are done with atomics, we know
> that they won't mess up the actual counting bits, so at least the
> count is never corrupted.
> 
> And the NEED_RESCHED bit getting lost would be very unusual. That
> clearly would *not* be acceptable for RT, but it might be
> acceptable for "in the unusual case where we want to preempt a thread
> that was not preemptible, *and* we ended up having the extra unusual
> case that preemption enable ended up missing the preempt bit, we don't
> get preempted in a timely manner". It's probably impossible to ever
> see in practice, and considering that for non-RT use the PREEMPT bit
> is a "strong hint" rather than anything else, it sounds like it might
> be acceptable.
> 
> It is obviously *not* going to be acceptable for the RT people,
> though, but since they do different code sequences _anyway_, that's
> not really much of an issue.
> 

This seems more pain than need be if checking the count in the slow path
is okay.

	-hpa




* Re: [RFC] per-cpu preempt_count
  2013-08-12 18:53     ` Linus Torvalds
@ 2013-08-13  8:39       ` Peter Zijlstra
  0 siblings, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2013-08-13  8:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: H. Peter Anvin, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Andi Kleen, Linux Kernel Mailing List, arjan

On Mon, Aug 12, 2013 at 11:53:25AM -0700, Linus Torvalds wrote:
> On Mon, Aug 12, 2013 at 10:51 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> >
> > So we would have code looking something like:
> >
> >         decl %fs:preempt_count
> >         jnz 1f
> >         cmpb $0,%fs:need_resched
> >         je 1f
> >         call __preempt_schedule
> > 1:
> >
> > It's a nontrivial amount of code, but would seem a fair bit better than
> > what we have now, at least.
> 
> Well, we currently don't even bother checking the preempt count at
> all, and we just optimistically assume that we don't nest in the
> common case. The preempt count is then re-checked in
> __preempt_schedule, I think.
> 
> Which sounds like a fair approach.
> 
> So the code would be simplified to just
> 
>          decl %fs:preempt_count
>          cmpb $0,%fs:need_resched
>          jne .. unlikely branch that calls __preempt_schedule
> 
> which is not horrible. Not *quite* as nice as just doing a single
> "decl+js", but hey, certainly better than what we have now.

OK, so doing per-cpu need_resched is a trivial patch except for the
mwait side of things. Then again, Arjan already asked for per-cpu
need_resched specifically for mwait so we might as well do that.

The only complication is that IIRC Arjan wants to stagger the mwait
cache-lines and we would very much like our preempt_count and
need_resched (and possibly some other __switch_to related things) in the
same cacheline.

Afaict that'll yield a double indirect again :/


* Re: [RFC] per-cpu preempt_count
  2013-08-12 19:00     ` Linus Torvalds
  2013-08-12 20:44       ` H. Peter Anvin
@ 2013-08-13 10:30       ` Ingo Molnar
  2013-08-13 12:26         ` Peter Zijlstra
  1 sibling, 1 reply; 17+ messages in thread
From: Ingo Molnar @ 2013-08-13 10:30 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Thomas Gleixner, Mike Galbraith, Peter Anvin,
	Andi Kleen, Linux Kernel Mailing List


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, Aug 12, 2013 at 10:58 AM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > We could still have the advantages of NEED_RESCHED in preempt_count() by
> > realizing that we only rarely actually set/clear need_resched and mostly
> > read it from the highest freq user, the preempt_enable() check.
> >
> > So we could have it atomic, but do atomic_read() in the preempt_enable()
> > hotpath which wouldn't suck donkey balls, right?
> 
> Wrong. The thing is, the common case for preempt is to increment and 
> decrement the count, not testing it. Exactly because we do this for 
> spinlocks and for rcu read-locked regions.

Indeed, I should have realized that immediately ...

> Now, what we *could* do is to say:
> 
>  - we will use the high bit of the preempt count for NEED_RESCHED
> 
>  - when we set/clear that high bit, we *always* use atomic sequences, 
> and we never change any of the other bits.
> 
>  - we will increment/decrement the other counters, we *only* do so on 
> the local CPU, and we don't use atomic accesses.
> 
> Now, the downside of that is that *because* we don't use atomic accesses 
> for the inc/dec parts, the updates to the high bit can get lost. But 
> because the high bit updates are done with atomics, we know that they 
> won't mess up the actual counting bits, so at least the count is never 
> corrupted.
> 
> And the NEED_RESCHED bit getting lost would be very unusual. That 
> clearly would *not* be acceptable for RT, but it might be acceptable 
> for "in the unusual case where we want to preempt a thread that was not 
> preemptible, *and* we ended up having the extra unusual case that 
> preemption enable ended up missing the preempt bit, we don't get 
> preempted in a timely manner". It's probably impossible to ever see in 
> practice, and considering that for non-RT use the PREEMPT bit is a 
> "strong hint" rather than anything else, it sounds like it might be 
> acceptable.
> 
> It is obviously *not* going to be acceptable for the RT people, though, 
> but since they do different code sequences _anyway_, that's not really 
> much of an issue.

Hm, this could introduce weird artifacts for code like signal delivery 
(via kick_process()), with occasional high - possibly user noticeable - 
signal delivery latencies.

But we could perhaps do something else and push the overhead into 
resched_task(): instead of using atomics we could use the resched IPI to 
set the local preempt_count(). That way preempt_count() will only ever be 
updated CPU-locally and we could merge need_resched into preempt_count() 
just fine.

[ Some care has to be taken with polling-idle threads: those could simply
  use another signalling mechanism, another field in task struct, no need
  to abuse need_resched for that. ]

We could still _read_ the preempt count non-destructively from other CPUs, 
to avoid having to send a resched IPI for already marked tasks. [ OTOH it 
might be faster to never do that and assume that an IPI has to be sent in 
99.9% of the cases - that would have to be re-measured. ]

Using this method we could have a really lightweight, minimal, percpu 
based preempt count mechanism in all the default fastpath cases, both for 
nested and for non-nested preempt_enable()s.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] per-cpu preempt_count
  2013-08-13 10:30       ` Ingo Molnar
@ 2013-08-13 12:26         ` Peter Zijlstra
  2013-08-13 15:39           ` Linus Torvalds
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2013-08-13 12:26 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Thomas Gleixner, Mike Galbraith, Peter Anvin,
	Andi Kleen, Linux Kernel Mailing List

On Tue, Aug 13, 2013 at 12:30:56PM +0200, Ingo Molnar wrote:

> But we could perhaps do something else and push the overhead into 
> resched_task(): instead of using atomics we could use the resched IPI to 
> set the local preempt_count(). That way preempt_count() will only ever be 
> updated CPU-locally and we could merge need_resched into preempt_count() 
> just fine.
> 
> [ Some care has to be taken with polling-idle threads: those could simply
>   use another signalling mechanism, another field in task struct, no need
>   to abuse need_resched for that. ]
> 
> We could still _read_ the preempt count non-destructively from other CPUs, 
> to avoid having to send a resched IPI for already marked tasks. [ OTOH it 
> might be faster to never do that and assume that an IPI has to be sent in 
> 99.9% of the cases - that would have to be re-measured. ]
> 
> Using this method we could have a really lightweight, minimal, percpu 
> based preempt count mechanism in all the default fastpath cases, both for 
> nested and for non-nested preempt_enable()s.

Indeed, however we need to keep TIF_NEED_RESCHED as is, and simply
transfer it to preempt_count() on the IPI. Every NEED_RESCHED already
causes the IPI, except indeed the 'polling' idle threads. Those we need
to also teach to transfer.

The reason is that the reschedule IPI is slightly over-used and we
wouldn't want to unconditionally cause a reschedule.

The below again boots to wanting to mount a root filesystem and is only
tested with kvm -smp 4, CONFIG_PREEMPT=y.

So we're now down to something like:

  decl fs:preempt_count
  cmpl PREEMPT_NEED_RESCHED,fs:preempt_count
  jnz 1f
  call preempt_schedule
1:



---
 arch/x86/include/asm/thread_info.h |  9 ++++----
 arch/x86/kernel/asm-offsets.c      |  2 +-
 arch/x86/kernel/entry_64.S         |  4 +---
 include/linux/hardirq.h            |  4 ++--
 include/linux/preempt.h            | 11 ++++++----
 include/linux/sched.h              |  2 +-
 include/linux/thread_info.h        |  1 +
 include/trace/events/sched.h       |  2 +-
 init/main.c                        |  7 ++++---
 kernel/context_tracking.c          |  3 +--
 kernel/cpu/idle.c                  |  3 +++
 kernel/sched/core.c                | 43 ++++++++++++++++++++++++++------------
 kernel/softirq.c                   |  8 +++----
 kernel/timer.c                     |  9 ++++----
 lib/smp_processor_id.c             |  3 +--
 15 files changed, 67 insertions(+), 44 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 2781119..232d512 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -28,8 +28,7 @@ struct thread_info {
 	__u32			flags;		/* low level flags */
 	__u32			status;		/* thread synchronous flags */
 	__u32			cpu;		/* current CPU */
-	int			preempt_count;	/* 0 => preemptable,
-						   <0 => BUG */
+	int			saved_preempt_count;
 	mm_segment_t		addr_limit;
 	struct restart_block    restart_block;
 	void __user		*sysenter_return;
@@ -49,7 +48,7 @@ struct thread_info {
 	.exec_domain	= &default_exec_domain,	\
 	.flags		= 0,			\
 	.cpu		= 0,			\
-	.preempt_count	= INIT_PREEMPT_COUNT,	\
+	.saved_preempt_count	= INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
 	.restart_block = {			\
 		.fn = do_no_restart_syscall,	\
@@ -100,8 +99,8 @@ struct thread_info {
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
-#define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
+#define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
@@ -112,6 +111,7 @@ struct thread_info {
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
 #define _TIF_NOHZ		(1 << TIF_NOHZ)
+#define _TIF_MEMDIE		(1 << TIF_MEMDIE)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
@@ -155,6 +155,7 @@ struct thread_info {
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
 
 #define PREEMPT_ACTIVE		0x10000000
+#define PREEMPT_NEED_RESCHED	0x20000000
 
 #ifdef CONFIG_X86_32
 
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 2861082..46d889f 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -32,7 +32,7 @@ void common(void) {
 	OFFSET(TI_flags, thread_info, flags);
 	OFFSET(TI_status, thread_info, status);
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
-	OFFSET(TI_preempt_count, thread_info, preempt_count);
+//	OFFSET(TI_preempt_count, thread_info, preempt_count);
 
 	BLANK();
 	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1b69951..011b6d3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1118,10 +1118,8 @@ ENTRY(native_iret)
 	/* Returning to kernel space. Check if we need preemption */
 	/* rcx:	 threadinfo. interrupts off. */
 ENTRY(retint_kernel)
-	cmpl $0,TI_preempt_count(%rcx)
+	cmpl $PREEMPT_NEED_RESCHED,PER_CPU_VAR(__preempt_count_var)
 	jnz  retint_restore_args
-	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
-	jnc  retint_restore_args
 	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
 	jnc  retint_restore_args
 	call preempt_schedule_irq
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 05bcc09..5d7f83c 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -107,14 +107,14 @@
  * used in the general case to determine whether sleeping is possible.
  * Do not use in_atomic() in driver code.
  */
-#define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
+#define in_atomic()	((preempt_count() & ~(PREEMPT_ACTIVE|PREEMPT_NEED_RESCHED)) != 0)
 
 /*
  * Check whether we were atomic before we did preempt_disable():
  * (used by the scheduler, *after* releasing the kernel lock)
  */
 #define in_atomic_preempt_off() \
-		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
+		((preempt_count() & ~(PREEMPT_ACTIVE|PREEMPT_NEED_RESCHED)) != PREEMPT_CHECK_OFFSET)
 
 #ifdef CONFIG_PREEMPT_COUNT
 # define preemptible()	(preempt_count() == 0 && !irqs_disabled())
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index f5d4723..724ee32 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -6,7 +6,8 @@
  * preempt_count (used for kernel preemption, interrupt count, etc.)
  */
 
-#include <linux/thread_info.h>
+#include <linux/thread_info.h> /* for PREEMPT_NEED_RESCHED */
+#include <asm/percpu.h>
 #include <linux/linkage.h>
 #include <linux/list.h>
 
@@ -21,7 +22,9 @@
 #define inc_preempt_count() add_preempt_count(1)
 #define dec_preempt_count() sub_preempt_count(1)
 
-#define preempt_count()	(current_thread_info()->preempt_count)
+DECLARE_PER_CPU(int, __preempt_count_var);
+
+#define preempt_count() __raw_get_cpu_var(__preempt_count_var)
 
 #ifdef CONFIG_PREEMPT
 
@@ -29,7 +32,7 @@ asmlinkage void preempt_schedule(void);
 
 #define preempt_check_resched() \
 do { \
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+	if (unlikely(preempt_count() == PREEMPT_NEED_RESCHED)) \
 		preempt_schedule(); \
 } while (0)
 
@@ -39,7 +42,7 @@ void preempt_schedule_context(void);
 
 #define preempt_check_resched_context() \
 do { \
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+	if (unlikely(preempt_count() == PREEMPT_NEED_RESCHED)) \
 		preempt_schedule_context(); \
 } while (0)
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f79ced7..6d2ee58 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2405,7 +2405,7 @@ static inline int signal_pending_state(long state, struct task_struct *p)
 
 static inline int need_resched(void)
 {
-	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
+	return preempt_count() & PREEMPT_NEED_RESCHED;
 }
 
 /*
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index e7e0473..477c301 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -104,6 +104,7 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 #define test_thread_flag(flag) \
 	test_ti_thread_flag(current_thread_info(), flag)
 
+#define test_need_resched()	test_thread_flag(TIF_NEED_RESCHED)
 #define set_need_resched()	set_thread_flag(TIF_NEED_RESCHED)
 #define clear_need_resched()	clear_thread_flag(TIF_NEED_RESCHED)
 
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e5586ca..816e2d6 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -103,7 +103,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
 	/*
 	 * For all intents and purposes a preempted task is a running task.
 	 */
-	if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
+	if (task_thread_info(p)->saved_preempt_count & PREEMPT_ACTIVE)
 		state = TASK_RUNNING | TASK_STATE_MAX;
 #endif
 
diff --git a/init/main.c b/init/main.c
index d03d2ec..6948f23 100644
--- a/init/main.c
+++ b/init/main.c
@@ -677,7 +677,7 @@ static int __init_or_module do_one_initcall_debug(initcall_t fn)
 
 int __init_or_module do_one_initcall(initcall_t fn)
 {
-	int count = preempt_count();
+	int count = preempt_count() & PREEMPT_MASK;
 	int ret;
 	char msgbuf[64];
 
@@ -688,9 +688,10 @@ int __init_or_module do_one_initcall(initcall_t fn)
 
 	msgbuf[0] = 0;
 
-	if (preempt_count() != count) {
+	if ((preempt_count() & PREEMPT_MASK) != count) {
 		sprintf(msgbuf, "preemption imbalance ");
-		preempt_count() = count;
+		preempt_count() &= ~PREEMPT_MASK;
+		preempt_count() |= count;
 	}
 	if (irqs_disabled()) {
 		strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f823..6d113d8 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -87,10 +87,9 @@ void user_enter(void)
  */
 void __sched notrace preempt_schedule_context(void)
 {
-	struct thread_info *ti = current_thread_info();
 	enum ctx_state prev_ctx;
 
-	if (likely(ti->preempt_count || irqs_disabled()))
+	if (likely(preempt_count() || irqs_disabled()))
 		return;
 
 	/*
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a..7a1afab 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -106,6 +106,9 @@ static void cpu_idle_loop(void)
 				current_set_polling();
 			}
 			arch_cpu_idle_exit();
+
+			if (test_need_resched())
+				preempt_count() |= PREEMPT_NEED_RESCHED;
 		}
 		tick_nohz_idle_exit();
 		schedule_preempt_disabled();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 54957a6..e52e468 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -89,6 +89,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
+DEFINE_PER_CPU(int, __preempt_count_var) = INIT_PREEMPT_COUNT;
+
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	unsigned long delta;
@@ -526,8 +528,10 @@ void resched_task(struct task_struct *p)
 	set_tsk_need_resched(p);
 
 	cpu = task_cpu(p);
-	if (cpu == smp_processor_id())
+	if (cpu == smp_processor_id()) {
+		preempt_count() |= PREEMPT_NEED_RESCHED;
 		return;
+	}
 
 	/* NEED_RESCHED must be visible before we test polling */
 	smp_mb();
@@ -994,7 +998,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	 * ttwu() will sort out the placement.
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+			!(task_thread_info(p)->saved_preempt_count & PREEMPT_ACTIVE));
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1411,6 +1415,9 @@ static void sched_ttwu_pending(void)
 
 void scheduler_ipi(void)
 {
+	if (test_need_resched())
+		preempt_count() |= PREEMPT_NEED_RESCHED;
+
 	if (llist_empty(&this_rq()->wake_list)
 			&& !tick_nohz_full_cpu(smp_processor_id())
 			&& !got_nohz_idle_kick())
@@ -1728,7 +1735,7 @@ void sched_fork(struct task_struct *p)
 #endif
 #ifdef CONFIG_PREEMPT_COUNT
 	/* Want to start with kernel preemption disabled. */
-	task_thread_info(p)->preempt_count = 1;
+	task_thread_info(p)->saved_preempt_count = 1;
 #endif
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
@@ -2013,6 +2020,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 #endif
 
+#ifdef CONFIG_PREEMPT_COUNT
+	/*
+	 * If it weren't for PREEMPT_ACTIVE we could guarantee that the
+	 * preempt_count() of all tasks was equal here and this wouldn't be
+	 * needed at all -- try and move PREEMPT_ACTIVE into TI_flags?
+	 */
+	task_thread_info(prev)->saved_preempt_count = preempt_count();
+	preempt_count() = task_thread_info(next)->saved_preempt_count;
+#endif
+
 	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
@@ -2241,7 +2258,7 @@ void __kprobes add_preempt_count(int val)
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
 #endif
-	if (preempt_count() == val)
+	if ((preempt_count() & ~PREEMPT_NEED_RESCHED) == val)
 		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(add_preempt_count);
@@ -2252,7 +2269,7 @@ void __kprobes sub_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+	if (DEBUG_LOCKS_WARN_ON(val > (preempt_count() & ~PREEMPT_NEED_RESCHED)))
 		return;
 	/*
 	 * Is the spinlock portion underflowing?
@@ -2262,7 +2279,7 @@ void __kprobes sub_preempt_count(int val)
 		return;
 #endif
 
-	if (preempt_count() == val)
+	if ((preempt_count() & ~PREEMPT_NEED_RESCHED) == val)
 		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	preempt_count() -= val;
 }
@@ -2433,6 +2450,7 @@ static void __sched __schedule(void)
 	put_prev_task(rq, prev);
 	next = pick_next_task(rq);
 	clear_tsk_need_resched(prev);
+	preempt_count() &= ~PREEMPT_NEED_RESCHED;
 	rq->skip_clock_update = 0;
 
 	if (likely(prev != next)) {
@@ -2515,13 +2533,11 @@ void __sched schedule_preempt_disabled(void)
  */
 asmlinkage void __sched notrace preempt_schedule(void)
 {
-	struct thread_info *ti = current_thread_info();
-
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
 	 */
-	if (likely(ti->preempt_count || irqs_disabled()))
+	if (likely((preempt_count() & ~PREEMPT_NEED_RESCHED) || irqs_disabled()))
 		return;
 
 	do {
@@ -2546,11 +2562,10 @@ EXPORT_SYMBOL(preempt_schedule);
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
-	struct thread_info *ti = current_thread_info();
 	enum ctx_state prev_state;
 
 	/* Catch callers which need to be fixed */
-	BUG_ON(ti->preempt_count || !irqs_disabled());
+	BUG_ON((preempt_count() & ~PREEMPT_NEED_RESCHED) || !irqs_disabled());
 
 	prev_state = exception_enter();
 
@@ -4217,7 +4232,8 @@ void init_idle(struct task_struct *idle, int cpu)
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
-	task_thread_info(idle)->preempt_count = 0;
+	task_thread_info(idle)->saved_preempt_count = 0;
+	per_cpu(__preempt_count_var, cpu) = 0;
 
 	/*
 	 * The idle tasks have their own, simple scheduling class:
@@ -6562,7 +6578,8 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+	int nested = (preempt_count() & ~(PREEMPT_ACTIVE|PREEMPT_NEED_RESCHED)) +
+		     rcu_preempt_depth();
 
 	return (nested == preempt_offset);
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index be3d351..87ede78 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -114,7 +114,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
 		trace_softirqs_off(ip);
 	raw_local_irq_restore(flags);
 
-	if (preempt_count() == cnt)
+	if ((preempt_count() & ~PREEMPT_NEED_RESCHED) == cnt)
 		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 #else /* !CONFIG_TRACE_IRQFLAGS */
@@ -243,20 +243,20 @@ asmlinkage void __do_softirq(void)
 	do {
 		if (pending & 1) {
 			unsigned int vec_nr = h - softirq_vec;
-			int prev_count = preempt_count();
+			int prev_count = preempt_count() & ~PREEMPT_NEED_RESCHED;
 
 			kstat_incr_softirqs_this_cpu(vec_nr);
 
 			trace_softirq_entry(vec_nr);
 			h->action(h);
 			trace_softirq_exit(vec_nr);
-			if (unlikely(prev_count != preempt_count())) {
+			if (unlikely(prev_count != (preempt_count() & ~PREEMPT_NEED_RESCHED))) {
 				printk(KERN_ERR "huh, entered softirq %u %s %p"
 				       "with preempt_count %08x,"
 				       " exited with %08x?\n", vec_nr,
 				       softirq_to_name[vec_nr], h->action,
 				       prev_count, preempt_count());
-				preempt_count() = prev_count;
+				preempt_count() = prev_count | (preempt_count() & PREEMPT_NEED_RESCHED);
 			}
 
 			rcu_bh_qs(cpu);
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13..4c6be29 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 			  unsigned long data)
 {
-	int preempt_count = preempt_count();
+	int count = preempt_count() & PREEMPT_MASK;
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1119,16 +1119,17 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 
 	lock_map_release(&lockdep_map);
 
-	if (preempt_count != preempt_count()) {
+	if (count != (preempt_count() & PREEMPT_MASK)) {
 		WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
-			  fn, preempt_count, preempt_count());
+			  fn, count, preempt_count());
 		/*
 		 * Restore the preempt count. That gives us a decent
 		 * chance to survive and extract information. If the
 		 * callback kept a lock held, bad luck, but not worse
 		 * than the BUG() we had.
 		 */
-		preempt_count() = preempt_count;
+		preempt_count() &= ~PREEMPT_MASK;
+		preempt_count() |= count;
 	}
 }
 
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 4c0d0e5..04abe53 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -9,10 +9,9 @@
 
 notrace unsigned int debug_smp_processor_id(void)
 {
-	unsigned long preempt_count = preempt_count();
 	int this_cpu = raw_smp_processor_id();
 
-	if (likely(preempt_count))
+	if (likely(preempt_count()))
 		goto out;
 
 	if (irqs_disabled())


* Re: [RFC] per-cpu preempt_count
  2013-08-13 12:26         ` Peter Zijlstra
@ 2013-08-13 15:39           ` Linus Torvalds
  2013-08-13 15:56             ` Ingo Molnar
  2013-08-13 16:29             ` Peter Zijlstra
  0 siblings, 2 replies; 17+ messages in thread
From: Linus Torvalds @ 2013-08-13 15:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, Mike Galbraith, Peter Anvin,
	Andi Kleen, Linux Kernel Mailing List

On Tue, Aug 13, 2013 at 5:26 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> So we're now down to something like:
>
>   decl fs:preempt_count
>   cmpl PREEMPT_NEED_RESCHED,fs:preempt_count
>   jnz 1f

Well, this isn't worth doing unless you can make PREEMPT_NEED_RESCHED
be the high bit, and we can combine it into just "decl+jns". Otherwise
we'd be better off with the simpler two separate adjacent variables.

Also, I think your patch is too big, and you should have aimed to just
make the "preempt_count()" helper function mask off PREEMPT_MASK, so
that you don't change the semantics of that. I realize that there are
a couple of users that do things like "preempt_count() += x", and you
probably wanted to keep those working, but I think it is easier (and
cleaner) to fix those to "preempt_count_update(x)" instead of adding
all those explicitly PREEMPT_MASK masks.
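
Roughly like this (sketch; the macro bodies are assumptions, with
preempt_count_update() just being the suggested name for the handful
of direct writers):

/* reads keep their old meaning: the resched flag is never visible */
#define preempt_count() \
	(__raw_get_cpu_var(__preempt_count_var) & ~PREEMPT_NEED_RESCHED)

/* for the few places that currently do "preempt_count() += x" */
#define preempt_count_update(val) \
	do { __raw_get_cpu_var(__preempt_count_var) += (val); } while (0)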

Hmm?

             Linus


* Re: [RFC] per-cpu preempt_count
  2013-08-13 15:39           ` Linus Torvalds
@ 2013-08-13 15:56             ` Ingo Molnar
  2013-08-13 16:26               ` Peter Zijlstra
  2013-08-13 16:28               ` H. Peter Anvin
  2013-08-13 16:29             ` Peter Zijlstra
  1 sibling, 2 replies; 17+ messages in thread
From: Ingo Molnar @ 2013-08-13 15:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Thomas Gleixner, Mike Galbraith, Peter Anvin,
	Andi Kleen, Linux Kernel Mailing List


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Tue, Aug 13, 2013 at 5:26 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > So we're now down to something like:
> >
> >   decl fs:preempt_count
> >   cmpl PREEMPT_NEED_RESCHED,fs:preempt_count
> >   jnz 1f
> 
> Well, this isn't worth doing unless you can make PREEMPT_NEED_RESCHED be 
> the high bit, and we can combine it into just "decl+jns". Otherwise we'd 
> be better off with the simpler two separate adjacent variables.

Definitely, the cmpl should be avoided.

PREEMPT_NEED_RESCHED could be made the high bit - or maybe an even simpler 
solution is to invert its meaning: making '0' the "it needs to resched!" 
case, so the check would be decl+jz?
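
I.e. the fastpath would boil down to something like (sketch only; the
jnz around the call is the same single test):

	decl %fs:preempt_count
	jnz 1f			# nonzero: still nested, or no resched pending
	call __preempt_schedule
1: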

Thanks,

	Ingo


* Re: [RFC] per-cpu preempt_count
  2013-08-13 15:56             ` Ingo Molnar
@ 2013-08-13 16:26               ` Peter Zijlstra
  2013-08-13 16:28               ` H. Peter Anvin
  1 sibling, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2013-08-13 16:26 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Thomas Gleixner, Mike Galbraith, Peter Anvin,
	Andi Kleen, Linux Kernel Mailing List

On Tue, Aug 13, 2013 at 05:56:37PM +0200, Ingo Molnar wrote:

> PREEMPT_NEED_RESCHED could be made the high bit - or maybe an even simpler 
> solution is to invert its meaning: making '0' the "it needs to resched!" 
> case, so the check would be decl+jz?

Right, inverted NEED_RESCHED would work.



* Re: [RFC] per-cpu preempt_count
  2013-08-13 15:56             ` Ingo Molnar
  2013-08-13 16:26               ` Peter Zijlstra
@ 2013-08-13 16:28               ` H. Peter Anvin
  1 sibling, 0 replies; 17+ messages in thread
From: H. Peter Anvin @ 2013-08-13 16:28 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Peter Zijlstra, Thomas Gleixner, Mike Galbraith,
	Andi Kleen, Linux Kernel Mailing List

On 08/13/2013 08:56 AM, Ingo Molnar wrote:
> 
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
>> On Tue, Aug 13, 2013 at 5:26 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>>>
>>> So we're now down to something like:
>>>
>>>   decl fs:preempt_count
>>>   cmpl PREEMPT_NEED_RESCHED,fs:preempt_count
>>>   jnz 1f
>>
>> Well, this isn't worth doing unless you can make PREEMPT_NEED_RESCHED be 
>> the high bit, and we can combine it into just "decl+jns". Otherwise we'd 
>> be better off with the simpler two separate adjacent variables.
> 
> Definitely, the cmpl should be avoided.
> 
> PREEMPT_NEED_RESCHED could be made the high bit - or maybe an even simpler 
> solution is to invert its meaning: making '0' the "it needs to resched!" 
> case, so the check would be decl+jz?
> 

That is pretty elegant.  A little more elegant in fact than my
suggestion to bias NEED_RESCHED by 0x7fffffff and test the overflow flag.

That way we also avoid going off to the slow path without the count
being zero, which, although we already covered that it doesn't matter
all that much, is still a nice bonus.

	-hpa



* Re: [RFC] per-cpu preempt_count
  2013-08-13 15:39           ` Linus Torvalds
  2013-08-13 15:56             ` Ingo Molnar
@ 2013-08-13 16:29             ` Peter Zijlstra
  2013-08-13 16:38               ` Linus Torvalds
  1 sibling, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2013-08-13 16:29 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Thomas Gleixner, Mike Galbraith, Peter Anvin,
	Andi Kleen, Linux Kernel Mailing List

On Tue, Aug 13, 2013 at 08:39:46AM -0700, Linus Torvalds wrote:
> Also, I think your patch is too big, and you should have aimed to just
> make the "preempt_count()" helper function mask off PREEMPT_MASK, so
> that you don't change the semantics of that. I realize that there are
> a couple of users that do things like "preempt_count() += x", and you
> probably wanted to keep those working, but I think it is easier (and
> cleaner) to fix those to "preempt_count_update(x)" instead of adding
> all those explicitly PREEMPT_MASK masks.

For sure.. but I didn't want to spend time cleaning things up until
there was something half-way promising in it.

Ingo's inverted need_resched idea, which gives decl+jnz, should do it
though. Not entirely sure I understand your MSB + jns suggestion:

  0x80000002 - 1 = 0x80000001

Both are very much signed and neither wants to cause a reschedule.


* Re: [RFC] per-cpu preempt_count
  2013-08-13 16:29             ` Peter Zijlstra
@ 2013-08-13 16:38               ` Linus Torvalds
  0 siblings, 0 replies; 17+ messages in thread
From: Linus Torvalds @ 2013-08-13 16:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, Mike Galbraith, Peter Anvin,
	Andi Kleen, Linux Kernel Mailing List

On Tue, Aug 13, 2013 at 9:29 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Ingo's inverted need_resched idea, which gives decl+jnz, should do it
> though.

I agree that that is a good approach.

> Not entirely sure I understand your MSB + jns suggestion:
>
>   0x80000002 - 1 = 0x80000001
>
> Both are very much signed and neither wants to cause a reschedule.

The thing is, we don't check the preempt count even currently, so the
above isn't fatal. The "need preemption" bit being set (or reset with
the reverse bit meaning) should be the unusual case with preemption
(you have to hit the race), *and* it should be unusual to have deeply
nested preemption anyway, so it's fine to test that in the slow path
(and we do: preempt_schedule() checks the preempt count being zero and
irqs being disabled, *exactly* because the preemption enable check
isn't precise).

But avoiding a few sloppy cases is certainly good, even if they are
unusual, so I do like the reversed bit approach. It also allows us to
pick any arbitrary bit, although I'm not sure that matters much.

               Linus


* Re: [RFC] per-cpu preempt_count
  2013-08-12 17:35 ` Linus Torvalds
  2013-08-12 17:51   ` H. Peter Anvin
  2013-08-12 17:58   ` Ingo Molnar
@ 2013-08-18 17:57   ` Paul E. McKenney
  2 siblings, 0 replies; 17+ messages in thread
From: Paul E. McKenney @ 2013-08-18 17:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Peter Anvin, Andi Kleen, Linux Kernel Mailing List

On Mon, Aug 12, 2013 at 10:35:48AM -0700, Linus Torvalds wrote:
> On Mon, Aug 12, 2013 at 4:51 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > The below boots to wanting to mount a root filesystem with
> > CONFIG_PREEMPT=y using kvm -smp 4.
> 
> But doesn't work in general? Or you just never tested?
> 
> I think that "thread_info->preempt_count" variable would need to be
> renamed to "saved_preempt_count" or similar to make sure we catch any
> users. But the patch certainly looks simple otherwise.
> 
> I'm pretty sure I had a discussion about this with Paul McKenney some
> time ago (because the RCU readlock is the most noticeable user of the
> preempt count - the others tend to be hidden inside the out-of-line
> spinlock functions etc), and I thought he had tried this and had some
> problems. Maybe we've fixed things since, or maybe he missed some
> case..

I was doing something a bit different -- trying to put preemptible RCU's
nesting counter into a per-CPU variable.  I considered putting this
counter into thread_info, but got flummoxed by the save/restore code.
If Peter's approach works out, I will look into a similar approach for
RCU's nesting counter.

For whatever it is worth, with the current Kconfigs, RCU only invokes
preempt_enable() and preempt_disable() when CONFIG_PREEMPT=n, in which
case these two functions are nops.  So RCU never exercises the
conditional function call in preempt_enable().

However, preemptible RCU has a situation similar to preempt_disable()
and preempt_enable(): simple increment and (not so simple) decrement in
the common case, and rare conditional function call from rcu_read_unlock()
that is invoked only if the read-side critical section was preempted or
ran for a long time.
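
The shape is roughly this (heavily simplified sketch, not the actual
RCU code):

static inline void rcu_read_lock_sketch(void)
{
	current->rcu_read_lock_nesting++;	/* simple increment */
	barrier();
}

static inline void rcu_read_unlock_sketch(void)
{
	barrier();
	if (--current->rcu_read_lock_nesting == 0 &&
	    unlikely(current->rcu_read_unlock_special))
		rcu_read_unlock_special(current);	/* rare slow path */
}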

							Thanx, Paul

> But if the patch really is this simple, then we should just do it. Of
> course, we should double-check that the percpu preempt count is in a
> cacheline that is already accessed (preferably already dirtied) by the
> context switching code.  And I think this should be an
> architecture-specific thing, because using a percpu variable might be
> good on some architectures but not others. So I get the feeling that
> it should be in the x86 __switch_to(), rather than in the generic
> code. I think it would fit very well with the per-cpu "old_rsp" and
> "current_task" updates that we already do.
> 
> > Adding TIF_NEED_RESCHED into the preempt count would allow a single test
> > in preempt_check_resched() instead of still needing the TI. Removing
> > PREEMPT_ACTIVE from preempt count should allow us to get rid of
> > ti::preempt_count altogether.
> >
> > The only problem with TIF_NEED_RESCHED is that it's cross-cpu which would
> > make the entire thing atomic which would suck donkey balls so maybe we
> > need two separate per-cpu variables?
> 
> Agreed. Making it atomic would suck, and cancel all advantages of the
> better code generation to access it. Good point.
> 
> And yeah, it could be two variables in the same cacheline or something.
> 
>                  Linus


