linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* per_cpu fixes
@ 2003-07-09 21:20 David Mosberger
  2003-07-09 21:56 ` David Mosberger
  2003-07-10  1:41 ` Rusty Russell
  0 siblings, 2 replies; 9+ messages in thread
From: David Mosberger @ 2003-07-09 21:20 UTC (permalink / raw)
  To: rusty; +Cc: linux-kernel

Rusty,

Care needs to be taken when taking the address of a CPU-local
variable, because otherwise things may break when comparing addresses
on a platform which uses virtual remapping to implement such
variables.  In particular, it's almost always unsafe to use the
address of a per-CPU variable which contains a "struct list", because
the list-manipulation routines use the list-head address to detect the
end of the list etc.

The patch below makes 2.5.74+ work on ia64 by reverting to the old
definition of this_rq().  Ditto for kernel/timer.c.

	--david

===== kernel/sched.c 1.196 vs edited =====
--- 1.196/kernel/sched.c	Wed Jul  9 11:06:25 2003
+++ edited/kernel/sched.c	Wed Jul  9 14:06:49 2003
@@ -176,7 +176,7 @@
 static DEFINE_PER_CPU(struct runqueue, runqueues);
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
-#define this_rq()		(&__get_cpu_var(runqueues))
+#define this_rq()		(&cpu_rq(smp_processor_id())) /* not __get_cpu_var(runqueues)! */
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define rt_task(p)		((p)->prio < MAX_RT_PRIO)
===== kernel/timer.c 1.62 vs edited =====
--- 1.62/kernel/timer.c	Wed Jul  9 11:06:25 2003
+++ edited/kernel/timer.c	Wed Jul  9 13:30:31 2003
@@ -160,7 +160,7 @@
  */
 void add_timer(struct timer_list *timer)
 {
-	tvec_base_t *base = &get_cpu_var(tvec_bases);
+	tvec_base_t *base = &per_cpu(tvec_bases, get_cpu());
   	unsigned long flags;
   
   	BUG_ON(timer_pending(timer) || !timer->function);
@@ -171,7 +171,7 @@
 	internal_add_timer(base, timer);
 	timer->base = base;
 	spin_unlock_irqrestore(&base->lock, flags);
-	put_cpu_var(tvec_bases);
+	put_cpu();
 }
 
 /***
@@ -234,7 +234,7 @@
 		return 1;
 
 	spin_lock_irqsave(&timer->lock, flags);
-	new_base = &__get_cpu_var(tvec_bases);
+	new_base = &per_cpu(tvec_bases, smp_processor_id());
 repeat:
 	old_base = timer->base;
 
@@ -792,7 +792,7 @@
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	tvec_base_t *base = &__get_cpu_var(tvec_bases);
+	tvec_base_t *base = &per_cpu(tvec_bases, smp_processor_id());
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: per_cpu fixes
  2003-07-09 21:20 per_cpu fixes David Mosberger
@ 2003-07-09 21:56 ` David Mosberger
  2003-07-10  1:41 ` Rusty Russell
  1 sibling, 0 replies; 9+ messages in thread
From: David Mosberger @ 2003-07-09 21:56 UTC (permalink / raw)
  To: rusty; +Cc: davidm, linux-kernel

>>>>> On Wed, 9 Jul 2003 14:20:29 -0700, David Mosberger <davidm@linux.hpl.hp.com> said:

  David> Care needs to be taken when taking the address of a CPU-local
  David> variable, because otherwise things may break when comparing addresses
  David> on a platform which uses virtual remapping to implement such
  David> variables.  In particular, it's almost always unsafe to use the
  David> address of a per-CPU variable which contains a "struct list", because
  David> the list-manipulation routines use the list-head address to detect the
  David> end of the list etc.

  David> The patch below makes 2.5.74+ work on ia64 by reverting to the old
  David> definition of this_rq().  Ditto for kernel/timer.c.

Sorry, the original patch had an extraneous & which snuck in undetected.

Patch below should be better...

	--david

===== kernel/sched.c 1.196 vs edited =====
--- 1.196/kernel/sched.c	Wed Jul  9 11:06:25 2003
+++ edited/kernel/sched.c	Wed Jul  9 14:36:55 2003
@@ -176,7 +176,7 @@
 static DEFINE_PER_CPU(struct runqueue, runqueues);
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
-#define this_rq()		(&__get_cpu_var(runqueues))
+#define this_rq()		(cpu_rq(smp_processor_id())) /* not __get_cpu_var(runqueues)! */
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define rt_task(p)		((p)->prio < MAX_RT_PRIO)
===== kernel/timer.c 1.62 vs edited =====
--- 1.62/kernel/timer.c	Wed Jul  9 11:06:25 2003
+++ edited/kernel/timer.c	Wed Jul  9 13:30:31 2003
@@ -160,7 +160,7 @@
  */
 void add_timer(struct timer_list *timer)
 {
-	tvec_base_t *base = &get_cpu_var(tvec_bases);
+	tvec_base_t *base = &per_cpu(tvec_bases, get_cpu());
   	unsigned long flags;
   
   	BUG_ON(timer_pending(timer) || !timer->function);
@@ -171,7 +171,7 @@
 	internal_add_timer(base, timer);
 	timer->base = base;
 	spin_unlock_irqrestore(&base->lock, flags);
-	put_cpu_var(tvec_bases);
+	put_cpu();
 }
 
 /***
@@ -234,7 +234,7 @@
 		return 1;
 
 	spin_lock_irqsave(&timer->lock, flags);
-	new_base = &__get_cpu_var(tvec_bases);
+	new_base = &per_cpu(tvec_bases, smp_processor_id());
 repeat:
 	old_base = timer->base;
 
@@ -792,7 +792,7 @@
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	tvec_base_t *base = &__get_cpu_var(tvec_bases);
+	tvec_base_t *base = &per_cpu(tvec_bases, smp_processor_id());
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: per_cpu fixes
  2003-07-09 21:20 per_cpu fixes David Mosberger
  2003-07-09 21:56 ` David Mosberger
@ 2003-07-10  1:41 ` Rusty Russell
  2003-07-10  9:37   ` Andi Kleen
  2003-07-10 17:55   ` David Mosberger
  1 sibling, 2 replies; 9+ messages in thread
From: Rusty Russell @ 2003-07-10  1:41 UTC (permalink / raw)
  To: davidm; +Cc: linux-kernel, torvalds, ak

In message <200307092120.h69LKTBH002759@napali.hpl.hp.com> you write:
> Rusty,
> 
> Care needs to be taken when taking the address of a CPU-local
> variable, because otherwise things may break when comparing addresses
> on a platform which uses virtual remapping to implement such
> variables.  In particular, it's almost always unsafe to use the
> address of a per-CPU variable which contains a "struct list", because
> the list-manipulation routines use the list-head address to detect the
> end of the list etc.

The horror.  Such rules are entirely too much problem to push on the
poor programmer 8(

When I implemented this, I imagined archs putting their per-cpu offset
inside a register, so they could get to their vars in one instruction,
but not the IA64 remapping thing.  We are now suffering because of my
limited imagination (which David has commented on before 8).

A compromise is possible.  I believe that the address of a per-cpu
variable *must* be the same everywhere, but we can provide get & set
macros which never expose an lvalue, and on IA64 could use the pinned
TLB thing:

/* Usage: set_cpu_local(myint, = 1), or set_cpu_local(mystruct,.member = 1) */
#define set_cpu_local(var, assign) ...

/* Usage: get_cpu_local(myint), or get_cpu_local(mystruct).member */
#define get_cpu_local(var) ...

I rejected such an approach before when Andi Kleen asked for it (IIRC
he wanted to use %gs as the per-cpu ptr, but couldn't easily produce
an lvalue), because I wanted a nice, clean interface.  However, recent
gcc handles the struct result of the statement expression flawlessly
AFAICT, so I'm less inclined to resist.

Thoughts?
Rusty.
PS. David, this is your revenge for making more work for you, isn't it?
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: per_cpu fixes
  2003-07-10  1:41 ` Rusty Russell
@ 2003-07-10  9:37   ` Andi Kleen
  2003-07-10 17:55   ` David Mosberger
  1 sibling, 0 replies; 9+ messages in thread
From: Andi Kleen @ 2003-07-10  9:37 UTC (permalink / raw)
  To: Rusty Russell; +Cc: davidm, linux-kernel, torvalds, ak

> When I implemented this, I imagined archs putting their per-cpu offset
> inside a register, so they could get to their vars in one instruction,
> but not the IA64 remapping thing.  We are now suffering because of my
> limited imagination (which David has commented on before 8).

x86-64 has similar problems. While the virtual addresses are different
the direct access using the segment register doesn't yield an address
(there is no LEA instruction to read from segment registers). It can be
worked around, but you have to follow an indirect pointer.
For most efficient access you can't take the address.

[currently the code doesn't use the Segment access for per_cpu data,
but I plan to readd this eventually]

-Andi

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: per_cpu fixes
  2003-07-10  1:41 ` Rusty Russell
  2003-07-10  9:37   ` Andi Kleen
@ 2003-07-10 17:55   ` David Mosberger
  2003-07-10 18:15     ` Linus Torvalds
  2003-07-11  2:01     ` Rusty Russell
  1 sibling, 2 replies; 9+ messages in thread
From: David Mosberger @ 2003-07-10 17:55 UTC (permalink / raw)
  To: Rusty Russell; +Cc: davidm, linux-kernel, torvalds, ak

>>>>> On Thu, 10 Jul 2003 11:41:07 +1000, Rusty Russell <rusty@rustcorp.com.au> said:

  Rusty> A compromise is possible.  I believe that the address of a
  Rusty> per-cpu variable *must* be the same everywhere, but we can
  Rusty> provide get & set macros which never expose an lvalue, and on
  Rusty> IA64 could use the pinned TLB thing:

  Rusty> /* Usage: set_cpu_local(myint, = 1), or set_cpu_local(mystruct,.member = 1) */
  Rusty> #define set_cpu_local(var, assign) ...

  Rusty> /* Usage: get_cpu_local(myint), or get_cpu_local(mystruct).member */
  Rusty> #define get_cpu_local(var)
  Rusty> ...

You mean there would be three primitives:

 (1) get value from a per-CPU variable
 (2) set value of a per-CPU variable
 (3) get the (canonical) address of a per-CPU variable

?

If so, I could live with that.

I don't like the proposed syntax for (2) very much, but I don't have a
better suggestion and it may not matter much: for extremely
performance-critical stuff (2) can be used and if you really hate the
syntax, you can use (3) to get a pointer to the variable and then do
the normal thing.

On ia64, we should be able to implement (3) via a CPU-local variable
which stores the per-CPU offset for a CPU.  Ideally, calculating the
canonical address of CPU-local variable FOO would boil down to:

 movl ADDR = local_per_cpu_offset;;
 load ADDR = [ADDR];;				// read the per-CPU offset
 addl ADDR = (FOO - local_per_cpu_offset), ADDR	// calculate local addr of FOO

I'm not sure we can coax gcc to generate exactly this code, but we
should be able to get close and should definitely be able to avoid an
array lookup and accessing remote memory (in the NUMA case).

I suspect Andi could do something similar?

	--david

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: per_cpu fixes
  2003-07-10 17:55   ` David Mosberger
@ 2003-07-10 18:15     ` Linus Torvalds
  2003-07-10 18:22       ` David Mosberger
  2003-07-11  2:01     ` Rusty Russell
  1 sibling, 1 reply; 9+ messages in thread
From: Linus Torvalds @ 2003-07-10 18:15 UTC (permalink / raw)
  To: davidm; +Cc: Rusty Russell, Kernel Mailing List, ak


On Thu, 10 Jul 2003, David Mosberger wrote:
> 
> You mean there would be three primitives:
> 
>  (1) get value from a per-CPU variable
>  (2) set value of a per-CPU variable
>  (3) get the (canonical) address of a per-CPU variable

Argh.

We'd better have the rule that if there are any virtual caches or other
issues, then the "canonical address" had better be the _only_ address (or
at least any virtual remapping has to be done in such a way that it never
causes aliasing or other performance problems with the canonical address).

This is already turning fairly ugly, and I just don't want to see even 
more ugly rules like "you can't mix direct accesses with pointer accesses"

		Linus


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: per_cpu fixes
  2003-07-10 18:15     ` Linus Torvalds
@ 2003-07-10 18:22       ` David Mosberger
  0 siblings, 0 replies; 9+ messages in thread
From: David Mosberger @ 2003-07-10 18:22 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: davidm, Rusty Russell, Kernel Mailing List, ak

>>>>> On Thu, 10 Jul 2003 11:15:25 -0700 (PDT), Linus Torvalds <torvalds@osdl.org> said:

  Linus> On Thu, 10 Jul 2003, David Mosberger wrote:
  >> 
  >> You mean there would be three primitives:
  >> 
  >> (1) get value from a per-CPU variable
  >> (2) set value of a per-CPU variable
  >> (3) get the (canonical) address of a per-CPU variable

  Linus> Argh.

  Linus> We'd better have the rule that if there are any virtual
  Linus> caches or other issues, then the "canonical address" had
  Linus> better be the _only_ address (or at least any virtual
  Linus> remapping has to be done in such a way that it never causes
  Linus> aliasing or other performance problems with the canonical
  Linus> address).

  Linus> This is already turning fairly ugly, and I just don't want to
  Linus> see even more ugly rules like "you can't mix direct accesses
  Linus> with pointer accesses"

Yes, Rusty's proposal (as I think I summarized above) would do exactly
that: the only address that you'll ever see for a per-CPU variable
will be the canonical one.  The get/set macros can use an alias on
platforms where this is more efficient, but the alias will never be
visible outside the macros.

	--david

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: per_cpu fixes
  2003-07-10 17:55   ` David Mosberger
  2003-07-10 18:15     ` Linus Torvalds
@ 2003-07-11  2:01     ` Rusty Russell
  2003-07-11  2:08       ` David Mosberger
  1 sibling, 1 reply; 9+ messages in thread
From: Rusty Russell @ 2003-07-11  2:01 UTC (permalink / raw)
  To: davidm; +Cc: linux-kernel, torvalds, ak

In message <16141.43130.657025.952793@napali.hpl.hp.com> you write:
> You mean there would be three primitives:
> 
>  (1) get value from a per-CPU variable
>  (2) set value of a per-CPU variable
>  (3) get the (canonical) address of a per-CPU variable
> 
> ?

Almost, #3 would actually be "get lvalue", as now.

The only problem is that a quick audit of 2.5.75 reveals 16 places in
generic code where set/get primitives would work without jumping
through hoops, out of 43.

New idea:

Provide cpu_local_inc(var)/cpu_local_dec(var) and __cpu_local_inc(var)
and __cpu_local_dec(var).  Generic implementation:

  /* This is local.h */
  typedef atomic_t local_t;

  #define local_inc(l) atomic_inc(l)
  #define local_dec(l) atomic_dec(l)

  /* l is a per-cpu local_t.  Increment it atomically. */
  #define cpu_local_inc(l) local_inc(&__get_cpu_var(l))
  #define cpu_local_dec(l) local_dec(&__get_cpu_var(l))

  /* Increment non-atomically (must have preempt disabled) */
  #define __cpu_local_inc(l) \
	atomic_set(&__get_cpu_var(l), atomic_read(&__get_cpu_var(l)) + 1)
  #define __cpu_local_dec(l) \
	atomic_set(&__get_cpu_var(l), atomic_read(&__get_cpu_var(l)) - 1)

This catches one common case for ia64 to use the magic pinned area,
and also allows x86 to use its incl/decl instructions, which both
networking stats and module refcounts have wanted for a while.

Thoughts?
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: per_cpu fixes
  2003-07-11  2:01     ` Rusty Russell
@ 2003-07-11  2:08       ` David Mosberger
  0 siblings, 0 replies; 9+ messages in thread
From: David Mosberger @ 2003-07-11  2:08 UTC (permalink / raw)
  To: Rusty Russell; +Cc: davidm, linux-kernel, torvalds, ak

>>>>> On Fri, 11 Jul 2003 12:01:08 +1000, Rusty Russell <rusty@rustcorp.com.au> said:

  Rusty> This catches one common case for ia64 to use the magic pinned
  Rusty> area, and also allows x86 to use its incl/decl instructions,
  Rusty> which both networking stats and module refcounts have wanted
  Rusty> for a while.

Looks fine to me.

	--david

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2003-07-11  1:53 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-07-09 21:20 per_cpu fixes David Mosberger
2003-07-09 21:56 ` David Mosberger
2003-07-10  1:41 ` Rusty Russell
2003-07-10  9:37   ` Andi Kleen
2003-07-10 17:55   ` David Mosberger
2003-07-10 18:15     ` Linus Torvalds
2003-07-10 18:22       ` David Mosberger
2003-07-11  2:01     ` Rusty Russell
2003-07-11  2:08       ` David Mosberger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).