linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC, patch] i386: vgetcpu(), take 2
@ 2006-06-21  7:27 Chuck Ebbert
  2006-06-21  8:15 ` Ingo Molnar
  2006-06-21  9:26 ` Andi Kleen
  0 siblings, 2 replies; 30+ messages in thread
From: Chuck Ebbert @ 2006-06-21  7:27 UTC (permalink / raw)
  To: linux-kernel; +Cc: Linus Torvalds, Ingo Molnar, Andi Kleen

Use a GDT entry's limit field to store per-cpu data for fast access
from userspace, and provide a vsyscall to access the current CPU
number stored there.

Questions:
 1. Will the vdso relocation patch break this?
 2. Should the version number of the vsyscall .so be incremented?

Test program using the new call:

/* vgetcpu.c: get CPU number we are running on.
 * build kernel with vgetcpu patch first, then:
 *  gcc -o vgetcpu vgetcpu.c <srcpath>/arch/i386/kernel/vsyscall-sysenter.so
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

extern int __vgetcpu(void);

/*
 * Demo driver for the __vgetcpu vsyscall: print the CPU number the
 * process is currently running on.
 *
 * __vgetcpu() returns the CPU number stored in the limit field of the
 * per-cpu GDT entry, or -EFAULT if the LSL instruction faults (e.g. a
 * kernel without the vgetcpu patch).  The original code printed the
 * raw return value with %u, so an -EFAULT error would have shown up as
 * a huge unsigned number; check for failure explicitly instead.
 * argc/argv are unused.
 */
int main(int argc, char * const argv[])
{
	int cpu = __vgetcpu();

	if (cpu < 0) {
		/* Negative return is a -errno style error code. */
		fprintf(stderr, "vgetcpu failed: %d\n", cpu);
		return EXIT_FAILURE;
	}

	printf("cpu: %d\n", cpu);

	return 0;
}

---
 arch/i386/kernel/cpu/common.c        |    3 +++
 arch/i386/kernel/head.S              |    8 +++++++-
 arch/i386/kernel/vsyscall-getcpu.S   |   25 +++++++++++++++++++++++++
 arch/i386/kernel/vsyscall-int80.S    |    2 ++
 arch/i386/kernel/vsyscall-sysenter.S |    2 ++
 arch/i386/kernel/vsyscall.lds.S      |    1 +
 6 files changed, 40 insertions(+), 1 deletion(-)

--- 2.6.17-32.orig/arch/i386/kernel/cpu/common.c
+++ 2.6.17-32/arch/i386/kernel/cpu/common.c
@@ -642,6 +642,9 @@ void __cpuinit cpu_init(void)
 		((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
 		(CPU_16BIT_STACK_SIZE - 1);
 
+	/* Set up GDT entry for per-cpu data */
+ 	*(__u64 *)(&gdt[27]) |= cpu;
+
 	cpu_gdt_descr->size = GDT_SIZE - 1;
  	cpu_gdt_descr->address = (unsigned long)gdt;
 
--- 2.6.17-32.orig/arch/i386/kernel/head.S
+++ 2.6.17-32/arch/i386/kernel/head.S
@@ -525,7 +525,13 @@ ENTRY(cpu_gdt_table)
 	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
 
 	.quad 0x0000920000000000	/* 0xd0 - ESPFIX 16-bit SS */
-	.quad 0x0000000000000000	/* 0xd8 - unused */
+
+	/*
+	 * Use a GDT entry to store per-cpu data for user space (DPL 3.)
+	 * 32-bit data segment, byte granularity, base 0, limit set at runtime.
+	 */
+	.quad 0x0040f20000000000	/* 0xd8 - for per-cpu user data */
+
 	.quad 0x0000000000000000	/* 0xe0 - unused */
 	.quad 0x0000000000000000	/* 0xe8 - unused */
 	.quad 0x0000000000000000	/* 0xf0 - unused */
--- /dev/null
+++ 2.6.17-32/arch/i386/kernel/vsyscall-getcpu.S
@@ -0,0 +1,25 @@
+/*
+ * vgetcpu
+ * This file is #include'd by vsyscall-*.S to define them after the
+ * vsyscall entry point.  The kernel assumes that the addresses of these
+ * routines are constant for all vsyscall implementations.
+ */
+
+#include <linux/errno.h>
+
+	.text
+	.org __kernel_rt_sigreturn+32,0x90
+	.globl __vgetcpu
+	.type __vgetcpu,@function
+__vgetcpu:
+.LSTART_vgetcpu:
+	movl $-EFAULT,%eax
+	movl $((27<<3)|3),%edx
+	lsll %edx,%eax
+	jnz 1f
+	andl $0xff,%eax
+1:
+	ret
+.LEND_vgetcpu:
+	.size __vgetcpu,.-.LSTART_vgetcpu
+
--- 2.6.17-32.orig/arch/i386/kernel/vsyscall-int80.S
+++ 2.6.17-32/arch/i386/kernel/vsyscall-int80.S
@@ -51,3 +51,5 @@ __kernel_vsyscall:
  * Get the common code for the sigreturn entry points.
  */
 #include "vsyscall-sigreturn.S"
+
+#include "vsyscall-getcpu.S"
--- 2.6.17-32.orig/arch/i386/kernel/vsyscall-sysenter.S
+++ 2.6.17-32/arch/i386/kernel/vsyscall-sysenter.S
@@ -120,3 +120,5 @@ SYSENTER_RETURN:
  * Get the common code for the sigreturn entry points.
  */
 #include "vsyscall-sigreturn.S"
+
+#include "vsyscall-getcpu.S"
--- 2.6.17-32.orig/arch/i386/kernel/vsyscall.lds.S
+++ 2.6.17-32/arch/i386/kernel/vsyscall.lds.S
@@ -57,6 +57,7 @@ VERSION
     	__kernel_vsyscall;
     	__kernel_sigreturn;
     	__kernel_rt_sigreturn;
+	__vgetcpu;
 
     local: *;
   };
-- 
Chuck
 "You can't read a newspaper if you can't read."  --George W. Bush

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21  7:27 [RFC, patch] i386: vgetcpu(), take 2 Chuck Ebbert
@ 2006-06-21  8:15 ` Ingo Molnar
  2006-06-21 17:38   ` Artur Skawina
  2006-06-28  5:44   ` Paul Jackson
  2006-06-21  9:26 ` Andi Kleen
  1 sibling, 2 replies; 30+ messages in thread
From: Ingo Molnar @ 2006-06-21  8:15 UTC (permalink / raw)
  To: Chuck Ebbert
  Cc: linux-kernel, Linus Torvalds, Andi Kleen, Ulrich Drepper,
	Roland McGrath, Jakub Jelinek


* Chuck Ebbert <76306.1226@compuserve.com> wrote:

> Use a GDT entry's limit field to store per-cpu data for fast access 
> from userspace, and provide a vsyscall to access the current CPU 
> number stored there.

very nice idea! I thought of doing sys_get_cpu() too, but my idea was to 
use the scheduler to keep a writable [and permanently pinned, 
per-thread] VDSO data page up to date with the current CPU# [and other 
interesting data]. Btw., do we know how fast LSL is on modern CPUs?

> Questions:
>  1. Will the vdso relocation patch break this?

no - why should it?

>  2. Should the version number of the vsyscall .so be incremented?

i've Cc:-ed the glibc folks.

but my gut feeling is that we should add a proper sys_get_cpu() syscall 
as well, and thus make this a transparent syscall, not dependent on the 
availability of the vDSO.

> +__vgetcpu:
> +.LSTART_vgetcpu:
> +	movl $-EFAULT,%eax
> +	movl $((27<<3)|3),%edx
> +	lsll %edx,%eax
> +	jnz 1f
> +	andl $0xff,%eax
> +1:
> +	ret

this needs unwinder annotations as well to make this a proper DSO, so 
that for example a breakpoint here does not confuse gdb.

also, would be nice to do something like this in 64-bit mode too.

	Ingo

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21  7:27 [RFC, patch] i386: vgetcpu(), take 2 Chuck Ebbert
  2006-06-21  8:15 ` Ingo Molnar
@ 2006-06-21  9:26 ` Andi Kleen
  2006-06-21  9:35   ` Ingo Molnar
  2006-06-21 21:54   ` Rohit Seth
  1 sibling, 2 replies; 30+ messages in thread
From: Andi Kleen @ 2006-06-21  9:26 UTC (permalink / raw)
  To: Chuck Ebbert; +Cc: Linus Torvalds, Ingo Molnar, linux-kernel

Chuck Ebbert <76306.1226@compuserve.com> writes:

> Use a GDT entry's limit field to store per-cpu data for fast access
> from userspace, and provide a vsyscall to access the current CPU
> number stored there.

Just the CPU alone is useless - you want at least the node too in many
cases. Best you use the prototype I proposed earlier for x86-64.

-Andi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21  9:26 ` Andi Kleen
@ 2006-06-21  9:35   ` Ingo Molnar
  2006-06-21 21:54   ` Rohit Seth
  1 sibling, 0 replies; 30+ messages in thread
From: Ingo Molnar @ 2006-06-21  9:35 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Chuck Ebbert, Linus Torvalds, linux-kernel, Ulrich Drepper,
	Jakub Jelinek, Roland McGrath


* Andi Kleen <ak@suse.de> wrote:

> Chuck Ebbert <76306.1226@compuserve.com> writes:
> 
> > Use a GDT entry's limit field to store per-cpu data for fast access 
> > from userspace, and provide a vsyscall to access the current CPU 
> > number stored there.
> 
> Just the CPU alone is useless - you want at least the node too in many 
> cases. Best you use the prototype I proposed earlier for x86-64.

just the CPU is fine already in many cases [and the node ID derives from 
the linear CPU id anyway] - but i agree that in the API we want to 
include the node-ID too, for NUMA-aware userspace allocators, etc.

	Ingo

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21  8:15 ` Ingo Molnar
@ 2006-06-21 17:38   ` Artur Skawina
  2006-06-28  5:44   ` Paul Jackson
  1 sibling, 0 replies; 30+ messages in thread
From: Artur Skawina @ 2006-06-21 17:38 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Chuck Ebbert, linux-kernel, Linus Torvalds, Andi Kleen,
	Ulrich Drepper, Roland McGrath, Jakub Jelinek

Ingo Molnar wrote:
> * Chuck Ebbert <76306.1226@compuserve.com> wrote:
> 
>> Use a GDT entry's limit field to store per-cpu data for fast access 
>> from userspace, and provide a vsyscall to access the current CPU 
>> number stored there.
> 
> very nice idea! I thought of doing sys_get_cpu() too, but my idea was to 
> use the scheduler to keep a writable [and permanently pinned, 
> per-thread] VDSO data page up to date with the current CPU# [and other 
> interesting data]. Btw., do we know how fast LSL is on modern CPUs?

a quick check on two p2/p4 boxes gives the cycle numbers below. syscall/io times for comparison.
Not that cheap, but still only ~1/4 of a syscall...

 P4   P2
 123  39  {movl $-47,%%eax ; movl $((27<<3)|3),%%edx ; lsll %%edx,%%eax ; jnz 1f ; andl $0xff,%%eax ; 1: ;}  (average: 155)

 959  287 {movl $20,%%eax ; int $0x80 ; # getpid() }  (average: 983)
 475  153 {movl $20,%%eax ; call *vsyscall ; # getpid() }  (average: 519)

 333  586 {outb %%al,$0x80;}  (average: 369)
3572  1181 {outb %%al,$0x80;outb %%al,$0x80;}  (average: 3628)
6755  1557 {outb %%al,$0x80;outb %%al,$0x80;outb %%al,$0x80;}  (average: 6866)

P2:
cpu family      : 6
model           : 5
model name      : Pentium II (Deschutes)
stepping        : 2
cpu MHz         : 400.982
cache size      : 512 KB

P4:
cpu family      : 15
model           : 4
model name      : Intel(R) Celeron(R) CPU 2.53GHz
stepping        : 1
cpu MHz         : 2533.270
cache size      : 256 KB

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21  9:26 ` Andi Kleen
  2006-06-21  9:35   ` Ingo Molnar
@ 2006-06-21 21:54   ` Rohit Seth
  2006-06-21 22:21     ` Andi Kleen
  1 sibling, 1 reply; 30+ messages in thread
From: Rohit Seth @ 2006-06-21 21:54 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Wed, 2006-06-21 at 11:26 +0200, Andi Kleen wrote:
> Chuck Ebbert <76306.1226@compuserve.com> writes:
> 
> > Use a GDT entry's limit field to store per-cpu data for fast access
> > from userspace, and provide a vsyscall to access the current CPU
> > number stored there.
> 

Very clever.  

> Just the CPU alone is useless - you want at least the node too in many
> cases. Best you use the prototype I proposed earlier for x86-64.
> 

Can we use similar  mechanism to access pda in vsyscall in x86_64 (by
storing the address of pda there).  That way the useful variables like
cpunumber, nodenumber can be accessed easily without doing cpuid (and
without tcache).  The system call can take a flag like GET_CPUNUMBER or
GET_NODENUMBER or GET_NMICOUNT or if anything new gets added in this
structure.

-rohit


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21 21:54   ` Rohit Seth
@ 2006-06-21 22:21     ` Andi Kleen
  2006-06-21 22:59       ` Rohit Seth
  0 siblings, 1 reply; 30+ messages in thread
From: Andi Kleen @ 2006-06-21 22:21 UTC (permalink / raw)
  To: rohitseth; +Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel


> Can we use similar  mechanism to access pda in vsyscall in x86_64 (by
> storing the address of pda there).  


You mean in the kernel? %gs prefix is a lot faster than this.

Also the limit is only 20bit, not enough for a full address.

For user space it's useful though, but I don't see any immediate uses
other than cpu number and node number. For most purposes glibc TLS
(which uses %fs) is probably sufficient.

-Andi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21 22:21     ` Andi Kleen
@ 2006-06-21 22:59       ` Rohit Seth
  2006-06-21 23:05         ` Andi Kleen
  0 siblings, 1 reply; 30+ messages in thread
From: Rohit Seth @ 2006-06-21 22:59 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Thu, 2006-06-22 at 00:21 +0200, Andi Kleen wrote:
> > Can we use similar  mechanism to access pda in vsyscall in x86_64 (by
> > storing the address of pda there).  
> 
> 
> You mean in the kernel? %gs prefix is a lot faster than this.
> 

Yes it is.  And will work if we are okay to swap to kernel gs in
vsyscall code.

> Also the limit is only 20bit, not enough for a full address.
> 

I was thinking of storing it in the base address part of the descriptor and
then using the memory load to read it in vsyscall.  (Keeping the p bit
to zero in the descriptor).

> For user space it's useful though, but I don't see any immediate uses
> other than cpu number and node number. For most purposes glibc TLS
> (which uses %fs) is probably sufficient.

cpu and node number are really important (for the reasons that you
mentioned in your initial mail on vgetcpu).  In addition to that I was
thinking in terms of having some counters like nmi_count that is already
there and per cpu specific.

Besides, not having to use the tcache part in the proposed system call
seems to just make the interface cleaner. 

-rohit




^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21 22:59       ` Rohit Seth
@ 2006-06-21 23:05         ` Andi Kleen
  2006-06-21 23:18           ` Rohit Seth
  0 siblings, 1 reply; 30+ messages in thread
From: Andi Kleen @ 2006-06-21 23:05 UTC (permalink / raw)
  To: rohitseth; +Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Thursday 22 June 2006 00:59, Rohit Seth wrote:
> On Thu, 2006-06-22 at 00:21 +0200, Andi Kleen wrote:
> > > Can we use similar  mechanism to access pda in vsyscall in x86_64 (by
> > > storing the address of pda there).  
> > 
> > 
> > You mean in the kernel? %gs prefix is a lot faster than this.
> > 
> 
> Yes it is.  And will work if we are okay to swap to kernel gs in
> vsyscall code.

swapgs is only allowed in ring 0.

> 
> > Also the limit is only 20bit, not enough for a full address.
> > 
> 
> I was thinking of storing it is base address part of the descriptor and
> then using the memory load to read it in vsyscall.  (Keeping the p bit
> to zero in the descriptor).

I'm still not sure where and for what you want to use this. In user space 
or in kernel space? And what information should be stored in there?

> 
> > For user space it's useful though, but I don't see any immediate uses
> > other than cpu number and node number. For most purposes glibc TLS
> > (which uses %fs) is probably sufficient.
> 
> cpu and node number are really important (for the reasons that you
> mentioned in your initial mail on vgetcpu).  In addition to that I was
> thinking in terms of having some counters like nmi_count that is already
> there and per cpu specific.

For what would you need nmi count in user space?

> Besides, not having to use the tcache part in the proposed system call
> seems to just make the interface cleaner. 

tcache is still far faster than LSL (which is slower than RDTSCP) 

-Andi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21 23:05         ` Andi Kleen
@ 2006-06-21 23:18           ` Rohit Seth
  2006-06-21 23:29             ` Andi Kleen
  0 siblings, 1 reply; 30+ messages in thread
From: Rohit Seth @ 2006-06-21 23:18 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Thu, 2006-06-22 at 01:05 +0200, Andi Kleen wrote:
> On Thursday 22 June 2006 00:59, Rohit Seth wrote:

> > I was thinking of storing it is base address part of the descriptor and
> > then using the memory load to read it in vsyscall.  (Keeping the p bit
> > to zero in the descriptor).
> 
> I'm still not sure where and for what you want to use this. In user space 
> or in kernel space? And what information should be stored in there?
> 

Store the kernel virtual pointer in gdt to access pda in (proposed)
vgetcpu in vsyscall.  Using this pointer we can easily reach the cpu and
node numbers and any other information that is there in pda.  For the
cpu and node numbers this will get rid of the need to do a serializing
operation cpuid.

Does it make any sense?


> > Besides, not having to use the tcache part in the proposed system call
> > seems to just make the interface cleaner. 
> 
> tcache is still far faster than LSL (which is slower than RDTSCP) 

Since we are not using the limits part of the descriptor so lsl will not
be needed.  Though an indirect load from  gdt page will be made.

-rohit


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21 23:18           ` Rohit Seth
@ 2006-06-21 23:29             ` Andi Kleen
  2006-06-22  0:55               ` Rohit Seth
  0 siblings, 1 reply; 30+ messages in thread
From: Andi Kleen @ 2006-06-21 23:29 UTC (permalink / raw)
  To: rohitseth; +Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Thursday 22 June 2006 01:18, Rohit Seth wrote:
> On Thu, 2006-06-22 at 01:05 +0200, Andi Kleen wrote:
> > On Thursday 22 June 2006 00:59, Rohit Seth wrote:
> 
> > > I was thinking of storing it is base address part of the descriptor and
> > > then using the memory load to read it in vsyscall.  (Keeping the p bit
> > > to zero in the descriptor).
> > 
> > I'm still not sure where and for what you want to use this. In user space 
> > or in kernel space? And what information should be stored in there?
> > 
> 
> Store the kernel virtual pointer in gdt to access pda in (proposed)
> vgetcpu in vsyscall. 
> Using this pointer we can easily reach the cpu and 
> node numbers and any other information that is there in pda.  For the
> cpu and node numbers this will get rid of the need to do a serializing
> operation cpuid.
> 
> Does it make any sense?

Ok to spell it out (please correct me if I misinterpreted you). You want to:

- Split PDA into kernel part and user exportable part
- Export user exportable part to ring 3
- Put base address of user exportable part into GDT
- Access it using that.

I don't think it can work because the GDT only supports 32bit
base addresses for code/data segments in long mode and you can't put
a kernel virtual address into 32bit (only user space there) 

And you can't get at at the base address anyways because they
are ignored in long mode (except for fs/gs). For fs/gs you would
need to save/restore them to reuse them which would be slow.

You can't also just put them into fs/gs because those are
already reserved for user space.

Also I don't know what other information other than cpu/node 
would be useful, so just using the 20 bits of limit seems plenty to me.

-Andi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21 23:29             ` Andi Kleen
@ 2006-06-22  0:55               ` Rohit Seth
  2006-06-22  8:08                 ` Andi Kleen
  0 siblings, 1 reply; 30+ messages in thread
From: Rohit Seth @ 2006-06-22  0:55 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Thu, 2006-06-22 at 01:29 +0200, Andi Kleen wrote:
> On Thursday 22 June 2006 01:18, Rohit Seth wrote:
> > On Thu, 2006-06-22 at 01:05 +0200, Andi Kleen wrote:
> > > On Thursday 22 June 2006 00:59, Rohit Seth wrote:
> > 
> > > > I was thinking of storing it is base address part of the descriptor and
> > > > then using the memory load to read it in vsyscall.  (Keeping the p bit
> > > > to zero in the descriptor).
> > > 
> > > I'm still not sure where and for what you want to use this. In user space 
> > > or in kernel space? And what information should be stored in there?
> > > 
> > 
> > Store the kernel virtual pointer in gdt to access pda in (proposed)
> > vgetcpu in vsyscall. 
> > Using this pointer we can easily reach the cpu and 
> > node numbers and any other information that is there in pda.  For the
> > cpu and node numbers this will get rid of the need to do a serializing
> > operation cpuid.
> > 
> > Does it make any sense?
> 
> Ok to spell it out (please correct me if I misinterpreted you). You want to:
> 
> - Split PDA into kernel part and user exportable part

yes.

> - Export user exportable part to ring 3

yes for vsyscall purposes.

> - Put base address of user exportable part into GDT
> - Access it using that.
> 

These are the steps that I'm proposing in vgetcpu:

Read the GDT pointer in vgetcpu code path.  This is the base of gdt
table.
Read descriptor #20 from base.  
This is the pointer to user visible part of per cpu data structure.

Please let me know if I'm missing something here.

Just a side note, in your vgetcpu patch, would it be better to return
the logical CPU number (as printed in /proc/cpuinfo).  Also, I think
applications would be interested in knowing the physical package id for
cores sharing caches.

 
> I don't think it can work because the GDT only supports 32bit
> base addresses for code/data segments in long mode and you can't put
> a kernel virtual address into 32bit (only user space there) 
> 

Really not using the GDT descriptor in terms of  loading it in any
segment register.

> And you can't get at at the base address anyways because they
> are ignored in long mode (except for fs/gs). For fs/gs you would
> need to save/restore them to reuse them which would be slow.
> 
> You can't also just put them into fs/gs because those are
> already reserved for user space.
> 

That is the reason I'm not proposing to alter existing fs/gs.

> Also I don't know what other information other than cpu/node 
> would be useful, so just using the 20 bits of limit seems plenty to me.
> 


physical id (of the package for example) is another useful field.  I
would also like to see number of interrupts serviced by this cpu, page
faults  etc.  But I think that is a separate discussion.

Thanks,
-rohit


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-22  0:55               ` Rohit Seth
@ 2006-06-22  8:08                 ` Andi Kleen
  2006-06-22 21:06                   ` Rohit Seth
  0 siblings, 1 reply; 30+ messages in thread
From: Andi Kleen @ 2006-06-22  8:08 UTC (permalink / raw)
  To: rohitseth; +Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Thursday 22 June 2006 02:55, Rohit Seth wrote:


> > - Put base address of user exportable part into GDT
> > - Access it using that.
>
> These are the steps that I'm proposing in vgetcpu:
>
> Read the GDT pointer in vgetcpu code path.  This is the base of gdt
> table.
> Read descriptor #20 from base.
> This is the pointer to user visible part of per cpu data structure.

> Please let me know if I'm missing something here.

Ok that would probably work, but you would need to export the GDT too.

I still don't see why we should do it - limit should be enough.

> Just a side note, in your vgetcpu patch, would it be better to return
> the logical CPU number (as printed in /proc/cpuinfo).

The latest code does that already - i dropped the cpuid code
completely and replaced it with LSL.

> Also, I think 
> applications would be interested in knowing the physical package id for
> cores sharing caches.

They can always map that themselves using cpuinfo. I would
prefer to not overload the single call too much.

> > And you can't get at at the base address anyways because they
> > are ignored in long mode (except for fs/gs). For fs/gs you would
> > need to save/restore them to reuse them which would be slow.
> >
> > You can't also just put them into fs/gs because those are
> > already reserved for user space.
>
> That is the reason I'm not proposing to alter existing fs/gs.
>
> > Also I don't know what other information other than cpu/node
> > would be useful, so just using the 20 bits of limit seems plenty to me.
>
> physical id (of the package for exmpale) is another useful field. 

Ok I see that, but it could be as well done by a small user space
library that reads cpuinfo once and maps given vgetcpu()

On the other hand I got people complaining who need some more
topology information (like number of cores/cpus), but /proc/cpuinfo
is quite slow and adds a lot of overhead to fast starting programs.

I've been pondering to put some more information about that
in the ELF aux vector, but exporting might work too. I suppose
exporting would require the vDSO first to give a sane interface.

> would also like to see number of interrupts serviced by this cpu, page
> faults  etc.  But I think that is a separate discussion.

Well, the complex mechanism you're proposing above only makes
sense if it is established more fields are needed (and cannot be satisfied
by reserving a few more segment selectors) I admit I'm not
quite convinced yet.

-Andi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-22  8:08                 ` Andi Kleen
@ 2006-06-22 21:06                   ` Rohit Seth
  2006-06-22 22:14                     ` Andi Kleen
  0 siblings, 1 reply; 30+ messages in thread
From: Rohit Seth @ 2006-06-22 21:06 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Thu, 2006-06-22 at 10:08 +0200, Andi Kleen wrote:
> On Thursday 22 June 2006 02:55, Rohit Seth wrote:
> 
> 
> > > - Put base address of user exportable part into GDT
> > > - Access it using that.
> >
> > These are the steps that I'm proposing in vgetcpu:
> >
> > Read the GDT pointer in vgetcpu code path.  This is the base of gdt
> > table.
> > Read descriptor #20 from base.
> > This is the pointer to user visible part of per cpu data structure.
> 
> > Please let me know if I'm missing something here.
> 
> Ok that would probably work, but you would need to export the GDT too.
> 

Would sgdt not be sufficient?  I agree that we will have to end up
giving RO access to user for the gdt page.

> I still don't see why we should do it - limit should be enough.
> 
> > Just a side note, in your vgetcpu patch, would it be better to return
> > the logical CPU number (as printed in /proc/cpuinfo).
> 
> The latest code does that already - i dropped the cpuid code
> completely and replaced it with LSL.
> 

Ah that is good.

> > Also, I think 
> > applications would be interested in knowing the physical package id for
> > cores sharing caches.
> 
> They can always map that themselves using cpuinfo. I would
> prefer to not overload the single call too much.

Yes they can map using /proc/cpuinfo.  But if there is any easier
mechanism then using /proc then that would help.  

I agree that we should not overload a single call (though cpu, package
and node numbers do belong in one category IMO).  We can have multiple
calls if that is required as long as there is an efficient mechanism to
provide that information.

> 
> > > And you can't get at at the base address anyways because they
> > > are ignored in long mode (except for fs/gs). For fs/gs you would
> > > need to save/restore them to reuse them which would be slow.
> > >
> > > You can't also just put them into fs/gs because those are
> > > already reserved for user space.
> >
> > That is the reason I'm not proposing to alter existing fs/gs.
> >
> > > Also I don't know what other information other than cpu/node
> > > would be useful, so just using the 20 bits of limit seems plenty to me.
> >
> > physical id (of the package for exmpale) is another useful field. 
> 
> Ok I see that, but it could be as well done by a small user space
> library that reads cpuinfo once and maps given vgetcpu()
> 

Why maintain that extra logic in user space when kernel can easily give
that information.

> On the other hand I got people complaining who need some more
> topology information (like number of cores/cpus), but /proc/cpuinfo
> is quite slow and adds a lot of overhead to fast starting programs.
> 

This is an excellent point.

> I've been pondering to put some more information about that
> in the ELF aux vector, but exporting might work too. I suppose
> exporting would require the vDSO first to give a sane interface.
> 
Can you please tell me what more information you are thinking of putting
in aux vector?

> > would also like to see number of interrupts serviced by this cpu, page
> > faults  etc.  But I think that is a separate discussion.
> 
> Well, the complex mechanism you're proposing above only makes

complex---no.  But sure that it is not as simple as lsl.

> sense if it is established more fields are needed (and cannot be satisfied
> by reserving a few more segment selectors) I admit I'm not
> quite convinced yet.

You are absolutely right that the mechanism I'm proposing makes sense
only if we have more fields AND if any of those fields are dynamically
changing.  But this is a generic mechanism that could be extended to
share any user visible information in efficient way.  Once we have this
in place then information like whole cpuinfo, percpu interrupts etc. can
be retrieved easily.

-rohit


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-22 21:06                   ` Rohit Seth
@ 2006-06-22 22:14                     ` Andi Kleen
  2006-06-22 23:10                       ` Rohit Seth
  0 siblings, 1 reply; 30+ messages in thread
From: Andi Kleen @ 2006-06-22 22:14 UTC (permalink / raw)
  To: rohitseth
  Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel, discuss


> Would sgdt not be sufficient?  I agree that we will have to end up
> giving RO access to user for the gdt page.

I meant exporting the GDT page

> I agree that we should not overload a single call (though cpu, package
> and node numbers do belong in one category IMO).  We can have multiple
> calls if that is required as long as there is an efficient mechanism to
> provide that information.

The current mechanism doesn't scale to much more calls, but I guess
i'll have to do a vDSO sooner or later.
 
> Why maintain that extra logic in user space when kernel can easily give
> that information.

It already does.
 
> > I've been pondering to put some more information about that
> > in the ELF aux vector, but exporting might work too. I suppose
> > exporting would require the vDSO first to give a sane interface.
> > 
> Can you please tell me what more information you are thinking of putting
> in aux vector?

One proposal (not fully fleshed out was) number of siblings / sockets / nodes 
I don't think bitmaps would work well there (and if someone really needs
those they can read cpuinfo again) 

This is mostly for OpenMP and tuning of a few functions (e.g. on AMD
the memory latencies varies with the number of nodes so some functions
can be tuned in different ways based on that) 

> You are absolutely right that the mechanism I'm proposing makes sense
> only if we have more fields AND if any of those fields are dynamically
> changing.  But this is a generic mechanism that could be extended to
> share any user visible information in efficient way.  Once we have this
> in place then information like whole cpuinfo, percpu interrupts etc. can
> be retrieved easily.

The problem with exposing too much is that it might be a nightmare
to guarantee a stable ABI for this. At least it would
constrain the kernel internally. Probably less is better here. 

Also I'm still not sure why user space should care about interrupts?

-Andi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-22 22:14                     ` Andi Kleen
@ 2006-06-22 23:10                       ` Rohit Seth
  2006-06-23 12:42                         ` [discuss] " Andi Kleen
  0 siblings, 1 reply; 30+ messages in thread
From: Rohit Seth @ 2006-06-22 23:10 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel, discuss

On Fri, 2006-06-23 at 00:14 +0200, Andi Kleen wrote:
> > Would sgdt not be sufficient?  I agree that we will have to end up
> > giving RO access to user for the gdt page.
> 
> I meant exporting the GDT page
> 

Yes indeed.  That shouldn't be an issue though.

> > I agree that we should not overload a single call (though cpu, package
> > and node numbers do belong in one category IMO).  We can have multiple
> > calls if that is required as long as there is an efficient mechanism to
> > provide that information.
> 
> The current mechanism doesn't scale to much more calls, but I guess
> i'll have to do a vDSO sooner or later.
>  
> > Why maintain that extra logic in user space when kernel can easily give
> > that information.
> 
> It already does.
>  

I'm missing your point here.  How and where?

> > > I've been pondering to put some more information about that
> > > in the ELF aux vector, but exporting might work too. I suppose
> > > exporting would require the vDSO first to give a sane interface.
> > > 
> > Can you please tell me what more information you are thinking of putting
> > in aux vector?
> 
> One proposal (not fully fleshed out was) number of siblings / sockets / nodes 
> I don't think bitmaps would work well there (and if someone really needs
> those they can read cpuinfo again) 
> 

This is exactly the point, why do that expensive /proc operation when
you can do a quick vsyscall and get all of that information.  I'm not
sure if Aux is the right direction.

> This is mostly for OpenMP and tuning of a few functions (e.g. on AMD
> the memory latencies varies with the number of nodes so some functions
> can be tuned in different ways based on that) 
> 
> > You are absolutely right that the mechanism I'm proposing makes sense
> > only if we have more fields AND if any of those fields are dynamically
> > changing.  But this is a generic mechanism that could be extended to
> > share any user visible information in efficient way.  Once we have this
> > in place then information like whole cpuinfo, percpu interrupts etc. can
> > be retrieved easily.
> 
> The problem with exposing too much is that it might be a nightmare
> to guarantee a stable ABI for this. At least it would
> constrain the kernel internally. Probably less is better here. 
> 

There will be (in all probability) requests to include as much as
possible, but I think that should be manageable with sensible API.

> Also I'm still not sure why user space should care about interrupts?
> 
Okay. I just cooked that example for some monitoring process to find out
the interrupts /sec on that CPU.  But as you mentioned above sibling,
sockets, nodes, flags, and even other characteristics like current
p-state are all important information that will help applications
sitting in user land (even if some of them will be used only couple of
times in the life of a process).

Side note: I don't want to delay the vgetcpu call into mainline because
of this discussion (as long as there is no cpuid and tcache in that
call).

-rohit


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [discuss] Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-22 23:10                       ` Rohit Seth
@ 2006-06-23 12:42                         ` Andi Kleen
  2006-06-24  2:06                           ` Rohit Seth
  0 siblings, 1 reply; 30+ messages in thread
From: Andi Kleen @ 2006-06-23 12:42 UTC (permalink / raw)
  To: discuss, rohitseth
  Cc: Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Friday 23 June 2006 01:10, Rohit Seth wrote:

> > > I agree that we should not overload a single call (though cpu, package
> > > and node numbers do belong in one category IMO).  We can have multiple
> > > calls if that is required as long as there is an efficient mechanism to
> > > provide that information.
> > 
> > The current mechanism doesn't scale to much more calls, but I guess
> > i'll have to do a vDSO sooner or later.
> >  
> > > Why maintain that extra logic in user space when kernel can easily give
> > > that information.
> > 
> > It already does.
> >  
> 
> I'm missing your point here.  How and where?

In /proc/cpuinfo. 

Suresh and others even put a lot of thought into how to present the information
there.

Or did you just refer to the overhead of writing a /proc parser?
 
> > > > I've been pondering to put some more information about that
> > > > in the ELF aux vector, but exporting might work too. I suppose
> > > > exporting would require the vDSO first to give a sane interface.
> > > > 
> > > Can you please tell me what more information you are thinking of putting
> > > in aux vector?
> > 
> > One proposal (not fully fleshed out was) number of siblings / sockets / nodes 
> > I don't think bitmaps would work well there (and if someone really needs
> > those they can read cpuinfo again) 
> > 
> 
> This is exactly the point, why do that expensive /proc operation when
> you can do a quick vsyscall and get all of that information.  I'm not
> sure if Aux is the right direction.

It's already used for this at least (hwcap etc.) 

vDSO might be better too, but I haven't thought too much about it yet

> 
> > This is mostly for OpenMP and tuning of a few functions (e.g. on AMD
> > the memory latencies varies with the number of nodes so some functions
> > can be tuned in different ways based on that) 
> > 
> > > You are absolutely right that the mechanism I'm proposing makes sense
> > > only if we have more fields AND if any of those fields are dynamically
> > > changing.  But this is a generic mechanism that could be extended to
> > > share any user visible information in efficient way.  Once we have this
> > > in place then information like whole cpuinfo, percpu interrupts etc. can
> > > be retrieved easily.
> > 
> > The problem with exposing too much is that it might be a nightmare
> > to guarantee a stable ABI for this. At least it would
> > constrain the kernel internally. Probably less is better here. 
> > 
> 
> There will be (in all probability) requests to include as much as
> possible, 

Yes but that doesn't mean all these requests make sense and should
be actually followed :)


> but I think that should be manageable with sensible API.

Not sure. Leaner interfaces are really better here.

It's one of the lessons I learned from libnuma - i provide a lot of tools,
but nearly all people are perfectly satisfied with the total basics. So
it's better to start small and only add stuff when there is really a clear
use case.


> Okay. I just cooked that example for some monitoring process to find out
> the interrupts /sec on that CPU.  But as you mentioned above sibling,
> sockets, nodes, flags, and even other characteristics like current
> p-state are all important information that will help applications
> sitting in user land (even if some of them will be used only couple of
> times in the life of a process).

Ok you want faster monitoring applications? Some faster way than 
/proc for some stuff probably makes sense - but I don't think shared
mappings are the right way for it.

There's still a lot of other possibilities for this like relayfs 
or binary /proc files 
 
> Side note: I don't want to delay the vgetcpu call into mainline because
> of this discussion
I'll probably delay it after 2.6.18

> (as long as there is no cpuid and tcache in that 
> call).

What do you not like about tcache? 

-Andi


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [discuss] Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-23 12:42                         ` [discuss] " Andi Kleen
@ 2006-06-24  2:06                           ` Rohit Seth
  2006-06-24  8:42                             ` Andi Kleen
  0 siblings, 1 reply; 30+ messages in thread
From: Rohit Seth @ 2006-06-24  2:06 UTC (permalink / raw)
  To: Andi Kleen
  Cc: discuss, Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Fri, 2006-06-23 at 14:42 +0200, Andi Kleen wrote:
> On Friday 23 June 2006 01:10, Rohit Seth wrote:
> 
> > > > I agree that we should not overload a single call (though cpu, package
> > > > and node numbers do belong in one category IMO).  We can have multiple
> > > > calls if that is required as long as there is an efficient mechanism to
> > > > provide that information.
> > > 
> > > The current mechanism doesn't scale to much more calls, but I guess
> > > i'll have to do a vDSO sooner or later.
> > >  
> > > > Why maintain that extra logic in user space when kernel can easily give
> > > > that information.
> > > 
> > > It already does.
> > >  
> > 
> > I'm missing your point here.  How and where?
> 
> In /proc/cpuinfo. 
> 
> Suresh and others even put a lot of thought into how to present the information
> there.
> 

That part I know very well :)

> Or did you just refer to the overhead of writing a /proc parser?
Yes exactly.

>  
> > > > > I've been pondering to put some more information about that
> > > > > in the ELF aux vector, but exporting might work too. I suppose
> > > > > exporting would require the vDSO first to give a sane interface.
> > > > > 
> > > > Can you please tell me what more information you are thinking of putting
> > > > in aux vector?
> > > 
> > > One proposal (not fully fleshed out was) number of siblings / sockets / nodes 
> > > I don't think bitmaps would work well there (and if someone really needs
> > > those they can read cpuinfo again) 
> > > 
> > 
> > This is exactly the point, why do that expensive /proc operation when
> > you can do a quick vsyscall and get all of that information.  I'm not
> > sure if Aux is the right direction.
> 
> It's already used for this at least (hwcap etc.) 
> 
> vDSO might be better too, but I haven't thought too much about it yet
> 

I like the vDSO interface.

> > 
> > There will be (in all probability) requests to include as much as
> > possible, 
> 
> Yes but that doesn't mean all these requests make sense and should
> be actually followed :)
> 
> 
> > but I think that should be manageable with sensible API.
> 
> Not sure. Leaner interfaces are really better here.
> 

yes. some sample implementation will be good here.  I'll try.

> It's one of the lessons I learned from libnuma - i provide a lot of tools,
> but nearly all people are perfectly satisfied with the total basics. So
> it's better to start small and only add stuff when there is really a clear
> use case.
> 

I'm not sure smaller libnuma would have gone too far at that time.
Though in this case you are starting small by just giving the vgetcpu
system call. And then we are going to look at other exportable data.

> 
> > Okay. I just cooked that example for some monitoring process to find out
> > the interrupts /sec on that CPU.  But as you mentioned above sibling,
> > sockets, nodes, flags, and even other characteristics like current
> > p-state are all important information that will help applications
> > sitting in user land (even if some of them will be used only couple of
> > times in the life of a process).
> 
> Ok you want faster monitoring applications? Some faster way than 
> /proc for some stuff probably makes sense - but I don't think shared
> mappings are the right way for it.
> 
> There's still a lot of other possibilities for this like relayfs 
> or binary /proc files 

/proc et.al just add overhead in retrieving this information. For things
that are easily available in kernel vDSO will help.

>  
> > Side note: I don't want to delay the vgetcpu call into mainline because
> > of this discussion

> I'll probably delay it after 2.6.18
> 

Why?

> > (as long as there is no cpuid and tcache in that 
> > call).
> 
> What do you not like about tcache? 
> 

It just does not sound like a right interface.  Why should an app be
giving the last time value that it asked for the same information.  User
wants cpu, package and node numbers and those are the three parameters
that should be there.  Besides if we are using lsl then the latency part
of cpuid is already gone so no need to optimize this any more.

Though this would be a good interface to export jiffies ;-)

-rohit





^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [discuss] Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-24  2:06                           ` Rohit Seth
@ 2006-06-24  8:42                             ` Andi Kleen
  2006-06-27  1:13                               ` Rohit Seth
  0 siblings, 1 reply; 30+ messages in thread
From: Andi Kleen @ 2006-06-24  8:42 UTC (permalink / raw)
  To: rohitseth
  Cc: discuss, Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel


> It just does not sound like a right interface.  Why should an app be
> giving the last time value that it asked for the same information.  

First this information comes with a good-before date stamp
so it's natural. Otherwise the application will never pick
up when the scheduler decides to schedule it somewhere else,
which would be bad.

And that came from conversation with application developers.

A: We want something to get the current node
me: how fast does it need to be? 
B: we will cache it anyways.

Problem is that normally the application can't do a good job
at doing the cache because it doesn't have a fast way to 
do time stamping (gettimeofday would be too slow and it's
the fastest timer available short of having a second thread
that sleeps and updates a counter) 

But the vsyscall incidentially knows this because of it
sharing data with  vgettimeofday(), so it can
do the job for the application

> User 
> wants cpu, package and node numbers and those are the three parameters
> that should be there.  Besides if we are using lsl then the latency part
> of cpuid is already gone so no need to optimize this any more.
>
> Though this will be good interface to export jiffies ;-)

No - jiffies don't have a defined unit and might even go away
on a fully tickless kernel.

If we just exported jiffies you would get lots of HZ dependent
programs.

-Andi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [discuss] Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-24  8:42                             ` Andi Kleen
@ 2006-06-27  1:13                               ` Rohit Seth
  0 siblings, 0 replies; 30+ messages in thread
From: Rohit Seth @ 2006-06-27  1:13 UTC (permalink / raw)
  To: Andi Kleen
  Cc: discuss, Chuck Ebbert, Linus Torvalds, Ingo Molnar, linux-kernel

On Sat, 2006-06-24 at 10:42 +0200, Andi Kleen wrote:
> > It just does not sound like a right interface.  Why should an app be
> > giving the last time value that it asked for the same information.  
> 
> First this information comes with a good-before date stamp
> so it's natural. Otherwise the application will never pick
> up when the scheduler decides to schedule it somewhere else,
> which would be bad.
> 

Though the rescheduling can happen at any time.  I'm not sure how tcache
is going to track rescheduling deterministically.  In theory there are
always going to be those pathological cases which will be very difficult
to get right (with or without tcache).


> And that came from conversation with application developers.
> 
> A: We want something to get the current node
> me: how fast does it need to be? 
> B: we will cache it anyways.
> 
> Problem is that normally the application can't do a good job
> at doing the cache because it doesn't have a fast way to 
> do time stamping (gettimeofday would be too slow and it's
> the fastest timer available short of having a second thread
> that sleeps and updates a counter) 
> 
> But the vsyscall incidentially knows this because of it
> sharing data with  vgettimeofday(), so it can
> do the job for the application
> 

I think I probably read your patch wrong or an earlier version where
user was sending the tcache down to vsyscall.  Is user sending the
tcache parameter containing the last jiffies down to vsyscall in your
latest patch?  Could you please point me to latest patch.

I think the system call is going to come with caveat that the
information provided by vgetcpu could be stale as the process could have
moved to different CPU before returning information.  

> > User 
> > wants cpu, package and node numbers and those are the three parameters
> > that should be there.  Besides if we are using lsl then the latency part
> > of cpuid is already gone so no need to optimize this any more.
> >

> > Though this will be good interface to export jiffies ;-)
> 
> No - jiffies don't have a defined unit and might even go away
> on a fully tickless kernel.
> 
> If we just exported jiffies you would get lots of HZ dependent
> programs.


I agree and that is why I don't think we should export anything relating
to jiffies to external world (may be my smiley selection wasn't right).
This goes back to my understanding that tcache is user visible.

-rohit


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21  8:15 ` Ingo Molnar
  2006-06-21 17:38   ` Artur Skawina
@ 2006-06-28  5:44   ` Paul Jackson
  2006-06-28  8:53     ` Andi Kleen
  1 sibling, 1 reply; 30+ messages in thread
From: Paul Jackson @ 2006-06-28  5:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: 76306.1226, linux-kernel, torvalds, ak, drepper, roland, jakub

> but my gut feeling is that we should add a proper sys_get_cpu() syscall 

Yes - this should be for more or less all arch's.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-28  5:44   ` Paul Jackson
@ 2006-06-28  8:53     ` Andi Kleen
  2006-06-28  9:00       ` Ingo Molnar
  0 siblings, 1 reply; 30+ messages in thread
From: Andi Kleen @ 2006-06-28  8:53 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Ingo Molnar, 76306.1226, linux-kernel, torvalds, drepper, roland, jakub

On Wednesday 28 June 2006 07:44, Paul Jackson wrote:
> > but my gut feeling is that we should add a proper sys_get_cpu() syscall 
> 
> Yes - this should be for more or less all arch's.

The whole point of the original implementation is to do a fast architecture specific call.
A slow generic call isn't very useful.

-Andi
 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-28  8:53     ` Andi Kleen
@ 2006-06-28  9:00       ` Ingo Molnar
  2006-06-29  8:47         ` Paul Jackson
  0 siblings, 1 reply; 30+ messages in thread
From: Ingo Molnar @ 2006-06-28  9:00 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Paul Jackson, 76306.1226, linux-kernel, torvalds, drepper, roland, jakub


* Andi Kleen <ak@suse.de> wrote:

> On Wednesday 28 June 2006 07:44, Paul Jackson wrote:
> > > but my gut feeling is that we should add a proper sys_get_cpu() syscall 
> > 
> > Yes - this should be for more or less all arch's.
> 
> The whole point of the original implementation is to do a fast 
> architecture specific call. A slow generic call isn't very useful.

it's useful in terms of userspace uniformity. It's a lot easier to expose 
such APIs via glibc if there's a generic implementation everywhere. 
Obviously every arch is encouraged to optimize it into a vsyscall.

	Ingo

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-28  9:00       ` Ingo Molnar
@ 2006-06-29  8:47         ` Paul Jackson
  0 siblings, 0 replies; 30+ messages in thread
From: Paul Jackson @ 2006-06-29  8:47 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: ak, 76306.1226, linux-kernel, torvalds, drepper, roland, jakub

Ingo wrote:
> it's useful in terms of userspace uniformity.

Yes.  It's an important property of Linux that it
provides a common, portable API for all arch's,
except where the obvious semantics (not performance)
of a call are necessarily arch-specific.

Just coding up system calls for those arch's that
happen to run a particular call super-fast, even
though the call makes logical sense on all arch's,
would lead to API chaos and impede application
portability.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-22 12:23 Chuck Ebbert
@ 2006-06-22 12:44 ` Andi Kleen
  0 siblings, 0 replies; 30+ messages in thread
From: Andi Kleen @ 2006-06-22 12:44 UTC (permalink / raw)
  To: Chuck Ebbert
  Cc: linux-kernel, Linus Torvalds, Ulrich Drepper, Roland McGrath,
	Jakub Jelinek, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 2452 bytes --]

On Thursday 22 June 2006 14:23, Chuck Ebbert wrote:
> In-Reply-To: <200606211914.37137.ak@suse.de>
> 
> On Wed, 21 Jun 2006 19:14:37 +0200, Andi Kleen wrote:
> 
> >> 
> >> /* test how fast lsl/jnz/and runs.
> >>  */
> >> #define _GNU_SOURCE
> >> #include <stdio.h>
> >> #include <stdlib.h>
> >> 
> >> #define rdtscll(t)   asm volatile ("rdtsc" : "=A" (t))
> >> 
> >> #ifndef ITERS
> >> #define ITERS        1000000
> >> #endif
> >> 
> >> int main(int argc, char * const argv[])
> >> {
> >>      unsigned long long tsc1, tsc2;
> >>      int count, cpu, junk;
> >> 
> >>      rdtscll(tsc1);
> >>      asm (
> >>              "       pushl %%ds              \n"
> >>              "       popl %2                 \n"
> >>              "1:                             \n"
> >> #ifdef DO_TEST
> >>              "       lsl %2,%0               \n"
> >>              "       jnz 2f                  \n"
> >>              "       and $0xff,%0            \n"
> >> #endif
> >>              "       dec %1                  \n"
> >>              "       jnz 1b                  \n"
> >>              "2:                             \n"
> >>              : "=&r" (cpu), "=&r" (count), "=&r" (junk)
> >>              : "1" (ITERS), "0" (-1)
> >>      );
> >>      rdtscll(tsc2);
> >
> > Measuring this way is a bad idea because you get far too much 
> > noise from the RDTSCs. Usually you need to put a a few thousands entry 
> > loop inside the RDTSCP and devide the result by the loop count
> 
> I got tired of people (namely me) forgetting to compile the C code
> with optimization, so I did the loop in assembler.  It does 1000000
> iterations by default.  Later I added the DO_TEST that lets you test
> the empty loop just because I was curious.
> 
> A more realistic test with the two 'mov' instructions inside the loops
> still only takes 16 clocks, so I'm wondering why you get 60?  Does the
> vsyscall add that much overhead?  With this I get 29-30 clocks per loop
> on Pentium II:

This is the x86-64 test code I used. It's basically an emulation of the vsyscall
(including indirect call) in user space.

rdtscp shows less cycles, so it's not all overhead of the infrastructure.

-Andi


K8 E stepping: 

getpid 168 cycles
vgetcpu lsl 79 cycles
vgetcpu cached 15 cycles

K8 F stepping:

getpid 162 cycles
vgetcpu lsl 77 cycles
vgetcpu rdtscp 32 cycles
vgetcpu cached 15 cycles

Nocona: 

getpid 1491 cycles
vgetcpu lsl 130 cycles
vgetcpu cached 26 cycles



[-- Attachment #2: tvgetcpu.c --]
[-- Type: text/x-csrc, Size: 2575 bytes --]

#include <asm/msr.h>
#include <asm/unistd.h>
#include <stdio.h>

/* errno must exist because we expand the raw _syscall0() stub below
   instead of going through glibc's syscall wrappers. */
int errno;

/* Expands to a bare `int getpid(void)` syscall stub (old <asm/unistd.h>
   idiom), so the benchmark measures true kernel-entry cost with no
   libc caching of the pid. */
_syscall0(int,getpid)

/* RDTSCP emitted as raw opcode bytes (0f 01 f9) because assemblers of
   the day did not know the mnemonic: low/high get the TSC halves (eax/edx),
   aux gets TSC_AUX (ecx), which the kernel loads with a per-CPU value. */
#define rdtscp(low,high,aux) \
     asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (low), "=d" (high), "=c" (aux))

/* 64-bit variant of the above: combines edx:eax into val, aux as before. */
#define rdtscpll(val, aux) do { \
     unsigned long __a, __d; \
     asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (__a), "=d" (__d), "=c" (aux)); \
(val) = (__d << 32) | __a; \
} while (0)


/* Iterations per timed loop in main(). */
enum {
	ITER = 100000,
}; 

/* Stand-ins for the kernel-shared vsyscall data this test emulates:
   __jiffies mimics the exported jiffies word, __vgetcpu_mode selects the
   RDTSCP path when set, __cpu_to_node would map APIC-derived CPU numbers
   to NUMA nodes on the (disabled) CPUID fallback path. */
long __jiffies;
enum {
	VGETCPU_RDTSCP = 1,
} __vgetcpu_mode;
unsigned char __cpu_to_node[32];

/*
 * User-space emulation of the proposed x86-64 vgetcpu() vsyscall.
 *
 * Produces one packed 32-bit value p: CPU number in the low 16 bits,
 * NUMA node in the high 16 bits (see the unpacking at the end).
 *
 * @cpu:    out-parameter for the current CPU number; may be NULL.
 * @node:   out-parameter for the current node number; may be NULL.
 * @tcache: optional two-word user cache: tcache[0] holds the jiffies
 *          stamp of the last lookup, tcache[1] the packed cpu/node
 *          value; pass NULL to force a fresh lookup every call.
 *
 * Always returns 0.  Results are inherently best-effort: the task can
 * be migrated to another CPU at any point after the value is read.
 */
long do_vgetcpu(int *cpu, int *node, unsigned long *tcache)
{
       unsigned int dummy, p;
       unsigned long j = __jiffies;

       /* Fast cache - only recompute value once per jiffies and avoid
          relatively costly rdtscp/cpuid otherwise.
          This works because the scheduler usually keeps the process
          on the same CPU and this syscall doesn't guarantee its
          results anyways.
          We do this here because otherwise user space would do it on
          its own in a likely inferior way (no access to jiffies).
          If you don't like it pass NULL. */
       if (tcache && tcache[0] == j) {
               p = tcache[1];
       } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
               /* RDTSCP path: TSC_AUX (returned in ecx) carries the packed
                  cpu/node value the kernel programmed per CPU; the two TSC
                  halves are discarded into dummy. */
               rdtscp(dummy, dummy, p);
       } else {
#if 1
		/* LSL path: read the segment-limit field of GDT selector 15
		   (15 * 8), where the accompanying kernel patch stores the
		   packed per-CPU value -- NOTE(review): selector index
		   assumed per that patch; confirm against the kernel side. */
		asm("lsl %1,%0" : "=r" (p) : "r" (15 * 8));
#else
               /* Disabled CPUID fallback: initial APIC id from CPUID leaf 1
                  ebx[31:24], node looked up via __cpu_to_node[]. */
               cpuid(1, &dummy, &p, &dummy, &dummy);
               p >>= 24;
               p |= (__cpu_to_node[p] << 16);
#endif
       }
       /* Refresh the user cache unconditionally, even on a cache hit
          (rewriting the same values is harmless). */
       if (tcache) {
               tcache[0] = j;
               tcache[1] = p;
       }
       if (cpu)
               *cpu = p & 0xffff;
       if (node)
               *node = p >> 16;
       return 0;
 }

long (*vgetcpu)(int *cpu, int *node, unsigned long *tcache) = do_vgetcpu;


/*
 * Benchmark driver: reports the average cycle cost per call of
 *   - the getpid() syscall (baseline kernel-entry cost),
 *   - vgetcpu via the LSL path (no cache),
 *   - vgetcpu via RDTSCP (only when enabled by the #if 0 block below),
 *   - vgetcpu with the jiffies-stamped two-word user cache.
 * Each figure is (TSC delta) / ITER, printed one per line.
 */
int main(void)
{
	unsigned long t0, t1;
	int loop;
	int use_rdtscp = 0;	/* renamed: original shadowed the rdtscp() macro */
	int cpu, node;
	unsigned long cache[2];

#if 0
	use_rdtscp = 1;
#endif

	/* Baseline: real syscall round-trip. */
	rdtscll(t0);
	for (loop = 0; loop < ITER; loop++)
		getpid();
	rdtscll(t1);
	printf("getpid %lu cycles\n", (t1 - t0) / ITER);

	/* vgetcpu, LSL path, caching disabled. */
	rdtscll(t0);
	for (loop = 0; loop < ITER; loop++)
		vgetcpu(&cpu, &node, NULL);
	rdtscll(t1);
	printf("vgetcpu lsl %lu cycles\n", (t1 - t0) / ITER);

	if (use_rdtscp) {
		/* vgetcpu, RDTSCP path, caching disabled. */
		__vgetcpu_mode = VGETCPU_RDTSCP;

		rdtscll(t0);
		for (loop = 0; loop < ITER; loop++)
			vgetcpu(&cpu, &node, NULL);
		rdtscll(t1);
		printf("vgetcpu rdtscp %lu cycles\n", (t1 - t0) / ITER);
	}

	/* vgetcpu with the caller-supplied cache words. */
	rdtscll(t0);
	for (loop = 0; loop < ITER; loop++)
		vgetcpu(&cpu, &node, cache);
	rdtscll(t1);
	printf("vgetcpu cached %lu cycles\n", (t1 - t0) / ITER);

	return 0;
}



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
@ 2006-06-22 12:23 Chuck Ebbert
  2006-06-22 12:44 ` Andi Kleen
  0 siblings, 1 reply; 30+ messages in thread
From: Chuck Ebbert @ 2006-06-22 12:23 UTC (permalink / raw)
  To: Andi Kleen
  Cc: linux-kernel, Linus Torvalds, Ulrich Drepper, Roland McGrath,
	Jakub Jelinek, Ingo Molnar

In-Reply-To: <200606211914.37137.ak@suse.de>

On Wed, 21 Jun 2006 19:14:37 +0200, Andi Kleen wrote:

>> 
>> /* test how fast lsl/jnz/and runs.
>>  */
>> #define _GNU_SOURCE
>> #include <stdio.h>
>> #include <stdlib.h>
>> 
>> #define rdtscll(t)   asm volatile ("rdtsc" : "=A" (t))
>> 
>> #ifndef ITERS
>> #define ITERS        1000000
>> #endif
>> 
>> int main(int argc, char * const argv[])
>> {
>>      unsigned long long tsc1, tsc2;
>>      int count, cpu, junk;
>> 
>>      rdtscll(tsc1);
>>      asm (
>>              "       pushl %%ds              \n"
>>              "       popl %2                 \n"
>>              "1:                             \n"
>> #ifdef DO_TEST
>>              "       lsl %2,%0               \n"
>>              "       jnz 2f                  \n"
>>              "       and $0xff,%0            \n"
>> #endif
>>              "       dec %1                  \n"
>>              "       jnz 1b                  \n"
>>              "2:                             \n"
>>              : "=&r" (cpu), "=&r" (count), "=&r" (junk)
>>              : "1" (ITERS), "0" (-1)
>>      );
>>      rdtscll(tsc2);
>
> Measuring this way is a bad idea because you get far too much 
> noise from the RDTSCs. Usually you need to put a a few thousands entry 
> loop inside the RDTSCP and devide the result by the loop count

I got tired of people (namely me) forgetting to compile the C code
with optimization, so I did the loop in assembler.  It does 1000000
iterations by default.  Later I added the DO_TEST that lets you test
the empty loop just because I was curious.

A more realistic test with the two 'mov' instructions inside the loops
still only takes 16 clocks, so I'm wondering why you get 60?  Does the
vsyscall add that much overhead?  With this I get 29-30 clocks per loop
on Pentium II:


/* vgetcpu.c: test how fast vgetcpu runs
 * boot kernel with vgetcpu patch first, then build this:
 *  gcc -O3 -o vgetcpu vgetcpu.c <srcpath>/arch/i386/kernel/vsyscall-int80.so
 * (don't forget the optimization (-O3))
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

extern int __vgetcpu(void);

#define rdtscll(t)      asm("rdtsc" : "=A" (t))

/*
 * Times __vgetcpu() (from the vsyscall .so linked in at build time) over
 * ~1e6 iterations and prints the average TSC cycles per call.
 *
 * Fix: tsc1/tsc2 were `long long` (signed) but printed with %llu, a
 * printf format/argument type mismatch (undefined behavior per C11
 * 7.21.6.1p9).  The TSC is an unsigned 64-bit counter anyway, so make
 * the variables unsigned long long to match the conversion specifier.
 */
int main(int argc, char * const argv[])
{
        unsigned long long tsc1, tsc2;  /* unsigned: TSC values, printed with %llu */
        int i, iters = 999999;

        rdtscll(tsc1);
        for (i = 0; i < iters; i++)
                __vgetcpu();
        rdtscll(tsc2);

        /* iters promotes to unsigned long long in the division. */
        printf("loops: %d, avg: %llu\n", iters, (tsc2 - tsc1) / iters);

        return 0;
}
-- 
Chuck
 "You can't read a newspaper if you can't read."  --George W. Bush

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21 17:27   ` Linus Torvalds
@ 2006-06-21 17:50     ` Andi Kleen
  0 siblings, 0 replies; 30+ messages in thread
From: Andi Kleen @ 2006-06-21 17:50 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Chuck Ebbert, Ingo Molnar, Jakub Jelinek, Roland McGrath,
	Ulrich Drepper, linux-kernel

On Wednesday 21 June 2006 19:27, Linus Torvalds wrote:
> 
> On Wed, 21 Jun 2006, Andi Kleen wrote:
> > 
> > My measurements show different - i get 60+ cycles on K8 and 150+ cycles
> > on P4. That is with a full vsyscall around it. However it is still
> > far better than CPUID, however slower than RDTSCP on those CPUs that support it.
> > 
> > I changed the CPUID fallback path to use LSL on x86-64
> 
> One note of warning: 
> 
> Playing "clever games" has a real tendency to suck badly eventually. I'm 
> betting LSL is pretty damn low on any list of instructions to be optimized 
> by the CPU core, so it would tend to always be microcoded, while other ops 
> might get faster.

Any way we use to get the current CPU number is microcoded.
Unlike RDTSCP and CPUID, LSL is not defined to flush any pipelines, fortunately.
And with the cache it is not THAT critical.

> > Measuring this way is a bad idea because you get far too much 
> > noise from the RDTSCs. Usually you need to put a a few thousands entry 
> > loop inside the RDTSCP and devide the result by the loop count
> 
> And measuring that way isn't perfect either, because it tends to show you 
> how well an instruction works in that particular instruction mix, but not 
> necessarily in real life.
> 
> Benchmarking single instructions is simply damn hard. It's often better to 
> try to find a real load where that particular sequence is important enough 
> to be measurable at all, and then try the alternatives. Not perfect 
> either, but if you can't find such a load, maybe you shouldn't be doing it 
> in the first place.. And if you _can_ find such a real load, at least you 
> measured something that was actually real.

I benchmarked it in a faithful simulation of a x86-64 vsyscall

-Andi

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21 17:14 ` Andi Kleen
@ 2006-06-21 17:27   ` Linus Torvalds
  2006-06-21 17:50     ` Andi Kleen
  0 siblings, 1 reply; 30+ messages in thread
From: Linus Torvalds @ 2006-06-21 17:27 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Chuck Ebbert, Ingo Molnar, Jakub Jelinek, Roland McGrath,
	Ulrich Drepper, linux-kernel



On Wed, 21 Jun 2006, Andi Kleen wrote:
> 
> My measurements show different - i get 60+ cycles on K8 and 150+ cycles
> on P4. That is with a full vsyscall around it. However it is still
> far better than CPUID, however slower than RDTSCP on those CPUs that support it.
> 
> I changed the CPUID fallback path to use LSL on x86-64

One note of warning: 

Playing "clever games" has a real tendency to suck badly eventually. I'm 
betting LSL is pretty damn low on any list of instructions to be optimized 
by the CPU core, so it would tend to always be microcoded, while other ops 
might get faster.

> Measuring this way is a bad idea because you get far too much 
> noise from the RDTSCs. Usually you need to put a few-thousand-entry 
> loop inside the RDTSCs and divide the result by the loop count

And measuring that way isn't perfect either, because it tends to show you 
how well an instruction works in that particular instruction mix, but not 
necessarily in real life.

Benchmarking single instructions is simply damn hard. It's often better to 
try to find a real load where that particular sequence is important enough 
to be measurable at all, and then try the alternatives. Not perfect 
either, but if you can't find such a load, maybe you shouldn't be doing it 
in the first place.. And if you _can_ find such a real load, at least you 
measured something that was actually real.

		Linus

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
  2006-06-21 12:24 Chuck Ebbert
@ 2006-06-21 17:14 ` Andi Kleen
  2006-06-21 17:27   ` Linus Torvalds
  0 siblings, 1 reply; 30+ messages in thread
From: Andi Kleen @ 2006-06-21 17:14 UTC (permalink / raw)
  To: Chuck Ebbert
  Cc: Ingo Molnar, Jakub Jelinek, Roland McGrath, Ulrich Drepper,
	Linus Torvalds, linux-kernel

On Wednesday 21 June 2006 14:24, Chuck Ebbert wrote:
> In-Reply-To: <20060621081539.GA14227@elte.hu>
> 
> On Wed, 21 Jun 2006 10:15:39 +0200, Ingo Molnar wrote:
> 
> > * Chuck Ebbert <76306.1226@compuserve.com> wrote:
> > 
> > > Use a GDT entry's limit field to store per-cpu data for fast access 
> > > from userspace, and provide a vsyscall to access the current CPU 
> > > number stored there.
> > 
> > very nice idea! I thought of doing sys_get_cpu() too, but my idea was to 
> > use the scheduler to keep a writable [and permanently pinned, 
> > per-thread] VDSO data page uptodate with the current CPU# [and other 
> > interesting data]. Btw., do we know how fast LSL is on modern CPUs?
> 
> Now that the GDT is a full page for each CPU there's plenty of space
> for all kinds of per-cpu data, even if we waste 75% of it.  LSL seems
> pretty fast; I got 13 clocks for the whole lsl/jnz/and sequence on K8

My measurements show different - i get 60+ cycles on K8 and 150+ cycles
on P4. That is with a full vsyscall around it. However it is still
far better than CPUID, however slower than RDTSCP on those CPUs that support it.

I changed the CPUID fallback path to use LSL on x86-64

> and 21 clocks on PII.  Maybe you can test P4?
> 
> /* test how fast lsl/jnz/and runs.
>  */
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <stdlib.h>
> 
> #define rdtscll(t)	asm volatile ("rdtsc" : "=A" (t))
> 
> #ifndef ITERS
> #define ITERS	1000000
> #endif
> 
> int main(int argc, char * const argv[])
> {
> 	unsigned long long tsc1, tsc2;
> 	int count, cpu, junk;
> 
> 	rdtscll(tsc1);
> 	asm (
> 		"	pushl %%ds		\n"
> 		"	popl %2			\n"
> 		"1:				\n"
> #ifdef DO_TEST
> 		"	lsl %2,%0		\n"
> 		"	jnz 2f			\n"
> 		"	and $0xff,%0		\n"
> #endif
> 		"	dec %1			\n"
> 		"	jnz 1b			\n"
> 		"2:				\n"
> 		: "=&r" (cpu), "=&r" (count), "=&r" (junk)
> 		: "1" (ITERS), "0" (-1)
> 	);
> 	rdtscll(tsc2);

Measuring this way is a bad idea because you get far too much 
noise from the RDTSCs. Usually you need to put a few-thousand-entry 
loop inside the RDTSCs and divide the result by the loop count

-Andi

> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC, patch] i386: vgetcpu(), take 2
@ 2006-06-21 12:24 Chuck Ebbert
  2006-06-21 17:14 ` Andi Kleen
  0 siblings, 1 reply; 30+ messages in thread
From: Chuck Ebbert @ 2006-06-21 12:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jakub Jelinek, Roland McGrath, Ulrich Drepper, Andi Kleen,
	Linus Torvalds, linux-kernel

In-Reply-To: <20060621081539.GA14227@elte.hu>

On Wed, 21 Jun 2006 10:15:39 +0200, Ingo Molnar wrote:

> * Chuck Ebbert <76306.1226@compuserve.com> wrote:
> 
> > Use a GDT entry's limit field to store per-cpu data for fast access 
> > from userspace, and provide a vsyscall to access the current CPU 
> > number stored there.
> 
> very nice idea! I thought of doing sys_get_cpu() too, but my idea was to 
> use the scheduler to keep a writable [and permanently pinned, 
> per-thread] VDSO data page uptodate with the current CPU# [and other 
> interesting data]. Btw., do we know how fast LSL is on modern CPUs?

Now that the GDT is a full page for each CPU there's plenty of space
for all kinds of per-cpu data, even if we waste 75% of it.  LSL seems
pretty fast; I got 13 clocks for the whole lsl/jnz/and sequence on K8
and 21 clocks on PII.  Maybe you can test P4?

/* test how fast lsl/jnz/and runs.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

#define rdtscll(t)	asm volatile ("rdtsc" : "=A" (t))

#ifndef ITERS
#define ITERS	1000000
#endif

int main(int argc, char * const argv[])
{
	unsigned long long tsc1, tsc2;
	int count, cpu, junk;

	rdtscll(tsc1);
	asm (
		"	pushl %%ds		\n"
		"	popl %2			\n"
		"1:				\n"
#ifdef DO_TEST
		"	lsl %2,%0		\n"
		"	jnz 2f			\n"
		"	and $0xff,%0		\n"
#endif
		"	dec %1			\n"
		"	jnz 1b			\n"
		"2:				\n"
		: "=&r" (cpu), "=&r" (count), "=&r" (junk)
		: "1" (ITERS), "0" (-1)
	);
	rdtscll(tsc2);

	if (count == 0)
		printf("loops: %d, avg: %llu clocks\n",
			ITERS, (tsc2 - tsc1) / ITERS);
	return 0;
}


> > +__vgetcpu:
> > +.LSTART_vgetcpu:
> > +   movl $-EFAULT,%eax
> > +   movl $((27<<3)|3),%edx
> > +   lsll %edx,%eax
> > +   jnz 1f
> > +   andl $0xff,%eax
> > +1:
> > +   ret
> 
> this needs unwinder annotations as well to make this a proper DSO, so 
> that for example a breakpoint here does not confuse gdb.

I can't write those.

> also, would be nice to do something like this in 64-bit mode too.

Andi has x86_64 patches in his tree and is considering this method for
ia32 support.

-- 
Chuck
 "You can't read a newspaper if you can't read."  --George W. Bush

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2006-06-29  8:48 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-06-21  7:27 [RFC, patch] i386: vgetcpu(), take 2 Chuck Ebbert
2006-06-21  8:15 ` Ingo Molnar
2006-06-21 17:38   ` Artur Skawina
2006-06-28  5:44   ` Paul Jackson
2006-06-28  8:53     ` Andi Kleen
2006-06-28  9:00       ` Ingo Molnar
2006-06-29  8:47         ` Paul Jackson
2006-06-21  9:26 ` Andi Kleen
2006-06-21  9:35   ` Ingo Molnar
2006-06-21 21:54   ` Rohit Seth
2006-06-21 22:21     ` Andi Kleen
2006-06-21 22:59       ` Rohit Seth
2006-06-21 23:05         ` Andi Kleen
2006-06-21 23:18           ` Rohit Seth
2006-06-21 23:29             ` Andi Kleen
2006-06-22  0:55               ` Rohit Seth
2006-06-22  8:08                 ` Andi Kleen
2006-06-22 21:06                   ` Rohit Seth
2006-06-22 22:14                     ` Andi Kleen
2006-06-22 23:10                       ` Rohit Seth
2006-06-23 12:42                         ` [discuss] " Andi Kleen
2006-06-24  2:06                           ` Rohit Seth
2006-06-24  8:42                             ` Andi Kleen
2006-06-27  1:13                               ` Rohit Seth
2006-06-21 12:24 Chuck Ebbert
2006-06-21 17:14 ` Andi Kleen
2006-06-21 17:27   ` Linus Torvalds
2006-06-21 17:50     ` Andi Kleen
2006-06-22 12:23 Chuck Ebbert
2006-06-22 12:44 ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).