* [patch] faster vgetcpu using sidt
From: dean gaudet @ 2007-01-07  2:41 UTC
  To: ak, vojtech, linux-kernel

below is a patch which improves vgetcpu latency on all x86_64 
implementations i've tested.

Nathan Laredo pointed out the sgdt/sidt/sldt instructions are 
userland-accessible and we could use their limit fields to tuck away a few 
bits of per-cpu information.

vgetcpu generally uses lsl at present, but all of sgdt/sidt/sldt are 
faster than lsl on all x86_64 processors i've tested.  on p4 processors 
lsl tends to be 150 cycles whereas the s*dt instructions are 15 cycles or 
less.  lsl requires microcoded permission testing whereas s*dt are free 
of any such hassle.
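
a minimal sketch of the kind of userland timing loop involved --
illustrative only, not the actual tool linked below; it assumes gcc on
x86_64 and glosses over rdtsc serialization details:

#include <stdio.h>
#include <stdint.h>

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	/* sidt stores 10 bytes (2-byte limit + 8-byte base); pad by 6
	   so the store target sits at an aligned offset */
	struct { char pad[6]; uint16_t size; uint64_t address; } idt;
	const long iters = 1000000;
	uint64_t t0, t1;
	long i;

	t0 = rdtsc();
	for (i = 0; i < iters; i++)
		asm volatile("sidt %0" : "=m" (idt.size));
	t1 = rdtsc();
	printf("sidt: ~%lu cycles/iteration\n",
	       (unsigned long)((t1 - t0) / iters));
	return 0;
}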

sldt is the least expensive of the three instructions; however, it's a 
hassle to use because processes may want to adjust their ldt.  sidt/sgdt 
have essentially the same performance across all the major architectures 
-- however sidt has the advantage that its limit field is 16 bits wide, 
yet any value >= 0xfff is essentially "infinite" because there are only 
256 (16-byte) descriptors.  so sidt is probably the best choice of the three.
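
as a sketch of what the userland side of the sidt trick then looks
like (NODES_SHIFT here is a made-up stand-in which would have to match
the kernel's CONFIG_NODES_SHIFT):

#include <stdio.h>
#include <stdint.h>

#define NODES_SHIFT 6	/* hypothetical; must match the kernel config */

int main(void)
{
	struct { char pad[6]; uint16_t size; uint64_t address; } idt;
	unsigned int p;

	asm("sidt %0" : "=m" (idt.size));
	if (idt.size < 0x1000) {
		/* an unpatched kernel leaves the idt limit at 0xfff */
		puts("no cpu/node encoding in the idt limit");
		return 1;
	}
	p = idt.size - 0x1000;
	printf("cpu %u, node %u\n",
	       p >> NODES_SHIFT, p & ((1u << NODES_SHIFT) - 1));
	return 0;
}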

in benchmarking i've discovered the rdtscp implementation of vgetcpu is 
slower than even the lsl-based implementation on opteron revF, so i've 
dropped the rdtscp implementation in this patch.  however i've left the 
rdtscp_aux register initialized, because i'm sure it's the right choice 
for various proposed vgettimeofday / per-cpu tsc state improvements which 
need the atomic nature of the rdtscp instruction, and i hope it'll be 
used in those situations.
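
for reference, a sketch of the rdtscp read in question (illustrative
only): rdtscp returns the tsc in edx:eax and the value the kernel wrote
to the IA32_TSC_AUX msr in ecx, with no window in between for the task
to migrate:

static inline unsigned int read_tscp_aux(void)
{
	unsigned int lo, hi, aux;

	/* tsc in hi:lo, per-cpu aux value in aux, atomically */
	asm volatile("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
	return aux;
}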

at compile time this patch detects whether 0x1000 + 
(CONFIG_NR_CPUS<<CONFIG_NODES_SHIFT) will fit in the idt limit field, and 
selects the lsl method otherwise.  i've further added a test for the 
20-bit limit of the lsl method which #errors in the event it doesn't fit 
(we could fall all the way back to the cpuid method if someone has a box 
with that many cpus*nodes, but i'll let someone else handle that case ;).
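
to make the bound concrete (config values made up for illustration):
NR_CPUS=256, NODES_SHIFT=6 gives 0x1000 + (256<<6) = 0x5000, which fits
the 16-bit limit, so sidt is used; NR_CPUS=4096, NODES_SHIFT=6 gives
0x1000 + (4096<<6) = 0x41000, which doesn't fit, so lsl is selected; and
NR_CPUS=4096, NODES_SHIFT=8 gives 4096<<8 = 1<<20, which trips the
#error.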

given this is a compile-time choice, and rdtscp is always slower than 
sidt, i've dropped the vgetcpu_mode variable.

i've also dropped the cache support in the sidt case -- depending on the 
compiler and cpu i found it to be 1 cycle slower than the uncached case, 
and it just doesn't seem worth the potential extra L1 traffic (besides, 
if you add in the implied __thread overhead it's definitely a loss).

here are the before/after results:

                baseline    baseline    patched
                no cache    cache       no cache
k8 pre-revF        21          14          16
k8 revF            31          14          17
core2              38          12          17

sorry i don't have a handy EM64T p4 on which i can install a 2.6.20-rc3 
kernel...  but based on userland-only comparisons of the sidt/lsl 
instructions i'll be amazed if this isn't a huge win on p4.

timing tools and test case can be found at 
<http://arctic.org/~dean/vgetcpu/>

-dean

Signed-off-by: dean gaudet <dean@arctic.org>

Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c	2007-01-06 13:31:10.000000000 -0800
+++ linux/arch/x86_64/kernel/time.c	2007-01-06 16:04:01.000000000 -0800
@@ -957,11 +957,6 @@
 	if (unsynchronized_tsc())
 		notsc = 1;
 
- 	if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
-		vgetcpu_mode = VGETCPU_RDTSCP;
-	else
-		vgetcpu_mode = VGETCPU_LSL;
-
 	if (vxtime.hpet_address && notsc) {
 		timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
 		if (hpet_use_timer)
Index: linux/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux.orig/arch/x86_64/kernel/vsyscall.c	2007-01-06 13:31:10.000000000 -0800
+++ linux/arch/x86_64/kernel/vsyscall.c	2007-01-06 17:29:36.000000000 -0800
@@ -40,13 +40,18 @@
 #include <asm/segment.h>
 #include <asm/desc.h>
 #include <asm/topology.h>
+#include <asm/desc.h>
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
 #define __syscall_clobber "r11","rcx","memory"
 
 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
-int __vgetcpu_mode __section_vgetcpu_mode;
+
+/* is this necessary? */
+#ifndef CONFIG_NODES_SHIFT
+#define CONFIG_NODES_SHIFT 0
+#endif
 
 #include <asm/unistd.h>
 
@@ -147,11 +152,21 @@
 long __vsyscall(2)
 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 {
-	unsigned int dummy, p;
+	unsigned int p;
+#ifdef VGETCPU_USE_SIDT
+	struct {
+		char pad[6];	/* avoid unaligned stores */
+		u16 size;
+		u64 address;
+	} idt;
+
+	asm("sidt %0" : "=m" (idt.size));
+	p = idt.size - 0x1000;
+#else
 	unsigned long j = 0;
 
 	/* Fast cache - only recompute value once per jiffies and avoid
-	   relatively costly rdtscp/cpuid otherwise.
+	   relatively costly lsl otherwise.
 	   This works because the scheduler usually keeps the process
 	   on the same CPU and this syscall doesn't guarantee its
 	   results anyways.
@@ -160,21 +175,20 @@
 	   If you don't like it pass NULL. */
 	if (tcache && tcache->blob[0] == (j = __jiffies)) {
 		p = tcache->blob[1];
-	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		rdtscp(dummy, dummy, p);
-	} else {
+	}
+	else {
 		/* Load per CPU data from GDT */
 		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+		if (tcache) {
+			tcache->blob[0] = j;
+			tcache->blob[1] = p;
+		}
 	}
-	if (tcache) {
-		tcache->blob[0] = j;
-		tcache->blob[1] = p;
-	}
+#endif
 	if (cpu)
-		*cpu = p & 0xfff;
+		*cpu = p >> CONFIG_NODES_SHIFT;
 	if (node)
-		*node = p >> 12;
+		*node = p & ((1<<CONFIG_NODES_SHIFT) - 1);
 	return 0;
 }
 
@@ -250,22 +264,37 @@
    doesn't violate that. We'll find out if it does. */
 static void __cpuinit vsyscall_set_cpu(int cpu)
 {
-	unsigned long *d;
-	unsigned long node = 0;
+	unsigned long cpu_node_encoding = cpu << CONFIG_NODES_SHIFT;
+
 #ifdef CONFIG_NUMA
-	node = cpu_to_node[cpu];
+	cpu_node_encoding |= cpu_to_node[cpu];
 #endif
+
+	/* Even though we never use rdtscp for vgetcpu we set up the rdtscp_aux
+	 * register here for (future) use in vgettimeofday et al.
+	 */
 	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
-		write_rdtscp_aux((node << 12) | cpu);
+		write_rdtscp_aux(cpu_node_encoding);
 
+#ifdef VGETCPU_USE_SIDT
+	{
+		struct desc_ptr local_idt;
+
+		local_idt.size = 0x1000 + cpu_node_encoding;
+		local_idt.address = idt_descr.address;
+		asm("lidt %0" :: "m" (local_idt));
+	}
+#else
 	/* Store cpu number in limit so that it can be loaded quickly
-	   in user space in vgetcpu.
-	   12 bits for the CPU and 8 bits for the node. */
-	d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
-	*d = 0x0f40000000000ULL;
-	*d |= cpu;
-	*d |= (node & 0xf) << 12;
-	*d |= (node >> 4) << 48;
+	   in user space in vgetcpu. */
+	{
+		unsigned long *d;
+		d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
+		*d = 0x0f40000000000ULL;
+		*d |= cpu_node_encoding & 0xffff;
+		*d |= (cpu_node_encoding >> 16) << 48;
+	};
+#endif
 }
 
 static void __cpuinit cpu_vsyscall_init(void *arg)
Index: linux/include/asm-x86_64/vsyscall.h
===================================================================
--- linux.orig/include/asm-x86_64/vsyscall.h	2007-01-06 13:31:10.000000000 -0800
+++ linux/include/asm-x86_64/vsyscall.h	2007-01-06 17:15:53.000000000 -0800
@@ -17,7 +17,6 @@
 #include <linux/seqlock.h>
 
 #define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16)))
-#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
 #define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
 #define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16)))
 #define __section_sysctl_vsyscall __attribute__ ((unused, __section__ (".sysctl_vsyscall"), aligned(16)))
@@ -28,9 +27,6 @@
 #define VXTIME_HPET	2
 #define VXTIME_PMTMR	3
 
-#define VGETCPU_RDTSCP	1
-#define VGETCPU_LSL	2
-
 struct vxtime_data {
 	long hpet_address;	/* HPET base address */
 	int last;
@@ -45,7 +41,6 @@
 
 /* vsyscall space (readonly) */
 extern struct vxtime_data __vxtime;
-extern int __vgetcpu_mode;
 extern struct timespec __xtime;
 extern volatile unsigned long __jiffies;
 extern struct timezone __sys_tz;
@@ -53,7 +48,6 @@
 
 /* kernel space (writeable) */
 extern struct vxtime_data vxtime;
-extern int vgetcpu_mode;
 extern struct timezone sys_tz;
 extern int sysctl_vsyscall;
 extern seqlock_t xtime_lock;
@@ -62,6 +56,28 @@
 
 #define ARCH_HAVE_XTIME_LOCK 1
 
+/*
+ * To use the IDT limit for vgetcpu we encode things like so:
+ *
+ *   0x1000 + node + (cpu << CONFIG_NODES_SHIFT)
+ *
+ * this ensures a system using this method has an IDT limit other than
+ * 0xfff, while systems not using this method will have an IDT limit
+ * of 0xfff.  (just in case anyone cares to have a test).
+ *
+ * This test verifies the various config options are in an appropriate
+ * range for the 16-bit limit field.
+ */
+#if 0x1000 + (CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) <= 0x10000
+#define VGETCPU_USE_SIDT 1
+
+/* might as well test this somewhere -- the lsl method of vgetcpu has
+ * only 20 bits available to it.
+ */
+#elif (CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) >= (1<<20)
+#error "(CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) out of range for existing vgetcpu implementations"
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_64_VSYSCALL_H_ */
Index: linux/arch/x86_64/kernel/vmlinux.lds
===================================================================
--- linux.orig/arch/x86_64/kernel/vmlinux.lds	2007-01-06 16:03:53.000000000 -0800
+++ linux/arch/x86_64/kernel/vmlinux.lds	2007-01-06 16:04:20.000000000 -0800
@@ -1372,8 +1372,6 @@
   xtime_lock = (ADDR(.xtime_lock) - ((-10*1024*1024) - ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))));
   .vxtime : AT((ADDR(.vxtime) - ((-10*1024*1024) - ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))))) { *(.vxtime) }
   vxtime = (ADDR(.vxtime) - ((-10*1024*1024) - ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))));
-  .vgetcpu_mode : AT((ADDR(.vgetcpu_mode) - ((-10*1024*1024) - ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))))) { *(.vgetcpu_mode) }
-  vgetcpu_mode = (ADDR(.vgetcpu_mode) - ((-10*1024*1024) - ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))));
   .sys_tz : AT((ADDR(.sys_tz) - ((-10*1024*1024) - ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))))) { *(.sys_tz) }
   sys_tz = (ADDR(.sys_tz) - ((-10*1024*1024) - ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))));
   .sysctl_vsyscall : AT((ADDR(.sysctl_vsyscall) - ((-10*1024*1024) - ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))))) { *(.sysctl_vsyscall) }
Index: linux/arch/x86_64/kernel/vmlinux.lds.S
===================================================================
--- linux.orig/arch/x86_64/kernel/vmlinux.lds.S	2007-01-06 16:03:53.000000000 -0800
+++ linux/arch/x86_64/kernel/vmlinux.lds.S	2007-01-06 16:04:07.000000000 -0800
@@ -94,9 +94,6 @@
   .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
   vxtime = VVIRT(.vxtime);
 
-  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
-  vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
   .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
   sys_tz = VVIRT(.sys_tz);
 


* Re: [patch] faster vgetcpu using sidt
From: dean gaudet @ 2007-01-09  0:26 UTC
  To: ak, vojtech, linux-kernel

On Sat, 6 Jan 2007, dean gaudet wrote:

> below is a patch which improves vgetcpu latency on all x86_64 
> implementations i've tested.
> 
> Nathan Laredo pointed out the sgdt/sidt/sldt instructions are 
> userland-accessible and we could use their limit fields to tuck away a few 
> bits of per-cpu information.
...

i got a hold of a p4 (model 4) and ran the timings there:

                baseline    baseline    patched
                no cache    cache       no cache
k8 pre-revF        21          14          16
k8 revF            31          14          17
core2              38          12          17
p4                 49          24          37

not as good as i hoped... i'll have to put the cache back in just for 
the p4, so i'll respin my patch with the cache back in place.

another thought occurred to me -- 64-bit processes can't actually use 
their LDT, can they?  in that case i could probably use sldt (faster 
than sidt) for 64-bit procs and fall back to sidt for 32-bit emulation 
(which doesn't exist for this vsyscall yet anyhow).
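
a rough sketch of the sldt read, illustrative only -- note sldt stores
the ldt *selector* rather than a limit, so the per-cpu bits would have
to be encoded in the selector value itself:

#include <stdio.h>

int main(void)
{
	unsigned int sel = 0;

	asm("sldt %0" : "=r" (sel));
	printf("ldt selector: %#x\n", sel);
	return 0;
}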

let me know if you have any other feedback.

thanks
-dean


* Re: [patch] faster vgetcpu using sidt
From: Andi Kleen @ 2007-01-09  9:07 UTC
  To: dean gaudet; +Cc: vojtech, linux-kernel

> 64-bit processes can't actually use their
> LDT, can they?

The kernel supports the LDT for 64-bit processes; it is just not
commonly used.

-Andi


* [patch] faster vgetcpu using sidt (take 2)
From: dean gaudet @ 2007-01-14  7:00 UTC
  To: ak, vojtech, linux-kernel

ok here is the latest rev of this patch (against 2.6.20-rc4).

timings in cycles:

                baseline   patched    baseline   patched
                no cache   no cache    cache      cache
k8 pre-revF        21        16          14        17
k8 revF            31        17          14        17
core2              38        16          12        14
p4                 49        41          24        24

the degradation in cached timings appears to be due to the 16-byte stack
frame set up for the sidt instruction, apparently because of
-mno-red-zone... would you accept a patch which re-enables the red zone
for vsyscalls?
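
roughly what's going on, with illustrative instruction sequences rather
than actual gcc output: with the red zone a leaf function can use the
128 bytes below %rsp as scratch,

	sidt   -10(%rsp)		/* store limit+base below rsp */
	movzwl -10(%rsp), %eax		/* reload the 16-bit limit */

whereas with -mno-red-zone the compiler has to carve out an explicit
frame,

	sub    $16, %rsp
	sidt   6(%rsp)
	movzwl 6(%rsp), %eax
	add    $16, %rsp

and the sub/add pair appears to be where the extra cycles go.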

here is the slightly updated description:

below is a patch which improves vgetcpu latency on all x86_64 
implementations i've tested.

Nathan Laredo pointed out the sgdt/sidt/sldt instructions are 
userland-accessible and we could use their limit fields to tuck away a few 
bits of per-cpu information.

vgetcpu generally uses lsl at present, but all of sgdt/sidt/sldt are
faster than lsl on all x86_64 processors i've tested.  lsl requires
microcoded permission testing whereas s*dt are free of any such hassle.

sldt is the least expensive of the three instructions; however, it's a 
hassle to use because processes may want to adjust their ldt.  sidt/sgdt 
have essentially the same performance across all the major architectures 
-- however sidt has the advantage that its limit field is 16 bits wide, 
yet any value >= 0xfff is essentially "infinite" because there are only 
256 (16-byte) descriptors.  so sidt is probably the best choice of the three.

in benchmarking i've discovered the rdtscp implementation of vgetcpu is 
slower than even the lsl-based implementation on opteron revF, so i've 
dropped the rdtscp implementation in this patch.  however i've left the 
rdtscp_aux register initialized, because i'm sure it's the right choice 
for various proposed vgettimeofday / per-cpu tsc state improvements which 
need the atomic nature of the rdtscp instruction, and i hope it'll be 
used in those situations.

at compile time this patch detects whether 0x1000 + 
(CONFIG_NR_CPUS<<CONFIG_NODES_SHIFT) will fit in the idt limit field, and 
selects the lsl method otherwise.  i've further added a test for the 
20-bit limit of the lsl method which #errors in the event it doesn't fit 
(we could fall all the way back to the cpuid method if someone has a box 
with that many cpus*nodes, but i'll let someone else handle that case ;).

given this is a compile-time choice, and rdtscp is always slower than 
sidt, i've dropped the vgetcpu_mode variable.

timing tools and test case can be found at 
<http://arctic.org/~dean/vgetcpu/>

-dean

Signed-off-by: dean gaudet <dean@arctic.org>

Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c	2007-01-13 22:20:46.000000000 -0800
+++ linux/arch/x86_64/kernel/time.c	2007-01-13 22:21:01.000000000 -0800
@@ -957,11 +957,6 @@
 	if (unsynchronized_tsc())
 		notsc = 1;
 
- 	if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
-		vgetcpu_mode = VGETCPU_RDTSCP;
-	else
-		vgetcpu_mode = VGETCPU_LSL;
-
 	if (vxtime.hpet_address && notsc) {
 		timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
 		if (hpet_use_timer)
Index: linux/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux.orig/arch/x86_64/kernel/vsyscall.c	2007-01-13 22:20:46.000000000 -0800
+++ linux/arch/x86_64/kernel/vsyscall.c	2007-01-13 22:21:01.000000000 -0800
@@ -46,7 +46,11 @@
 
 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
-int __vgetcpu_mode __section_vgetcpu_mode;
+
+/* is this necessary? */
+#ifndef CONFIG_NODES_SHIFT
+#define CONFIG_NODES_SHIFT 0
+#endif
 
 #include <asm/unistd.h>
 
@@ -147,11 +151,11 @@
 long __vsyscall(2)
 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 {
-	unsigned int dummy, p;
+	unsigned int p;
 	unsigned long j = 0;
 
 	/* Fast cache - only recompute value once per jiffies and avoid
-	   relatively costly rdtscp/cpuid otherwise.
+	   relatively costly lsl/sidt otherwise.
 	   This works because the scheduler usually keeps the process
 	   on the same CPU and this syscall doesn't guarantee its
 	   results anyways.
@@ -160,21 +164,30 @@
 	   If you don't like it pass NULL. */
 	if (tcache && tcache->blob[0] == (j = __jiffies)) {
 		p = tcache->blob[1];
-	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		rdtscp(dummy, dummy, p);
-	} else {
+	}
+	else {
+#ifdef VGETCPU_USE_SIDT
+                struct {
+                        char pad[6];	/* avoid unaligned stores */
+                        u16 size;
+                        u64 address;
+                } idt;
+
+                asm("sidt %0" : "=m" (idt.size));
+                p = idt.size - 0x1000;
+#else
 		/* Load per CPU data from GDT */
 		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
-	}
-	if (tcache) {
-		tcache->blob[0] = j;
-		tcache->blob[1] = p;
+#endif
+		if (tcache) {
+			tcache->blob[0] = j;
+			tcache->blob[1] = p;
+		}
 	}
 	if (cpu)
-		*cpu = p & 0xfff;
+		*cpu = p >> CONFIG_NODES_SHIFT;
 	if (node)
-		*node = p >> 12;
+		*node = p & ((1<<CONFIG_NODES_SHIFT) - 1);
 	return 0;
 }
 
@@ -250,22 +263,37 @@
    doesn't violate that. We'll find out if it does. */
 static void __cpuinit vsyscall_set_cpu(int cpu)
 {
-	unsigned long *d;
-	unsigned long node = 0;
+	unsigned long cpu_node_encoding = cpu << CONFIG_NODES_SHIFT;
+
 #ifdef CONFIG_NUMA
-	node = cpu_to_node[cpu];
+	cpu_node_encoding |= cpu_to_node[cpu];
 #endif
+
+	/* Even though we never use rdtscp for vgetcpu we set up the rdtscp_aux
+	 * register here for (future) use in vgettimeofday et al.
+	 */
 	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
-		write_rdtscp_aux((node << 12) | cpu);
+		write_rdtscp_aux(cpu_node_encoding);
 
+#ifdef VGETCPU_USE_SIDT
+	{
+		struct desc_ptr local_idt;
+
+		local_idt.size = 0x1000 + cpu_node_encoding;
+		local_idt.address = idt_descr.address;
+		asm("lidt %0" :: "m" (local_idt));
+	}
+#else
 	/* Store cpu number in limit so that it can be loaded quickly
-	   in user space in vgetcpu.
-	   12 bits for the CPU and 8 bits for the node. */
-	d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
-	*d = 0x0f40000000000ULL;
-	*d |= cpu;
-	*d |= (node & 0xf) << 12;
-	*d |= (node >> 4) << 48;
+	   in user space in vgetcpu. */
+	{
+		unsigned long *d;
+		d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
+		*d = 0x0f40000000000ULL;
+		*d |= cpu_node_encoding & 0xffff;
+		*d |= (cpu_node_encoding >> 16) << 48;
+	};
+#endif
 }
 
 static void __cpuinit cpu_vsyscall_init(void *arg)
Index: linux/include/asm-x86_64/vsyscall.h
===================================================================
--- linux.orig/include/asm-x86_64/vsyscall.h	2007-01-13 22:20:46.000000000 -0800
+++ linux/include/asm-x86_64/vsyscall.h	2007-01-13 22:21:01.000000000 -0800
@@ -17,7 +17,6 @@
 #include <linux/seqlock.h>
 
 #define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16)))
-#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
 #define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
 #define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16)))
 #define __section_sysctl_vsyscall __attribute__ ((unused, __section__ (".sysctl_vsyscall"), aligned(16)))
@@ -28,9 +27,6 @@
 #define VXTIME_HPET	2
 #define VXTIME_PMTMR	3
 
-#define VGETCPU_RDTSCP	1
-#define VGETCPU_LSL	2
-
 struct vxtime_data {
 	long hpet_address;	/* HPET base address */
 	int last;
@@ -45,7 +41,6 @@
 
 /* vsyscall space (readonly) */
 extern struct vxtime_data __vxtime;
-extern int __vgetcpu_mode;
 extern struct timespec __xtime;
 extern volatile unsigned long __jiffies;
 extern struct timezone __sys_tz;
@@ -53,7 +48,6 @@
 
 /* kernel space (writeable) */
 extern struct vxtime_data vxtime;
-extern int vgetcpu_mode;
 extern struct timezone sys_tz;
 extern int sysctl_vsyscall;
 extern seqlock_t xtime_lock;
@@ -62,6 +56,28 @@
 
 #define ARCH_HAVE_XTIME_LOCK 1
 
+/*
+ * To use the IDT limit for vgetcpu we encode things like so:
+ *
+ *   0x1000 + node + (cpu << CONFIG_NODES_SHIFT)
+ *
+ * this ensures a system using this method has an IDT limit other than
+ * 0xfff, while systems not using this method will have an IDT limit
+ * of 0xfff.  (just in case anyone cares to have a test).
+ *
+ * This test verifies the various config options are in an appropriate
+ * range for the 16-bit limit field.
+ */
+#if 0x1000 + (CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) <= 0x10000
+#define VGETCPU_USE_SIDT 1
+
+/* might as well test this somewhere -- the lsl method of vgetcpu has
+ * only 20 bits available to it.
+ */
+#elif (CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) >= (1<<20)
+#error "(CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) out of range for existing vgetcpu implementations"
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_64_VSYSCALL_H_ */
Index: linux/arch/x86_64/kernel/vmlinux.lds.S
===================================================================
--- linux.orig/arch/x86_64/kernel/vmlinux.lds.S	2007-01-13 22:20:46.000000000 -0800
+++ linux/arch/x86_64/kernel/vmlinux.lds.S	2007-01-13 22:21:01.000000000 -0800
@@ -94,9 +94,6 @@
   .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
   vxtime = VVIRT(.vxtime);
 
-  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
-  vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
   .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
   sys_tz = VVIRT(.sys_tz);
 


* Re: [patch] faster vgetcpu using sidt (take 2)
From: dean gaudet @ 2007-01-14  8:07 UTC
  To: ak, vojtech, linux-kernel

On Sat, 13 Jan 2007, dean gaudet wrote:

> ok here is the latest rev of this patch (against 2.6.20-rc4).
> 
> timings in cycles:
> 
>                 baseline   patched    baseline   patched
>                 no cache   no cache    cache      cache
> k8 pre-revF        21        16          14        17
> k8 revF            31        17          14        17
> core2              38        16          12        14
> p4                 49        41          24        24
> 
> the degradation in cached timings appears to be due to the 16-byte stack
> frame set up for the sidt instruction, apparently because of
> -mno-red-zone... would you accept a patch which re-enables the red zone
> for vsyscalls?

here is a first stab at a patch (applied on top of my vgetcpu sidt patch) 
which enables the red zone for vsyscall.  it fixes the cache degradation 
problem above by getting rid of the stack frame setup in vgetcpu (and 
improves the no-cache cases as well, but i haven't run it everywhere yet).

to do this i split the user-mode-only portion of vsyscall.c into 
vsyscall_user.c.  this required a couple of externs in vsyscall.c and two 
extra ".globl" directives in the asm in vsyscall_user.c.

i'm not sure if we still need the CFLAGS_vsyscall.o override or not.

let me know what you think... thanks.

-dean

Index: linux/arch/x86_64/kernel/Makefile
===================================================================
--- linux.orig/arch/x86_64/kernel/Makefile	2006-11-29 13:57:37.000000000 -0800
+++ linux/arch/x86_64/kernel/Makefile	2007-01-13 23:34:22.000000000 -0800
@@ -6,7 +6,7 @@
 EXTRA_AFLAGS	:= -traditional
 obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
-		x8664_ksyms.o i387.o syscall.o vsyscall.o \
+		x8664_ksyms.o i387.o syscall.o vsyscall.o vsyscall_user.o \
 		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
 		pci-dma.o pci-nommu.o alternative.o
 
@@ -45,6 +45,7 @@
 obj-y				+= intel_cacheinfo.o
 
 CFLAGS_vsyscall.o		:= $(PROFILING) -g0
+CFLAGS_vsyscall_user.o		:= $(PROFILING) -g0 -mred-zone
 
 therm_throt-y                   += ../../i386/kernel/cpu/mcheck/therm_throt.o
 bootflag-y			+= ../../i386/kernel/bootflag.o
Index: linux/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux.orig/arch/x86_64/kernel/vsyscall.c	2007-01-13 22:21:01.000000000 -0800
+++ linux/arch/x86_64/kernel/vsyscall.c	2007-01-13 23:41:08.000000000 -0800
@@ -40,161 +40,12 @@
 #include <asm/segment.h>
 #include <asm/desc.h>
 #include <asm/topology.h>
-
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
-#define __syscall_clobber "r11","rcx","memory"
-
-int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
-seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
-
-/* is this necessary? */
-#ifndef CONFIG_NODES_SHIFT
-#define CONFIG_NODES_SHIFT 0
-#endif
-
 #include <asm/unistd.h>
 
-static __always_inline void timeval_normalize(struct timeval * tv)
-{
-	time_t __sec;
-
-	__sec = tv->tv_usec / 1000000;
-	if (__sec) {
-		tv->tv_usec %= 1000000;
-		tv->tv_sec += __sec;
-	}
-}
-
-static __always_inline void do_vgettimeofday(struct timeval * tv)
-{
-	long sequence, t;
-	unsigned long sec, usec;
-
-	do {
-		sequence = read_seqbegin(&__xtime_lock);
-		
-		sec = __xtime.tv_sec;
-		usec = __xtime.tv_nsec / 1000;
-
-		if (__vxtime.mode != VXTIME_HPET) {
-			t = get_cycles_sync();
-			if (t < __vxtime.last_tsc)
-				t = __vxtime.last_tsc;
-			usec += ((t - __vxtime.last_tsc) *
-				 __vxtime.tsc_quot) >> 32;
-			/* See comment in x86_64 do_gettimeofday. */
-		} else {
-			usec += ((readl((void __iomem *)
-				   fix_to_virt(VSYSCALL_HPET) + 0xf0) -
-				  __vxtime.last) * __vxtime.quot) >> 32;
-		}
-	} while (read_seqretry(&__xtime_lock, sequence));
-
-	tv->tv_sec = sec + usec / 1000000;
-	tv->tv_usec = usec % 1000000;
-}
-
-/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
-static __always_inline void do_get_tz(struct timezone * tz)
-{
-	*tz = __sys_tz;
-}
-
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
-	int ret;
-	asm volatile("vsysc2: syscall"
-		: "=a" (ret)
-		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
-	return ret;
-}
-
-static __always_inline long time_syscall(long *t)
-{
-	long secs;
-	asm volatile("vsysc1: syscall"
-		: "=a" (secs)
-		: "0" (__NR_time),"D" (t) : __syscall_clobber);
-	return secs;
-}
-
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
-{
-	if (!__sysctl_vsyscall)
-		return gettimeofday(tv,tz);
-	if (tv)
-		do_vgettimeofday(tv);
-	if (tz)
-		do_get_tz(tz);
-	return 0;
-}
-
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
-{
-	if (!__sysctl_vsyscall)
-		return time_syscall(t);
-	else if (t)
-		*t = __xtime.tv_sec;		
-	return __xtime.tv_sec;
-}
-
-/* Fast way to get current CPU and node.
-   This helps to do per node and per CPU caches in user space.
-   The result is not guaranteed without CPU affinity, but usually
-   works out because the scheduler tries to keep a thread on the same
-   CPU.
-
-   tcache must point to a two element sized long array.
-   All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
-	unsigned int p;
-	unsigned long j = 0;
-
-	/* Fast cache - only recompute value once per jiffies and avoid
-	   relatively costly lsl/sidt otherwise.
-	   This works because the scheduler usually keeps the process
-	   on the same CPU and this syscall doesn't guarantee its
-	   results anyways.
-	   We do this here because otherwise user space would do it on
-	   its own in a likely inferior way (no access to jiffies).
-	   If you don't like it pass NULL. */
-	if (tcache && tcache->blob[0] == (j = __jiffies)) {
-		p = tcache->blob[1];
-	}
-	else {
-#ifdef VGETCPU_USE_SIDT
-                struct {
-                        char pad[6];	/* avoid unaligned stores */
-                        u16 size;
-                        u64 address;
-                } idt;
-
-                asm("sidt %0" : "=m" (idt.size));
-                p = idt.size - 0x1000;
-#else
-		/* Load per CPU data from GDT */
-		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
-#endif
-		if (tcache) {
-			tcache->blob[0] = j;
-			tcache->blob[1] = p;
-		}
-	}
-	if (cpu)
-		*cpu = p >> CONFIG_NODES_SHIFT;
-	if (node)
-		*node = p & ((1<<CONFIG_NODES_SHIFT) - 1);
-	return 0;
-}
-
-long __vsyscall(3) venosys_1(void)
-{
-	return -ENOSYS;
-}
+/* the vsyscalls themselves */
+extern int vgettimeofday(struct timeval * tv, struct timezone * tz);
+extern time_t vtime(time_t *t);
+extern long vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache);
 
 #ifdef CONFIG_SYSCTL
 
@@ -259,6 +110,11 @@
 
 #endif
 
+/* is this necessary? */
+#ifndef CONFIG_NODES_SHIFT
+#define CONFIG_NODES_SHIFT 0
+#endif
+
 /* Assume __initcall executes before all user space. Hopefully kmod
    doesn't violate that. We'll find out if it does. */
 static void __cpuinit vsyscall_set_cpu(int cpu)
Index: linux/arch/x86_64/kernel/vsyscall_user.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/arch/x86_64/kernel/vsyscall_user.c	2007-01-14 00:03:44.000000000 -0800
@@ -0,0 +1,201 @@
+/*
+ *  linux/arch/x86_64/kernel/vsyscall_user.c
+ *
+ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ *  Thanks to hpa@transmeta.com for some useful hint.
+ *  Special thanks to Ingo Molnar for his early experience with
+ *  a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
+ *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
+ *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
+ *  jumping out of line if necessary. We cannot add more with this
+ *  mechanism because older kernels won't return -ENOSYS.
+ *  If we want more than four we need a vDSO.
+ *
+ *  Note: the concept clashes with user mode linux. If you use UML and
+ *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ */
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/seqlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysctl.h>
+#include <linux/getcpu.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>
+
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/fixmap.h>
+#include <asm/errno.h>
+#include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
+
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __syscall_clobber "r11","rcx","memory"
+
+int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
+seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+
+#include <asm/unistd.h>
+
+static __always_inline void timeval_normalize(struct timeval * tv)
+{
+	time_t __sec;
+
+	__sec = tv->tv_usec / 1000000;
+	if (__sec) {
+		tv->tv_usec %= 1000000;
+		tv->tv_sec += __sec;
+	}
+}
+
+static __always_inline void do_vgettimeofday(struct timeval * tv)
+{
+	long sequence, t;
+	unsigned long sec, usec;
+
+	do {
+		sequence = read_seqbegin(&__xtime_lock);
+
+		sec = __xtime.tv_sec;
+		usec = __xtime.tv_nsec / 1000;
+
+		if (__vxtime.mode != VXTIME_HPET) {
+			t = get_cycles_sync();
+			if (t < __vxtime.last_tsc)
+				t = __vxtime.last_tsc;
+			usec += ((t - __vxtime.last_tsc) *
+				 __vxtime.tsc_quot) >> 32;
+			/* See comment in x86_64 do_gettimeofday. */
+		} else {
+			usec += ((readl((void __iomem *)
+				   fix_to_virt(VSYSCALL_HPET) + 0xf0) -
+				  __vxtime.last) * __vxtime.quot) >> 32;
+		}
+	} while (read_seqretry(&__xtime_lock, sequence));
+
+	tv->tv_sec = sec + usec / 1000000;
+	tv->tv_usec = usec % 1000000;
+}
+
+/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
+static __always_inline void do_get_tz(struct timezone * tz)
+{
+	*tz = __sys_tz;
+}
+
+static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+	int ret;
+	asm volatile(
+                ".globl vsysc2\n"
+                "vsysc2: syscall\n"
+		: "=a" (ret)
+		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
+	return ret;
+}
+
+static __always_inline long time_syscall(long *t)
+{
+	long secs;
+	asm volatile(
+                ".globl vsysc1\n"
+                "vsysc1: syscall\n"
+		: "=a" (secs)
+		: "0" (__NR_time),"D" (t) : __syscall_clobber);
+	return secs;
+}
+
+int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
+{
+	if (!__sysctl_vsyscall)
+		return gettimeofday(tv,tz);
+	if (tv)
+		do_vgettimeofday(tv);
+	if (tz)
+		do_get_tz(tz);
+	return 0;
+}
+
+/* This will break when the xtime seconds get inaccurate, but that is
+ * unlikely */
+time_t __vsyscall(1) vtime(time_t *t)
+{
+	if (!__sysctl_vsyscall)
+		return time_syscall(t);
+	else if (t)
+		*t = __xtime.tv_sec;
+	return __xtime.tv_sec;
+}
+
+/* is this necessary? */
+#ifndef CONFIG_NODES_SHIFT
+#define CONFIG_NODES_SHIFT 0
+#endif
+
+/* Fast way to get current CPU and node.
+   This helps to do per node and per CPU caches in user space.
+   The result is not guaranteed without CPU affinity, but usually
+   works out because the scheduler tries to keep a thread on the same
+   CPU.
+
+   tcache must point to a two element sized long array.
+   All arguments can be NULL. */
+long __vsyscall(2)
+vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+{
+	unsigned int p;
+	unsigned long j = 0;
+
+	/* Fast cache - only recompute value once per jiffies and avoid
+	   relatively costly lsl/sidt otherwise.
+	   This works because the scheduler usually keeps the process
+	   on the same CPU and this syscall doesn't guarantee its
+	   results anyways.
+	   We do this here because otherwise user space would do it on
+	   its own in a likely inferior way (no access to jiffies).
+	   If you don't like it pass NULL. */
+	if (tcache && tcache->blob[0] == (j = __jiffies)) {
+		p = tcache->blob[1];
+	}
+	else {
+#ifdef VGETCPU_USE_SIDT
+                struct {
+                        char pad[6];	/* avoid unaligned stores */
+                        u16 size;
+                        u64 address;
+                } idt;
+
+                asm("sidt %0" : "=m" (idt.size));
+                p = idt.size - 0x1000;
+#else
+		/* Load per CPU data from GDT */
+		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+#endif
+		if (tcache) {
+			tcache->blob[0] = j;
+			tcache->blob[1] = p;
+		}
+	}
+	if (cpu)
+		*cpu = p >> CONFIG_NODES_SHIFT;
+	if (node)
+		*node = p & ((1<<CONFIG_NODES_SHIFT) - 1);
+	return 0;
+}
+
+long __vsyscall(3) venosys_1(void)
+{
+	return -ENOSYS;
+}


* Re: [patch] faster vgetcpu using sidt (take 2)
From: Andi Kleen @ 2007-01-18 21:45 UTC
  To: dean gaudet; +Cc: vojtech, linux-kernel

> let me know what you think... thanks.

It's ok, although I would like to have the file in a separate directory.

-Andi


* Re: [patch] faster vgetcpu using sidt (take 2)
From: dean gaudet @ 2007-01-23  4:53 UTC
  To: Andi Kleen; +Cc: vojtech, linux-kernel

On Thu, 18 Jan 2007, Andi Kleen wrote:

> > let me know what you think... thanks.
> 
> It's ok, although I would like to have the file in a separate directory.

cool -- do you have a directory in mind?

and would you like this change as two separate patches or one combined 
patch?

thanks
-dean

