* [RFC, patch] i386: vgetcpu() for NUMA, take 2
@ 2006-06-24 8:23 Chuck Ebbert
2006-06-24 15:37 ` Martin J. Bligh
0 siblings, 1 reply; 3+ messages in thread
From: Chuck Ebbert @ 2006-06-24 8:23 UTC (permalink / raw)
To: linux-kernel
Cc: Rohit Seth, Andi Kleen, Andrew Morton, Martin Bligh, Ingo Molnar
This is attempt #2 at vgetcpu() NUMA support for i386. It uses a
GDT entry to hold cpu and node number for fast userspace access.
changes since #1:
proper function prototype (same as x86_64)
changed alignment of vsyscall functions to 16 bytes
(sigreturn needs to stay fixed, others can move)
to-do:
CFI annotations
test NUMA on real NUMA hardware (someone please test)
Test program:
/* vgetcpu.c: test how fast vgetcpu runs
* boot kernel with vgetcpu patch first, then:
* gcc -O3 -o vgetcpu vgetcpu.c <srcpath>/arch/i386/kernel/vsyscall-int80.so
* (don't forget the optimization (-O3))
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
extern int __attribute__ ((regparm(2))) __vgetcpu(int *cpu, int *node);
/*
 * rdtscll(t): read the time-stamp counter into the 64-bit lvalue t
 * (the "=A" constraint is the i386 EDX:EAX pair that RDTSC writes).
 *
 * The asm must be volatile: it has no inputs, so without the qualifier
 * the compiler is free to treat two reads as the same value, or to move
 * or drop one of them, which would ruin a before/after measurement.
 */
#define rdtscll(t) asm volatile("rdtsc" : "=A" (t))
/*
 * Check that __vgetcpu() works, then time it.
 *
 * One call is made first with cpu and node preset to the sentinel 999;
 * a non-zero return, or either value still being 999 afterwards, is
 * reported as a failure.  Otherwise `iters` back-to-back calls are timed
 * with rdtscll() and the average number of clocks per call is printed.
 *
 * Returns 0 on success and 1 if the initial call failed.
 */
int main(int argc, char * const argv[])
{
	/* TSC values are unsigned; this also matches the %llu format below. */
	unsigned long long tsc1, tsc2;
	int i, cpu = 999, node = 999, iters = 99999;

	(void)argc;
	(void)argv;

	if (__vgetcpu(&cpu, &node) || node == 999 || cpu == 999) {
		printf("vgetcpu failed!\n");
		/*
		 * Return through the normal exit path instead of calling
		 * _exit(): _exit() does not flush stdio, so the message
		 * above would be lost whenever stdout is not a terminal
		 * (and _exit was never declared here - no <unistd.h>).
		 */
		return 1;
	}
	printf("node: %d, cpu: %d\n", node, cpu);

	rdtscll(tsc1);
	for (i = 0; i < iters; i++)
		__vgetcpu(&cpu, &node);
	rdtscll(tsc2);

	printf("vgetcpu took %llu clocks per call\n", (tsc2 - tsc1) / iters);

	return 0;
}
Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
arch/i386/kernel/cpu/common.c | 3 ++
arch/i386/kernel/head.S | 11 +++++++-
arch/i386/kernel/smpboot.c | 2 +
arch/i386/kernel/vsyscall-getcpu.S | 42 ++++++++++++++++++++++++++++++++++
arch/i386/kernel/vsyscall-int80.S | 2 +
arch/i386/kernel/vsyscall-sigreturn.S | 3 --
arch/i386/kernel/vsyscall-sysenter.S | 2 +
arch/i386/kernel/vsyscall.lds.S | 1
include/asm-i386/segment.h | 4 ++-
9 files changed, 65 insertions(+), 5 deletions(-)
--- 2.6.17-32.orig/arch/i386/kernel/cpu/common.c
+++ 2.6.17-32/arch/i386/kernel/cpu/common.c
@@ -642,6 +642,9 @@ void __cpuinit cpu_init(void)
((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
(CPU_16BIT_STACK_SIZE - 1);
+ /* Set up GDT entry for per-cpu data */
+ gdt[GDT_ENTRY_VGETCPU].a |= cpu & 0xff;
+
cpu_gdt_descr->size = GDT_SIZE - 1;
cpu_gdt_descr->address = (unsigned long)gdt;
--- 2.6.17-32.orig/arch/i386/kernel/head.S
+++ 2.6.17-32/arch/i386/kernel/head.S
@@ -480,7 +480,7 @@ ENTRY(boot_gdt_table)
.quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
/*
- * The Global Descriptor Table contains 28 quadwords, per-CPU.
+ * The Global Descriptor Table contains 32 quadwords, per-CPU.
*/
.align L1_CACHE_BYTES
ENTRY(cpu_gdt_table)
@@ -525,7 +525,14 @@ ENTRY(cpu_gdt_table)
.quad 0x004092000000ffff /* 0xc8 APM DS data */
.quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */
- .quad 0x0000000000000000 /* 0xd8 - unused */
+
+ /*
+ * Use GDT entries to store per-cpu data for user space (DPL 3.)
+ * 32-bit data segment, byte granularity, base 0, limit set at runtime.
+ * Userspace will use LSL to access this data, stored in the limit field.
+ */
+ .quad 0x0040f20000000000 /* 0xd8 - nodeid and logical CPU number */
+
.quad 0x0000000000000000 /* 0xe0 - unused */
.quad 0x0000000000000000 /* 0xe8 - unused */
.quad 0x0000000000000000 /* 0xf0 - unused */
--- /dev/null
+++ 2.6.17-32/arch/i386/kernel/vsyscall-getcpu.S
@@ -0,0 +1,42 @@
+/*
+ * fastcall int __vgetcpu(int *cpu, int *node)
+ *
+ * This file is #include'd by vsyscall-*.S to place vgetcpu after the
+ * sigreturn code.
+ *
+ * Puts logical CPU number in *cpu, node ID in *node;
+ * returns 0 for success and -EFAULT on error.
+ *
+ * CPU number and node ID are 8 bits each, with 4 total bits available
+ * for future growth of either field.
+ */
+
+#include <linux/errno.h>
+#include <asm/segment.h>
+
+ .text
+ .balign 16
+ .globl __vgetcpu
+ .type __vgetcpu,@function
+__vgetcpu:
+.LSTART_vgetcpu:
+ mov $((GDT_ENTRY_VGETCPU<<3)|3),%cx
+ lsl %ecx,%ecx
+ jnz 1f
+ push %ecx
+ and $0xff,%ecx /* 8-bit cpu number */
+ mov %ecx,(%eax)
+ pop %ecx
+ xor %eax,%eax
+ shr $8,%ecx /* assume top 4 bits are zero */
+ mov %ecx,(%edx)
+ ret
+1:
+ push $-EFAULT /* saves 2 bytes of .text */
+ pop %eax
+ ret
+.LEND_vgetcpu:
+ .size __vgetcpu,.-.LSTART_vgetcpu
+ .previous
+
+/* ZZZ: need CFI annotations here */
--- 2.6.17-32.orig/arch/i386/kernel/vsyscall-int80.S
+++ 2.6.17-32/arch/i386/kernel/vsyscall-int80.S
@@ -51,3 +51,5 @@ __kernel_vsyscall:
* Get the common code for the sigreturn entry points.
*/
#include "vsyscall-sigreturn.S"
+
+#include "vsyscall-getcpu.S"
--- 2.6.17-32.orig/arch/i386/kernel/vsyscall-sysenter.S
+++ 2.6.17-32/arch/i386/kernel/vsyscall-sysenter.S
@@ -120,3 +120,5 @@ SYSENTER_RETURN:
* Get the common code for the sigreturn entry points.
*/
#include "vsyscall-sigreturn.S"
+
+#include "vsyscall-getcpu.S"
--- 2.6.17-32.orig/arch/i386/kernel/vsyscall.lds.S
+++ 2.6.17-32/arch/i386/kernel/vsyscall.lds.S
@@ -57,6 +57,7 @@ VERSION
__kernel_vsyscall;
__kernel_sigreturn;
__kernel_rt_sigreturn;
+ __vgetcpu;
local: *;
};
--- 2.6.17-32.orig/arch/i386/kernel/smpboot.c
+++ 2.6.17-32/arch/i386/kernel/smpboot.c
@@ -615,6 +615,7 @@ static inline void map_cpu_to_node(int c
printk("Mapping cpu %d to node %d\n", cpu, node);
cpu_set(cpu, node_2_cpu_mask[node]);
cpu_2_node[cpu] = node;
+ get_cpu_gdt_table(cpu)[GDT_ENTRY_VGETCPU].a |= (node & 0xff) << 8;
}
/* undo a mapping between cpu and node. */
@@ -626,6 +627,7 @@ static inline void unmap_cpu_to_node(int
for (node = 0; node < MAX_NUMNODES; node ++)
cpu_clear(cpu, node_2_cpu_mask[node]);
cpu_2_node[cpu] = 0;
+ get_cpu_gdt_table(cpu)[GDT_ENTRY_VGETCPU].a &= ~(0xff << 8);
}
#else /* !CONFIG_NUMA */
--- 2.6.17-32.orig/include/asm-i386/segment.h
+++ 2.6.17-32/include/asm-i386/segment.h
@@ -39,7 +39,7 @@
* 25 - APM BIOS support
*
* 26 - ESPFIX small SS
- * 27 - unused
+ * 27 - vgetcpu() data
* 28 - unused
* 29 - unused
* 30 - unused
@@ -74,6 +74,8 @@
#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ENTRY_VGETCPU (GDT_ENTRY_KERNEL_BASE + 15)
+
#define GDT_ENTRY_DOUBLEFAULT_TSS 31
/*
--- 2.6.17-32.orig/arch/i386/kernel/vsyscall-sigreturn.S
+++ 2.6.17-32/arch/i386/kernel/vsyscall-sigreturn.S
@@ -26,7 +26,7 @@ __kernel_sigreturn:
.LEND_sigreturn:
.size __kernel_sigreturn,.-.LSTART_sigreturn
- .balign 32
+ .balign 16
.globl __kernel_rt_sigreturn
.type __kernel_rt_sigreturn,@function
__kernel_rt_sigreturn:
@@ -35,7 +35,6 @@ __kernel_rt_sigreturn:
int $0x80
.LEND_rt_sigreturn:
.size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
- .balign 32
.previous
.section .eh_frame,"a",@progbits
--
Chuck
"You can't read a newspaper if you can't read." --George W. Bush
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [RFC, patch] i386: vgetcpu() for NUMA, take 2
2006-06-24 8:23 [RFC, patch] i386: vgetcpu() for NUMA, take 2 Chuck Ebbert
@ 2006-06-24 15:37 ` Martin J. Bligh
2006-06-25 15:19 ` Andi Kleen
0 siblings, 1 reply; 3+ messages in thread
From: Martin J. Bligh @ 2006-06-24 15:37 UTC (permalink / raw)
To: Chuck Ebbert
Cc: linux-kernel, Rohit Seth, Andi Kleen, Andrew Morton, Ingo Molnar
Chuck Ebbert wrote:
> This is attempt #2 at vgetcpu() NUMA support for i386. It uses a
> GDT entry to hold cpu and node number for fast userspace access.
>
> changes since #1:
> proper function prototype (same as x86_64)
> changed alignment of vsyscall functions to 16 bytes
> (sigreturn needs to stay fixed, others can move)
>
> to-do:
> CFI annotations
> test NUMA on real NUMA hardware (someone please test)
What was the point of returning wrong info to userspace really quickly?
;-) (ie you could have migrated CPUs, so it's totally unreliable)
Is this just for some statistical monitoring thing?
M.
> Test program:
>
> /* vgetcpu.c: test how fast vgetcpu runs
> * boot kernel with vgetcpu patch first, then:
> * gcc -O3 -o vgetcpu vgetcpu.c <srcpath>/arch/i386/kernel/vsyscall-int80.so
> * (don't forget the optimization (-O3))
> */
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <stdlib.h>
>
> extern int __attribute__ ((regparm(2))) __vgetcpu(int *cpu, int *node);
>
> #define rdtscll(t) asm("rdtsc" : "=A" (t))
>
> int main(int argc, char * const argv[])
> {
> long long tsc1, tsc2;
> int i, cpu = 999, node = 999, iters = 99999;
>
> if (__vgetcpu(&cpu, &node) || node == 999 || cpu == 999) {
> printf("vgetcpu failed!\n");
> _exit(1);
> }
> printf("node: %d, cpu: %d\n", node, cpu);
>
> rdtscll(tsc1);
> for (i = 0; i < iters; i++)
> __vgetcpu(&cpu, &node);
> rdtscll(tsc2);
>
> printf("vgetcpu took %llu clocks per call\n", (tsc2 - tsc1) / iters);
>
> return 0;
> }
>
>
> Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
>
> arch/i386/kernel/cpu/common.c | 3 ++
> arch/i386/kernel/head.S | 11 +++++++-
> arch/i386/kernel/smpboot.c | 2 +
> arch/i386/kernel/vsyscall-getcpu.S | 42 ++++++++++++++++++++++++++++++++++
> arch/i386/kernel/vsyscall-int80.S | 2 +
> arch/i386/kernel/vsyscall-sigreturn.S | 3 --
> arch/i386/kernel/vsyscall-sysenter.S | 2 +
> arch/i386/kernel/vsyscall.lds.S | 1
> include/asm-i386/segment.h | 4 ++-
> 9 files changed, 65 insertions(+), 5 deletions(-)
>
> --- 2.6.17-32.orig/arch/i386/kernel/cpu/common.c
> +++ 2.6.17-32/arch/i386/kernel/cpu/common.c
> @@ -642,6 +642,9 @@ void __cpuinit cpu_init(void)
> ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
> (CPU_16BIT_STACK_SIZE - 1);
>
> + /* Set up GDT entry for per-cpu data */
> + gdt[GDT_ENTRY_VGETCPU].a |= cpu & 0xff;
> +
> cpu_gdt_descr->size = GDT_SIZE - 1;
> cpu_gdt_descr->address = (unsigned long)gdt;
>
> --- 2.6.17-32.orig/arch/i386/kernel/head.S
> +++ 2.6.17-32/arch/i386/kernel/head.S
> @@ -480,7 +480,7 @@ ENTRY(boot_gdt_table)
> .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
>
> /*
> - * The Global Descriptor Table contains 28 quadwords, per-CPU.
> + * The Global Descriptor Table contains 32 quadwords, per-CPU.
> */
> .align L1_CACHE_BYTES
> ENTRY(cpu_gdt_table)
> @@ -525,7 +525,14 @@ ENTRY(cpu_gdt_table)
> .quad 0x004092000000ffff /* 0xc8 APM DS data */
>
> .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */
> - .quad 0x0000000000000000 /* 0xd8 - unused */
> +
> + /*
> + * Use GDT entries to store per-cpu data for user space (DPL 3.)
> + * 32-bit data segment, byte granularity, base 0, limit set at runtime.
> + * Userspace will use LSL to access this data, stored in the limit field.
> + */
> + .quad 0x0040f20000000000 /* 0xd8 - nodeid and logical CPU number */
> +
> .quad 0x0000000000000000 /* 0xe0 - unused */
> .quad 0x0000000000000000 /* 0xe8 - unused */
> .quad 0x0000000000000000 /* 0xf0 - unused */
> --- /dev/null
> +++ 2.6.17-32/arch/i386/kernel/vsyscall-getcpu.S
> @@ -0,0 +1,42 @@
> +/*
> + * fastcall int __vgetcpu(int *cpu, int *node)
> + *
> + * This file is #include'd by vsyscall-*.S to place vgetcpu after the
> + * sigreturn code.
> + *
> + * Puts logical CPU number in *cpu, node ID in *node;
> + * returns 0 for success and -EFAULT on error.
> + *
> + * CPU number and node ID are 8 bits each, with 4 total bits available
> + * for future growth of either field.
> + */
> +
> +#include <linux/errno.h>
> +#include <asm/segment.h>
> +
> + .text
> + .balign 16
> + .globl __vgetcpu
> + .type __vgetcpu,@function
> +__vgetcpu:
> +.LSTART_vgetcpu:
> + mov $((GDT_ENTRY_VGETCPU<<3)|3),%cx
> + lsl %ecx,%ecx
> + jnz 1f
> + push %ecx
> + and $0xff,%ecx /* 8-bit cpu number */
> + mov %ecx,(%eax)
> + pop %ecx
> + xor %eax,%eax
> + shr $8,%ecx /* assume top 4 bits are zero */
> + mov %ecx,(%edx)
> + ret
> +1:
> + push $-EFAULT /* saves 2 bytes of .text */
> + pop %eax
> + ret
> +.LEND_vgetcpu:
> + .size __vgetcpu,.-.LSTART_vgetcpu
> + .previous
> +
> +/* ZZZ: need CFI annotations here */
> --- 2.6.17-32.orig/arch/i386/kernel/vsyscall-int80.S
> +++ 2.6.17-32/arch/i386/kernel/vsyscall-int80.S
> @@ -51,3 +51,5 @@ __kernel_vsyscall:
> * Get the common code for the sigreturn entry points.
> */
> #include "vsyscall-sigreturn.S"
> +
> +#include "vsyscall-getcpu.S"
> --- 2.6.17-32.orig/arch/i386/kernel/vsyscall-sysenter.S
> +++ 2.6.17-32/arch/i386/kernel/vsyscall-sysenter.S
> @@ -120,3 +120,5 @@ SYSENTER_RETURN:
> * Get the common code for the sigreturn entry points.
> */
> #include "vsyscall-sigreturn.S"
> +
> +#include "vsyscall-getcpu.S"
> --- 2.6.17-32.orig/arch/i386/kernel/vsyscall.lds.S
> +++ 2.6.17-32/arch/i386/kernel/vsyscall.lds.S
> @@ -57,6 +57,7 @@ VERSION
> __kernel_vsyscall;
> __kernel_sigreturn;
> __kernel_rt_sigreturn;
> + __vgetcpu;
>
> local: *;
> };
> --- 2.6.17-32.orig/arch/i386/kernel/smpboot.c
> +++ 2.6.17-32/arch/i386/kernel/smpboot.c
> @@ -615,6 +615,7 @@ static inline void map_cpu_to_node(int c
> printk("Mapping cpu %d to node %d\n", cpu, node);
> cpu_set(cpu, node_2_cpu_mask[node]);
> cpu_2_node[cpu] = node;
> + get_cpu_gdt_table(cpu)[GDT_ENTRY_VGETCPU].a |= (node & 0xff) << 8;
> }
>
> /* undo a mapping between cpu and node. */
> @@ -626,6 +627,7 @@ static inline void unmap_cpu_to_node(int
> for (node = 0; node < MAX_NUMNODES; node ++)
> cpu_clear(cpu, node_2_cpu_mask[node]);
> cpu_2_node[cpu] = 0;
> + get_cpu_gdt_table(cpu)[GDT_ENTRY_VGETCPU].a &= ~(0xff << 8);
> }
> #else /* !CONFIG_NUMA */
>
> --- 2.6.17-32.orig/include/asm-i386/segment.h
> +++ 2.6.17-32/include/asm-i386/segment.h
> @@ -39,7 +39,7 @@
> * 25 - APM BIOS support
> *
> * 26 - ESPFIX small SS
> - * 27 - unused
> + * 27 - vgetcpu() data
> * 28 - unused
> * 29 - unused
> * 30 - unused
> @@ -74,6 +74,8 @@
> #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
> #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
>
> +#define GDT_ENTRY_VGETCPU (GDT_ENTRY_KERNEL_BASE + 15)
> +
> #define GDT_ENTRY_DOUBLEFAULT_TSS 31
>
> /*
> --- 2.6.17-32.orig/arch/i386/kernel/vsyscall-sigreturn.S
> +++ 2.6.17-32/arch/i386/kernel/vsyscall-sigreturn.S
> @@ -26,7 +26,7 @@ __kernel_sigreturn:
> .LEND_sigreturn:
> .size __kernel_sigreturn,.-.LSTART_sigreturn
>
> - .balign 32
> + .balign 16
> .globl __kernel_rt_sigreturn
> .type __kernel_rt_sigreturn,@function
> __kernel_rt_sigreturn:
> @@ -35,7 +35,6 @@ __kernel_rt_sigreturn:
> int $0x80
> .LEND_rt_sigreturn:
> .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
> - .balign 32
> .previous
>
> .section .eh_frame,"a",@progbits
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [RFC, patch] i386: vgetcpu() for NUMA, take 2
2006-06-24 15:37 ` Martin J. Bligh
@ 2006-06-25 15:19 ` Andi Kleen
0 siblings, 0 replies; 3+ messages in thread
From: Andi Kleen @ 2006-06-25 15:19 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Chuck Ebbert, linux-kernel, Rohit Seth, Andrew Morton, Ingo Molnar
> What was the point of returning wrong info to userspace really quickly?
Read the long rationale in my original vgetcpu() post.
-Andi
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2006-06-25 15:22 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-06-24 8:23 [RFC, patch] i386: vgetcpu() for NUMA, take 2 Chuck Ebbert
2006-06-24 15:37 ` Martin J. Bligh
2006-06-25 15:19 ` Andi Kleen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).