linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Add VDSO time function support for x86 32-bit kernel
@ 2012-12-11 16:11 stefani
  2012-12-11 19:27 ` John Stultz
  2012-12-11 19:37 ` Andy Lutomirski
  0 siblings, 2 replies; 27+ messages in thread
From: stefani @ 2012-12-11 16:11 UTC (permalink / raw)
  To: linux-kernel, ak, aarcange, john.stultz, luto; +Cc: stefani

From: Stefani Seibold <stefani@seibold.net>

This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
and vdso_time() support to the VDSO for x86 32-bit kernels.

The reason to do this was to get a fast reliable time stamp. Many developers
uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
time functions a fast and reliable way, because the kernel knows the best time
source and the P- and C-state of the CPU.

For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
the ACPI timer should be easily to add with an other patch.

The helper library to use the VDSO functions can be download at
http://http://seibold.net/vdso.c
The libary is very small, only 228 lines of code. Compile it with
gcc -Wall -O3 -fpic vdso.c -lrt -shared -o libvdso.so
and use it with LD_PRELOAD=<path>/libvdso.so

This kind of helper must be integrated into glibc, for x86 64-bit and
PowerPC it is already there.

Some benchmark results (all measurements are in nano seconds):

Intel(R) Celeron(TM) CPU 400MHz

Average time kernel call:
 gettimeofday(): 1039
 clock_gettime(): 1578
 benchmark time(): 526
Average time VDSO call:
 gettimeofday(): 378
 clock_gettime(): 303
 benchmark time(): 60

Celeron(R) Dual-Core CPU T3100 1.90GHz

Average time kernel call:
 gettimeofday(): 209
 clock_gettime(): 406
 benchmark time(): 135
Average time VDSO call:
 gettimeofday(): 51
 clock_gettime(): 43
 benchmark time(): 10

So you can see a performance increase between 4 and 13, depending on the
CPU and the function.

The patch is against kernel 3.7. Please apply if you like it.

Changelog:

25.11.2012 - first release and prove of concept for linux 3.4
11.12.2012 - Port to linux 3.7 and code cleanup

Signed-off-by: Stefani Seibold <stefani@seibold.net>
---
 arch/x86/Kconfig                      |  4 +-
 arch/x86/include/asm/clocksource.h    |  4 --
 arch/x86/include/asm/compat.h         |  5 ++
 arch/x86/include/asm/fixmap.h         |  3 +-
 arch/x86/include/asm/vgtod.h          |  1 +
 arch/x86/include/asm/vvar.h           |  7 +++
 arch/x86/kernel/Makefile              |  1 +
 arch/x86/kernel/hpet.c                |  9 ++--
 arch/x86/kernel/setup.c               |  2 +
 arch/x86/kernel/tsc.c                 |  2 -
 arch/x86/kernel/vmlinux.lds.S         |  3 --
 arch/x86/kernel/vsyscall_64.c         | 49 ------------------
 arch/x86/kernel/vsyscall_gtod.c       | 94 +++++++++++++++++++++++++++++++++++
 arch/x86/vdso/Makefile                |  1 +
 arch/x86/vdso/vclock_gettime.c        | 16 +++++-
 arch/x86/vdso/vdso32/vclock_gettime.c |  7 +++
 arch/x86/vdso/vdso32/vdso32.lds.S     | 15 ++++++
 17 files changed, 158 insertions(+), 65 deletions(-)
 create mode 100644 arch/x86/kernel/vsyscall_gtod.c
 create mode 100644 arch/x86/vdso/vdso32/vclock_gettime.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff..b8c2c74 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -100,9 +100,9 @@ config X86
 	select GENERIC_CMOS_UPDATE
 	select CLOCKSOURCE_WATCHDOG
 	select GENERIC_CLOCKEVENTS
-	select ARCH_CLOCKSOURCE_DATA if X86_64
+	select ARCH_CLOCKSOURCE_DATA
 	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
-	select GENERIC_TIME_VSYSCALL if X86_64
+	select GENERIC_TIME_VSYSCALL
 	select KTIME_SCALAR if X86_32
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3..67d68b9 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -3,8 +3,6 @@
 #ifndef _ASM_X86_CLOCKSOURCE_H
 #define _ASM_X86_CLOCKSOURCE_H
 
-#ifdef CONFIG_X86_64
-
 #define VCLOCK_NONE 0  /* No vDSO clock available.	*/
 #define VCLOCK_TSC  1  /* vDSO should use vread_tsc.	*/
 #define VCLOCK_HPET 2  /* vDSO should use vread_hpet.	*/
@@ -13,6 +11,4 @@ struct arch_clocksource_data {
 	int vclock_mode;
 };
 
-#endif /* CONFIG_X86_64 */
-
 #endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c40..45ba688 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -295,6 +295,10 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
 
 static inline void __user *arch_compat_alloc_user_space(long len)
 {
+#ifdef CONFIG_X86_32
+	struct pt_regs *regs = task_pt_regs(current);
+	return (void __user *)regs->sp - len;
+#else
 	compat_uptr_t sp;
 
 	if (test_thread_flag(TIF_IA32)) {
@@ -305,6 +309,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
 	}
 
 	return (void __user *)round_down(sp - len, 16);
+#endif
 }
 
 static inline bool is_x32_task(void)
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c..b26e9e0 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,9 +78,10 @@ enum fixed_addresses {
 	VSYSCALL_LAST_PAGE,
 	VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
 			    + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+#endif
 	VVAR_PAGE,
 	VSYSCALL_HPET,
-#endif
+
 	FIX_DBGP_BASE,
 	FIX_EARLYCON_MEM_BASE,
 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 46e24d3..eb87b53 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -27,4 +27,5 @@ struct vsyscall_gtod_data {
 };
 extern struct vsyscall_gtod_data vsyscall_gtod_data;

+extern void map_vgtod(void);
 #endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index de656ac..6f71098 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -17,7 +17,11 @@
  */
 
 /* Base address of vvars.  This is not ABI. */
+#ifdef CONFIG_X86_64
 #define VVAR_ADDRESS (-10*1024*1024 - 4096)
+#else
+#define VVAR_ADDRESS 0xffffd000
+#endif
 
 #if defined(__VVAR_KERNEL_LDS)
 
@@ -46,5 +50,8 @@
 DECLARE_VVAR(0, volatile unsigned long, jiffies)
 DECLARE_VVAR(16, int, vgetcpu_mode)
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+#ifdef CONFIG_X86_32
+DECLARE_VVAR(512, const void __iomem *, vsyscall_hpet)
+#endif
 
 #undef DECLARE_VVAR
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 91ce48f..298a0b1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,6 +26,7 @@ obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-y			+= syscall_$(BITS).o
+obj-y			+= vsyscall_gtod.o
 obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 1460a5d..38887ca 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -69,13 +69,18 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
 
 #ifdef CONFIG_X86_64
 #include <asm/pgtable.h>
+#else
+#include <asm/vvar.h>
+
+DEFINE_VVAR(const void __iomem *, vsyscall_hpet);
 #endif
 
 static inline void hpet_set_mapping(void)
 {
 	hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-#ifdef CONFIG_X86_64
 	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
+#ifdef CONFIG_X86_32
+	vsyscall_hpet = (const void __iomem *)fix_to_virt(VSYSCALL_HPET);
 #endif
 }
 
@@ -752,9 +757,7 @@ static struct clocksource clocksource_hpet = {
 	.mask		= HPET_MASK,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 	.resume		= hpet_resume_counter,
-#ifdef CONFIG_X86_64
 	.archdata	= { .vclock_mode = VCLOCK_HPET },
-#endif
 };
 
 static int hpet_clocksource_register(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ca45696..c2f6bbb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -114,6 +114,7 @@
 #include <asm/mce.h>
 #include <asm/alternative.h>
 #include <asm/prom.h>
+#include <asm/vgtod.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -997,6 +998,7 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_X86_64
 	map_vsyscall();
 #endif
+	map_vgtod();
 
 	generic_apic_probe();
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cfa5d4f..078cc9a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -772,9 +772,7 @@ static struct clocksource clocksource_tsc = {
 	.mask                   = CLOCKSOURCE_MASK(64),
 	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_MUST_VERIFY,
-#ifdef CONFIG_X86_64
 	.archdata               = { .vclock_mode = VCLOCK_TSC },
-#endif
 };
 
 void mark_tsc_unstable(char *reason)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 22a1530..d73c1df 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -151,7 +151,6 @@ SECTIONS
 		_edata = .;
 	} :data
 
-#ifdef CONFIG_X86_64
 
 	. = ALIGN(PAGE_SIZE);
 	__vvar_page = .;
@@ -173,8 +172,6 @@ SECTIONS
 
        . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
 
-#endif /* CONFIG_X86_64 */
-
 	/* Init code and data - will be freed after init */
 	. = ALIGN(PAGE_SIZE);
 	.init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3a3e8c9..dfc9727 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -54,7 +54,6 @@
 #include "vsyscall_trace.h"
 
 DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
 
 static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
 
@@ -77,48 +76,6 @@ static int __init vsyscall_setup(char *str)
 }
 early_param("vsyscall", vsyscall_setup);
 
-void update_vsyscall_tz(void)
-{
-	vsyscall_gtod_data.sys_tz = sys_tz;
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
-	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
-
-	write_seqcount_begin(&vdata->seq);
-
-	/* copy vsyscall data */
-	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
-	vdata->clock.cycle_last		= tk->clock->cycle_last;
-	vdata->clock.mask		= tk->clock->mask;
-	vdata->clock.mult		= tk->mult;
-	vdata->clock.shift		= tk->shift;
-
-	vdata->wall_time_sec		= tk->xtime_sec;
-	vdata->wall_time_snsec		= tk->xtime_nsec;
-
-	vdata->monotonic_time_sec	= tk->xtime_sec
-					+ tk->wall_to_monotonic.tv_sec;
-	vdata->monotonic_time_snsec	= tk->xtime_nsec
-					+ (tk->wall_to_monotonic.tv_nsec
-						<< tk->shift);
-	while (vdata->monotonic_time_snsec >=
-					(((u64)NSEC_PER_SEC) << tk->shift)) {
-		vdata->monotonic_time_snsec -=
-					((u64)NSEC_PER_SEC) << tk->shift;
-		vdata->monotonic_time_sec++;
-	}
-
-	vdata->wall_time_coarse.tv_sec	= tk->xtime_sec;
-	vdata->wall_time_coarse.tv_nsec	= (long)(tk->xtime_nsec >> tk->shift);
-
-	vdata->monotonic_time_coarse	= timespec_add(vdata->wall_time_coarse,
-							tk->wall_to_monotonic);
-
-	write_seqcount_end(&vdata->seq);
-}
-
 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 			      const char *message)
 {
@@ -366,8 +323,6 @@ void __init map_vsyscall(void)
 {
 	extern char __vsyscall_page;
 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
-	extern char __vvar_page;
-	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
 
 	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
 		     vsyscall_mode == NATIVE
@@ -375,10 +330,6 @@ void __init map_vsyscall(void)
 		     : PAGE_KERNEL_VVAR);
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
 		     (unsigned long)VSYSCALL_START);
-
-	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
-	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
-		     (unsigned long)VVAR_ADDRESS);
 }
 
 static int __init vsyscall_init(void)
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
new file mode 100644
index 0000000..3631f7f
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -0,0 +1,94 @@
+/*
+ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ *  Modified for x86 32 bit architecture by
+ *  Stefani Seibold <stefani@seibold.net>
+ *
+ *  Thanks to hpa@transmeta.com for some useful hint.
+ *  Special thanks to Ingo Molnar for his early experience with
+ *  a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ */
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/seqlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysctl.h>
+#include <linux/topology.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/ratelimit.h>
+
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/compat.h>
+#include <asm/page.h>
+#include <asm/unistd.h>
+#include <asm/fixmap.h>
+#include <asm/errno.h>
+#include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
+#include <asm/vgtod.h>
+#include <asm/traps.h>
+
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+
+void update_vsyscall_tz(void)
+{
+	vsyscall_gtod_data.sys_tz = sys_tz;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+
+	write_seqcount_begin(&vdata->seq);
+
+	/* copy vsyscall data */
+	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
+	vdata->clock.cycle_last		= tk->clock->cycle_last;
+	vdata->clock.mask		= tk->clock->mask;
+	vdata->clock.mult		= tk->mult;
+	vdata->clock.shift		= tk->shift;
+
+	vdata->wall_time_sec		= tk->xtime_sec;
+	vdata->wall_time_snsec		= tk->xtime_nsec;
+
+	vdata->monotonic_time_sec	= tk->xtime_sec
+					+ tk->wall_to_monotonic.tv_sec;
+	vdata->monotonic_time_snsec	= tk->xtime_nsec
+					+ (tk->wall_to_monotonic.tv_nsec
+						<< tk->shift);
+	while (vdata->monotonic_time_snsec >=
+					(((u64)NSEC_PER_SEC) << tk->shift)) {
+		vdata->monotonic_time_snsec -=
+					((u64)NSEC_PER_SEC) << tk->shift;
+		vdata->monotonic_time_sec++;
+	}
+
+	vdata->wall_time_coarse.tv_sec	= tk->xtime_sec;
+	vdata->wall_time_coarse.tv_nsec	= (long)(tk->xtime_nsec >> tk->shift);
+
+	vdata->monotonic_time_coarse	= timespec_add(vdata->wall_time_coarse,
+							tk->wall_to_monotonic);
+
+	write_seqcount_end(&vdata->seq);
+}
+
+void __init map_vgtod(void)
+{
+	extern char __vvar_page;
+	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
+
+	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+		     (unsigned long)VVAR_ADDRESS);
+#endif
+}
+
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index fd14be1..959221b 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -147,6 +147,7 @@ $(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
 
 $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
 				 $(obj)/vdso32/vdso32.lds \
+				 $(obj)/vdso32/vclock_gettime.o \
 				 $(obj)/vdso32/note.o \
 				 $(obj)/vdso32/%.o
 	$(call if_changed,vdso)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c37..2dc6b72 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
 
 static notrace cycle_t vread_hpet(void)
 {
+#ifdef CONFIG_X86_64
 	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+#else
+	return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
+#endif
 }
 
 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
+#ifdef CONFIG_X86_64
 	asm("syscall" : "=a" (ret) :
 	    "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
+#else
+	asm("int $0x80" : "=a" (ret) :
+	    "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
+#endif
 	return ret;
 }
 
@@ -74,15 +83,20 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 {
 	long ret;
 
+#ifdef CONFIG_X86_64
 	asm("syscall" : "=a" (ret) :
 	    "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
+#else
+	asm("int $0x80" : "=a" (ret) :
+	    "a" (__NR_gettimeofday), "b" (tv), "c" (tz) : "memory");
+#endif
 	return ret;
 }
 
 
 notrace static inline u64 vgetsns(void)
 {
-	long v;
+	u64 v;
 	cycles_t cycles;
 	if (gtod->clock.vclock_mode == VCLOCK_TSC)
 		cycles = vread_tsc();
diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
new file mode 100644
index 0000000..c9a1909
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vclock_gettime.c
@@ -0,0 +1,7 @@
+/*
+ * since vgtod layout differs between X86_64 and x86_32, it is not possible to
+ * provide a 32 bit vclock with a 64 bit kernel
+ */
+#ifdef CONFIG_X86_32
+#include "../vclock_gettime.c"
+#endif
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S
index 976124b..80ef0d6 100644
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/vdso/vdso32/vdso32.lds.S
@@ -24,6 +24,16 @@ VERSION
 		__kernel_vsyscall;
 		__kernel_sigreturn;
 		__kernel_rt_sigreturn;
+#ifdef CONFIG_X86_32
+		clock_gettime;
+		__vdso_clock_gettime;
+		gettimeofday;
+		__vdso_gettimeofday;
+		gettimeofday;
+		__vdso_gettimeofday;
+		time;
+		__vdso_time;
+#endif
 	local: *;
 	};
 }
@@ -35,3 +45,8 @@ VDSO32_PRELINK		= VDSO_PRELINK;
 VDSO32_vsyscall		= __kernel_vsyscall;
 VDSO32_sigreturn	= __kernel_sigreturn;
 VDSO32_rt_sigreturn	= __kernel_rt_sigreturn;
+#ifdef CONFIG_X86_32
+VDSO32_clock_gettime	= clock_gettime;
+VDSO32_gettimeofday	= gettimeofday;
+VDSO32_time		= time;
+#endif
-- 
1.8.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 16:11 [PATCH] Add VDSO time function support for x86 32-bit kernel stefani
@ 2012-12-11 19:27 ` John Stultz
  2012-12-11 19:37   ` Andy Lutomirski
  2012-12-11 20:54   ` Stefani Seibold
  2012-12-11 19:37 ` Andy Lutomirski
  1 sibling, 2 replies; 27+ messages in thread
From: John Stultz @ 2012-12-11 19:27 UTC (permalink / raw)
  To: stefani; +Cc: linux-kernel, ak, aarcange, luto

On 12/11/2012 08:11 AM, stefani@seibold.net wrote:
> From: Stefani Seibold <stefani@seibold.net>
>
> This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
> and vdso_time() support to the VDSO for x86 32-bit kernels.
>
> The reason to do this was to get a fast reliable time stamp. Many developers
> uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
> time functions a fast and reliable way, because the kernel knows the best time
> source and the P- and C-state of the CPU.
Very cool. There have been similar implementations of this patch over 
the years, but they were all bit more hackish then this.


> For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
> the ACPI timer should be easily to add with an other patch.
Although the ACPI PM timer requires port-io which would need tweaking to 
allow normal users to access it. And I'm not sure if the performance 
would be much improved, as the port-io probably dominates the 
performance cost.

> The helper library to use the VDSO functions can be download at
> http://http://seibold.net/vdso.c
> The libary is very small, only 228 lines of code. Compile it with
> gcc -Wall -O3 -fpic vdso.c -lrt -shared -o libvdso.so
> and use it with LD_PRELOAD=<path>/libvdso.so
>
> This kind of helper must be integrated into glibc, for x86 64-bit and
> PowerPC it is already there.
>
A few notes below...


> diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
> index 59c6c40..45ba688 100644
> --- a/arch/x86/include/asm/compat.h
> +++ b/arch/x86/include/asm/compat.h
> @@ -295,6 +295,10 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
>   
>   static inline void __user *arch_compat_alloc_user_space(long len)
>   {
> +#ifdef CONFIG_X86_32
> +	struct pt_regs *regs = task_pt_regs(current);
> +	return (void __user *)regs->sp - len;
> +#else
>   	compat_uptr_t sp;
>   
>   	if (test_thread_flag(TIF_IA32)) {
> @@ -305,6 +309,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
>   	}
>   
>   	return (void __user *)round_down(sp - len, 16);
> +#endif
>   }

This style of in-line ifdefs are ugly and hard to read.

So instead of doing:
void myfunction (void)
{
#ifdef 32bits
     32_bit_implementation();
#else
     64_bit_implementation();
#endif
}

Where possible, please do:
#ifdef 32bits
void myfunction1(void)
{
     32_bit implementation();
}
void myfunction2(void)
{
....
#else /*64 bit versions */
void myfunction1(void)
{
     64_bit implementation();
}
void myfunction2(void)
....
#endif

> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> index 4df6c37..2dc6b72 100644
> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
>   
>   static notrace cycle_t vread_hpet(void)
>   {
> +#ifdef CONFIG_X86_64
>   	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
> +#else
> +	return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
> +#endif
>   }
>   
>   notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
>   {
>   	long ret;
> +#ifdef CONFIG_X86_64
>   	asm("syscall" : "=a" (ret) :
>   	    "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
> +#else
> +	asm("int $0x80" : "=a" (ret) :
> +	    "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
> +#endif
>   	return ret;
>   }
Same point here.


> diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
> new file mode 100644
> index 0000000..c9a1909
> --- /dev/null
> +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
> @@ -0,0 +1,7 @@
> +/*
> + * since vgtod layout differs between X86_64 and x86_32, it is not possible to
> + * provide a 32 bit vclock with a 64 bit kernel
> + */
> +#ifdef CONFIG_X86_32
> +#include "../vclock_gettime.c"
> +#endif
Could you expand a bit as to why a compat layer isn't possible? It seems 
we could easily convert the vsyscall_gtod_data to a more explicit 
arch-neutral size. Or is it the actual data page mapping?

thanks
-john

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 16:11 [PATCH] Add VDSO time function support for x86 32-bit kernel stefani
  2012-12-11 19:27 ` John Stultz
@ 2012-12-11 19:37 ` Andy Lutomirski
  2012-12-11 20:40   ` Stefani Seibold
  1 sibling, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2012-12-11 19:37 UTC (permalink / raw)
  To: stefani; +Cc: linux-kernel, ak, aarcange, john.stultz

On Tue, Dec 11, 2012 at 8:11 AM,  <stefani@seibold.net> wrote:
> From: Stefani Seibold <stefani@seibold.net>
>
> This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
> and vdso_time() support to the VDSO for x86 32-bit kernels.
>
> The reason to do this was to get a fast reliable time stamp. Many developers
> uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
> time functions a fast and reliable way, because the kernel knows the best time
> source and the P- and C-state of the CPU.
>
> For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
> the ACPI timer should be easily to add with an other patch.
>
> The helper library to use the VDSO functions can be download at
> http://http://seibold.net/vdso.c

Wow -- another implementation.  See Documentation/vDSO/parse_vdso.c.
(Also, I think your code will break if anyone ever strips the vdso.)

> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
>
>  static notrace cycle_t vread_hpet(void)
>  {
> +#ifdef CONFIG_X86_64
>         return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
> +#else
> +       return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
> +#endif
>  }

Is 0xf0 not equal to HPET_COUNTER?

>
>  notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
>  {
>         long ret;
> +#ifdef CONFIG_X86_64
>         asm("syscall" : "=a" (ret) :
>             "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
> +#else
> +       asm("int $0x80" : "=a" (ret) :
> +           "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
> +#endif
>         return ret;
>  }

__kernel_vsyscall is probably much faster if you can figure out how to
call it from here :)

>
> @@ -74,15 +83,20 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
>  {
>         long ret;
>
> +#ifdef CONFIG_X86_64
>         asm("syscall" : "=a" (ret) :
>             "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
> +#else
> +       asm("int $0x80" : "=a" (ret) :
> +           "a" (__NR_gettimeofday), "b" (tv), "c" (tz) : "memory");
> +#endif
>         return ret;
>  }
>

Ditto.

> --- a/arch/x86/vdso/vdso32/vdso32.lds.S
> +++ b/arch/x86/vdso/vdso32/vdso32.lds.S
> @@ -24,6 +24,16 @@ VERSION
>                 __kernel_vsyscall;
>                 __kernel_sigreturn;
>                 __kernel_rt_sigreturn;
> +#ifdef CONFIG_X86_32
> +               clock_gettime;
> +               __vdso_clock_gettime;
> +               gettimeofday;
> +               __vdso_gettimeofday;
> +               gettimeofday;
> +               __vdso_gettimeofday;
> +               time;
> +               __vdso_time;
> +#endif

Please remove the non-__vdso versions.  They're not useful and x86-64
only has them for backwards compatibility.

>         local: *;
>         };
>  }
> @@ -35,3 +45,8 @@ VDSO32_PRELINK                = VDSO_PRELINK;
>  VDSO32_vsyscall                = __kernel_vsyscall;
>  VDSO32_sigreturn       = __kernel_sigreturn;
>  VDSO32_rt_sigreturn    = __kernel_rt_sigreturn;
> +#ifdef CONFIG_X86_32
> +VDSO32_clock_gettime   = clock_gettime;
> +VDSO32_gettimeofday    = gettimeofday;
> +VDSO32_time            = time;
> +#endif

Yikes.  What's this?  (Just curious.)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 19:27 ` John Stultz
@ 2012-12-11 19:37   ` Andy Lutomirski
  2012-12-11 20:54   ` Stefani Seibold
  1 sibling, 0 replies; 27+ messages in thread
From: Andy Lutomirski @ 2012-12-11 19:37 UTC (permalink / raw)
  To: John Stultz; +Cc: stefani, linux-kernel, ak, aarcange

On Tue, Dec 11, 2012 at 11:27 AM, John Stultz <john.stultz@linaro.org> wrote:
> On 12/11/2012 08:11 AM, stefani@seibold.net wrote:
>>
>> From: Stefani Seibold <stefani@seibold.net>
>>
>> This small patch add the functions vdso_gettimeofday(),
>> vdso_clock_gettime()
>> and vdso_time() support to the VDSO for x86 32-bit kernels.
>>
>> The reason to do this was to get a fast reliable time stamp. Many
>> developers
>> uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
>> time functions a fast and reliable way, because the kernel knows the best
>> time
>> source and the P- and C-state of the CPU.
>
> Very cool. There have been similar implementations of this patch over the
> years, but they were all bit more hackish then this.
>
>
>
>> For x86 the vclock_gettime.c currently supports only the HPET and TSC
>> timer,
>> the ACPI timer should be easily to add with an other patch.
>
> Although the ACPI PM timer requires port-io which would need tweaking to
> allow normal users to access it. And I'm not sure if the performance would
> be much improved, as the port-io probably dominates the performance cost.
>

I'd actually advocate going the other way and removing hpet vdso
support.  Last time I benchmarked this, a syscall took about 20ns on
my box, and reading the hpet counter took about 750ns, giving
approximately no gain to the hpet-via-vdso optimization.

--Andy

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 19:37 ` Andy Lutomirski
@ 2012-12-11 20:40   ` Stefani Seibold
  2012-12-12  1:29     ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: Stefani Seibold @ 2012-12-11 20:40 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: linux-kernel, ak, aarcange, john.stultz

Am Dienstag, den 11.12.2012, 11:37 -0800 schrieb Andy Lutomirski:
> On Tue, Dec 11, 2012 at 8:11 AM,  <stefani@seibold.net> wrote:
> > From: Stefani Seibold <stefani@seibold.net>
> >
> > This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
> > and vdso_time() support to the VDSO for x86 32-bit kernels.
> >
> > The reason to do this was to get a fast reliable time stamp. Many developers
> > uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
> > time functions a fast and reliable way, because the kernel knows the best time
> > source and the P- and C-state of the CPU.
> >
> > For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
> > the ACPI timer should be easily to add with an other patch.
> >
> > The helper library to use the VDSO functions can be download at
> > http://http://seibold.net/vdso.c
> 
> Wow -- another implementation.  See Documentation/vDSO/parse_vdso.c.
> (Also, I think your code will break if anyone ever strips the vdso.)
> 
> > --- a/arch/x86/vdso/vclock_gettime.c
> > +++ b/arch/x86/vdso/vclock_gettime.c
> > @@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
> >
> >  static notrace cycle_t vread_hpet(void)
> >  {
> > +#ifdef CONFIG_X86_64
> >         return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
> > +#else
> > +       return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
> > +#endif
> >  }
> 
> Is 0xf0 not equal to HPET_COUNTER?
> 

Yes, but HPET_COUNTER is more readable.

> >
> >  notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
> >  {
> >         long ret;
> > +#ifdef CONFIG_X86_64
> >         asm("syscall" : "=a" (ret) :
> >             "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
> > +#else
> > +       asm("int $0x80" : "=a" (ret) :
> > +           "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
> > +#endif
> >         return ret;
> >  }
> 
> __kernel_vsyscall is probably much faster if you can figure out how to
> call it from here :)
> 

Yes i know. Thats one of my problems, because i cannot call
__kernel_vsyscall directly due the relocation. Any idea?

> >
> > @@ -74,15 +83,20 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
> >  {
> >         long ret;
> >
> > +#ifdef CONFIG_X86_64
> >         asm("syscall" : "=a" (ret) :
> >             "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
> > +#else
> > +       asm("int $0x80" : "=a" (ret) :
> > +           "a" (__NR_gettimeofday), "b" (tv), "c" (tz) : "memory");
> > +#endif
> >         return ret;
> >  }
> >
> 
> Ditto.
> 
> > --- a/arch/x86/vdso/vdso32/vdso32.lds.S
> > +++ b/arch/x86/vdso/vdso32/vdso32.lds.S
> > @@ -24,6 +24,16 @@ VERSION
> >                 __kernel_vsyscall;
> >                 __kernel_sigreturn;
> >                 __kernel_rt_sigreturn;
> > +#ifdef CONFIG_X86_32
> > +               clock_gettime;
> > +               __vdso_clock_gettime;
> > +               gettimeofday;
> > +               __vdso_gettimeofday;
> > +               gettimeofday;
> > +               __vdso_gettimeofday;
> > +               time;
> > +               __vdso_time;
> > +#endif
> 
> Please remove the non-__vdso versions.  They're not useful and x86-64
> only has them for backwards compatibility.
> 
> >         local: *;
> >         };
> >  }
> > @@ -35,3 +45,8 @@ VDSO32_PRELINK                = VDSO_PRELINK;
> >  VDSO32_vsyscall                = __kernel_vsyscall;
> >  VDSO32_sigreturn       = __kernel_sigreturn;
> >  VDSO32_rt_sigreturn    = __kernel_rt_sigreturn;
> > +#ifdef CONFIG_X86_32
> > +VDSO32_clock_gettime   = clock_gettime;
> > +VDSO32_gettimeofday    = gettimeofday;
> > +VDSO32_time            = time;
> > +#endif
> 
> Yikes.  What's this?  (Just curious.)

I don't know if anyone needs this. Maybe i can kick it away.



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 19:27 ` John Stultz
  2012-12-11 19:37   ` Andy Lutomirski
@ 2012-12-11 20:54   ` Stefani Seibold
  2012-12-11 21:18     ` Andy Lutomirski
  1 sibling, 1 reply; 27+ messages in thread
From: Stefani Seibold @ 2012-12-11 20:54 UTC (permalink / raw)
  To: John Stultz; +Cc: linux-kernel, ak, aarcange, luto

Am Dienstag, den 11.12.2012, 11:27 -0800 schrieb John Stultz:
> On 12/11/2012 08:11 AM, stefani@seibold.net wrote:
> > From: Stefani Seibold <stefani@seibold.net>
> >
> > This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
> > and vdso_time() support to the VDSO for x86 32-bit kernels.
> >
> > The reason to do this was to get a fast reliable time stamp. Many developers
> > uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
> > time functions a fast and reliable way, because the kernel knows the best time
> > source and the P- and C-state of the CPU.
> Very cool. There have been similar implementations of this patch over 
> the years, but they were all bit more hackish then this.
> 
Thanks, i think it is not more hackish than the x86 64 code.

> 
> > For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
> > the ACPI timer should be easily to add with an other patch.
> Although the ACPI PM timer requires port-io which would need tweaking to 
> allow normal users to access it. And I'm not sure if the performance 
> would be much improved, as the port-io probably dominates the 
> performance cost.
> 

It was only an idea. I think it would be easy to give the ioperm read
for the ACPI Timer without breaking anything.

> > The helper library to use the VDSO functions can be download at
> > http://http://seibold.net/vdso.c
> > The libary is very small, only 228 lines of code. Compile it with
> > gcc -Wall -O3 -fpic vdso.c -lrt -shared -o libvdso.so
> > and use it with LD_PRELOAD=<path>/libvdso.so
> >
> > This kind of helper must be integrated into glibc, for x86 64-bit and
> > PowerPC it is already there.
> >
> A few notes below...
> 
> 
> > diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
> > index 59c6c40..45ba688 100644
> > --- a/arch/x86/include/asm/compat.h
> > +++ b/arch/x86/include/asm/compat.h
> > @@ -295,6 +295,10 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
> >   
> >   static inline void __user *arch_compat_alloc_user_space(long len)
> >   {
> > +#ifdef CONFIG_X86_32
> > +	struct pt_regs *regs = task_pt_regs(current);
> > +	return (void __user *)regs->sp - len;
> > +#else
> >   	compat_uptr_t sp;
> >   
> >   	if (test_thread_flag(TIF_IA32)) {
> > @@ -305,6 +309,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
> >   	}
> >   
> >   	return (void __user *)round_down(sp - len, 16);
> > +#endif
> >   }
> 
> This style of in-line ifdefs are ugly and hard to read.
> 
> So instead of doing:
> void myfunction (void)
> {
> #ifdef 32bits
>      32_bit_implementation();
> #else
>      64_bit_implementation();
> #endif
> }
> 
> Where possible, please do:
> #ifdef 32bits
> void myfunction1(void)
> {
>      32_bit implementation();
> }
> void myfunction2(void)
> {
> ....
> #else /*64 bit versions */
> void myfunction1(void)
> {
>      64_bit implementation();
> }
> void myfunction2(void)
> ....
> #endif
> 
> > diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> > index 4df6c37..2dc6b72 100644
> > --- a/arch/x86/vdso/vclock_gettime.c
> > +++ b/arch/x86/vdso/vclock_gettime.c
> > @@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
> >   
> >   static notrace cycle_t vread_hpet(void)
> >   {
> > +#ifdef CONFIG_X86_64
> >   	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
> > +#else
> > +	return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
> > +#endif
> >   }
> >   
> >   notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
> >   {
> >   	long ret;
> > +#ifdef CONFIG_X86_64
> >   	asm("syscall" : "=a" (ret) :
> >   	    "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
> > +#else
> > +	asm("int $0x80" : "=a" (ret) :
> > +	    "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
> > +#endif
> >   	return ret;
> >   }
> Same point here.
> 
> 

Will be fixed.

> > diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
> > new file mode 100644
> > index 0000000..c9a1909
> > --- /dev/null
> > +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
> > @@ -0,0 +1,7 @@
> > +/*
> > + * since vgtod layout differs between X86_64 and x86_32, it is not possible to
> > + * provide a 32 bit vclock with a 64 bit kernel
> > + */
> > +#ifdef CONFIG_X86_32
> > +#include "../vclock_gettime.c"
> > +#endif
> Could you expand a bit as to why a compat layer isn't possible? It seems 
> we could easily convert the vsyscall_gtod_data to a more explicit 
> arch-neutral size. Or is it the actual data page mapping?
> 

This could be done in a subsequent patch. First i want not change to
much, to make the review easier. Converting the vsyscall_gtod_data to
arch neutral is not so easy, because the size of time_t, timezone and
timespec differs. So currently 32 bit programs running under a 64 bit
kernel will not get a VDSO with time functions. But real 32 bit kernel
will provide it.



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 20:54   ` Stefani Seibold
@ 2012-12-11 21:18     ` Andy Lutomirski
  2012-12-11 21:28       ` Stefani Seibold
  0 siblings, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2012-12-11 21:18 UTC (permalink / raw)
  To: Stefani Seibold; +Cc: John Stultz, linux-kernel, ak, aarcange

On Tue, Dec 11, 2012 at 12:54 PM, Stefani Seibold <stefani@seibold.net> wrote:
> Am Dienstag, den 11.12.2012, 11:27 -0800 schrieb John Stultz:
>> On 12/11/2012 08:11 AM, stefani@seibold.net wrote:
>> > From: Stefani Seibold <stefani@seibold.net>
>> >
>> > This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
>> > and vdso_time() support to the VDSO for x86 32-bit kernels.
>> >
>> > The reason to do this was to get a fast reliable time stamp. Many developers
>> > uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
>> > time functions a fast and reliable way, because the kernel knows the best time
>> > source and the P- and C-state of the CPU.
>> Very cool. There have been similar implementations of this patch over
>> the years, but they were all bit more hackish then this.
>>
> Thanks, i think it is not more hackish than the x86 64 code.
>
>>
>> > For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
>> > the ACPI timer should be easily to add with an other patch.
>> Although the ACPI PM timer requires port-io which would need tweaking to
>> allow normal users to access it. And I'm not sure if the performance
>> would be much improved, as the port-io probably dominates the
>> performance cost.
>>
>
> It was only an idea. I think it would be easy to give the ioperm read
> for the ACPI Timer without breaking anything.

I think you'll have to be careful not to make context switches slower
if you do this.  IIRC ioperm bitmaps live in the task structure (the
i386 thing, not task_struct) and require extra slow instructions.

I still think we should do TSC only.

>> Could you expand a bit as to why a compat layer isn't possible? It seems
>> we could easily convert the vsyscall_gtod_data to a more explicit
>> arch-neutral size. Or is it the actual data page mapping?
>>
>
> This could be done in a subsequent patch. First i want not change to
> much, to make the review easier. Converting the vsyscall_gtod_data to
> arch neutral is not so easy, because the size of time_t, timezone and
> timespec differs. So currently 32 bit programs running under a 64 bit
> kernel will not get a VDSO with time functions. But real 32 bit kernel
> will provide it.
>
>

How would you address the vvar page in 32-bit userspace on a 64-bit
kernel?  Would you map it low for 32-bit tasks or play descriptor
games?

--Andy

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 21:18     ` Andy Lutomirski
@ 2012-12-11 21:28       ` Stefani Seibold
  0 siblings, 0 replies; 27+ messages in thread
From: Stefani Seibold @ 2012-12-11 21:28 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: John Stultz, linux-kernel, ak, aarcange

Am Dienstag, den 11.12.2012, 13:18 -0800 schrieb Andy Lutomirski:
> On Tue, Dec 11, 2012 at 12:54 PM, Stefani Seibold <stefani@seibold.net> wrote:
> > Am Dienstag, den 11.12.2012, 11:27 -0800 schrieb John Stultz:
> >> On 12/11/2012 08:11 AM, stefani@seibold.net wrote:
> >> > From: Stefani Seibold <stefani@seibold.net>
> >> >
> >> > This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
> >> > and vdso_time() support to the VDSO for x86 32-bit kernels.
> >> >
> >> > The reason to do this was to get a fast reliable time stamp. Many developers
> >> > uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
> >> > time functions a fast and reliable way, because the kernel knows the best time
> >> > source and the P- and C-state of the CPU.
> >> Very cool. There have been similar implementations of this patch over
> >> the years, but they were all bit more hackish then this.
> >>
> > Thanks, i think it is not more hackish than the x86 64 code.
> >
> >>
> >> > For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
> >> > the ACPI timer should be easily to add with an other patch.
> >> Although the ACPI PM timer requires port-io which would need tweaking to
> >> allow normal users to access it. And I'm not sure if the performance
> >> would be much improved, as the port-io probably dominates the
> >> performance cost.
> >>
> >
> > It was only an idea. I think it would be easy to give the ioperm read
> > for the ACPI Timer without breaking anything.
> 
> I think you'll have to be careful not to make context switches slower
> if you do this.  IIRC ioperm bitmaps live in the task structure (the
> i386 thing, not task_struct) and require extra slow instructions.
> 

Good point. I will measure the overhead. But for now, it was only an
idea.

> I still think we should do TSC only.
> 

HPET is still necessary, since many system have no stable TSC, because
of the C1E (enhanced halt). And with the VDSO the kernel overhead for
system call will go away, this is better than nothing.

> >> Could you expand a bit as to why a compat layer isn't possible? It seems
> >> we could easily convert the vsyscall_gtod_data to a more explicit
> >> arch-neutral size. Or is it the actual data page mapping?
> >>
> >
> > This could be done in a subsequent patch. First i want not change to
> > much, to make the review easier. Converting the vsyscall_gtod_data to
> > arch neutral is not so easy, because the size of time_t, timezone and
> > timespec differs. So currently 32 bit programs running under a 64 bit
> > kernel will not get a VDSO with time functions. But real 32 bit kernel
> > will provide it.
> >
> >
> 
> How would you address the vvar page in 32-bit userspace on a 64-bit
> kernel?  Would you map it low for 32-bit tasks or play descriptor
> games?
> 

As i mentioned, this is currently not implemented. On a 64 bit kernel
there is no VDSO time functions for 32 bit programs available. This
sugar will be come maybe with an other patch if it is possible to map he
vvar page to the low 32-bit address space. But for now a 32 bit programs
on 32 kernel will get the advantage.

- Stefani



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 20:40   ` Stefani Seibold
@ 2012-12-12  1:29     ` Andy Lutomirski
  0 siblings, 0 replies; 27+ messages in thread
From: Andy Lutomirski @ 2012-12-12  1:29 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: linux-kernel, ak, aarcange, john.stultz, Jeremy Fitzhardinge

[cc: Jeremy Fitzhardinge -- you wrote some of this]

On Tue, Dec 11, 2012 at 12:40 PM, Stefani Seibold <stefani@seibold.net> wrote:
> Am Dienstag, den 11.12.2012, 11:37 -0800 schrieb Andy Lutomirski:
>> On Tue, Dec 11, 2012 at 8:11 AM,  <stefani@seibold.net> wrote:
>> > --- a/arch/x86/vdso/vclock_gettime.c
>> > +++ b/arch/x86/vdso/vclock_gettime.c
>> > @@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
>> >
>> >  static notrace cycle_t vread_hpet(void)
>> >  {
>> > +#ifdef CONFIG_X86_64
>> >         return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
>> > +#else
>> > +       return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
>> > +#endif
>> >  }
>>
>> Is 0xf0 not equal to HPET_COUNTER?
>>
>
> Yes, but HPET_COUNTER is more readable.

Sorry -- read it backwards.  Can you change the 64-bit one as well?

>
>> >
>> >  notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
>> >  {
>> >         long ret;
>> > +#ifdef CONFIG_X86_64
>> >         asm("syscall" : "=a" (ret) :
>> >             "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
>> > +#else
>> > +       asm("int $0x80" : "=a" (ret) :
>> > +           "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
>> > +#endif
>> >         return ret;
>> >  }
>>
>> __kernel_vsyscall is probably much faster if you can figure out how to
>> call it from here :)
>>
>
> Yes i know. Thats one of my problems, because i cannot call
> __kernel_vsyscall directly due the relocation. Any idea?

What actually goes wrong?  I think that if calling __kernel_vsyscall
like a normal symbol doesn't work then it's a bug in the relocation.

What's the point of this relocation?  It appears thoroughly useless in
the !compat case.

--Andy

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  2:18                           ` H. Peter Anvin
@ 2012-12-14  2:20                             ` Andy Lutomirski
  0 siblings, 0 replies; 27+ messages in thread
From: Andy Lutomirski @ 2012-12-14  2:20 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: criu, Stefani Seibold, linux-kernel, x86, tglx, mingo, ak,
	aarcange, john.stultz

On Thu, Dec 13, 2012 at 6:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> Wouldn't the vdso get mapped already and could be mremap()'d.  If we really need more control I'd almost push for a device/filesystem node that could be mmapped the usual way.

Hmm.  That may work, but it'll still break ABI.  I'm not sure that
criu is stable enough yet that we should care.  Criu people?

(In brief summary: how annoying would it be if the vdso was no longer
just a bunch of constant bytes that lived somewhere?)

--Andy

>
> Andy Lutomirski <luto@amacapital.net> wrote:
>
>>On Thu, Dec 13, 2012 at 5:49 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> On 12/13/2012 05:42 PM, Andy Lutomirski wrote:
>>>>
>>>> The 64-bit/x32 case is currently very simple and fast because it
>>uses
>>>> absolute addressing.  Admittedly, pcrel references are free, so
>>>> changing this wouldn't cost much.  For native, it'll be slower, but
>>>> maybe no one cares.  I seem to care about this more than anyone
>>else,
>>>> and I don't use 32 bit code. :)
>>>>
>>>
>>> pcrel is actually cheaper than absolute addressing in 64-bit mode.
>>>
>>>> The benefit of switching is that the vdso code could be the same in
>>>> all three cases.  (Actually, it's even better than that.  All of the
>>>> VVAR magic could be the same in the vdso and the kernel -- the
>>kernel
>>>> linker script would just have to have an appropriate symbol to see
>>the
>>>> appropriate mapping.)
>>>>
>>>>
>>>> This:
>>>>
>>>> __attribute__((visibility("hidden"))) int foo;
>>>>
>>>> int get_foo(void)
>>>> {
>>>>   return foo;
>>>> }
>>>>
>>>> generates a rip-relative access on 64 bits and GOTOFF on 32 bits.
>>>>
>>>> The only reason I didn't use a real symbol in the first place is
>>>> because I couldn't figure out how to get gcc to emit an absolute
>>>> relocation in pic code.
>>>
>>> Well, then, we wouldn't need to do that... this is starting to sound
>>> like a significant win.
>>
>>How will this avoid breaking checkpoint/restore in userspace?  If the
>>vdso is not just plain old code, criu presumably needs to know about
>>it.  Should there be an arch_prctl(ARCH_MAP_VDSO, addr) to create a
>>vdso mapping somewhere?
>>
>>--Andy
>
> --
> Sent from my mobile phone. Please excuse brevity and lack of formatting.



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  2:11                         ` Andy Lutomirski
@ 2012-12-14  2:18                           ` H. Peter Anvin
  2012-12-14  2:20                             ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: H. Peter Anvin @ 2012-12-14  2:18 UTC (permalink / raw)
  To: Andy Lutomirski, criu
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

Wouldn't the vdso get mapped already and could be mremap()'d.  If we really need more control I'd almost push for a device/filesystem node that could be mmapped the usual way.

Andy Lutomirski <luto@amacapital.net> wrote:

>On Thu, Dec 13, 2012 at 5:49 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 12/13/2012 05:42 PM, Andy Lutomirski wrote:
>>>
>>> The 64-bit/x32 case is currently very simple and fast because it
>uses
>>> absolute addressing.  Admittedly, pcrel references are free, so
>>> changing this wouldn't cost much.  For native, it'll be slower, but
>>> maybe no one cares.  I seem to care about this more than anyone
>else,
>>> and I don't use 32 bit code. :)
>>>
>>
>> pcrel is actually cheaper than absolute addressing in 64-bit mode.
>>
>>> The benefit of switching is that the vdso code could be the same in
>>> all three cases.  (Actually, it's even better than that.  All of the
>>> VVAR magic could be the same in the vdso and the kernel -- the
>kernel
>>> linker script would just have to have an appropriate symbol to see
>the
>>> appropriate mapping.)
>>>
>>>
>>> This:
>>>
>>> __attribute__((visibility("hidden"))) int foo;
>>>
>>> int get_foo(void)
>>> {
>>>   return foo;
>>> }
>>>
>>> generates a rip-relative access on 64 bits and GOTOFF on 32 bits.
>>>
>>> The only reason I didn't use a real symbol in the first place is
>>> because I couldn't figure out how to get gcc to emit an absolute
>>> relocation in pic code.
>>
>> Well, then, we wouldn't need to do that... this is starting to sound
>> like a significant win.
>
>How will this avoid breaking checkpoint/restore in userspace?  If the
>vdso is not just plain old code, criu presumably needs to know about
>it.  Should there be an arch_prctl(ARCH_MAP_VDSO, addr) to create a
>vdso mapping somewhere?
>
>--Andy

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  1:49                       ` H. Peter Anvin
@ 2012-12-14  2:11                         ` Andy Lutomirski
  2012-12-14  2:18                           ` H. Peter Anvin
  0 siblings, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2012-12-14  2:11 UTC (permalink / raw)
  To: H. Peter Anvin, criu
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On Thu, Dec 13, 2012 at 5:49 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 12/13/2012 05:42 PM, Andy Lutomirski wrote:
>>
>> The 64-bit/x32 case is currently very simple and fast because it uses
>> absolute addressing.  Admittedly, pcrel references are free, so
>> changing this wouldn't cost much.  For native, it'll be slower, but
>> maybe no one cares.  I seem to care about this more than anyone else,
>> and I don't use 32 bit code. :)
>>
>
> pcrel is actually cheaper than absolute addressing in 64-bit mode.
>
>> The benefit of switching is that the vdso code could be the same in
>> all three cases.  (Actually, it's even better than that.  All of the
>> VVAR magic could be the same in the vdso and the kernel -- the kernel
>> linker script would just have to have an appropriate symbol to see the
>> appropriate mapping.)
>>
>>
>> This:
>>
>> __attribute__((visibility("hidden"))) int foo;
>>
>> int get_foo(void)
>> {
>>   return foo;
>> }
>>
>> generates a rip-relative access on 64 bits and GOTOFF on 32 bits.
>>
>> The only reason I didn't use a real symbol in the first place is
>> because I couldn't figure out how to get gcc to emit an absolute
>> relocation in pic code.
>
> Well, then, we wouldn't need to do that... this is starting to sound
> like a significant win.

How will this avoid breaking checkpoint/restore in userspace?  If the
vdso is not just plain old code, criu presumably needs to know about
it.  Should there be an arch_prctl(ARCH_MAP_VDSO, addr) to create a
vdso mapping somewhere?

--Andy

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  1:42                     ` Andy Lutomirski
@ 2012-12-14  1:49                       ` H. Peter Anvin
  2012-12-14  2:11                         ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: H. Peter Anvin @ 2012-12-14  1:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On 12/13/2012 05:42 PM, Andy Lutomirski wrote:
> 
> The 64-bit/x32 case is currently very simple and fast because it uses
> absolute addressing.  Admittedly, pcrel references are free, so
> changing this wouldn't cost much.  For native, it'll be slower, but
> maybe no one cares.  I seem to care about this more than anyone else,
> and I don't use 32 bit code. :)
> 

pcrel is actually cheaper than absolute addressing in 64-bit mode.

> The benefit of switching is that the vdso code could be the same in
> all three cases.  (Actually, it's even better than that.  All of the
> VVAR magic could be the same in the vdso and the kernel -- the kernel
> linker script would just have to have an appropriate symbol to see the
> appropriate mapping.)
> 
> 
> This:
> 
> __attribute__((visibility("hidden"))) int foo;
> 
> int get_foo(void)
> {
>   return foo;
> }
> 
> generates a rip-relative access on 64 bits and GOTOFF on 32 bits.
> 
> The only reason I didn't use a real symbol in the first place is
> because I couldn't figure out how to get gcc to emit an absolute
> relocation in pic code.

Well, then, we wouldn't need to do that... this is starting to sound
like a significant win.

	-hpa


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  1:32                   ` H. Peter Anvin
@ 2012-12-14  1:42                     ` Andy Lutomirski
  2012-12-14  1:49                       ` H. Peter Anvin
  0 siblings, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2012-12-14  1:42 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On Thu, Dec 13, 2012 at 5:32 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 12/13/2012 04:20 PM, Andy Lutomirski wrote:
>>
>>
>> What you could do is probably arrange (using some linker script magic)
>> for a symbol to exist that points at the page *before* the vdso
>> starts.  Then just map the vvar page there when starting a compat
>> task.  You could then address it using a normal symbol reference by
>> tweaking the vvar macro.  (I think this'll access it via the GOT.)
>> Alternatively, you could just pick an absolute address -- the page is
>> NX, so you don't really need to worry about randomization.
>>
>
> The best would probably if we could generate GOTOFF references rather than
> GOT, which again probably means making the vvar page part of the vdso
> proper.  Then, when building the list of vdso pages, we need to substitute
> in the vvar page in the proper place.
>
> I have to admit to kind of thinking this might work well even for the
> 64-bit/x32 case, and perhaps even for native 32 bits.

The 64-bit/x32 case is currently very simple and fast because it uses
absolute addressing.  Admittedly, pcrel references are free, so
changing this wouldn't cost much.  For native, it'll be slower, but
maybe no one cares.  I seem to care about this more than anyone else,
and I don't use 32 bit code. :)

The benefit of switching is that the vdso code could be the same in
all three cases.  (Actually, it's even better than that.  All of the
VVAR magic could be the same in the vdso and the kernel -- the kernel
linker script would just have to have an appropriate symbol to see the
appropriate mapping.)


This:

__attribute__((visibility("hidden"))) int foo;

int get_foo(void)
{
  return foo;
}

generates a rip-relative access on 64 bits and GOTOFF on 32 bits.

The only reason I didn't use a real symbol in the first place is
because I couldn't figure out how to get gcc to emit an absolute
relocation in pic code.

--Andy

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  0:20                 ` Andy Lutomirski
  2012-12-14  0:36                   ` H. Peter Anvin
@ 2012-12-14  1:32                   ` H. Peter Anvin
  2012-12-14  1:42                     ` Andy Lutomirski
  1 sibling, 1 reply; 27+ messages in thread
From: H. Peter Anvin @ 2012-12-14  1:32 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On 12/13/2012 04:20 PM, Andy Lutomirski wrote:
>
> What you could do is probably arrange (using some linker script magic)
> for a symbol to exist that points at the page *before* the vdso
> starts.  Then just map the vvar page there when starting a compat
> task.  You could then address it using a normal symbol reference by
> tweaking the vvar macro.  (I think this'll access it via the GOT.)
> Alternatively, you could just pick an absolute address -- the page is
> NX, so you don't really need to worry about randomization.
>

The best would probably if we could generate GOTOFF references rather 
than GOT, which again probably means making the vvar page part of the 
vdso proper.  Then, when building the list of vdso pages, we need to 
substitute in the vvar page in the proper place.

I have to admit to kind of thinking this might work well even for the 
64-bit/x32 case, and perhaps even for native 32 bits.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  0:20                 ` Andy Lutomirski
@ 2012-12-14  0:36                   ` H. Peter Anvin
  2012-12-14  1:32                   ` H. Peter Anvin
  1 sibling, 0 replies; 27+ messages in thread
From: H. Peter Anvin @ 2012-12-14  0:36 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On 12/13/2012 04:20 PM, Andy Lutomirski wrote:
>>
>> Whatever data you need you can just map it into the vdso range.  There
>> really shouldn't be anything special about that at all.
>>
>> The fixmap stuff is an x86-64 legacy that you don't have to worry about,
>> obviously.
>
> The fixmap page is new.  It's not ABI -- the layout can freely change,
> but the vdso needs to be able to see.  It would make no sense to cow
> that page, and it would be more complicated to make it be part of the
> (64-bit) vdso, so I left it as a fixmap when I did the vvar cleanups.
>

Well, the vsyscall fixmap is an ABI.  But just because a page is mapped 
into userspace doesn't mean cow.  Think of it as a device mmap, or an 
mmap of a shared file.

> For compat mode, though, I don't think it can be in the fixmap unless
> there are segmentation games that I think are impossible, or unless
> the vdso wants to do a far call (which I would guess is not much
> faster than a system call, although I haven't benchmarked it).  This
> is because there are no addresses at all that can be seen from 32-bit
> code segments and that are in the kernel address range.

Yes, you'd have to nip into 64-bit mode which is not really practical.

> What you could do is probably arrange (using some linker script magic)
> for a symbol to exist that points at the page *before* the vdso
> starts.  Then just map the vvar page there when starting a compat
> task.  You could then address it using a normal symbol reference by
> tweaking the vvar macro.  (I think this'll access it via the GOT.)
> Alternatively, you could just pick an absolute address -- the page is
> NX, so you don't really need to worry about randomization.

IMO it seems this is making it way more complicated than it is.  Just 
make sure you have a section in the vdso where you can map in a data 
page with the symbols in the right offsets.  Extra points for doing 
magic so that it is at the beginning or end, but I think that might be 
harder than necessary.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  0:09               ` H. Peter Anvin
@ 2012-12-14  0:20                 ` Andy Lutomirski
  2012-12-14  0:36                   ` H. Peter Anvin
  2012-12-14  1:32                   ` H. Peter Anvin
  0 siblings, 2 replies; 27+ messages in thread
From: Andy Lutomirski @ 2012-12-14  0:20 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On Thu, Dec 13, 2012 at 4:09 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 12/13/2012 11:32 AM, Andy Lutomirski wrote:
>>
>>
>> x32's vdso cheats -- x32 code can see high addresses just fine.  The
>> toolchain just makes it difficult.
>>
>> Your best bet is probably to just map the vvar page twice -- once at
>> the same address as native 32-bit mode (but only for compat tasks)
>> would use and once in the usual fixmap location.  You can't use the
>> fixmap for the compat mapping, though, since it would be a *user*
>> address.
>>
>> For HPET support, you'd have to have special support.  I'd say to skip
>> it for compat mode.
>>
>
> Whatever data you need you can just map it into the vdso range.  There
> really shouldn't be anything special about that at all.
>
> The fixmap stuff is an x86-64 legacy that you don't have to worry about,
> obviously.

The fixmap page is new.  It's not ABI -- the layout can freely change,
but the vdso needs to be able to see.  It would make no sense to cow
that page, and it would be more complicated to make it be part of the
(64-bit) vdso, so I left it as a fixmap when I did the vvar cleanups.

For compat mode, though, I don't think it can be in the fixmap unless
there are segmentation games that I think are impossible, or unless
the vdso wants to do a far call (which I would guess is not much
faster than a system call, although I haven't benchmarked it).  This
is because there are no addresses at all that can be seen from 32-bit
code segments and that are in the kernel address range.

What you could do is probably arrange (using some linker script magic)
for a symbol to exist that points at the page *before* the vdso
starts.  Then just map the vvar page there when starting a compat
task.  You could then address it using a normal symbol reference by
tweaking the vvar macro.  (I think this'll access it via the GOT.)
Alternatively, you could just pick an absolute address -- the page is
NX, so you don't really need to worry about randomization.

--Andy

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13 19:32             ` Andy Lutomirski
@ 2012-12-14  0:09               ` H. Peter Anvin
  2012-12-14  0:20                 ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: H. Peter Anvin @ 2012-12-14  0:09 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On 12/13/2012 11:32 AM, Andy Lutomirski wrote:
>
> x32's vdso cheats -- x32 code can see high addresses just fine.  The
> toolchain just makes it difficult.
>
> Your best bet is probably to just map the vvar page twice -- once at
> the same address as native 32-bit mode (but only for compat tasks)
> would use and once in the usual fixmap location.  You can't use the
> fixmap for the compat mapping, though, since it would be a *user*
> address.
>
> For HPET support, you'd have to have special support.  I'd say to skip
> it for compat mode.
>

Whatever data you need you can just map it into the vdso range.  There 
really shouldn't be anything special about that at all.

The fixmap stuff is an x86-64 legacy that you don't have to worry about, 
obviously.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  7:17           ` Stefani Seibold
@ 2012-12-13 19:32             ` Andy Lutomirski
  2012-12-14  0:09               ` H. Peter Anvin
  0 siblings, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2012-12-13 19:32 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: H. Peter Anvin, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On Wed, Dec 12, 2012 at 11:17 PM, Stefani Seibold <stefani@seibold.net> wrote:
> Am Mittwoch, den 12.12.2012, 22:47 -0800 schrieb H. Peter Anvin:
>> Should be a simple matter of sharing pages.  Look perhaps at the x32 vdso for a hint.
>>
>
>
>> >
>> >Any idea or clean solution how i can map the 64 bit vgtod into the 32
>> >bit address space? Thats the only problem i see.
>> >
>
> No, i see no special handling for x32 vdso to do this. I am not sure if
> x32 vdso can access the 64 bit address space of vsyscall_gtod_data. I
> can't test this due the lack of a x32 abi system.
>
>

x32's vdso cheats -- x32 code can see high addresses just fine.  The
toolchain just makes it difficult.

Your best bet is probably to just map the vvar page twice -- once at
the same address as native 32-bit mode (but only for compat tasks)
would use and once in the usual fixmap location.  You can't use the
fixmap for the compat mapping, though, since it would be a *user*
address.

For HPET support, you'd have to have special support.  I'd say to skip
it for compat mode.

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  6:47         ` H. Peter Anvin
@ 2012-12-13  7:17           ` Stefani Seibold
  2012-12-13 19:32             ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: Stefani Seibold @ 2012-12-13  7:17 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

Am Mittwoch, den 12.12.2012, 22:47 -0800 schrieb H. Peter Anvin:
> Should be a simple matter of sharing pages.  Look perhaps at the x32 vdso for a hint.
> 


> >
> >Any idea or clean solution how i can map the 64 bit vgtod into the 32
> >bit address space? Thats the only problem i see.
> >

No, i see no special handling for x32 vdso to do this. I am not sure if
x32 vdso can access the 64 bit address space of vsyscall_gtod_data. I
can't test this due the lack of a x32 abi system.



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  6:17       ` Stefani Seibold
@ 2012-12-13  6:47         ` H. Peter Anvin
  2012-12-13  7:17           ` Stefani Seibold
  0 siblings, 1 reply; 27+ messages in thread
From: H. Peter Anvin @ 2012-12-13  6:47 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

Should be a simple matter of sharing pages.  Look perhaps at the x32 vdso for a hint.

Stefani Seibold <stefani@seibold.net> wrote:

>Am Mittwoch, den 12.12.2012, 22:14 -0800 schrieb H. Peter Anvin:
>> This is too late for 3.8 anyway, so there is time to make it work
>correctly before tge 3.9 merge window anyway.  After this merge window
>is over I may pull tjis into a testing branch, but compat support is a
>precondition.
>> 
>> The vdso is only optional if you build in backwards compatibility
>anyway, and software has a right to expect a specific numeric kernel
>version to export a single ABI.
>> 
>
>Any idea or clean solution how i can map the 64 bit vgtod into the 32
>bit address space? Thats the only problem i see.
>
>- Stefani

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  6:14     ` H. Peter Anvin
@ 2012-12-13  6:17       ` Stefani Seibold
  2012-12-13  6:47         ` H. Peter Anvin
  0 siblings, 1 reply; 27+ messages in thread
From: Stefani Seibold @ 2012-12-13  6:17 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

Am Mittwoch, den 12.12.2012, 22:14 -0800 schrieb H. Peter Anvin:
> This is too late for 3.8 anyway, so there is time to make it work correctly before tge 3.9 merge window anyway.  After this merge window is over I may pull tjis into a testing branch, but compat support is a precondition.
> 
> The vdso is only optional if you build in backwards compatibility anyway, and software has a right to expect a specific numeric kernel version to export a single ABI.
> 

Any idea or clean solution how i can map the 64 bit vgtod into the 32
bit address space? Thats the only problem i see.

- Stefani



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  5:53   ` Stefani Seibold
  2012-12-13  6:10     ` H. Peter Anvin
@ 2012-12-13  6:14     ` H. Peter Anvin
  2012-12-13  6:17       ` Stefani Seibold
  1 sibling, 1 reply; 27+ messages in thread
From: H. Peter Anvin @ 2012-12-13  6:14 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

This is too late for 3.8 anyway, so there is time to make it work correctly before tge 3.9 merge window anyway.  After this merge window is over I may pull tjis into a testing branch, but compat support is a precondition.

The vdso is only optional if you build in backwards compatibility anyway, and software has a right to expect a specific numeric kernel version to export a single ABI.

Stefani Seibold <stefani@seibold.net> wrote:

>Am Mittwoch, den 12.12.2012, 15:34 -0800 schrieb H. Peter Anvin:
>> On 12/12/2012 12:19 PM, stefani@seibold.net wrote:
>> > diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c
>b/arch/x86/vdso/vdso32/vclock_gettime.c
>> > new file mode 100644
>> > index 0000000..c9a1909
>> > --- /dev/null
>> > +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
>> > @@ -0,0 +1,7 @@
>> > +/*
>> > + * since vgtod layout differs between X86_64 and x86_32, it is not
>possible to
>> > + * provide a 32 bit vclock with a 64 bit kernel
>> > + */
>> > +#ifdef CONFIG_X86_32
>> > +#include "../vclock_gettime.c"
>> > +#endif
>> 
>> This is where this goes fail.  Sorry, it is not acceptable to
>introduce 
>> an ABI on x86-32 without providing it also on x86-64 in compatibility
>mode.
>> 
>> I also don't believe it is not possible... it might require some more
>
>> cleverness; perhaps we need to do the 32-bit vgtod in such a way that
>it 
>> *is* compatible with 64 bits.
>> 
>
>The comment is ambiguous:
>
>Since vgtod layout differs between X86_64 and x86_32 AND the vgtod is
>not inside the accessible address space of a 32 bit program, it is
>CURRENTLY not possible to provide a 32 bit vclock with a 64 bit kernel
>
>As i understand VDSO it is an alternativ way, so if there is no support
>for it, there must be a fall back to the system call interface in the
>program or lib, which tries to use it.
>
>So there is no drawback for 32 bit programs running on a 64 bit kernel.
>
>
>I think this feature is not so important and can implemented in a
>subsequent patch, because a 64 bit kernel system mostly runs 64 bit
>programs. Let us fix this things step by step.
>
>- Stefani

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  5:53   ` Stefani Seibold
@ 2012-12-13  6:10     ` H. Peter Anvin
  2012-12-13  6:14     ` H. Peter Anvin
  1 sibling, 0 replies; 27+ messages in thread
From: H. Peter Anvin @ 2012-12-13  6:10 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

No, let's not.  Why?  Because if we do that we may inadvertently create an ABI which is hard to support across the board.

Stefani Seibold <stefani@seibold.net> wrote:

>Am Mittwoch, den 12.12.2012, 15:34 -0800 schrieb H. Peter Anvin:
>> On 12/12/2012 12:19 PM, stefani@seibold.net wrote:
>> > diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c
>b/arch/x86/vdso/vdso32/vclock_gettime.c
>> > new file mode 100644
>> > index 0000000..c9a1909
>> > --- /dev/null
>> > +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
>> > @@ -0,0 +1,7 @@
>> > +/*
>> > + * since vgtod layout differs between X86_64 and x86_32, it is not
>possible to
>> > + * provide a 32 bit vclock with a 64 bit kernel
>> > + */
>> > +#ifdef CONFIG_X86_32
>> > +#include "../vclock_gettime.c"
>> > +#endif
>> 
>> This is where this goes fail.  Sorry, it is not acceptable to
>introduce 
>> an ABI on x86-32 without providing it also on x86-64 in compatibility
>mode.
>> 
>> I also don't believe it is not possible... it might require some more
>
>> cleverness; perhaps we need to do the 32-bit vgtod in such a way that
>it 
>> *is* compatible with 64 bits.
>> 
>
>The comment is ambiguous:
>
>Since vgtod layout differs between X86_64 and x86_32 AND the vgtod is
>not inside the accessible address space of a 32 bit program, it is
>CURRENTLY not possible to provide a 32 bit vclock with a 64 bit kernel
>
>As i understand VDSO it is an alternativ way, so if there is no support
>for it, there must be a fall back to the system call interface in the
>program or lib, which tries to use it.
>
>So there is no drawback for 32 bit programs running on a 64 bit kernel.
>
>
>I think this feature is not so important and can implemented in a
>subsequent patch, because a 64 bit kernel system mostly runs 64 bit
>programs. Let us fix this things step by step.
>
>- Stefani

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-12 23:34 ` H. Peter Anvin
@ 2012-12-13  5:53   ` Stefani Seibold
  2012-12-13  6:10     ` H. Peter Anvin
  2012-12-13  6:14     ` H. Peter Anvin
  0 siblings, 2 replies; 27+ messages in thread
From: Stefani Seibold @ 2012-12-13  5:53 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

Am Mittwoch, den 12.12.2012, 15:34 -0800 schrieb H. Peter Anvin:
> On 12/12/2012 12:19 PM, stefani@seibold.net wrote:
> > diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
> > new file mode 100644
> > index 0000000..c9a1909
> > --- /dev/null
> > +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
> > @@ -0,0 +1,7 @@
> > +/*
> > + * since vgtod layout differs between X86_64 and x86_32, it is not possible to
> > + * provide a 32 bit vclock with a 64 bit kernel
> > + */
> > +#ifdef CONFIG_X86_32
> > +#include "../vclock_gettime.c"
> > +#endif
> 
> This is where this goes fail.  Sorry, it is not acceptable to introduce 
> an ABI on x86-32 without providing it also on x86-64 in compatibility mode.
> 
> I also don't believe it is not possible... it might require some more 
> cleverness; perhaps we need to do the 32-bit vgtod in such a way that it 
> *is* compatible with 64 bits.
> 

The comment is ambiguous:

Since vgtod layout differs between X86_64 and x86_32 AND the vgtod is
not inside the accessible address space of a 32 bit program, it is
CURRENTLY not possible to provide a 32 bit vclock with a 64 bit kernel

As i understand VDSO it is an alternativ way, so if there is no support
for it, there must be a fall back to the system call interface in the
program or lib, which tries to use it.

So there is no drawback for 32 bit programs running on a 64 bit kernel. 

I think this feature is not so important and can implemented in a
subsequent patch, because a 64 bit kernel system mostly runs 64 bit
programs. Let us fix this things step by step.

- Stefani



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-12 20:19 stefani
@ 2012-12-12 23:34 ` H. Peter Anvin
  2012-12-13  5:53   ` Stefani Seibold
  0 siblings, 1 reply; 27+ messages in thread
From: H. Peter Anvin @ 2012-12-12 23:34 UTC (permalink / raw)
  To: stefani; +Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

On 12/12/2012 12:19 PM, stefani@seibold.net wrote:
> diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
> new file mode 100644
> index 0000000..c9a1909
> --- /dev/null
> +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
> @@ -0,0 +1,7 @@
> +/*
> + * since vgtod layout differs between X86_64 and x86_32, it is not possible to
> + * provide a 32 bit vclock with a 64 bit kernel
> + */
> +#ifdef CONFIG_X86_32
> +#include "../vclock_gettime.c"
> +#endif

This is where this goes fail.  Sorry, it is not acceptable to introduce 
an ABI on x86-32 without providing it also on x86-64 in compatibility mode.

I also don't believe it is not possible... it might require some more 
cleverness; perhaps we need to do the 32-bit vgtod in such a way that it 
*is* compatible with 64 bits.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH] Add VDSO time function support for x86 32-bit kernel
@ 2012-12-12 20:19 stefani
  2012-12-12 23:34 ` H. Peter Anvin
  0 siblings, 1 reply; 27+ messages in thread
From: stefani @ 2012-12-12 20:19 UTC (permalink / raw)
  To: linux-kernel, x86, tglx, mingo, hpa, ak, aarcange, john.stultz, luto
  Cc: stefani

From: Stefani Seibold <stefani@seibold.net>

This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
and vdso_time() support to the VDSO for x86 32-bit kernels.

The reason to do this was to get a fast reliable time stamp. Many developers
uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
time functions a fast and reliable way, because the kernel knows the
best time source and the P- and C-state of the CPU.

The helper library to use the VDSO functions can be download at
http://http://seibold.net/vdso.c
The libary is very small, only 228 lines of code. Compile it with
gcc -Wall -O3 -fpic vdso.c -lrt -shared -o libvdso.so
and use it with LD_PRELOAD=<path>/libvdso.so

This kind of helper must be integrated into glibc, for x86 64-bit and
PowerPC it is already there.

Some benchmark results (all measurements are in nano seconds):

Intel(R) Celeron(TM) CPU 400MHz

Average time kernel call:
 gettimeofday(): 1039
 clock_gettime(): 1578
 time(): 526
Average time VDSO call:
 gettimeofday(): 378
 clock_gettime(): 303
 time(): 60

Celeron(R) Dual-Core CPU T3100 1.90GHz

 Average time kernel call:
  gettimeofday(): 209
  clock_gettime(): 406
  time(): 135
 Average time VDSO call:
  gettimeofday(): 51
  clock_gettime(): 43
  time(): 10

So you can see a performance increase between 4 and 13, depending on the
CPU and the function.

The patch is against kernel 3.7. Please apply if you like it.

Changelog:
25.11.2012 - first release and proof of concept for linux 3.4
11.12.2012 - Port to linux 3.7 and code cleanup
12.12.2012 - fixes suggested by Andy Lutomirski
           - fixes suggested by John Stultz
           - use call VDSO32_vsyscall instead of int 80
           - code cleanup

Signed-off-by: Stefani Seibold <stefani@seibold.net>
---
 arch/x86/Kconfig                      |  4 +-
 arch/x86/include/asm/clocksource.h    |  4 --
 arch/x86/include/asm/fixmap.h         |  3 +-
 arch/x86/include/asm/vgtod.h          |  1 +
 arch/x86/include/asm/vvar.h           |  7 +++
 arch/x86/kernel/Makefile              |  1 +
 arch/x86/kernel/hpet.c                |  9 ++--
 arch/x86/kernel/setup.c               |  2 +
 arch/x86/kernel/tsc.c                 |  2 -
 arch/x86/kernel/vmlinux.lds.S         |  4 --
 arch/x86/kernel/vsyscall_64.c         | 49 ------------------
 arch/x86/kernel/vsyscall_gtod.c       | 93 +++++++++++++++++++++++++++++++++++
 arch/x86/vdso/Makefile                |  1 +
 arch/x86/vdso/vclock_gettime.c        | 25 +++++++++-
 arch/x86/vdso/vdso32/vclock_gettime.c |  7 +++
 arch/x86/vdso/vdso32/vdso32.lds.S     |  5 ++
 16 files changed, 151 insertions(+), 66 deletions(-)
 create mode 100644 arch/x86/kernel/vsyscall_gtod.c
 create mode 100644 arch/x86/vdso/vdso32/vclock_gettime.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff..b8c2c74 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -100,9 +100,9 @@ config X86
 	select GENERIC_CMOS_UPDATE
 	select CLOCKSOURCE_WATCHDOG
 	select GENERIC_CLOCKEVENTS
-	select ARCH_CLOCKSOURCE_DATA if X86_64
+	select ARCH_CLOCKSOURCE_DATA
 	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
-	select GENERIC_TIME_VSYSCALL if X86_64
+	select GENERIC_TIME_VSYSCALL
 	select KTIME_SCALAR if X86_32
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3..67d68b9 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -3,8 +3,6 @@
 #ifndef _ASM_X86_CLOCKSOURCE_H
 #define _ASM_X86_CLOCKSOURCE_H
 
-#ifdef CONFIG_X86_64
-
 #define VCLOCK_NONE 0  /* No vDSO clock available.	*/
 #define VCLOCK_TSC  1  /* vDSO should use vread_tsc.	*/
 #define VCLOCK_HPET 2  /* vDSO should use vread_hpet.	*/
@@ -13,6 +11,4 @@ struct arch_clocksource_data {
 	int vclock_mode;
 };
 
-#endif /* CONFIG_X86_64 */
-
 #endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c..b26e9e0 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,9 +78,10 @@ enum fixed_addresses {
 	VSYSCALL_LAST_PAGE,
 	VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
 			    + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+#endif
 	VVAR_PAGE,
 	VSYSCALL_HPET,
-#endif
+
 	FIX_DBGP_BASE,
 	FIX_EARLYCON_MEM_BASE,
 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 46e24d3..eb87b53 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -27,4 +27,5 @@ struct vsyscall_gtod_data {
 };
 extern struct vsyscall_gtod_data vsyscall_gtod_data;
 
+extern void map_vgtod(void);
 #endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index de656ac..6f71098 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -17,7 +17,11 @@
  */
 
 /* Base address of vvars.  This is not ABI. */
+#ifdef CONFIG_X86_64
 #define VVAR_ADDRESS (-10*1024*1024 - 4096)
+#else
+#define VVAR_ADDRESS 0xffffd000
+#endif
 
 #if defined(__VVAR_KERNEL_LDS)
 
@@ -46,5 +50,8 @@
 DECLARE_VVAR(0, volatile unsigned long, jiffies)
 DECLARE_VVAR(16, int, vgetcpu_mode)
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+#ifdef CONFIG_X86_32
+DECLARE_VVAR(512, const void __iomem *, vsyscall_hpet)
+#endif
 
 #undef DECLARE_VVAR
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 91ce48f..298a0b1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,6 +26,7 @@ obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-y			+= syscall_$(BITS).o
+obj-y			+= vsyscall_gtod.o
 obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 1460a5d..38887ca 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -69,13 +69,18 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
 
 #ifdef CONFIG_X86_64
 #include <asm/pgtable.h>
+#else
+#include <asm/vvar.h>
+
+DEFINE_VVAR(const void __iomem *, vsyscall_hpet);
 #endif
 
 static inline void hpet_set_mapping(void)
 {
 	hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-#ifdef CONFIG_X86_64
 	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
+#ifdef CONFIG_X86_32
+	vsyscall_hpet = (const void __iomem *)fix_to_virt(VSYSCALL_HPET);
 #endif
 }
 
@@ -752,9 +757,7 @@ static struct clocksource clocksource_hpet = {
 	.mask		= HPET_MASK,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 	.resume		= hpet_resume_counter,
-#ifdef CONFIG_X86_64
 	.archdata	= { .vclock_mode = VCLOCK_HPET },
-#endif
 };
 
 static int hpet_clocksource_register(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ca45696..c2f6bbb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -114,6 +114,7 @@
 #include <asm/mce.h>
 #include <asm/alternative.h>
 #include <asm/prom.h>
+#include <asm/vgtod.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -997,6 +998,7 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_X86_64
 	map_vsyscall();
 #endif
+	map_vgtod();
 
 	generic_apic_probe();
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cfa5d4f..078cc9a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -772,9 +772,7 @@ static struct clocksource clocksource_tsc = {
 	.mask                   = CLOCKSOURCE_MASK(64),
 	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_MUST_VERIFY,
-#ifdef CONFIG_X86_64
 	.archdata               = { .vclock_mode = VCLOCK_TSC },
-#endif
 };
 
 void mark_tsc_unstable(char *reason)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 22a1530..31a0cdd 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -151,8 +151,6 @@ SECTIONS
 		_edata = .;
 	} :data
 
-#ifdef CONFIG_X86_64
-
 	. = ALIGN(PAGE_SIZE);
 	__vvar_page = .;
 
@@ -173,8 +171,6 @@ SECTIONS
 
        . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
 
-#endif /* CONFIG_X86_64 */
-
 	/* Init code and data - will be freed after init */
 	. = ALIGN(PAGE_SIZE);
 	.init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3a3e8c9..dfc9727 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -54,7 +54,6 @@
 #include "vsyscall_trace.h"
 
 DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
 
 static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
 
@@ -77,48 +76,6 @@ static int __init vsyscall_setup(char *str)
 }
 early_param("vsyscall", vsyscall_setup);
 
-void update_vsyscall_tz(void)
-{
-	vsyscall_gtod_data.sys_tz = sys_tz;
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
-	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
-
-	write_seqcount_begin(&vdata->seq);
-
-	/* copy vsyscall data */
-	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
-	vdata->clock.cycle_last		= tk->clock->cycle_last;
-	vdata->clock.mask		= tk->clock->mask;
-	vdata->clock.mult		= tk->mult;
-	vdata->clock.shift		= tk->shift;
-
-	vdata->wall_time_sec		= tk->xtime_sec;
-	vdata->wall_time_snsec		= tk->xtime_nsec;
-
-	vdata->monotonic_time_sec	= tk->xtime_sec
-					+ tk->wall_to_monotonic.tv_sec;
-	vdata->monotonic_time_snsec	= tk->xtime_nsec
-					+ (tk->wall_to_monotonic.tv_nsec
-						<< tk->shift);
-	while (vdata->monotonic_time_snsec >=
-					(((u64)NSEC_PER_SEC) << tk->shift)) {
-		vdata->monotonic_time_snsec -=
-					((u64)NSEC_PER_SEC) << tk->shift;
-		vdata->monotonic_time_sec++;
-	}
-
-	vdata->wall_time_coarse.tv_sec	= tk->xtime_sec;
-	vdata->wall_time_coarse.tv_nsec	= (long)(tk->xtime_nsec >> tk->shift);
-
-	vdata->monotonic_time_coarse	= timespec_add(vdata->wall_time_coarse,
-							tk->wall_to_monotonic);
-
-	write_seqcount_end(&vdata->seq);
-}
-
 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 			      const char *message)
 {
@@ -366,8 +323,6 @@ void __init map_vsyscall(void)
 {
 	extern char __vsyscall_page;
 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
-	extern char __vvar_page;
-	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
 
 	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
 		     vsyscall_mode == NATIVE
@@ -375,10 +330,6 @@ void __init map_vsyscall(void)
 		     : PAGE_KERNEL_VVAR);
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
 		     (unsigned long)VSYSCALL_START);
-
-	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
-	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
-		     (unsigned long)VVAR_ADDRESS);
 }
 
 static int __init vsyscall_init(void)
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
new file mode 100644
index 0000000..9b96488
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -0,0 +1,93 @@
+/*
+ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ *  Modified for x86 32 bit architecture by
+ *  Stefani Seibold <stefani@seibold.net>
+ *
+ *  Thanks to hpa@transmeta.com for some useful hint.
+ *  Special thanks to Ingo Molnar for his early experience with
+ *  a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ */
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/seqlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysctl.h>
+#include <linux/topology.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/ratelimit.h>
+
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/unistd.h>
+#include <asm/fixmap.h>
+#include <asm/errno.h>
+#include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
+#include <asm/vgtod.h>
+#include <asm/traps.h>
+
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+
+void update_vsyscall_tz(void)
+{
+	vsyscall_gtod_data.sys_tz = sys_tz;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+
+	write_seqcount_begin(&vdata->seq);
+
+	/* copy vsyscall data */
+	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
+	vdata->clock.cycle_last		= tk->clock->cycle_last;
+	vdata->clock.mask		= tk->clock->mask;
+	vdata->clock.mult		= tk->mult;
+	vdata->clock.shift		= tk->shift;
+
+	vdata->wall_time_sec		= tk->xtime_sec;
+	vdata->wall_time_snsec		= tk->xtime_nsec;
+
+	vdata->monotonic_time_sec	= tk->xtime_sec
+					+ tk->wall_to_monotonic.tv_sec;
+	vdata->monotonic_time_snsec	= tk->xtime_nsec
+					+ (tk->wall_to_monotonic.tv_nsec
+						<< tk->shift);
+	while (vdata->monotonic_time_snsec >=
+					(((u64)NSEC_PER_SEC) << tk->shift)) {
+		vdata->monotonic_time_snsec -=
+					((u64)NSEC_PER_SEC) << tk->shift;
+		vdata->monotonic_time_sec++;
+	}
+
+	vdata->wall_time_coarse.tv_sec	= tk->xtime_sec;
+	vdata->wall_time_coarse.tv_nsec	= (long)(tk->xtime_nsec >> tk->shift);
+
+	vdata->monotonic_time_coarse	= timespec_add(vdata->wall_time_coarse,
+							tk->wall_to_monotonic);
+
+	write_seqcount_end(&vdata->seq);
+}
+
+void __init map_vgtod(void)
+{
+	extern char __vvar_page;
+	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
+
+	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+		     (unsigned long)VVAR_ADDRESS);
+#endif
+}
+
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index fd14be1..959221b 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -147,6 +147,7 @@ $(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
 
 $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
 				 $(obj)/vdso32/vdso32.lds \
+				 $(obj)/vdso32/vclock_gettime.o \
 				 $(obj)/vdso32/note.o \
 				 $(obj)/vdso32/%.o
 	$(call if_changed,vdso)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c37..3490e1c 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -57,6 +57,7 @@ notrace static cycle_t vread_tsc(void)
 	return last;
 }
 
+#ifdef CONFIG_X86_64
 static notrace cycle_t vread_hpet(void)
 {
 	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
@@ -78,11 +79,33 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 	    "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
 	return ret;
 }
+#else
+static notrace cycle_t vread_hpet(void)
+{
+	return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
+}
 
+notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+{
+	long ret;
+	asm("call VDSO32_vsyscall" : "=a" (ret) :
+	    "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
+	return ret;
+}
+
+notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
+{
+	long ret;
+
+	asm("call VDSO32_vsyscall" : "=a" (ret) :
+	    "a" (__NR_gettimeofday), "b" (tv), "c" (tz) : "memory");
+	return ret;
+}
+#endif
 
 notrace static inline u64 vgetsns(void)
 {
-	long v;
+	u64 v;
 	cycles_t cycles;
 	if (gtod->clock.vclock_mode == VCLOCK_TSC)
 		cycles = vread_tsc();
diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
new file mode 100644
index 0000000..c9a1909
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vclock_gettime.c
@@ -0,0 +1,7 @@
+/*
+ * since vgtod layout differs between X86_64 and x86_32, it is not possible to
+ * provide a 32 bit vclock with a 64 bit kernel
+ */
+#ifdef CONFIG_X86_32
+#include "../vclock_gettime.c"
+#endif
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S
index 976124b..197d50f 100644
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/vdso/vdso32/vdso32.lds.S
@@ -24,6 +24,11 @@ VERSION
 		__kernel_vsyscall;
 		__kernel_sigreturn;
 		__kernel_rt_sigreturn;
+#ifdef CONFIG_X86_32
+		__vdso_clock_gettime;
+		__vdso_gettimeofday;
+		__vdso_time;
+#endif
 	local: *;
 	};
 }
-- 
1.8.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2012-12-14  2:20 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-12-11 16:11 [PATCH] Add VDSO time function support for x86 32-bit kernel stefani
2012-12-11 19:27 ` John Stultz
2012-12-11 19:37   ` Andy Lutomirski
2012-12-11 20:54   ` Stefani Seibold
2012-12-11 21:18     ` Andy Lutomirski
2012-12-11 21:28       ` Stefani Seibold
2012-12-11 19:37 ` Andy Lutomirski
2012-12-11 20:40   ` Stefani Seibold
2012-12-12  1:29     ` Andy Lutomirski
2012-12-12 20:19 stefani
2012-12-12 23:34 ` H. Peter Anvin
2012-12-13  5:53   ` Stefani Seibold
2012-12-13  6:10     ` H. Peter Anvin
2012-12-13  6:14     ` H. Peter Anvin
2012-12-13  6:17       ` Stefani Seibold
2012-12-13  6:47         ` H. Peter Anvin
2012-12-13  7:17           ` Stefani Seibold
2012-12-13 19:32             ` Andy Lutomirski
2012-12-14  0:09               ` H. Peter Anvin
2012-12-14  0:20                 ` Andy Lutomirski
2012-12-14  0:36                   ` H. Peter Anvin
2012-12-14  1:32                   ` H. Peter Anvin
2012-12-14  1:42                     ` Andy Lutomirski
2012-12-14  1:49                       ` H. Peter Anvin
2012-12-14  2:11                         ` Andy Lutomirski
2012-12-14  2:18                           ` H. Peter Anvin
2012-12-14  2:20                             ` Andy Lutomirski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).