linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Add VDSO time function support for x86 32-bit kernel
@ 2012-12-12 20:19 stefani
  2012-12-12 23:34 ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: stefani @ 2012-12-12 20:19 UTC (permalink / raw)
  To: linux-kernel, x86, tglx, mingo, hpa, ak, aarcange, john.stultz, luto
  Cc: stefani

From: Stefani Seibold <stefani@seibold.net>

This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
and vdso_time() support to the VDSO for x86 32-bit kernels.

The reason to do this was to get a fast reliable time stamp. Many developers
uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
time functions a fast and reliable way, because the kernel knows the
best time source and the P- and C-state of the CPU.

The helper library to use the VDSO functions can be download at
http://http://seibold.net/vdso.c
The libary is very small, only 228 lines of code. Compile it with
gcc -Wall -O3 -fpic vdso.c -lrt -shared -o libvdso.so
and use it with LD_PRELOAD=<path>/libvdso.so

This kind of helper must be integrated into glibc, for x86 64-bit and
PowerPC it is already there.

Some benchmark results (all measurements are in nano seconds):

Intel(R) Celeron(TM) CPU 400MHz

Average time kernel call:
 gettimeofday(): 1039
 clock_gettime(): 1578
 time(): 526
Average time VDSO call:
 gettimeofday(): 378
 clock_gettime(): 303
 time(): 60

Celeron(R) Dual-Core CPU T3100 1.90GHz

 Average time kernel call:
  gettimeofday(): 209
  clock_gettime(): 406
  time(): 135
 Average time VDSO call:
  gettimeofday(): 51
  clock_gettime(): 43
  time(): 10

So you can see a performance increase between 4 and 13, depending on the
CPU and the function.

The patch is against kernel 3.7. Please apply if you like it.

Changelog:
25.11.2012 - first release and proof of concept for linux 3.4
11.12.2012 - Port to linux 3.7 and code cleanup
12.12.2012 - fixes suggested by Andy Lutomirski
           - fixes suggested by John Stultz
           - use call VDSO32_vsyscall instead of int 80
           - code cleanup

Signed-off-by: Stefani Seibold <stefani@seibold.net>
---
 arch/x86/Kconfig                      |  4 +-
 arch/x86/include/asm/clocksource.h    |  4 --
 arch/x86/include/asm/fixmap.h         |  3 +-
 arch/x86/include/asm/vgtod.h          |  1 +
 arch/x86/include/asm/vvar.h           |  7 +++
 arch/x86/kernel/Makefile              |  1 +
 arch/x86/kernel/hpet.c                |  9 ++--
 arch/x86/kernel/setup.c               |  2 +
 arch/x86/kernel/tsc.c                 |  2 -
 arch/x86/kernel/vmlinux.lds.S         |  4 --
 arch/x86/kernel/vsyscall_64.c         | 49 ------------------
 arch/x86/kernel/vsyscall_gtod.c       | 93 +++++++++++++++++++++++++++++++++++
 arch/x86/vdso/Makefile                |  1 +
 arch/x86/vdso/vclock_gettime.c        | 25 +++++++++-
 arch/x86/vdso/vdso32/vclock_gettime.c |  7 +++
 arch/x86/vdso/vdso32/vdso32.lds.S     |  5 ++
 16 files changed, 151 insertions(+), 66 deletions(-)
 create mode 100644 arch/x86/kernel/vsyscall_gtod.c
 create mode 100644 arch/x86/vdso/vdso32/vclock_gettime.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff..b8c2c74 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -100,9 +100,9 @@ config X86
 	select GENERIC_CMOS_UPDATE
 	select CLOCKSOURCE_WATCHDOG
 	select GENERIC_CLOCKEVENTS
-	select ARCH_CLOCKSOURCE_DATA if X86_64
+	select ARCH_CLOCKSOURCE_DATA
 	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
-	select GENERIC_TIME_VSYSCALL if X86_64
+	select GENERIC_TIME_VSYSCALL
 	select KTIME_SCALAR if X86_32
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3..67d68b9 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -3,8 +3,6 @@
 #ifndef _ASM_X86_CLOCKSOURCE_H
 #define _ASM_X86_CLOCKSOURCE_H
 
-#ifdef CONFIG_X86_64
-
 #define VCLOCK_NONE 0  /* No vDSO clock available.	*/
 #define VCLOCK_TSC  1  /* vDSO should use vread_tsc.	*/
 #define VCLOCK_HPET 2  /* vDSO should use vread_hpet.	*/
@@ -13,6 +11,4 @@ struct arch_clocksource_data {
 	int vclock_mode;
 };
 
-#endif /* CONFIG_X86_64 */
-
 #endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c..b26e9e0 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,9 +78,10 @@ enum fixed_addresses {
 	VSYSCALL_LAST_PAGE,
 	VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
 			    + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+#endif
 	VVAR_PAGE,
 	VSYSCALL_HPET,
-#endif
+
 	FIX_DBGP_BASE,
 	FIX_EARLYCON_MEM_BASE,
 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 46e24d3..eb87b53 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -27,4 +27,5 @@ struct vsyscall_gtod_data {
 };
 extern struct vsyscall_gtod_data vsyscall_gtod_data;
 
+extern void map_vgtod(void);
 #endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index de656ac..6f71098 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -17,7 +17,11 @@
  */
 
 /* Base address of vvars.  This is not ABI. */
+#ifdef CONFIG_X86_64
 #define VVAR_ADDRESS (-10*1024*1024 - 4096)
+#else
+#define VVAR_ADDRESS 0xffffd000
+#endif
 
 #if defined(__VVAR_KERNEL_LDS)
 
@@ -46,5 +50,8 @@
 DECLARE_VVAR(0, volatile unsigned long, jiffies)
 DECLARE_VVAR(16, int, vgetcpu_mode)
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+#ifdef CONFIG_X86_32
+DECLARE_VVAR(512, const void __iomem *, vsyscall_hpet)
+#endif
 
 #undef DECLARE_VVAR
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 91ce48f..298a0b1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,6 +26,7 @@ obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-y			+= syscall_$(BITS).o
+obj-y			+= vsyscall_gtod.o
 obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 1460a5d..38887ca 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -69,13 +69,18 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
 
 #ifdef CONFIG_X86_64
 #include <asm/pgtable.h>
+#else
+#include <asm/vvar.h>
+
+DEFINE_VVAR(const void __iomem *, vsyscall_hpet);
 #endif
 
 static inline void hpet_set_mapping(void)
 {
 	hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-#ifdef CONFIG_X86_64
 	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
+#ifdef CONFIG_X86_32
+	vsyscall_hpet = (const void __iomem *)fix_to_virt(VSYSCALL_HPET);
 #endif
 }
 
@@ -752,9 +757,7 @@ static struct clocksource clocksource_hpet = {
 	.mask		= HPET_MASK,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 	.resume		= hpet_resume_counter,
-#ifdef CONFIG_X86_64
 	.archdata	= { .vclock_mode = VCLOCK_HPET },
-#endif
 };
 
 static int hpet_clocksource_register(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ca45696..c2f6bbb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -114,6 +114,7 @@
 #include <asm/mce.h>
 #include <asm/alternative.h>
 #include <asm/prom.h>
+#include <asm/vgtod.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -997,6 +998,7 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_X86_64
 	map_vsyscall();
 #endif
+	map_vgtod();
 
 	generic_apic_probe();
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cfa5d4f..078cc9a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -772,9 +772,7 @@ static struct clocksource clocksource_tsc = {
 	.mask                   = CLOCKSOURCE_MASK(64),
 	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_MUST_VERIFY,
-#ifdef CONFIG_X86_64
 	.archdata               = { .vclock_mode = VCLOCK_TSC },
-#endif
 };
 
 void mark_tsc_unstable(char *reason)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 22a1530..31a0cdd 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -151,8 +151,6 @@ SECTIONS
 		_edata = .;
 	} :data
 
-#ifdef CONFIG_X86_64
-
 	. = ALIGN(PAGE_SIZE);
 	__vvar_page = .;
 
@@ -173,8 +171,6 @@ SECTIONS
 
        . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
 
-#endif /* CONFIG_X86_64 */
-
 	/* Init code and data - will be freed after init */
 	. = ALIGN(PAGE_SIZE);
 	.init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3a3e8c9..dfc9727 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -54,7 +54,6 @@
 #include "vsyscall_trace.h"
 
 DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
 
 static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
 
@@ -77,48 +76,6 @@ static int __init vsyscall_setup(char *str)
 }
 early_param("vsyscall", vsyscall_setup);
 
-void update_vsyscall_tz(void)
-{
-	vsyscall_gtod_data.sys_tz = sys_tz;
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
-	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
-
-	write_seqcount_begin(&vdata->seq);
-
-	/* copy vsyscall data */
-	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
-	vdata->clock.cycle_last		= tk->clock->cycle_last;
-	vdata->clock.mask		= tk->clock->mask;
-	vdata->clock.mult		= tk->mult;
-	vdata->clock.shift		= tk->shift;
-
-	vdata->wall_time_sec		= tk->xtime_sec;
-	vdata->wall_time_snsec		= tk->xtime_nsec;
-
-	vdata->monotonic_time_sec	= tk->xtime_sec
-					+ tk->wall_to_monotonic.tv_sec;
-	vdata->monotonic_time_snsec	= tk->xtime_nsec
-					+ (tk->wall_to_monotonic.tv_nsec
-						<< tk->shift);
-	while (vdata->monotonic_time_snsec >=
-					(((u64)NSEC_PER_SEC) << tk->shift)) {
-		vdata->monotonic_time_snsec -=
-					((u64)NSEC_PER_SEC) << tk->shift;
-		vdata->monotonic_time_sec++;
-	}
-
-	vdata->wall_time_coarse.tv_sec	= tk->xtime_sec;
-	vdata->wall_time_coarse.tv_nsec	= (long)(tk->xtime_nsec >> tk->shift);
-
-	vdata->monotonic_time_coarse	= timespec_add(vdata->wall_time_coarse,
-							tk->wall_to_monotonic);
-
-	write_seqcount_end(&vdata->seq);
-}
-
 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 			      const char *message)
 {
@@ -366,8 +323,6 @@ void __init map_vsyscall(void)
 {
 	extern char __vsyscall_page;
 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
-	extern char __vvar_page;
-	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
 
 	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
 		     vsyscall_mode == NATIVE
@@ -375,10 +330,6 @@ void __init map_vsyscall(void)
 		     : PAGE_KERNEL_VVAR);
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
 		     (unsigned long)VSYSCALL_START);
-
-	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
-	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
-		     (unsigned long)VVAR_ADDRESS);
 }
 
 static int __init vsyscall_init(void)
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
new file mode 100644
index 0000000..9b96488
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -0,0 +1,93 @@
+/*
+ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ *  Modified for x86 32 bit architecture by
+ *  Stefani Seibold <stefani@seibold.net>
+ *
+ *  Thanks to hpa@transmeta.com for some useful hint.
+ *  Special thanks to Ingo Molnar for his early experience with
+ *  a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ */
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/seqlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysctl.h>
+#include <linux/topology.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/ratelimit.h>
+
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/unistd.h>
+#include <asm/fixmap.h>
+#include <asm/errno.h>
+#include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
+#include <asm/vgtod.h>
+#include <asm/traps.h>
+
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+
+void update_vsyscall_tz(void)
+{
+	vsyscall_gtod_data.sys_tz = sys_tz;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+
+	write_seqcount_begin(&vdata->seq);
+
+	/* copy vsyscall data */
+	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
+	vdata->clock.cycle_last		= tk->clock->cycle_last;
+	vdata->clock.mask		= tk->clock->mask;
+	vdata->clock.mult		= tk->mult;
+	vdata->clock.shift		= tk->shift;
+
+	vdata->wall_time_sec		= tk->xtime_sec;
+	vdata->wall_time_snsec		= tk->xtime_nsec;
+
+	vdata->monotonic_time_sec	= tk->xtime_sec
+					+ tk->wall_to_monotonic.tv_sec;
+	vdata->monotonic_time_snsec	= tk->xtime_nsec
+					+ (tk->wall_to_monotonic.tv_nsec
+						<< tk->shift);
+	while (vdata->monotonic_time_snsec >=
+					(((u64)NSEC_PER_SEC) << tk->shift)) {
+		vdata->monotonic_time_snsec -=
+					((u64)NSEC_PER_SEC) << tk->shift;
+		vdata->monotonic_time_sec++;
+	}
+
+	vdata->wall_time_coarse.tv_sec	= tk->xtime_sec;
+	vdata->wall_time_coarse.tv_nsec	= (long)(tk->xtime_nsec >> tk->shift);
+
+	vdata->monotonic_time_coarse	= timespec_add(vdata->wall_time_coarse,
+							tk->wall_to_monotonic);
+
+	write_seqcount_end(&vdata->seq);
+}
+
+void __init map_vgtod(void)
+{
+	extern char __vvar_page;
+	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
+
+	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+		     (unsigned long)VVAR_ADDRESS);
+#endif
+}
+
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index fd14be1..959221b 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -147,6 +147,7 @@ $(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
 
 $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
 				 $(obj)/vdso32/vdso32.lds \
+				 $(obj)/vdso32/vclock_gettime.o \
 				 $(obj)/vdso32/note.o \
 				 $(obj)/vdso32/%.o
 	$(call if_changed,vdso)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c37..3490e1c 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -57,6 +57,7 @@ notrace static cycle_t vread_tsc(void)
 	return last;
 }
 
+#ifdef CONFIG_X86_64
 static notrace cycle_t vread_hpet(void)
 {
 	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
@@ -78,11 +79,33 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 	    "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
 	return ret;
 }
+#else
+static notrace cycle_t vread_hpet(void)
+{
+	return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
+}
 
+notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+{
+	long ret;
+	asm("call VDSO32_vsyscall" : "=a" (ret) :
+	    "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
+	return ret;
+}
+
+notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
+{
+	long ret;
+
+	asm("call VDSO32_vsyscall" : "=a" (ret) :
+	    "a" (__NR_gettimeofday), "b" (tv), "c" (tz) : "memory");
+	return ret;
+}
+#endif
 
 notrace static inline u64 vgetsns(void)
 {
-	long v;
+	u64 v;
 	cycles_t cycles;
 	if (gtod->clock.vclock_mode == VCLOCK_TSC)
 		cycles = vread_tsc();
diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
new file mode 100644
index 0000000..c9a1909
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vclock_gettime.c
@@ -0,0 +1,7 @@
+/*
+ * since vgtod layout differs between X86_64 and x86_32, it is not possible to
+ * provide a 32 bit vclock with a 64 bit kernel
+ */
+#ifdef CONFIG_X86_32
+#include "../vclock_gettime.c"
+#endif
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S
index 976124b..197d50f 100644
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/vdso/vdso32/vdso32.lds.S
@@ -24,6 +24,11 @@ VERSION
 		__kernel_vsyscall;
 		__kernel_sigreturn;
 		__kernel_rt_sigreturn;
+#ifdef CONFIG_X86_32
+		__vdso_clock_gettime;
+		__vdso_gettimeofday;
+		__vdso_time;
+#endif
 	local: *;
 	};
 }
-- 
1.8.0


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-12 20:19 [PATCH] Add VDSO time function support for x86 32-bit kernel stefani
@ 2012-12-12 23:34 ` H. Peter Anvin
  2012-12-13  5:53   ` Stefani Seibold
  0 siblings, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-12 23:34 UTC (permalink / raw)
  To: stefani; +Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

On 12/12/2012 12:19 PM, stefani@seibold.net wrote:
> diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
> new file mode 100644
> index 0000000..c9a1909
> --- /dev/null
> +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
> @@ -0,0 +1,7 @@
> +/*
> + * since vgtod layout differs between X86_64 and x86_32, it is not possible to
> + * provide a 32 bit vclock with a 64 bit kernel
> + */
> +#ifdef CONFIG_X86_32
> +#include "../vclock_gettime.c"
> +#endif

This is where this goes fail.  Sorry, it is not acceptable to introduce 
an ABI on x86-32 without providing it also on x86-64 in compatibility mode.

I also don't believe it is not possible... it might require some more 
cleverness; perhaps we need to do the 32-bit vgtod in such a way that it 
*is* compatible with 64 bits.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-12 23:34 ` H. Peter Anvin
@ 2012-12-13  5:53   ` Stefani Seibold
  2012-12-13  6:10     ` H. Peter Anvin
  2012-12-13  6:14     ` H. Peter Anvin
  0 siblings, 2 replies; 50+ messages in thread
From: Stefani Seibold @ 2012-12-13  5:53 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

Am Mittwoch, den 12.12.2012, 15:34 -0800 schrieb H. Peter Anvin:
> On 12/12/2012 12:19 PM, stefani@seibold.net wrote:
> > diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
> > new file mode 100644
> > index 0000000..c9a1909
> > --- /dev/null
> > +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
> > @@ -0,0 +1,7 @@
> > +/*
> > + * since vgtod layout differs between X86_64 and x86_32, it is not possible to
> > + * provide a 32 bit vclock with a 64 bit kernel
> > + */
> > +#ifdef CONFIG_X86_32
> > +#include "../vclock_gettime.c"
> > +#endif
> 
> This is where this goes fail.  Sorry, it is not acceptable to introduce 
> an ABI on x86-32 without providing it also on x86-64 in compatibility mode.
> 
> I also don't believe it is not possible... it might require some more 
> cleverness; perhaps we need to do the 32-bit vgtod in such a way that it 
> *is* compatible with 64 bits.
> 

The comment is ambiguous:

Since vgtod layout differs between X86_64 and x86_32 AND the vgtod is
not inside the accessible address space of a 32 bit program, it is
CURRENTLY not possible to provide a 32 bit vclock with a 64 bit kernel

As i understand VDSO it is an alternativ way, so if there is no support
for it, there must be a fall back to the system call interface in the
program or lib, which tries to use it.

So there is no drawback for 32 bit programs running on a 64 bit kernel. 

I think this feature is not so important and can implemented in a
subsequent patch, because a 64 bit kernel system mostly runs 64 bit
programs. Let us fix this things step by step.

- Stefani



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  5:53   ` Stefani Seibold
@ 2012-12-13  6:10     ` H. Peter Anvin
  2012-12-13  6:14     ` H. Peter Anvin
  1 sibling, 0 replies; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-13  6:10 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

No, let's not.  Why?  Because if we do that we may inadvertently create an ABI which is hard to support across the board.

Stefani Seibold <stefani@seibold.net> wrote:

>Am Mittwoch, den 12.12.2012, 15:34 -0800 schrieb H. Peter Anvin:
>> On 12/12/2012 12:19 PM, stefani@seibold.net wrote:
>> > diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c
>b/arch/x86/vdso/vdso32/vclock_gettime.c
>> > new file mode 100644
>> > index 0000000..c9a1909
>> > --- /dev/null
>> > +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
>> > @@ -0,0 +1,7 @@
>> > +/*
>> > + * since vgtod layout differs between X86_64 and x86_32, it is not
>possible to
>> > + * provide a 32 bit vclock with a 64 bit kernel
>> > + */
>> > +#ifdef CONFIG_X86_32
>> > +#include "../vclock_gettime.c"
>> > +#endif
>> 
>> This is where this goes fail.  Sorry, it is not acceptable to
>introduce 
>> an ABI on x86-32 without providing it also on x86-64 in compatibility
>mode.
>> 
>> I also don't believe it is not possible... it might require some more
>
>> cleverness; perhaps we need to do the 32-bit vgtod in such a way that
>it 
>> *is* compatible with 64 bits.
>> 
>
>The comment is ambiguous:
>
>Since vgtod layout differs between X86_64 and x86_32 AND the vgtod is
>not inside the accessible address space of a 32 bit program, it is
>CURRENTLY not possible to provide a 32 bit vclock with a 64 bit kernel
>
>As i understand VDSO it is an alternativ way, so if there is no support
>for it, there must be a fall back to the system call interface in the
>program or lib, which tries to use it.
>
>So there is no drawback for 32 bit programs running on a 64 bit kernel.
>
>
>I think this feature is not so important and can implemented in a
>subsequent patch, because a 64 bit kernel system mostly runs 64 bit
>programs. Let us fix this things step by step.
>
>- Stefani

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  5:53   ` Stefani Seibold
  2012-12-13  6:10     ` H. Peter Anvin
@ 2012-12-13  6:14     ` H. Peter Anvin
  2012-12-13  6:17       ` Stefani Seibold
  1 sibling, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-13  6:14 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

This is too late for 3.8 anyway, so there is time to make it work correctly before tge 3.9 merge window anyway.  After this merge window is over I may pull tjis into a testing branch, but compat support is a precondition.

The vdso is only optional if you build in backwards compatibility anyway, and software has a right to expect a specific numeric kernel version to export a single ABI.

Stefani Seibold <stefani@seibold.net> wrote:

>Am Mittwoch, den 12.12.2012, 15:34 -0800 schrieb H. Peter Anvin:
>> On 12/12/2012 12:19 PM, stefani@seibold.net wrote:
>> > diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c
>b/arch/x86/vdso/vdso32/vclock_gettime.c
>> > new file mode 100644
>> > index 0000000..c9a1909
>> > --- /dev/null
>> > +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
>> > @@ -0,0 +1,7 @@
>> > +/*
>> > + * since vgtod layout differs between X86_64 and x86_32, it is not
>possible to
>> > + * provide a 32 bit vclock with a 64 bit kernel
>> > + */
>> > +#ifdef CONFIG_X86_32
>> > +#include "../vclock_gettime.c"
>> > +#endif
>> 
>> This is where this goes fail.  Sorry, it is not acceptable to
>introduce 
>> an ABI on x86-32 without providing it also on x86-64 in compatibility
>mode.
>> 
>> I also don't believe it is not possible... it might require some more
>
>> cleverness; perhaps we need to do the 32-bit vgtod in such a way that
>it 
>> *is* compatible with 64 bits.
>> 
>
>The comment is ambiguous:
>
>Since vgtod layout differs between X86_64 and x86_32 AND the vgtod is
>not inside the accessible address space of a 32 bit program, it is
>CURRENTLY not possible to provide a 32 bit vclock with a 64 bit kernel
>
>As i understand VDSO it is an alternativ way, so if there is no support
>for it, there must be a fall back to the system call interface in the
>program or lib, which tries to use it.
>
>So there is no drawback for 32 bit programs running on a 64 bit kernel.
>
>
>I think this feature is not so important and can implemented in a
>subsequent patch, because a 64 bit kernel system mostly runs 64 bit
>programs. Let us fix this things step by step.
>
>- Stefani

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  6:14     ` H. Peter Anvin
@ 2012-12-13  6:17       ` Stefani Seibold
  2012-12-13  6:47         ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: Stefani Seibold @ 2012-12-13  6:17 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

Am Mittwoch, den 12.12.2012, 22:14 -0800 schrieb H. Peter Anvin:
> This is too late for 3.8 anyway, so there is time to make it work correctly before tge 3.9 merge window anyway.  After this merge window is over I may pull tjis into a testing branch, but compat support is a precondition.
> 
> The vdso is only optional if you build in backwards compatibility anyway, and software has a right to expect a specific numeric kernel version to export a single ABI.
> 

Any idea or clean solution how i can map the 64 bit vgtod into the 32
bit address space? Thats the only problem i see.

- Stefani



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  6:17       ` Stefani Seibold
@ 2012-12-13  6:47         ` H. Peter Anvin
  2012-12-13  7:17           ` Stefani Seibold
  0 siblings, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-13  6:47 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

Should be a simple matter of sharing pages.  Look perhaps at the x32 vdso for a hint.

Stefani Seibold <stefani@seibold.net> wrote:

>Am Mittwoch, den 12.12.2012, 22:14 -0800 schrieb H. Peter Anvin:
>> This is too late for 3.8 anyway, so there is time to make it work
>correctly before tge 3.9 merge window anyway.  After this merge window
>is over I may pull tjis into a testing branch, but compat support is a
>precondition.
>> 
>> The vdso is only optional if you build in backwards compatibility
>anyway, and software has a right to expect a specific numeric kernel
>version to export a single ABI.
>> 
>
>Any idea or clean solution how i can map the 64 bit vgtod into the 32
>bit address space? Thats the only problem i see.
>
>- Stefani

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  6:47         ` H. Peter Anvin
@ 2012-12-13  7:17           ` Stefani Seibold
  2012-12-13 19:32             ` Andy Lutomirski
  0 siblings, 1 reply; 50+ messages in thread
From: Stefani Seibold @ 2012-12-13  7:17 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: linux-kernel, x86, tglx, mingo, ak, aarcange, john.stultz, luto

Am Mittwoch, den 12.12.2012, 22:47 -0800 schrieb H. Peter Anvin:
> Should be a simple matter of sharing pages.  Look perhaps at the x32 vdso for a hint.
> 


> >
> >Any idea or clean solution how i can map the 64 bit vgtod into the 32
> >bit address space? Thats the only problem i see.
> >

No, i see no special handling for x32 vdso to do this. I am not sure if
x32 vdso can access the 64 bit address space of vsyscall_gtod_data. I
can't test this due the lack of a x32 abi system.



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13  7:17           ` Stefani Seibold
@ 2012-12-13 19:32             ` Andy Lutomirski
  2012-12-14  0:09               ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-13 19:32 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: H. Peter Anvin, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On Wed, Dec 12, 2012 at 11:17 PM, Stefani Seibold <stefani@seibold.net> wrote:
> Am Mittwoch, den 12.12.2012, 22:47 -0800 schrieb H. Peter Anvin:
>> Should be a simple matter of sharing pages.  Look perhaps at the x32 vdso for a hint.
>>
>
>
>> >
>> >Any idea or clean solution how i can map the 64 bit vgtod into the 32
>> >bit address space? Thats the only problem i see.
>> >
>
> No, i see no special handling for x32 vdso to do this. I am not sure if
> x32 vdso can access the 64 bit address space of vsyscall_gtod_data. I
> can't test this due the lack of a x32 abi system.
>
>

x32's vdso cheats -- x32 code can see high addresses just fine.  The
toolchain just makes it difficult.

Your best bet is probably to just map the vvar page twice -- once at
the same address as native 32-bit mode (but only for compat tasks)
would use and once in the usual fixmap location.  You can't use the
fixmap for the compat mapping, though, since it would be a *user*
address.

For HPET support, you'd have to have special support.  I'd say to skip
it for compat mode.

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-13 19:32             ` Andy Lutomirski
@ 2012-12-14  0:09               ` H. Peter Anvin
  2012-12-14  0:20                 ` Andy Lutomirski
  0 siblings, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14  0:09 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On 12/13/2012 11:32 AM, Andy Lutomirski wrote:
>
> x32's vdso cheats -- x32 code can see high addresses just fine.  The
> toolchain just makes it difficult.
>
> Your best bet is probably to just map the vvar page twice -- once at
> the same address as native 32-bit mode (but only for compat tasks)
> would use and once in the usual fixmap location.  You can't use the
> fixmap for the compat mapping, though, since it would be a *user*
> address.
>
> For HPET support, you'd have to have special support.  I'd say to skip
> it for compat mode.
>

Whatever data you need you can just map it into the vdso range.  There 
really shouldn't be anything special about that at all.

The fixmap stuff is an x86-64 legacy that you don't have to worry about, 
obviously.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  0:09               ` H. Peter Anvin
@ 2012-12-14  0:20                 ` Andy Lutomirski
  2012-12-14  0:36                   ` H. Peter Anvin
  2012-12-14  1:32                   ` H. Peter Anvin
  0 siblings, 2 replies; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-14  0:20 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On Thu, Dec 13, 2012 at 4:09 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 12/13/2012 11:32 AM, Andy Lutomirski wrote:
>>
>>
>> x32's vdso cheats -- x32 code can see high addresses just fine.  The
>> toolchain just makes it difficult.
>>
>> Your best bet is probably to just map the vvar page twice -- once at
>> the same address as native 32-bit mode (but only for compat tasks)
>> would use and once in the usual fixmap location.  You can't use the
>> fixmap for the compat mapping, though, since it would be a *user*
>> address.
>>
>> For HPET support, you'd have to have special support.  I'd say to skip
>> it for compat mode.
>>
>
> Whatever data you need you can just map it into the vdso range.  There
> really shouldn't be anything special about that at all.
>
> The fixmap stuff is an x86-64 legacy that you don't have to worry about,
> obviously.

The fixmap page is new.  It's not ABI -- the layout can freely change,
but the vdso needs to be able to see.  It would make no sense to cow
that page, and it would be more complicated to make it be part of the
(64-bit) vdso, so I left it as a fixmap when I did the vvar cleanups.

For compat mode, though, I don't think it can be in the fixmap unless
there are segmentation games that I think are impossible, or unless
the vdso wants to do a far call (which I would guess is not much
faster than a system call, although I haven't benchmarked it).  This
is because there are no addresses at all that can be seen from 32-bit
code segments and that are in the kernel address range.

What you could do is probably arrange (using some linker script magic)
for a symbol to exist that points at the page *before* the vdso
starts.  Then just map the vvar page there when starting a compat
task.  You could then address it using a normal symbol reference by
tweaking the vvar macro.  (I think this'll access it via the GOT.)
Alternatively, you could just pick an absolute address -- the page is
NX, so you don't really need to worry about randomization.

--Andy

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  0:20                 ` Andy Lutomirski
@ 2012-12-14  0:36                   ` H. Peter Anvin
  2012-12-14  1:32                   ` H. Peter Anvin
  1 sibling, 0 replies; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14  0:36 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On 12/13/2012 04:20 PM, Andy Lutomirski wrote:
>>
>> Whatever data you need you can just map it into the vdso range.  There
>> really shouldn't be anything special about that at all.
>>
>> The fixmap stuff is an x86-64 legacy that you don't have to worry about,
>> obviously.
>
> The fixmap page is new.  It's not ABI -- the layout can freely change,
> but the vdso needs to be able to see.  It would make no sense to cow
> that page, and it would be more complicated to make it be part of the
> (64-bit) vdso, so I left it as a fixmap when I did the vvar cleanups.
>

Well, the vsyscall fixmap is an ABI.  But just because a page is mapped 
into userspace doesn't mean cow.  Think of it as a device mmap, or an 
mmap of a shared file.

> For compat mode, though, I don't think it can be in the fixmap unless
> there are segmentation games that I think are impossible, or unless
> the vdso wants to do a far call (which I would guess is not much
> faster than a system call, although I haven't benchmarked it).  This
> is because there are no addresses at all that can be seen from 32-bit
> code segments and that are in the kernel address range.

Yes, you'd have to nip into 64-bit mode which is not really practical.

> What you could do is probably arrange (using some linker script magic)
> for a symbol to exist that points at the page *before* the vdso
> starts.  Then just map the vvar page there when starting a compat
> task.  You could then address it using a normal symbol reference by
> tweaking the vvar macro.  (I think this'll access it via the GOT.)
> Alternatively, you could just pick an absolute address -- the page is
> NX, so you don't really need to worry about randomization.

IMO it seems this is making it way more complicated than it is.  Just 
make sure you have a section in the vdso where you can map in a data 
page with the symbols in the right offsets.  Extra points for doing 
magic so that it is at the beginning or end, but I think that might be 
harder than necessary.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  0:20                 ` Andy Lutomirski
  2012-12-14  0:36                   ` H. Peter Anvin
@ 2012-12-14  1:32                   ` H. Peter Anvin
  2012-12-14  1:42                     ` Andy Lutomirski
  1 sibling, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14  1:32 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On 12/13/2012 04:20 PM, Andy Lutomirski wrote:
>
> What you could do is probably arrange (using some linker script magic)
> for a symbol to exist that points at the page *before* the vdso
> starts.  Then just map the vvar page there when starting a compat
> task.  You could then address it using a normal symbol reference by
> tweaking the vvar macro.  (I think this'll access it via the GOT.)
> Alternatively, you could just pick an absolute address -- the page is
> NX, so you don't really need to worry about randomization.
>

The best would probably if we could generate GOTOFF references rather 
than GOT, which again probably means making the vvar page part of the 
vdso proper.  Then, when building the list of vdso pages, we need to 
substitute in the vvar page in the proper place.

I have to admit to kind of thinking this might work well even for the 
64-bit/x32 case, and perhaps even for native 32 bits.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  1:32                   ` H. Peter Anvin
@ 2012-12-14  1:42                     ` Andy Lutomirski
  2012-12-14  1:49                       ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-14  1:42 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On Thu, Dec 13, 2012 at 5:32 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 12/13/2012 04:20 PM, Andy Lutomirski wrote:
>>
>>
>> What you could do is probably arrange (using some linker script magic)
>> for a symbol to exist that points at the page *before* the vdso
>> starts.  Then just map the vvar page there when starting a compat
>> task.  You could then address it using a normal symbol reference by
>> tweaking the vvar macro.  (I think this'll access it via the GOT.)
>> Alternatively, you could just pick an absolute address -- the page is
>> NX, so you don't really need to worry about randomization.
>>
>
> The best would probably if we could generate GOTOFF references rather than
> GOT, which again probably means making the vvar page part of the vdso
> proper.  Then, when building the list of vdso pages, we need to substitute
> in the vvar page in the proper place.
>
> I have to admit to kind of thinking this might work well even for the
> 64-bit/x32 case, and perhaps even for native 32 bits.

The 64-bit/x32 case is currently very simple and fast because it uses
absolute addressing.  Admittedly, pcrel references are free, so
changing this wouldn't cost much.  For native, it'll be slower, but
maybe no one cares.  I seem to care about this more than anyone else,
and I don't use 32 bit code. :)

The benefit of switching is that the vdso code could be the same in
all three cases.  (Actually, it's even better than that.  All of the
VVAR magic could be the same in the vdso and the kernel -- the kernel
linker script would just have to have an appropriate symbol to see the
appropriate mapping.)


This:

__attribute__((visibility("hidden"))) int foo;

int get_foo(void)
{
  return foo;
}

generates a rip-relative access on 64 bits and GOTOFF on 32 bits.

The only reason I didn't use a real symbol in the first place is
because I couldn't figure out how to get gcc to emit an absolute
relocation in pic code.

--Andy

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  1:42                     ` Andy Lutomirski
@ 2012-12-14  1:49                       ` H. Peter Anvin
  2012-12-14  2:11                         ` Andy Lutomirski
  0 siblings, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14  1:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On 12/13/2012 05:42 PM, Andy Lutomirski wrote:
> 
> The 64-bit/x32 case is currently very simple and fast because it uses
> absolute addressing.  Admittedly, pcrel references are free, so
> changing this wouldn't cost much.  For native, it'll be slower, but
> maybe no one cares.  I seem to care about this more than anyone else,
> and I don't use 32 bit code. :)
> 

pcrel is actually cheaper than absolute addressing in 64-bit mode.

> The benefit of switching is that the vdso code could be the same in
> all three cases.  (Actually, it's even better than that.  All of the
> VVAR magic could be the same in the vdso and the kernel -- the kernel
> linker script would just have to have an appropriate symbol to see the
> appropriate mapping.)
> 
> 
> This:
> 
> __attribute__((visibility("hidden"))) int foo;
> 
> int get_foo(void)
> {
>   return foo;
> }
> 
> generates a rip-relative access on 64 bits and GOTOFF on 32 bits.
> 
> The only reason I didn't use a real symbol in the first place is
> because I couldn't figure out how to get gcc to emit an absolute
> relocation in pic code.

Well, then, we wouldn't need to do that... this is starting to sound
like a significant win.

	-hpa


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  1:49                       ` H. Peter Anvin
@ 2012-12-14  2:11                         ` Andy Lutomirski
  2012-12-14  2:18                           ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-14  2:11 UTC (permalink / raw)
  To: H. Peter Anvin, criu
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

On Thu, Dec 13, 2012 at 5:49 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 12/13/2012 05:42 PM, Andy Lutomirski wrote:
>>
>> The 64-bit/x32 case is currently very simple and fast because it uses
>> absolute addressing.  Admittedly, pcrel references are free, so
>> changing this wouldn't cost much.  For native, it'll be slower, but
>> maybe no one cares.  I seem to care about this more than anyone else,
>> and I don't use 32 bit code. :)
>>
>
> pcrel is actually cheaper than absolute addressing in 64-bit mode.
>
>> The benefit of switching is that the vdso code could be the same in
>> all three cases.  (Actually, it's even better than that.  All of the
>> VVAR magic could be the same in the vdso and the kernel -- the kernel
>> linker script would just have to have an appropriate symbol to see the
>> appropriate mapping.)
>>
>>
>> This:
>>
>> __attribute__((visibility("hidden"))) int foo;
>>
>> int get_foo(void)
>> {
>>   return foo;
>> }
>>
>> generates a rip-relative access on 64 bits and GOTOFF on 32 bits.
>>
>> The only reason I didn't use a real symbol in the first place is
>> because I couldn't figure out how to get gcc to emit an absolute
>> relocation in pic code.
>
> Well, then, we wouldn't need to do that... this is starting to sound
> like a significant win.

How will this avoid breaking checkpoint/restore in userspace?  If the
vdso is not just plain old code, criu presumably needs to know about
it.  Should there be an arch_prctl(ARCH_MAP_VDSO, addr) to create a
vdso mapping somewhere?

--Andy

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  2:11                         ` Andy Lutomirski
@ 2012-12-14  2:18                           ` H. Peter Anvin
  2012-12-14  2:20                             ` Andy Lutomirski
  0 siblings, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14  2:18 UTC (permalink / raw)
  To: Andy Lutomirski, criu
  Cc: Stefani Seibold, linux-kernel, x86, tglx, mingo, ak, aarcange,
	john.stultz

Wouldn't the vdso get mapped already and could be mremap()'d.  If we really need more control I'd almost push for a device/filesystem node that could be mmapped the usual way.

Andy Lutomirski <luto@amacapital.net> wrote:

>On Thu, Dec 13, 2012 at 5:49 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 12/13/2012 05:42 PM, Andy Lutomirski wrote:
>>>
>>> The 64-bit/x32 case is currently very simple and fast because it
>uses
>>> absolute addressing.  Admittedly, pcrel references are free, so
>>> changing this wouldn't cost much.  For native, it'll be slower, but
>>> maybe no one cares.  I seem to care about this more than anyone
>else,
>>> and I don't use 32 bit code. :)
>>>
>>
>> pcrel is actually cheaper than absolute addressing in 64-bit mode.
>>
>>> The benefit of switching is that the vdso code could be the same in
>>> all three cases.  (Actually, it's even better than that.  All of the
>>> VVAR magic could be the same in the vdso and the kernel -- the
>kernel
>>> linker script would just have to have an appropriate symbol to see
>the
>>> appropriate mapping.)
>>>
>>>
>>> This:
>>>
>>> __attribute__((visibility("hidden"))) int foo;
>>>
>>> int get_foo(void)
>>> {
>>>   return foo;
>>> }
>>>
>>> generates a rip-relative access on 64 bits and GOTOFF on 32 bits.
>>>
>>> The only reason I didn't use a real symbol in the first place is
>>> because I couldn't figure out how to get gcc to emit an absolute
>>> relocation in pic code.
>>
>> Well, then, we wouldn't need to do that... this is starting to sound
>> like a significant win.
>
>How will this avoid breaking checkpoint/restore in userspace?  If the
>vdso is not just plain old code, criu presumably needs to know about
>it.  Should there be an arch_prctl(ARCH_MAP_VDSO, addr) to create a
>vdso mapping somewhere?
>
>--Andy

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  2:18                           ` H. Peter Anvin
@ 2012-12-14  2:20                             ` Andy Lutomirski
  2012-12-14  8:34                               ` [CRIU] " Pavel Emelyanov
  0 siblings, 1 reply; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-14  2:20 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: criu, Stefani Seibold, linux-kernel, x86, tglx, mingo, ak,
	aarcange, john.stultz

On Thu, Dec 13, 2012 at 6:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> Wouldn't the vdso get mapped already and could be mremap()'d.  If we really need more control I'd almost push for a device/filesystem node that could be mmapped the usual way.

Hmm.  That may work, but it'll still break ABI.  I'm not sure that
criu is stable enough yet that we should care.  Criu people?

(In brief summary: how annoying would it be if the vdso was no longer
just a bunch of constant bytes that lived somewhere?)

--Andy

>
> Andy Lutomirski <luto@amacapital.net> wrote:
>
>>On Thu, Dec 13, 2012 at 5:49 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> On 12/13/2012 05:42 PM, Andy Lutomirski wrote:
>>>>
>>>> The 64-bit/x32 case is currently very simple and fast because it
>>uses
>>>> absolute addressing.  Admittedly, pcrel references are free, so
>>>> changing this wouldn't cost much.  For native, it'll be slower, but
>>>> maybe no one cares.  I seem to care about this more than anyone
>>else,
>>>> and I don't use 32 bit code. :)
>>>>
>>>
>>> pcrel is actually cheaper than absolute addressing in 64-bit mode.
>>>
>>>> The benefit of switching is that the vdso code could be the same in
>>>> all three cases.  (Actually, it's even better than that.  All of the
>>>> VVAR magic could be the same in the vdso and the kernel -- the
>>kernel
>>>> linker script would just have to have an appropriate symbol to see
>>the
>>>> appropriate mapping.)
>>>>
>>>>
>>>> This:
>>>>
>>>> __attribute__((visibility("hidden"))) int foo;
>>>>
>>>> int get_foo(void)
>>>> {
>>>>   return foo;
>>>> }
>>>>
>>>> generates a rip-relative access on 64 bits and GOTOFF on 32 bits.
>>>>
>>>> The only reason I didn't use a real symbol in the first place is
>>>> because I couldn't figure out how to get gcc to emit an absolute
>>>> relocation in pic code.
>>>
>>> Well, then, we wouldn't need to do that... this is starting to sound
>>> like a significant win.
>>
>>How will this avoid breaking checkpoint/restore in userspace?  If the
>>vdso is not just plain old code, criu presumably needs to know about
>>it.  Should there be an arch_prctl(ARCH_MAP_VDSO, addr) to create a
>>vdso mapping somewhere?
>>
>>--Andy
>
> --
> Sent from my mobile phone. Please excuse brevity and lack of formatting.



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  2:20                             ` Andy Lutomirski
@ 2012-12-14  8:34                               ` Pavel Emelyanov
  2012-12-14 18:35                                 ` H. Peter Anvin
  2012-12-14 22:46                                 ` H. Peter Anvin
  0 siblings, 2 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2012-12-14  8:34 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: H. Peter Anvin, aarcange, ak, Stefani Seibold, x86, linux-kernel,
	criu, mingo, john.stultz, tglx

On 12/14/2012 06:20 AM, Andy Lutomirski wrote:
> On Thu, Dec 13, 2012 at 6:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>> Wouldn't the vdso get mapped already and could be mremap()'d.  If we
> really need more control I'd almost push for a device/filesystem node
> that could be mmapped the usual way.
> 
> Hmm.  That may work, but it'll still break ABI.  I'm not sure that
> criu is stable enough yet that we should care.  Criu people?

It's not yet, but we'd still appreciate the criu-friendly vdso redesign.

> (In brief summary: how annoying would it be if the vdso was no longer
> just a bunch of constant bytes that lived somewhere?)

It depends on what vdso is going to be. In the perfect case it should
a) be mremap-able to any address (or be at fixed address _forever_, but
   I assume this is not feasible);
b) have entry points at fixed (or somehow movable) places.

I admit that I didn't understand your question properly, if I did,
please correct me.

Thanks,
Pavel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  8:34                               ` [CRIU] " Pavel Emelyanov
@ 2012-12-14 18:35                                 ` H. Peter Anvin
  2012-12-14 18:44                                   ` Andy Lutomirski
  2012-12-14 22:46                                 ` H. Peter Anvin
  1 sibling, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14 18:35 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Andy Lutomirski, aarcange, ak, Stefani Seibold, x86,
	linux-kernel, criu, mingo, john.stultz, tglx

On 12/14/2012 12:34 AM, Pavel Emelyanov wrote:
> On 12/14/2012 06:20 AM, Andy Lutomirski wrote:
>> On Thu, Dec 13, 2012 at 6:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> Wouldn't the vdso get mapped already and could be mremap()'d.  If we
>> really need more control I'd almost push for a device/filesystem node
>> that could be mmapped the usual way.
>>
>> Hmm.  That may work, but it'll still break ABI.  I'm not sure that
>> criu is stable enough yet that we should care.  Criu people?
> 
> It's not yet, but we'd still appreciate the criu-friendly vdso redesign.
> 
>> (In brief summary: how annoying would it be if the vdso was no longer
>> just a bunch of constant bytes that lived somewhere?)
> 
> It depends on what vdso is going to be. In the perfect case it should
> a) be mremap-able to any address (or be at fixed address _forever_, but
>    I assume this is not feasible);
> b) have entry points at fixed (or somehow movable) places.
> 
> I admit that I didn't understand your question properly, if I did,
> please correct me.
> 

mremap() should work.  At the same time, the code itself is not going to
have any stability guarantees between kernel versions -- it obviously
cannot.

Incidentally, the MAYWRITE bit which is there to allow breakpoints is
obviously problematic for the vvar page.  We could mark the vvar page
differently, meaning more vmas, or we could decide it just doesn't
matter and that if you mprotect() the vvar page and write to it you get
exactly what you asked for...

	-hpa



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 18:35                                 ` H. Peter Anvin
@ 2012-12-14 18:44                                   ` Andy Lutomirski
  2012-12-14 18:47                                     ` H. Peter Anvin
  2012-12-17  9:05                                     ` Pavel Emelyanov
  0 siblings, 2 replies; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-14 18:44 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Pavel Emelyanov, aarcange, ak, Stefani Seibold, x86,
	linux-kernel, criu, mingo, john.stultz, tglx

On Fri, Dec 14, 2012 at 10:35 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 12/14/2012 12:34 AM, Pavel Emelyanov wrote:
>> On 12/14/2012 06:20 AM, Andy Lutomirski wrote:
>>> On Thu, Dec 13, 2012 at 6:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>>> Wouldn't the vdso get mapped already and could be mremap()'d.  If we
>>> really need more control I'd almost push for a device/filesystem node
>>> that could be mmapped the usual way.
>>>
>>> Hmm.  That may work, but it'll still break ABI.  I'm not sure that
>>> criu is stable enough yet that we should care.  Criu people?
>>
>> It's not yet, but we'd still appreciate the criu-friendly vdso redesign.
>>
>>> (In brief summary: how annoying would it be if the vdso was no longer
>>> just a bunch of constant bytes that lived somewhere?)
>>
>> It depends on what vdso is going to be. In the perfect case it should
>> a) be mremap-able to any address (or be at fixed address _forever_, but
>>    I assume this is not feasible);
>> b) have entry points at fixed (or somehow movable) places.
>>
>> I admit that I didn't understand your question properly, if I did,
>> please correct me.
>>
>
> mremap() should work.  At the same time, the code itself is not going to
> have any stability guarantees between kernel versions -- it obviously
> cannot.

We could guarantee that the symbols in the vdso resolve to particular
offsets within the vdso.  (Yes, this is ugly.)

Does criu support checkpointing with one version of a shared library
and restoring with another?  If there are no textrels (or whatever the
relocation type that actually modifies text as opposed to just the plt
or got) then, in principle, it should be doable.  Otherwise some
kernel help will be needed to checkpoint reliably on one kernel and
restore somewhere else.

(This isn't a regression -- it's already broken.)

>
> Incidentally, the MAYWRITE bit which is there to allow breakpoints is
> obviously problematic for the vvar page.  We could mark the vvar page
> differently, meaning more vmas, or we could decide it just doesn't
> matter and that if you mprotect() the vvar page and write to it you get
> exactly what you asked for...

I have no strong preference here.

--Andy

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 18:44                                   ` Andy Lutomirski
@ 2012-12-14 18:47                                     ` H. Peter Anvin
  2012-12-14 20:12                                       ` Cyrill Gorcunov
  2012-12-17  9:05                                     ` Pavel Emelyanov
  1 sibling, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14 18:47 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Pavel Emelyanov, aarcange, ak, Stefani Seibold, x86,
	linux-kernel, criu, mingo, john.stultz, tglx

On 12/14/2012 10:44 AM, Andy Lutomirski wrote:
>>
>> mremap() should work.  At the same time, the code itself is not going to
>> have any stability guarantees between kernel versions -- it obviously
>> cannot.
> 
> We could guarantee that the symbols in the vdso resolve to particular
> offsets within the vdso.  (Yes, this is ugly.)
> 
> Does criu support checkpointing with one version of a shared library
> and restoring with another?  If there are no textrels (or whatever the
> relocation type that actually modifies text as opposed to just the plt
> or got) then, in principle, it should be doable.  Otherwise some
> kernel help will be needed to checkpoint reliably on one kernel and
> restore somewhere else.
> 
> (This isn't a regression -- it's already broken.)
> 

The real issue is that happens if the process is checkpointed while
inside the vdso and now eip/rip or a stack frame points into the vdso.
This is not impossible or even unlikely, especially on 32 bits it is
downright likely.

	-hpa


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 18:47                                     ` H. Peter Anvin
@ 2012-12-14 20:12                                       ` Cyrill Gorcunov
  2012-12-14 21:08                                         ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2012-12-14 20:12 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andy Lutomirski, aarcange, ak, Pavel Emelyanov, Stefani Seibold,
	x86, linux-kernel, criu, mingo, john.stultz, tglx

On Fri, Dec 14, 2012 at 10:47:53AM -0800, H. Peter Anvin wrote:
> On 12/14/2012 10:44 AM, Andy Lutomirski wrote:
> >>
> >> mremap() should work.  At the same time, the code itself is not going to
> >> have any stability guarantees between kernel versions -- it obviously
> >> cannot.
> > 
> > We could guarantee that the symbols in the vdso resolve to particular
> > offsets within the vdso.  (Yes, this is ugly.)
> > 
> > Does criu support checkpointing with one version of a shared library
> > and restoring with another?  If there are no textrels (or whatever the
> > relocation type that actually modifies text as opposed to just the plt
> > or got) then, in principle, it should be doable.  Otherwise some
> > kernel help will be needed to checkpoint reliably on one kernel and
> > restore somewhere else.
> > 
> > (This isn't a regression -- it's already broken.)
> > 
> The real issue is that happens if the process is checkpointed while
> inside the vdso and now eip/rip or a stack frame points into the vdso.
> This is not impossible or even unlikely, especially on 32 bits it is
> downright likely.

I fear if there are stacked ip which point to vdso -- we simply won't
be able to restore properly if vdso internal format changed significantly
between kernel versions. (At moment we restore vdso exactly at same position
it was on checkpoint stage with same content, iirc).

	Cyrill

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 20:12                                       ` Cyrill Gorcunov
@ 2012-12-14 21:08                                         ` H. Peter Anvin
  2012-12-14 21:20                                           ` Cyrill Gorcunov
  2012-12-14 21:27                                           ` Andy Lutomirski
  0 siblings, 2 replies; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14 21:08 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Andy Lutomirski, aarcange, ak, Pavel Emelyanov, Stefani Seibold,
	x86, linux-kernel, criu, mingo, john.stultz, tglx

On 12/14/2012 12:12 PM, Cyrill Gorcunov wrote:
>>>
>> The real issue is that happens if the process is checkpointed while
>> inside the vdso and now eip/rip or a stack frame points into the vdso.
>> This is not impossible or even unlikely, especially on 32 bits it is
>> downright likely.
> 
> I fear if there are stacked ip which point to vdso -- we simply won't
> be able to restore properly if vdso internal format changed significantly
> between kernel versions. (At moment we restore vdso exactly at same position
> it was on checkpoint stage with same content, iirc).
> 

I don't think there is a way around that.  It is completely unreasonable
to say that the vdso cannot change between kernel versions, for obvious
reasons.  It's worse than "significantly"... changing even one
instruction makes it plausible your eip/rip will point into the middle
of an instruction.

	-hpa



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 21:08                                         ` H. Peter Anvin
@ 2012-12-14 21:20                                           ` Cyrill Gorcunov
  2012-12-14 21:21                                             ` H. Peter Anvin
  2012-12-14 21:27                                           ` Andy Lutomirski
  1 sibling, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2012-12-14 21:20 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andy Lutomirski, aarcange, ak, Pavel Emelyanov, Stefani Seibold,
	x86, linux-kernel, criu, mingo, john.stultz, tglx

On Fri, Dec 14, 2012 at 01:08:35PM -0800, H. Peter Anvin wrote:
> On 12/14/2012 12:12 PM, Cyrill Gorcunov wrote:
> >>>
> >> The real issue is that happens if the process is checkpointed while
> >> inside the vdso and now eip/rip or a stack frame points into the vdso.
> >> This is not impossible or even unlikely, especially on 32 bits it is
> >> downright likely.
> > 
> > I fear if there are stacked ip which point to vdso -- we simply won't
> > be able to restore properly if vdso internal format changed significantly
> > between kernel versions. (At moment we restore vdso exactly at same position
> > it was on checkpoint stage with same content, iirc).
> > 
> 
> I don't think there is a way around that.  It is completely unreasonable
> to say that the vdso cannot change between kernel versions, for obvious
> reasons.  It's worse than "significantly"... changing even one
> instruction makes it plausible your eip/rip will point into the middle
> of an instruction.

Well, one idea was to try to escape dumping when a dumpee inside vdso area
and wait until it leaves this zone, then proceed dumping. Then, if vdso is
changed (say some new instructions were added) we zap original prologues
with jmp to new symbols from fresh vdso provided us by a kernel. I'm not
really sure if this would help us much but just saying (I must admit I
didn't looked yet into vdso implementation details, so sorry if it sounds
stupid).

	Cyrill

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 21:20                                           ` Cyrill Gorcunov
@ 2012-12-14 21:21                                             ` H. Peter Anvin
  0 siblings, 0 replies; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14 21:21 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Andy Lutomirski, aarcange, ak, Pavel Emelyanov, Stefani Seibold,
	x86, linux-kernel, criu, mingo, john.stultz, tglx

On 12/14/2012 01:20 PM, Cyrill Gorcunov wrote:
> On Fri, Dec 14, 2012 at 01:08:35PM -0800, H. Peter Anvin wrote:
>> On 12/14/2012 12:12 PM, Cyrill Gorcunov wrote:
>>>>>
>>>> The real issue is that happens if the process is checkpointed while
>>>> inside the vdso and now eip/rip or a stack frame points into the vdso.
>>>> This is not impossible or even unlikely, especially on 32 bits it is
>>>> downright likely.
>>>
>>> I fear if there are stacked ip which point to vdso -- we simply won't
>>> be able to restore properly if vdso internal format changed significantly
>>> between kernel versions. (At moment we restore vdso exactly at same position
>>> it was on checkpoint stage with same content, iirc).
>>>
>>
>> I don't think there is a way around that.  It is completely unreasonable
>> to say that the vdso cannot change between kernel versions, for obvious
>> reasons.  It's worse than "significantly"... changing even one
>> instruction makes it plausible your eip/rip will point into the middle
>> of an instruction.
> 
> Well, one idea was to try to escape dumping when a dumpee inside vdso area
> and wait until it leaves this zone, then proceed dumping. Then, if vdso is
> changed (say some new instructions were added) we zap original prologues
> with jmp to new symbols from fresh vdso provided us by a kernel. I'm not
> really sure if this would help us much but just saying (I must admit I
> didn't looked yet into vdso implementation details, so sorry if it sounds
> stupid).
> 

Well, if the vdso contains a system call you may be waiting indefinitely.

	-hpa



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 21:08                                         ` H. Peter Anvin
  2012-12-14 21:20                                           ` Cyrill Gorcunov
@ 2012-12-14 21:27                                           ` Andy Lutomirski
  2012-12-14 22:00                                             ` H. Peter Anvin
  1 sibling, 1 reply; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-14 21:27 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Cyrill Gorcunov, aarcange, ak, Pavel Emelyanov, Stefani Seibold,
	x86, linux-kernel, criu, mingo, john.stultz, tglx

On Fri, Dec 14, 2012 at 1:08 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 12/14/2012 12:12 PM, Cyrill Gorcunov wrote:
>>>>
>>> The real issue is that happens if the process is checkpointed while
>>> inside the vdso and now eip/rip or a stack frame points into the vdso.
>>> This is not impossible or even unlikely, especially on 32 bits it is
>>> downright likely.
>>
>> I fear if there are stacked ip which point to vdso -- we simply won't
>> be able to restore properly if vdso internal format changed significantly
>> between kernel versions. (At moment we restore vdso exactly at same position
>> it was on checkpoint stage with same content, iirc).
>>
>
> I don't think there is a way around that.  It is completely unreasonable
> to say that the vdso cannot change between kernel versions, for obvious
> reasons.  It's worse than "significantly"... changing even one
> instruction makes it plausible your eip/rip will point into the middle
> of an instruction.

It's not just kernel versions -- different toolchains may generate
different code.  Heck, building from a different directory can
sometimes generate different output.

The ABI of each vdso function is stable, though -- a sufficiently
clever tool could (maybe) use that knowledge along with unwind data in
the vdso to fix everything up.  This would be interesting, perhaps,
but certainly not easy.

I say we declare "if you want a working vdso in a weird location,
mremap it".  But how does userspace figure out what size to pass to
mremap?  If it's one vma, it's easy.

I don't know all that much about the linux vm.  Can we create a
special vdso address_space or struct inode or something so that a
single vma can contain pages with different flags?

--Andy

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 21:27                                           ` Andy Lutomirski
@ 2012-12-14 22:00                                             ` H. Peter Anvin
  2012-12-14 22:25                                               ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14 22:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Cyrill Gorcunov, aarcange, ak, Pavel Emelyanov, Stefani Seibold,
	x86, linux-kernel, criu, mingo, john.stultz, tglx

On 12/14/2012 01:27 PM, Andy Lutomirski wrote:
> 
> I don't know all that much about the linux vm.  Can we create a
> special vdso address_space or struct inode or something so that a
> single vma can contain pages with different flags?
> 

No, that is still different vmas, but it probably isn't a big deal.

The advantage of having an inode/namespace is that it lets you use
mmap() as opposed to mremap() with it, which might be useful, I don't know.

One option for the checkpoint people might actually be to not use the
vdso for a process that needs to be checkpointed and restarted on a
different machine or different kernel version.  Instead they can install
a pseudo-vdso which just calls normal system calls, and is simply a
static piece of code that makes normal system calls ... since the
internals of the kernel are hidden from userspace it is "clean" that way.

With any actual vdso you risk something like:

	-> vdso entry
	-> signal received, transfer to signal handler
	-> checkpoint
	-> restart
	-> signal handler exit

... and now you return to the address in the old vdso, but the internals
of the vdso may have changed.

	-hpa



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 22:00                                             ` H. Peter Anvin
@ 2012-12-14 22:25                                               ` Cyrill Gorcunov
  2012-12-14 22:27                                                 ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2012-12-14 22:25 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andy Lutomirski, aarcange, ak, Pavel Emelyanov, Stefani Seibold,
	x86, linux-kernel, criu, mingo, john.stultz, tglx

On Fri, Dec 14, 2012 at 02:00:17PM -0800, H. Peter Anvin wrote:
> On 12/14/2012 01:27 PM, Andy Lutomirski wrote:
> > 
> > I don't know all that much about the linux vm.  Can we create a
> > special vdso address_space or struct inode or something so that a
> > single vma can contain pages with different flags?
> > 
> 
> No, that is still different vmas, but it probably isn't a big deal.
> 
> The advantage of having an inode/namespace is that it lets you use
> mmap() as opposed to mremap() with it, which might be useful, I don't know.
> 
> One option for the checkpoint people might actually be to not use the
> vdso for a process that needs to be checkpointed and restarted on a
> different machine or different kernel version.  Instead they can install
> a pseudo-vdso which just calls normal system calls, and is simply a
> static piece of code that makes normal system calls ... since the
> internals of the kernel are hidden from userspace it is "clean" that way.
> 
> With any actual vdso you risk something like:
> 

Is there a chance to make it something like that (assuming the
dumpee is ptraced)

> 	-> vdso entry

mark task as vdso-entered

> 	-> signal received, transfer to signal handler
> 	-> signal handler exit

before task leave vdso the task mark vdso-entered get cleaned
and if ptraced, the ptracing task is notified

> ... and now you return to the address in the old vdso, but the internals
> of the vdso may have changed.

this would allow us to defer checkpoint until task finish vdso code. Peter,
if I understand you correctly you propose we provide some own proxy-vdso
which would redirect calls to real ones, right? But the main problem
is that is exactly the idea to be able to c/r existing programs without
recompiling and such (or I miss something here?).

	Cyrill

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 22:25                                               ` Cyrill Gorcunov
@ 2012-12-14 22:27                                                 ` H. Peter Anvin
  2012-12-14 22:43                                                   ` Cyrill Gorcunov
  0 siblings, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14 22:27 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Andy Lutomirski, aarcange, ak, Pavel Emelyanov, Stefani Seibold,
	x86, linux-kernel, criu, mingo, john.stultz, tglx

On 12/14/2012 02:25 PM, Cyrill Gorcunov wrote:
> 
> this would allow us to defer checkpoint until task finish vdso code. Peter,
> if I understand you correctly you propose we provide some own proxy-vdso
> which would redirect calls to real ones, right? But the main problem
> is that is exactly the idea to be able to c/r existing programs without
> recompiling and such (or I miss something here?).
> 

No, I'm proposing that you use a proxy-vdso which does nothing but
system calls, and therefore can be stable indefinitely.

	-hpa



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 22:27                                                 ` H. Peter Anvin
@ 2012-12-14 22:43                                                   ` Cyrill Gorcunov
  2012-12-14 22:48                                                     ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: Cyrill Gorcunov @ 2012-12-14 22:43 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andy Lutomirski, aarcange, ak, Pavel Emelyanov, Stefani Seibold,
	x86, linux-kernel, criu, mingo, john.stultz, tglx

On Fri, Dec 14, 2012 at 02:27:08PM -0800, H. Peter Anvin wrote:
> On 12/14/2012 02:25 PM, Cyrill Gorcunov wrote:
> > 
> > this would allow us to defer checkpoint until task finish vdso code. Peter,
> > if I understand you correctly you propose we provide some own proxy-vdso
> > which would redirect calls to real ones, right? But the main problem
> > is that is exactly the idea to be able to c/r existing programs without
> > recompiling and such (or I miss something here?).
> 
> No, I'm proposing that you use a proxy-vdso which does nothing but
> system calls, and therefore can be stable indefinitely.

This won't help in case of scenario you've been pointing in
previous email (where c/r happens in a middle of vdso),
would it? Because we still need somehow to be sure we're not
checkpointing in a middle of signal handler which will return
to some vdso place.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14  8:34                               ` [CRIU] " Pavel Emelyanov
  2012-12-14 18:35                                 ` H. Peter Anvin
@ 2012-12-14 22:46                                 ` H. Peter Anvin
  2012-12-14 23:09                                   ` Stefani Seibold
  1 sibling, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14 22:46 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Andy Lutomirski, aarcange, ak, Stefani Seibold, x86,
	linux-kernel, criu, mingo, john.stultz, tglx

On 12/14/2012 12:34 AM, Pavel Emelyanov wrote:
> On 12/14/2012 06:20 AM, Andy Lutomirski wrote:
>> On Thu, Dec 13, 2012 at 6:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> Wouldn't the vdso get mapped already and could be mremap()'d.  If we
>> really need more control I'd almost push for a device/filesystem node
>> that could be mmapped the usual way.
>>
>> Hmm.  That may work, but it'll still break ABI.  I'm not sure that
>> criu is stable enough yet that we should care.  Criu people?
> 
> It's not yet, but we'd still appreciate the criu-friendly vdso redesign.
> 
>> (In brief summary: how annoying would it be if the vdso was no longer
>> just a bunch of constant bytes that lived somewhere?)
> 
> It depends on what vdso is going to be. In the perfect case it should
> a) be mremap-able to any address (or be at fixed address _forever_, but
>    I assume this is not feasible);
> b) have entry points at fixed (or somehow movable) places.
> 
> I admit that I didn't understand your question properly, if I did,
> please correct me.
> 

Either way... criu on the side, we should proceed with this vdso
redesign and get support for the 32-bit entry points including compat
mode on x86-64.

	-hpa



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 22:43                                                   ` Cyrill Gorcunov
@ 2012-12-14 22:48                                                     ` H. Peter Anvin
  2012-12-14 23:48                                                       ` John Stultz
  0 siblings, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14 22:48 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Andy Lutomirski, aarcange, ak, Pavel Emelyanov, Stefani Seibold,
	x86, linux-kernel, criu, mingo, john.stultz, tglx

On 12/14/2012 02:43 PM, Cyrill Gorcunov wrote:
> On Fri, Dec 14, 2012 at 02:27:08PM -0800, H. Peter Anvin wrote:
>> On 12/14/2012 02:25 PM, Cyrill Gorcunov wrote:
>>>
>>> this would allow us to defer checkpoint until task finish vdso code. Peter,
>>> if I understand you correctly you propose we provide some own proxy-vdso
>>> which would redirect calls to real ones, right? But the main problem
>>> is that is exactly the idea to be able to c/r existing programs without
>>> recompiling and such (or I miss something here?).
>>
>> No, I'm proposing that you use a proxy-vdso which does nothing but
>> system calls, and therefore can be stable indefinitely.
> 
> This won't help in case of scenario you've been pointing in
> previous email (where c/r happens in a middle of vdso),
> would it? Because we still need somehow to be sure we're not
> checkpointing in a middle of signal handler which will return
> to some vdso place.

It is okay if and only if those vdso places never change... which I
think is doable if they only contain trival system call wrappers, i.e.
something like:

	movl $__SYS_gettimeofday, %eax
	syscall
	ret

These kinds of wrappers don't rely on live data provided by the kernel,
and so can be checkpointed together with the rest of the process.

	-hpa



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 22:46                                 ` H. Peter Anvin
@ 2012-12-14 23:09                                   ` Stefani Seibold
  2012-12-14 23:29                                     ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: Stefani Seibold @ 2012-12-14 23:09 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Pavel Emelyanov, Andy Lutomirski, aarcange, ak, x86,
	linux-kernel, criu, mingo, john.stultz, tglx

Am Freitag, den 14.12.2012, 14:46 -0800 schrieb H. Peter Anvin:
> On 12/14/2012 12:34 AM, Pavel Emelyanov wrote:
> > On 12/14/2012 06:20 AM, Andy Lutomirski wrote:
> >> On Thu, Dec 13, 2012 at 6:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> >>> Wouldn't the vdso get mapped already and could be mremap()'d.  If we
> >> really need more control I'd almost push for a device/filesystem node
> >> that could be mmapped the usual way.
> >>
> >> Hmm.  That may work, but it'll still break ABI.  I'm not sure that
> >> criu is stable enough yet that we should care.  Criu people?
> > 
> > It's not yet, but we'd still appreciate the criu-friendly vdso redesign.
> > 
> >> (In brief summary: how annoying would it be if the vdso was no longer
> >> just a bunch of constant bytes that lived somewhere?)
> > 
> > It depends on what vdso is going to be. In the perfect case it should
> > a) be mremap-able to any address (or be at fixed address _forever_, but
> >    I assume this is not feasible);
> > b) have entry points at fixed (or somehow movable) places.
> > 
> > I admit that I didn't understand your question properly, if I did,
> > please correct me.
> > 
> 
> Either way... criu on the side, we should proceed with this vdso
> redesign and get support for the 32-bit entry points including compat
> mode on x86-64.
> 
> 	-hpa
> 
> 

Sorry for not following the discussion, but im am currently trying to
compile the vclocktime.c as a 32 bit object. Most of the (clever) work
is done.

After this the next step is to map the needed fixmaps into the 32 bit
address space. Maybe this can be done with install_special_mapping().

I think i will do this job in the next days.

- Stefani



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 23:09                                   ` Stefani Seibold
@ 2012-12-14 23:29                                     ` H. Peter Anvin
  0 siblings, 0 replies; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14 23:29 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: Pavel Emelyanov, Andy Lutomirski, aarcange, ak, x86,
	linux-kernel, criu, mingo, john.stultz, tglx

On 12/14/2012 03:09 PM, Stefani Seibold wrote:
> 
> Sorry for not following the discussion, but im am currently trying to
> compile the vclocktime.c as a 32 bit object. Most of the (clever) work
> is done.
> 
> After this the next step is to map the needed fixmaps into the 32 bit
> address space. Maybe this can be done with install_special_mapping().
> 

install_special_mapping() is indeed how it is done.  The suggestion is
to make the vvar page an actual section inside the vdso, and then just
substitute the vvar page into the mapping array when installing the vdso
into the process user space.

	-hpa



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 22:48                                                     ` H. Peter Anvin
@ 2012-12-14 23:48                                                       ` John Stultz
  2012-12-14 23:55                                                         ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: John Stultz @ 2012-12-14 23:48 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Cyrill Gorcunov, Andy Lutomirski, aarcange, ak, Pavel Emelyanov,
	Stefani Seibold, x86, linux-kernel, criu, mingo, tglx

On 12/14/2012 02:48 PM, H. Peter Anvin wrote:
> On 12/14/2012 02:43 PM, Cyrill Gorcunov wrote:
>> On Fri, Dec 14, 2012 at 02:27:08PM -0800, H. Peter Anvin wrote:
>>
>>
>> This won't help in case of scenario you've been pointing in
>> previous email (where c/r happens in a middle of vdso),
>> would it? Because we still need somehow to be sure we're not
>> checkpointing in a middle of signal handler which will return
>> to some vdso place.
> It is okay if and only if those vdso places never change... which I
> think is doable if they only contain trival system call wrappers, i.e.
> something like:
>
> 	movl $__SYS_gettimeofday, %eax
> 	syscall
> 	ret

Though doesn't this make it easier for exploits (somewhat undoing ASLR)? 
I know Andi always wanted to avoid having syscall instructions at a 
fixed location for the old vsyscall code (though I know we had it 
none-the-less for awhile).   But maybe I'm confusing issues here?

thanks
-john

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 23:48                                                       ` John Stultz
@ 2012-12-14 23:55                                                         ` H. Peter Anvin
  0 siblings, 0 replies; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-14 23:55 UTC (permalink / raw)
  To: John Stultz
  Cc: Cyrill Gorcunov, Andy Lutomirski, aarcange, ak, Pavel Emelyanov,
	Stefani Seibold, x86, linux-kernel, criu, mingo, tglx

On 12/14/2012 03:48 PM, John Stultz wrote:
> On 12/14/2012 02:48 PM, H. Peter Anvin wrote:
>> On 12/14/2012 02:43 PM, Cyrill Gorcunov wrote:
>>> On Fri, Dec 14, 2012 at 02:27:08PM -0800, H. Peter Anvin wrote:
>>>
>>>
>>> This won't help in case of scenario you've been pointing in
>>> previous email (where c/r happens in a middle of vdso),
>>> would it? Because we still need somehow to be sure we're not
>>> checkpointing in a middle of signal handler which will return
>>> to some vdso place.
>> It is okay if and only if those vdso places never change... which I
>> think is doable if they only contain trival system call wrappers, i.e.
>> something like:
>>
>>     movl $__SYS_gettimeofday, %eax
>>     syscall
>>     ret
> 
> Though doesn't this make it easier for exploits (somewhat undoing ASLR)?
> I know Andi always wanted to avoid having syscall instructions at a
> fixed location for the old vsyscall code (though I know we had it
> none-the-less for awhile).   But maybe I'm confusing issues here?
> 

They aren't in fixed addresses across processes... the vdso location can
still be randomized.  It just has to be the same across the
checkpoint/restart operation, just like all the other instructions.

	-hpa



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-14 18:44                                   ` Andy Lutomirski
  2012-12-14 18:47                                     ` H. Peter Anvin
@ 2012-12-17  9:05                                     ` Pavel Emelyanov
       [not found]                                       ` <fb2e871b-3e2a-4e96-9eb9-cb2dd4f66eaa@email.android! .com>
  2012-12-17 15:21                                       ` H. Peter Anvin
  1 sibling, 2 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2012-12-17  9:05 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: H. Peter Anvin, aarcange, ak, Stefani Seibold, x86, linux-kernel,
	criu, mingo, john.stultz, tglx

On 12/14/2012 10:44 PM, Andy Lutomirski wrote:
> On Fri, Dec 14, 2012 at 10:35 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 12/14/2012 12:34 AM, Pavel Emelyanov wrote:
>>> On 12/14/2012 06:20 AM, Andy Lutomirski wrote:
>>>> On Thu, Dec 13, 2012 at 6:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>>>> Wouldn't the vdso get mapped already and could be mremap()'d.  If we
>>>> really need more control I'd almost push for a device/filesystem node
>>>> that could be mmapped the usual way.
>>>>
>>>> Hmm.  That may work, but it'll still break ABI.  I'm not sure that
>>>> criu is stable enough yet that we should care.  Criu people?
>>>
>>> It's not yet, but we'd still appreciate the criu-friendly vdso redesign.
>>>
>>>> (In brief summary: how annoying would it be if the vdso was no longer
>>>> just a bunch of constant bytes that lived somewhere?)
>>>
>>> It depends on what vdso is going to be. In the perfect case it should
>>> a) be mremap-able to any address (or be at fixed address _forever_, but
>>>    I assume this is not feasible);
>>> b) have entry points at fixed (or somehow movable) places.
>>>
>>> I admit that I didn't understand your question properly, if I did,
>>> please correct me.
>>>
>>
>> mremap() should work.  At the same time, the code itself is not going to
>> have any stability guarantees between kernel versions -- it obviously
>> cannot.
> 
> We could guarantee that the symbols in the vdso resolve to particular
> offsets within the vdso.  (Yes, this is ugly.)
> 
> Does criu support checkpointing with one version of a shared library
> and restoring with another?

No, neither we have this in plans.
However, if somebody needs this and implements -- why not?!

Thanks,
Pavel

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-17  9:05                                     ` Pavel Emelyanov
       [not found]                                       ` <fb2e871b-3e2a-4e96-9eb9-cb2dd4f66eaa@email.android! .com>
@ 2012-12-17 15:21                                       ` H. Peter Anvin
  2012-12-17 18:56                                         ` Pavel Emelyanov
  1 sibling, 1 reply; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-17 15:21 UTC (permalink / raw)
  To: Pavel Emelyanov, Andy Lutomirski
  Cc: aarcange, ak, Stefani Seibold, x86, linux-kernel, criu, mingo,
	john.stultz, tglx

Because it is almost impossible to do right?

Pavel Emelyanov <xemul@parallels.com> wrote:

>On 12/14/2012 10:44 PM, Andy Lutomirski wrote:
>> On Fri, Dec 14, 2012 at 10:35 AM, H. Peter Anvin <hpa@zytor.com>
>wrote:
>>> On 12/14/2012 12:34 AM, Pavel Emelyanov wrote:
>>>> On 12/14/2012 06:20 AM, Andy Lutomirski wrote:
>>>>> On Thu, Dec 13, 2012 at 6:18 PM, H. Peter Anvin <hpa@zytor.com>
>wrote:
>>>>>> Wouldn't the vdso get mapped already and could be mremap()'d.  If
>we
>>>>> really need more control I'd almost push for a device/filesystem
>node
>>>>> that could be mmapped the usual way.
>>>>>
>>>>> Hmm.  That may work, but it'll still break ABI.  I'm not sure that
>>>>> criu is stable enough yet that we should care.  Criu people?
>>>>
>>>> It's not yet, but we'd still appreciate the criu-friendly vdso
>redesign.
>>>>
>>>>> (In brief summary: how annoying would it be if the vdso was no
>longer
>>>>> just a bunch of constant bytes that lived somewhere?)
>>>>
>>>> It depends on what vdso is going to be. In the perfect case it
>should
>>>> a) be mremap-able to any address (or be at fixed address _forever_,
>but
>>>>    I assume this is not feasible);
>>>> b) have entry points at fixed (or somehow movable) places.
>>>>
>>>> I admit that I didn't understand your question properly, if I did,
>>>> please correct me.
>>>>
>>>
>>> mremap() should work.  At the same time, the code itself is not
>going to
>>> have any stability guarantees between kernel versions -- it
>obviously
>>> cannot.
>> 
>> We could guarantee that the symbols in the vdso resolve to particular
>> offsets within the vdso.  (Yes, this is ugly.)
>> 
>> Does criu support checkpointing with one version of a shared library
>> and restoring with another?
>
>No, neither we have this in plans.
>However, if somebody needs this and implements -- why not?!
>
>Thanks,
>Pavel

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-17 15:21                                       ` H. Peter Anvin
@ 2012-12-17 18:56                                         ` Pavel Emelyanov
  2012-12-17 18:57                                           ` H. Peter Anvin
  0 siblings, 1 reply; 50+ messages in thread
From: Pavel Emelyanov @ 2012-12-17 18:56 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andy Lutomirski, aarcange, ak, Stefani Seibold, x86,
	linux-kernel, criu, mingo, john.stultz, tglx

On 12/17/2012 07:21 PM, H. Peter Anvin wrote:
> Because it is almost impossible to do right?

In the generic case -- I tend to agree. But it's possible to describe
how a library should communicate to crtools to make it possible.

Anyway, what I wanted to say -- we didn't have this scenario in our
plans, but criu project is open, and if someone comes with sane idea,
we will not object merging it.

> Pavel Emelyanov <xemul@parallels.com> wrote:
> 
>> On 12/14/2012 10:44 PM, Andy Lutomirski wrote:
>>> On Fri, Dec 14, 2012 at 10:35 AM, H. Peter Anvin <hpa@zytor.com>
>> wrote:
>>>> On 12/14/2012 12:34 AM, Pavel Emelyanov wrote:
>>>>> On 12/14/2012 06:20 AM, Andy Lutomirski wrote:
>>>>>> On Thu, Dec 13, 2012 at 6:18 PM, H. Peter Anvin <hpa@zytor.com>
>> wrote:
>>>>>>> Wouldn't the vdso get mapped already and could be mremap()'d.  If
>> we
>>>>>> really need more control I'd almost push for a device/filesystem
>> node
>>>>>> that could be mmapped the usual way.
>>>>>>
>>>>>> Hmm.  That may work, but it'll still break ABI.  I'm not sure that
>>>>>> criu is stable enough yet that we should care.  Criu people?
>>>>>
>>>>> It's not yet, but we'd still appreciate the criu-friendly vdso
>> redesign.
>>>>>
>>>>>> (In brief summary: how annoying would it be if the vdso was no
>> longer
>>>>>> just a bunch of constant bytes that lived somewhere?)
>>>>>
>>>>> It depends on what vdso is going to be. In the perfect case it
>> should
>>>>> a) be mremap-able to any address (or be at fixed address _forever_,
>> but
>>>>>    I assume this is not feasible);
>>>>> b) have entry points at fixed (or somehow movable) places.
>>>>>
>>>>> I admit that I didn't understand your question properly, if I did,
>>>>> please correct me.
>>>>>
>>>>
>>>> mremap() should work.  At the same time, the code itself is not
>> going to
>>>> have any stability guarantees between kernel versions -- it
>> obviously
>>>> cannot.
>>>
>>> We could guarantee that the symbols in the vdso resolve to particular
>>> offsets within the vdso.  (Yes, this is ugly.)
>>>
>>> Does criu support checkpointing with one version of a shared library
>>> and restoring with another?
>>
>> No, neither we have this in plans.
>> However, if somebody needs this and implements -- why not?!
>>
>> Thanks,
>> Pavel
> 



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [CRIU] [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-17 18:56                                         ` Pavel Emelyanov
@ 2012-12-17 18:57                                           ` H. Peter Anvin
  0 siblings, 0 replies; 50+ messages in thread
From: H. Peter Anvin @ 2012-12-17 18:57 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Andy Lutomirski, aarcange, ak, Stefani Seibold, x86,
	linux-kernel, criu, mingo, john.stultz, tglx

On 12/17/2012 10:56 AM, Pavel Emelyanov wrote:
> On 12/17/2012 07:21 PM, H. Peter Anvin wrote:
>> Because it is almost impossible to do right?
> 
> In the generic case -- I tend to agree. But it's possible to describe
> how a library should communicate to crtools to make it possible.
> 
> Anyway, what I wanted to say -- we didn't have this scenario in our
> plans, but criu project is open, and if someone comes with sane idea,
> we will not object merging it.
> 

I doubt it is possible using existing compiler toolchains.

	-hpa



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 20:40   ` Stefani Seibold
@ 2012-12-12  1:29     ` Andy Lutomirski
  0 siblings, 0 replies; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-12  1:29 UTC (permalink / raw)
  To: Stefani Seibold
  Cc: linux-kernel, ak, aarcange, john.stultz, Jeremy Fitzhardinge

[cc: Jeremy Fitzhardinge -- you wrote some of this]

On Tue, Dec 11, 2012 at 12:40 PM, Stefani Seibold <stefani@seibold.net> wrote:
> Am Dienstag, den 11.12.2012, 11:37 -0800 schrieb Andy Lutomirski:
>> On Tue, Dec 11, 2012 at 8:11 AM,  <stefani@seibold.net> wrote:
>> > --- a/arch/x86/vdso/vclock_gettime.c
>> > +++ b/arch/x86/vdso/vclock_gettime.c
>> > @@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
>> >
>> >  static notrace cycle_t vread_hpet(void)
>> >  {
>> > +#ifdef CONFIG_X86_64
>> >         return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
>> > +#else
>> > +       return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
>> > +#endif
>> >  }
>>
>> Is 0xf0 not equal to HPET_COUNTER?
>>
>
> Yes, but HPET_COUNTER is more readable.

Sorry -- read it backwards.  Can you change the 64-bit one as well?

>
>> >
>> >  notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
>> >  {
>> >         long ret;
>> > +#ifdef CONFIG_X86_64
>> >         asm("syscall" : "=a" (ret) :
>> >             "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
>> > +#else
>> > +       asm("int $0x80" : "=a" (ret) :
>> > +           "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
>> > +#endif
>> >         return ret;
>> >  }
>>
>> __kernel_vsyscall is probably much faster if you can figure out how to
>> call it from here :)
>>
>
> Yes i know. Thats one of my problems, because i cannot call
> __kernel_vsyscall directly due the relocation. Any idea?

What actually goes wrong?  I think that if calling __kernel_vsyscall
like a normal symbol doesn't work then it's a bug in the relocation.

What's the point of this relocation?  It appears thoroughly useless in
the !compat case.

--Andy

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 21:18     ` Andy Lutomirski
@ 2012-12-11 21:28       ` Stefani Seibold
  0 siblings, 0 replies; 50+ messages in thread
From: Stefani Seibold @ 2012-12-11 21:28 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: John Stultz, linux-kernel, ak, aarcange

Am Dienstag, den 11.12.2012, 13:18 -0800 schrieb Andy Lutomirski:
> On Tue, Dec 11, 2012 at 12:54 PM, Stefani Seibold <stefani@seibold.net> wrote:
> > Am Dienstag, den 11.12.2012, 11:27 -0800 schrieb John Stultz:
> >> On 12/11/2012 08:11 AM, stefani@seibold.net wrote:
> >> > From: Stefani Seibold <stefani@seibold.net>
> >> >
> >> > This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
> >> > and vdso_time() support to the VDSO for x86 32-bit kernels.
> >> >
> >> > The reason to do this was to get a fast reliable time stamp. Many developers
> >> > uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
> >> > time functions a fast and reliable way, because the kernel knows the best time
> >> > source and the P- and C-state of the CPU.
> >> Very cool. There have been similar implementations of this patch over
> >> the years, but they were all bit more hackish then this.
> >>
> > Thanks, i think it is not more hackish than the x86 64 code.
> >
> >>
> >> > For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
> >> > the ACPI timer should be easily to add with an other patch.
> >> Although the ACPI PM timer requires port-io which would need tweaking to
> >> allow normal users to access it. And I'm not sure if the performance
> >> would be much improved, as the port-io probably dominates the
> >> performance cost.
> >>
> >
> > It was only an idea. I think it would be easy to give the ioperm read
> > for the ACPI Timer without breaking anything.
> 
> I think you'll have to be careful not to make context switches slower
> if you do this.  IIRC ioperm bitmaps live in the task structure (the
> i386 thing, not task_struct) and require extra slow instructions.
> 

Good point. I will measure the overhead. But for now, it was only an
idea.

> I still think we should do TSC only.
> 

HPET is still necessary, since many system have no stable TSC, because
of the C1E (enhanced halt). And with the VDSO the kernel overhead for
system call will go away, this is better than nothing.

> >> Could you expand a bit as to why a compat layer isn't possible? It seems
> >> we could easily convert the vsyscall_gtod_data to a more explicit
> >> arch-neutral size. Or is it the actual data page mapping?
> >>
> >
> > This could be done in a subsequent patch. First i want not change to
> > much, to make the review easier. Converting the vsyscall_gtod_data to
> > arch neutral is not so easy, because the size of time_t, timezone and
> > timespec differs. So currently 32 bit programs running under a 64 bit
> > kernel will not get a VDSO with time functions. But real 32 bit kernel
> > will provide it.
> >
> >
> 
> How would you address the vvar page in 32-bit userspace on a 64-bit
> kernel?  Would you map it low for 32-bit tasks or play descriptor
> games?
> 

As i mentioned, this is currently not implemented. On a 64 bit kernel
there is no VDSO time functions for 32 bit programs available. This
sugar will be come maybe with an other patch if it is possible to map he
vvar page to the low 32-bit address space. But for now a 32 bit programs
on 32 kernel will get the advantage.

- Stefani



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 20:54   ` Stefani Seibold
@ 2012-12-11 21:18     ` Andy Lutomirski
  2012-12-11 21:28       ` Stefani Seibold
  0 siblings, 1 reply; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-11 21:18 UTC (permalink / raw)
  To: Stefani Seibold; +Cc: John Stultz, linux-kernel, ak, aarcange

On Tue, Dec 11, 2012 at 12:54 PM, Stefani Seibold <stefani@seibold.net> wrote:
> Am Dienstag, den 11.12.2012, 11:27 -0800 schrieb John Stultz:
>> On 12/11/2012 08:11 AM, stefani@seibold.net wrote:
>> > From: Stefani Seibold <stefani@seibold.net>
>> >
>> > This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
>> > and vdso_time() support to the VDSO for x86 32-bit kernels.
>> >
>> > The reason to do this was to get a fast reliable time stamp. Many developers
>> > uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
>> > time functions a fast and reliable way, because the kernel knows the best time
>> > source and the P- and C-state of the CPU.
>> Very cool. There have been similar implementations of this patch over
>> the years, but they were all bit more hackish then this.
>>
> Thanks, i think it is not more hackish than the x86 64 code.
>
>>
>> > For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
>> > the ACPI timer should be easily to add with an other patch.
>> Although the ACPI PM timer requires port-io which would need tweaking to
>> allow normal users to access it. And I'm not sure if the performance
>> would be much improved, as the port-io probably dominates the
>> performance cost.
>>
>
> It was only an idea. I think it would be easy to give the ioperm read
> for the ACPI Timer without breaking anything.

I think you'll have to be careful not to make context switches slower
if you do this.  IIRC ioperm bitmaps live in the task structure (the
i386 thing, not task_struct) and require extra slow instructions.

I still think we should do TSC only.

>> Could you expand a bit as to why a compat layer isn't possible? It seems
>> we could easily convert the vsyscall_gtod_data to a more explicit
>> arch-neutral size. Or is it the actual data page mapping?
>>
>
> This could be done in a subsequent patch. First i want not change to
> much, to make the review easier. Converting the vsyscall_gtod_data to
> arch neutral is not so easy, because the size of time_t, timezone and
> timespec differs. So currently 32 bit programs running under a 64 bit
> kernel will not get a VDSO with time functions. But real 32 bit kernel
> will provide it.
>
>

How would you address the vvar page in 32-bit userspace on a 64-bit
kernel?  Would you map it low for 32-bit tasks or play descriptor
games?

--Andy

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 19:27 ` John Stultz
  2012-12-11 19:37   ` Andy Lutomirski
@ 2012-12-11 20:54   ` Stefani Seibold
  2012-12-11 21:18     ` Andy Lutomirski
  1 sibling, 1 reply; 50+ messages in thread
From: Stefani Seibold @ 2012-12-11 20:54 UTC (permalink / raw)
  To: John Stultz; +Cc: linux-kernel, ak, aarcange, luto

Am Dienstag, den 11.12.2012, 11:27 -0800 schrieb John Stultz:
> On 12/11/2012 08:11 AM, stefani@seibold.net wrote:
> > From: Stefani Seibold <stefani@seibold.net>
> >
> > This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
> > and vdso_time() support to the VDSO for x86 32-bit kernels.
> >
> > The reason to do this was to get a fast reliable time stamp. Many developers
> > uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
> > time functions a fast and reliable way, because the kernel knows the best time
> > source and the P- and C-state of the CPU.
> Very cool. There have been similar implementations of this patch over 
> the years, but they were all bit more hackish then this.
> 
Thanks, i think it is not more hackish than the x86 64 code.

> 
> > For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
> > the ACPI timer should be easily to add with an other patch.
> Although the ACPI PM timer requires port-io which would need tweaking to 
> allow normal users to access it. And I'm not sure if the performance 
> would be much improved, as the port-io probably dominates the 
> performance cost.
> 

It was only an idea. I think it would be easy to give the ioperm read
for the ACPI Timer without breaking anything.

> > The helper library to use the VDSO functions can be download at
> > http://http://seibold.net/vdso.c
> > The libary is very small, only 228 lines of code. Compile it with
> > gcc -Wall -O3 -fpic vdso.c -lrt -shared -o libvdso.so
> > and use it with LD_PRELOAD=<path>/libvdso.so
> >
> > This kind of helper must be integrated into glibc, for x86 64-bit and
> > PowerPC it is already there.
> >
> A few notes below...
> 
> 
> > diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
> > index 59c6c40..45ba688 100644
> > --- a/arch/x86/include/asm/compat.h
> > +++ b/arch/x86/include/asm/compat.h
> > @@ -295,6 +295,10 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
> >   
> >   static inline void __user *arch_compat_alloc_user_space(long len)
> >   {
> > +#ifdef CONFIG_X86_32
> > +	struct pt_regs *regs = task_pt_regs(current);
> > +	return (void __user *)regs->sp - len;
> > +#else
> >   	compat_uptr_t sp;
> >   
> >   	if (test_thread_flag(TIF_IA32)) {
> > @@ -305,6 +309,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
> >   	}
> >   
> >   	return (void __user *)round_down(sp - len, 16);
> > +#endif
> >   }
> 
> This style of in-line ifdefs are ugly and hard to read.
> 
> So instead of doing:
> void myfunction (void)
> {
> #ifdef 32bits
>      32_bit_implementation();
> #else
>      64_bit_implementation();
> #endif
> }
> 
> Where possible, please do:
> #ifdef 32bits
> void myfunction1(void)
> {
>      32_bit implementation();
> }
> void myfunction2(void)
> {
> ....
> #else /*64 bit versions */
> void myfunction1(void)
> {
>      64_bit implementation();
> }
> void myfunction2(void)
> ....
> #endif
> 
> > diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> > index 4df6c37..2dc6b72 100644
> > --- a/arch/x86/vdso/vclock_gettime.c
> > +++ b/arch/x86/vdso/vclock_gettime.c
> > @@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
> >   
> >   static notrace cycle_t vread_hpet(void)
> >   {
> > +#ifdef CONFIG_X86_64
> >   	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
> > +#else
> > +	return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
> > +#endif
> >   }
> >   
> >   notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
> >   {
> >   	long ret;
> > +#ifdef CONFIG_X86_64
> >   	asm("syscall" : "=a" (ret) :
> >   	    "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
> > +#else
> > +	asm("int $0x80" : "=a" (ret) :
> > +	    "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
> > +#endif
> >   	return ret;
> >   }
> Same point here.
> 
> 

Will be fixed.

> > diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
> > new file mode 100644
> > index 0000000..c9a1909
> > --- /dev/null
> > +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
> > @@ -0,0 +1,7 @@
> > +/*
> > + * since vgtod layout differs between X86_64 and x86_32, it is not possible to
> > + * provide a 32 bit vclock with a 64 bit kernel
> > + */
> > +#ifdef CONFIG_X86_32
> > +#include "../vclock_gettime.c"
> > +#endif
> Could you expand a bit as to why a compat layer isn't possible? It seems 
> we could easily convert the vsyscall_gtod_data to a more explicit 
> arch-neutral size. Or is it the actual data page mapping?
> 

This could be done in a subsequent patch. First i want not change to
much, to make the review easier. Converting the vsyscall_gtod_data to
arch neutral is not so easy, because the size of time_t, timezone and
timespec differs. So currently 32 bit programs running under a 64 bit
kernel will not get a VDSO with time functions. But real 32 bit kernel
will provide it.



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 19:37 ` Andy Lutomirski
@ 2012-12-11 20:40   ` Stefani Seibold
  2012-12-12  1:29     ` Andy Lutomirski
  0 siblings, 1 reply; 50+ messages in thread
From: Stefani Seibold @ 2012-12-11 20:40 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: linux-kernel, ak, aarcange, john.stultz

Am Dienstag, den 11.12.2012, 11:37 -0800 schrieb Andy Lutomirski:
> On Tue, Dec 11, 2012 at 8:11 AM,  <stefani@seibold.net> wrote:
> > From: Stefani Seibold <stefani@seibold.net>
> >
> > This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
> > and vdso_time() support to the VDSO for x86 32-bit kernels.
> >
> > The reason to do this was to get a fast reliable time stamp. Many developers
> > uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
> > time functions a fast and reliable way, because the kernel knows the best time
> > source and the P- and C-state of the CPU.
> >
> > For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
> > the ACPI timer should be easily to add with an other patch.
> >
> > The helper library to use the VDSO functions can be download at
> > http://http://seibold.net/vdso.c
> 
> Wow -- another implementation.  See Documentation/vDSO/parse_vdso.c.
> (Also, I think your code will break if anyone ever strips the vdso.)
> 
> > --- a/arch/x86/vdso/vclock_gettime.c
> > +++ b/arch/x86/vdso/vclock_gettime.c
> > @@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
> >
> >  static notrace cycle_t vread_hpet(void)
> >  {
> > +#ifdef CONFIG_X86_64
> >         return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
> > +#else
> > +       return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
> > +#endif
> >  }
> 
> Is 0xf0 not equal to HPET_COUNTER?
> 

Yes, but HPET_COUNTER is more readable.

> >
> >  notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
> >  {
> >         long ret;
> > +#ifdef CONFIG_X86_64
> >         asm("syscall" : "=a" (ret) :
> >             "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
> > +#else
> > +       asm("int $0x80" : "=a" (ret) :
> > +           "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
> > +#endif
> >         return ret;
> >  }
> 
> __kernel_vsyscall is probably much faster if you can figure out how to
> call it from here :)
> 

Yes i know. Thats one of my problems, because i cannot call
__kernel_vsyscall directly due the relocation. Any idea?

> >
> > @@ -74,15 +83,20 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
> >  {
> >         long ret;
> >
> > +#ifdef CONFIG_X86_64
> >         asm("syscall" : "=a" (ret) :
> >             "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
> > +#else
> > +       asm("int $0x80" : "=a" (ret) :
> > +           "a" (__NR_gettimeofday), "b" (tv), "c" (tz) : "memory");
> > +#endif
> >         return ret;
> >  }
> >
> 
> Ditto.
> 
> > --- a/arch/x86/vdso/vdso32/vdso32.lds.S
> > +++ b/arch/x86/vdso/vdso32/vdso32.lds.S
> > @@ -24,6 +24,16 @@ VERSION
> >                 __kernel_vsyscall;
> >                 __kernel_sigreturn;
> >                 __kernel_rt_sigreturn;
> > +#ifdef CONFIG_X86_32
> > +               clock_gettime;
> > +               __vdso_clock_gettime;
> > +               gettimeofday;
> > +               __vdso_gettimeofday;
> > +               gettimeofday;
> > +               __vdso_gettimeofday;
> > +               time;
> > +               __vdso_time;
> > +#endif
> 
> Please remove the non-__vdso versions.  They're not useful and x86-64
> only has them for backwards compatibility.
> 
> >         local: *;
> >         };
> >  }
> > @@ -35,3 +45,8 @@ VDSO32_PRELINK                = VDSO_PRELINK;
> >  VDSO32_vsyscall                = __kernel_vsyscall;
> >  VDSO32_sigreturn       = __kernel_sigreturn;
> >  VDSO32_rt_sigreturn    = __kernel_rt_sigreturn;
> > +#ifdef CONFIG_X86_32
> > +VDSO32_clock_gettime   = clock_gettime;
> > +VDSO32_gettimeofday    = gettimeofday;
> > +VDSO32_time            = time;
> > +#endif
> 
> Yikes.  What's this?  (Just curious.)

I don't know if anyone needs this. Maybe i can kick it away.



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 19:27 ` John Stultz
@ 2012-12-11 19:37   ` Andy Lutomirski
  2012-12-11 20:54   ` Stefani Seibold
  1 sibling, 0 replies; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-11 19:37 UTC (permalink / raw)
  To: John Stultz; +Cc: stefani, linux-kernel, ak, aarcange

On Tue, Dec 11, 2012 at 11:27 AM, John Stultz <john.stultz@linaro.org> wrote:
> On 12/11/2012 08:11 AM, stefani@seibold.net wrote:
>>
>> From: Stefani Seibold <stefani@seibold.net>
>>
>> This small patch add the functions vdso_gettimeofday(),
>> vdso_clock_gettime()
>> and vdso_time() support to the VDSO for x86 32-bit kernels.
>>
>> The reason to do this was to get a fast reliable time stamp. Many
>> developers
>> uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
>> time functions a fast and reliable way, because the kernel knows the best
>> time
>> source and the P- and C-state of the CPU.
>
> Very cool. There have been similar implementations of this patch over the
> years, but they were all bit more hackish then this.
>
>
>
>> For x86 the vclock_gettime.c currently supports only the HPET and TSC
>> timer,
>> the ACPI timer should be easily to add with an other patch.
>
> Although the ACPI PM timer requires port-io which would need tweaking to
> allow normal users to access it. And I'm not sure if the performance would
> be much improved, as the port-io probably dominates the performance cost.
>

I'd actually advocate going the other way and removing hpet vdso
support.  Last time I benchmarked this, a syscall took about 20ns on
my box, and reading the hpet counter took about 750ns, giving
approximately no gain to the hpet-via-vdso optimization.

--Andy

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 16:11 stefani
  2012-12-11 19:27 ` John Stultz
@ 2012-12-11 19:37 ` Andy Lutomirski
  2012-12-11 20:40   ` Stefani Seibold
  1 sibling, 1 reply; 50+ messages in thread
From: Andy Lutomirski @ 2012-12-11 19:37 UTC (permalink / raw)
  To: stefani; +Cc: linux-kernel, ak, aarcange, john.stultz

On Tue, Dec 11, 2012 at 8:11 AM,  <stefani@seibold.net> wrote:
> From: Stefani Seibold <stefani@seibold.net>
>
> This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
> and vdso_time() support to the VDSO for x86 32-bit kernels.
>
> The reason to do this was to get a fast reliable time stamp. Many developers
> uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
> time functions a fast and reliable way, because the kernel knows the best time
> source and the P- and C-state of the CPU.
>
> For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
> the ACPI timer should be easily to add with an other patch.
>
> The helper library to use the VDSO functions can be download at
> http://http://seibold.net/vdso.c

Wow -- another implementation.  See Documentation/vDSO/parse_vdso.c.
(Also, I think your code will break if anyone ever strips the vdso.)

> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
>
>  static notrace cycle_t vread_hpet(void)
>  {
> +#ifdef CONFIG_X86_64
>         return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
> +#else
> +       return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
> +#endif
>  }

Is 0xf0 not equal to HPET_COUNTER?

>
>  notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
>  {
>         long ret;
> +#ifdef CONFIG_X86_64
>         asm("syscall" : "=a" (ret) :
>             "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
> +#else
> +       asm("int $0x80" : "=a" (ret) :
> +           "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
> +#endif
>         return ret;
>  }

__kernel_vsyscall is probably much faster if you can figure out how to
call it from here :)

>
> @@ -74,15 +83,20 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
>  {
>         long ret;
>
> +#ifdef CONFIG_X86_64
>         asm("syscall" : "=a" (ret) :
>             "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
> +#else
> +       asm("int $0x80" : "=a" (ret) :
> +           "a" (__NR_gettimeofday), "b" (tv), "c" (tz) : "memory");
> +#endif
>         return ret;
>  }
>

Ditto.

> --- a/arch/x86/vdso/vdso32/vdso32.lds.S
> +++ b/arch/x86/vdso/vdso32/vdso32.lds.S
> @@ -24,6 +24,16 @@ VERSION
>                 __kernel_vsyscall;
>                 __kernel_sigreturn;
>                 __kernel_rt_sigreturn;
> +#ifdef CONFIG_X86_32
> +               clock_gettime;
> +               __vdso_clock_gettime;
> +               gettimeofday;
> +               __vdso_gettimeofday;
> +               gettimeofday;
> +               __vdso_gettimeofday;
> +               time;
> +               __vdso_time;
> +#endif

Please remove the non-__vdso versions.  They're not useful and x86-64
only has them for backwards compatibility.

>         local: *;
>         };
>  }
> @@ -35,3 +45,8 @@ VDSO32_PRELINK                = VDSO_PRELINK;
>  VDSO32_vsyscall                = __kernel_vsyscall;
>  VDSO32_sigreturn       = __kernel_sigreturn;
>  VDSO32_rt_sigreturn    = __kernel_rt_sigreturn;
> +#ifdef CONFIG_X86_32
> +VDSO32_clock_gettime   = clock_gettime;
> +VDSO32_gettimeofday    = gettimeofday;
> +VDSO32_time            = time;
> +#endif

Yikes.  What's this?  (Just curious.)

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Add VDSO time function support for x86 32-bit kernel
  2012-12-11 16:11 stefani
@ 2012-12-11 19:27 ` John Stultz
  2012-12-11 19:37   ` Andy Lutomirski
  2012-12-11 20:54   ` Stefani Seibold
  2012-12-11 19:37 ` Andy Lutomirski
  1 sibling, 2 replies; 50+ messages in thread
From: John Stultz @ 2012-12-11 19:27 UTC (permalink / raw)
  To: stefani; +Cc: linux-kernel, ak, aarcange, luto

On 12/11/2012 08:11 AM, stefani@seibold.net wrote:
> From: Stefani Seibold <stefani@seibold.net>
>
> This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
> and vdso_time() support to the VDSO for x86 32-bit kernels.
>
> The reason to do this was to get a fast reliable time stamp. Many developers
> uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
> time functions a fast and reliable way, because the kernel knows the best time
> source and the P- and C-state of the CPU.
Very cool. There have been similar implementations of this patch over 
the years, but they were all bit more hackish then this.


> For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
> the ACPI timer should be easily to add with an other patch.
Although the ACPI PM timer requires port-io which would need tweaking to 
allow normal users to access it. And I'm not sure if the performance 
would be much improved, as the port-io probably dominates the 
performance cost.

> The helper library to use the VDSO functions can be download at
> http://http://seibold.net/vdso.c
> The libary is very small, only 228 lines of code. Compile it with
> gcc -Wall -O3 -fpic vdso.c -lrt -shared -o libvdso.so
> and use it with LD_PRELOAD=<path>/libvdso.so
>
> This kind of helper must be integrated into glibc, for x86 64-bit and
> PowerPC it is already there.
>
A few notes below...


> diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
> index 59c6c40..45ba688 100644
> --- a/arch/x86/include/asm/compat.h
> +++ b/arch/x86/include/asm/compat.h
> @@ -295,6 +295,10 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
>   
>   static inline void __user *arch_compat_alloc_user_space(long len)
>   {
> +#ifdef CONFIG_X86_32
> +	struct pt_regs *regs = task_pt_regs(current);
> +	return (void __user *)regs->sp - len;
> +#else
>   	compat_uptr_t sp;
>   
>   	if (test_thread_flag(TIF_IA32)) {
> @@ -305,6 +309,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
>   	}
>   
>   	return (void __user *)round_down(sp - len, 16);
> +#endif
>   }

This style of in-line ifdefs are ugly and hard to read.

So instead of doing:
void myfunction (void)
{
#ifdef 32bits
     32_bit_implementation();
#else
     64_bit_implementation();
#endif
}

Where possible, please do:
#ifdef 32bits
void myfunction1(void)
{
     32_bit implementation();
}
void myfunction2(void)
{
....
#else /*64 bit versions */
void myfunction1(void)
{
     64_bit implementation();
}
void myfunction2(void)
....
#endif

> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> index 4df6c37..2dc6b72 100644
> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
>   
>   static notrace cycle_t vread_hpet(void)
>   {
> +#ifdef CONFIG_X86_64
>   	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
> +#else
> +	return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
> +#endif
>   }
>   
>   notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
>   {
>   	long ret;
> +#ifdef CONFIG_X86_64
>   	asm("syscall" : "=a" (ret) :
>   	    "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
> +#else
> +	asm("int $0x80" : "=a" (ret) :
> +	    "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
> +#endif
>   	return ret;
>   }
Same point here.


> diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
> new file mode 100644
> index 0000000..c9a1909
> --- /dev/null
> +++ b/arch/x86/vdso/vdso32/vclock_gettime.c
> @@ -0,0 +1,7 @@
> +/*
> + * since vgtod layout differs between X86_64 and x86_32, it is not possible to
> + * provide a 32 bit vclock with a 64 bit kernel
> + */
> +#ifdef CONFIG_X86_32
> +#include "../vclock_gettime.c"
> +#endif
Could you expand a bit as to why a compat layer isn't possible? It seems 
we could easily convert the vsyscall_gtod_data to a more explicit 
arch-neutral size. Or is it the actual data page mapping?

thanks
-john

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH] Add VDSO time function support for x86 32-bit kernel
@ 2012-12-11 16:11 stefani
  2012-12-11 19:27 ` John Stultz
  2012-12-11 19:37 ` Andy Lutomirski
  0 siblings, 2 replies; 50+ messages in thread
From: stefani @ 2012-12-11 16:11 UTC (permalink / raw)
  To: linux-kernel, ak, aarcange, john.stultz, luto; +Cc: stefani

From: Stefani Seibold <stefani@seibold.net>

This small patch add the functions vdso_gettimeofday(), vdso_clock_gettime()
and vdso_time() support to the VDSO for x86 32-bit kernels.

The reason to do this was to get a fast reliable time stamp. Many developers
uses TSC to get a fast time time stamp, without knowing the pitfalls. VDSO
time functions a fast and reliable way, because the kernel knows the best time
source and the P- and C-state of the CPU.

For x86 the vclock_gettime.c currently supports only the HPET and TSC timer,
the ACPI timer should be easily to add with an other patch.

The helper library to use the VDSO functions can be download at
http://http://seibold.net/vdso.c
The libary is very small, only 228 lines of code. Compile it with
gcc -Wall -O3 -fpic vdso.c -lrt -shared -o libvdso.so
and use it with LD_PRELOAD=<path>/libvdso.so

This kind of helper must be integrated into glibc, for x86 64-bit and
PowerPC it is already there.

Some benchmark results (all measurements are in nano seconds):

Intel(R) Celeron(TM) CPU 400MHz

Average time kernel call:
 gettimeofday(): 1039
 clock_gettime(): 1578
 benchmark time(): 526
Average time VDSO call:
 gettimeofday(): 378
 clock_gettime(): 303
 benchmark time(): 60

Celeron(R) Dual-Core CPU T3100 1.90GHz

Average time kernel call:
 gettimeofday(): 209
 clock_gettime(): 406
 benchmark time(): 135
Average time VDSO call:
 gettimeofday(): 51
 clock_gettime(): 43
 benchmark time(): 10

So you can see a performance increase between 4 and 13, depending on the
CPU and the function.

The patch is against kernel 3.7. Please apply if you like it.

Changelog:

25.11.2012 - first release and prove of concept for linux 3.4
11.12.2012 - Port to linux 3.7 and code cleanup

Signed-off-by: Stefani Seibold <stefani@seibold.net>
---
 arch/x86/Kconfig                      |  4 +-
 arch/x86/include/asm/clocksource.h    |  4 --
 arch/x86/include/asm/compat.h         |  5 ++
 arch/x86/include/asm/fixmap.h         |  3 +-
 arch/x86/include/asm/vgtod.h          |  1 +
 arch/x86/include/asm/vvar.h           |  7 +++
 arch/x86/kernel/Makefile              |  1 +
 arch/x86/kernel/hpet.c                |  9 ++--
 arch/x86/kernel/setup.c               |  2 +
 arch/x86/kernel/tsc.c                 |  2 -
 arch/x86/kernel/vmlinux.lds.S         |  3 --
 arch/x86/kernel/vsyscall_64.c         | 49 ------------------
 arch/x86/kernel/vsyscall_gtod.c       | 94 +++++++++++++++++++++++++++++++++++
 arch/x86/vdso/Makefile                |  1 +
 arch/x86/vdso/vclock_gettime.c        | 16 +++++-
 arch/x86/vdso/vdso32/vclock_gettime.c |  7 +++
 arch/x86/vdso/vdso32/vdso32.lds.S     | 15 ++++++
 17 files changed, 158 insertions(+), 65 deletions(-)
 create mode 100644 arch/x86/kernel/vsyscall_gtod.c
 create mode 100644 arch/x86/vdso/vdso32/vclock_gettime.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff..b8c2c74 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -100,9 +100,9 @@ config X86
 	select GENERIC_CMOS_UPDATE
 	select CLOCKSOURCE_WATCHDOG
 	select GENERIC_CLOCKEVENTS
-	select ARCH_CLOCKSOURCE_DATA if X86_64
+	select ARCH_CLOCKSOURCE_DATA
 	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
-	select GENERIC_TIME_VSYSCALL if X86_64
+	select GENERIC_TIME_VSYSCALL
 	select KTIME_SCALAR if X86_32
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3..67d68b9 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -3,8 +3,6 @@
 #ifndef _ASM_X86_CLOCKSOURCE_H
 #define _ASM_X86_CLOCKSOURCE_H
 
-#ifdef CONFIG_X86_64
-
 #define VCLOCK_NONE 0  /* No vDSO clock available.	*/
 #define VCLOCK_TSC  1  /* vDSO should use vread_tsc.	*/
 #define VCLOCK_HPET 2  /* vDSO should use vread_hpet.	*/
@@ -13,6 +11,4 @@ struct arch_clocksource_data {
 	int vclock_mode;
 };
 
-#endif /* CONFIG_X86_64 */
-
 #endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c40..45ba688 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -295,6 +295,10 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
 
 static inline void __user *arch_compat_alloc_user_space(long len)
 {
+#ifdef CONFIG_X86_32
+	struct pt_regs *regs = task_pt_regs(current);
+	return (void __user *)regs->sp - len;
+#else
 	compat_uptr_t sp;
 
 	if (test_thread_flag(TIF_IA32)) {
@@ -305,6 +309,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
 	}
 
 	return (void __user *)round_down(sp - len, 16);
+#endif
 }
 
 static inline bool is_x32_task(void)
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c..b26e9e0 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,9 +78,10 @@ enum fixed_addresses {
 	VSYSCALL_LAST_PAGE,
 	VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
 			    + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+#endif
 	VVAR_PAGE,
 	VSYSCALL_HPET,
-#endif
+
 	FIX_DBGP_BASE,
 	FIX_EARLYCON_MEM_BASE,
 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 46e24d3..eb87b53 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -27,4 +27,5 @@ struct vsyscall_gtod_data {
 };
 extern struct vsyscall_gtod_data vsyscall_gtod_data;

+extern void map_vgtod(void);
 #endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index de656ac..6f71098 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -17,7 +17,11 @@
  */
 
 /* Base address of vvars.  This is not ABI. */
+#ifdef CONFIG_X86_64
 #define VVAR_ADDRESS (-10*1024*1024 - 4096)
+#else
+#define VVAR_ADDRESS 0xffffd000
+#endif
 
 #if defined(__VVAR_KERNEL_LDS)
 
@@ -46,5 +50,8 @@
 DECLARE_VVAR(0, volatile unsigned long, jiffies)
 DECLARE_VVAR(16, int, vgetcpu_mode)
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+#ifdef CONFIG_X86_32
+DECLARE_VVAR(512, const void __iomem *, vsyscall_hpet)
+#endif
 
 #undef DECLARE_VVAR
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 91ce48f..298a0b1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,6 +26,7 @@ obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-y			+= syscall_$(BITS).o
+obj-y			+= vsyscall_gtod.o
 obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 1460a5d..38887ca 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -69,13 +69,18 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
 
 #ifdef CONFIG_X86_64
 #include <asm/pgtable.h>
+#else
+#include <asm/vvar.h>
+
+DEFINE_VVAR(const void __iomem *, vsyscall_hpet);
 #endif
 
 static inline void hpet_set_mapping(void)
 {
 	hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-#ifdef CONFIG_X86_64
 	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
+#ifdef CONFIG_X86_32
+	vsyscall_hpet = (const void __iomem *)fix_to_virt(VSYSCALL_HPET);
 #endif
 }
 
@@ -752,9 +757,7 @@ static struct clocksource clocksource_hpet = {
 	.mask		= HPET_MASK,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 	.resume		= hpet_resume_counter,
-#ifdef CONFIG_X86_64
 	.archdata	= { .vclock_mode = VCLOCK_HPET },
-#endif
 };
 
 static int hpet_clocksource_register(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ca45696..c2f6bbb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -114,6 +114,7 @@
 #include <asm/mce.h>
 #include <asm/alternative.h>
 #include <asm/prom.h>
+#include <asm/vgtod.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -997,6 +998,7 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_X86_64
 	map_vsyscall();
 #endif
+	map_vgtod();
 
 	generic_apic_probe();
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cfa5d4f..078cc9a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -772,9 +772,7 @@ static struct clocksource clocksource_tsc = {
 	.mask                   = CLOCKSOURCE_MASK(64),
 	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_MUST_VERIFY,
-#ifdef CONFIG_X86_64
 	.archdata               = { .vclock_mode = VCLOCK_TSC },
-#endif
 };
 
 void mark_tsc_unstable(char *reason)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 22a1530..d73c1df 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -151,7 +151,6 @@ SECTIONS
 		_edata = .;
 	} :data
 
-#ifdef CONFIG_X86_64
 
 	. = ALIGN(PAGE_SIZE);
 	__vvar_page = .;
@@ -173,8 +172,6 @@ SECTIONS
 
        . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
 
-#endif /* CONFIG_X86_64 */
-
 	/* Init code and data - will be freed after init */
 	. = ALIGN(PAGE_SIZE);
 	.init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3a3e8c9..dfc9727 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -54,7 +54,6 @@
 #include "vsyscall_trace.h"
 
 DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
 
 static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
 
@@ -77,48 +76,6 @@ static int __init vsyscall_setup(char *str)
 }
 early_param("vsyscall", vsyscall_setup);
 
-void update_vsyscall_tz(void)
-{
-	vsyscall_gtod_data.sys_tz = sys_tz;
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
-	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
-
-	write_seqcount_begin(&vdata->seq);
-
-	/* copy vsyscall data */
-	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
-	vdata->clock.cycle_last		= tk->clock->cycle_last;
-	vdata->clock.mask		= tk->clock->mask;
-	vdata->clock.mult		= tk->mult;
-	vdata->clock.shift		= tk->shift;
-
-	vdata->wall_time_sec		= tk->xtime_sec;
-	vdata->wall_time_snsec		= tk->xtime_nsec;
-
-	vdata->monotonic_time_sec	= tk->xtime_sec
-					+ tk->wall_to_monotonic.tv_sec;
-	vdata->monotonic_time_snsec	= tk->xtime_nsec
-					+ (tk->wall_to_monotonic.tv_nsec
-						<< tk->shift);
-	while (vdata->monotonic_time_snsec >=
-					(((u64)NSEC_PER_SEC) << tk->shift)) {
-		vdata->monotonic_time_snsec -=
-					((u64)NSEC_PER_SEC) << tk->shift;
-		vdata->monotonic_time_sec++;
-	}
-
-	vdata->wall_time_coarse.tv_sec	= tk->xtime_sec;
-	vdata->wall_time_coarse.tv_nsec	= (long)(tk->xtime_nsec >> tk->shift);
-
-	vdata->monotonic_time_coarse	= timespec_add(vdata->wall_time_coarse,
-							tk->wall_to_monotonic);
-
-	write_seqcount_end(&vdata->seq);
-}
-
 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 			      const char *message)
 {
@@ -366,8 +323,6 @@ void __init map_vsyscall(void)
 {
 	extern char __vsyscall_page;
 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
-	extern char __vvar_page;
-	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
 
 	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
 		     vsyscall_mode == NATIVE
@@ -375,10 +330,6 @@ void __init map_vsyscall(void)
 		     : PAGE_KERNEL_VVAR);
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
 		     (unsigned long)VSYSCALL_START);
-
-	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
-	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
-		     (unsigned long)VVAR_ADDRESS);
 }
 
 static int __init vsyscall_init(void)
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
new file mode 100644
index 0000000..3631f7f
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -0,0 +1,94 @@
+/*
+ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ *  Modified for x86 32 bit architecture by
+ *  Stefani Seibold <stefani@seibold.net>
+ *
+ *  Thanks to hpa@transmeta.com for some useful hint.
+ *  Special thanks to Ingo Molnar for his early experience with
+ *  a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ */
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/seqlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysctl.h>
+#include <linux/topology.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/ratelimit.h>
+
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/compat.h>
+#include <asm/page.h>
+#include <asm/unistd.h>
+#include <asm/fixmap.h>
+#include <asm/errno.h>
+#include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
+#include <asm/vgtod.h>
+#include <asm/traps.h>
+
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+
+void update_vsyscall_tz(void)
+{
+	vsyscall_gtod_data.sys_tz = sys_tz;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+
+	write_seqcount_begin(&vdata->seq);
+
+	/* copy vsyscall data */
+	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
+	vdata->clock.cycle_last		= tk->clock->cycle_last;
+	vdata->clock.mask		= tk->clock->mask;
+	vdata->clock.mult		= tk->mult;
+	vdata->clock.shift		= tk->shift;
+
+	vdata->wall_time_sec		= tk->xtime_sec;
+	vdata->wall_time_snsec		= tk->xtime_nsec;
+
+	vdata->monotonic_time_sec	= tk->xtime_sec
+					+ tk->wall_to_monotonic.tv_sec;
+	vdata->monotonic_time_snsec	= tk->xtime_nsec
+					+ (tk->wall_to_monotonic.tv_nsec
+						<< tk->shift);
+	while (vdata->monotonic_time_snsec >=
+					(((u64)NSEC_PER_SEC) << tk->shift)) {
+		vdata->monotonic_time_snsec -=
+					((u64)NSEC_PER_SEC) << tk->shift;
+		vdata->monotonic_time_sec++;
+	}
+
+	vdata->wall_time_coarse.tv_sec	= tk->xtime_sec;
+	vdata->wall_time_coarse.tv_nsec	= (long)(tk->xtime_nsec >> tk->shift);
+
+	vdata->monotonic_time_coarse	= timespec_add(vdata->wall_time_coarse,
+							tk->wall_to_monotonic);
+
+	write_seqcount_end(&vdata->seq);
+}
+
+void __init map_vgtod(void)
+{
+	extern char __vvar_page;
+	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
+
+	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+		     (unsigned long)VVAR_ADDRESS);
+#endif
+}
+
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index fd14be1..959221b 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -147,6 +147,7 @@ $(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
 
 $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
 				 $(obj)/vdso32/vdso32.lds \
+				 $(obj)/vdso32/vclock_gettime.o \
 				 $(obj)/vdso32/note.o \
 				 $(obj)/vdso32/%.o
 	$(call if_changed,vdso)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c37..2dc6b72 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -59,14 +59,23 @@ notrace static cycle_t vread_tsc(void)
 
 static notrace cycle_t vread_hpet(void)
 {
+#ifdef CONFIG_X86_64
 	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+#else
+	return readl(VVAR(vsyscall_hpet) + HPET_COUNTER);
+#endif
 }
 
 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
+#ifdef CONFIG_X86_64
 	asm("syscall" : "=a" (ret) :
 	    "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
+#else
+	asm("int $0x80" : "=a" (ret) :
+	    "a" (__NR_clock_gettime), "b" (clock), "c" (ts) : "memory");
+#endif
 	return ret;
 }
 
@@ -74,15 +83,20 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 {
 	long ret;
 
+#ifdef CONFIG_X86_64
 	asm("syscall" : "=a" (ret) :
 	    "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
+#else
+	asm("int $0x80" : "=a" (ret) :
+	    "a" (__NR_gettimeofday), "b" (tv), "c" (tz) : "memory");
+#endif
 	return ret;
 }
 
 
 notrace static inline u64 vgetsns(void)
 {
-	long v;
+	u64 v;
 	cycles_t cycles;
 	if (gtod->clock.vclock_mode == VCLOCK_TSC)
 		cycles = vread_tsc();
diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c
new file mode 100644
index 0000000..c9a1909
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vclock_gettime.c
@@ -0,0 +1,7 @@
+/*
+ * since vgtod layout differs between X86_64 and x86_32, it is not possible to
+ * provide a 32 bit vclock with a 64 bit kernel
+ */
+#ifdef CONFIG_X86_32
+#include "../vclock_gettime.c"
+#endif
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S
index 976124b..80ef0d6 100644
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/vdso/vdso32/vdso32.lds.S
@@ -24,6 +24,16 @@ VERSION
 		__kernel_vsyscall;
 		__kernel_sigreturn;
 		__kernel_rt_sigreturn;
+#ifdef CONFIG_X86_32
+		clock_gettime;
+		__vdso_clock_gettime;
+		gettimeofday;
+		__vdso_gettimeofday;
+		gettimeofday;
+		__vdso_gettimeofday;
+		time;
+		__vdso_time;
+#endif
 	local: *;
 	};
 }
@@ -35,3 +45,8 @@ VDSO32_PRELINK		= VDSO_PRELINK;
 VDSO32_vsyscall		= __kernel_vsyscall;
 VDSO32_sigreturn	= __kernel_sigreturn;
 VDSO32_rt_sigreturn	= __kernel_rt_sigreturn;
+#ifdef CONFIG_X86_32
+VDSO32_clock_gettime	= clock_gettime;
+VDSO32_gettimeofday	= gettimeofday;
+VDSO32_time		= time;
+#endif
-- 
1.8.0


^ permalink raw reply related	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2012-12-17 18:58 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-12-12 20:19 [PATCH] Add VDSO time function support for x86 32-bit kernel stefani
2012-12-12 23:34 ` H. Peter Anvin
2012-12-13  5:53   ` Stefani Seibold
2012-12-13  6:10     ` H. Peter Anvin
2012-12-13  6:14     ` H. Peter Anvin
2012-12-13  6:17       ` Stefani Seibold
2012-12-13  6:47         ` H. Peter Anvin
2012-12-13  7:17           ` Stefani Seibold
2012-12-13 19:32             ` Andy Lutomirski
2012-12-14  0:09               ` H. Peter Anvin
2012-12-14  0:20                 ` Andy Lutomirski
2012-12-14  0:36                   ` H. Peter Anvin
2012-12-14  1:32                   ` H. Peter Anvin
2012-12-14  1:42                     ` Andy Lutomirski
2012-12-14  1:49                       ` H. Peter Anvin
2012-12-14  2:11                         ` Andy Lutomirski
2012-12-14  2:18                           ` H. Peter Anvin
2012-12-14  2:20                             ` Andy Lutomirski
2012-12-14  8:34                               ` [CRIU] " Pavel Emelyanov
2012-12-14 18:35                                 ` H. Peter Anvin
2012-12-14 18:44                                   ` Andy Lutomirski
2012-12-14 18:47                                     ` H. Peter Anvin
2012-12-14 20:12                                       ` Cyrill Gorcunov
2012-12-14 21:08                                         ` H. Peter Anvin
2012-12-14 21:20                                           ` Cyrill Gorcunov
2012-12-14 21:21                                             ` H. Peter Anvin
2012-12-14 21:27                                           ` Andy Lutomirski
2012-12-14 22:00                                             ` H. Peter Anvin
2012-12-14 22:25                                               ` Cyrill Gorcunov
2012-12-14 22:27                                                 ` H. Peter Anvin
2012-12-14 22:43                                                   ` Cyrill Gorcunov
2012-12-14 22:48                                                     ` H. Peter Anvin
2012-12-14 23:48                                                       ` John Stultz
2012-12-14 23:55                                                         ` H. Peter Anvin
2012-12-17  9:05                                     ` Pavel Emelyanov
     [not found]                                       ` <fb2e871b-3e2a-4e96-9eb9-cb2dd4f66eaa@email.android! .com>
2012-12-17 15:21                                       ` H. Peter Anvin
2012-12-17 18:56                                         ` Pavel Emelyanov
2012-12-17 18:57                                           ` H. Peter Anvin
2012-12-14 22:46                                 ` H. Peter Anvin
2012-12-14 23:09                                   ` Stefani Seibold
2012-12-14 23:29                                     ` H. Peter Anvin
  -- strict thread matches above, loose matches on Subject: below --
2012-12-11 16:11 stefani
2012-12-11 19:27 ` John Stultz
2012-12-11 19:37   ` Andy Lutomirski
2012-12-11 20:54   ` Stefani Seibold
2012-12-11 21:18     ` Andy Lutomirski
2012-12-11 21:28       ` Stefani Seibold
2012-12-11 19:37 ` Andy Lutomirski
2012-12-11 20:40   ` Stefani Seibold
2012-12-12  1:29     ` Andy Lutomirski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).