All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] [PATCH] use "reliable" tsc properly when available, but verify
@ 2009-09-28 20:19 Dan Magenheimer
  2009-09-28 21:01 ` Keir Fraser
  0 siblings, 1 reply; 12+ messages in thread
From: Dan Magenheimer @ 2009-09-28 20:19 UTC (permalink / raw)
  To: Xen-Devel (E-mail)

[-- Attachment #1: Type: text/plain, Size: 11555 bytes --]

(This is compile-tested only.  Review requested especially
where marked with FIXME.  Also, code that does write_tsc
in smpboot.c may also need patching.  And Linux does
a touch_nmi_watchdog()... does Xen need to do
anything similar?)

Most modern Intel and AMD processors and servers have
fully synchronized, non-stop TSCs that don't even stop
in C3 state.  Recent upstream Linux kernels test a cpuid
bit and record this capability as
X86_FEATURE_TSC_RELIABLE.  According to Intel, all
recent Intel processors AND the systems built on them
have this property.  According to AMD, many recent AMD
processors (and all recent server processors) have
this property but apparently some of the systems built
on them do not.

So we trust but verify... if the cpuid bit is set
we assume a reliable tsc, but use the Linux boottime
check_tsc_warp algorithm to periodically verify that
the tsc hasn't skewed.  If it has, we fall back to Xen
managing (and periodically writing) the TSC.

The check_tsc_warp algorithm is CPU-intensive, so
we test it on a decaying schedule, at 1sec, 2sec,
4sec, 8sec, 16sec, 32sec, etc.

Also correct mis-spelling of NOSTOP to NONSTOP to
match the Linux spelling.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>

diff -r 1e33261a814f xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/Makefile	Mon Sep 28 13:26:11 2009 -0600
@@ -45,6 +45,7 @@ obj-y += string.o
 obj-y += string.o
 obj-y += sysctl.o
 obj-y += time.o
+obj-y += tsc_sync.o
 obj-y += trace.o
 obj-y += traps.o
 obj-y += usercopy.o
diff -r 1e33261a814f xen/arch/x86/cpu/amd.c
--- a/xen/arch/x86/cpu/amd.c	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/cpu/amd.c	Mon Sep 28 13:26:11 2009 -0600
@@ -463,7 +463,9 @@ static void __devinit init_amd(struct cp
 		c->x86_power = cpuid_edx(0x80000007);
 		if (c->x86_power & (1<<8)) {
 			set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-			set_bit(X86_FEATURE_NOSTOP_TSC, c->x86_capability);
+			set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability);
+                        if ( c->x86 != 0x11 )
+			    set_bit(X86_FEATURE_TSC_RELIABLE, c->x86_capability);
 		}
 	}
 
diff -r 1e33261a814f xen/arch/x86/cpu/intel.c
--- a/xen/arch/x86/cpu/intel.c	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/cpu/intel.c	Mon Sep 28 13:26:11 2009 -0600
@@ -226,7 +226,8 @@ static void __devinit init_intel(struct 
 		set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
 	if (cpuid_edx(0x80000007) & (1u<<8)) {
 		set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-		set_bit(X86_FEATURE_NOSTOP_TSC, c->x86_capability);
+		set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability);
+		set_bit(X86_FEATURE_TSC_RELIABLE, c->x86_capability);
 	}
 	if ((c->cpuid_level >= 0x00000006) &&
 	    (cpuid_eax(0x00000006) & (1u<<2)))
diff -r 1e33261a814f xen/arch/x86/time.c
--- a/xen/arch/x86/time.c	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/time.c	Mon Sep 28 13:26:11 2009 -0600
@@ -698,7 +698,7 @@ void cstate_restore_tsc(void)
     s_time_t stime_delta;
     u64 new_tsc;
 
-    if ( boot_cpu_has(X86_FEATURE_NOSTOP_TSC) )
+    if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
         return;
 
     stime_delta = read_platform_stime() - t->stime_master_stamp;
@@ -1100,6 +1100,11 @@ struct calibration_rendezvous {
     u64 master_tsc_stamp;
 };
 
+static void (*rendezvous_func) (void *info);
+static int tsc_reliable = 0;
+static unsigned long tsc_max_warp = 0;
+static unsigned long tsc_verify_decay = 0;
+
 static void time_calibration_tsc_rendezvous(void *_r)
 {
     int i;
@@ -1180,6 +1185,50 @@ static void time_calibration_std_rendezv
     raise_softirq(TIME_CALIBRATE_SOFTIRQ);
 }
 
+static void time_verify_tsc_calibration_rendezvous(void *_r)
+{
+    struct cpu_calibration *c = &this_cpu(cpu_calibration);
+    struct calibration_rendezvous *r = _r;
+    unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);
+
+    /* check_tsc_warp is VERY expensive so test only on log2 intervals */
+    tsc_verify_decay++;
+    if ( !(tsc_verify_decay & (tsc_verify_decay-1)) )
+    {
+        if ( smp_processor_id() == 0 )
+        {
+            while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
+                mb();
+            check_tsc_warp(cpu_khz, &tsc_max_warp);
+            atomic_inc(&r->semaphore);
+        }
+        else
+        {
+            atomic_inc(&r->semaphore);
+            while ( atomic_read(&r->semaphore) < total_cpus )
+                mb();
+            check_tsc_warp(cpu_khz, &tsc_max_warp);
+            atomic_inc(&r->semaphore);
+            while ( atomic_read(&r->semaphore) > total_cpus )
+                mb();
+        }
+    }
+
+    if ( tsc_max_warp && smp_processor_id() == 0 )
+    {
+        printk("TSC warp detected (%lu cycles), disabling reliable TSC\n",
+                tsc_max_warp);
+        tsc_reliable = -1;
+        rendezvous_func = time_calibration_tsc_rendezvous;
+    }
+
+    rdtscll(c->local_tsc_stamp);
+    c->stime_local_stamp = get_s_time();
+    c->stime_master_stamp = r->master_stime;
+
+    raise_softirq(TIME_CALIBRATE_SOFTIRQ);
+}
+
 static void time_calibration(void *unused)
 {
     struct calibration_rendezvous r = {
@@ -1188,11 +1237,7 @@ static void time_calibration(void *unuse
     };
 
     /* @wait=1 because we must wait for all cpus before freeing @r. */
-    on_selected_cpus(&r.cpu_calibration_map,
-                     opt_consistent_tscs
-                     ? time_calibration_tsc_rendezvous
-                     : time_calibration_std_rendezvous,
-                     &r, 1);
+    on_selected_cpus(&r.cpu_calibration_map, rendezvous_func, &r, 1);
 }
 
 void init_percpu_time(void)
@@ -1219,16 +1264,19 @@ void init_percpu_time(void)
 /* Late init function (after all CPUs are booted). */
 int __init init_xen_time(void)
 {
-    if ( !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
-        opt_consistent_tscs = 0;
-
-    /* If we have constant TSCs then scale factor can be shared. */
-    if ( opt_consistent_tscs )
+    /* If we have reliable TSCs then scale factor can be shared. */
+    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
     {
         int cpu;
         for_each_possible_cpu ( cpu )
             per_cpu(cpu_time, cpu).tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
+        rendezvous_func = time_verify_tsc_calibration_rendezvous;
+        tsc_reliable = 1;
     }
+    else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+        rendezvous_func = time_calibration_tsc_rendezvous;
+    else
+        rendezvous_func = time_calibration_std_rendezvous;
 
     open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
 
@@ -1463,6 +1511,13 @@ static void dump_softtsc(unsigned char k
     struct domain *d;
     int domcnt = 0;
 
+    if ( tsc_reliable > 0 )
+        printk("TSC is reliable\n");
+    else if ( tsc_reliable < 0 )
+        printk("Hardware determined TSC reliable, verification failed with "
+               "warp = %lu cycles\n", tsc_max_warp);
+    else
+        printk("TSC is not reliable\n");
     for_each_domain ( d )
     {
         if ( !d->arch.vtsc )
diff -r 1e33261a814f xen/arch/x86/tsc_sync.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/tsc_sync.c	Mon Sep 28 13:26:11 2009 -0600
@@ -0,0 +1,93 @@
+/*
+ * check TSC synchronization.
+ *
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
+ * Modified for Xen by Dan Magenheimer, Oracle Corp.
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * The warp-check is point-to-point between two CPUs, the CPU
+ * initiating the bootup is the 'source CPU', the freshly booting
+ * CPU is the 'target CPU'.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * ( The serial nature of the boot logic and the CPU hotplug lock
+ *   protects against more than 2 CPUs entering this code. )
+ */
+#include <xen/config.h>
+#include <xen/spinlock.h>
+#include <asm/processor.h>
+#include <asm/time.h>
+
+/* FIXME Are these OK for Xen? Xen has no _raw_spin_lock() */
+#define rdtsc_barrier  mb
+#define raw_spinlock_t spinlock_t
+#define __raw_spin_lock spin_lock
+#define __raw_spin_unlock spin_unlock
+#define __RAW_SPIN_LOCK_UNLOCKED  SPIN_LOCK_UNLOCKED
+
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove TSC time-warps:
+ */
+static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
+static __cpuinitdata cycles_t last_tsc;
+
+/*
+ * TSC-warp measurement loop running on both CPUs:
+ */
+void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
+{
+	cycles_t start, now, prev, end;
+	int i;
+
+	rdtsc_barrier();
+	start = get_cycles();
+	rdtsc_barrier();
+	/*
+	 * The measurement runs for 2 msecs:
+	 */
+	end = start + tsc_khz * 2ULL;
+	now = start;
+
+	for (i = 0; ; i++) {
+		/*
+		 * We take the global lock, measure TSC, save the
+		 * previous TSC that was measured (possibly on
+		 * another CPU) and update the previous TSC timestamp.
+		 */
+		__raw_spin_lock(&sync_lock);
+		prev = last_tsc;
+		rdtsc_barrier();
+		now = get_cycles();
+		rdtsc_barrier();
+		last_tsc = now;
+		__raw_spin_unlock(&sync_lock);
+
+		/*
+		 * Be nice every now and then (and also check whether
+		 * measurement is done [we also insert a 10 million
+		 * loops safety exit, so we dont lock up in case the
+		 * TSC readout is totally broken]):
+		 */
+		if (unlikely(!(i & 7))) {
+			if (now > end || i > 10000000)
+				break;
+			cpu_relax();
+			/*touch_nmi_watchdog();*/
+		}
+		/*
+		 * Outside the critical section we can now see whether
+		 * we saw a time-warp of the TSC going backwards:
+		 */
+		if (unlikely(prev > now)) {
+			__raw_spin_lock(&sync_lock);
+			if ( *max_warp > prev - now )
+				*max_warp = prev - now;
+			__raw_spin_unlock(&sync_lock);
+		}
+	}
+}
diff -r 1e33261a814f xen/include/asm-x86/cpufeature.h
--- a/xen/include/asm-x86/cpufeature.h	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/include/asm-x86/cpufeature.h	Mon Sep 28 13:26:11 2009 -0600
@@ -74,9 +74,10 @@
 #define X86_FEATURE_P3		(3*32+ 6) /* P3 */
 #define X86_FEATURE_P4		(3*32+ 7) /* P4 */
 #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_NOSTOP_TSC	(3*32+ 9) /* TSC does not stop in C states */
+#define X86_FEATURE_NONSTOP_TSC	(3*32+ 9) /* TSC does not stop in C states */
 #define X86_FEATURE_ARAT	(3*32+ 10) /* Always running APIC timer */
 #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_TSC_RELIABLE (3*32+12) /* TSC is known to be reliable */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
diff -r 1e33261a814f xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/include/asm-x86/time.h	Mon Sep 28 13:26:11 2009 -0600
@@ -43,4 +43,6 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns)
 
 void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
 
+void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp);
+
 #endif /* __X86_TIME_H__ */

[-- Attachment #2: tsc-reliable.patch --]
[-- Type: application/octet-stream, Size: 9991 bytes --]

diff -r 1e33261a814f xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/Makefile	Mon Sep 28 13:26:11 2009 -0600
@@ -45,6 +45,7 @@ obj-y += string.o
 obj-y += string.o
 obj-y += sysctl.o
 obj-y += time.o
+obj-y += tsc_sync.o
 obj-y += trace.o
 obj-y += traps.o
 obj-y += usercopy.o
diff -r 1e33261a814f xen/arch/x86/cpu/amd.c
--- a/xen/arch/x86/cpu/amd.c	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/cpu/amd.c	Mon Sep 28 13:26:11 2009 -0600
@@ -463,7 +463,9 @@ static void __devinit init_amd(struct cp
 		c->x86_power = cpuid_edx(0x80000007);
 		if (c->x86_power & (1<<8)) {
 			set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-			set_bit(X86_FEATURE_NOSTOP_TSC, c->x86_capability);
+			set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability);
+                        if ( c->x86 != 0x11 )
+			    set_bit(X86_FEATURE_TSC_RELIABLE, c->x86_capability);
 		}
 	}
 
diff -r 1e33261a814f xen/arch/x86/cpu/intel.c
--- a/xen/arch/x86/cpu/intel.c	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/cpu/intel.c	Mon Sep 28 13:26:11 2009 -0600
@@ -226,7 +226,8 @@ static void __devinit init_intel(struct 
 		set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
 	if (cpuid_edx(0x80000007) & (1u<<8)) {
 		set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-		set_bit(X86_FEATURE_NOSTOP_TSC, c->x86_capability);
+		set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability);
+		set_bit(X86_FEATURE_TSC_RELIABLE, c->x86_capability);
 	}
 	if ((c->cpuid_level >= 0x00000006) &&
 	    (cpuid_eax(0x00000006) & (1u<<2)))
diff -r 1e33261a814f xen/arch/x86/time.c
--- a/xen/arch/x86/time.c	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/arch/x86/time.c	Mon Sep 28 13:26:11 2009 -0600
@@ -698,7 +698,7 @@ void cstate_restore_tsc(void)
     s_time_t stime_delta;
     u64 new_tsc;
 
-    if ( boot_cpu_has(X86_FEATURE_NOSTOP_TSC) )
+    if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
         return;
 
     stime_delta = read_platform_stime() - t->stime_master_stamp;
@@ -1100,6 +1100,11 @@ struct calibration_rendezvous {
     u64 master_tsc_stamp;
 };
 
+static void (*rendezvous_func) (void *info);
+static int tsc_reliable = 0;
+static unsigned long tsc_max_warp = 0;
+static unsigned long tsc_verify_decay = 0;
+
 static void time_calibration_tsc_rendezvous(void *_r)
 {
     int i;
@@ -1180,6 +1185,50 @@ static void time_calibration_std_rendezv
     raise_softirq(TIME_CALIBRATE_SOFTIRQ);
 }
 
+static void time_verify_tsc_calibration_rendezvous(void *_r)
+{
+    struct cpu_calibration *c = &this_cpu(cpu_calibration);
+    struct calibration_rendezvous *r = _r;
+    unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);
+
+    /* check_tsc_warp is VERY expensive so test only on log2 intervals */
+    tsc_verify_decay++;
+    if ( !(tsc_verify_decay & (tsc_verify_decay-1)) )
+    {
+        if ( smp_processor_id() == 0 )
+        {
+            while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
+                mb();
+            check_tsc_warp(cpu_khz, &tsc_max_warp);
+            atomic_inc(&r->semaphore);
+        }
+        else
+        {
+            atomic_inc(&r->semaphore);
+            while ( atomic_read(&r->semaphore) < total_cpus )
+                mb();
+            check_tsc_warp(cpu_khz, &tsc_max_warp);
+            atomic_inc(&r->semaphore);
+            while ( atomic_read(&r->semaphore) > total_cpus )
+                mb();
+        }
+    }
+
+    if ( tsc_max_warp && smp_processor_id() == 0 )
+    {
+        printk("TSC warp detected (%lu cycles), disabling reliable TSC\n",
+                tsc_max_warp);
+        tsc_reliable = -1;
+        rendezvous_func = time_calibration_tsc_rendezvous;
+    }
+
+    rdtscll(c->local_tsc_stamp);
+    c->stime_local_stamp = get_s_time();
+    c->stime_master_stamp = r->master_stime;
+
+    raise_softirq(TIME_CALIBRATE_SOFTIRQ);
+}
+
 static void time_calibration(void *unused)
 {
     struct calibration_rendezvous r = {
@@ -1188,11 +1237,7 @@ static void time_calibration(void *unuse
     };
 
     /* @wait=1 because we must wait for all cpus before freeing @r. */
-    on_selected_cpus(&r.cpu_calibration_map,
-                     opt_consistent_tscs
-                     ? time_calibration_tsc_rendezvous
-                     : time_calibration_std_rendezvous,
-                     &r, 1);
+    on_selected_cpus(&r.cpu_calibration_map, rendezvous_func, &r, 1);
 }
 
 void init_percpu_time(void)
@@ -1219,16 +1264,19 @@ void init_percpu_time(void)
 /* Late init function (after all CPUs are booted). */
 int __init init_xen_time(void)
 {
-    if ( !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
-        opt_consistent_tscs = 0;
-
-    /* If we have constant TSCs then scale factor can be shared. */
-    if ( opt_consistent_tscs )
+    /* If we have reliable TSCs then scale factor can be shared. */
+    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
     {
         int cpu;
         for_each_possible_cpu ( cpu )
             per_cpu(cpu_time, cpu).tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
+        rendezvous_func = time_verify_tsc_calibration_rendezvous;
+        tsc_reliable = 1;
     }
+    else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+        rendezvous_func = time_calibration_tsc_rendezvous;
+    else
+        rendezvous_func = time_calibration_std_rendezvous;
 
     open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
 
@@ -1463,6 +1511,13 @@ static void dump_softtsc(unsigned char k
     struct domain *d;
     int domcnt = 0;
 
+    if ( tsc_reliable > 0 )
+        printk("TSC is reliable\n");
+    else if ( tsc_reliable < 0 )
+        printk("Hardware determined TSC reliable, verification failed with "
+               "warp = %lu cycles\n", tsc_max_warp);
+    else
+        printk("TSC is not reliable\n");
     for_each_domain ( d )
     {
         if ( !d->arch.vtsc )
diff -r 1e33261a814f xen/arch/x86/tsc_sync.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/tsc_sync.c	Mon Sep 28 13:26:11 2009 -0600
@@ -0,0 +1,93 @@
+/*
+ * check TSC synchronization.
+ *
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
+ * Modified for Xen by Dan Magenheimer, Oracle Corp.
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * The warp-check is point-to-point between two CPUs, the CPU
+ * initiating the bootup is the 'source CPU', the freshly booting
+ * CPU is the 'target CPU'.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * ( The serial nature of the boot logic and the CPU hotplug lock
+ *   protects against more than 2 CPUs entering this code. )
+ */
+#include <xen/config.h>
+#include <xen/spinlock.h>
+#include <asm/processor.h>
+#include <asm/time.h>
+
+/* FIXME Are these OK for Xen? Xen has no _raw_spin_lock() */
+#define rdtsc_barrier  mb
+#define raw_spinlock_t spinlock_t
+#define __raw_spin_lock spin_lock
+#define __raw_spin_unlock spin_unlock
+#define __RAW_SPIN_LOCK_UNLOCKED  SPIN_LOCK_UNLOCKED
+
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove TSC time-warps:
+ */
+static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
+static __cpuinitdata cycles_t last_tsc;
+
+/*
+ * TSC-warp measurement loop running on both CPUs:
+ */
+void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
+{
+	cycles_t start, now, prev, end;
+	int i;
+
+	rdtsc_barrier();
+	start = get_cycles();
+	rdtsc_barrier();
+	/*
+	 * The measurement runs for 2 msecs:
+	 */
+	end = start + tsc_khz * 2ULL;
+	now = start;
+
+	for (i = 0; ; i++) {
+		/*
+		 * We take the global lock, measure TSC, save the
+		 * previous TSC that was measured (possibly on
+		 * another CPU) and update the previous TSC timestamp.
+		 */
+		__raw_spin_lock(&sync_lock);
+		prev = last_tsc;
+		rdtsc_barrier();
+		now = get_cycles();
+		rdtsc_barrier();
+		last_tsc = now;
+		__raw_spin_unlock(&sync_lock);
+
+		/*
+		 * Be nice every now and then (and also check whether
+		 * measurement is done [we also insert a 10 million
+		 * loops safety exit, so we dont lock up in case the
+		 * TSC readout is totally broken]):
+		 */
+		if (unlikely(!(i & 7))) {
+			if (now > end || i > 10000000)
+				break;
+			cpu_relax();
+			/*touch_nmi_watchdog();*/
+		}
+		/*
+		 * Outside the critical section we can now see whether
+		 * we saw a time-warp of the TSC going backwards:
+		 */
+		if (unlikely(prev > now)) {
+			__raw_spin_lock(&sync_lock);
+			if ( *max_warp > prev - now )
+				*max_warp = prev - now;
+			__raw_spin_unlock(&sync_lock);
+		}
+	}
+}
diff -r 1e33261a814f xen/include/asm-x86/cpufeature.h
--- a/xen/include/asm-x86/cpufeature.h	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/include/asm-x86/cpufeature.h	Mon Sep 28 13:26:11 2009 -0600
@@ -74,9 +74,10 @@
 #define X86_FEATURE_P3		(3*32+ 6) /* P3 */
 #define X86_FEATURE_P4		(3*32+ 7) /* P4 */
 #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_NOSTOP_TSC	(3*32+ 9) /* TSC does not stop in C states */
+#define X86_FEATURE_NONSTOP_TSC	(3*32+ 9) /* TSC does not stop in C states */
 #define X86_FEATURE_ARAT	(3*32+ 10) /* Always running APIC timer */
 #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_TSC_RELIABLE (3*32+12) /* TSC is known to be reliable */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
diff -r 1e33261a814f xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h	Mon Sep 28 13:59:35 2009 +0100
+++ b/xen/include/asm-x86/time.h	Mon Sep 28 13:26:11 2009 -0600
@@ -43,4 +43,6 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns)
 
 void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
 
+void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp);
+
 #endif /* __X86_TIME_H__ */

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-28 20:19 [RFC] [PATCH] use "reliable" tsc properly when available, but verify Dan Magenheimer
@ 2009-09-28 21:01 ` Keir Fraser
  2009-09-28 21:06   ` Keir Fraser
  0 siblings, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2009-09-28 21:01 UTC (permalink / raw)
  To: Dan Magenheimer, Xen-Devel (E-mail)

On 28/09/2009 21:19, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:

> So we trust but verify... if the cpuid bit is set
> we assume a reliable tsc, but use the Linux boottime
> check_tsc_warp algorithm to periodically verify that
> the tsc hasn't skewed.  If it has, we fall back to Xen
> managing (and periodically writing) the TSC.
> 
> The check_tsc_warp algorithm is CPU-intensive, so
> we test it on a decaying schedule, at 1sec, 2sec,
> 4sec, 8sec, 16sec, 32sec, etc.

Surely it should be sufficient to check TSCs for consistency across all CPUs
periodically, and against the chosen platform timer, and ensure none are
drifting? An operation which would not require us to loop for 2ms and would
provide rather more useful information than an ad-hoc multi-CPU
race-to-update-a-shared-variable-an-arbitrary-and-large-number-of-times.

I wouldn't take anything like this algorithm.

 -- Keir

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-28 21:01 ` Keir Fraser
@ 2009-09-28 21:06   ` Keir Fraser
  2009-09-28 22:05     ` Dan Magenheimer
  0 siblings, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2009-09-28 21:06 UTC (permalink / raw)
  To: Dan Magenheimer, Xen-Devel (E-mail)

On 28/09/2009 22:01, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

> 
> Surely it should be sufficient to check TSCs for consistency across all CPUs
> periodically, and against the chosen platform timer, and ensure none are
> drifting? An operation which would not require us to loop for 2ms and would
> provide rather more useful information than an ad-hoc multi-CPU
> race-to-update-a-shared-variable-an-arbitrary-and-large-number-of-times.

I should add, not only is the algorithm stupid and slow, but it doesn't even
check for exactly what RELIABLE_TSC guarantees -- constant-rate TSCs. This
would be useless on a single-CPU system, for example, or perhaps more
practically a single-socket system where all TSCs skewed together due to
package-wide power management. In the latter case TSCs would not skew
relative to each other, even though they could 'skew' relative to wallclock
(represented in Xen by the platform timer).

 -- Keir

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-28 21:06   ` Keir Fraser
@ 2009-09-28 22:05     ` Dan Magenheimer
  2009-09-29  7:43       ` Keir Fraser
  0 siblings, 1 reply; 12+ messages in thread
From: Dan Magenheimer @ 2009-09-28 22:05 UTC (permalink / raw)
  To: Keir Fraser, Xen-Devel (E-mail)

> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Surely it should be sufficient to check TSCs for consistency 
> across all CPUs
> periodically, and against the chosen platform timer, and 
> ensure none are
> drifting? An operation which would not require us to loop for 
> 2ms and would
> provide rather more useful information than an ad-hoc multi-CPU
> race-to-update-a-shared-variable-an-arbitrary-and-large-number
> -of-times.
> 
> I wouldn't take anything like this algorithm.

The algorithm ensures that the skew between any two
processors is sufficiently small so that it is unobservable
by any app (e.g. smaller than "a cache bounce").  I'm
not sure it is possible to "check for consistency across
all CPUs" and get that guarantee any other way... unless
there is some easy way to measure the minimum cost of a cache
bounce.

I'm not sure why Linux chooses to run the test
for 20ms but I think it is because it is only running
it once at boottime so it has to eat up some time to
give the tsc's a chance to skew sufficiently.  If we
are running it more than once (and Xen hasn't written
to the tsc's recently), it's probably sufficient to
run it for far fewer iterations, but given all the
possible CPU race conditions due to caches and pipelining
and such, I'm not sure how many iterations is enough.

Note that upstream Linux NEVER writes to TSC anymore.
If the check_tsc_warp test fails, tsc is simply marked
as an unreliable mechanism other than for interpolating
within a jiffie.  If OS's had some intrinsic to describe
this "reliable vs unreliable TSC" to apps, lots of troubles
could have been avoided.  But that's roughly what I
am trying to do with pvrdtscp so I'm trying to be very
sure that when Xen says it is, TSC is both reliable and
continues to be reliable.  (Though maybe once at boottime 
is sufficient.)

Which points out another alternative:  check_tsc_warp
need only be run if one or more domains have tsc_native
enabled AND have some mechanism (such as pvrdtscp or
a userland hypercall) to ask Xen if the TSC is reliable
or not.

But since this might be minutes/hours/days after Xen
boots, I'd still like to avoid Xen mucking around
using write_tsc in the meantime as it may be "fixing"
something that ain't broke.

> I should add, not only is the algorithm stupid and slow, but 
> it doesn't even
> check for exactly what RELIABLE_TSC guarantees -- 
> constant-rate TSCs. This
> would be useless on a single-CPU system, for example, or perhaps more
> practically a single-socket system where all TSCs skewed 
> together due to
> package-wide power management. In the latter case TSCs would not skew
> relative to each other, even though they could 'skew' 
> relative to wallclock
> (represented in Xen by the platform timer).

It's only checking for TSC skew relative to other
processors in an SMP system.  What's important to
an app is that time (as measured by sampling the
TSC on random processors) never goes backwards.
That IS what RELIABLE_TSC is supposed to guarantee.
I agree that check_tsc_warp doesn't test for skew
relative to a platform timer (though I suspect
they are driven from the same crystal) and need not
be run on a single-CPU system.

Dan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-28 22:05     ` Dan Magenheimer
@ 2009-09-29  7:43       ` Keir Fraser
  2009-09-29 15:51         ` Dan Magenheimer
  0 siblings, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2009-09-29  7:43 UTC (permalink / raw)
  To: Dan Magenheimer, Xen-Devel (E-mail)

On 28/09/2009 23:05, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:

> Note that upstream Linux NEVER writes to TSC anymore.
> If the check_tsc_warp test fails, tsc is simply marked
> as an unreliable mechanism other than for interpolating
> within a jiffie.  If OS's had some intrinsic to describe
> this "reliable vs unreliable TSC" to apps, lots of troubles
> could have been avoided.  But that's roughly what I
> am trying to do with pvrdtscp so I'm trying to be very
> sure that when Xen says it is, TSC is both reliable and
> continues to be reliable.  (Though maybe once at boottime
> is sufficient.)

Given that TSC is now emulated, who cares what the underlying CPUs say about
TSC reliability. Xen emulates the TSC with its own system time, and even
explicitly checks that the returned TSC value is monotonically increasing.
So TSC is always 'reliable' for guests, regardless of host TSC behaviour.
So on this count, too, the patch is a reject.

 -- Keir

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-29  7:43       ` Keir Fraser
@ 2009-09-29 15:51         ` Dan Magenheimer
  2009-09-29 17:17           ` Keir Fraser
  0 siblings, 1 reply; 12+ messages in thread
From: Dan Magenheimer @ 2009-09-29 15:51 UTC (permalink / raw)
  To: Keir Fraser, Xen-Devel (E-mail)

> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> 
> Given that TSC is now emulated, who cares what the underlying 
> CPUs say about
> TSC reliability. Xen emulates the TSC with its own system 
> time, and even
> explicitly checks that the returned TSC value is 
> monotonically increasing.
> So TSC is alweays 'reliable' for guests, regardless of host 
> TSC behaviour.
> So on this count, too, the patch is a reject.

Ouch.  I thought RFC was "request for comment", not
"request for castration" ;-) ;-)

Let me clarify my intent:

I remain very much committed to emulated-tsc
("correctness") as the default for unmodified apps
even if there is a significant loss of performance.
Call this Phase I

BUT I am continuing to work on how an "aware" app
(or an "aware" OS) in a constrained environment can
obtain both correctness AND performance.  Call this
Phase II.  Pvrdtscp and Xiantao's scaling are
different approaches to Phase II, for pvm and
hvm respectively. Vsyscall+pvclock also if a PV
OS can be made aware whether tsc_native is enabled
or not.

This proposed patch is really only important for Phase
II, but given all the confusion around whether tsc is
reliable/safe/constant/nonstop on various machines,
I think it might be good to get code into Xen --
sooner rather than later -- that "measures" this
so we can confirm if the promises made by processor
and system vendors are (or are not) being delivered.

Does that make more sense?

Thanks,
Dan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-29 15:51         ` Dan Magenheimer
@ 2009-09-29 17:17           ` Keir Fraser
  2009-09-29 18:10             ` Dan Magenheimer
  0 siblings, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2009-09-29 17:17 UTC (permalink / raw)
  To: Dan Magenheimer, Xen-Devel (E-mail)

On 29/09/2009 16:51, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:

> This proposed patch is really only important for Phase
> II, but given all the confusion around whether tsc is
> reliable/safe/constant/nonstop on various machines,
> I think it might be good to get code into Xen --
> sooner rather than later -- that "measures" this
> so we can confirm if the promises made by processor
> and system vendors are (or are not) being delivered.
> 
> Does that make more sense?

That makes it a nice-to-have. But not nice enough to have all CPUs groove
off for 2ms every once in a while, imo.

 -- Keir

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-29 17:17           ` Keir Fraser
@ 2009-09-29 18:10             ` Dan Magenheimer
  2009-09-29 18:21               ` Keir Fraser
  0 siblings, 1 reply; 12+ messages in thread
From: Dan Magenheimer @ 2009-09-29 18:10 UTC (permalink / raw)
  To: Keir Fraser, Xen-Devel (E-mail)

> > Does that make more sense?
> 
> That makes it a nice-to-have. But not nice enough to have all 
> CPUs groove
> off for 2ms every once in a while, imo.

OK, rechecking may be too paranoid.  I will rewrite to
run once at boottime and resubmit. (Is 20ms at boottime
OK? 20ms matches Linux.)

> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Monday, September 21, 2009 12:17 PM
>
> Well, if it is at least true for 99% of systems, then it 
> might be worth
> enabling constant_tsc support by default, and detect TSC divergence at
> runtime and disable dynamically. I think that's what Linux 
> does (i.e., it
> has a fallback at runtime if its TSC assumptions turn out to 
> be wrong).

Looking more closely at the upstream Linux code though,
any processor deemed to have X86_FEATURE_TSC_RELIABLE
NEVER runs check_tsc_warp and never validates TSC (except
in the unusual circumstance that the kernel was compiled
with NUMAQ enabled).  If this feature is set,
Linux just uses tsc as the best clocksource and afaict
has no fallback.

I'd feel better running check_tsc_warp (at least) once
on ALL processors, even if X86_FEATURE_TSC_RELIABLE
is set.  Is that OK?  Or should we just trust it under
exactly the same circumstances as Linux does?

Thanks,
Dan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-29 18:10             ` Dan Magenheimer
@ 2009-09-29 18:21               ` Keir Fraser
  2009-09-29 18:45                 ` Dan Magenheimer
  0 siblings, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2009-09-29 18:21 UTC (permalink / raw)
  To: Dan Magenheimer, Xen-Devel (E-mail)

On 29/09/2009 19:10, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:

> I'd feel better running check_tsc_warp (at least) once
> on ALL processors, even if X86_FEATURE_TSC_RELIABLE
> is set.  Is that OK?  Or should we just trust it under
> exactly the same circumstances as Linux does?

Well, what does trusting it imply for us right now? Do you have another
patch in mind, even if we decide against check_tsc_warp?

I mean, I'm not unhappy with our TSC management code as it is. I'm not
greatly excited to mess with it yet again.

 -- Keir

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-29 18:21               ` Keir Fraser
@ 2009-09-29 18:45                 ` Dan Magenheimer
  2009-09-29 19:03                   ` Keir Fraser
  0 siblings, 1 reply; 12+ messages in thread
From: Dan Magenheimer @ 2009-09-29 18:45 UTC (permalink / raw)
  To: Keir Fraser, Xen-Devel (E-mail)

> > I'd feel better running check_tsc_warp (at least) once
> > on ALL processors, even if X86_FEATURE_TSC_RELIABLE
> > is set.  Is that OK?  Or should we just trust it under
> > exactly the same circumstances as Linux does?
> 
> Well, what does trusting it imply for us right now? Do you 
> have another
> patch in mind, even if we decide against check_tsc_warp?
> 
> I mean, I'm not unhappy with our TSC management code as it is. I'm not
> greatly excited to mess with it yet again.

Well, if it is indeed reliable, there's no reason that
Xen should write_tsc, though it might be interesting
to check_tsc_warp on various hardware before and after
Xen does write_tsc (to see if Xen is breaking anything).

But I could certainly code the patch to just "measure
and report" for now and we could revisit Xen's use of
write_tsc at a later time if that's your preference.

I definitely have more patches in mind for pvrdtsc but
may not get back to them for a few weeks, so was
hoping to validate tsc reliability on some machines
in the meantime.

Thanks,
Dan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-29 18:45                 ` Dan Magenheimer
@ 2009-09-29 19:03                   ` Keir Fraser
  2009-09-29 19:34                     ` Dan Magenheimer
  0 siblings, 1 reply; 12+ messages in thread
From: Keir Fraser @ 2009-09-29 19:03 UTC (permalink / raw)
  To: Dan Magenheimer, Xen-Devel (E-mail)

On 29/09/2009 19:45, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:

> But I could certainly code the patch to just "measure
> and report" for now and we could revisit Xen's use of
> write_tsc at a later time if that's your preference.
> 
> I definitely have more patches in mind for pvrdtsc but
> may not get back to them for a few weeks, so was
> hoping to validate tsc reliability on some machines
> in the meantime.

Who's actually going to go looking for this data in their logs and report it
to you? I think we should leave this until plans are more fully formed.

 -- Keir

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [RFC] [PATCH] use "reliable" tsc properly when available, but verify
  2009-09-29 19:03                   ` Keir Fraser
@ 2009-09-29 19:34                     ` Dan Magenheimer
  0 siblings, 0 replies; 12+ messages in thread
From: Dan Magenheimer @ 2009-09-29 19:34 UTC (permalink / raw)
  To: Keir Fraser, Xen-Devel (E-mail)

> Who's actually going to go looking for this data in their 
> logs and report it to you?

I was thinking about posting an appeal on xen-devel
and xen-users asking anyone using a recent xen-unstable
hypervisor to run

# xm debug-key s
# xm dmesg | tail
# cat /proc/cpuinfo | grep "model name"

and post or send the results.

And of course I was going to do the same on any
machines I can get my hands on.

> I think we should leave this until plans are more 
> fully formed.

OK, then I will move forward with the assumption that
what the processor and server vendors say is correct
(that tsc IS really reliable if the cpuid bit says
it is) and ignore Jeremy's cautions and my own
paranoia.

Thanks,
Dan

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2009-09-29 19:34 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-28 20:19 [RFC] [PATCH] use "reliable" tsc properly when available, but verify Dan Magenheimer
2009-09-28 21:01 ` Keir Fraser
2009-09-28 21:06   ` Keir Fraser
2009-09-28 22:05     ` Dan Magenheimer
2009-09-29  7:43       ` Keir Fraser
2009-09-29 15:51         ` Dan Magenheimer
2009-09-29 17:17           ` Keir Fraser
2009-09-29 18:10             ` Dan Magenheimer
2009-09-29 18:21               ` Keir Fraser
2009-09-29 18:45                 ` Dan Magenheimer
2009-09-29 19:03                   ` Keir Fraser
2009-09-29 19:34                     ` Dan Magenheimer

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.