[RFC PATCH] tsc: synchronize TSCs on buggy Intel Xeon E5 CPUs with offset error

* [RFC PATCH] tsc: synchronize TSCs on buggy Intel Xeon E5 CPUs with offset error
@ 2015-11-09 19:59 gratian.crisan
  2015-11-09 22:02 ` Peter Zijlstra
  0 siblings, 1 reply; 11+ messages in thread
From: gratian.crisan @ 2015-11-09 19:59 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: linux-kernel, Ingo Molnar, H . Peter Anvin, x86, Borislav Petkov,
	Peter Zijlstra, Josh Cartwright, gratian

From: Gratian Crisan <gratian.crisan@ni.com>

The Intel Xeon E5 processor family suffers from erratum BT81 [1]:
"TSC is Not Affected by Warm Reset.
Problem: The TSC (Time Stamp Counter MSR 10H) should be cleared on reset.
Due to this erratum the TSC is not affected by warm reset.
Implication: The TSC is not cleared by a warm reset. The TSC is cleared by
power-on reset as expected. Intel has not observed any functional failures
due to this erratum.
Workaround: None identified.
Status: There are no plans to fix this erratum."

The observed behavior: after a warm reset the TSC gets reset for CPU0 but
not for any of the other cores i.e. they continue incrementing from the
value they had before the reset. The TSCs are otherwise stable and
always-running so the offset error stays constant.

Add x86 bug flag if an Intel Xeon E5 gets detected and based on that
synchronize the TSC offset by performing the following measurement:

target     source
  t0 ---\
         \-->
             ts
         /--
  t1 <--/

Where:
  * target is the target CPU whose TSC offset we are trying to correct;
  * source is the source CPU used as a reference (i.e. the boot CPU);
  * t0, t1 are TSC time-stamps obtained on the target CPU;
  * ts is the time-stamp acquired on the source CPU.

If the source and target CPU TSCs are synchronized, and the interconnect is
symmetric, then ts falls exactly half-way between t0 and t1. In practice
the measured offset will include the RDTSC instruction latency as well as
the latency introduced by the interconnect. To account for these latencies
we perform the offset measurement in a loop and use the minimum measured
offset for the correction, the idea being that it contains the least amount
of unaccounted-for latency. The minimum measured offset is then used
to adjust the TSC register on the target CPU.

Signed-off-by: Gratian Crisan <gratian.crisan@ni.com>

[1] http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/xeon-e5-family-spec-update.pdf
---
 arch/x86/include/asm/cpufeature.h |   1 +
 arch/x86/kernel/cpu/intel.c       |   9 +++
 arch/x86/kernel/tsc_sync.c        | 124 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 134 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index e4f8010..3fb0b62 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -272,6 +272,7 @@
 #define X86_BUG_FXSAVE_LEAK	X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
 #define X86_BUG_CLFLUSH_MONITOR	X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
 #define X86_BUG_SYSRET_SS_ATTRS	X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#define X86_BUG_TSC_OFFSET	X86_BUG(9) /* CPU has skewed but stable TSCs */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 209ac1e..42732dc 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -296,6 +296,15 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
 #else
 static void intel_workarounds(struct cpuinfo_x86 *c)
 {
+#ifdef CONFIG_X86_TSC
+	/*
+	 * Xeon E5 erratum BT81: TSC is not affected by warm reset.
+	 * The TSC registers for CPUs other than CPU0 are not cleared by a warm
+	 * reset, resulting in a constant offset error; flag it for tsc_sync.c.
+	 */
+	if ((c->x86 == 6) && (c->x86_model == 0x3f))
+		set_cpu_bug(c, X86_BUG_TSC_OFFSET);
+#endif
 }
 #endif
 
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 78083bf..0d0f40c 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -114,6 +114,124 @@ static inline unsigned int loop_timeout(int cpu)
 }
 
 /*
+ * Read the current TSC counter value excluding time-stamps that are zero.
+ * Zero is treated as a special measurement synchronization value in the TSC
+ * offset synchronization code.
+ */
+static inline unsigned long long get_cycles_nz(void)
+{
+	unsigned long long ts;
+again:
+	ts = rdtsc_ordered();
+	if (unlikely(!ts))	/* 0 means "no value yet" in the handshake */
+		goto again;
+	return ts;
+}
+
+static atomic64_t target_t0;
+static atomic64_t target_t1;
+static atomic64_t source_ts;
+/*
+ * Measure the TSC offset for the target CPU being brought up vs. the source
+ * CPU. We are collecting three time-stamps:
+ *
+ * target     source
+ *   t0 ---\
+ *          \-->
+ *              ts
+ *          /--
+ *   t1 <--/
+ *
+ * If the source and target TSCs are synchronized, and the interconnect is
+ * symmetric, then ts falls exactly half-way between t0 and t1. We are returning
+ * any deviation from [t0..t1] mid-point as the offset of the target TSC vs. the
+ * source TSC. The measured offset will contain errors like the latency of RDTSC
+ * instruction and the latency introduced by the interconnect. Multiple
+ * measurements are required to filter out these errors.
+ */
+static s64 target_tsc_offset(void)
+{
+	u64 t0, t1, ts;
+	s64 offset;
+
+	t0 = get_cycles_nz();
+	atomic64_set(&target_t0, t0);	/* non-zero: source may proceed */
+
+	while (!(ts = atomic64_read(&source_ts)))	/* wait for ts */
+		cpu_relax();
+	atomic64_set(&source_ts, 0);	/* re-arm for the next round */
+
+	t1 = get_cycles_nz();
+
+	/* (t0 + t1)/2 - ts, halving first so the sum cannot overflow. */
+	offset = t0/2 + t1/2 - ts;
+	offset += ((t0 & 0x1) & (t1 & 0x1));	/* +1 if both were odd */
+
+	atomic64_set(&target_t1, t1);	/* signal: this round is done */
+
+	return offset;
+}
+
+static void source_tsc_offset(void)
+{
+	while (!atomic64_read(&target_t0))	/* wait for the target's t0 */
+		cpu_relax();
+	atomic64_set(&target_t0, 0);	/* re-arm for the next round */
+
+	atomic64_set(&source_ts, get_cycles_nz());	/* publish ts */
+
+	while (!atomic64_read(&target_t1))	/* wait for the target's t1 */
+		cpu_relax();
+	atomic64_set(&target_t1, 0);	/* re-arm for the next round */
+}
+
+static void adjust_tsc_offset(s64 offset)
+{
+	u64 ts;
+
+	ts = rdtsc_ordered();
+	ts -= offset;	/* positive offset: this TSC is ahead, pull it back */
+	write_tsc((u32)ts, (u32)(ts >> 32));	/* low half, then high half */
+}
+
+/*
+ * Synchronize a target CPU that has a constant offset vs. a source CPU.
+ * Multiple measurements of the TSC offset are performed and the minimum
+ * value is used for adjustment. This is to eliminate as much of the measurement
+ * latency as possible; it will also filter out the errors in the first
+ * iteration caused by the target CPU arriving early.
+ */
+#define NUM_SYNC_ROUNDS 64
+static void sync_tsc_target(void)
+{
+	int i;
+	s64 off, min_off;
+
+	min_off = S64_MAX;
+	for (i = 0; i < NUM_SYNC_ROUNDS; i++) {
+		off = target_tsc_offset();
+		if (i && (abs64(off) < abs64(min_off)))	/* round 0 discarded */
+			min_off = off;	/* keep the smallest magnitude */
+		if (unlikely(!(i & 7)))	/* every 8th round */
+			touch_nmi_watchdog();
+	}
+	adjust_tsc_offset(min_off);
+}
+
+static void sync_tsc_source(void)
+{
+	int i;
+
+	preempt_disable();
+	for (i = 0; i < NUM_SYNC_ROUNDS; i++) {	/* same count as the target */
+		source_tsc_offset();
+		if (unlikely(!(i & 7)))	/* every 8th round */
+			touch_nmi_watchdog();
+	}
+	preempt_enable();
+}
+
+/*
  * Source CPU calls into this - it waits for the freshly booted
  * target CPU to arrive and then starts the measurement:
  */
@@ -121,6 +239,9 @@ void check_tsc_sync_source(int cpu)
 {
 	int cpus = 2;
 
+	if (static_cpu_has_bug(X86_BUG_TSC_OFFSET))
+		sync_tsc_source();
+
 	/*
 	 * No need to check if we already know that the TSC is not
 	 * synchronized or if we have no TSC.
@@ -187,6 +308,9 @@ void check_tsc_sync_target(void)
 {
 	int cpus = 2;
 
+	if (static_cpu_has_bug(X86_BUG_TSC_OFFSET))
+		sync_tsc_target();
+
 	/* Also aborts if there is no TSC. */
 	if (unsynchronized_tsc() || tsc_clocksource_reliable)
 		return;
-- 
2.6.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread