From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Jan Beulich" Subject: [PATCH 3/8] x86/time: introduce and use rdtsc_ordered() Date: Wed, 15 Jun 2016 04:27:23 -0600 Message-ID: <576149AB02000078000F539D@prv-mh.provo.novell.com> References: <576140F302000078000F52FE@prv-mh.provo.novell.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="=__PartE8DED39B.1__=" Return-path: Received: from mail6.bemta5.messagelabs.com ([195.245.231.135]) by lists.xenproject.org with esmtp (Exim 4.84_2) (envelope-from ) id 1bD82i-0007Po-Vq for xen-devel@lists.xenproject.org; Wed, 15 Jun 2016 10:27:29 +0000 In-Reply-To: <576140F302000078000F52FE@prv-mh.provo.novell.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Errors-To: xen-devel-bounces@lists.xen.org Sender: "Xen-devel" To: xen-devel Cc: Andrew Cooper , Dario Faggioli , Joao Martins List-Id: xen-devel@lists.xenproject.org This is a MIME message. If you are reading this text, you may want to consider changing to a mail reader or gateway that understands how to properly handle MIME multipart messages. --=__PartE8DED39B.1__= Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: quoted-printable Content-Disposition: inline Matching Linux commit 03b9730b76 ("x86/asm/tsc: Add rdtsc_ordered() and use it in trivial call sites") and earlier ones it builds upon, let's make sure timing loops don't have their rdtsc()-s re-ordered, as that would harm precision of the result (values were observed to be several hundred clocks off without this adjustment). Signed-off-by: Jan Beulich --- a/xen/arch/x86/apic.c +++ b/xen/arch/x86/apic.c @@ -1137,7 +1137,7 @@ static int __init calibrate_APIC_clock(v /* * We wrapped around just now. Let's start: */ - t1 =3D rdtsc(); + t1 =3D rdtsc_ordered(); tt1 =3D apic_read(APIC_TMCCT); =20 /* @@ -1147,7 +1147,7 @@ static int __init calibrate_APIC_clock(v wait_8254_wraparound(); =20 tt2 =3D apic_read(APIC_TMCCT); - t2 =3D rdtsc(); + t2 =3D rdtsc_ordered(); =20 /* * The APIC bus clock counter is 32 bits only, it --- a/xen/arch/x86/cpu/amd.c +++ b/xen/arch/x86/cpu/amd.c @@ -541,6 +541,9 @@ static void init_amd(struct cpuinfo_x86 wrmsr_amd_safe(0xc001100d, l, h & ~1); } =20 + /* MFENCE stops RDTSC speculation */ + __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); + switch(c->x86) { case 0xf ... 0x17: --- a/xen/arch/x86/delay.c +++ b/xen/arch/x86/delay.c @@ -21,10 +21,10 @@ void __udelay(unsigned long usecs) unsigned long ticks =3D usecs * (cpu_khz / 1000); unsigned long s, e; =20 - s =3D rdtsc(); + s =3D rdtsc_ordered(); do { rep_nop(); - e =3D rdtsc(); + e =3D rdtsc_ordered(); } while ((e-s) < ticks); } --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -123,7 +123,7 @@ static void synchronize_tsc_master(unsig =20 for ( i =3D 1; i <=3D 5; i++ ) { - tsc_value =3D rdtsc(); + tsc_value =3D rdtsc_ordered(); wmb(); atomic_inc(&tsc_count); while ( atomic_read(&tsc_count) !=3D (i<<1) ) --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -257,10 +257,10 @@ static u64 init_pit_and_calibrate_tsc(vo outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */ outb(CALIBRATE_LATCH >> 8, PIT_CH2); /* MSB of count */ =20 - start =3D rdtsc(); + start =3D rdtsc_ordered(); for ( count =3D 0; (inb(0x61) & 0x20) =3D=3D 0; count++ ) continue; - end =3D rdtsc(); + end =3D rdtsc_ordered(); =20 /* Error if the CTC doesn't behave itself. */ if ( count =3D=3D 0 ) @@ -760,7 +760,7 @@ s_time_t get_s_time_fixed(u64 at_tsc) if ( at_tsc ) tsc =3D at_tsc; else - tsc =3D rdtsc(); + tsc =3D rdtsc_ordered(); delta =3D tsc - t->local_tsc_stamp; now =3D t->stime_local_stamp + scale_delta(delta, &t->tsc_scale); =20 @@ -933,7 +933,7 @@ int cpu_frequency_change(u64 freq) /* TSC-extrapolated time may be bogus after frequency change. */ /*t->stime_local_stamp =3D get_s_time();*/ t->stime_local_stamp =3D t->stime_master_stamp; - curr_tsc =3D rdtsc(); + curr_tsc =3D rdtsc_ordered(); t->local_tsc_stamp =3D curr_tsc; set_time_scale(&t->tsc_scale, freq); local_irq_enable(); @@ -1248,7 +1248,7 @@ static void time_calibration_tsc_rendezv if ( r->master_stime =3D=3D 0 ) { r->master_stime =3D read_platform_stime(); - r->master_tsc_stamp =3D rdtsc(); + r->master_tsc_stamp =3D rdtsc_ordered(); } atomic_inc(&r->semaphore); =20 @@ -1274,7 +1274,7 @@ static void time_calibration_tsc_rendezv } } =20 - c->local_tsc_stamp =3D rdtsc(); + c->local_tsc_stamp =3D rdtsc_ordered(); c->stime_local_stamp =3D get_s_time_fixed(c->local_tsc_stamp); c->stime_master_stamp =3D r->master_stime; =20 @@ -1304,7 +1304,7 @@ static void time_calibration_std_rendezv mb(); /* receive signal /then/ read r->master_stime */ } =20 - c->local_tsc_stamp =3D rdtsc(); + c->local_tsc_stamp =3D rdtsc_ordered(); c->stime_local_stamp =3D get_s_time_fixed(c->local_tsc_stamp); c->stime_master_stamp =3D r->master_stime; =20 @@ -1338,7 +1338,7 @@ void time_latch_stamps(void) { =20 local_irq_save(flags); ap_bringup_ref.master_stime =3D read_platform_stime(); - tsc =3D rdtsc(); + tsc =3D rdtsc_ordered(); local_irq_restore(flags); =20 ap_bringup_ref.local_stime =3D get_s_time_fixed(tsc); @@ -1356,7 +1356,7 @@ void init_percpu_time(void) =20 local_irq_save(flags); now =3D read_platform_stime(); - tsc =3D rdtsc(); + tsc =3D rdtsc_ordered(); local_irq_restore(flags); =20 t->stime_master_stamp =3D now; --- a/xen/include/asm-x86/cpufeature.h +++ b/xen/include/asm-x86/cpufeature.h @@ -16,6 +16,7 @@ XEN_CPUFEATURE(XTOPOLOGY, (FSCAPIN XEN_CPUFEATURE(CPUID_FAULTING, (FSCAPINTS+0)*32+ 6) /* cpuid faulting */ XEN_CPUFEATURE(CLFLUSH_MONITOR, (FSCAPINTS+0)*32+ 7) /* clflush reqd with = monitor */ XEN_CPUFEATURE(APERFMPERF, (FSCAPINTS+0)*32+ 8) /* APERFMPERF */ +XEN_CPUFEATURE(MFENCE_RDTSC, (FSCAPINTS+0)*32+ 9) /* MFENCE synchronize= s RDTSC */ =20 #define NCAPINTS (FSCAPINTS + 1) /* N 32-bit words worth of info */ =20 --- a/xen/include/asm-x86/msr.h +++ b/xen/include/asm-x86/msr.h @@ -80,6 +80,22 @@ static inline uint64_t rdtsc(void) return ((uint64_t)high << 32) | low; } =20 +static inline uint64_t rdtsc_ordered(void) +{ + /* + * The RDTSC instruction is not ordered relative to memory access. + * The Intel SDM and the AMD APM are both vague on this point, but + * empirically an RDTSC instruction can be speculatively executed + * before prior loads. An RDTSC immediately after an appropriate + * barrier appears to be ordered as a normal load, that is, it + * provides the same ordering guarantees as reading from a global + * memory location that some other imaginary CPU is updating + * continuously with a time stamp. + */ + alternative("lfence", "mfence", X86_FEATURE_MFENCE_RDTSC); + return rdtsc(); +} + #define __write_tsc(val) wrmsrl(MSR_IA32_TSC, val) #define write_tsc(val) ({ \ /* Reliable TSCs are in lockstep across all CPUs. We should \ --=__PartE8DED39B.1__= Content-Type: text/plain; name="x86-RDTSC-ordered.patch" Content-Transfer-Encoding: quoted-printable Content-Disposition: attachment; filename="x86-RDTSC-ordered.patch" x86/time: introduce and use rdtsc_ordered()=0A=0AMatching Linux commit = 03b9730b76 ("x86/asm/tsc: Add rdtsc_ordered() and=0Ause it in trivial call = sites") and earlier ones it builds upon, let's=0Amake sure timing loops = don't have their rdtsc()-s re-ordered, as that=0Awould harm precision of = the result (values were observed to be several=0Ahundred clocks off = without this adjustment).=0A=0ASigned-off-by: Jan Beulich =0A=0A--- a/xen/arch/x86/apic.c=0A+++ b/xen/arch/x86/apic.c=0A@@ -1137,7 = +1137,7 @@ static int __init calibrate_APIC_clock(v=0A /*=0A * We = wrapped around just now. Let's start:=0A */=0A- t1 =3D rdtsc();=0A+= t1 =3D rdtsc_ordered();=0A tt1 =3D apic_read(APIC_TMCCT);=0A =0A = /*=0A@@ -1147,7 +1147,7 @@ static int __init calibrate_APIC_clock(v=0A = wait_8254_wraparound();=0A =0A tt2 =3D apic_read(APIC_TMCCT);=0A-= t2 =3D rdtsc();=0A+ t2 =3D rdtsc_ordered();=0A =0A /*=0A * = The APIC bus clock counter is 32 bits only, it=0A--- a/xen/arch/x86/cpu/amd= .c=0A+++ b/xen/arch/x86/cpu/amd.c=0A@@ -541,6 +541,9 @@ static void = init_amd(struct cpuinfo_x86=0A wrmsr_amd_safe(0xc001100d, = l, h & ~1);=0A }=0A =0A+ /* MFENCE stops RDTSC speculation */=0A+ = __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);=0A+=0A switch(c->x= 86)=0A {=0A case 0xf ... 0x17:=0A--- a/xen/arch/x86/delay.c=0A+++ = b/xen/arch/x86/delay.c=0A@@ -21,10 +21,10 @@ void __udelay(unsigned long = usecs)=0A unsigned long ticks =3D usecs * (cpu_khz / 1000);=0A = unsigned long s, e;=0A =0A- s =3D rdtsc();=0A+ s =3D rdtsc_ordered();= =0A do=0A {=0A rep_nop();=0A- e =3D rdtsc();=0A+ = e =3D rdtsc_ordered();=0A } while ((e-s) < ticks);=0A }=0A--- = a/xen/arch/x86/smpboot.c=0A+++ b/xen/arch/x86/smpboot.c=0A@@ -123,7 +123,7 = @@ static void synchronize_tsc_master(unsig=0A =0A for ( i =3D 1; i = <=3D 5; i++ )=0A {=0A- tsc_value =3D rdtsc();=0A+ = tsc_value =3D rdtsc_ordered();=0A wmb();=0A atomic_inc(&tsc= _count);=0A while ( atomic_read(&tsc_count) !=3D (i<<1) )=0A--- = a/xen/arch/x86/time.c=0A+++ b/xen/arch/x86/time.c=0A@@ -257,10 +257,10 @@ = static u64 init_pit_and_calibrate_tsc(vo=0A outb(CALIBRATE_LATCH & = 0xff, PIT_CH2); /* LSB of count */=0A outb(CALIBRATE_LATCH >> 8, = PIT_CH2); /* MSB of count */=0A =0A- start =3D rdtsc();=0A+ start = =3D rdtsc_ordered();=0A for ( count =3D 0; (inb(0x61) & 0x20) =3D=3D = 0; count++ )=0A continue;=0A- end =3D rdtsc();=0A+ end =3D = rdtsc_ordered();=0A =0A /* Error if the CTC doesn't behave itself. = */=0A if ( count =3D=3D 0 )=0A@@ -760,7 +760,7 @@ s_time_t get_s_time_f= ixed(u64 at_tsc)=0A if ( at_tsc )=0A tsc =3D at_tsc;=0A = else=0A- tsc =3D rdtsc();=0A+ tsc =3D rdtsc_ordered();=0A = delta =3D tsc - t->local_tsc_stamp;=0A now =3D t->stime_local_stamp + = scale_delta(delta, &t->tsc_scale);=0A =0A@@ -933,7 +933,7 @@ int cpu_freque= ncy_change(u64 freq)=0A /* TSC-extrapolated time may be bogus after = frequency change. */=0A /*t->stime_local_stamp =3D get_s_time();*/=0A = t->stime_local_stamp =3D t->stime_master_stamp;=0A- curr_tsc =3D = rdtsc();=0A+ curr_tsc =3D rdtsc_ordered();=0A t->local_tsc_stamp = =3D curr_tsc;=0A set_time_scale(&t->tsc_scale, freq);=0A local_irq_= enable();=0A@@ -1248,7 +1248,7 @@ static void time_calibration_tsc_rendezv= =0A if ( r->master_stime =3D=3D 0 )=0A {=0A = r->master_stime =3D read_platform_stime();=0A- = r->master_tsc_stamp =3D rdtsc();=0A+ r->master_tsc_stamp = =3D rdtsc_ordered();=0A }=0A atomic_inc(&r->semapho= re);=0A =0A@@ -1274,7 +1274,7 @@ static void time_calibration_tsc_rendezv= =0A }=0A }=0A =0A- c->local_tsc_stamp =3D rdtsc();=0A+ = c->local_tsc_stamp =3D rdtsc_ordered();=0A c->stime_local_stamp =3D = get_s_time_fixed(c->local_tsc_stamp);=0A c->stime_master_stamp =3D = r->master_stime;=0A =0A@@ -1304,7 +1304,7 @@ static void time_calibration_s= td_rendezv=0A mb(); /* receive signal /then/ read r->master_stime = */=0A }=0A =0A- c->local_tsc_stamp =3D rdtsc();=0A+ c->local_tsc_= stamp =3D rdtsc_ordered();=0A c->stime_local_stamp =3D get_s_time_fixed= (c->local_tsc_stamp);=0A c->stime_master_stamp =3D r->master_stime;=0A = =0A@@ -1338,7 +1338,7 @@ void time_latch_stamps(void) {=0A =0A = local_irq_save(flags);=0A ap_bringup_ref.master_stime =3D read_platform= _stime();=0A- tsc =3D rdtsc();=0A+ tsc =3D rdtsc_ordered();=0A = local_irq_restore(flags);=0A =0A ap_bringup_ref.local_stime =3D = get_s_time_fixed(tsc);=0A@@ -1356,7 +1356,7 @@ void init_percpu_time(void)= =0A =0A local_irq_save(flags);=0A now =3D read_platform_stime();=0A= - tsc =3D rdtsc();=0A+ tsc =3D rdtsc_ordered();=0A local_irq_rest= ore(flags);=0A =0A t->stime_master_stamp =3D now;=0A--- a/xen/include/a= sm-x86/cpufeature.h=0A+++ b/xen/include/asm-x86/cpufeature.h=0A@@ -16,6 = +16,7 @@ XEN_CPUFEATURE(XTOPOLOGY, (FSCAPIN=0A XEN_CPUFEATURE(CPUID_F= AULTING, (FSCAPINTS+0)*32+ 6) /* cpuid faulting */=0A XEN_CPUFEATURE(CLFLU= SH_MONITOR, (FSCAPINTS+0)*32+ 7) /* clflush reqd with monitor */=0A = XEN_CPUFEATURE(APERFMPERF, (FSCAPINTS+0)*32+ 8) /* APERFMPERF = */=0A+XEN_CPUFEATURE(MFENCE_RDTSC, (FSCAPINTS+0)*32+ 9) /* MFENCE = synchronizes RDTSC */=0A =0A #define NCAPINTS (FSCAPINTS + 1) /* N 32-bit = words worth of info */=0A =0A--- a/xen/include/asm-x86/msr.h=0A+++ = b/xen/include/asm-x86/msr.h=0A@@ -80,6 +80,22 @@ static inline uint64_t = rdtsc(void)=0A return ((uint64_t)high << 32) | low;=0A }=0A =0A+static = inline uint64_t rdtsc_ordered(void)=0A+{=0A+ /*=0A+ * The RDTSC = instruction is not ordered relative to memory access.=0A+ * The = Intel SDM and the AMD APM are both vague on this point, but=0A+ * = empirically an RDTSC instruction can be speculatively executed=0A+ * = before prior loads. An RDTSC immediately after an appropriate=0A+ * = barrier appears to be ordered as a normal load, that is, it=0A+ * = provides the same ordering guarantees as reading from a global=0A+ * = memory location that some other imaginary CPU is updating=0A+ * = continuously with a time stamp.=0A+ */=0A+ alternative("lfence", = "mfence", X86_FEATURE_MFENCE_RDTSC);=0A+ return rdtsc();=0A+}=0A+=0A= #define __write_tsc(val) wrmsrl(MSR_IA32_TSC, val)=0A #define write_tsc(va= l) ({ \=0A /* Reliable TSCs are = in lockstep across all CPUs. We should \=0A --=__PartE8DED39B.1__= Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: base64 Content-Disposition: inline X19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX18KWGVuLWRldmVs IG1haWxpbmcgbGlzdApYZW4tZGV2ZWxAbGlzdHMueGVuLm9yZwpodHRwOi8vbGlzdHMueGVuLm9y Zy94ZW4tZGV2ZWwK --=__PartE8DED39B.1__=--