linux-kernel.vger.kernel.org archive mirror
* [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
@ 2018-03-12  9:14 Jason Vas Dias
  2018-03-12 17:41 ` kbuild test robot
  0 siblings, 1 reply; 13+ messages in thread
From: Jason Vas Dias @ 2018-03-12  9:14 UTC (permalink / raw)
  To: x86, LKML, Thomas Gleixner, andi, Peter Zijlstra


  Currently the VDSO does not handle
     clock_gettime( CLOCK_MONOTONIC_RAW, &ts )
  on Intel / AMD - it calls
     vdso_fallback_gettime()
  for this clock, which issues a syscall with an unacceptably high
  latency (minimum measurable time, or time between measurements)
  of 300-700ns on two 2.8-3.9GHz Haswell x86_64 (Family_Model: 06_3C)
  machines under various versions of Linux.

  Sometimes, particularly when correlating elapsed time to performance
  counter values,  code needs to know elapsed time from the perspective
  of the CPU no matter how "hot" / fast or "cold" / slow it might be
  running wrt NTP / PTP ; when code needs this, the latency of
  a syscall is often unacceptably high.

  I reported this as Bug #198961 :
    'https://bugzilla.kernel.org/show_bug.cgi?id=198961'
  and in previous posts with subjects matching 'CLOCK_MONOTONIC_RAW' .
     
  This patch handles CLOCK_MONOTONIC_RAW clock_gettime() in the VDSO ,
  by exporting the raw clock calibration, last cycles, last xtime_nsec,
  and last raw_sec value in the vsyscall_gtod_data during vsyscall_update() .

  Now the new do_monotonic_raw() function in the vDSO has a latency of
  approximately 24ns on average, and the test program:
   tools/testing/selftests/timers/inconsistency-check.c
  succeeds with arguments: '-c 4 -t 120' or any arbitrary -t value.
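
  For illustration, a minimal sketch (not part of the patch) of the
  call this accelerates - an ordinary clock_gettime() invocation,
  which with the patch applied is serviced by do_monotonic_raw() in
  the vDSO instead of the syscall fallback:

	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct timespec ts;

		/* with the patch: handled in the vDSO; without: syscall */
		if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0)
			return 1;
		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}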

  The patch is against Linus' latest 4.16-rc5 tree,
  the current HEAD of:
    git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

  This patch affects only files:

   arch/x86/include/asm/msr.h
   arch/x86/include/asm/vgtod.h
   arch/x86/entry/vdso/vclock_gettime.c
   arch/x86/entry/vsyscall/vsyscall_gtod.c
   
  This is the second patch in the series,
  which adds use of the rdtscp instruction.

  Best Regards,
     Jason Vas Dias  .
     
---
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c
--- linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc5-p1	2018-03-12 08:12:17.110120433 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c	2018-03-12 08:59:21.135475862 +0000
@@ -187,7 +187,7 @@ notrace static u64 vread_tsc_raw(void)
 	u64 tsc
 	  , last = gtod->raw_cycle_last;
 
-	tsc	      = rdtsc_ordered();
+	tsc = gtod->has_rdtscp ? rdtscp((void*)0UL) : rdtsc_ordered();
 	if (likely(tsc >= last))
 		return tsc;
 	asm volatile ("");
diff -up linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c
--- linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc5-p1	2018-03-12 07:58:07.974214168 +0000
+++ linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c	2018-03-12 08:54:07.490267640 +0000
@@ -16,6 +16,7 @@
 #include <linux/timekeeper_internal.h>
 #include <asm/vgtod.h>
 #include <asm/vvar.h>
+#include <cpufeatures.h>
 
 int vclocks_used __read_mostly;
 
@@ -49,6 +50,7 @@ void update_vsyscall(struct timekeeper *
 	vdata->raw_mask		= tk->tkr_raw.mask;
 	vdata->raw_mult		= tk->tkr_raw.mult;
 	vdata->raw_shift	= tk->tkr_raw.shift;
+	vdata->has_rdtscp       = static_cpu_has(X86_FEATURE_RDTSCP);
 
 	vdata->wall_time_sec		= tk->xtime_sec;
 	vdata->wall_time_snsec		= tk->tkr_mono.xtime_nsec;
diff -up linux-4.16-rc5/arch/x86/include/asm/msr.h.4.16-rc5-p1 linux-4.16-rc5/arch/x86/include/asm/msr.h
--- linux-4.16-rc5/arch/x86/include/asm/msr.h.4.16-rc5-p1	2018-03-12 00:25:09.000000000 +0000
+++ linux-4.16-rc5/arch/x86/include/asm/msr.h	2018-03-12 09:06:03.902728749 +0000
@@ -218,6 +218,36 @@ static __always_inline unsigned long lon
 	return rdtsc();
 }
 
+/**
+ * rdtscp() - read the current TSC and (optionally) CPU number, with built-in
+ *            serialization replacing the barrier - only available
+ *            if static_cpu_has(X86_FEATURE_RDTSCP) .
+ * returns:   The 64-bit Time Stamp Counter (TSC) value.
+ * Optionally, 'cpu_out' can be non-null, and on return it will contain
+ * the number (Intel CPU ID) of the CPU that the task is currently running on.
+ * As does EAX_EDX_RET, this uses the "open-coded asm" style to
+ * force the compiler + assembler to always use (eax, edx, ecx) registers,
+ * NOT whole (rax, rdx, rcx) on x86_64 , because only 32-bit 
+ * variables are used - exactly the same code should be generated
+ * for this instruction on 32-bit as on 64-bit when this asm stanza is used.
+ * See: SDM , Vol #2, RDTSCP instruction.
+ */
+static __always_inline u64 rdtscp(u32 *cpu_out)
+{
+	u32	tsc_lo, tsc_hi, tsc_cpu;
+	asm volatile
+		( "rdtscp"
+			:   "=a" (tsc_lo)
+			  , "=d" (tsc_hi)
+			  , "=c" (tsc_cpu)
+		);
+	if ( unlikely(cpu_out != ((void*)0)) )
+		*cpu_out = tsc_cpu;
+	return ((((u64)tsc_hi) << 32) |
+		(((u64)tsc_lo) & 0x0ffffffffULL )
+	       );
+}
+
 /* Deprecated, keep it for a cycle for easier merging: */
 #define rdtscll(now)	do { (now) = rdtsc_ordered(); } while (0)
 
diff -up linux-4.16-rc5/arch/x86/include/asm/vgtod.h.4.16-rc5-p1 linux-4.16-rc5/arch/x86/include/asm/vgtod.h
--- linux-4.16-rc5/arch/x86/include/asm/vgtod.h.4.16-rc5-p1	2018-03-12 07:44:17.910539760 +0000
+++ linux-4.16-rc5/arch/x86/include/asm/vgtod.h	2018-03-12 08:51:48.204845624 +0000
@@ -26,6 +26,7 @@ struct vsyscall_gtod_data {
 	u64	raw_mask;
 	u32	raw_mult;
 	u32	raw_shift;
+	u32     has_rdtscp;
 
 	/* open coded 'struct timespec' */
 	u64		wall_time_snsec;
---
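
(For comparison - not part of the patch - user-space code can issue the
same instruction through the compiler intrinsic in <x86intrin.h>; a
sketch assuming an RDTSCP-capable CPU:)

	#include <stdint.h>
	#include <x86intrin.h>

	static inline uint64_t user_rdtscp(uint32_t *cpu_out)
	{
		unsigned int aux;
		uint64_t tsc = __rdtscp(&aux);	/* aux <- IA32_TSC_AUX */

		if (cpu_out)
			*cpu_out = aux;	/* kernel encodes CPU (and node) here */
		return tsc;
	}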


* Re: [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
  2018-03-12  9:14 [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW Jason Vas Dias
@ 2018-03-12 17:41 ` kbuild test robot
  0 siblings, 0 replies; 13+ messages in thread
From: kbuild test robot @ 2018-03-12 17:41 UTC (permalink / raw)
  To: Jason Vas Dias
  Cc: kbuild-all, x86, LKML, Thomas Gleixner, andi, Peter Zijlstra

[-- Attachment #1: Type: text/plain, Size: 894 bytes --]

Hi Jason,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on v4.16-rc4]

url:    https://github.com/0day-ci/linux/commits/Jason-Vas-Dias/x86-vdso-on-Intel-VDSO-should-handle-CLOCK_MONOTONIC_RAW/20180313-011110
config: i386-tinyconfig (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

>> arch/x86/entry/vsyscall/vsyscall_gtod.c:19:10: fatal error: cpufeatures.h: No such file or directory
    #include <cpufeatures.h>
             ^~~~~~~~~~~~~~~
   compilation terminated.

vim +19 arch/x86/entry/vsyscall/vsyscall_gtod.c

  > 19	#include <cpufeatures.h>
    20	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 6733 bytes --]


* Re: [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
  2018-03-14 12:55       ` Jason Vas Dias
                           ` (2 preceding siblings ...)
  2018-03-14 13:16         ` Peter Zijlstra
@ 2018-03-14 13:29         ` Peter Zijlstra
  3 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2018-03-14 13:29 UTC (permalink / raw)
  To: Jason Vas Dias; +Cc: x86, LKML, Thomas Gleixner, andi

On Wed, Mar 14, 2018 at 12:55:20PM +0000, Jason Vas Dias wrote:
> > You could read the time using the group_fd's mmap() page. That actually
> > includes the TSC mult,shift,offset as used by perf clocks.
> >
> 
> Yes, but as mentioned earlier, that presupposes I want to use the mmap()
> sample method - I don't - I want to use the Group FD method, so
> that I can be sure the measurements are for the same code sequence
> over the same period of time.

You can use both, you can use the data from the mmap page to convert the
times obtained from the read() syscall back to raw TSC ticks for all I
care (in fact, that's what some people do).

Then your userspace can use raw RDTSC instructions and not worry about
scaling anything.
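
For example, something along these lines, using the time_shift /
time_mult / time_offset fields of struct perf_event_mmap_page - a
sketch following the self-monitoring algorithm documented in
include/uapi/linux/perf_event.h; it assumes the kernel has set
cap_user_time, and a real reader must also retry on the pc->lock
seqcount:

	#include <stdint.h>
	#include <linux/perf_event.h>

	/* convert a raw TSC value to perf-clock nanoseconds using the
	 * self-monitoring data page mmap()ed from the event fd */
	static uint64_t tsc_to_perf_time(const volatile struct perf_event_mmap_page *pc,
					 uint64_t cyc)
	{
		uint64_t quot = cyc >> pc->time_shift;
		uint64_t rem  = cyc & (((uint64_t)1 << pc->time_shift) - 1);

		return pc->time_offset + quot * pc->time_mult +
		       ((rem * pc->time_mult) >> pc->time_shift);
	}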


* Re: [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
  2018-03-14 12:55       ` Jason Vas Dias
  2018-03-14 13:11         ` Peter Zijlstra
  2018-03-14 13:12         ` Peter Zijlstra
@ 2018-03-14 13:16         ` Peter Zijlstra
  2018-03-14 13:29         ` Peter Zijlstra
  3 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2018-03-14 13:16 UTC (permalink / raw)
  To: Jason Vas Dias; +Cc: x86, LKML, Thomas Gleixner, andi

On Wed, Mar 14, 2018 at 12:55:20PM +0000, Jason Vas Dias wrote:
> > If you want to correlate to CLOCK_MONOTONIC_RAW you have to read
> > CLOCK_MONOTONIC_RAW and not some random other clock value.
> >
> 
> Exactly ! Hence the need for the patch so that users can get
> CLOCK_MONOTONIC_RAW values with low latency and correlate them
> with PERF CPU_CLOCK values.

No, you _CANNOT_ correlate CLOCK_MONOTONIC_RAW with CPU_CLOCK, that is
_BROKEN_.

Yes it 'works', but that's mostly a happy accident. There is no
guarantee that CLOCK_MONOTONIC_RAW runs off the TSC, and even if both
CPU_CLOCK and CLOCK_MONOTONIC_RAW use the TSC, they need not use the
same rate (and they didn't for a long time).

Do not mix clocks.


* Re: [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
  2018-03-14 12:55       ` Jason Vas Dias
  2018-03-14 13:11         ` Peter Zijlstra
@ 2018-03-14 13:12         ` Peter Zijlstra
  2018-03-14 13:16         ` Peter Zijlstra
  2018-03-14 13:29         ` Peter Zijlstra
  3 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2018-03-14 13:12 UTC (permalink / raw)
  To: Jason Vas Dias; +Cc: x86, LKML, Thomas Gleixner, andi

On Wed, Mar 14, 2018 at 12:55:20PM +0000, Jason Vas Dias wrote:
> > While CPU_CLOCK is TSC based, there is no guarantee it has any
> > correlation to CLOCK_MONOTONIC_RAW (even if that is also TSC based).
> >
> > (although, I think I might have fixed that recently and it might just
> > work, but it's very much not guaranteed).
> 
> Yes, I believe the CPU_CLOCK is effectively the converted TSC -
> it does appear to correlate well with the new CLOCK_MONOTONIC_RAW
> values from the patched VDSO.

It (now) runs at the same rate, but there is no guarantee for this, in
fact it didn't for a very long time.

Relying on this is broken.


* Re: [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
  2018-03-14 12:55       ` Jason Vas Dias
@ 2018-03-14 13:11         ` Peter Zijlstra
  2018-03-14 13:12         ` Peter Zijlstra
                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2018-03-14 13:11 UTC (permalink / raw)
  To: Jason Vas Dias; +Cc: x86, LKML, Thomas Gleixner, andi

On Wed, Mar 14, 2018 at 12:55:20PM +0000, Jason Vas Dias wrote:

> > So you can avoid the whole ioctl(ENABLE), ioctl(DISABLE) nonsense and
> > just let them run and do:
> >
> > 	read(group_fd, &buf_pre, size);
> > 	/* your code section */
> > 	read(group_fd, &buf_post, size);
> >
> > 	/* compute buf_post - buf_pre */
> >
> > Which is only 2 system calls, not 4.
> 
> But I can't, really - I am trying to restrict the
> performance counter measurements
> to only a subset of the code, and exclude
> performance measurement result processing  -
> so the timeline is like:

>                       struct timespec t_start, t_end;
>                       perf_event_open(...);
>                       thread_main_loop() { ... do {
>           t     _    clock_gettime(CLOCK_MONOTONIC_RAW, &t_start);
>           t+x _   enable_perf  ();
>                       total_work = do_some_work();
>                       disable_perf ();
>                       clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
>            t+y_
>                       read_perf_counters_and_store_results
>                        ( perf_grp_fd, &results ,  total_work,
>                          TS2T( &t_end ) - TS2T( &t_start)
>                         );
>            } while (.... );
>         }
> 
>    Now. here the bandwidth / performance results recorded by
>    my 'read_perf_counters_and_store_results' method
>    is very sensitive to the measurement of the OUTER
>    elapsed time .

I still don't see why you have to do that enable_perf() / disable_perf()
stuff. What goes wrong if you just let them run and do 2
read_perf*() things?


* Re: [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
  2018-03-14  9:45     ` Peter Zijlstra
@ 2018-03-14 12:55       ` Jason Vas Dias
  2018-03-14 13:11         ` Peter Zijlstra
                           ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Jason Vas Dias @ 2018-03-14 12:55 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: x86, LKML, Thomas Gleixner, andi

[-- Attachment #1: Type: text/plain, Size: 7670 bytes --]

Thanks for the helpful comments, Peter -
re:
On 14/03/2018, Peter Zijlstra <peterz@infradead.org> wrote:
>
>> Yes, I am sampling perf counters,
>
> You're not in fact sampling, you're just reading the counters.

Correct, using Linux-ese terminology - but "sampling" in looser English.


>> Reading performance counters does involve  2 ioctls and a read() ,
>
> So you can avoid the whole ioctl(ENABLE), ioctl(DISABLE) nonsense and
> just let them run and do:
>
> 	read(group_fd, &buf_pre, size);
> 	/* your code section */
> 	read(group_fd, &buf_post, size);
>
> 	/* compute buf_post - buf_pre */
>
> Which is only 2 system calls, not 4.

But I can't, really - I am trying to restrict the
performance counter measurements
to only a subset of the code, and exclude
performance measurement result processing  -
so the timeline is like:
                      struct timespec t_start, t_end;
                      perf_event_open(...);
                      thread_main_loop() { ... do {
          t     _    clock_gettime(CLOCK_MONOTONIC_RAW, &t_start);
          t+x _   enable_perf  ();
                      total_work = do_some_work();
                      disable_perf ();
                      clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
           t+y_
                      read_perf_counters_and_store_results
                       ( perf_grp_fd, &results ,  total_work,
                         TS2T( &t_end ) - TS2T( &t_start)
                        );
           } while (.... );
        }

   Now. here the bandwidth / performance results recorded by
   my 'read_perf_counters_and_store_results' method
   is very sensitive to the measurement of the OUTER
   elapsed time .

>
> Also, a while back there was the proposal to extend the mmap()
> self-monitoring interface to groups, see:
>
> https://lkml.kernel.org/r/20170530172555.5ya3ilfw3sowokjz@hirez.programming.kicks-ass.net
>
> I never did get around to writing the actual code for it, but it
> shouldn't be too hard.
>

Great, I'm looking forward to trying it - but meanwhile,
to get NON-MULTIPLEXED measurements for the SAME CODE SEQUENCE
over the SAME TIME I believe the group FD method is what is implemented
and what works.


>> The CPU_CLOCK software counter should give the converted TSC cycles
>> seen between the ioctl( grp_fd, PERF_EVENT_IOC_ENABLE , ...)
>> and the  ioctl( grp_fd, PERF_EVENT_IOC_DISABLE ), and the
>> difference between the event->time_running and time_enabled
>> should also measure elapsed time .
>
> While CPU_CLOCK is TSC based, there is no guarantee it has any
> correlation to CLOCK_MONOTONIC_RAW (even if that is also TSC based).
>
> (although, I think I might have fixed that recently and it might just
> work, but it's very much not guaranteed).

Yes, I believe the CPU_CLOCK is effectively the converted TSC -
it does appear to correlate well with the new CLOCK_MONOTONIC_RAW
values from the patched VDSO.

> If you want to correlate to CLOCK_MONOTONIC_RAW you have to read
> CLOCK_MONOTONIC_RAW and not some random other clock value.
>

Exactly ! Hence the need for the patch so that users can get
CLOCK_MONOTONIC_RAW values with low latency and correlate them
with PERF CPU_CLOCK values.

>> This gives the "inner" elapsed time, from the perspective of the kernel,
>> while the measured code section had the counters enabled.
>>
>> But unless the user-space program  also has a way of measuring elapsed
>> time from the CPU's perspective , ie. without being subject to
>> operator or NTP / PTP adjustment, it has no way of correlating this
>> inner elapsed time with any "outer"
>
> You could read the time using the group_fd's mmap() page. That actually
> includes the TSC mult,shift,offset as used by perf clocks.
>

Yes, but as mentioned earlier, that presupposes I want to use the mmap()
sample method - I don't - I want to use the Group FD method, so
that I can be sure the measurements are for the same code sequence
over the same period of time.

>> Currently, users must parse the log file or use gdb / objdump to
>> inspect /proc/kcore to get the TSC calibration and exact
>> mult+shift values for the TSC value conversion.
>
> Which ;-) there's multiple floating around..
>

Yes, but why must Linux make it so difficult ?
I think it has to be recognized that the vDSO or the user-space
program itself are the only places in which clock values can be
generated for user-space programs with sufficiently low latency to be useful.
So why does Linux not export the TSC calibration, which is so complex
to compute, when such calibration information is available nowhere else ?


>> Intel does not publish, nor does the CPU provide in ROM or firmware,
>> the actual precise TSC frequency - this must be calibrated against the
>> other clocks , according to a complicated procedure in section 18.2 of
>> the SDM . My TSC has a "rated" / nominal TSC frequency , which one
>> can compute from CPUID leaves, of 2.3ghz, but the "Refined TSC frequency"
>> is 2.8333ghz .
>
> You might want to look at commit:
>
>   b51120309348 ("x86/tsc: Fix erroneous TSC rate on Skylake Xeon")
>
> There is no such thing as a precise TSC frequency, there's a reason we
> have NTP/PTP.
>

By my reading of the SDM, yes, the TSC frequency IS quite precise and
fixed ,  especially on modern CPUs with Always Running Timer (ART)
hardware,  but its precise frequency is not known in advance and
must be calibrated - but once known, it is quite precise & fixed at
that value.

I want to count the number of ticks the CPU thinks has elapsed,
regardless of how fast or slow it is currently running , NOT the
NTP / PTP time.


ntpd itself is running very happily and is reporting MUCH lower
offsets and drift with the VDSO CLOCK_MONOTONIC_RAW
patch applied.

>> Hence I think Linux should export this calibrated frequency somehow ;
>> its "calibration" is expressed as the raw clocksource 'mult' and 'shift'
>> values, and is exported to the VDSO .
>>
>> I think the VDSO should read the TSC and use the calibration
>> to render the raw, unadjusted time from the CPU's perspective.
>>
>> Hence, the patch I am preparing , which is again attached.
>
> I have no objection to adding CLOCK_MONOTONIC_RAW support to the VDSO,
> but you seem to be rather confused on how things work.
>
> Now, if you wanted to actually have CLOCK_MONOTONIC_RAW times from perf
> you'd need something like the below patch.
>
> You'd need to create your events with:
>
> 	attr.use_clockid = 1;
> 	attr.clockid = CLOCK_MONOTONIC_RAW;
> 	attr.read_format |= PERF_FORMAT_TIME;
>

Yes, I am doing this, but I want to also be able to read
CLOCK_MONOTONIC_RAW using clock_gettime()
with low latency. I do not think this is an unreasonable desire
or an expectation not encouraged by the documentation.

> But whatever you do, you really have to stop mixing clocks, that's
> broken, even if it magically works for now.
>

That is precisely what I am trying to avoid.

When comparing the CPU-perspective CPU_CLOCK values
with any other time values, I want to make sure a low-latency
non NTP / PTP adjusted value is  used.

I hope someone other than me can see the sense in above.

This is the latency I measure now with the timer_latency program
under 4.15.9 with the patch applied:
$ ./timer_latency
sum: 2313
Total time: 0.000002313S - Average Latency: 0.000000023S
$ ./timer_latency -m
sum: 5556
Total time: 0.000005556S - Average Latency: 0.000000055S


whereas before the patch was applied I measured latencies
of 300-1000ns for the CLOCK_MONOTONIC_RAW clock source.

This in itself to me makes the patch worth applying. I hope others
can see the sense in that.

Thanks & Best Regards,
Jason

[-- Attachment #2: timer_latency.c --]
[-- Type: text/x-csrc, Size: 2568 bytes --]

/* 
 * Program to measure high-res timer latency.
 *
 */
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <unistd.h>
#include <time.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>

#ifndef N_SAMPLES
#define N_SAMPLES 100
#endif
#define _STR(_S_) #_S_
#define STR(_S_) _STR(_S_)

int main(int argc, char *const* argv, char *const* envp)
{ clockid_t clk = CLOCK_MONOTONIC_RAW;
  bool do_dump = false;
  int argn=1;
  for(; argn < argc; argn+=1)
    if( argv[argn] != NULL )
      if( *(argv[argn]) == '-')
	switch( *(argv[argn]+1) )
	{ case 'm':
	  case 'M':
	    clk = CLOCK_MONOTONIC;
	    break;
	  case 'd':
	  case 'D':
	    do_dump = true;
	    break;
	case '?':
	case 'h':
	case 'u':
	case 'U':
	case 'H':
	  fprintf(stderr,"Usage: timer_latency [ -m : use CLOCK_MONOTONIC clock (not CLOCK_MONOTONIC_RAW) ;  -d : dump timespec contents. N_SAMPLES: " STR(N_SAMPLES) "\n\t"
	          "Calculates average timer latency (minimum time that can be measured) over N_SAMPLES.\n"
	  );
	  return 0;
	}
  struct timespec sample[N_SAMPLES+1];
  unsigned int cnt=N_SAMPLES, s=0 ;
  do
  { if( 0 != clock_gettime(clk, &sample[s++]) )
    { fprintf(stderr,"oops, clock_gettime() failed: %d: '%s'.\n", errno, strerror(errno));
      return 1;
    }
  }while( --cnt );
  clock_gettime(clk, &sample[s]);
#define TS2NS(_TS_) ((((unsigned long long)(_TS_).tv_sec)*1000000000ULL) + (((unsigned long long)((_TS_).tv_nsec)))) 
  unsigned long long
    deltas [ N_SAMPLES ]
  , t1, t2, sum=0, zd=0
  , t_start = TS2NS(sample[0]);
  for(s=1; s < (N_SAMPLES+1); s+=1)
  { t1 = TS2NS(sample[s-1]);
    t2 = TS2NS(sample[s]);
    if ( t1 > t2 )
    { fprintf(stderr,"Inconsistency: %llu %llu %lu.%lu %lu.%lu\n", t1 , t2
            , sample[s-1].tv_sec, sample[s-1].tv_nsec
            , sample[s].tv_sec,   sample[s].tv_nsec
      );
      continue;
    }
    unsigned long long d =t2-t1;
    if ( d == 0 )
    { if( zd == 0 )
	fprintf(stderr, "0 delta!\n");
      zd += 1;
    }
    deltas[s-1] = d;
    if(do_dump)
      fprintf(stderr, "%lu %lu %llu\n",
              sample[s].tv_sec, sample[s].tv_nsec, d
             );
  }
  if( zd > 0 )
    fprintf(stderr,"%u 0 deltas\n");
  for(s = 0; s < N_SAMPLES; s+=1)
    sum += deltas[s];
  fprintf(stderr,"sum: %llu\n",sum);
  unsigned long long avg_ns = sum / N_SAMPLES;
  t1=(t2 - t_start);
  printf("Total time: %1.1llu.%9.9lluS - Average Latency: %1.1llu.%9.9lluS\n",
          t1 / 1000000000,       t1 % 1000000000,
          avg_ns / 1000000000,   avg_ns % 1000000000
        );
  return 0;
}



* Re: [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
  2018-03-13 23:45   ` Jason Vas Dias
@ 2018-03-14  9:45     ` Peter Zijlstra
  2018-03-14 12:55       ` Jason Vas Dias
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2018-03-14  9:45 UTC (permalink / raw)
  To: Jason Vas Dias; +Cc: x86, LKML, Thomas Gleixner, andi

On Tue, Mar 13, 2018 at 11:45:45PM +0000, Jason Vas Dias wrote:
> On 12/03/2018, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Mon, Mar 12, 2018 at 07:01:20AM +0000, Jason Vas Dias wrote:
> >>   Sometimes, particularly when correlating elapsed time to performance
> >>   counter values,
> >
> > So what actual problem are you trying to solve here? Perf can already
> > give you sample time in various clocks, including MONOTONIC_RAW.
> >
> >
> 
> Yes, I am sampling perf counters,

You're not in fact sampling, you're just reading the counters.

> including CPU_CYCLES , INSTRUCTIONS,
> CPU_CLOCK, TASK_CLOCK, etc, in a Group FD I open with
> perf_event_open() , for the current thread on the current CPU -
> I am doing this for 4 threads , on Intel & ARM cpus.
> 
> Reading performance counters does involve  2 ioctls and a read() ,
> which takes time that  already far exceeds the time required to read
> the TSC or CNTPCT in the VDSO .

So you can avoid the whole ioctl(ENABLE), ioctl(DISABLE) nonsense and
just let them run and do:

	read(group_fd, &buf_pre, size);
	/* your code section */
	read(group_fd, &buf_post, size);

	/* compute buf_post - buf_pre */

Which is only 2 system calls, not 4.
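
A sketch of that pattern with a group fd and PERF_FORMAT_GROUP
(error handling abbreviated; the counters count from creation since
attr.disabled is left 0):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	/* PERF_FORMAT_GROUP read() layout: u64 nr, then one u64 per member */
	struct group_buf { uint64_t nr; uint64_t values[2]; };

	static int open_counter(uint64_t config, int group_fd)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = config;
		attr.read_format = PERF_FORMAT_GROUP;
		return (int)syscall(__NR_perf_event_open, &attr,
				    0 /* this task */, -1 /* any cpu */,
				    group_fd, 0);
	}

	int main(void)
	{
		struct group_buf pre, post;
		int lead = open_counter(PERF_COUNT_HW_CPU_CYCLES, -1);

		if (lead < 0 ||
		    open_counter(PERF_COUNT_HW_INSTRUCTIONS, lead) < 0)
			return 1;

		read(lead, &pre, sizeof(pre));
		/* ... code section under measurement ... */
		read(lead, &post, sizeof(post));

		printf("cycles: %llu instructions: %llu\n",
		       (unsigned long long)(post.values[0] - pre.values[0]),
		       (unsigned long long)(post.values[1] - pre.values[1]));
		return 0;
	}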

Also, a while back there was the proposal to extend the mmap()
self-monitoring interface to groups, see:

 https://lkml.kernel.org/r/20170530172555.5ya3ilfw3sowokjz@hirez.programming.kicks-ass.net

I never did get around to writing the actual code for it, but it
shouldn't be too hard.

> The CPU_CLOCK software counter should give the converted TSC cycles
> seen between the ioctl( grp_fd, PERF_EVENT_IOC_ENABLE , ...)
> and the  ioctl( grp_fd, PERF_EVENT_IOC_DISABLE ), and the
> difference between the event->time_running and time_enabled
> should also measure elapsed time .

While CPU_CLOCK is TSC based, there is no guarantee it has any
correlation to CLOCK_MONOTONIC_RAW (even if that is also TSC based).

(although, I think I might have fixed that recently and it might just
work, but it's very much not guaranteed).

If you want to correlate to CLOCK_MONOTONIC_RAW you have to read
CLOCK_MONOTONIC_RAW and not some random other clock value.

> This gives the "inner" elapsed time, from the perspective of the kernel,
> while the measured code section had the counters enabled.
> 
> But unless the user-space program  also has a way of measuring elapsed
> time from the CPU's perspective , ie. without being subject to
> operator or NTP / PTP adjustment, it has no way of correlating this
> inner elapsed time with any "outer"

You could read the time using the group_fd's mmap() page. That actually
includes the TSC mult,shift,offset as used by perf clocks.

> Currently, users must parse the log file or use gdb / objdump to
> inspect /proc/kcore to get the TSC calibration and exact
> mult+shift values for the TSC value conversion.

Which ;-) there's multiple floating around..

> Intel does not publish, nor does the CPU provide in ROM or firmware,
> the actual precise TSC frequency - this must be calibrated against the
> other clocks , according to a complicated procedure in section 18.2 of
> the SDM . My TSC has a "rated" / nominal TSC frequency , which one
> can compute from CPUID leaves, of 2.3ghz, but the "Refined TSC frequency"
> is 2.8333ghz .

You might want to look at commit:

  b51120309348 ("x86/tsc: Fix erroneous TSC rate on Skylake Xeon")

There is no such thing as a precise TSC frequency, there's a reason we
have NTP/PTP.

> Hence I think Linux should export this calibrated frequency somehow ;
> its "calibration" is expressed as the raw clocksource 'mult' and 'shift'
> values, and is exported to the VDSO .
> 
> I think the VDSO should read the TSC and use the calibration
> to render the raw, unadjusted time from the CPU's perspective.
> 
> Hence, the patch I am preparing , which is again attached.

I have no objection to adding CLOCK_MONOTONIC_RAW support to the VDSO,
but you seem to be rather confused on how things work.

Now, if you wanted to actually have CLOCK_MONOTONIC_RAW times from perf
you'd need something like the below patch.

You'd need to create your events with:

	attr.use_clockid = 1;
	attr.clockid = CLOCK_MONOTONIC_RAW;
	attr.read_format |= PERF_FORMAT_TIME;

But whatever you do, you really have to stop mixing clocks, that's
broken, even if it magically works for now.

---
 include/uapi/linux/perf_event.h |  5 ++++-
 kernel/events/core.c            | 23 ++++++++++++++++++++---
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 912b85b52344..e210c9a97f2b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -271,9 +271,11 @@ enum {
  *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
  *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
  *	  { u64		id;           } && PERF_FORMAT_ID
+ *	  { u64		time;         } && PERF_FORMAT_TIME
  *	} && !PERF_FORMAT_GROUP
  *
  *	{ u64		nr;
+ *	  { u64         time;         } && PERF_FORMAT_TIME
  *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
  *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
  *	  { u64		value;
@@ -287,8 +289,9 @@ enum perf_event_read_format {
 	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
 	PERF_FORMAT_ID				= 1U << 2,
 	PERF_FORMAT_GROUP			= 1U << 3,
+	PERF_FORMAT_TIME			= 1U << 4,
 
-	PERF_FORMAT_MAX = 1U << 4,		/* non-ABI */
+	PERF_FORMAT_MAX = 1U << 5,		/* non-ABI */
 };
 
 #define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c87decf03757..4298b4a39bc0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1707,6 +1707,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
 		size += sizeof(u64);
 	}
 
+	if (event->attr.read_format & PERF_FORMAT_TIME)
+		size += sizeof(u64);
+
 	size += entry * nr;
 	event->read_size = size;
 }
@@ -4685,6 +4688,9 @@ static int __perf_read_group_add(struct perf_event *leader,
 	int n = 1; /* skip @nr */
 	int ret;
 
+	if (read_format & PERF_FORMAT_TIME)
+		n++; /* skip @time */
+
 	ret = perf_event_read(leader, true);
 	if (ret)
 		return ret;
@@ -4739,6 +4745,9 @@ static int perf_read_group(struct perf_event *event,
 
 	values[0] = 1 + leader->nr_siblings;
 
+	if (read_format & PERF_FORMAT_TIME)
+		values[1] = perf_event_clock(event);
+
 	/*
 	 * By locking the child_mutex of the leader we effectively
 	 * lock the child list of all siblings.. XXX explain how.
@@ -4773,7 +4782,7 @@ static int perf_read_one(struct perf_event *event,
 				 u64 read_format, char __user *buf)
 {
 	u64 enabled, running;
-	u64 values[4];
+	u64 values[5];
 	int n = 0;
 
 	values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -4783,6 +4792,8 @@ static int perf_read_one(struct perf_event *event,
 		values[n++] = running;
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(event);
+	if (read_format & PERF_FORMAT_TIME)
+		values[n++] = perf_event_clock(event);
 
 	if (copy_to_user(buf, values, n * sizeof(u64)))
 		return -EFAULT;
@@ -6034,7 +6045,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 				 u64 enabled, u64 running)
 {
 	u64 read_format = event->attr.read_format;
-	u64 values[4];
+	u64 values[5];
 	int n = 0;
 
 	values[n++] = perf_event_count(event);
@@ -6049,6 +6060,9 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(event);
 
+	if (read_format & PERF_FORMAT_TIME)
+		values[n++] = perf_event_clock(event);
+
 	__output_copy(handle, values, n * sizeof(u64));
 }
 
@@ -6058,11 +6072,14 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 {
 	struct perf_event *leader = event->group_leader, *sub;
 	u64 read_format = event->attr.read_format;
-	u64 values[5];
+	u64 values[6];
 	int n = 0;
 
 	values[n++] = 1 + leader->nr_siblings;
 
+	if (read_format & PERF_FORMAT_TIME)
+		values[n++] = perf_event_clock(event);
+
 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 		values[n++] = enabled;
 

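With that applied and the attr settings above, the group read() layout
gains a time word right after nr; a reader sketch (hypothetical - the
PERF_FORMAT_TIME bit exists only in the patch above, and this layout
assumes no TIME_ENABLED / TIME_RUNNING / ID bits are set):

	#include <stdint.h>

	struct group_time_buf {
		uint64_t nr;		/* 1 + number of siblings */
		uint64_t time;		/* perf_event_clock(): attr.clockid
					 * time, e.g. CLOCK_MONOTONIC_RAW */
		uint64_t values[];	/* one count per group member */
	};

	/* after two read()s into pre / post:
	 *   elapsed = post.time - pre.time  (same clock as clock_gettime)
	 *   counts  = post.values[i] - pre.values[i]
	 */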

* Re: [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
  2018-03-12  8:27 ` Peter Zijlstra
@ 2018-03-13 23:45   ` Jason Vas Dias
  2018-03-14  9:45     ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Jason Vas Dias @ 2018-03-13 23:45 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: x86, LKML, Thomas Gleixner, andi

[-- Attachment #1: Type: text/plain, Size: 4201 bytes --]

On 12/03/2018, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Mar 12, 2018 at 07:01:20AM +0000, Jason Vas Dias wrote:
>>   Sometimes, particularly when correlating elapsed time to performance
>>   counter values,
>
> So what actual problem are you trying to solve here? Perf can already
> give you sample time in various clocks, including MONOTONIC_RAW.
>
>

Yes, I am sampling perf counters, including CPU_CYCLES , INSTRUCTIONS,
CPU_CLOCK, TASK_CLOCK, etc, in a Group FD I open with
perf_event_open() , for the current thread on the current CPU -
I am doing this for 4 threads , on Intel & ARM cpus.

Reading performance counters does involve  2 ioctls and a read() ,
which takes time that  already far exceeds the time required to read
the TSC or CNTPCT in the VDSO .

The CPU_CLOCK software counter should give the converted TSC cycles
seen between the ioctl( grp_fd, PERF_EVENT_IOC_ENABLE , ...)
and the  ioctl( grp_fd, PERF_EVENT_IOC_DISABLE ), and the
difference between the event->time_running and time_enabled
should also measure elapsed time .

This gives the "inner" elapsed time, from the perspective of the kernel,
while the measured code section had the counters enabled.

But unless the user-space program  also has a way of measuring elapsed time
from the CPU's perspective , ie. without being subject to operator or NTP / PTP
adjustment, it has no way of correlating this inner elapsed time with
any "outer"
elapsed time measurement it may have made - I also measure the time
taken by I/O operations between threads, for instance.

So that is my primary motivation - for each thread's main run loop, I
enable performance counters and count several PMU counters
and the CPU_CLOCK & TASK_CLOCK .  I want to determine
with maximal accuracy how much elapsed time was used
actually executing the task's instructions on the CPU ,
and how long they took to execute.
I want to try to exclude the time spent gathering and making
and analysing the performance measurements from the
time spent running the threads' main loop .

To do this accurately, it is best to exclude variations in time
that occur because of operator or NTP / PTP adjustments .

The CLOCK_MONOTONIC_RAW clock is the ONLY
clock that is MEANT to be immune from any adjustment.

It is meant to be a high-resolution clock with 1ns resolution
that should be subject to no adjustment, and hence one would expect
it to have the lowest latency.

But the way Linux has up to now implemented it , CLOCK_MONOTONIC_RAW
has a resolution (minimum time that can be measured)
that varies from 300 - 1000ns .

I can read the TSC and store a 16-byte timespec value in about 8ns
on the same CPU .

I understand that Linux must conform to the POSIX interface which
means it cannot provide sub-nanosecond resolution timers, but
it could allow user-space programs to easily discover the timer calibration
so that user-space programs can read the timers themselves.

Currently, users must parse the log file or use gdb / objdump to
inspect /proc/kcore to get the TSC calibration and exact
mult+shift values for the TSC value conversion.

Intel does not publish, nor does the CPU provide in ROM or firmware,
the actual precise TSC frequency - this must be calibrated against the
other clocks , according to a complicated procedure in section 18.2 of
the SDM . My TSC has a "rated" / nominal TSC frequency , which one
can compute from CPUID leaves, of 2.3ghz, but the "Refined TSC frequency"
is 2.8333ghz .

Hence I think Linux should export this calibrated frequency somehow ;
its "calibration" is expressed as the raw clocksource 'mult' and 'shift'
values, and is exported to the VDSO .

I think the VDSO should read the TSC and use the calibration
to render the raw, unadjusted time from the CPU's perspective.

Hence, the patch I am preparing , which is again attached.

I will submit it properly via email once I figure out
how to obtain the 'git-send-mail' tool, and how to
use it to send multiple patches, which seems
to be the only way to submit acceptable patches.

Also the attached timer program measures a latency
of about 20ns with my patched 4.15.9 kernel, where it
measured a latency of 300-1000ns without it.

Thanks & Regards,

Jason

[-- Attachment #2: vdso_clock_monotonic_raw_1.patch --]
[-- Type: application/octet-stream, Size: 3834 bytes --]

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d..fbc7371 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -182,6 +182,18 @@ notrace static u64 vread_tsc(void)
 	return last;
 }
 
+notrace static u64 vread_tsc_raw(void)
+{
+	u64 tsc
+	  , last = gtod->raw_cycle_last;
+
+	tsc	      = rdtsc_ordered();
+	if (likely(tsc >= last))
+		return tsc;
+	asm volatile ("");
+	return last;
+}
+
 notrace static inline u64 vgetsns(int *mode)
 {
 	u64 v;
@@ -203,6 +215,27 @@ notrace static inline u64 vgetsns(int *mode)
 	return v * gtod->mult;
 }
 
+notrace static inline u64 vgetsns_raw(int *mode)
+{
+	u64 v;
+	cycles_t cycles;
+
+	if (gtod->vclock_mode == VCLOCK_TSC)
+		cycles = vread_tsc_raw();
+#ifdef CONFIG_PARAVIRT_CLOCK
+	else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
+		cycles = vread_pvclock(mode);
+#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+	else if (gtod->vclock_mode == VCLOCK_HVCLOCK)
+		cycles = vread_hvclock(mode);
+#endif
+	else
+		return 0;
+	v = (cycles - gtod->raw_cycle_last) & gtod->raw_mask;
+	return v * gtod->raw_mult;
+}
+
 /* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
 notrace static int __always_inline do_realtime(struct timespec *ts)
 {
@@ -246,6 +279,27 @@ notrace static int __always_inline do_monotonic(struct timespec *ts)
 	return mode;
 }
 
+notrace static __always_inline int do_monotonic_raw(struct timespec *ts)
+{
+	unsigned long seq;
+	u64 ns;
+	int mode;
+
+	do {
+		seq = gtod_read_begin(gtod);
+		mode = gtod->vclock_mode;
+		ts->tv_sec = gtod->monotonic_time_raw_sec;
+		ns = gtod->monotonic_time_raw_nsec;
+		ns += vgetsns_raw(&mode);
+		ns >>= gtod->raw_shift;
+	} while (unlikely(gtod_read_retry(gtod, seq)));
+
+	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+	ts->tv_nsec = ns;
+
+	return mode;
+}
+
 notrace static void do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
@@ -277,6 +331,10 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 		if (do_monotonic(ts) == VCLOCK_NONE)
 			goto fallback;
 		break;
+	case CLOCK_MONOTONIC_RAW:
+		if (do_monotonic_raw(ts) == VCLOCK_NONE)
+			goto fallback;
+		break;
 	case CLOCK_REALTIME_COARSE:
 		do_realtime_coarse(ts);
 		break;
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index e1216dd..5af7093 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -45,6 +45,11 @@ void update_vsyscall(struct timekeeper *tk)
 	vdata->mult		= tk->tkr_mono.mult;
 	vdata->shift		= tk->tkr_mono.shift;
 
+	vdata->raw_cycle_last	= tk->tkr_raw.cycle_last;
+	vdata->raw_mask		= tk->tkr_raw.mask;
+	vdata->raw_mult		= tk->tkr_raw.mult;
+	vdata->raw_shift	= tk->tkr_raw.shift;
+
 	vdata->wall_time_sec		= tk->xtime_sec;
 	vdata->wall_time_snsec		= tk->tkr_mono.xtime_nsec;
 
@@ -74,5 +79,8 @@ void update_vsyscall(struct timekeeper *tk)
 		vdata->monotonic_time_coarse_sec++;
 	}
 
+	vdata->monotonic_time_raw_sec  = tk->raw_sec;
+	vdata->monotonic_time_raw_nsec = tk->tkr_raw.xtime_nsec;
+
 	gtod_write_end(vdata);
 }
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index fb856c9..24e4d45 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -22,6 +22,10 @@ struct vsyscall_gtod_data {
 	u64	mask;
 	u32	mult;
 	u32	shift;
+	u64	raw_cycle_last;
+	u64	raw_mask;
+	u32	raw_mult;
+	u32	raw_shift;
 
 	/* open coded 'struct timespec' */
 	u64		wall_time_snsec;
@@ -32,6 +36,8 @@ struct vsyscall_gtod_data {
 	gtod_long_t	wall_time_coarse_nsec;
 	gtod_long_t	monotonic_time_coarse_sec;
 	gtod_long_t	monotonic_time_coarse_nsec;
+	gtod_long_t	monotonic_time_raw_sec;
+	gtod_long_t	monotonic_time_raw_nsec;
 
 	int		tz_minuteswest;
 	int		tz_dsttime;
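
The arithmetic do_monotonic_raw() above performs, restated as a
stand-alone sketch (parameter names mirror the vsyscall_gtod_data
fields the patch exports; the seconds carry via __iter_div_u64_rem is
omitted):

	/* (tsc - raw_cycle_last) ticks, masked, scaled by raw_mult;
	 * the sum stays in units of 2^-raw_shift ns until the final shift */
	static inline unsigned long long
	raw_ns(unsigned long long tsc,
	       unsigned long long raw_cycle_last, unsigned long long raw_mask,
	       unsigned int raw_mult, unsigned int raw_shift,
	       unsigned long long monotonic_time_raw_nsec)
	{
		unsigned long long v = (tsc - raw_cycle_last) & raw_mask;

		return (monotonic_time_raw_nsec + v * raw_mult) >> raw_shift;
	}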

[-- Attachment #3: timer_latency.c --]
[-- Type: text/x-csrc, Size: 2443 bytes --]

/* 
 * Program to measure high-res timer latency.
 *
 */
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <unistd.h>
#include <time.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>

#ifndef N_SAMPLES
#define N_SAMPLES 100
#endif
#define _STR(_S_) #_S_
#define STR(_S_) _STR(_S_)

int main(int argc, char *const* argv, char *const* envp)
{ clockid_t clk = CLOCK_MONOTONIC_RAW;
  bool do_dump = false;
  int argn=1;
  for(; argn < argc; argn+=1)
    if( argv[argn] != NULL )
      if( *(argv[argn]) == '-')
	switch( *(argv[argn]+1) )
	{ case 'm':
	  case 'M':
	    clk = CLOCK_MONOTONIC;
	    break;
	  case 'd':
	  case 'D':
	    do_dump = true;
	    break;
	case '?':
	case 'h':
	case 'u':
	case 'U':
	case 'H':
	  fprintf(stderr,"Usage: timer_latency [ -m : use CLOCK_MONOTONIC clock (not CLOCK_MONOTONIC_RAW) ;  -d : dump timespec contents. N_SAMPLES: " STR(N_SAMPLES) "\n\t"
	          "Calculates average timer latency (minimum time that can be measured) over N_SAMPLES.\n"
	  );
	  return 0;
	}
  struct timespec sample[N_SAMPLES+1];
  unsigned int cnt=N_SAMPLES, s=0 ;
  do
  { if( 0 != clock_gettime(clk, &sample[s++]) )
    { fprintf(stderr,"oops, clock_gettime() failed: %d: '%s'.\n", errno, strerror(errno));
      return 1;
    }
  }while( --cnt );
  clock_gettime(clk, &sample[s]);
#define TS2NS(_TS_) ((((unsigned long long)(_TS_).tv_sec)*1000000000ULL) + (((unsigned long long)((_TS_).tv_nsec)))) 
  unsigned long long
    deltas [ N_SAMPLES ]
  , t1, t2, sum=0, zd=0
  , t_start = TS2NS(sample[0]);
  for(s=1; s < (N_SAMPLES+1); s+=1)
  { t1 = TS2NS(sample[s-1]);
    t2 = TS2NS(sample[s]);
    if ( t1 > t2 )
    { fprintf(stderr,"Inconsistency: %llu %llu %lu.%lu %lu.%lu\n", t1 , t2
            , sample[s-1].tv_sec, sample[s-1].tv_nsec
            , sample[s].tv_sec,   sample[s].tv_nsec
      );
      continue;
    }
    unsigned long long d =t2-t1;
    if ( d == 0 )
    { if( zd == 0 )
	fprintf(stderr, "0 delta!\n");
      zd += 1;
    }
    deltas[s-1] = d;
  }
  if( zd > 0 )
    fprintf(stderr,"%u 0 deltas\n");
  for(s = 0; s < N_SAMPLES; s+=1)
    sum += deltas[s];
  fprintf(stderr,"sum: %llu\n",sum);
  unsigned long long avg_ns = sum / N_SAMPLES;
  t1=(t2 - t_start);
  printf("Total time: %1.1llu.%9.9lluS - Average Latency: %1.1llu.%9.9lluS\n",
          t1 / 1000000000,       t1 % 1000000000,
          avg_ns / 1000000000,   avg_ns % 1000000000
        );
  return 0;
}



* Re: [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
  2018-03-12  7:01 Jason Vas Dias
@ 2018-03-12  8:27 ` Peter Zijlstra
  2018-03-13 23:45   ` Jason Vas Dias
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2018-03-12  8:27 UTC (permalink / raw)
  To: Jason Vas Dias; +Cc: x86, LKML, Thomas Gleixner, andi

On Mon, Mar 12, 2018 at 07:01:20AM +0000, Jason Vas Dias wrote:
>   Sometimes, particularly when correlating elapsed time to performance
>   counter values, 

So what actual problem are you trying to solve here? Perf can already
give you sample time in various clocks, including MONOTONIC_RAW.


* [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
@ 2018-03-12  7:01 Jason Vas Dias
  2018-03-12  8:27 ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Jason Vas Dias @ 2018-03-12  7:01 UTC (permalink / raw)
  To: x86, LKML, Thomas Gleixner, andi, Peter Zijlstra


  Currently the VDSO does not handle
     clock_gettime( CLOCK_MONOTONIC_RAW, &ts )
  on Intel / AMD - it calls
     vdso_fallback_gettime()
  for this clock, which issues a syscall with an unacceptably high
  latency (minimum measurable time, or time between measurements)
  of 300-700ns on two 2.8-3.9GHz Haswell x86_64 (Family_Model: 06_3C)
  machines under various versions of Linux.

  Sometimes, particularly when correlating elapsed time to performance
  counter values,  code needs to know elapsed time from the perspective
  of the CPU no matter how "hot" / fast or "cold" / slow it might be
  running wrt NTP / PTP ; when code needs this, the latency of
  a syscall is often unacceptably high.

  I reported this as Bug #198961 :
    'https://bugzilla.kernel.org/show_bug.cgi?id=198961'
  and in previous posts with subjects matching 'CLOCK_MONOTONIC_RAW' .
     
  This patch handles CLOCK_MONOTONIC_RAW clock_gettime() in the VDSO ,
  by exporting the raw clock calibration, last cycles, last xtime_nsec,
  and last raw_sec value in the vsyscall_gtod_data during vsyscall_update() .

  Now the new do_monotonic_raw() function in the vDSO has a latency of
  approximately 24ns on average, and the test program:
   tools/testing/selftests/timers/inconsistency-check.c
  succeeds with arguments: '-c 4 -t 120' or any arbitrary -t value.

  The patch is against Linus' latest 4.16-rc5 tree,
  the current HEAD of:
    git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

  This patch affects only files:
  
   arch/x86/include/asm/vgtod.h
   arch/x86/entry/vdso/vclock_gettime.c
   arch/x86/entry/vsyscall/vsyscall_gtod.c   
   arch/x86/entry/vdso/vdso.lds.S
   arch/x86/entry/vdso/vdsox32.lds.S
   arch/x86/entry/vdso/vdso32/vdso32.lds.S      


  and adds one new file:
   arch/x86/include/uapi/asm/vdso_tsc_calibration.h
   
  This is the second patch in the series,
  which adds a record of the calibrated tsc frequency to the VDSO,
  and a new header:
    uapi/asm/vdso_tsc_calibration.h
  which defines a structure :
    struct linux_tsc_calibration { u32 tsc_khz, mult, shift ; };
  and a getter function in the VDSO that can optionally be used
  by user-space code to implement sub-nanosecond precision clocks .
  This second patch is entirely optional but I think greatly
  expands the scope of user-space TSC readers .

  Resent: Oops, in the previous version of this patch (#2),
  the comments in the new vdso_tsc_calibration.h were wrong
  (they described an earlier version) - sorry about that.

  Best Regards,
     Jason Vas Dias  .

 PATCH 2/2:
---
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c
--- linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc5-p1	2018-03-12 04:29:27.296982872 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c	2018-03-12 05:38:53.019891195 +0000
@@ -21,6 +21,7 @@
 #include <linux/math64.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <uapi/asm/vdso_tsc_calibration.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -385,3 +386,22 @@ notrace time_t __vdso_time(time_t *t)
 }
 time_t time(time_t *t)
 	__attribute__((weak, alias("__vdso_time")));
+
+extern unsigned
+__vdso_linux_tsc_calibration(struct linux_tsc_calibration *);
+
+notrace	unsigned
+__vdso_linux_tsc_calibration(struct linux_tsc_calibration *tsc_cal)
+{
+	if ( (gtod->vclock_mode == VCLOCK_TSC) && (tsc_cal != ((void*)0UL)) )
+	{
+		tsc_cal -> tsc_khz = gtod->tsc_khz;
+		tsc_cal -> mult    = gtod->raw_mult;
+		tsc_cal -> shift   = gtod->raw_shift;
+		return 1;
+	}
+	return 0;
+}
+
+unsigned linux_tsc_calibration(void)
+	__attribute((weak, alias("__vdso_linux_tsc_calibration")));
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S
--- linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S.4.16-rc5-p1	2018-03-12 00:25:09.000000000 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S	2018-03-12 05:18:36.380673342 +0000
@@ -25,6 +25,8 @@ VERSION {
 		__vdso_getcpu;
 		time;
 		__vdso_time;
+		linux_tsc_calibration;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S
--- linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S.4.16-rc5-p1	2018-03-12 00:25:09.000000000 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S	2018-03-12 05:19:10.765022295 +0000
@@ -26,6 +26,7 @@ VERSION
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	};
 
 	LINUX_2.5 {
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S
--- linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S.4.16-rc5-p1	2018-03-12 00:25:09.000000000 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S	2018-03-12 05:18:51.626827852 +0000
@@ -21,6 +21,7 @@ VERSION {
 		__vdso_gettimeofday;
 		__vdso_getcpu;
 		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff -up linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c
--- linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc5-p1	2018-03-12 04:23:10.005141993 +0000
+++ linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c	2018-03-12 05:07:09.246115115 +0000
@@ -18,6 +18,8 @@
 #include <asm/vvar.h>
 #include <asm/cpufeature.h>
 
+extern unsigned tsc_khz;
+
 int vclocks_used __read_mostly;
 
 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
@@ -51,6 +53,7 @@ void update_vsyscall(struct timekeeper *
 	vdata->raw_mult		= tk->tkr_raw.mult;
 	vdata->raw_shift	= tk->tkr_raw.shift;
 	vdata->has_rdtscp	= static_cpu_has(X86_FEATURE_RDTSCP);
+	vdata->tsc_khz          = tsc_khz;
 
 	vdata->wall_time_sec		= tk->xtime_sec;
 	vdata->wall_time_snsec		= tk->tkr_mono.xtime_nsec;
diff -up linux-4.16-rc5/arch/x86/include/asm/vgtod.h.4.16-rc5-p1 linux-4.16-rc5/arch/x86/include/asm/vgtod.h
--- linux-4.16-rc5/arch/x86/include/asm/vgtod.h.4.16-rc5-p1	2018-03-12 04:23:10.006142006 +0000
+++ linux-4.16-rc5/arch/x86/include/asm/vgtod.h	2018-03-12 05:03:37.312278324 +0000
@@ -27,6 +27,7 @@ struct vsyscall_gtod_data {
 	u32	raw_mult;
 	u32	raw_shift;
 	u32	has_rdtscp;
+	u32     tsc_khz;
 
 	/* open coded 'struct timespec' */
 	u64		wall_time_snsec;
diff -up linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h.4.16-rc5-p1 linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
--- linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h.4.16-rc5-p1	2018-03-12 05:13:26.014607615 +0000
+++ linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h	2018-03-12 06:52:43.782286294 +0000
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_VDSO_TSC_CALIBRATION_H
+#define _ASM_X86_VDSO_TSC_CALIBRATION_H
+/* 
+ * Programs that want to use rdtsc / rdtscp instructions
+ * from user-space can make use of the Linux kernel TSC calibration
+ * by calling :
+ *    __vdso_linux_tsc_calibration(struct linux_tsc_calibration_s *);
+ * ( one has to resolve this symbol as in 
+ *   tools/testing/selftests/vDSO/parse_vdso.c
+ * )
+ * which fills in a structure
+ * with the following layout :
+ */
+
+/** struct linux_tsc_calibration -
+ * mult:    amount to multiply 64-bit TSC value by
+ * shift:   the right shift to apply to (mult*TSC) yielding nanoseconds
+ * tsc_khz: the calibrated TSC frequency in KHz from which previous members calculated
+ */
+struct linux_tsc_calibration
+{
+        unsigned int mult;
+        unsigned int shift;
+        unsigned int tsc_khz;
+};
+
+/* To use:
+ *
+ *  static unsigned
+ *  (*linux_tsc_cal)(struct linux_tsc_calibration *linux_tsc_cal) = vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
+ *  if( linux_tsc_cal == 0UL )
+ *  { fprintf(stderr,"the patch providing __vdso_linux_tsc_calibration is not applied to the kernel.\n");
+ *    return ERROR;
+ *  }
+ *  static struct linux_tsc_calibration clock_source={0};
+ *  if((clock_source.mult==0) && ! (*linux_tsc_cal)(&clock_source) )
+ *    fprintf(stderr,"TSC is not the system clocksource.\n");
+ *  unsigned int tsc_lo, tsc_hi, tsc_cpu;
+ *  asm volatile
+ *  ( "rdtscp" : "=a" (tsc_lo), "=d" (tsc_hi), "=c" (tsc_cpu) );
+ *  unsigned long tsc = (((unsigned long)tsc_hi) << 32) | tsc_lo;
+ *  unsigned long nanoseconds =
+ *   (( clock_source . mult ) * tsc ) >> (clock_source . shift);
+ *
+ *  nanoseconds is now TSC value converted to nanoseconds,
+ *  according to Linux' clocksource calibration values.
+ *  Incidentally, 'tsc_cpu' is the number of the CPU the task is running on.
+ *
+ * But better results are obtained by applying this to the difference (delta)
+ * and adding this to some previous timespec value:
+ *   static u64 previous_tsc=0, previous_nsec=0, previous_sec=0;
+ *   u64  tsc      = rdtscp();
+ *   u64  delta    = tsc - previous_tsc;
+ *   u64  nsec     = ((delta * clock_source.mult) + previous_nsec )
+ *	           >> clock_source.shift;
+ *   ts->tv_sec    = previous_sec + (nsec / NSEC_PER_SEC);
+ *   ts->tv_nsec   = nsec % NSEC_PER_SEC;
+ *   previous_tsc  = tsc
+ *   previous_sec  = ts->tv_sec;
+ *   previous_nsec = ts->tv_nsec << clock_source.shift;
+ *   return ts;
+ * This is the approach taken by Linux kernel & in VDSO .
+ *
+ * Or, in user-space, with floating point, one could convert the rdtscp value to nanoseconds :
+ *     u64 ns = lround( ((double)rdtscp()) / (((double)clock_source.tsc_khz) / 1e6) );
+ * (ie. if tsc_khz is 3000000 (3GHz), there are 3 tsc ticks per nanosecond, so divide tsc ticks by 3).
+ *
+ * There should actually be very little difference between the two values obtained (about 0.02%)
+ * by either method.
+ */
+
+#endif


* [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
@ 2018-03-12  5:44 Jason Vas Dias
  0 siblings, 0 replies; 13+ messages in thread
From: Jason Vas Dias @ 2018-03-12  5:44 UTC (permalink / raw)
  To: x86, LKML, Thomas Gleixner, andi, Peter Zijlstra


  Currently the VDSO does not handle
     clock_gettime( CLOCK_MONOTONIC_RAW, &ts )
  on Intel / AMD - it calls
     vdso_fallback_gettime()
  for this clock, which issues a syscall with an unacceptably high
  latency (minimum measurable time, or time between measurements)
  of 300-700ns on two 2.8-3.9GHz Haswell x86_64 (Family_Model: 06_3C)
  machines under various versions of Linux.

  Sometimes, particularly when correlating elapsed time to performance
  counter values,  code needs to know elapsed time from the perspective
  of the CPU no matter how "hot" / fast or "cold" / slow it might be
  running wrt NTP / PTP ; when code needs this, the latency of
  a syscall is often unacceptably high.

  I reported this as Bug #198961 :
    'https://bugzilla.kernel.org/show_bug.cgi?id=198961'
  and in previous posts with subjects matching 'CLOCK_MONOTONIC_RAW' .
     
  This patch handles CLOCK_MONOTONIC_RAW clock_gettime() in the VDSO ,
  by exporting the raw clock calibration, last cycles, last xtime_nsec,
  and last raw_sec value in the vsyscall_gtod_data during vsyscall_update() .

  Now the new do_monotonic_raw() function in the vDSO has a latency of
  approximately 24ns on average, and the test program:
   tools/testing/selftests/timers/inconsistency-check.c
  succeeds with arguments: '-c 4 -t 120' or any arbitrary -t value.

  The patch is against Linus' latest 4.16-rc5 tree,
  the current HEAD of:
    git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

  This patch affects only files:
  
   arch/x86/include/asm/vgtod.h
   arch/x86/entry/vdso/vclock_gettime.c
   arch/x86/entry/vdso/vdso.lds.S
   arch/x86/entry/vdso/vdsox32.lds.S
   arch/x86/entry/vdso/vdso32/vdso32.lds.S      
   arch/x86/entry/vsyscall/vsyscall_gtod.c
   
  This is the second patch in the series,
  which adds a record of the calibrated tsc frequency to the VDSO,
  and a new header:
    uapi/asm/vdso_tsc_calibration.h
  which defines a structure :
    struct linux_tsc_calibration { u32 tsc_khz, mult, shift ; };
  and a getter function in the VDSO that can optionally be used
  by user-space code to implement sub-nanosecond precision clocks .
  This second patch is entirely optional but I think greatly
  expands the scope of user-space TSC readers .
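
  A sketch of how user-space might resolve and call the getter, using the
  vdso_init_from_sysinfo_ehdr() / vdso_sym() helpers from
  tools/testing/selftests/vDSO/parse_vdso.c (assuming the patched uapi
  header is installed; the error strings are illustrative):

   #include <stdio.h>
   #include <stdint.h>
   #include <sys/auxv.h>                    /* getauxval(), AT_SYSINFO_EHDR */
   #include <asm/vdso_tsc_calibration.h>    /* only with this patch applied */

   /* Helpers from tools/testing/selftests/vDSO/parse_vdso.c : */
   extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
   extern void *vdso_sym(const char *version, const char *name);

   int main(void)
   {
           unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR);
           if (!sysinfo_ehdr) {
                   fprintf(stderr, "no vDSO mapped\n");
                   return 1;
           }
           vdso_init_from_sysinfo_ehdr(sysinfo_ehdr);

           unsigned (*tsc_cal)(struct linux_tsc_calibration *) =
                   (unsigned (*)(struct linux_tsc_calibration *))
                   vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
           if (!tsc_cal) {
                   fprintf(stderr, "kernel lacks __vdso_linux_tsc_calibration\n");
                   return 1;
           }

           struct linux_tsc_calibration cal;
           if (!(*tsc_cal)(&cal)) {
                   fprintf(stderr, "TSC is not the system clocksource\n");
                   return 1;
           }
           printf("tsc_khz=%u mult=%u shift=%u\n",
                  cal.tsc_khz, cal.mult, cal.shift);
           return 0;
   }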

  Oops - the previous version of this second patch mistakenly
  duplicated the changed part of vclock_gettime.c.

  Best Regards,
     Jason Vas Dias  .
     
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c
--- linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc5-p1	2018-03-12 04:29:27.296982872 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c	2018-03-12 05:38:53.019891195 +0000
@@ -21,6 +21,7 @@
 #include <linux/math64.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <uapi/asm/vdso_tsc_calibration.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -385,3 +386,22 @@ notrace time_t __vdso_time(time_t *t)
 }
 time_t time(time_t *t)
 	__attribute__((weak, alias("__vdso_time")));
+
+extern unsigned
+__vdso_linux_tsc_calibration(struct linux_tsc_calibration *);
+
+notrace	unsigned
+__vdso_linux_tsc_calibration(struct linux_tsc_calibration *tsc_cal)
+{
+	if ( (gtod->vclock_mode == VCLOCK_TSC) && (tsc_cal != ((void*)0UL)) )
+	{
+		tsc_cal -> tsc_khz = gtod->tsc_khz;
+		tsc_cal -> mult    = gtod->raw_mult;
+		tsc_cal -> shift   = gtod->raw_shift;
+		return 1;
+	}
+	return 0;
+}
+
+unsigned linux_tsc_calibration(void)
+	__attribute((weak, alias("__vdso_linux_tsc_calibration")));
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S
--- linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S.4.16-rc5-p1	2018-03-12 00:25:09.000000000 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S	2018-03-12 05:18:36.380673342 +0000
@@ -25,6 +25,8 @@ VERSION {
 		__vdso_getcpu;
 		time;
 		__vdso_time;
+		linux_tsc_calibration;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S
--- linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S.4.16-rc5-p1	2018-03-12 00:25:09.000000000 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S	2018-03-12 05:19:10.765022295 +0000
@@ -26,6 +26,7 @@ VERSION
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	};
 
 	LINUX_2.5 {
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S
--- linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S.4.16-rc5-p1	2018-03-12 00:25:09.000000000 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S	2018-03-12 05:18:51.626827852 +0000
@@ -21,6 +21,7 @@ VERSION {
 		__vdso_gettimeofday;
 		__vdso_getcpu;
 		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff -up linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c
--- linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc5-p1	2018-03-12 04:23:10.005141993 +0000
+++ linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c	2018-03-12 05:07:09.246115115 +0000
@@ -18,6 +18,8 @@
 #include <asm/vvar.h>
 #include <asm/cpufeature.h>
 
+extern unsigned tsc_khz;
+
 int vclocks_used __read_mostly;
 
 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
@@ -51,6 +53,7 @@ void update_vsyscall(struct timekeeper *
 	vdata->raw_mult		= tk->tkr_raw.mult;
 	vdata->raw_shift	= tk->tkr_raw.shift;
 	vdata->has_rdtscp	= static_cpu_has(X86_FEATURE_RDTSCP);
+	vdata->tsc_khz          = tsc_khz;
 
 	vdata->wall_time_sec		= tk->xtime_sec;
 	vdata->wall_time_snsec		= tk->tkr_mono.xtime_nsec;
diff -up linux-4.16-rc5/arch/x86/include/asm/vgtod.h.4.16-rc5-p1 linux-4.16-rc5/arch/x86/include/asm/vgtod.h
--- linux-4.16-rc5/arch/x86/include/asm/vgtod.h.4.16-rc5-p1	2018-03-12 04:23:10.006142006 +0000
+++ linux-4.16-rc5/arch/x86/include/asm/vgtod.h	2018-03-12 05:03:37.312278324 +0000
@@ -27,6 +27,7 @@ struct vsyscall_gtod_data {
 	u32	raw_mult;
 	u32	raw_shift;
 	u32	has_rdtscp;
+	u32     tsc_khz;
 
 	/* open coded 'struct timespec' */
 	u64		wall_time_snsec;
diff -up linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h.4.16-rc5-p1 linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
--- linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h.4.16-rc5-p1	2018-03-12 05:13:26.014607615 +0000
+++ linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h	2018-03-11 20:47:05.409960497 +0000
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_VDSO_TSC_CALIBRATION_H
+#define _ASM_X86_VDSO_TSC_CALIBRATION_H
+/* 
+ * Programs that want to use rdtsc / rdtscp instructions
+ * from user-space can make use of the Linux kernel TSC calibration
+ * by calling :
+ *    __vdso_linux_tsc_calibration(struct linux_tsc_calibration *);
+ * ( one has to resolve this symbol as in 
+ *   tools/testing/selftests/vDSO/parse_vdso.c
+ * )
+ * which fills in a structure
+ * with the following layout :
+ */
+
+/** struct linux_tsc_calibration -
+ * mult:    amount to multiply 64-bit TSC value by
+ * shift:   the right shift to apply to (mult*TSC) yielding nanoseconds
+ * tsc_khz: the calibrated TSC frequency in KHz from which the previous members are calculated
+ */
+struct linux_tsc_calibration
+{
+        unsigned int mult;
+        unsigned int shift;
+        unsigned int tsc_khz;
+};
+
+/* To use:
+ *
+ *  static unsigned
+ *  (*linux_tsc_cal)(struct linux_tsc_calibration *) = vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
+ *  if( linux_tsc_cal == NULL )
+ *  { fprintf(stderr,"the patch providing __vdso_linux_tsc_calibration is not applied to the kernel.\n");
+ *    return ERROR;
+ *  }
+ *  static struct linux_tsc_calibration clock_source;
+ *  if( ! (*linux_tsc_cal)(&clock_source) )
+ *    fprintf(stderr,"TSC is not the system clocksource.\n");
+ *  unsigned int tsc_lo, tsc_hi, tsc_cpu;
+ *  asm volatile
+ *  ( "rdtscp" : "=a" (tsc_lo), "=d" (tsc_hi), "=c" (tsc_cpu) );
+ *  unsigned long tsc = (((unsigned long)tsc_hi) << 32) | tsc_lo;
+ *  unsigned long nanoseconds =
+ *   ( clock_source.mult * tsc ) >> clock_source.shift;
+ *
+ *  nanoseconds is now the TSC value converted to nanoseconds,
+ *  according to Linux' clocksource calibration values.
+ *  Incidentally, 'tsc_cpu' is the number of the CPU the task is running on.
+ *
+ * But better results are obtained by applying this to the difference (delta)
+ * between successive TSC readings and adding the scaled delta to a previous
+ * timespec value - this avoids 64-bit overflow of the (mult * tsc) product:
+ *   static u64 previous_tsc=0, previous_nsec=0, previous_sec=0;
+ *   u64  tsc      = rdtscp();
+ *   u64  delta    = tsc - previous_tsc;
+ *   u64  nsec     = ((delta * clock_source.mult) + previous_nsec)
+ *                 >> clock_source.shift;
+ *   ts->tv_sec    = previous_sec + (nsec / NSEC_PER_SEC);
+ *   ts->tv_nsec   = nsec % NSEC_PER_SEC;
+ *   previous_tsc  = tsc;
+ *   previous_sec  = ts->tv_sec;
+ *   previous_nsec = ts->tv_nsec << clock_source.shift;
+ *   return ts;
+ * This is the approach taken by the Linux kernel and by the VDSO.
+ *
+ * Or, in user-space, with floating point, one could convert the TSC value directly to nanoseconds :
+ *     u64 ns = lround( ((double)rdtscp()) / (((double)clock_source.tsc_khz) / 1e6) );
+ * (ie. if tsc_khz is 3000000 (a 3GHz TSC), there are 3 tsc ticks per nanosecond, so divide tsc ticks by 3).
+ *
+ * There should be very little difference (about 0.02%) between the values obtained by the two methods.
+ */
+
+#endif
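
As a side note on how a (mult, shift) pair relates to tsc_khz: one tick of
a tsc_khz clock lasts 10^6 / tsc_khz nanoseconds, so for a chosen shift one
can take mult = (10^6 << shift) / tsc_khz - a simplification of what the
kernel's clocks_calc_mult_shift() computes. A quick self-contained check
(the shift value here is just an illustrative choice):

  #include <stdio.h>
  #include <stdint.h>

  /* ns ~= (ticks * mult) >> shift, where one tick = 1e6 / tsc_khz ns */
  static void calc_mult_shift(uint32_t tsc_khz, uint32_t *mult, uint32_t *shift)
  {
          *shift = 24;                                     /* illustrative */
          *mult  = (uint32_t)((1000000ULL << *shift) / tsc_khz);
  }

  int main(void)
  {
          uint32_t mult, shift;
          calc_mult_shift(3000000, &mult, &shift);         /* 3 GHz TSC */
          /* 3000000 ticks of a 3GHz TSC span 1 ms; expect ~1000000 ns
           * (999999 here, since the integer mult rounds down slightly):
           */
          printf("mult=%u shift=%u -> %llu ns\n", mult, shift,
                 (unsigned long long)((3000000ULL * mult) >> shift));
          return 0;
  }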

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW
@ 2018-03-12  5:33 Jason Vas Dias
  0 siblings, 0 replies; 13+ messages in thread
From: Jason Vas Dias @ 2018-03-12  5:33 UTC (permalink / raw)
  To: x86, LKML, Thomas Gleixner, andi, Peter Zijlstra


  Currently the VDSO does not handle
     clock_gettime( CLOCK_MONOTONIC_RAW, &ts )
  on Intel / AMD - it calls
     vdso_fallback_gettime()
  for this clock, which issues a syscall with an unacceptably high
  latency (minimum measurable time, or time between measurements)
  of 300-700ns on two 2.8-3.9GHz Haswell x86_64 (Family_Model: 06_3C)
  machines under various versions of Linux.

  Sometimes, particularly when correlating elapsed time to performance
  counter values, code needs to know elapsed time from the perspective
  of the CPU no matter how "hot" (fast) or "cold" (slow) it might be
  running with respect to NTP / PTP adjustments; in such cases, the
  latency of a syscall is often unacceptably high.

  I reported this as Bug #198961 :
    'https://bugzilla.kernel.org/show_bug.cgi?id=198961'
  and in previous posts with subjects matching 'CLOCK_MONOTONIC_RAW' .
     
  This patch handles CLOCK_MONOTONIC_RAW clock_gettime() in the VDSO,
  by exporting the raw clock calibration, last cycles, last xtime_nsec,
  and last raw_sec value in the vsyscall_gtod_data during update_vsyscall().

  Now the new do_monotonic_raw() function in the vDSO has a latency of
  about 24ns on average, and the test program:
   tools/testing/selftests/timers/inconsistency-check.c
  succeeds with arguments '-c 4 -t 120' or any arbitrary -t value.

  The patch is against Linus' latest 4.16-rc5 tree, the current HEAD of:
    git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

  This patch affects only files:
  
   arch/x86/include/asm/vgtod.h
   arch/x86/entry/vdso/vclock_gettime.c
   arch/x86/entry/vdso/vdso.lds.S
   arch/x86/entry/vdso/vdsox32.lds.S
   arch/x86/entry/vdso/vdso32/vdso32.lds.S      
   arch/x86/entry/vsyscall/vsyscall_gtod.c
   
  This is the second patch in the series. It adds a record of the
  calibrated TSC frequency to the VDSO, a new header:
    uapi/asm/vdso_tsc_calibration.h
  which defines the structure:
    struct linux_tsc_calibration { u32 mult, shift, tsc_khz; };
  and a getter function in the VDSO that can optionally be used by
  user-space code to implement sub-nanosecond precision clocks.
  This second patch is entirely optional, but I think it greatly
  expands the scope of user-space TSC readers.

  Best Regards,
     Jason Vas Dias  .
     
---
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c
--- linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc5-p1	2018-03-12 04:29:27.296982872 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vclock_gettime.c	2018-03-12 05:10:53.185158334 +0000
@@ -21,6 +21,7 @@
 #include <linux/math64.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <uapi/asm/vdso_tsc_calibration.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -385,3 +386,22 @@ notrace time_t __vdso_time(time_t *t)
 }
 time_t time(time_t *t)
 	__attribute__((weak, alias("__vdso_time")));
+
+extern unsigned
+__vdso_linux_tsc_calibration(struct linux_tsc_calibration *);
+
+notrace	unsigned
+__vdso_linux_tsc_calibration(struct linux_tsc_calibration *tsc_cal)
+{
+	if ( (gtod->vclock_mode == VCLOCK_TSC) && (tsc_cal != ((void*)0UL)) )
+	{
+		tsc_cal -> tsc_khz = gtod->tsc_khz;
+		tsc_cal -> mult    = gtod->raw_mult;
+		tsc_cal -> shift   = gtod->raw_shift;
+		return 1;
+	}
+	return 0;
+}
+
+unsigned linux_tsc_calibration(void)
+	__attribute((weak, alias("__vdso_linux_tsc_calibration")));
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S
--- linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S.4.16-rc5-p1	2018-03-12 00:25:09.000000000 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vdso.lds.S	2018-03-12 05:18:36.380673342 +0000
@@ -25,6 +25,8 @@ VERSION {
 		__vdso_getcpu;
 		time;
 		__vdso_time;
+		linux_tsc_calibration;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S
--- linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S.4.16-rc5-p1	2018-03-12 00:25:09.000000000 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vdso32/vdso32.lds.S	2018-03-12 05:19:10.765022295 +0000
@@ -26,6 +26,7 @@ VERSION
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	};
 
 	LINUX_2.5 {
diff -up linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S
--- linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S.4.16-rc5-p1	2018-03-12 00:25:09.000000000 +0000
+++ linux-4.16-rc5/arch/x86/entry/vdso/vdsox32.lds.S	2018-03-12 05:18:51.626827852 +0000
@@ -21,6 +21,7 @@ VERSION {
 		__vdso_gettimeofday;
 		__vdso_getcpu;
 		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff -up linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc5-p1 linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c
--- linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc5-p1	2018-03-12 04:23:10.005141993 +0000
+++ linux-4.16-rc5/arch/x86/entry/vsyscall/vsyscall_gtod.c	2018-03-12 05:07:09.246115115 +0000
@@ -18,6 +18,8 @@
 #include <asm/vvar.h>
 #include <asm/cpufeature.h>
 
+extern unsigned tsc_khz;
+
 int vclocks_used __read_mostly;
 
 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
@@ -51,6 +53,7 @@ void update_vsyscall(struct timekeeper *
 	vdata->raw_mult		= tk->tkr_raw.mult;
 	vdata->raw_shift	= tk->tkr_raw.shift;
 	vdata->has_rdtscp	= static_cpu_has(X86_FEATURE_RDTSCP);
+	vdata->tsc_khz          = tsc_khz;
 
 	vdata->wall_time_sec		= tk->xtime_sec;
 	vdata->wall_time_snsec		= tk->tkr_mono.xtime_nsec;
diff -up linux-4.16-rc5/arch/x86/include/asm/vgtod.h.4.16-rc5-p1 linux-4.16-rc5/arch/x86/include/asm/vgtod.h
--- linux-4.16-rc5/arch/x86/include/asm/vgtod.h.4.16-rc5-p1	2018-03-12 04:23:10.006142006 +0000
+++ linux-4.16-rc5/arch/x86/include/asm/vgtod.h	2018-03-12 05:03:37.312278324 +0000
@@ -27,6 +27,7 @@ struct vsyscall_gtod_data {
 	u32	raw_mult;
 	u32	raw_shift;
 	u32	has_rdtscp;
+	u32     tsc_khz;
 
 	/* open coded 'struct timespec' */
 	u64		wall_time_snsec;
diff -up linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h.4.16-rc5-p1 linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
--- linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h.4.16-rc5-p1	2018-03-12 05:13:26.014607615 +0000
+++ linux-4.16-rc5/arch/x86/include/uapi/asm/vdso_tsc_calibration.h	2018-03-11 20:47:05.409960497 +0000
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_VDSO_TSC_CALIBRATION_H
+#define _ASM_X86_VDSO_TSC_CALIBRATION_H
+/* 
+ * Programs that want to use rdtsc / rdtscp instructions
+ * from user-space can make use of the Linux kernel TSC calibration
+ * by calling :
+ *    __vdso_linux_tsc_calibration(struct linux_tsc_calibration *);
+ * ( one has to resolve this symbol as in 
+ *   tools/testing/selftests/vDSO/parse_vdso.c
+ * )
+ * which fills in a structure
+ * with the following layout :
+ */
+
+/** struct linux_tsc_calibration -
+ * mult:    amount to multiply 64-bit TSC value by
+ * shift:   the right shift to apply to (mult*TSC) yielding nanoseconds
+ * tsc_khz: the calibrated TSC frequency in KHz from which the previous members are calculated
+ */
+struct linux_tsc_calibration
+{
+        unsigned int mult;
+        unsigned int shift;
+        unsigned int tsc_khz;
+};
+
+/* To use:
+ *
+ *  static unsigned
+ *  (*linux_tsc_cal)(struct linux_tsc_calibration *) = vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
+ *  if( linux_tsc_cal == NULL )
+ *  { fprintf(stderr,"the patch providing __vdso_linux_tsc_calibration is not applied to the kernel.\n");
+ *    return ERROR;
+ *  }
+ *  static struct linux_tsc_calibration clock_source;
+ *  if( ! (*linux_tsc_cal)(&clock_source) )
+ *    fprintf(stderr,"TSC is not the system clocksource.\n");
+ *  unsigned int tsc_lo, tsc_hi, tsc_cpu;
+ *  asm volatile
+ *  ( "rdtscp" : "=a" (tsc_lo), "=d" (tsc_hi), "=c" (tsc_cpu) );
+ *  unsigned long tsc = (((unsigned long)tsc_hi) << 32) | tsc_lo;
+ *  unsigned long nanoseconds =
+ *   ( clock_source.mult * tsc ) >> clock_source.shift;
+ *
+ *  nanoseconds is now the TSC value converted to nanoseconds,
+ *  according to Linux' clocksource calibration values.
+ *  Incidentally, 'tsc_cpu' is the number of the CPU the task is running on.
+ *
+ * But better results are obtained by applying this to the difference (delta)
+ * between successive TSC readings and adding the scaled delta to a previous
+ * timespec value - this avoids 64-bit overflow of the (mult * tsc) product:
+ *   static u64 previous_tsc=0, previous_nsec=0, previous_sec=0;
+ *   u64  tsc      = rdtscp();
+ *   u64  delta    = tsc - previous_tsc;
+ *   u64  nsec     = ((delta * clock_source.mult) + previous_nsec)
+ *                 >> clock_source.shift;
+ *   ts->tv_sec    = previous_sec + (nsec / NSEC_PER_SEC);
+ *   ts->tv_nsec   = nsec % NSEC_PER_SEC;
+ *   previous_tsc  = tsc;
+ *   previous_sec  = ts->tv_sec;
+ *   previous_nsec = ts->tv_nsec << clock_source.shift;
+ *   return ts;
+ * This is the approach taken by the Linux kernel and by the VDSO.
+ *
+ * Or, in user-space, with floating point, one could convert the TSC value directly to nanoseconds :
+ *     u64 ns = lround( ((double)rdtscp()) / (((double)clock_source.tsc_khz) / 1e6) );
+ * (ie. if tsc_khz is 3000000 (a 3GHz TSC), there are 3 tsc ticks per nanosecond, so divide tsc ticks by 3).
+ *
+ * There should be very little difference (about 0.02%) between the values obtained by the two methods.
+ */
+
+#endif
---
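
Why the VDSO scales only the delta since the last update rather than the raw
TSC value: with the illustrative calibration above (mult ~= 5592405, shift 24
for a ~3GHz TSC), the 64-bit product (tsc * mult) overflows once tsc exceeds
2^64 / mult, i.e. after roughly 1100 seconds of counting. A quick check under
those assumed values:

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          uint64_t mult      = 5592405;       /* assumed: ~3GHz TSC, shift 24 */
          uint64_t max_ticks = UINT64_MAX / mult;
          uint64_t secs      = max_ticks / 3000000000ULL;  /* 3e9 ticks/sec */

          printf("(tsc * mult) overflows after %llu ticks (~%llu s)\n",
                 (unsigned long long)max_ticks, (unsigned long long)secs);
          return 0;
  }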

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-03-14 13:29 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-12  9:14 [PATCH v4.16-rc4 2/2] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW Jason Vas Dias
2018-03-12 17:41 ` kbuild test robot
  -- strict thread matches above, loose matches on Subject: below --
2018-03-12  7:01 Jason Vas Dias
2018-03-12  8:27 ` Peter Zijlstra
2018-03-13 23:45   ` Jason Vas Dias
2018-03-14  9:45     ` Peter Zijlstra
2018-03-14 12:55       ` Jason Vas Dias
2018-03-14 13:11         ` Peter Zijlstra
2018-03-14 13:12         ` Peter Zijlstra
2018-03-14 13:16         ` Peter Zijlstra
2018-03-14 13:29         ` Peter Zijlstra
2018-03-12  5:44 Jason Vas Dias
2018-03-12  5:33 Jason Vas Dias

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).