* [PATCH v4.15.7 1/1] on Intel, VDSO should handle CLOCK_MONOTONIC_RAW and export 'tsc_calibration' pointer
@ 2018-03-05  2:59 Jason Vas Dias
  2018-03-06  9:33 ` Thomas Gleixner
From: Jason Vas Dias @ 2018-03-05  2:59 UTC (permalink / raw)
  To: x86, linux-kernel, andi, tglx

On Intel / AMD platforms, when the clock source is TSC,
this makes the VDSO support
    clock_gettime(CLOCK_MONOTONIC_RAW, &timespec)
calls by issuing a 'rdtscp' instruction and performing conversion
of the value according to the kernel TSC calibration
'mult' and 'shift' values in the vsyscall_gtod_data structure :
  ...
    tsc  = rdtscp();
    tsc  *= gtod->mult;
    tsc >>= gtod->shift;
    ts->tv_sec = __iter_div_u64_rem( tsc, 1000000000UL, &ts->tv_nsec );
  ...
 instead of calling vdso_fallback_gtod() for CLOCK_MONOTONIC_RAW
 clockid_t values.

It also provides a new function in the VDSO :

struct linux_timestamp_conversion
{ u32  mult;
  u32  shift;
};
extern
  const struct linux_timestamp_conversion *
  __vdso_linux_tsc_calibration(void);

which can be used by user-space rdtsc / rdtscp issuers: using code such
as that in
   tools/testing/selftests/vDSO/parse_vdso.c
they can call vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration"),
which returns a pointer to the function in the VDSO; that function
returns the address of the 'mult' field in the vsyscall_gtod_data.

Thus user-space programs can use rdtscp and interpret its return values
in exactly the same way the kernel would, but without entering the kernel.
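
In outline, the lookup works like this (a minimal sketch; the complete
EXAMPLE further below does the same with error checking, and assumes
parse_vdso.c from the kernel selftests is #included into the same
translation unit):

#include <sys/auxv.h>   /* getauxval(), AT_SYSINFO_EHDR */
/* ... after #include-ing tools/testing/selftests/vDSO/parse_vdso.c : */

static const struct linux_timestamp_conversion * get_tsc_calibration(void)
{
	vdso_init_from_sysinfo_ehdr( getauxval(AT_SYSINFO_EHDR) );
	const struct linux_timestamp_conversion * (*f)(void) =
		vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
	return f ? f() : 0;  /* 0 when symbol absent or clocksource is not TSC */
}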

 As pointed out in Bug # 198961 :
   https://bugzilla.kernel.org/show_bug.cgi?id=198961
 which contains extra test programs and the full story behind this change,
 using CLOCK_MONOTONIC_RAW without the patch results in
 a minimum measurable time (latency) of @ 300 - 700ns because of
 the syscall used by vdso_fallback_gtod() .

With the patch, the latency falls to @ 100ns .

The latency would be @ 16 - 32 ns if the do_monotonic_raw()
handler could record its previous TSC value and seconds return value
somewhere, but since the VDSO has no data region or writable page,
of course it cannot. Hence, to enable effective use of the TSC by
user-space programs, Linux must provide a way for them to discover the
calibration mult and shift values the kernel uses for the clock source;
only by doing so can user-space get values that are comparable to
kernel-generated values.

And I'd really like to know: why does the gtod->mult value change ?
After TSC calibration, it and the shift are calculated to render the
best approximation of a nanoseconds value from the TSC value.

The TSC is MEANT to be monotonic and to continue in sleep states
on modern Intel CPUs . So why does the gtod->mult change ?

But the mult value does change. Currently there is no way for user-space
programs to discover that such a change has occurred, or when. With this
very tiny, simple patch, they could know instantly when such changes occur,
and could implement TSC readers that perform the full conversion with
latencies of 15-30ns (on my CPU).

Here is the patch:

BEGIN PATCH :

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d..63f5f18 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -246,6 +246,28 @@ notrace static int __always_inline do_monotonic(struct timespec *ts)
 	return mode;
 }

+notrace static int __always_inline do_monotonic_raw( struct timespec *ts)
+{
+        volatile u32 tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs generated for 64-bit as for 32-bit builds
+        u64 ns;
+        register u64 tsc=0;
+        if (gtod->vclock_mode == VCLOCK_TSC)
+        { asm volatile
+            ( "rdtscp"
+            : "=a" (tsc_lo)
+            , "=d" (tsc_hi)
+            , "=c" (tsc_cpu)
+            ); // : eax, edx, ecx used - NOT rax, rdx, rcx
+	  tsc     = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo) & 0xffffffffUL);
+	  tsc    *= gtod->mult;
+	  tsc   >>= gtod->shift;
+	  ts->tv_sec  = __iter_div_u64_rem(tsc, NSEC_PER_SEC, &ns);
+	  ts->tv_nsec = ns;
+	  return VCLOCK_TSC;
+        }
+        return VCLOCK_NONE;
+}
+
 notrace static void do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
@@ -277,6 +299,10 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 		if (do_monotonic(ts) == VCLOCK_NONE)
 			goto fallback;
 		break;
+	case CLOCK_MONOTONIC_RAW:
+		if (do_monotonic_raw(ts) == VCLOCK_NONE)
+			goto fallback;
+		break;
 	case CLOCK_REALTIME_COARSE:
 		do_realtime_coarse(ts);
 		break;
@@ -326,3 +352,25 @@ notrace time_t __vdso_time(time_t *t)
 }
 time_t time(time_t *t)
 	__attribute__((weak, alias("__vdso_time")));
+
+
+struct linux_timestamp_conversion
+{ u32  mult;
+  u32  shift;
+};
+
+extern
+  const struct linux_timestamp_conversion *
+  __vdso_linux_tsc_calibration(void);
+
+notrace
+  const struct linux_timestamp_conversion *
+  __vdso_linux_tsc_calibration(void)
+{ if( gtod->vclock_mode == VCLOCK_TSC )
+    return ((struct linux_timestamp_conversion*) &gtod->mult);
+  return 0UL;
+}
+
+const struct linux_timestamp_conversion *
+  linux_tsc_calibration(void) __attribute((weak, alias("__vdso_linux_tsc_calibration")));
+
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index d3a2dce..41a2ca5 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -24,7 +24,9 @@ VERSION {
 		getcpu;
 		__vdso_getcpu;
 		time;
-		__vdso_time;
+	        __vdso_time;
+                linux_tsc_calibration;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 422764a..d53bd73 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,7 +25,8 @@ VERSION
 	global:
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
-		__vdso_time;
+	        __vdso_time;
+                __vdso_linux_tsc_calibration;
 	};

 	LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
index 05cd1c5..fb13b16 100644
--- a/arch/x86/entry/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -20,7 +20,8 @@ VERSION {
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_getcpu;
-		__vdso_time;
+	        __vdso_time;
+                __vdso_linux_tsc_calibration;
 	local: *;
 	};
 }


: END PATCH

This patch is Attachment #274527 to Bug #198961 :
https://bugzilla.kernel.org/attachment.cgi?id=274527&action=diff

and here is an example of its usage, which #includes (or must be linked
with an object compiled from) tools/testing/selftests/vDSO/parse_vdso.c :

BEGIN EXAMPLE

#include <time.h>
#include <sys/auxv.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>

#include ".../path_to_kernel/tools/testing/selftests/vDSO/parse_vdso.c"

typedef
struct lnx_tsc_calibration_s
{ unsigned int mult, shift;
} LnxTSCCalibration_t;

void clock_get_time_raw( struct timespec *ts, const LnxTSCCalibration_t *tsc_cal,
                         unsigned long *last_tsc, unsigned long *last_sec )
{ volatile unsigned int  tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs generated for 64-bit as for 32-bit builds
  register unsigned long tsc=0;
  asm volatile
  ( "rdtscp"
  : "=a" (tsc_lo)
  , "=d" (tsc_hi)
  , "=c" (tsc_cpu)
  ); // : eax, edx, ecx used - NOT rax, rdx, rcx
  tsc     = ((((unsigned long)tsc_hi) & 0xffffffffUL) << 32) | (((unsigned long)tsc_lo) & 0xffffffffUL);
  tsc    *= tsc_cal->mult;
  tsc   >>= tsc_cal->shift;
  if ( last_tsc && *last_tsc && last_sec )
  { // nanosecond offset of 'tsc' within the last recorded second:
    register unsigned long ns_in_sec = tsc - (*last_sec * 1000000000UL);
    if ( ns_in_sec > 999999999UL )
    { ts->tv_sec  = tsc / 1000000000;
      ts->tv_nsec = tsc % 1000000000;
    }else
    { ts->tv_sec  = *last_sec;
      ts->tv_nsec = ns_in_sec;
    }
    *last_tsc = tsc;
    *last_sec = ts->tv_sec;
  }else
  { ts->tv_sec  = tsc / 1000000000;
    ts->tv_nsec = tsc % 1000000000;
  }
}

#ifndef N_SAMPLES
#define N_SAMPLES 100
#endif

int main( int argc, const char *const* argv , const char **const envp )
{
  register unsigned long sysinfo_ehdr = getauxval( AT_SYSINFO_EHDR );
  if( 0 == sysinfo_ehdr )
  { fprintf(stderr,"getauxval failed: %d : '%s'.\n", errno, strerror(errno));
    return 1;
  }
  vdso_init_from_sysinfo_ehdr( sysinfo_ehdr );
  if ( ! vdso_info.valid )
  { fprintf(stderr,"vdso_init_from_sysinfo_ehdr failed\n");
    return 1;
  }
  const struct lnx_tsc_calibration_s*
    (*linux_tsc_cal)(void) = vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
  if( linux_tsc_cal == 0UL )
  { fprintf(stderr,"vdso_sym failed\n");
    return 1;
  }
  const struct lnx_tsc_calibration_s *clock_source = (*linux_tsc_cal)();
  fprintf(stderr,"Got TSC calibration @ %p: mult: %u shift: %u\n",
          (void*)clock_source, clock_source->mult, clock_source->shift
         );
#define TS2NS(_TS_) ((((unsigned long long)(_TS_).tv_sec)*1000000000ULL) + (((unsigned long long)((_TS_).tv_nsec))))
  struct timespec t_s;
  unsigned long last_tsc=0, last_seconds=0;
  clock_get_time_raw( &t_s, clock_source, &last_tsc, &last_seconds );
  unsigned long long
    sample [ N_SAMPLES ]
  , t1, t2
  , t_start = TS2NS(t_s);
  unsigned int s=0;
  do
  { clock_get_time_raw( &t_s, clock_source, &last_tsc, &last_seconds);
    t1 = TS2NS(t_s);
    clock_get_time_raw( &t_s, clock_source, &last_tsc, &last_seconds);
    t2 = TS2NS(t_s);
    sample [ s ] = t2 - t1;
  }while(++s < N_SAMPLES);
  unsigned long long sum = 0;
  for(s = 0; s < N_SAMPLES; s+=1)
    sum += sample[s];
  fprintf(stderr,"sum: %llu\n",sum);
  unsigned long long avg_ns = sum / N_SAMPLES;
  t1=(t2 - t_start);
  fprintf(stderr,
	  "Total time: %1.1llu.%9.9lluS - Average Latency: %1.1llu.%9.9lluS\n",
          t1/1000000000,       t1-((t1/1000000000)*1000000000),
          avg_ns/1000000000,   avg_ns-((avg_ns/1000000000)*1000000000)
	  );	
  return 0;
}

: END EXAMPLE

EXAMPLE Usage :
$ gcc -std=gnu11 -o t_vdso_tsc t_vdso_tsc.c
$ ./t_vdso_tsc
Got TSC calibration @ 0x7ffdb9be5098: mult: 5798705 shift: 24
sum: 2222
Total time: 0.000004859S - Average Latency: 0.000000022S

Latencies are typically @ 15 - 30 ns .

That multiplication and shift really doesn't leave very many
significant seconds bits!
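
To quantify that, here is a back-of-envelope check using the mult / shift
printed in the run above (with these values, the 64-bit 'tsc * mult'
product wraps after roughly 18 minutes' worth of TSC ticks):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t mult   = 5798705, shift = 24;  /* values from the run above */
	uint64_t tsc_hz = 2893300000ULL;        /* 2893.300 MHz calibration  */
	/* tsc * mult overflows 64 bits once tsc exceeds: */
	uint64_t max_ticks = UINT64_MAX / mult;
	printf("multiply overflows after %llu ticks = ~%llu s of uptime\n",
	       (unsigned long long) max_ticks,
	       (unsigned long long)(max_ticks / tsc_hz));  /* ~1099 s */
	return 0;
}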

Please, can the VDSO include some similar functionality to NOT always
enter the kernel for CLOCK_MONOTONIC_RAW, and to export a pointer to
the LIVE (kernel-updated) gtod->mult and gtod->shift values somehow.

The documentation states for CLOCK_MONOTONIC_RAW that it is the
same as CLOCK_MONOTONIC except it is NOT subject to NTP adjustments .
This is very far from the case currently, without a patch like the one above.

And the kernel should not restrict user-space programs to only being able
to either measure an NTP-adjusted time value, or a time value difference
of greater than 1000ns with any accuracy, on a modern Intel CPU whose TSC
ticks 2.8 times per nanosecond (picosecond resolution is theoretically
possible).

Please, include something like the above patch in future Linux versions.

Thanks & Best Regards,
Jason Vas Dias <jason.vas.dias@gmail.com>


* Re: [PATCH v4.15.7 1/1] on Intel, VDSO should handle CLOCK_MONOTONIC_RAW and export 'tsc_calibration' pointer
  2018-03-05  2:59 [PATCH v4.15.7 1/1] on Intel, VDSO should handle CLOCK_MONOTONIC_RAW and export 'tsc_calibration' pointer Jason Vas Dias
@ 2018-03-06  9:33 ` Thomas Gleixner
       [not found]   ` <CALyZvKxORRjBurwj7+YV5fwJfwjMVWif+zpjC9AVM3z3WTVDaA@mail.gmail.com>
From: Thomas Gleixner @ 2018-03-06  9:33 UTC (permalink / raw)
  To: Jason Vas Dias; +Cc: x86, LKML, Andi Kleen, Peter Zijlstra

Jason,

On Mon, 5 Mar 2018, Jason Vas Dias wrote:

thanks for providing this. A few formal nits first.

Please read Documentation/process/submitting-patches.rst

Patches need a concise subject line and the subject line wants a prefix, in
this case 'x86/vdso'.

Please don't put anything past the patch. Your delimiters are human
readable, but cannot be handled by tools.

Also please follow the kernel coding style guidelines.

> It also provides a new function in the VDSO :
> 
> struct linux_timestamp_conversion
> { u32  mult;
>   u32  shift;
> };
> extern
>   const struct linux_timestamp_conversion *
>   __vdso_linux_tsc_calibration(void);
> 
> which can be used by user-space rdtsc / rdtscp issuers
> by using code such as in
>    tools/testing/selftests/vDSO/parse_vdso.c
> to call vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration"),
> which returns a pointer to the function in the VDSO, which
> returns the address of the 'mult' field in the vsyscall_gtod_data.

No, that's just wrong. The VDSO data is solely there for the VDSO accessor
functions and not to be exposed to random user space.

> Thus user-space programs can use rdtscp and interpret its return values
> in exactly the same way the kernel would, but without entering the kernel.

The VDSO clock_gettime() functions are providing exactly this mechanism.

>  As pointed out in Bug # 198961 :
>    https://bugzilla.kernel.org/show_bug.cgi?id=198961
>  which contains extra test programs and the full story behind this change,
>  using CLOCK_MONOTONIC_RAW without the patch results in
>  a minimum measurable time (latency) of @ 300 - 700ns because of
>  the syscall used by vdso_fallback_gtod() .
> 
> With the patch, the latency falls to @ 100ns .
> 
> The latency would be @ 16 - 32 ns  if the do_monotonic_raw()
> handler could record its previous TSC value and seconds return value
> somewhere, but since the VDSO has no data region or writable page,
> of course it cannot .

And even if it could, it's not as simple as you want it to be. Clocksources
can change during runtime and without effective protection the values are
just garbage.

> Hence, to enable effective use of TSC by user space programs, Linux must
> provide a way for them to discover the calibration mult and shift values
> the kernel uses for the clock source ; only by doing so can user-space
> get values that are comparable to kernel generated values.

Linux must not do anything. It can provide a vdso implementation of
CLOCK_MONOTONIC_RAW, which does not enter the kernel, but not exposure to
data which is not reliably accessible by random user space code.

> And I'd really like to know: why does the gtod->mult value change ?
> After TSC calibration, it and the shift are calculated to render the
> best approximation of a nanoseconds value from the TSC value.
> 
> The TSC is MEANT to be monotonic and to continue in sleep states
> on modern Intel CPUs . So why does the gtod->mult change ?

You are missing the fact that gtod->mult/shift are used for CLOCK_MONOTONIC
and CLOCK_REALTIME, which are adjusted by NTP/PTP to provide network
synchronized time. That means CLOCK_MONOTONIC is providing accurate
and slope compensated nanoseconds.

The raw TSC conversion, even if it is sane hardware, provides just some
approximation of nanoseconds which can be off by quite a margin.

> But the mult value does change.  Currently there is no way for user-space
> programs to discover that such a change has occurred, or when . With this
> very tiny simple patch, they could know instantly when such changes
> occur, and could implement TSC readers that perform the full conversion
> with latencies of 15-30ns (on my CPU).

No. Accessing the mult/shift pair without protection is racy and can lead
to completely erratic results.
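
Concretely: 'mult' and 'shift' are separate fields, so an unsynchronized
reader can pair a new 'mult' with a stale 'shift' or a stale timekeeper
state. An illustrative interleaving (not real kernel code):

    /* updater (timekeeping update)    reader (unprotected user space)
     * gtod->mult  = new_mult;   (1)
     *                                 m = gtod->mult;   // new, after (1)
     *                                 s = gtod->shift;  // old, before (2)
     * gtod->shift = new_shift;  (2)
     *
     * => ns = (tsc * new_mult) >> old_shift : garbage.
     * The seqcount in vsyscall_gtod_data exists so that readers retry
     * across exactly such update windows. */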

> +notrace static int __always_inline do_monotonic_raw( struct timespec *ts)
> +{
> +        volatile u32 tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs
> generated for 64-bit as for 32-bit builds
> +        u64 ns;
> +        register u64 tsc=0;
> +        if (gtod->vclock_mode == VCLOCK_TSC)
> +        { asm volatile
> +            ( "rdtscp"
> +            : "=a" (tsc_lo)
> +            , "=d" (tsc_hi)
> +            , "=c" (tsc_cpu)
> +            ); // : eax, edx, ecx used - NOT rax, rdx, rcx

If you look at the existing VDSO time getters then you'll notice that
they use accessor functions and not open coded asm constructs.

> +	  tsc     = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo)
> & 0xffffffffUL);
> +	  tsc    *= gtod->mult;
> +	  tsc   >>= gtod->shift;
> +	  ts->tv_sec  = __iter_div_u64_rem(tsc, NSEC_PER_SEC, &ns);
> +	  ts->tv_nsec = ns;

This is horrible. Please look at the kernel side implementation of
clock_gettime(CLOCK_MONOTONIC_RAW). The VDSO side can be implemented in the
same way. All what is required is to expose the relevant information in the
existing vsyscall_gtod_data data structure.
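
For illustration only, a VDSO-side do_monotonic_raw() following that advice
could mirror the existing do_monotonic() seqcount pattern, once the raw
conversion data is exposed; every 'raw'-named field and vgetsns_raw() below
are assumptions about such an extension, not existing 4.15 code:

notrace static int __always_inline do_monotonic_raw(struct timespec *ts)
{
	unsigned long seq;
	u64 ns;
	int mode;

	do {
		seq = gtod_read_begin(gtod);
		mode = gtod->vclock_mode;
		ts->tv_sec = gtod->monotonic_time_raw_sec;  /* assumed new field */
		ns = gtod->monotonic_time_raw_snsec;        /* assumed new field */
		ns += vgetsns_raw(&mode); /* assumed vgetsns() variant using raw_mult */
		ns >>= gtod->raw_shift;                     /* assumed new field */
	} while (unlikely(gtod_read_retry(gtod, seq)));

	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
	ts->tv_nsec = ns;
	return mode;
}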

> +struct linux_timestamp_conversion
> +{ u32  mult;
> +  u32  shift;
> +};

This won't happen because unprotected access is bad. But even if it would go
anywhere defining the structure which should be accessible by user space
code in the C-file is just wrong. If at all it needs to be defined in a
user space exposed header.

> +extern
> +  const struct linux_timestamp_conversion *
> +  __vdso_linux_tsc_calibration(void);
> +
> +notrace
> +  const struct linux_timestamp_conversion *
> +  __vdso_linux_tsc_calibration(void)
> +{ if( gtod->vclock_mode == VCLOCK_TSC )
> +    return ((struct linux_timestamp_conversion*) &gtod->mult);
> +  return 0UL;

This is the most horrible coding style I've seen in a long time. The kernel has
a very well defined coding style.....

> That multiplication and shift really doesn't leave very many
> significant seconds bits!

It's the wrong way to do that.

> Please, can the VDSO include some similar functionality to NOT always
> enter the kernel for CLOCK_MONOTONIC_RAW ,

Yes, this can be done if properly implemented.

> and to export a pointer to the LIVE (kernel updated) gtod->mult and
> gtod->shift values somehow .

No. That's not going to happen.

> The documentation states for CLOCK_MONOTONIC_RAW that it is the
> same as CLOCK_MONOTONIC except it is NOT subject to NTP adjustments .
> This is very far from the case currently, without a patch like the one above.

Errm. The syscall based implementation provides exactly that, so your claim
is just wrong.

> And the kernel should not restrict user-space programs to only being able
> to either measure an NTP adjusted time value, or a time value difference
> of greater than 1000ns with any accuracy, on a modern Intel CPU whose TSC
> ticks 2.8 times per nanosecond (picosecond resolution is theoretically
> possible).

The kernel does not restrict anything. It provides a fast CLOCK_MONOTONIC
time getter in the VDSO and as nobody so far asked for a fast
CLOCK_MONOTONIC_RAW time getter, it just has never been implemented. It can
be done, but it has to be done by following the rules of the VDSO and not
by randomly exposing data.

Thanks,

	tglx


* Fwd: [PATCH v4.15.7 1/1] on Intel, VDSO should handle CLOCK_MONOTONIC_RAW and export 'tsc_calibration' pointer
       [not found]   ` <CALyZvKxORRjBurwj7+YV5fwJfwjMVWif+zpjC9AVM3z3WTVDaA@mail.gmail.com>
@ 2018-03-06 17:13     ` Jason Vas Dias
  2018-03-08 12:57       ` Thomas Gleixner
From: Jason Vas Dias @ 2018-03-06 17:13 UTC (permalink / raw)
  To: x86, linux-kernel, andi, tglx

[-- Attachment #1: Type: text/plain, Size: 17590 bytes --]

On 06/03/2018, Thomas Gleixner <tglx@linutronix.de> wrote:
> Jason,
>
> On Mon, 5 Mar 2018, Jason Vas Dias wrote:
>
> thanks for providing this. A few formal nits first.
>
> Please read Documentation/process/submitting-patches.rst
>
> Patches need a concise subject line and the subject line wants a prefix, in
> this case 'x86/vdso'.
>
> Please don't put anything past the patch. Your delimiters are human
> readable, but cannot be handled by tools.
>
> Also please follow the kernel coding style guidelines.
>
>> It also provides a new function in the VDSO :
>>
>> struct linux_timestamp_conversion
>> { u32 mult;
>> u32 shift;
>> };
>> extern
>> const struct linux_timestamp_conversion *
>> __vdso_linux_tsc_calibration(void);
>>
>> which can be used by user-space rdtsc / rdtscp issuers
>> by using code such as in
>> tools/testing/selftests/vDSO/parse_vdso.c
>> to call vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration"),
>> which returns a pointer to the function in the VDSO, which
>> returns the address of the 'mult' field in the vsyscall_gtod_data.
>
> No, that's just wrong. The VDSO data is solely there for the VDSO accessor
> functions and not to be exposed to random user space.
>
>> Thus user-space programs can use rdtscp and interpret its return values
>> in exactly the same way the kernel would, but without entering the
>> kernel.
>
> The VDSO clock_gettime() functions are providing exactly this mechanism.
>
>> As pointed out in Bug # 198961 :
>> https://bugzilla.kernel.org/show_bug.cgi?id=198961
>> which contains extra test programs and the full story behind this
>> change,
>> using CLOCK_MONOTONIC_RAW without the patch results in
>> a minimum measurable time (latency) of @ 300 - 700ns because of
>> the syscall used by vdso_fallback_gtod() .
>>
>> With the patch, the latency falls to @ 100ns .
>>
>> The latency would be @ 16 - 32 ns if the do_monotonic_raw()
>> handler could record its previous TSC value and seconds return value
>> somewhere, but since the VDSO has no data region or writable page,
>> of course it cannot .
>
> And even if it could, it's not as simple as you want it to be. Clocksources
> can change during runtime and without effective protection the values are
> just garbage.
>
>> Hence, to enable effective use of TSC by user space programs, Linux must
>> provide a way for them to discover the calibration mult and shift values
>> the kernel uses for the clock source ; only by doing so can user-space
>> get values that are comparable to kernel generated values.
>
> Linux must not do anything. It can provide a vdso implementation of
> CLOCK_MONOTONIC_RAW, which does not enter the kernel, but not exposure to
> data which is not reliably accessible by random user space code.
>
>> And I'd really like to know: why does the gtod->mult value change ?
>> After TSC calibration, it and the shift are calculated to render the
>> best approximation of a nanoseconds value from the TSC value.
>>
>> The TSC is MEANT to be monotonic and to continue in sleep states
>> on modern Intel CPUs . So why does the gtod->mult change ?
>
> You are missing the fact that gtod->mult/shift are used for CLOCK_MONOTONIC
> and CLOCK_REALTIME, which are adjusted by NTP/PTP to provide network
> synchronized time. That means CLOCK_MONOTONIC is providing accurate
> and slope compensated nanoseconds.
>
> The raw TSC conversion, even if it is sane hardware, provides just some
> approximation of nanoseconds which can be off by quite a margin.
>
>> But the mult value does change. Currently there is no way for user-space
>> programs to discover that such a change has occurred, or when . With this
>> very tiny simple patch, they could know instantly when such changes
>> occur, and could implement TSC readers that perform the full conversion
>> with latencies of 15-30ns (on my CPU).
>
> No. Accessing the mult/shift pair without protection is racy and can lead
> to completely erratic results.
>
>> +notrace static int __always_inline do_monotonic_raw( struct timespec
>> *ts)
>> +{
>> + volatile u32 tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs
>> generated for 64-bit as for 32-bit builds
>> + u64 ns;
>> + register u64 tsc=0;
>> + if (gtod->vclock_mode == VCLOCK_TSC)
>> + { asm volatile
>> + ( "rdtscp"
>> + : "=a" (tsc_lo)
>> + , "=d" (tsc_hi)
>> + , "=c" (tsc_cpu)
>> + ); // : eax, edx, ecx used - NOT rax, rdx, rcx
>
> If you look at the existing VDSO time getters then you'll notice that
> they use accessor functions and not open coded asm constructs.
>
>> +	 tsc = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo)
>> & 0xffffffffUL);
>> +	 tsc *= gtod->mult;
>> +	 tsc >>= gtod->shift;
>> +	 ts->tv_sec = __iter_div_u64_rem(tsc, NSEC_PER_SEC, &ns);
>> +	 ts->tv_nsec = ns;
>
> This is horrible. Please look at the kernel side implementation of
> clock_gettime(CLOCK_MONOTONIC_RAW). The VDSO side can be implemented in the
> same way. All what is required is to expose the relevant information in the
> existing vsyscall_gtod_data data structure.
>
>> +struct linux_timestamp_conversion
>> +{ u32 mult;
>> + u32 shift;
>> +};
>
> This won't happen because unprotected access is bad. But even if it would go
> anywhere defining the structure which should be accessible by user space
> code in the C-file is just wrong. If at all it needs to be defined in a
> user space exposed header.
>
>> +extern
>> + const struct linux_timestamp_conversion *
>> + __vdso_linux_tsc_calibration(void);
>> +
>> +notrace
>> + const struct linux_timestamp_conversion *
>> + __vdso_linux_tsc_calibration(void)
>> +{ if( gtod->vclock_mode == VCLOCK_TSC )
>> + return ((struct linux_timestamp_conversion*) &gtod->mult);
>> + return 0UL;
>
> This is the most horrible coding style I've seen in a long time. The kernel has
> a very well defined coding style.....
>
>> That multiplication and shift really doesn't leave very many
>> significant seconds bits!
>
> It's the wrong way to do that.
>
>> Please, can the VDSO include some similar functionality to NOT always
>> enter the kernel for CLOCK_MONOTONIC_RAW ,
>
> Yes, this can be done if properly implemented.
>
>> and to export a pointer to the LIVE (kernel updated) gtod->mult and
>> gtod->shift values somehow .
>
> No. That's not going to happen.
>
>> The documentation states for CLOCK_MONOTONIC_RAW that it is the
>> same as CLOCK_MONOTONIC except it is NOT subject to NTP adjustments .
>> This is very far from the case currently, without a patch like the one
>> above.
>
> Errm. The syscall based implementation provides exactly that, so your claim
> is just wrong.
>
>> And the kernel should not restrict user-space programs to only being able
>> to either measure an NTP adjusted time value, or a time value difference
>> of greater than 1000ns with any accuracy, on a modern Intel CPU whose TSC
>> ticks 2.8 times per nanosecond (picosecond resolution is theoretically
>> possible).
>
> The kernel does not restrict anything. It provides a fast CLOCK_MONOTONIC
> time getter in the VDSO and as nobody so far asked for a fast
> CLOCK_MONOTONIC_RAW time getter, it just has never been implemented. It can
> be done, but it has to be done by following the rules of the VDSO and not
> by randomly exposing data.
>
> Thanks,
>
> 	tglx
>


Thank you very much for your guidance, Thomas.

I will prepare a new patch that meets submission + coding style guidelines
and does not expose pointers within the vsyscall_gtod_data region to
user-space code - but I don't really understand why not, since only the
gtod->mult value will change as long as the clocksource remains TSC, and
updates to it by the kernel are atomic and partial values cannot be read.

The code in the patch reverts to old behavior for clocks which are not the
TSC and provides a way for users to determine if the clock is still the TSC
by calling '__vdso_linux_tsc_calibration()', which would return NULL if
the clock is not the TSC.

I have never seen Linux on a modern Intel box spontaneously decide to
switch from the TSC clocksource after calibration succeeds and
it has decided to use the TSC as the system / platform clock source -
what would make it do this?

But for the highly controlled systems I am doing performance testing on,
I can guarantee that the clocksource does not change.

There is no way user code can write those pointers or do anything other
than read them, so I see no harm in exposing them to user-space ; then
user-space programs can issue rdtscp and use the same calibration values
as the kernel, and use some cached 'previous timespec value' to avoid
doing the long division every time.

If the shift & mult are not accurate TSC calibration values, then the
kernel should put other more accurate calibration values in the gtod .

RE:
> Please look at the kernel side implementation of
> clock_gettime(CLOCK_MONOTONIC_RAW).
> The VDSO side can be implemented in the
> same way.
> All what is required is to expose the relevant information in the
> existing vsyscall_gtod_data data structure.

I agree - that is my point entirely , & what I was trying to do .

The code in the kernel that handles the clock_gettime
CLOCK_MONOTONIC_RAW syscall, when the clock source is TSC,
does in fact seem to use the mult & shift values in the same way :

AFAICS, the call chain for clock_gettime(CLOCK_MONOTONIC_RAW, &ts)
is :

kernel/time/posix-timers.c , @ line 233

static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
	getrawmonotonic64(tp);
	return 0;
}

...
@line 1058:
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
		struct timespec __user *,tp)
{
	const struct k_clock *kc = clockid_to_kclock(which_clock);
	...
	error = kc->clock_get(which_clock, &kernel_tp);
	if (!error && put_timespec64(&kernel_tp, tp))
 ...
}
...
static const struct k_clock clock_monotonic_raw = {
	.clock_getres		= posix_get_hrtimer_res,
	.clock_get		= posix_get_monotonic_raw,
};
...

getrawmonotonic64() is in kernel/time/timekeeping.c , @ line 1418:

void getrawmonotonic64(struct timespec64 *ts)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned long seq;
	u64 nsecs;

	do {
		seq = read_seqcount_begin(&tk_core.seq);
#                       ^-- I think this is the source of the locking
#                            and the very long latencies !
#
		ts->tv_sec = tk->raw_sec;
		nsecs = timekeeping_get_ns(&tk->tkr_raw);

	} while (read_seqcount_retry(&tk_core.seq, seq));

	ts->tv_nsec = 0;
	timespec64_add_ns(ts, nsecs);
}

this uses, from an earlier part of timekeeping.c , @ line 346:

static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, u64 delta)
{
	u64 nsec;

	nsec = delta * tkr->mult + tkr->xtime_nsec;
	nsec >>= tkr->shift;

	/* If arch requires, add in get_arch_timeoffset() */
	return nsec + arch_gettimeoffset();
}

static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
{
	u64 cycle_now, delta;

	/* read clocksource */
	cycle_now = tk_clock_read(tkr);

	/* calculate the delta since the last update_wall_time */
	delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);

	return delta;
}

static inline u64 timekeeping_get_ns(struct tk_read_base *tkr)
{
	u64 delta;

	delta = timekeeping_get_delta(tkr);
	return timekeeping_delta_to_ns(tkr, delta);
}


So the kernel is basically doing, in timekeeping_delta_to_ns :

        nsec_delta = delta * tkr->mult;
        nsec_delta >>= tkr->shift;
        add_nsec_delta_to_previous_value(previous_value, nsec_delta);



So in fact, when the clock source is TSC, the value recorded in 'ts'
by clock_gettime(CLOCK_MONOTONIC_RAW, &ts) is very similar to
      u64 tsc = rdtscp();
      tsc *= gtod->mult;
      tsc >>= gtod->shift;
      ts.tv_sec=tsc / NSEC_PER_SEC;
      ts.tv_nsec=tsc % NSEC_PER_SEC;

which is the algorithm I was using in the VDSO fast TSC reader,
do_monotonic_raw() .

Please, suggest a better algorithm using available data in the
gtod, and I'll use it.

The problem with doing anything more in the VDSO is that there
is of course nowhere in the VDSO to store any data, as it has
no data section or writable pages . So some kind of writable
page would need to be added to the vdso , complicating its
vdso/vma.c, etc., which is not desirable.

But it is equally undesirable for the clock_gettime system call to have
a latency exceeding 300ns for CLOCK_MONOTONIC_RAW clocks, or for readings
to include all the adjustments & previous-value dependencies that the
current implementation is prone to.

Perhaps I should instead use the calibrated TSC frequency and just do
a multiply divide ?

ie. the calibrated TSC frequency is 2893.300 MHz , so each TSC tick
has a period of 1/2,893,300,000 seconds, i.e. 1 / 2.8933 nanoseconds,
or about 0.34562 ns (345.62 picoseconds).
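
For concreteness, that per-tick period in picoseconds follows directly
from the calibrated frequency (a sketch; the tsc_khz value is the one
quoted above):

       unsigned long tsc_khz     = 2893300UL;             /* calibrated TSC frequency, in kHz */
       unsigned long pico_period = 1000000000UL / tsc_khz; /* ps per tick: ~345 */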

So, if that number was communicated somehow to the VDSO, it could do:

 do_monotonic_raw() { ...
       u64 tsc = rdtscp();
       /* pico_period: period of the clock in picoseconds */
       tsc = ((tsc * gtod->pico_period) / 1000); /* ticks * ps-per-tick / 1000 = ns */
       ts.tv_sec=tsc / NSEC_PER_SEC;
       ts.tv_nsec=tsc % NSEC_PER_SEC;
}

But there is no other communication of TSC calibration data currently in
the gtod other than 'shift' and 'mult', which do actually give results
which correlate approximately to the pico_period conversion above.

Attempts to change the vsyscall_gtod_data structure in any way have resulted
in nasty core dumps in processes using clock_gettime().
I'd much appreciate any tips on how to change its layout without breaking
everything.

One currently needs to parse the system log messages to get the
'Refined TSC clocksource calibration', or look up its value in the
live kernel by finding the 'tsc_khz' symbol at the address given in
System.map with gdb / objdump, converting that to an offset into the
data region, and using lseek() on /proc/kcore (as root).

The point is, user-space must rely on the kernel to perform
TSC calibration properly, and the only data in the gtod that
reflects the result of that calibration are the mult and shift values.
If linux wants to support user-space TSC readers, it needs to communicate
the TSC calibration somehow in the VDSO gtod area.

I've been getting good low-latency time values using my current patch under
both Linux 4.15.7 and 3.10.0-693.17.1.el7.jvd - I've attached a log of 100
consecutive calls and a printout of the sum and averages :

#define N_SAMPLES 100

  unsigned int cnt=N_SAMPLES, s=0;
  do
    clock_gettime( CLOCK_MONOTONIC_RAW, &sample_ts[s++] );
  while(--cnt);
#define TS2NS(_TS_) ((((unsigned long long)(_TS_).tv_sec)*1000000000ULL) + (((unsigned long long)((_TS_).tv_nsec))))
  unsigned long long sum = 0;
  std::cerr <<  sample_ts[0].tv_sec << " " << sample_ts[0].tv_nsec << std::endl;
  for( s=1; s< N_SAMPLES; s+=1)
  { sum += TS2NS(sample_ts[s]) - TS2NS(sample_ts[s-1]);
    std::cerr <<  sample_ts[s].tv_sec << " " << sample_ts[s].tv_nsec << std::endl;
  }
  std::cerr << "Sum: " << sum  << " avg: " << (sum/N_SAMPLES) << std::endl;


I enclose a log of the above program's output (tts.log) .


The printouts of the timespec structure values show monotonically increasing
values with very similar deltas (@ 16-20ns in this case), eg:
       871 255831891
       871 255831911
       871 255831928
       871 255831949
       871 255831966
       871 255831986
       871 255832004
and end with a low latency average (minimum measurable time):
       Sum: 1920 avg: 19

The deltas for clock_gettime(CLOCK_MONOTONIC_RAW) under a kernel
without the patch are @ 300-700ns  :

$ ./timer
sum: 69234
Total time: 0.000069234S - Average Latency: 0.000000692S

The timer program source is also attached, as are the patches
for linux-4.15.7 and 3.10.0 I am testing with .

I've also attached the user-space tsc-reading
  'clock_get_time_raw()' function (in test_vdso_tsc.c) ,
which illustrates user-space use of the TSC calibration
shift and mult in the gtod, and a way to avoid long division
every time, which achieves much lower latencies than
the VDSO do_monotonic_raw() function .


Sorry, but I urgently need to use the TSC from user-space to
obtain nanosecond resolution unadjusted timer values,
with latencies of less than 30ns,
which are comparable to those returned by linux , and
this appears to be the only way to do it .

Please, if there is a better way to read the TSC and convert
its value in the same way linux does, using its calibration, and
rendering comparable results, but in user-space, let me know
and I'll  use it .

And please consider SOME patch to the VDSO to implement
user-space TSC reading on the intel / AMD , to return a completely
unadjusted value for CLOCK_MONOTONIC_RAW, as
documented, but not as Linux currently does.

BTW, reading the Documentation/process/submitting-patches.rst file:
       6) No MIME, no links, no compression, no attachments.  Just plain text
       ...
       For this reason, all patches should be submitted by e-mail "inline".
       Be wary of your editor's word-wrap corrupting your patch,
       if you choose to cut-n-paste your patch.
       Do not attach the patch as a MIME attachment, compressed or not.

So I'm a bit confused as to how to attach patches.
How do you attach a patch without mime ?
Where is the inline patch submission format defined ?
BTW, it is not my 'editor's word-wrap corrupting your patch', it is the
Gmail IMAP server, which believes all lines in plain-text messages
must be wrapped to 72 characters max, and enforces this belief,
correct or not. I think kernel.org needs to update its tools to parse
MIME attachments or handle email-server line-wrapping, which I have
no control over - I wish I could move from the gmail server, but it would
be too much trouble.

Thanks & Best Regards,
Jason

[-- Attachment #2: tts.log --]
[-- Type: text/plain, Size: 1522 bytes --]

High Resolution TimeStamps Enabled - Got TSC calibration @ 0xffffffffff5ff0a0: mult: 4945757 shift: 24
871 255831236
871 255831281
871 255831319
871 255831347
871 255831365
871 255831383
871 255831401
871 255831420
871 255831438
871 255831458
871 255831476
871 255831496
871 255831513
871 255831533
871 255831551
871 255831571
871 255831589
871 255831609
871 255831627
871 255831647
871 255831664
871 255831684
871 255831702
871 255831722
871 255831740
871 255831760
871 255831778
871 255831798
871 255831815
871 255831835
871 255831853
871 255831873
871 255831891
871 255831911
871 255831928
871 255831949
871 255831966
871 255831986
871 255832004
871 255832024
871 255832042
871 255832062
871 255832079
871 255832099
871 255832117
871 255832137
871 255832155
871 255832175
871 255832193
871 255832213
871 255832230
871 255832250
871 255832268
871 255832288
871 255832306
871 255832326
871 255832344
871 255832364
871 255832381
871 255832401
871 255832419
871 255832439
871 255832457
871 255832477
871 255832494
871 255832515
871 255832532
871 255832552
871 255832570
871 255832590
871 255832608
871 255832628
871 255832645
871 255832665
871 255832683
871 255832703
871 255832721
871 255832741
871 255832759
871 255832779
871 255832796
871 255832816
871 255832834
871 255832854
871 255832872
871 255832892
871 255832910
871 255832930
871 255832947
871 255832967
871 255832985
871 255833005
871 255833023
871 255833043
871 255833060
871 255833081
871 255833098
871 255833118
871 255833136
871 255833156
Sum: 1920 avg: 19


[-- Attachment #3: timer.c --]
[-- Type: text/x-csrc, Size: 1536 bytes --]

/* 
 * Program to measure high-res timer latency.
 *
 */
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <unistd.h>
#include <time.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>

#ifndef N_SAMPLES
#define N_SAMPLES 100
#endif


int main(int argc, char *const* argv, char *const* envp)
{ clockid_t clk = CLOCK_MONOTONIC_RAW;
  if((argc > 1) && (argv[1] != NULL) && (0 == strcmp(argv[1],"-m")))
    clk = CLOCK_MONOTONIC;  
  struct timespec sample[N_SAMPLES+1];
  unsigned int cnt=N_SAMPLES, s=0 ;
  do
  { if( 0 != clock_gettime(clk, &sample[s++]) )  /* use the clock selected via -m */
    { fprintf(stderr,"oops, clock_gettime() failed: %d: '%s'.\n", errno, strerror(errno));
      return 1;
    }
  }while( --cnt );
  clock_gettime(clk, &sample[s]);
#define TS2NS(_TS_) ((((unsigned long long)(_TS_).tv_sec)*1000000000ULL) + (((unsigned long long)((_TS_).tv_nsec)))) 
  unsigned long long
    deltas [ N_SAMPLES ]
  , t1, t2, sum=0
  , t_start = TS2NS(sample[0]);
  for(s=1; s < (N_SAMPLES+1); s+=1)
  { t1 = TS2NS(sample[s-1]);
    t2 = TS2NS(sample[s]);
    deltas[s-1]=t2-t1;
  }  
  for(s = 0; s < N_SAMPLES; s+=1)
    sum += deltas[s];
  fprintf(stderr,"sum: %llu\n",sum);
  unsigned long long avg_ns = sum / N_SAMPLES;
  t1=(t2 - t_start);
  fprintf(stderr,
	  "Total time: %1.1llu.%9.9lluS - Average Latency: %1.1llu.%9.9lluS\n",
          t1/1000000000,       t1-((t1/1000000000)*1000000000),
          avg_ns/1000000000,   avg_ns-((avg_ns/1000000000)*1000000000)   
	  );	  
  return 0;
}


[-- Attachment #4: vdso-vclock_gettime-4.15.7.patch --]
[-- Type: application/octet-stream, Size: 3366 bytes --]

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d..63f5f18 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -246,6 +246,28 @@ notrace static int __always_inline do_monotonic(struct timespec *ts)
 	return mode;
 }
 
+notrace static int __always_inline do_monotonic_raw( struct timespec *ts)
+{       
+        volatile u32 tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs generated for 64-bit as for 32-bit builds
+        u64 ns;
+        register u64 tsc=0;
+        if (gtod->vclock_mode == VCLOCK_TSC)
+        { asm volatile
+            ( "rdtscp"
+            : "=a" (tsc_lo)
+            , "=d" (tsc_hi)
+            , "=c" (tsc_cpu)
+            ); // : eax, edx, ecx used - NOT rax, rdx, rcx
+	  tsc     = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo) & 0xffffffffUL);
+	  tsc    *= gtod->mult;
+	  tsc   >>= gtod->shift;
+	  ts->tv_sec  = __iter_div_u64_rem(tsc, NSEC_PER_SEC, &ns);            
+	  ts->tv_nsec = ns;          
+	  return VCLOCK_TSC;
+        }
+        return VCLOCK_NONE;
+}
+
 notrace static void do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
@@ -277,6 +299,10 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 		if (do_monotonic(ts) == VCLOCK_NONE)
 			goto fallback;
 		break;
+	case CLOCK_MONOTONIC_RAW:
+		if (do_monotonic_raw(ts) == VCLOCK_NONE)
+			goto fallback;
+		break;
 	case CLOCK_REALTIME_COARSE:
 		do_realtime_coarse(ts);
 		break;
@@ -326,3 +352,25 @@ notrace time_t __vdso_time(time_t *t)
 }
 time_t time(time_t *t)
 	__attribute__((weak, alias("__vdso_time")));
+
+
+struct linux_timestamp_conversion
+{ u32  mult;
+  u32  shift;
+};
+
+extern
+  const struct linux_timestamp_conversion *
+  __vdso_linux_tsc_calibration(void);
+
+notrace
+  const struct linux_timestamp_conversion *
+  __vdso_linux_tsc_calibration(void)
+{ if( gtod->vclock_mode == VCLOCK_TSC ) 
+    return ((struct linux_timestamp_conversion*) &gtod->mult);
+  return 0UL;
+}
+
+const struct linux_timestamp_conversion *
+  linux_tsc_calibration(void) __attribute((weak, alias("__vdso_linux_tsc_calibration")));
+
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index d3a2dce..41a2ca5 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -24,7 +24,9 @@ VERSION {
 		getcpu;
 		__vdso_getcpu;
 		time;
-		__vdso_time;
+	        __vdso_time;
+                linux_tsc_calibration;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 422764a..d53bd73 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,7 +25,8 @@ VERSION
 	global:
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
-		__vdso_time;
+	        __vdso_time;
+                __vdso_linux_tsc_calibration;
 	};
 
 	LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
index 05cd1c5..fb13b16 100644
--- a/arch/x86/entry/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -20,7 +20,8 @@ VERSION {
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_getcpu;
-		__vdso_time;
+	        __vdso_time;
+                __vdso_linux_tsc_calibration;
 	local: *;
 	};
 }

[-- Attachment #5: linux-3.10.0-693.17.1.el7.jvd.vdso_vclock_gettime.patch --]
[-- Type: application/octet-stream, Size: 3709 bytes --]

diff -up kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vclock_gettime.c.3.10.0-693.17.1.el7 kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vclock_gettime.c
--- kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vclock_gettime.c.3.10.0-693.17.1.el7	2018-01-14 15:02:54.000000000 +0000
+++ kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vclock_gettime.c	2018-03-05 13:21:56.838784217 +0000
@@ -200,6 +200,28 @@ notrace static int do_monotonic(struct t
 	return mode;
 }
 
+notrace static int do_monotonic_raw( struct timespec *ts)
+{       
+        volatile u32 tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs generated for 64-bit as for 32-bit builds
+        u64 ns;
+        register u64 tsc=0;
+        if (gtod->clock.vclock_mode == VCLOCK_TSC)
+        { asm volatile
+            ( "rdtscp"
+            : "=a" (tsc_lo)
+            , "=d" (tsc_hi)
+            , "=c" (tsc_cpu)
+            ); // : eax, edx, ecx used - NOT rax, rdx, rcx
+	  tsc     = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo) & 0xffffffffUL);
+	  tsc    *= gtod->clock.mult;
+	  tsc   >>= gtod->clock.shift;
+	  ts->tv_sec  = __iter_div_u64_rem(tsc, NSEC_PER_SEC, &ns);            
+	  ts->tv_nsec = ns;          
+	  return VCLOCK_TSC;
+        }
+        return VCLOCK_NONE;
+}
+
 notrace static int do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
@@ -234,6 +256,9 @@ notrace int __vdso_clock_gettime(clockid
 	case CLOCK_MONOTONIC:
 		ret = do_monotonic(ts);
 		break;
+	case CLOCK_MONOTONIC_RAW:
+		ret = do_monotonic_raw(ts);
+		break;
 	case CLOCK_REALTIME_COARSE:
 		return do_realtime_coarse(ts);
 	case CLOCK_MONOTONIC_COARSE:
@@ -286,3 +311,24 @@ notrace time_t __vdso_time(time_t *t)
 }
 int time(time_t *t)
 	__attribute__((weak, alias("__vdso_time")));
+
+struct linux_timestamp_conversion
+{ u32  mult;
+  u32  shift;
+};
+
+extern
+  const struct linux_timestamp_conversion *
+  __vdso_linux_tsc_calibration(void);
+
+notrace
+  const struct linux_timestamp_conversion *
+  __vdso_linux_tsc_calibration(void)
+{ if( gtod->clock.vclock_mode == VCLOCK_TSC ) 
+    return ((struct linux_timestamp_conversion*) &gtod->clock.mult);
+  return 0UL;
+}
+
+const struct linux_timestamp_conversion *
+  linux_tsc_calibration(void) __attribute((weak, alias("__vdso_linux_tsc_calibration")));
+
diff -up kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vdso.lds.S.3.10.0-693.17.1.el7 kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vdso.lds.S
--- kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vdso.lds.S.3.10.0-693.17.1.el7	2018-01-14 15:02:54.000000000 +0000
+++ kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vdso.lds.S	2018-03-05 13:22:22.189846489 +0000
@@ -23,8 +23,10 @@ VERSION {
 		__vdso_gettimeofday;
 		getcpu;
 		__vdso_getcpu;
-		time;
+	        time;
 		__vdso_time;
+		linux_tsc_calibration;
+	 	__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff -up kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vdsox32.lds.S.3.10.0-693.17.1.el7 kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vdsox32.lds.S
--- kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vdsox32.lds.S.3.10.0-693.17.1.el7	2018-01-14 15:02:54.000000000 +0000
+++ kernel-3.10.0-693.17.1.el7/linux-3.10.0-693.17.1.el7.x86_64/arch/x86/vdso/vdsox32.lds.S	2018-03-05 13:22:47.452196031 +0000
@@ -21,6 +21,7 @@ VERSION {
 		__vdso_gettimeofday;
 		__vdso_getcpu;
 		__vdso_time;
+	        __vdso_linux_tsc_calibration;	
 	local: *;
 	};
 }

[-- Attachment #6: test_vdso_tsc.c --]
[-- Type: text/x-csrc, Size: 3889 bytes --]

/* 
 * this is a copy of 
 *  tools/testing/selftests/vDSO/parse_vdso.c
 * from the Linux kernel tree, with a little main() function at the end
 * which uses the new '__vdso_linux_tsc_calibration' function to get a pointer to
 * the kernel maintained clocksource mult and shift values in the vsyscall_gtod_data : 
 */
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>
#include <time.h>
#include <sys/auxv.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>

#ifndef  PATH_TO_PARSE_VDSO_C
#error   Please set compilation flag -DPATH_TO_PARSE_VDSO_C="..." .
#endif

#include PATH_TO_PARSE_VDSO_C
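
/* e.g. (the path below is hypothetical - point it at your kernel tree):
 *   gcc -DPATH_TO_PARSE_VDSO_C='"/path/to/linux/tools/testing/selftests/vDSO/parse_vdso.c"' \
 *       -o test_vdso_tsc test_vdso_tsc.c
 */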

typedef
struct lnx_tsc_calibration_s
{ unsigned int mult, shift;
} LnxTSCCalibration_t;

typedef
struct tsc_buffer_s
{ unsigned long tsc, sec, nsec;
} LnxTSCBuffer_t;

void clock_get_time_raw( register struct timespec *ts , register const LnxTSCCalibration_t * tsc_cal, register LnxTSCBuffer_t *last )
{ volatile unsigned int  tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs generated for 64-bit as for 32-bit builds
  register unsigned long tsc=0; 
  asm volatile
  ( "rdtscp"
  : "=a" (tsc_lo)
  , "=d" (tsc_hi)
  , "=c" (tsc_cpu)
  ); // : eax, edx, ecx used - NOT rax, rdx, rcx
  tsc     = ((((unsigned long)tsc_hi) & 0xffffffffUL) << 32) | (((unsigned long)tsc_lo) & 0xffffffffUL);
  tsc    *= tsc_cal->mult;
  tsc   >>= tsc_cal->shift;
  if ( last && last->tsc )
  { register unsigned long tsc_diff = tsc - last->tsc;
    if ( (tsc_diff + last->nsec) > 999999999UL )
    { ts->tv_sec  = tsc / 1000000000;      
      ts->tv_nsec = tsc % 1000000000;          
    }else
    { ts->tv_sec  = last->sec;
      ts->tv_nsec = tsc_diff + last->nsec;
    }
    last->tsc     = tsc;
    last->sec     = ts->tv_sec;
    last->nsec    = ts->tv_nsec;    
  }else
  { ts->tv_sec    = tsc / 1000000000;      
    ts->tv_nsec   = tsc % 1000000000;
    if(last)
    { last->tsc   = tsc;
      last->sec   = ts->tv_sec;
      last->nsec  = ts->tv_nsec;    
    }
  }
}

#ifndef N_SAMPLES
#define N_SAMPLES 100
#endif

int main( int argc, const char *const* argv , const char **const envp )
{
  register unsigned long sysinfo_ehdr = getauxval( AT_SYSINFO_EHDR );
  if( 0 == sysinfo_ehdr )
  { fprintf(stderr,"getauxval failed: %d : '%s'.\n", errno, strerror(errno));
    return 1;
  }
  vdso_init_from_sysinfo_ehdr( sysinfo_ehdr );
  if ( ! vdso_info.valid )
  { fprintf(stderr,"vdso_init_from_sysinfo_ehdr failed\n");
    return 1;  
  }  
  const struct lnx_tsc_calibration_s*
    (*linux_tsc_cal)(void) = vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
  if( linux_tsc_cal == 0UL )
  { fprintf(stderr,"vdso_sym failed\n");
    return 1;
  }
  const struct lnx_tsc_calibration_s *clock_source = (*linux_tsc_cal)();
  fprintf(stderr,"Got TSC calibration @ %p: mult: %u shift: %u\n",
          (void*)clock_source, clock_source->mult, clock_source->shift
         );
#define TS2NS(_TS_) ((((unsigned long long)(_TS_).tv_sec)*1000000000ULL) + (((unsigned long long)((_TS_).tv_nsec))))     
  struct timespec t_s={0,0};
  LnxTSCBuffer_t  last_tsc = {0,0,0};
  clock_get_time_raw( &t_s, clock_source, &last_tsc);
  unsigned long long
    sample [ N_SAMPLES ]
  , t1, t2
  , t_start = TS2NS(t_s);
  unsigned int s=0;
  do
  { clock_get_time_raw( &t_s, clock_source, &last_tsc);
    t1 = TS2NS(t_s);
    clock_get_time_raw( &t_s, clock_source, &last_tsc);
    t2 = TS2NS(t_s);
    sample [ s ] = t2 - t1;         
  }while(++s < N_SAMPLES);
  unsigned long long sum = 0;
  for(s = 0; s < N_SAMPLES; s+=1)
    sum += sample[s];
  fprintf(stderr,"sum: %llu\n",sum);
  unsigned long long avg_ns = sum / N_SAMPLES;
  t1=(t2 - t_start);
  fprintf(stderr,
	  "Total time: %1.1llu.%9.9lluS - Average Latency: %1.1llu.%9.9lluS\n",
          t1/1000000000,       t1-((t1/1000000000)*1000000000),
          avg_ns/1000000000,   avg_ns-((avg_ns/1000000000)*1000000000)   
	  );	  
  return 0;
}


* Re: Fwd: [PATCH v4.15.7 1/1] on Intel, VDSO should handle CLOCK_MONOTONIC_RAW and export 'tsc_calibration' pointer
  2018-03-06 17:13     ` Fwd: " Jason Vas Dias
@ 2018-03-08 12:57       ` Thomas Gleixner
  2018-03-08 15:16         ` Jason Vas Dias
From: Thomas Gleixner @ 2018-03-08 12:57 UTC (permalink / raw)
  To: Jason Vas Dias; +Cc: x86, linux-kernel, andi

On Tue, 6 Mar 2018, Jason Vas Dias wrote:
> I will prepare a new patch that meets submission + coding style guidelines and
> does not expose pointers within the vsyscall_gtod_data region to
> user-space code -
> but I don't really understand why not, since only the gtod->mult value will
> change as long as the clocksource remains TSC, and updates to it by the kernel
> are atomic and partial values cannot be read .
> 
> The code in the patch reverts to old behavior for clocks which are not the
> TSC and provides a way for users to determine if the  clock is still the TSC
> by calling '__vdso_linux_tsc_calibration()', which would return NULL if
> the clock is not the TSC .
> 
> I have never seen Linux on a modern intel box spontaneously decide to
> switch from the TSC clocksource after calibration succeeds and
> it has decided to use the TSC as the system / platform clock source -
> what would make it do this ?
> 
> But for the highly controlled systems I am doing performance testing on,
> I can guarantee that the clocksource does not change.

We are not writing code for a particular highly controlled system. We
expose functionality which operates under all circumstances. There are
various reasons why TSC can be disabled at runtime, crappy BIOS/SMI,
sockets getting out of sync .....

> There is no way user code can write those pointers or do anything other
> than read them, so I see no harm in exposing them to user-space ; then
> user-space programs can issue rdtscp and use the same calibration values
> as the kernel, and use some cached 'previous timespec value' to avoid
> doing the long division every time.
> 
> If the shift & mult are not accurate TSC calibration values, then the
> kernel should put other more accurate calibration values in the gtod .

The raw calibration values are as accurate as the kernel can make them. But
they can be rather far off from converting to real nanoseconds for various
reasons. The NTP/PTP adjusted conversion is matching real units and is
obviously more accurate.

> > Please look at the kernel side implementation of
> > clock_gettime(CLOCK_MONOTONIC_RAW).
> > The VDSO side can be implemented in the
> > same way.
> > All what is required is to expose the relevant information in the
> > existing vsyscall_gtod_data data structure.
> 
> I agree - that is my point entirely , & what I was trying to do .

Well, you did not expose the raw conversion data in vsyscall_gtod_data. You
are using:

+         tsc    *= gtod->mult;
+         tsc   >>= gtod->shift;

That is the adjusted mult/shift value, which can change when NTP/PTP is
enabled and you _cannot_ use it unprotected.

> void getrawmonotonic64(struct timespec64 *ts)
> {
> 	struct timekeeper *tk = &tk_core.timekeeper;
> 	unsigned long seq;
> 	u64 nsecs;
> 
> 	do {
> 		seq = read_seqcount_begin(&tk_core.seq);
> #                       ^-- I think this is the source of the locking
> #                            and the very long latencies !

This protects tk->raw_sec from changing which would result in random time
stamps. Yes, it can cause slightly larger latencies when the timekeeper is
updated on another CPU concurrently, but that's not the main reason why
this is slower in general than the VDSO functions. The syscall overhead is
there for every invocation and it's substantial.

> So in fact, when the clock source is TSC, the value recorded in 'ts'
> by clock_gettime(CLOCK_MONOTONIC_RAW, &ts) is very similar to
>       u64 tsc = rdtscp();
>       tsc *= gtod->mult;
>       tsc >>= gtod->shift;
>       ts.tv_sec=tsc / NSEC_PER_SEC;
>       ts.tv_nsec=tsc % NSEC_PER_SEC;
> 
> which is the algorithm I was using in the VDSO fast TSC reader,
> do_monotonic_raw() .

Except that you are using the adjusted conversion values and not the raw
ones. So your VDSO implementation of monotonic raw access is just wrong and
not matching the syscall based implementation in any way.

> The problem with doing anything more in the VDSO is that there
> is of course nowhere in the VDSO to store any data, as it has
> no data section or writable pages . So some kind of writable
> page would need to be added to the vdso , complicating its
> vdso/vma.c, etc., which is not desirable.

No, you don't need any writeable memory in the VDSO. Look at how the
CLOCK_MONOTONIC and CLOCK_REALTIME functions are implemented in the VDSO.

> But it is equally not desirable to have the clock_gettime
> system call have a latency exceeding 300ns for CLOCK_MONOTONIC_RAW

I said it before and I'll say it once more: CLOCK_MONOTONIC_RAW can be implemented
in the VDSO by adding the relevant data to vsyscall_gtod_data and by using
the proper accessor functions which are there for a reason.

> clocks, nor to measure all the adjustments & previous value dependencies
> that the current implementation is prone to .

I have no idea what you are talking about. If done right,
CLOCK_MONOTONIC_RAW does not have any adjustments.

> But there is no other communication of TSC calibration data currently
> in the gtod
> other than 'shift' and 'mult', which do actually give results which correlate
> approximately to the pico_period division above.

Even if I'm repeating myself. vsyscall_gtod_data does not have the raw
information and it simply can be added.

> Attempts to change the vsyscall_gtod_data structure in any way have resulted
> in nasty core dumps of clock_gettime() using processes.
> I'd much appreciate any tips on how to change its layout without breaking
> everything.

I have no idea what you have done, but vsyscall_gtod_data is a purely
kernel internal data structure and can be changed as the kernel requires.

> One currently needs to parse the system log messages to get the
> 'Refined TSC clocksource calibration', or lookup its value in the
> live kernel by looking up the 'tsc_khz' symbol at its
> address given in System.map  with gdb / objdump ,
> converting to offset into data region, and using
> lseek() on /proc/kcore (as root).

Groan.

> The point is, user-space must rely on the kernel to perform
> TSC calibration properly, and the only data in the gtod that
> reflects the result of that calibration are the mult and shift values.
> If linux wants to support user-space TSC readers, it needs to communicate
> the TSC calibration somehow in the VDSO gtod area.

I'm getting tired of this slowly. The kernel supports user space TSC access
with CLOCK_MONOTONIC and CLOCK_REALTIME today in the VDSO.
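Nothing special is needed to get the fast path - e.g.:

	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct timespec ts;
		/* serviced entirely in the VDSO - no syscall - when the
		 * clocksource supports it (e.g. TSC): */
		clock_gettime(CLOCK_MONOTONIC, &ts);
		printf("%ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}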

> And please consider SOME patch to the VDSO to implement user-space TSC
> reading on the intel / AMD , to return a completely unadjusted value for
> CLOCK_MONOTONIC_RAW, as documented, but not as Linux currently does.

clock_gettime(CLOCK_MONOTONIC_RAW) returns the raw and unadjusted
nanosecond time from any clocksource (TSC or whatever) today. It does so as
documented.

The only missing piece is a VDSO counterpart which avoids the syscall
overhead.

> BTW, it is not my 'editor's word-wrap corrupting your patch', it is the
> Gmail IMAP server, which believes all lines in plain-text messages
> must be wrapped to 72 characters max, and enforces this belief,
> correct or not.  I think kernel.org needs to update its tools to parse
> mime attachments or handle email server line-wrapping, which I have
> no control over - I wish I could move from the gmail server, but it would
> be too much trouble.

Sorry, a lot of people use gmail for kernel work and I think there is
enough documentation out there how to do that.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: Fwd: [PATCH v4.15.7 1/1] on Intel, VDSO should handle CLOCK_MONOTONIC_RAW and export 'tsc_calibration' pointer
  2018-03-08 12:57       ` Thomas Gleixner
@ 2018-03-08 15:16         ` Jason Vas Dias
  2018-03-08 16:40           ` Thomas Gleixner
  0 siblings, 1 reply; 7+ messages in thread
From: Jason Vas Dias @ 2018-03-08 15:16 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: x86, linux-kernel, andi

[-- Attachment #1: Type: text/plain, Size: 18448 bytes --]

On 08/03/2018, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Tue, 6 Mar 2018, Jason Vas Dias wrote:
>> I will prepare a new patch that meets submission + coding style guidelines
>> and
>> does not expose pointers within the vsyscall_gtod_data region to
>> user-space code -
>> but I don't really understand why not, since only the gtod->mult value
>> will
>> change as long as the clocksource remains TSC, and updates to it by the
>> kernel
>> are atomic and partial values cannot be read .
>>
>> The code in the patch reverts to old behavior for clocks which are not
>> the
>> TSC and provides a way for users to determine if the  clock is still the
>> TSC
>> by calling '__vdso_linux_tsc_calibration()', which would return NULL if
>> the clock is not the TSC .
>>
>> I have never seen Linux on a modern intel box spontaneously decide to
>> switch from the TSC clocksource after calibration succeeds and
>> it has decided to use the TSC as the system / platform clock source -
>> what would make it do this ?
>>
>> But for the highly controlled systems I am doing performance testing on,
>> I can guarantee that the clocksource does not change.
>
> We are not writing code for a particular highly controlled system. We
> expose functionality which operates under all circumstances. There are
> various reasons why TSC can be disabled at runtime, crappy BIOS/SMI,
> sockets getting out of sync .....
>
>> There is no way user code can write those pointers or do anything other
>> than read them, so I see no harm in exposing them to user-space ; then
>> user-space programs can issue rdtscp and use the same calibration values
>> as the kernel, and use some cached 'previous timespec value' to avoid
>> doing the long division every time.
>>
>> If the shift & mult are not accurate TSC calibration values, then the
>> kernel should put other more accurate calibration values in the gtod .
>
> The raw calibration values are as accurate as the kernel can make them. But
> they can be rather far off from converting to real nanoseconds for various
> reasons. The NTP/PTP adjusted conversion is matching real units and is
> obviously more accurate.
>
>> > Please look at the kernel side implementation of
>> > clock_gettime(CLOCK_MONOTONIC_RAW).
>> > The VDSO side can be implemented in the
>> > same way.
>> > All what is required is to expose the relevant information in the
>> > existing vsyscall_gtod_data data structure.
>>
>> I agree - that is my point entirely , & what I was trying to do .
>
> Well, you did not expose the raw conversion data in vsyscall_gtod_data. You
> are using:
>
> +         tsc    *= gtod->mult;
> +         tsc   >>= gtod->shift;
>
> That is the adjusted mult/shift value, which can change when NTP/PTP is
> enabled, and you _cannot_ use it unprotected.
>
>> void getrawmonotonic64(struct timespec64 *ts)
>> {
>> 	struct timekeeper *tk = &tk_core.timekeeper;
>> 	unsigned long seq;
>> 	u64 nsecs;
>>
>> 	do {
>> 		seq = read_seqcount_begin(&tk_core.seq);
>> #                       ^-- I think this is the source of the locking
>> #                            and the very long latencies !
>
> This protects tk->raw_sec from changing which would result in random time
> stamps. Yes, it can cause slightly larger latencies when the timekeeper is
> updated on another CPU concurrently, but that's not the main reason why
> this is slower in general than the VDSO functions. The syscall overhead is
> there for every invocation and it's substantial.
>
>> So in fact, when the clock source is TSC, the value recorded in 'ts'
>> by clock_gettime(CLOCK_MONOTONIC_RAW, &ts) is very similar to
>>       u64 tsc = rdtscp();
>>       tsc *= gtod->mult;
>>       tsc >>= gtod->shift;
>>       ts.tv_sec=tsc / NSEC_PER_SEC;
>>       ts.tv_nsec=tsc % NSEC_PER_SEC;
>>
>> which is the algorithm I was using in the VDSO fast TSC reader,
>> do_monotonic_raw() .
>
> Except that you are using the adjusted conversion values and not the raw
> ones. So your VDSO implementation of monotonic raw access is just wrong and
> not matching the syscall based implementation in any way.
>
>> The problem with doing anything more in the VDSO is that there
>> is of course nowhere in the VDSO to store any data, as it has
>> no data section or writable pages . So some kind of writable
>> page would need to be added to the vdso , complicating its
>> vdso/vma.c, etc., which is not desirable.
>
> No, you don't need any writeable memory in the VDSO. Look at how the
> CLOCK_MONOTONIC and CLOCK_REALTIME functions are implemented in the VDSO.
>
>> But it is equally not desirable to have the clock_gettime
>> system call have a latency exceeding 300ns for CLOCK_MONOTONIC_RAW
>
> I said it before and I will say it once more: CLOCK_MONOTONIC_RAW can be implemented
> in the VDSO by adding the relevant data to vsyscall_gtod_data and by using
> the proper accessor functions which are there for a reason.
>
>> clocks, nor to measure all the adjustments & previous value dependencies
>> that the current implementation is prone to .
>
> I have no idea what you are talking about. If done right,
> CLOCK_MONOTONIC_RAW does not have any adjustments.
>
>> But there is no other communication of TSC calibration data currently
>> in the gtod
>> other than 'shift' and 'mult' ,which do actually give results which
>> correlate
>> approximately to the pico_period division above.
>
> Even if I'm repeating myself. vsyscall_gtod_data does not have the raw
> information and it simply can be added.
>
>> Attempts to change the vsyscall_gtod_data structure in any way have
>> resulted
>> in nasty core dumps of clock_gettime() using processes.
>> I'd much appreciate any tips on how to change its layout without breaking
>> everything.
>
> I have no idea what you have done, but vsyscall_gtod_data is a purely
> kernel internal data structure and can be changed as the kernel requires.
>
>> One currently needs to parse the system log messages to get the
>> 'Refined TSC clocksource calibration', or lookup its value in the
>> live kernel by looking up the 'tsc_khz' symbol at its
>> address given in System.map  with gdb / objdump ,
>> converting to offset into data region, and using
>> lseek() on /proc/kcore (as root).
>
> Groan.
>
>> The point is, user-space must rely on the kernel to perform
>> TSC calibration properly, and the only data in the gtod that
>> reflects the result of that calibration are the mult and shift values.
>> If linux wants to support user-space TSC readers, it needs to communicate
>> the TSC calibration somehow in the VDSO gtod area.
>
> I'm getting tired of this slowly. The kernel supports user space TSC access
> with CLOCK_MONOTONIC and CLOCK_REALTIME today in the VDSO.
>
>> And please consider SOME patch to the VDSO to implement user-space TSC
>> reading on the intel / AMD , to return a completely unadjusted value for
>> CLOCK_MONOTONIC_RAW, as documented, but not as Linux currently does.
>
> clock_gettime(CLOCK_MONOTONIC_RAW) returns the raw and unadjusted
> nanosecond time from any clocksource (TSC or whatever) today. It does so as
> documented.
>
> The only missing piece is a VDSO counterpart which avoids the syscall
> overhead.
>
>> BTW, it is not my 'editor's word-wrap corrupting your patch', it is the
>> Gmail IMAP server, which believes all lines in plain-text messages
>> must be wrapped to 72 characters max, and enforces this belief,
>> correct or not.  I think kernel.org needs to update its tools to parse
>> mime attachments or handle email server line-wrapping, which I have
>> no control over - I wish I could move from the gmail server, but it would
>> be too much trouble.
>
> Sorry, a lot of people use gmail for kernel work and I think there is
> enough documentation out there how to do that.
>
> Thanks,
>
> 	tglx
>



Thanks Thomas -

I'd appreciate clarification on a few points:

> Well, you did not expose the raw conversion data in vsyscall_gtod_data. You
> are using:
>
> +         tsc    *= gtod->mult;
> +         tsc   >>= gtod->shift;
>
> That is the adjusted mult/shift value, which can change when NTP/PTP is
> enabled, and you _cannot_ use it unprotected.
...
> vsyscall_gtod_data does not have the raw
> information and it simply can be added.


By "the raw information" here,  do you mean :
  o   the 'tsc_khz'  "Refined TSC clocksource calibration" ?
  o   or some other data ?

The shift and mult values are calculated from
tsc_khz by :
<quote><code>
tsc_refine_calibration_work(), in arch/x86/kernel/tsc.c, @ line 1164:
{ ...
  	tsc_khz = freq;
	pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
		(unsigned long)tsc_khz / 1000,
		(unsigned long)tsc_khz % 1000);

	/* Inform the TSC deadline clockevent devices about the recalibration */
	lapic_update_tsc_freq();

	/* Update the sched_clock() rate to match the clocksource one */
	for_each_possible_cpu(cpu)
		set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);

  ...
}

static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{ ...
	clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
			       NSEC_PER_MSEC, 0);
  ...
}


kernel/clocksource.c, @ line 64:

void
clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
{ ...
}
</code></quote>
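In essence (a simplified sketch, not the real clocks_calc_mult_shift(),
which also searches for the largest shift whose intermediate product cannot
overflow over 'maxsec'): mult is chosen so that (cycles * mult) >> shift
approximates cycles * to / from :

	/* hypothetical illustrative helper, assuming a fixed shift */
	static u32 calc_mult(u32 from, u32 to, u32 shift)
	{
		return (u32)((((u64)to) << shift) / from);
	}

	/* e.g. from = tsc_khz = 3392144, to = NSEC_PER_MSEC = 1000000,
	 * shift = 24  =>  mult ~= 4946138, close to the 4945810 used below. */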


So by "the raw information"  do you mean this  initial mult & shift  ,
calculated from the Refined TSC Frequency (tsc_khz) which does not change ?

I understand that the vsyscall_gtod_data contains the adjusted mult & shift,
but this is the only data available there, and the kernel syscall
implementation of clock_gettime(CLOCK_MONOTONIC_RAW) does seem to access
these same adjusted shift & mult values - as I showed in a previous post,
kernel/time/timekeeping.c's getrawmonotonic64() does use these same
mult and shift values:

timekeeping_delta_to_ns(struct tk_read_base *tkr, u64 delta)
{...
        u64 nsec;
...
        nsec = delta * tkr->mult + tkr->xtime_nsec;
        nsec >>= tkr->shift;
...
}

Are the vdso gtod->mult and gtod->shift values somehow different from
  'tkr->mult' and 'tkr->shift' ?
I thought they were actually pointers to the same calibration data.
Yes, as noted in previous posts, the 'mult' value does seem to
change by small increments. I guess this is purely because of NTP / PTP ?
So the syscall implementation of clock_gettime(CLOCK_MONOTONIC_RAW, &ts)
actually does return an NTP / PTP adjusted value ?
Then it is even worse than I thought - not only does it
have an unusably high latency, but it does not do what
it is documented to do at all, if those mult/shift values
are changed by NTP / PTP / CPU frequency adjustments.

I did redo the patches yesterday, addressing the formatting issues
and adding a header for the 'struct linux_tsc_calibration' structure -
see mail subject:
    '[PATCH v4.15.7 1/1] x86/vdso: handle clock_gettime(CLOCK_MONOTONIC_RAW, &ts) in VDSO' .

I also enclose the updated patch as an attachment to this mail .

I do strongly believe that if the VDSO is going to support accessing the TSC
from user-space, it should provide some access to the TSC calibration values,
to permit user-space TSC readers to convert the TSC value to nanoseconds
accurately and efficiently, in a way not prone to NTP or PTP adjustments;
otherwise the VDSO do_monotonic_raw() must be used, and since it has nowhere
to cache previous values, it must do the long division every time, which
enforces a latency of >= ~100ns.
My user-space TSC reader in 'test_vdso_tsc.c' (sent as an attachment to
previous mails) uses the pointer returned by __vdso_linux_tsc_calibration(),
and a cached previous value, to achieve latencies of 10-20ns.
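(A rough, hypothetical sketch of the caching idea - not the actual
test_vdso_tsc.c, which is attached elsewhere; 'ns' is the result of the
(tsc * mult) >> shift conversion, assumed non-decreasing between calls:)

	#include <time.h>

	static unsigned long long cached_sec, cached_sec_base_ns;

	/* Only step across second boundaries; no long division per call.
	 * Real code would seed cached_sec_base_ns from the first reading. */
	static void ns_to_timespec_cached(unsigned long long ns, struct timespec *ts)
	{
		while (ns - cached_sec_base_ns >= 1000000000ULL) {
			cached_sec_base_ns += 1000000000ULL;
			cached_sec++;
		}
		ts->tv_sec  = cached_sec;
		ts->tv_nsec = ns - cached_sec_base_ns;
	}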

I can't see how the contents pointed to by the pointer returned by
__vdso_linux_tsc_calibration() will be "garbage", as long as the clock
source remains TSC.

Updates to 64-bit memory locations are atomic on x86_64, so either
an updated valid 'mult' value is read, or the previous valid 'mult' value
is read (the shift does not change) - I can't see the harm in that.

Yes, users do need to check that the clocksource is still 'TSC',
by checking that __vdso_linux_tsc_calibration() still returns non-NULL,
or that /sys/devices/system/clocksource/clocksource0/current_clocksource
is still 'tsc'.
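A user-space check along those lines might look like this (a minimal
sketch):

	#include <stdio.h>
	#include <string.h>

	/* Returns 1 if the current system clocksource is the TSC. */
	static int clocksource_is_tsc(void)
	{
		char buf[32] = "";
		FILE *f = fopen("/sys/devices/system/clocksource/"
				"clocksource0/current_clocksource", "r");
		if (!f)
			return 0;
		if (!fgets(buf, sizeof(buf), f))
			buf[0] = '\0';
		fclose(f);
		return strncmp(buf, "tsc", 3) == 0;
	}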

By 'controlled' I just mean I can control whether the clocksource will
change during a run of a given test program that uses the TSC - I believe
any other user would be in the same position: don't change the clock source
during a run of your test program, and it will yield valid results. It is
difficult to imagine a time-measurement program that could be expected to
withstand a change of the system clocksource and still provide valid
results - one would want to discard results from a program that experienced
a system clocksource change.

Programs which want to withstand clocksource changes can use events or the
value returned by __vdso_linux_tsc_calibration() to determine whether the
clock source has changed.

So I don't agree that it is a priori wrong to return a pointer into the
VDSO vvar page to user-space: it can only be used in a read-only way, and
its contents will be valid as long as the clock source does not change,
which users can easily determine before using the pointer. I don't see how
the contents can spontaneously become 'garbage' if the clock source does
not change, and if it does change, users can easily find out about it.

So I guess it comes down to what TSC calibration data should be put in
the VDSO - the initial unchanging 'mult' and 'shift' values? Or just
'tsc_khz'?

The only reason I use the mult & shift is because this is the way Linux
does it, and we'd like to obtain TSC time values that correlate well with
those generated by the kernel. It does discard a lot of resolution and
seconds bits, but the values do appear to correlate well to the calculated
TSC frequency - here's a Perl program that demonstrates the accuracy of the
mult+shift conversion method:

<quote><code>
#!/usr/bin/perl
#
# This demonstrates that the Linux TSC 'mult' and 'shift'
# values really do provide a close approximation to precise nanosecond
# conversion:
#
$tsc_cal_msg = '';
if     ( -r '/var/log/messages' )
{
    $tsc_cal_msg = `grep -F 'Refined TSC clocksource calibration' /var/log/messages | tail -n 1`;
}elsif ( -x '/usr/bin/journalctl' )
{
    $tsc_cal_msg = `journalctl -b -0 | grep -F 'Refined TSC clocksource calibration' | tail -n 1`;
}
$tsc_freq_mhz = 0.0;
if ( $tsc_cal_msg =~ /([0-9\.]+)[[:space:]]*MHz[[:space:]]*$/ )
{   $tsc_freq_mhz = $1;
}
$tsc_freq_ghz = $tsc_freq_mhz / 1000;
print STDERR "Calibrated TSC Frequency: $tsc_freq_mhz MHz : $tsc_freq_ghz GHz\n";
$lnx_kern_mult  = 4945810;
$lnx_kern_shift = 24;
# The last shift & mult values read from the kernel vsyscall_gtod_data -
# you'll need to run 'test/tts.cpp' - the first output line is like:
#   High Resolution TimeStamps Enabled - Got TSC calibration @ 0xffffffffff5ff0a0: mult: 4945820 shift: 24
# and plug in the mult and shift values above. Doing this by examining
# the live kernel is really complicated - this is in a way the whole
# point of the patch.
#
$tsc_val = 1000000000;
$c = 0;
do
{
    # Not nice for Linux to do long division, and the kernel has no
    # floating point support, so it computes shift and mult to give a
    # close approximation:
    $tsc_nanos_1 = int($tsc_val / $tsc_freq_ghz);
    $tsc_nanos_2 = ($tsc_val * $lnx_kern_mult) >> $lnx_kern_shift;
    $diff_pc = sprintf("%1.3e", (abs($tsc_nanos_2 - $tsc_nanos_1) / $tsc_nanos_2) * 100.0);
    print STDERR "TSC: $tsc_val Calculated nanos: $tsc_nanos_1 (by tsc/$tsc_freq_ghz) ; $tsc_nanos_2 (by (tsc*mult)>>shift) - difference as percentage: $diff_pc\% \n";
    $tsc_val += $tsc_val + (10**$c++);
} while ($c <= 10);
</code></quote>

This prints, on my system:

$ cat tsc_numbers.log
Calibrated TSC Frequency: 3392.144 MHz : 3.392144 GHz
TSC: 1000000000 Calculated nanos: 294798805 (by tsc/3.392144) ; 294792652 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%
TSC: 2000000001 Calculated nanos: 589597611 (by tsc/3.392144) ; 589585304 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%
TSC: 4000000012 Calculated nanos: 1179195226 (by tsc/3.392144) ; 1179170612 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%
TSC: 8000000124 Calculated nanos: 2358390482 (by tsc/3.392144) ; 2358341253 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%
TSC: 16000001248 Calculated nanos: 4716781259 (by tsc/3.392144) ; 4716682801 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%
TSC: 32000012496 Calculated nanos: 9433565466 (by tsc/3.392144) ; 9433368551 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%
TSC: 64000124992 Calculated nanos: 18867160413 (by tsc/3.392144) ; 18866766583 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%
TSC: 128001249984 Calculated nanos: 37734615624 (by tsc/3.392144) ; 37733827958 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%
TSC: 256012499968 Calculated nanos: 75472179237 (by tsc/3.392144) ; 75470603844 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%
TSC: 512124999936 Calculated nanos: 150973838355 (by tsc/3.392144) ; 150970686953 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%
TSC: 1025249999872 Calculated nanos: 302242475517 (by tsc/3.392144) ; 302236166558 (by (tsc*mult)>>shift) - difference as percentage: 2.087e-03%

So they look accurate enough to me - a constant 0.002087% difference between
the value calculated by (tsc / tsc_ticks_per_nanosecond) and the value
calculated by ((tsc * mult) >> shift). But there are not many bits left for
seconds values in either calculation: about 10, by my estimate - a 24-bit
right shift plus about 30 bits of nanoseconds leaves (64 - (24 + 30)) == 10
effective seconds bits.
If there is a better way of converting TSC to nanoseconds, let me know.
I'd much prefer to use the TSC value as an exact number of picoseconds,
but use of 'struct timespec', which must contain a nanosecond-resolution
value, precludes this.
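To make the 10-bit estimate concrete - a quick check in C, assuming the
mult and TSC frequency quoted above:

	#include <stdio.h>

	int main(void)
	{
		/* (tsc * mult) overflows 64 bits once tsc > 2^64 / mult */
		unsigned long long mult    = 4945810ULL;     /* from vsyscall_gtod_data */
		unsigned long long tsc_hz  = 3392144000ULL;  /* 3.392144 GHz            */
		unsigned long long max_tsc = ~0ULL / mult;   /* ~3.73e12 ticks          */
		/* prints ~1099 seconds, i.e. ~2^10: */
		printf("range before overflow: ~%llu seconds\n", max_tsc / tsc_hz);
		return 0;
	}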

Please, I'm not saying my patch must be integrated as-is into Linux - I'm
just saying we really need a low-latency TSC reader in the VDSO for
   clock_gettime(CLOCK_MONOTONIC_RAW, &ts)
under Linux on Intel / AMD, and I am presenting one way I've found to
provide one. If you find a better way, please let me know.

Thanks & Best Regards,
Jason

[-- Attachment #2: vdso_vclock_gettime_userspace_tsc.patch --]
[-- Type: application/octet-stream, Size: 5606 bytes --]

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d..e840600 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -21,6 +21,7 @@
 #include <linux/math64.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <uapi/asm/vdso_tsc_calibration.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -246,6 +247,29 @@ notrace static int __always_inline do_monotonic(struct timespec *ts)
 	return mode;
 }
 
+notrace static int __always_inline do_monotonic_raw( struct timespec *ts)
+{       
+        volatile u32 tsc_lo=0, tsc_hi=0, tsc_cpu=0; // so same instrs generated for 64-bit as for 32-bit builds
+        u64 ns;
+        register u64 tsc=0;
+        if (gtod->vclock_mode == VCLOCK_TSC)
+        {
+                asm volatile
+                ( "rdtscp"
+                : "=a" (tsc_lo)
+                , "=d" (tsc_hi)
+                , "=c" (tsc_cpu)
+                ); // : eax, edx, ecx used - NOT rax, rdx, rcx
+                tsc     = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo) & 0xffffffffUL);
+                tsc    *= gtod->mult;
+                tsc   >>= gtod->shift;
+                ts->tv_sec  = __iter_div_u64_rem(tsc, NSEC_PER_SEC, &ns);            
+                ts->tv_nsec = ns;          
+                return VCLOCK_TSC;                
+        }
+        return VCLOCK_NONE;
+}
+
 notrace static void do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
@@ -277,6 +301,10 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 		if (do_monotonic(ts) == VCLOCK_NONE)
 			goto fallback;
 		break;
+	case CLOCK_MONOTONIC_RAW:
+		if (do_monotonic_raw(ts) == VCLOCK_NONE)
+			goto fallback;
+		break;
 	case CLOCK_REALTIME_COARSE:
 		do_realtime_coarse(ts);
 		break;
@@ -326,3 +354,18 @@ notrace time_t __vdso_time(time_t *t)
 }
 time_t time(time_t *t)
 	__attribute__((weak, alias("__vdso_time")));
+
+extern const struct linux_tsc_calibration *
+        __vdso_linux_tsc_calibration(void);
+
+notrace  const struct linux_tsc_calibration *
+  __vdso_linux_tsc_calibration(void)
+{
+        if( gtod->vclock_mode == VCLOCK_TSC ) 
+                return ((const struct linux_tsc_calibration*) &gtod->mult);
+        return 0UL;
+}
+
+const struct linux_tsc_calibration * linux_tsc_calibration(void)
+        __attribute((weak, alias("__vdso_linux_tsc_calibration")));
+
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index d3a2dce..41a2ca5 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -24,7 +24,9 @@ VERSION {
 		getcpu;
 		__vdso_getcpu;
 		time;
-		__vdso_time;
+	        __vdso_time;
+                linux_tsc_calibration;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 422764a..d53bd73 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,7 +25,8 @@ VERSION
 	global:
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
-		__vdso_time;
+	        __vdso_time;
+                __vdso_linux_tsc_calibration;
 	};
 
 	LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
index 05cd1c5..fb13b16 100644
--- a/arch/x86/entry/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -20,7 +20,8 @@ VERSION {
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_getcpu;
-		__vdso_time;
+	        __vdso_time;
+                __vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff --git a/arch/x86/include/uapi/asm/vdso_tsc_calibration.h b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
new file mode 100644
index 0000000..6febb84
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_VDSO_TSC_CALIBRATION_H
+#define _ASM_X86_VDSO_TSC_CALIBRATION_H
+/* 
+ * Programs that want to use rdtsc / rdtscp instructions
+ * from user-space can make use of the Linux kernel TSC calibration
+ * by calling :
+ *    __vdso_linux_tsc_calibration(void);
+ * ( one has to resolve this symbol as in 
+ *   tools/testing/selftests/vDSO/parse_vdso.c
+ * )
+ * which returns a pointer into the read-only
+ * vvar_page (the vsyscall_gtod_data structure),
+ * with the following layout :
+ */
+
+struct linux_tsc_calibration
+{
+        unsigned int mult;   /* amount to multiply 64-bit TSC value by */
+
+        unsigned int shift;  /* the right shift to apply to (mult*TSC) yielding nanoseconds */ 
+};
+
+/* To use:
+ *
+ *  const struct linux_tsc_calibration *
+ *  (*linux_tsc_cal)(void) = vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
+ *  if( linux_tsc_cal == 0UL )
+ *  { fprintf(stderr,"the patch providing __vdso_linux_tsc_calibration is not applied to the kernel.\n");
+ *    return ERROR;
+ *  }
+ *  const struct linux_tsc_calibration *clock_source = (*linux_tsc_cal)();
+ *  if( clock_source == ((void*)0UL))
+ *    fprintf(stderr,"TSC is not the system clocksource.\n");
+ *  volatile unsigned int tsc_lo, tsc_hi, tsc_cpu;
+ *  asm volatile
+ *  ( "rdtscp" : "=a" (tsc_lo), "=d" (tsc_hi), "=c" (tsc_cpu) );
+ *  unsigned long tsc = (((unsigned long)tsc_hi) << 32) | tsc_lo;
+ *  unsigned long nanoseconds =
+ *   (( clock_source -> mult ) * tsc ) >> (clock_source -> shift);
+ *  unsigned long nanoseconds =
+ *   (( clock_source -> mult ) * tsc ) >> (clock_source -> shift);
+ *  
+ *  nanoseconds is now TSC value converted to nanoseconds, 
+ *  according to Linux' clocksource calibration values. 
+ *  
+ */
+
+#endif

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: Fwd: [PATCH v4.15.7 1/1] on Intel, VDSO should handle CLOCK_MONOTONIC_RAW and export 'tsc_calibration' pointer
  2018-03-08 15:16         ` Jason Vas Dias
@ 2018-03-08 16:40           ` Thomas Gleixner
  0 siblings, 0 replies; 7+ messages in thread
From: Thomas Gleixner @ 2018-03-08 16:40 UTC (permalink / raw)
  To: Jason Vas Dias; +Cc: x86, LKML, andi, Peter Zijlstra

On Thu, 8 Mar 2018, Jason Vas Dias wrote:
> On 08/03/2018, Thomas Gleixner <tglx@linutronix.de> wrote:
> I'd appreciate clarification on few points :
> 
> > Well, you did not expose the raw conversion data in vsyscall_gtod_data. You
> > are using:
> >
> > +         tsc    *= gtod->mult;
> > +         tsc   >>= gtod->shift;
> >
> > That is the adjusted mult/shift value, which can change when NTP/PTP is
> > enabled, and you _cannot_ use it unprotected.
> ...
> > vsyscall_gtod_data does not have the raw
> > information and it simply can be added.
> 
> 
> By "the raw information" here,  do you mean :
>   o   the 'tsc_khz'  "Refined TSC clocksource calibration" ?
>   o   or some other data ?

The raw conversion data which is used in getrawmonotonic64() as I pointed
out several times now.

> The shift and mult values are calculated from
> tsc_khz by :
> <quote><code>
> tsc_refine_calibration_work() , in arch/x86/kernel/tsc.c , @ line 1164:
> { ...
>   	tsc_khz = freq;
> 	pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
> 		(unsigned long)tsc_khz / 1000,
> 		(unsigned long)tsc_khz % 1000);

There is really no point in copying tons of code in your replies. I already
know how that works, really.

> I understand that the vsyscall_gtod_data contains the adjusted mult & shift ,
> but since this is the only data available there, and the kernel
> syscall implementation
> of clock_gettime(CLOCK_MONOTONIC_RAW) does seem to access
> these same adjusted shift & mult values - as I showed in previous post,
> kernel/time/timekeeping.c's getrawmonotonic64() does use these same
> mult and shift values:

It really does not and if you can't be bothered to actually look at the
difference of:

void getrawmonotonic64(struct timespec64 *ts)
{
	...
	nsecs = timekeeping_get_ns(&tk->tkr_raw);

versus:

void ktime_get_ts64(struct timespec64 *ts)
{
	...
	nsec = timekeeping_get_ns(&tk->tkr_mono);

then I really can't help it.

> Are the vdso gtod->mult and gtod->shift values somehow different from
>   'tkr->mult' and 'tkr->shift' ?

They are the adjusted values used for CLOCK_MONOTONIC and they are updated
when the kernel side timekeeper->tkr_mono data is updated. What's not
stored in the VDSO data right now is the raw data. As I told you a
gazillion times now, it can be stored there and then the VDSO based mono
raw accessor can be implemented similar to the monotonic/realtime
implementations.

> I thought they were actually pointers to the same calibration data.

Care to look at how the vdso data is updated?

> Yes, as noted in previous posts, the 'mult' value does seem to
> change by small increments . I guess this is purely because of NTP / PTP ?
> So the syscall implementation of clock_gettime(CLOCK_MONOTONIC_RAW,&ts)
> actually does return an NTP / PTP adjusted value ?
> Then it is  even worse than I thought - not only does it
> have an unusably high latency, but it does not do what
> it is documented to do at all, if those mult/shift values
> are changed by NTP/PTP / CPU frequency adjustments.

Once again: CLOCK_MONOTONIC_RAW does exactly what is documented and the
conversion values are not the same as those for CLOCK_MONOTONIC.

> I do strongly believe that if the VDSO is going to support accessing the TSC
> from user-space,
> it should provide some access to the  TSC calibration values that would
> permit user-space TSC readers to convert the TSC value to nanoseconds
> accurately and efficiently, in a way not prone to NTP or PTP adjustments ;

You surely are free to believe what you want.

> otherwise the VDSO do_monotonic_raw must be used, and as it has nowhere
> to cache previous values, it must do the long division every time,
> which enforces
> a latency of >= @ 100ns.

I told you before that there is neither a requirement to store anything nor
a requirement to use a long division. I gave you the pointers to the
monotonic/realtime accessors which do neither store anything nor do long
divisions.

> My  user-space TSC reader in 'test_vdso_tsc.c' (sent as attachments to
> previous mails)  uses the pointer returned by
> __vdso_linux_tsc_calibration() , and a
> cached previous value, to achieve latencies of 10-20ns .

I don't care about that as this is a completely different thing. Providing
the raw TSC conversion factors to user space might be a worthwhile
exercise, but has nothing to do with the VDSO based clock_gettime()
implementation at all. And as I pointed out in the other reply, it's not
going to happen in a prone to be broken way.

> I can't see how the contents pointed to by the pointer returned by
> __vdso_linux_tsc_calibration()
> will be "garbage" , as long as the clock source remains TSC .

There is no guarantee that the clock source remains TSC. And I don't care
about your 'controlled' system at all. Interfaces have to be usable on all
systems and cannot be implemented on a 'might work if you know what you are
doing' basis. That's not the way the kernel implements interfaces ever.

> Updates to 64 bit memory locations are atomic on x86_64 , so either
> an updated valid 'mult' value is read, or the previous valid 'mult' value
> is read (the shift does not change) - I can't see the harm in that.

Again: trying to implement MONOTONIC_RAW with the adjusted mult value is
wrong. You cannot even implement MONOTONIC by just looking at gtod->mult.
Please study the underlying algorithm of the monotonic accessor before
making claims like that.

> By 'Controlled' I just mean I can control whether the clocksource will change
> during a run of given test program that uses the TSC or not - I
> believe any other user would also be in the same position - don't
> change the clock source during
> a run of your test program, and it will yield valid results.  It is
> difficult to imagine
> a time measurement program that could be expected to withstand a change
> of the system clocksource and still provide valid results - one would want to
> discard results from a program that experienced a system clock-source change.

Well, no. An interface exported by the kernel has to provide safe data
under all circumstances and the existing syscall and VDSO based accessors
provide correct time stamps even accross a clocksource change.

> So I don't agree that it is a priori wrong to return a pointer into
> the VDSO vvar
> page to user-space, which can only be used in a read-only way, and
> whose contents
> will be valid as long as the clock source does not change, when users
> can easily determine if the clock source has changed or not before using
> the pointer - I don't see how the contents can spontaneously become 'garbage'
> if the clock source does not change, and if it does change, users can easily
> find out about it.

It's already garbage to use the changing mult value blindly on the TSC
readout. The conversion is only valid for a given time slice, and if you
look at the clock monotonic access (condensed):

     do {
     	seq = gtod_read_begin(gtod);
	ts->tv_sec = gtod->monotonic_time_sec;
	ns = gtod->monotonic_time_snsec;
	  cycles = vread_tsc();
	  v = (cycles - gtod->cycle_last) & gtod->mask;
	  v *= gtod->mult;
	ns += v;
     } while (unlikely(gtod_read_retry(gtod, seq)));

     ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
     ts->tv_nsec += ns;
     
which is the same algorithm as in the syscall-based implementation, then
you'll notice that this calculates the delta between the TSC value and
gtod->cycle_last. __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns) is actually not
a division, it's just making sure that the nsec part does not get larger
than NSEC_PER_SEC, and the underlying update mechanism guarantees that this
is not an unbounded loop, but at most ONE adjustment step.
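For reference, __iter_div_u64_rem() - include/linux/math64.h - is just
iterative subtraction, slightly condensed here:

	static inline u32
	iter_div_u64_rem_sketch(u64 dividend, u32 divisor, u64 *remainder)
	{
		u32 ret = 0;

		/* bounded: the update mechanism guarantees at most one
		 * step here, so this is cheaper than a real division */
		while (dividend >= divisor) {
			dividend -= divisor;
			ret++;
		}
		*remainder = dividend;
		return ret;
	}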

This is accurate, fast and safe against changes of both the conversion
factor and the underlying clocksource.

> The only reason I use the mult & shift is because this is the way Linux
> does it , and we'd like to obtain TSC time values that correlate well
> with those generated by the kernel.

The right way to do that is to put the raw conversion values and the raw
seconds base value into the vdso data and implement the counterpart of
getrawmonotonic64(). And if that is done, then it can be done for _ALL_
clocksources which support VDSO access and not just for the TSC.

> It does discard alot of resolution and seconds bits , but the values do appear

The kernel does not lose bits at all and the accuracy is determined by the
shift value, which is usually about 24. So the internal resolution of
timekeeping is 1/(1 << 24) nanoseconds, which is more than sufficient to
maintain accuracy.
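Concretely: 1/(1 << 24) ns = 1/16777216 ns ~= 6e-8 ns, i.e. about 60
attoseconds per unit of the internal representation - negligible against
the resolution of any physical clocksource.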

> Please, I'm not saying my patch must be integrated as-is into Linux, I'm
> just saying we really need a low-latency TSC reader in the VDSO for
>    clock_gettime(CLOCK_MONOTONIC_RAW, &ts)
> under Linux on Intel / AMD, and presenting one way I've found to provide one.

Just that it neither provides CLOCK_MONOTONIC_RAW time stamps nor works
for more than 4 seconds straight.

> If you find a better way, please let me know.

I think I gave you enough hints by now, how this should be done. If you
don't feel up to the task to make it work, then please let me know and just
ask for a VDSO implementation of clock_gettime(CLOCK_MONOTONIC_RAW) instead
of making completely false claims about the correctness of the kernel
timekeeping infrastructure.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: Fwd: [PATCH v4.15.7 1/1] on Intel, VDSO should handle CLOCK_MONOTONIC_RAW and export 'tsc_calibration' pointer
@ 2018-03-11  7:08 Jason Vas Dias
  0 siblings, 0 replies; 7+ messages in thread
From: Jason Vas Dias @ 2018-03-11  7:08 UTC (permalink / raw)
  To: x86, LKML, Thomas Gleixner, andi, Peter Zijlstra

[-- Attachment #1: Type: text/plain, Size: 2107 bytes --]

Hi Thomas -

Thanks very much for your help & guidance in previous mail:

RE: On 08/03/2018, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> The right way to do that is to put the raw conversion values and the raw
> seconds base value into the vdso data and implement the counterpart of
> getrawmonotonic64(). And if that is done, then it can be done for _ALL_
> clocksources which support VDSO access and not just for the TSC.
>

I have done this now with a new patch, sent in mail with subject :
  
'[PATCH v4.16-rc4 1/1] x86/vdso: on Intel, VDSO should handle CLOCK_MONOTONIC_RAW' 

which should address all the concerns you raise.

> I already  know how that works, really.

I never doubted or meant to impugn that !

I am beginning to know a little how that works also, thanks in great
part to your help last week - thanks for your patience.

I was impatient last week to get access to low-latency timers for a work
project, and was trying to read the unadjusted clock.

> instead of making completely false claims about the correctness of the kernel
> timekeeping infrastructure.

I really didn't mean to make any such claims - I'm sorry if I did. I was
just trying to say that by the time the results of
clock_gettime(CLOCK_MONOTONIC_RAW, &ts) were available to the caller, they
were not of much use because the latencies often dwarfed the time
differences being measured.

Anyway, I hope sometime you will consider putting such a patch in the
kernel.

I have developed a version for ARM also, but it depends on making the
CNTPCT + CNTFRQ registers readable in user-space, which is not meant
to be secure and is not normally done, but does work. It is against
the Texas Instruments (ti-linux) kernel, can be enabled with a new
Kconfig option, and brings latencies down from > 300ns to < 20ns.
Maybe I should post that also to kernel.org, or to ti.com ?
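(Roughly, the user-space side on AArch64 would read the counter like this -
a sketch assuming the kernel has enabled EL0 counter access, which is what
that Kconfig option arranges; the virtual counter CNTVCT is the one
normally exposed to user-space:)

	/* AArch64 sketch - requires EL0 counter access to be enabled */
	static inline unsigned long long arm_counter_ticks(void)
	{
		unsigned long long cnt;
		asm volatile("mrs %0, cntvct_el0" : "=r" (cnt));
		return cnt;
	}

	static inline unsigned long long arm_counter_hz(void)
	{
		unsigned long long frq;
		asm volatile("mrs %0, cntfrq_el0" : "=r" (frq));
		return frq;
	}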

I have a separate patch for the vdso_tsc_calibration export of tsc_khz and
the calibration values, which no longer returns pointers into the VDSO -
I can post this as a patch if you like.

Thanks & Best Regards,
Jason Vas Dias <jason.vas.dias@gmail.com>


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: Patch against v4.16-rc4 to implement CLOCK_MONOTONIC_RAW in VDSO --]
[-- Type: text/x-patch, Size: 5065 bytes --]

diff -up linux-4.16-rc4/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc4 linux-4.16-rc4/arch/x86/entry/vdso/vclock_gettime.c
--- linux-4.16-rc4/arch/x86/entry/vdso/vclock_gettime.c.4.16-rc4	2018-03-04 22:54:11.000000000 +0000
+++ linux-4.16-rc4/arch/x86/entry/vdso/vclock_gettime.c	2018-03-11 05:08:31.137681337 +0000
@@ -182,6 +182,29 @@ notrace static u64 vread_tsc(void)
 	return last;
 }
 
+notrace static u64 vread_tsc_raw(void)
+{
+        u64 tsc, last=gtod->raw_cycle_last;
+        if( likely( gtod->has_rdtscp ) ) {
+                u32     tsc_lo, tsc_hi,
+                        tsc_cpu __attribute__((unused));
+                asm volatile
+                ( "rdtscp"
+                /* ^- has built-in cancellation point / pipeline stall "barrier" */
+                : "=a" (tsc_lo)
+                , "=d" (tsc_hi)
+                , "=c" (tsc_cpu)
+                ); // since all variables 32-bit, eax, edx, ecx used - NOT rax, rdx, rcx                 
+                tsc      = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo) & 0xffffffffUL);
+        } else {
+                tsc      = rdtsc_ordered();
+        }
+	if (likely(tsc >= last))
+		return tsc;        
+        asm volatile ("");                                
+        return last;
+}
+
 notrace static inline u64 vgetsns(int *mode)
 {
 	u64 v;
@@ -203,6 +226,27 @@ notrace static inline u64 vgetsns(int *m
 	return v * gtod->mult;
 }
 
+notrace static inline u64 vgetsns_raw(int *mode)
+{
+	u64 v;
+	cycles_t cycles;
+
+	if (gtod->vclock_mode == VCLOCK_TSC)
+		cycles = vread_tsc_raw();
+#ifdef CONFIG_PARAVIRT_CLOCK
+	else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
+		cycles = vread_pvclock(mode);
+#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+	else if (gtod->vclock_mode == VCLOCK_HVCLOCK)
+		cycles = vread_hvclock(mode);
+#endif
+	else
+		return 0;
+	v = (cycles - gtod->raw_cycle_last) & gtod->raw_mask;
+	return v * gtod->raw_mult;
+}
+
 /* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
 notrace static int __always_inline do_realtime(struct timespec *ts)
 {
@@ -246,6 +290,27 @@ notrace static int __always_inline do_mo
 	return mode;
 }
 
+notrace static int __always_inline do_monotonic_raw( struct timespec *ts)
+{
+	unsigned long seq;
+	u64 ns;
+	int mode;
+
+	do {
+		seq = gtod_read_begin(gtod);
+		mode = gtod->vclock_mode;
+		ts->tv_sec = gtod->monotonic_time_raw_sec;
+		ns = gtod->monotonic_time_raw_nsec;
+		ns += vgetsns_raw(&mode);
+		ns >>= gtod->raw_shift;
+	} while (unlikely(gtod_read_retry(gtod, seq)));
+
+	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+	ts->tv_nsec = ns;
+
+	return mode;
+}
+
 notrace static void do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
@@ -277,6 +342,10 @@ notrace int __vdso_clock_gettime(clockid
 		if (do_monotonic(ts) == VCLOCK_NONE)
 			goto fallback;
 		break;
+	case CLOCK_MONOTONIC_RAW:
+		if (do_monotonic_raw(ts) == VCLOCK_NONE)
+			goto fallback;
+		break;
 	case CLOCK_REALTIME_COARSE:
 		do_realtime_coarse(ts);
 		break;
diff -up linux-4.16-rc4/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc4 linux-4.16-rc4/arch/x86/entry/vsyscall/vsyscall_gtod.c
--- linux-4.16-rc4/arch/x86/entry/vsyscall/vsyscall_gtod.c.4.16-rc4	2018-03-04 22:54:11.000000000 +0000
+++ linux-4.16-rc4/arch/x86/entry/vsyscall/vsyscall_gtod.c	2018-03-11 05:10:57.371197747 +0000
@@ -16,6 +16,7 @@
 #include <linux/timekeeper_internal.h>
 #include <asm/vgtod.h>
 #include <asm/vvar.h>
+#include <asm/cpufeature.h>
 
 int vclocks_used __read_mostly;
 
@@ -45,6 +46,12 @@ void update_vsyscall(struct timekeeper *
 	vdata->mult		= tk->tkr_mono.mult;
 	vdata->shift		= tk->tkr_mono.shift;
 
+	vdata->raw_cycle_last   = tk->tkr_raw.cycle_last;
+	vdata->raw_mask         = tk->tkr_raw.mask;
+	vdata->raw_mult         = tk->tkr_raw.mult;
+	vdata->raw_shift	= tk->tkr_raw.shift;
+	vdata->has_rdtscp	= static_cpu_has(X86_FEATURE_RDTSCP);
+
 	vdata->wall_time_sec		= tk->xtime_sec;
 	vdata->wall_time_snsec		= tk->tkr_mono.xtime_nsec;
 
@@ -74,5 +81,8 @@ void update_vsyscall(struct timekeeper *
 		vdata->monotonic_time_coarse_sec++;
 	}
 
+	vdata->monotonic_time_raw_sec  = tk->raw_sec;
+	vdata->monotonic_time_raw_nsec = tk->tkr_raw.xtime_nsec;
+
 	gtod_write_end(vdata);
 }
diff -up linux-4.16-rc4/arch/x86/include/asm/vgtod.h.4.16-rc4 linux-4.16-rc4/arch/x86/include/asm/vgtod.h
--- linux-4.16-rc4/arch/x86/include/asm/vgtod.h.4.16-rc4	2018-03-04 22:54:11.000000000 +0000
+++ linux-4.16-rc4/arch/x86/include/asm/vgtod.h	2018-03-11 05:12:35.916338703 +0000
@@ -22,6 +22,11 @@ struct vsyscall_gtod_data {
 	u64	mask;
 	u32	mult;
 	u32	shift;
+	u64	raw_cycle_last;
+	u64     raw_mask;
+	u32     raw_mult;
+	u32     raw_shift;
+	u32     has_rdtscp;
 
 	/* open coded 'struct timespec' */
 	u64		wall_time_snsec;
@@ -32,6 +37,8 @@ struct vsyscall_gtod_data {
 	gtod_long_t	wall_time_coarse_nsec;
 	gtod_long_t	monotonic_time_coarse_sec;
 	gtod_long_t	monotonic_time_coarse_nsec;
+	gtod_long_t	monotonic_time_raw_sec;
+	gtod_long_t	monotonic_time_raw_nsec;
 
 	int		tz_minuteswest;
 	int		tz_dsttime;

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2018-03-11  7:09 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-05  2:59 [PATCH v4.15.7 1/1] on Intel, VDSO should handle CLOCK_MONOTONIC_RAW and export 'tsc_calibration' pointer Jason Vas Dias
2018-03-06  9:33 ` Thomas Gleixner
     [not found]   ` <CALyZvKxORRjBurwj7+YV5fwJfwjMVWif+zpjC9AVM3z3WTVDaA@mail.gmail.com>
2018-03-06 17:13     ` Fwd: " Jason Vas Dias
2018-03-08 12:57       ` Thomas Gleixner
2018-03-08 15:16         ` Jason Vas Dias
2018-03-08 16:40           ` Thomas Gleixner
2018-03-11  7:08 Jason Vas Dias

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.