All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
@ 2016-01-28 14:04 Paolo Bonzini
  2016-01-28 14:04 ` Paolo Bonzini
                   ` (2 more replies)
  0 siblings, 3 replies; 36+ messages in thread
From: Paolo Bonzini @ 2016-01-28 14:04 UTC (permalink / raw)
  To: kvm; +Cc: Andrey Smetanin, Roman Kagan, Denis V. Lunev

The test checks the relative precision of the reference TSC page
and the time reference counter.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
	Andrey, the test has a relative error of approximately 3 parts
	per million on my machine.  In other words it drifts by 3
	microseconds every second, which I don't think is acceptable.
	The problem is that virtual_tsc_khz is 2593993 while the actual
	frequency is more like 2594001 kHz.

	But I have a problem in general with using tsc_khz to compute
	the hyperv reference clock scale.  The maximum possible accuracy
	on a 2.5 GHz machine is around 200 ppb, corresponding to a drift
	of about 16 milliseconds per day.  Perhaps we could use Linux
	(and kvmclock's) timekeeping parameters to derive the tsc_scale
	and tsc_offset?

 config/config-x86-common.mak |   2 +
 config/config-x86_64.mak     |   1 +
 x86/hyperv.h                 |   9 ++
 x86/hyperv_clock.c           | 193 +++++++++++++++++++++++++++++++++++++++++++
 x86/unittests.cfg            |   5 ++
 5 files changed, 210 insertions(+)
 create mode 100644 x86/hyperv_clock.c

diff --git a/config/config-x86-common.mak b/config/config-x86-common.mak
index 72b95e3..621413b 100644
--- a/config/config-x86-common.mak
+++ b/config/config-x86-common.mak
@@ -119,6 +119,8 @@ $(TEST_DIR)/hyperv_synic.elf: $(cstart.o) $(TEST_DIR)/hyperv.o \
 $(TEST_DIR)/hyperv_stimer.elf: $(cstart.o) $(TEST_DIR)/hyperv.o \
                                $(TEST_DIR)/hyperv_stimer.o
 
+$(TEST_DIR)/hyperv_clock.elf: $(cstart.o) $(TEST_DIR)/hyperv.o
+
 arch_clean:
 	$(RM) $(TEST_DIR)/*.o $(TEST_DIR)/*.flat $(TEST_DIR)/*.elf \
 	$(TEST_DIR)/.*.d lib/x86/.*.d
diff --git a/config/config-x86_64.mak b/config/config-x86_64.mak
index 1764701..634d09d 100644
--- a/config/config-x86_64.mak
+++ b/config/config-x86_64.mak
@@ -12,5 +12,6 @@ tests = $(TEST_DIR)/access.flat $(TEST_DIR)/apic.flat \
 tests += $(TEST_DIR)/svm.flat
 tests += $(TEST_DIR)/vmx.flat
 tests += $(TEST_DIR)/tscdeadline_latency.flat
+tests += $(TEST_DIR)/hyperv_clock.flat
 
 include config/config-x86-common.mak
diff --git a/x86/hyperv.h b/x86/hyperv.h
index faf931b..974df56 100644
--- a/x86/hyperv.h
+++ b/x86/hyperv.h
@@ -12,6 +12,7 @@
 #define HV_X64_MSR_SYNTIMER_AVAILABLE           (1 << 3)
 
 #define HV_X64_MSR_TIME_REF_COUNT               0x40000020
+#define HV_X64_MSR_REFERENCE_TSC                0x40000021
 
 /* Define synthetic interrupt controller model specific registers. */
 #define HV_X64_MSR_SCONTROL                     0x40000080
@@ -180,4 +181,12 @@ void synic_sint_create(int vcpu, int sint, int vec, bool auto_eoi);
 void synic_sint_set(int vcpu, int sint);
 void synic_sint_destroy(int vcpu, int sint);
 
+struct hv_reference_tsc_page {
+        uint32_t tsc_sequence;
+        uint32_t res1;
+        uint64_t tsc_scale;
+        int64_t tsc_offset;
+};
+
+
 #endif
diff --git a/x86/hyperv_clock.c b/x86/hyperv_clock.c
new file mode 100644
index 0000000..680ba7c
--- /dev/null
+++ b/x86/hyperv_clock.c
@@ -0,0 +1,193 @@
+#include "libcflat.h"
+#include "smp.h"
+#include "atomic.h"
+#include "processor.h"
+#include "hyperv.h"
+#include "vm.h"
+
+#define MAX_CPU 4
+#define TICKS_PER_SEC (1000000000 / 100)
+
+struct hv_reference_tsc_page *hv_clock;
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u64 mul_frac)
+{
+	u64 product, unused;
+
+	__asm__ (
+		"mul %3"
+		: "=d" (product), "=a" (unused) : "1" (delta), "rm" ((u64)mul_frac) );
+
+	return product;
+}
+
+static u64 hvclock_tsc_to_ticks(struct hv_reference_tsc_page *shadow, uint64_t tsc)
+{
+	u64 delta = tsc;
+	return scale_delta(delta, shadow->tsc_scale) + shadow->tsc_offset;
+}
+
+/*
+ * Reads a consistent set of time-base values from hypervisor,
+ * into a shadow data area.
+ */
+static void hvclock_get_time_values(struct hv_reference_tsc_page *shadow,
+				    struct hv_reference_tsc_page *page)
+{
+	int seq;
+	do {
+		seq = page->tsc_sequence;
+		rmb();		/* fetch version before data */
+		*shadow = *page;
+		rmb();		/* test version after fetching data */
+	} while (shadow->tsc_sequence != seq);
+}
+
+uint64_t hv_clock_read(void)
+{
+	struct hv_reference_tsc_page shadow;
+
+	hvclock_get_time_values(&shadow, hv_clock);
+	return hvclock_tsc_to_ticks(&shadow, rdtsc());
+}
+
+atomic_t cpus_left;
+bool ok[MAX_CPU];
+uint64_t loops[MAX_CPU];
+
+static void hv_clock_test(void *data)
+{
+	int i = smp_id();
+	uint64_t t = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+	uint64_t end = t + 3 * TICKS_PER_SEC;
+
+	ok[i] = true;
+	do {
+		uint64_t now = hv_clock_read();
+		if (now < t) {
+			printf("warp on CPU %d!\n", smp_id());
+			ok[i] = false;
+			break;
+		}
+		t = now;
+	} while(t < end);
+
+	barrier();
+	if (t >= end) {
+		int64_t ref = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+		if (i == 0)
+			printf("Time reference MSR drift: %lld\n\n", ref - end);
+		ok[i] &= (ref - end) > -5 && (ref - end) < 5;
+	}
+
+	atomic_dec(&cpus_left);
+}
+
+static void check_test(int ncpus)
+{
+	int i;
+	bool pass;
+
+	atomic_set(&cpus_left, ncpus);
+	for (i = ncpus - 1; i >= 0; i--)
+		on_cpu_async(i, hv_clock_test, NULL);
+
+	/* Wait for the end of other vcpu */
+	while(atomic_read(&cpus_left))
+		;
+
+	pass = true;
+	for (i = ncpus - 1; i >= 0; i--)
+		pass &= ok[i];
+
+	report("TSC reference precision test", pass);
+}
+
+static void hv_perf_test(void *data)
+{
+	uint64_t t = hv_clock_read();
+	uint64_t end = t + 1000000000 / 100;
+	uint64_t local_loops = 0;
+
+	do {
+		t = hv_clock_read();
+		local_loops++;
+	} while(t < end);
+
+	loops[smp_id()] = local_loops;
+	atomic_dec(&cpus_left);
+}
+
+static void perf_test(int ncpus)
+{
+	int i;
+	uint64_t total_loops;
+
+	atomic_set(&cpus_left, ncpus);
+	for (i = ncpus - 1; i >= 0; i--)
+		on_cpu_async(i, hv_perf_test, NULL);
+
+	/* Wait for the end of other vcpu */
+	while(atomic_read(&cpus_left))
+		;
+
+	total_loops = 0;
+	for (i = ncpus - 1; i >= 0; i--)
+		total_loops += loops[i];
+	printf("iterations/sec:  %lld\n", total_loops / ncpus);
+}
+
+int main(int ac, char **av)
+{
+	int nerr = 0;
+	int ncpus;
+	struct hv_reference_tsc_page shadow;
+	uint64_t tsc1, t1, tsc2, t2;
+	uint64_t ref1, ref2;
+
+	setup_vm();
+	smp_init();
+
+	hv_clock = alloc_page();
+	wrmsr(HV_X64_MSR_REFERENCE_TSC, (u64)(uintptr_t)hv_clock | 1);
+	report("MSR value after enabling",
+	       rdmsr(HV_X64_MSR_REFERENCE_TSC) == ((u64)(uintptr_t)hv_clock | 1));
+
+	hvclock_get_time_values(&shadow, hv_clock);
+	if (shadow.tsc_sequence == 0 || shadow.tsc_sequence == 0xFFFFFFFF) {
+		printf("Reference TSC page not available\n");
+		exit(1);
+	}
+
+	printf("scale: %llx offset: %lld\n", shadow.tsc_scale, shadow.tsc_offset);
+	ref1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+	tsc1 = rdtsc();
+	t1 = hvclock_tsc_to_ticks(&shadow, tsc1);
+	printf("refcnt %lld, TSC %llx, TSC reference %lld\n",
+	       ref1, tsc1, t1);
+
+	do
+		ref2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+	while (ref2 < ref1 + 2 * TICKS_PER_SEC);
+
+	tsc2 = rdtsc();
+	t2 = hvclock_tsc_to_ticks(&shadow, tsc2);
+	printf("refcnt %lld (delta %lld), TSC %llx, TSC reference %lld (delta %lld)\n",
+	       ref2, ref2 - ref1, tsc2, t2, t2 - t1);
+
+	ncpus = cpu_count();
+	if (ncpus > MAX_CPU)
+		ncpus = MAX_CPU;
+
+	check_test(ncpus);
+	perf_test(ncpus);
+
+	wrmsr(HV_X64_MSR_REFERENCE_TSC, 0LL);
+	report("MSR value after disabling", rdmsr(HV_X64_MSR_REFERENCE_TSC) == 0);
+
+	return nerr > 0 ? 1 : 0;
+}
diff --git a/x86/unittests.cfg b/x86/unittests.cfg
index 99eff26..e981c00 100644
--- a/x86/unittests.cfg
+++ b/x86/unittests.cfg
@@ -188,3 +188,8 @@ extra_params = -cpu kvm64,hv_synic -device hyperv-testdev
 file = hyperv_stimer.flat
 smp = 2
 extra_params = -cpu kvm64,hv_time,hv_synic,hv_stimer -device hyperv-testdev
+
+[hyperv_clock]
+file = hyperv_clock.flat
+smp = 2
+extra_params = -cpu kvm64,hv_time
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-01-28 14:04 [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case Paolo Bonzini
@ 2016-01-28 14:04 ` Paolo Bonzini
  2016-01-28 14:25 ` Andrey Smetanin
  2016-01-28 16:22 ` Roman Kagan
  2 siblings, 0 replies; 36+ messages in thread
From: Paolo Bonzini @ 2016-01-28 14:04 UTC (permalink / raw)
  To: kvm

The test checks the relative precision of the reference TSC page
and the time reference counter.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 config/config-x86-common.mak |   2 +
 config/config-x86_64.mak     |   1 +
 x86/hyperv.h                 |   9 ++
 x86/hyperv_clock.c           | 193 +++++++++++++++++++++++++++++++++++++++++++
 x86/unittests.cfg            |   5 ++
 5 files changed, 210 insertions(+)
 create mode 100644 x86/hyperv_clock.c

diff --git a/config/config-x86-common.mak b/config/config-x86-common.mak
index 72b95e3..621413b 100644
--- a/config/config-x86-common.mak
+++ b/config/config-x86-common.mak
@@ -119,6 +119,8 @@ $(TEST_DIR)/hyperv_synic.elf: $(cstart.o) $(TEST_DIR)/hyperv.o \
 $(TEST_DIR)/hyperv_stimer.elf: $(cstart.o) $(TEST_DIR)/hyperv.o \
                                $(TEST_DIR)/hyperv_stimer.o
 
+$(TEST_DIR)/hyperv_clock.elf: $(cstart.o) $(TEST_DIR)/hyperv.o
+
 arch_clean:
 	$(RM) $(TEST_DIR)/*.o $(TEST_DIR)/*.flat $(TEST_DIR)/*.elf \
 	$(TEST_DIR)/.*.d lib/x86/.*.d
diff --git a/config/config-x86_64.mak b/config/config-x86_64.mak
index 1764701..634d09d 100644
--- a/config/config-x86_64.mak
+++ b/config/config-x86_64.mak
@@ -12,5 +12,6 @@ tests = $(TEST_DIR)/access.flat $(TEST_DIR)/apic.flat \
 tests += $(TEST_DIR)/svm.flat
 tests += $(TEST_DIR)/vmx.flat
 tests += $(TEST_DIR)/tscdeadline_latency.flat
+tests += $(TEST_DIR)/hyperv_clock.flat
 
 include config/config-x86-common.mak
diff --git a/x86/hyperv.h b/x86/hyperv.h
index faf931b..974df56 100644
--- a/x86/hyperv.h
+++ b/x86/hyperv.h
@@ -12,6 +12,7 @@
 #define HV_X64_MSR_SYNTIMER_AVAILABLE           (1 << 3)
 
 #define HV_X64_MSR_TIME_REF_COUNT               0x40000020
+#define HV_X64_MSR_REFERENCE_TSC                0x40000021
 
 /* Define synthetic interrupt controller model specific registers. */
 #define HV_X64_MSR_SCONTROL                     0x40000080
@@ -180,4 +181,12 @@ void synic_sint_create(int vcpu, int sint, int vec, bool auto_eoi);
 void synic_sint_set(int vcpu, int sint);
 void synic_sint_destroy(int vcpu, int sint);
 
+struct hv_reference_tsc_page {
+        uint32_t tsc_sequence;
+        uint32_t res1;
+        uint64_t tsc_scale;
+        int64_t tsc_offset;
+};
+
+
 #endif
diff --git a/x86/hyperv_clock.c b/x86/hyperv_clock.c
new file mode 100644
index 0000000..680ba7c
--- /dev/null
+++ b/x86/hyperv_clock.c
@@ -0,0 +1,193 @@
+#include "libcflat.h"
+#include "smp.h"
+#include "atomic.h"
+#include "processor.h"
+#include "hyperv.h"
+#include "vm.h"
+
+#define MAX_CPU 4
+#define TICKS_PER_SEC (1000000000 / 100)
+
+struct hv_reference_tsc_page *hv_clock;
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u64 mul_frac)
+{
+	u64 product, unused;
+
+	__asm__ (
+		"mul %3"
+		: "=d" (product), "=a" (unused) : "1" (delta), "rm" ((u64)mul_frac) );
+
+	return product;
+}
+
+static u64 hvclock_tsc_to_ticks(struct hv_reference_tsc_page *shadow, uint64_t tsc)
+{
+	u64 delta = tsc;
+	return scale_delta(delta, shadow->tsc_scale) + shadow->tsc_offset;
+}
+
+/*
+ * Reads a consistent set of time-base values from hypervisor,
+ * into a shadow data area.
+ */
+static void hvclock_get_time_values(struct hv_reference_tsc_page *shadow,
+				    struct hv_reference_tsc_page *page)
+{
+	int seq;
+	do {
+		seq = page->tsc_sequence;
+		rmb();		/* fetch version before data */
+		*shadow = *page;
+		rmb();		/* test version after fetching data */
+	} while (shadow->tsc_sequence != seq);
+}
+
+uint64_t hv_clock_read(void)
+{
+	struct hv_reference_tsc_page shadow;
+
+	hvclock_get_time_values(&shadow, hv_clock);
+	return hvclock_tsc_to_ticks(&shadow, rdtsc());
+}
+
+atomic_t cpus_left;
+bool ok[MAX_CPU];
+uint64_t loops[MAX_CPU];
+
+static void hv_clock_test(void *data)
+{
+	int i = smp_id();
+	uint64_t t = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+	uint64_t end = t + 3 * TICKS_PER_SEC;
+
+	ok[i] = true;
+	do {
+		uint64_t now = hv_clock_read();
+		if (now < t) {
+			printf("warp on CPU %d!\n", smp_id());
+			ok[i] = false;
+			break;
+		}
+		t = now;
+	} while(t < end);
+
+	barrier();
+	if (t >= end) {
+		int64_t ref = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+		if (i == 0)
+			printf("Time reference MSR drift: %lld\n\n", ref - end);
+		ok[i] &= (ref - end) > -5 && (ref - end) < 5;
+	}
+
+	atomic_dec(&cpus_left);
+}
+
+static void check_test(int ncpus)
+{
+	int i;
+	bool pass;
+
+	atomic_set(&cpus_left, ncpus);
+	for (i = ncpus - 1; i >= 0; i--)
+		on_cpu_async(i, hv_clock_test, NULL);
+
+	/* Wait for the end of other vcpu */
+	while(atomic_read(&cpus_left))
+		;
+
+	pass = true;
+	for (i = ncpus - 1; i >= 0; i--)
+		pass &= ok[i];
+
+	report("TSC reference precision test", pass);
+}
+
+static void hv_perf_test(void *data)
+{
+	uint64_t t = hv_clock_read();
+	uint64_t end = t + 1000000000 / 100;
+	uint64_t local_loops = 0;
+
+	do {
+		t = hv_clock_read();
+		local_loops++;
+	} while(t < end);
+
+	loops[smp_id()] = local_loops;
+	atomic_dec(&cpus_left);
+}
+
+static void perf_test(int ncpus)
+{
+	int i;
+	uint64_t total_loops;
+
+	atomic_set(&cpus_left, ncpus);
+	for (i = ncpus - 1; i >= 0; i--)
+		on_cpu_async(i, hv_perf_test, NULL);
+
+	/* Wait for the end of other vcpu */
+	while(atomic_read(&cpus_left))
+		;
+
+	total_loops = 0;
+	for (i = ncpus - 1; i >= 0; i--)
+		total_loops += loops[i];
+	printf("iterations/sec:  %lld\n", total_loops / ncpus);
+}
+
+int main(int ac, char **av)
+{
+	int nerr = 0;
+	int ncpus;
+	struct hv_reference_tsc_page shadow;
+	uint64_t tsc1, t1, tsc2, t2;
+	uint64_t ref1, ref2;
+
+	setup_vm();
+	smp_init();
+
+	hv_clock = alloc_page();
+	wrmsr(HV_X64_MSR_REFERENCE_TSC, (u64)(uintptr_t)hv_clock | 1);
+	report("MSR value after enabling",
+	       rdmsr(HV_X64_MSR_REFERENCE_TSC) == ((u64)(uintptr_t)hv_clock | 1));
+
+	hvclock_get_time_values(&shadow, hv_clock);
+	if (shadow.tsc_sequence == 0 || shadow.tsc_sequence == 0xFFFFFFFF) {
+		printf("Reference TSC page not available\n");
+		exit(1);
+	}
+
+	printf("scale: %llx offset: %lld\n", shadow.tsc_scale, shadow.tsc_offset);
+	ref1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+	tsc1 = rdtsc();
+	t1 = hvclock_tsc_to_ticks(&shadow, tsc1);
+	printf("refcnt %lld, TSC %llx, TSC reference %lld\n",
+	       ref1, tsc1, t1);
+
+	do
+		ref2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+	while (ref2 < ref1 + 2 * TICKS_PER_SEC);
+
+	tsc2 = rdtsc();
+	t2 = hvclock_tsc_to_ticks(&shadow, tsc2);
+	printf("refcnt %lld (delta %lld), TSC %llx, TSC reference %lld (delta %lld)\n",
+	       ref2, ref2 - ref1, tsc2, t2, t2 - t1);
+
+	ncpus = cpu_count();
+	if (ncpus > MAX_CPU)
+		ncpus = MAX_CPU;
+
+	check_test(ncpus);
+	perf_test(ncpus);
+
+	wrmsr(HV_X64_MSR_REFERENCE_TSC, 0LL);
+	report("MSR value after disabling", rdmsr(HV_X64_MSR_REFERENCE_TSC) == 0);
+
+	return nerr > 0 ? 1 : 0;
+}
diff --git a/x86/unittests.cfg b/x86/unittests.cfg
index 99eff26..e981c00 100644
--- a/x86/unittests.cfg
+++ b/x86/unittests.cfg
@@ -188,3 +188,8 @@ extra_params = -cpu kvm64,hv_synic -device hyperv-testdev
 file = hyperv_stimer.flat
 smp = 2
 extra_params = -cpu kvm64,hv_time,hv_synic,hv_stimer -device hyperv-testdev
+
+[hyperv_clock]
+file = hyperv_clock.flat
+smp = 2
+extra_params = -cpu kvm64,hv_time
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-01-28 14:04 [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case Paolo Bonzini
  2016-01-28 14:04 ` Paolo Bonzini
@ 2016-01-28 14:25 ` Andrey Smetanin
  2016-01-28 14:50   ` Paolo Bonzini
  2016-01-28 16:22 ` Roman Kagan
  2 siblings, 1 reply; 36+ messages in thread
From: Andrey Smetanin @ 2016-01-28 14:25 UTC (permalink / raw)
  To: Paolo Bonzini, kvm; +Cc: Roman Kagan, Denis V. Lunev



On 01/28/2016 05:04 PM, Paolo Bonzini wrote:
> The test checks the relative precision of the reference TSC page
> and the time reference counter.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> 	Andrey, the test has a relative error of approximately 3 parts
> 	per million on my machine.  In other words it drifts by 3
> 	microseconds every second, which I don't think is acceptable.
> 	The problem is that virtual_tsc_khz is 2593993 while the actual
> 	frequency is more like 2594001 kHz.
>
> 	But I have a problem in general with using tsc_khz to compute
> 	the hyperv reference clock scale.  The maximum possible accuracy
> 	on a 2.5 GHz machine is around 200 ppb, corresponding to a drift
> 	of about 16 milliseconds per day.  Perhaps we could use Linux
> 	(and kvmclock's) timekeeping parameters to derive the tsc_scale
> 	and tsc_offset?
Probably need to check the code. But can we just work around it
with a host periodic timer with a period of ~10ms which will issue (by scheduling 
vcpu KVM_REQ_*) a refresh of the guest tsc page to prevent drifting?
>
>   config/config-x86-common.mak |   2 +
>   config/config-x86_64.mak     |   1 +
>   x86/hyperv.h                 |   9 ++
>   x86/hyperv_clock.c           | 193 +++++++++++++++++++++++++++++++++++++++++++
>   x86/unittests.cfg            |   5 ++
>   5 files changed, 210 insertions(+)
>   create mode 100644 x86/hyperv_clock.c
>
> diff --git a/config/config-x86-common.mak b/config/config-x86-common.mak
> index 72b95e3..621413b 100644
> --- a/config/config-x86-common.mak
> +++ b/config/config-x86-common.mak
> @@ -119,6 +119,8 @@ $(TEST_DIR)/hyperv_synic.elf: $(cstart.o) $(TEST_DIR)/hyperv.o \
>   $(TEST_DIR)/hyperv_stimer.elf: $(cstart.o) $(TEST_DIR)/hyperv.o \
>                                  $(TEST_DIR)/hyperv_stimer.o
>
> +$(TEST_DIR)/hyperv_clock.elf: $(cstart.o) $(TEST_DIR)/hyperv.o
> +
>   arch_clean:
>   	$(RM) $(TEST_DIR)/*.o $(TEST_DIR)/*.flat $(TEST_DIR)/*.elf \
>   	$(TEST_DIR)/.*.d lib/x86/.*.d
> diff --git a/config/config-x86_64.mak b/config/config-x86_64.mak
> index 1764701..634d09d 100644
> --- a/config/config-x86_64.mak
> +++ b/config/config-x86_64.mak
> @@ -12,5 +12,6 @@ tests = $(TEST_DIR)/access.flat $(TEST_DIR)/apic.flat \
>   tests += $(TEST_DIR)/svm.flat
>   tests += $(TEST_DIR)/vmx.flat
>   tests += $(TEST_DIR)/tscdeadline_latency.flat
> +tests += $(TEST_DIR)/hyperv_clock.flat
>
>   include config/config-x86-common.mak
> diff --git a/x86/hyperv.h b/x86/hyperv.h
> index faf931b..974df56 100644
> --- a/x86/hyperv.h
> +++ b/x86/hyperv.h
> @@ -12,6 +12,7 @@
>   #define HV_X64_MSR_SYNTIMER_AVAILABLE           (1 << 3)
>
>   #define HV_X64_MSR_TIME_REF_COUNT               0x40000020
> +#define HV_X64_MSR_REFERENCE_TSC                0x40000021
>
>   /* Define synthetic interrupt controller model specific registers. */
>   #define HV_X64_MSR_SCONTROL                     0x40000080
> @@ -180,4 +181,12 @@ void synic_sint_create(int vcpu, int sint, int vec, bool auto_eoi);
>   void synic_sint_set(int vcpu, int sint);
>   void synic_sint_destroy(int vcpu, int sint);
>
> +struct hv_reference_tsc_page {
> +        uint32_t tsc_sequence;
> +        uint32_t res1;
> +        uint64_t tsc_scale;
> +        int64_t tsc_offset;
> +};
> +
> +
>   #endif
> diff --git a/x86/hyperv_clock.c b/x86/hyperv_clock.c
> new file mode 100644
> index 0000000..680ba7c
> --- /dev/null
> +++ b/x86/hyperv_clock.c
> @@ -0,0 +1,193 @@
> +#include "libcflat.h"
> +#include "smp.h"
> +#include "atomic.h"
> +#include "processor.h"
> +#include "hyperv.h"
> +#include "vm.h"
> +
> +#define MAX_CPU 4
> +#define TICKS_PER_SEC (1000000000 / 100)
> +
> +struct hv_reference_tsc_page *hv_clock;
> +
> +/*
> + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
> + * yielding a 64-bit result.
> + */
> +static inline u64 scale_delta(u64 delta, u64 mul_frac)
> +{
> +	u64 product, unused;
> +
> +	__asm__ (
> +		"mul %3"
> +		: "=d" (product), "=a" (unused) : "1" (delta), "rm" ((u64)mul_frac) );
> +
> +	return product;
> +}
> +
> +static u64 hvclock_tsc_to_ticks(struct hv_reference_tsc_page *shadow, uint64_t tsc)
> +{
> +	u64 delta = tsc;
> +	return scale_delta(delta, shadow->tsc_scale) + shadow->tsc_offset;
> +}
> +
> +/*
> + * Reads a consistent set of time-base values from hypervisor,
> + * into a shadow data area.
> + */
> +static void hvclock_get_time_values(struct hv_reference_tsc_page *shadow,
> +				    struct hv_reference_tsc_page *page)
> +{
> +	int seq;
> +	do {
> +		seq = page->tsc_sequence;
> +		rmb();		/* fetch version before data */
> +		*shadow = *page;
> +		rmb();		/* test version after fetching data */
> +	} while (shadow->tsc_sequence != seq);
> +}
> +
> +uint64_t hv_clock_read(void)
> +{
> +	struct hv_reference_tsc_page shadow;
> +
> +	hvclock_get_time_values(&shadow, hv_clock);
> +	return hvclock_tsc_to_ticks(&shadow, rdtsc());
> +}
> +
> +atomic_t cpus_left;
> +bool ok[MAX_CPU];
> +uint64_t loops[MAX_CPU];
> +
> +static void hv_clock_test(void *data)
> +{
> +	int i = smp_id();
> +	uint64_t t = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
> +	uint64_t end = t + 3 * TICKS_PER_SEC;
> +
> +	ok[i] = true;
> +	do {
> +		uint64_t now = hv_clock_read();
> +		if (now < t) {
> +			printf("warp on CPU %d!\n", smp_id());
> +			ok[i] = false;
> +			break;
> +		}
> +		t = now;
> +	} while(t < end);
> +
> +	barrier();
> +	if (t >= end) {
> +		int64_t ref = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
> +		if (i == 0)
> +			printf("Time reference MSR drift: %lld\n\n", ref - end);
> +		ok[i] &= (ref - end) > -5 && (ref - end) < 5;
> +	}
> +
> +	atomic_dec(&cpus_left);
> +}
> +
> +static void check_test(int ncpus)
> +{
> +	int i;
> +	bool pass;
> +
> +	atomic_set(&cpus_left, ncpus);
> +	for (i = ncpus - 1; i >= 0; i--)
> +		on_cpu_async(i, hv_clock_test, NULL);
> +
> +	/* Wait for the end of other vcpu */
> +	while(atomic_read(&cpus_left))
> +		;
> +
> +	pass = true;
> +	for (i = ncpus - 1; i >= 0; i--)
> +		pass &= ok[i];
> +
> +	report("TSC reference precision test", pass);
> +}
> +
> +static void hv_perf_test(void *data)
> +{
> +	uint64_t t = hv_clock_read();
> +	uint64_t end = t + 1000000000 / 100;
> +	uint64_t local_loops = 0;
> +
> +	do {
> +		t = hv_clock_read();
> +		local_loops++;
> +	} while(t < end);
> +
> +	loops[smp_id()] = local_loops;
> +	atomic_dec(&cpus_left);
> +}
> +
> +static void perf_test(int ncpus)
> +{
> +	int i;
> +	uint64_t total_loops;
> +
> +	atomic_set(&cpus_left, ncpus);
> +	for (i = ncpus - 1; i >= 0; i--)
> +		on_cpu_async(i, hv_perf_test, NULL);
> +
> +	/* Wait for the end of other vcpu */
> +	while(atomic_read(&cpus_left))
> +		;
> +
> +	total_loops = 0;
> +	for (i = ncpus - 1; i >= 0; i--)
> +		total_loops += loops[i];
> +	printf("iterations/sec:  %lld\n", total_loops / ncpus);
> +}
> +
> +int main(int ac, char **av)
> +{
> +	int nerr = 0;
> +	int ncpus;
> +	struct hv_reference_tsc_page shadow;
> +	uint64_t tsc1, t1, tsc2, t2;
> +	uint64_t ref1, ref2;
> +
> +	setup_vm();
> +	smp_init();
> +
> +	hv_clock = alloc_page();
> +	wrmsr(HV_X64_MSR_REFERENCE_TSC, (u64)(uintptr_t)hv_clock | 1);
> +	report("MSR value after enabling",
> +	       rdmsr(HV_X64_MSR_REFERENCE_TSC) == ((u64)(uintptr_t)hv_clock | 1));
> +
> +	hvclock_get_time_values(&shadow, hv_clock);
> +	if (shadow.tsc_sequence == 0 || shadow.tsc_sequence == 0xFFFFFFFF) {
> +		printf("Reference TSC page not available\n");
> +		exit(1);
> +	}
> +
> +	printf("scale: %llx offset: %lld\n", shadow.tsc_scale, shadow.tsc_offset);
> +	ref1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
> +	tsc1 = rdtsc();
> +	t1 = hvclock_tsc_to_ticks(&shadow, tsc1);
> +	printf("refcnt %lld, TSC %llx, TSC reference %lld\n",
> +	       ref1, tsc1, t1);
> +
> +	do
> +		ref2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
> +	while (ref2 < ref1 + 2 * TICKS_PER_SEC);
> +
> +	tsc2 = rdtsc();
> +	t2 = hvclock_tsc_to_ticks(&shadow, tsc2);
> +	printf("refcnt %lld (delta %lld), TSC %llx, TSC reference %lld (delta %lld)\n",
> +	       ref2, ref2 - ref1, tsc2, t2, t2 - t1);
> +
> +	ncpus = cpu_count();
> +	if (ncpus > MAX_CPU)
> +		ncpus = MAX_CPU;
> +
> +	check_test(ncpus);
> +	perf_test(ncpus);
> +
> +	wrmsr(HV_X64_MSR_REFERENCE_TSC, 0LL);
> +	report("MSR value after disabling", rdmsr(HV_X64_MSR_REFERENCE_TSC) == 0);
> +
> +	return nerr > 0 ? 1 : 0;
> +}
> diff --git a/x86/unittests.cfg b/x86/unittests.cfg
> index 99eff26..e981c00 100644
> --- a/x86/unittests.cfg
> +++ b/x86/unittests.cfg
> @@ -188,3 +188,8 @@ extra_params = -cpu kvm64,hv_synic -device hyperv-testdev
>   file = hyperv_stimer.flat
>   smp = 2
>   extra_params = -cpu kvm64,hv_time,hv_synic,hv_stimer -device hyperv-testdev
> +
> +[hyperv_clock]
> +file = hyperv_clock.flat
> +smp = 2
> +extra_params = -cpu kvm64,hv_time
>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-01-28 14:25 ` Andrey Smetanin
@ 2016-01-28 14:50   ` Paolo Bonzini
  2016-01-28 15:53     ` Paolo Bonzini
  2016-01-28 18:53     ` Roman Kagan
  0 siblings, 2 replies; 36+ messages in thread
From: Paolo Bonzini @ 2016-01-28 14:50 UTC (permalink / raw)
  To: asmetanin, kvm; +Cc: Roman Kagan, Denis V. Lunev, Marcelo Tosatti



On 28/01/2016 15:25, Andrey Smetanin wrote:
>>
>>     Andrey, the test has a relative error of approximately 3 parts
>>     per million on my machine.  In other words it drifts by 3
>>     microseconds every second, which I don't think is acceptable.
>>     The problem is that virtual_tsc_khz is 2593993 while the actual
>>     frequency is more like 2594001 kHz.
>>
>>     But I have a problem in general with using tsc_khz to compute
>>     the hyperv reference clock scale.  The maximum possible accuracy
>>     on a 2.5 GHz machine is around 200 ppb, corresponding to a drift
>>     of about 16 milliseconds per day.  Perhaps we could use Linux
>>     (and kvmclock's) timekeeping parameters to derive the tsc_scale
>>     and tsc_offset?
>
> Probably need to check the code. But can we just work around it
> with a host periodic timer with a period of ~10ms which will issue (by scheduling
> vcpu KVM_REQ_*) a refresh of the guest tsc page to prevent drifting?

That would kill performance.

However, after a bit more testing, kvmclock seems to have the same
issue.  The formula to convert the kvmclock scale to the hyper-v one is

	tsc_to_nsec_system_mul * 2^(32 + tsc_shift)/100

which on my system gives 0xFCA52ED99999999; that's within 0.1 ppb of
what the Hyper-V code computes from a 2593993 kHz TSC.

So the patch seems okay, but we need to adjust kvm_guest_time_update to
support the Hyper-V clock.  In kvm_guest_time_update we can compute a
new scale and offset; if the scale changes, we set sequence to 0.  If
the new offset is such that the clock would go backwards, we set
sequence to 0.  If the scale remains the same and the jump is forwards,
we write a new offset in the Hyper-V TSC reference page.

If you want to prepare a patch to do this, that would be great.  (Hint
for debugging: you can define a random MSR that does the offset update,
and call it from the testcase).

Still, the drift that I get on my machine is really excessive, and I
should look into it more.  Marcelo, does it ring a bell?  Any reason why
get_kernel_ns() would be more precise than kvmclock?  The computer has
been up only for about 25 hours.

Paolo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-01-28 14:50   ` Paolo Bonzini
@ 2016-01-28 15:53     ` Paolo Bonzini
  2016-01-28 18:45       ` Roman Kagan
  2016-01-28 18:53     ` Roman Kagan
  1 sibling, 1 reply; 36+ messages in thread
From: Paolo Bonzini @ 2016-01-28 15:53 UTC (permalink / raw)
  To: asmetanin, kvm; +Cc: Roman Kagan, Denis V. Lunev, Marcelo Tosatti



On 28/01/2016 15:50, Paolo Bonzini wrote:
> 
> 
> On 28/01/2016 15:25, Andrey Smetanin wrote:
>>>
>>>     Andrey, the test has a relative error of approximately 3 parts
>>>     per million on my machine.  In other words it drifts by 3
>>>     microseconds every second, which I don't think is acceptable.
>>>     The problem is that virtual_tsc_khz is 2593993 while the actual
>>>     frequency is more like 2594001 kHz.
>>>
>>>     But I have a problem in general with using tsc_khz to compute
>>>     the hyperv reference clock scale.  The maximum possible accuracy
>>>     on a 2.5 GHz machine is around 200 ppb, corresponding to a drift
>>>     of about 16 milliseconds per day.  Perhaps we could use Linux
>>>     (and kvmclock's) timekeeping parameters to derive the tsc_scale
>>>     and tsc_offset?
>>
>> Probably need to check the code. But can we just workaround it
>> by host periodic timer with period ~10ms which will issue(by scheduling
>> vcpu KVM_REQ_*) refresh of guest tsc page to prevent drifting?
> 
> That would kill performance.
> 
> However, after a bit more testing, kvmclock seems to have the same
> issue.  The formula to convert the kvmclock scale to the hyper-v one is
> 
> 	tsc_to_nsec_system_mul * 2^(32 + tsc_shift)/100
> 
> which on my system gives 0xFCA52ED99999999; that's within 0.1 ppb of
> what the Hyper-V code computes from a 2593993 kHz TSC.
> 
> So the patch seems okay, but we need to adjust kvm_guest_time_update to
> support the Hyper-V clock.  In kvm_guest_time_update we can compute a
> new scale and offset; if the scale changes, we set sequence to 0.  If
> the new offset is such that the clock would go backwards, we set
> sequence to 0.

... No, that doesn't work.  The clock would go backwards because the
TIME_REF_COUNT MSR is counting more slowly than the TSC-based clock.

I guess what we can do is to compute the TSC reference page parameters
as soon as the master clock is activated, and use them instead of
get_kernel_ns() to compute the reference count MSR (again if the master
clock is activated).  But I'm now a bit wary to enable the hyper-v TSC
reference page without knowing the effects of a divergence between the
two mechanisms.  Especially if a migration could lead to a switch from
one to the other and thus to the time going backwards.

Paolo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-01-28 14:04 [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case Paolo Bonzini
  2016-01-28 14:04 ` Paolo Bonzini
  2016-01-28 14:25 ` Andrey Smetanin
@ 2016-01-28 16:22 ` Roman Kagan
  2016-02-03 16:37   ` Paolo Bonzini
  2 siblings, 1 reply; 36+ messages in thread
From: Roman Kagan @ 2016-01-28 16:22 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, Andrey Smetanin, Denis V. Lunev

On Thu, Jan 28, 2016 at 03:04:58PM +0100, Paolo Bonzini wrote:
> The test checks the relative precision of the reference TSC page
> and the time reference counter.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> 	Andrey, the test has a relative error of approximately 3 parts
> 	per million on my machine.  In other words it drifts by 3
> 	microseconds every second, which I don't think is acceptable.
> 	The problem is that virtual_tsc_khz is 2593993 while the actual
> 	frequency is more like 2594001 kHz.

Hmm, how come?  I thought virtual_tsc_khz could only diverge from
tsc_khz on migration.

[Or maybe I just misunderstand what you mean by "the actual frequency".]

Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-01-28 15:53     ` Paolo Bonzini
@ 2016-01-28 18:45       ` Roman Kagan
  0 siblings, 0 replies; 36+ messages in thread
From: Roman Kagan @ 2016-01-28 18:45 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: asmetanin, kvm, Denis V. Lunev, Marcelo Tosatti

On Thu, Jan 28, 2016 at 04:53:31PM +0100, Paolo Bonzini wrote:
> I guess what we can do is to compute the TSC reference page parameters
> as soon as the master clock is activated, and use them instead of
> get_kernel_ns() to compute the reference count MSR (again if the master
> clock is activated).  But I'm now a bit wary to enable the hyper-v TSC
> reference page without knowing the effects of a divergence between the
> two mechanisms.  Especially if a migration could lead to a switch from
> one to the other and thus to the time going backwards.

IIUC Hyper-V treats the TSC reference page as an alternative and better
way to obtain that very same time reference counter.  So no, we may not
allow the two to diverge IMO.

Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-01-28 14:50   ` Paolo Bonzini
  2016-01-28 15:53     ` Paolo Bonzini
@ 2016-01-28 18:53     ` Roman Kagan
  2016-01-28 21:28       ` Paolo Bonzini
  1 sibling, 1 reply; 36+ messages in thread
From: Roman Kagan @ 2016-01-28 18:53 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: asmetanin, kvm, Denis V. Lunev, Marcelo Tosatti

On Thu, Jan 28, 2016 at 03:50:30PM +0100, Paolo Bonzini wrote:
> Still, the drift that I get on my machine is really excessive, and I
> should look into it more.  Marcelo, does it ring a bell?  Any reason why
> get_kernel_ns() would be more precise than kvmclock?  The computer has
> been up only for about 25 hours.

Are you running an ntp client on the host by chance?

get_kernel_ns() goes through timekeeper, and it may have slightly
different idea of the tsc rate than tsc_khz calculated early at boot.

I can't run a test myself ATM, but I'd try to with ntp disabled and see
if it makes a difference.

Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-01-28 18:53     ` Roman Kagan
@ 2016-01-28 21:28       ` Paolo Bonzini
  0 siblings, 0 replies; 36+ messages in thread
From: Paolo Bonzini @ 2016-01-28 21:28 UTC (permalink / raw)
  To: Roman Kagan, asmetanin, kvm, Denis V. Lunev, Marcelo Tosatti



On 28/01/2016 19:53, Roman Kagan wrote:
>> > Still, the drift that I get on my machine is really excessive, and I
>> > should look into it more.  Marcelo, does it ring a bell?  Any reason why
>> > get_kernel_ns() would be more precise than kvmclock?  The computer has
>> > been up only for about 25 hours.
> Are you running an ntp client on the host by chance?
> 
> get_kernel_ns() goes through timekeeper, and it may have slightly
> different idea of the tsc rate than tsc_khz calculated early at boot.

I get the same result both with and without chrony running (though in
one case the actual tsc_khz is greater than the initial measurement, and
in the other case it is smaller).

Paolo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-01-28 16:22 ` Roman Kagan
@ 2016-02-03 16:37   ` Paolo Bonzini
  2016-02-04  9:33     ` Roman Kagan
  2016-04-21 17:01     ` Roman Kagan
  0 siblings, 2 replies; 36+ messages in thread
From: Paolo Bonzini @ 2016-02-03 16:37 UTC (permalink / raw)
  To: Roman Kagan, kvm, Andrey Smetanin, Denis V. Lunev, Marcelo Tosatti



On 28/01/2016 17:22, Roman Kagan wrote:
> On Thu, Jan 28, 2016 at 03:04:58PM +0100, Paolo Bonzini wrote:
>> The test checks the relative precision of the reference TSC page
>> and the time reference counter.
>>
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> ---
>> 	Andrey, the test has a relative error of approximately 3 parts
>> 	per million on my machine.  In other words it drifts by 3
>> 	microseconds every second, which I don't think is acceptable.
>> 	The problem is that virtual_tsc_khz is 2593993 while the actual
>> 	frequency is more like 2594001 kHz.
> 
> Hmm, how come?  I thought virtual_tsc_khz could only diverge from
> tsc_khz on migration.
> 
> [Or maybe I just misunderstand what you mean by "the actual frequency".]

Talking to Marcelo helped me understand it. :)

The issue is that the TSC kHz is not the correct value to use for the  
the TSC page.  Instead, we want to pass to the guest a scale value 
corresponding to the kernel's timekeeping multiplier.  This is what 
both kvmclock and the time reference counter use.

The bit that I was missing last week was how the guest could perceive
updates to the TSC page parameters as atomic.  We actually _can_
emulate seqlock-like behavior in Hyper-V by writing 0 to seq during
an update.  Instead of looping like seqlocks do, the guest will simply
use the time reference counter and take a hypervisor exit.  The result
however is still valid, because we want the time reference counter to
be perfectly synchronized with the Hyper-V clock and lets you handle
any kvmclock update scenario safely.

Therefore the algorithm would be like patch 2/2 I sent, but with
important differences.  You'd still

	if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

on MSR writes, but then hook into kvm_guest_time_update rather than
kvm_gen_update_masterclock, like:

	if (v == kvm_get_vcpu(v->kvm, 0))
	        kvm_write_hyperv_tsc_page(v->kvm, &vcpu->hv_clock);

and in kvm_write_hyperv_tsc_page the calculation would be based on
the kvmclock parameters:

        {
                write 0 to seq;
                if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
                        return;

                compute scale and offset from hv_clock mul and shift;
                write scale and offset;
		write sequence
        }

all of scale, offset and sequence can be computed from kvmclock parameters.

For sequence we have to convert "even values between 0 and 0xFFFFFFFE"
into "values between 1 and 0xFFFFFFFE".  For example:

	hyper-v sequence = (kvmclock sequence >> 1) + 1

will do it.  Computing the scale and offset is something like:

        // KVM formula:
	//    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul << tsc_shift + system_time
	//
	// hyper-v formula:
	//    nsec/100 = ticks * scale / 2^32 + offset
	//
	// when tsc_timestamp, offset is zero so remove them both:
        //    ticks * tsc_to_system_mul << tsc_shift / 100 = ticks * scale / 2^32
	//
	// multiply both sides by 2^32 / ticks and you get scale:
	//    scale = tsc_to_system_mul << (32 + tsc_shift) / 100
	//
	// check if it would overflow, and then use the time ref counter
	//    tsc_to_system_mul << (32 + tsc_shift) / 100 >= 2^32
	//    tsc_to_system_mul << 32 >= 100 * 2^32 << -tsc_shift
	//    tsc_to_system_mul >= 100 << -tsc_shift

	if (shift < 0)
		rhs = 100 << -shift;
	else
		rhs = 100 >> shift;

	if (tsc_to_system_mul >= rhs)
		return;

        scale = mul_u64_u32_div(1ULL << (32 + tsc_shift), tsc_to_system_mul, 100);

        // now expand the kvmclock formula:
        //    nsec = ticks * tsc_to_system_mul << tsc_shift - (tsc_timestamp * tsc_to_system_mul << tsc_shift) + system_time 
	// divide by 100:
	//    nsec/100 = ticks * tsc_to_system_mul << tsc_shift /100 - (tsc_timestamp * tsc_to_system_mul << tsc_shift) /100 + system_time /100
	// replace tsc_to_system_mul << tsc_shift /100 by scale / 2^32:
        //    nsec/100 = ticks * scale / 2^32 - (tsc_timestamp * scale / 2^32) + system_time / 100
	// comparing with the Hyper-V formula:
	//    offset = system_time / 100 - (tsc_timestamp * scale / 2^32)

	offset = system_time;
	do_div(offset, 100);

	timestamp_offset = tsc_timestamp;
	do_div32_shl32(timestamp_offset, scale);
        offset = offset - timestamp_offset;

The remaining part is _further_ adjusting the offset to
Would you like to finish implementing this and test it with the unit test?
kvm/queue already has the patch to introduce do_div32_shl32.

Paolo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-02-03 16:37   ` Paolo Bonzini
@ 2016-02-04  9:33     ` Roman Kagan
  2016-02-04 10:13       ` Paolo Bonzini
  2016-04-21 17:01     ` Roman Kagan
  1 sibling, 1 reply; 36+ messages in thread
From: Roman Kagan @ 2016-02-04  9:33 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, Andrey Smetanin, Denis V. Lunev, Marcelo Tosatti

On Wed, Feb 03, 2016 at 05:37:08PM +0100, Paolo Bonzini wrote:
> On 28/01/2016 17:22, Roman Kagan wrote:
> > On Thu, Jan 28, 2016 at 03:04:58PM +0100, Paolo Bonzini wrote:
> >> The test checks the relative precision of the reference TSC page
> >> and the time reference counter.
> >>
> >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> >> ---
> >> 	Andrey, the test has a relative error of approximately 3 parts
> >> 	per million on my machine.  In other words it drifts by 3
> >> 	microseconds every second, which I don't think is acceptable.
> >> 	The problem is that virtual_tsc_khz is 2593993 while the actual
> >> 	frequency is more like 2594001 kHz.
> > 
> > Hmm, how come?  I thought virtual_tsc_khz could only diverge from
> > tsc_khz on migration.
> > 
> > [Or maybe I just misunderstand what you mean by "the actual frequency".]
> 
> Talking to Marcelo helped me understand it. :)
> 
> The issue is that the TSC kHz is not the correct value to use for the  
> the TSC page.  Instead, we want to pass to the guest a scale value 
> corresponding to the kernel's timekeeping multiplier.  This is what 
> both kvmclock and the time reference counter use.

Indeed.

> The bit that I was missing last week was how the guest could perceive
> updates to the TSC page parameters as atomic.  We actually _can_
> emulate seqlock-like behavior in Hyper-V by writing 0 to seq during
> an update.  Instead of looping like seqlocks do, the guest will simply
> use the time reference counter and take a hypervisor exit.  The result
> however is still valid, because we want the time reference counter to
> be perfectly synchronized with the Hyper-V clock and lets you handle
> any kvmclock update scenario safely.

That's smart, thanks for the idea!  We'll only need to check that
Windows goes through all these steps on every clock read, rather than
remembering the failure and sticking with the MSR for good.

> Therefore the algorithm would be like patch 2/2 I sent, but with
> important differences.  You'd still
> 
> 	if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
> 		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
> 
> on MSR writes, but then hook into kvm_guest_time_update rather than
> kvm_gen_update_masterclock, like:
> 
> 	if (v == kvm_get_vcpu(v->kvm, 0))
> 	        kvm_write_hyperv_tsc_page(v->kvm, &vcpu->hv_clock);
> 
> and in kvm_write_hyperv_tsc_page the calculation would be based on
> the kvmclock parameters:
> 
>         {
>                 write 0 to seq;
>                 if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
>                         return;
> 
>                 compute scale and offset from hv_clock mul and shift;
>                 write scale and offset;
> 		write sequence
>         }
> 
> all of scale, offset and sequence can be computed from kvmclock parameters.
> 
> For sequence we have to convert "even values between 0 and 0xFFFFFFFE"
> into "values between 1 and 0xFFFFFFFE".  For example:
> 
> 	hyper-v sequence = (kvmclock sequence >> 1) + 1
> 
> will do it.  Computing the scale and offset is something like:
> 
>         // KVM formula:
> 	//    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul << tsc_shift + system_time
> 	//
> 	// hyper-v formula:
> 	//    nsec/100 = ticks * scale / 2^32 + offset
> 	//
> 	// when tsc_timestamp, offset is zero so remove them both:
>         //    ticks * tsc_to_system_mul << tsc_shift / 100 = ticks * scale / 2^32
> 	//
> 	// multiply both sides by 2^32 / ticks and you get scale:
> 	//    scale = tsc_to_system_mul << (32 + tsc_shift) / 100
> 	//
> 	// check if it would overflow, and then use the time ref counter
> 	//    tsc_to_system_mul << (32 + tsc_shift) / 100 >= 2^32
> 	//    tsc_to_system_mul << 32 >= 100 * 2^32 << -tsc_shift
> 	//    tsc_to_system_mul >= 100 << -tsc_shift
> 
> 	if (shift < 0)
> 		rhs = 100 << -shift;
> 	else
> 		rhs = 100 >> shift;
> 
> 	if (tsc_to_system_mul >= rhs)
> 		return;
> 
>         scale = mul_u64_u32_div(1ULL << (32 + tsc_shift), tsc_to_system_mul, 100);
> 
>         // now expand the kvmclock formula:
>         //    nsec = ticks * tsc_to_system_mul << tsc_shift - (tsc_timestamp * tsc_to_system_mul << tsc_shift) + system_time 
> 	// divide by 100:
> 	//    nsec/100 = ticks * tsc_to_system_mul << tsc_shift /100 - (tsc_timestamp * tsc_to_system_mul << tsc_shift) /100 + system_time /100
> 	// replace tsc_to_system_mul << tsc_shift /100 by scale / 2^32:
>         //    nsec/100 = ticks * scale / 2^32 - (tsc_timestamp * scale / 2^32) + system_time / 100
> 	// comparing with the Hyper-V formula:
> 	//    offset = system_time / 100 - (tsc_timestamp * scale / 2^32)
> 
> 	offset = system_time;
> 	do_div(offset, 100);
> 
> 	timestamp_offset = tsc_timestamp;
> 	do_div32_shl32(timestamp_offset, scale);
>         offset = offset - timestamp_offset;
> 
> The remaining part is _further_ adjusting the offset to

...?

> Would you like to finish implementing this and test it with the unit test?
> kvm/queue already has the patch to introduce do_div32_shl32.

We will but probably not at the top priority; our main focus ATM is
getting Hyper-V VMBus working.

Thanks,
Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-02-04  9:33     ` Roman Kagan
@ 2016-02-04 10:13       ` Paolo Bonzini
  2016-02-04 11:12         ` Roman Kagan
  0 siblings, 1 reply; 36+ messages in thread
From: Paolo Bonzini @ 2016-02-04 10:13 UTC (permalink / raw)
  To: Roman Kagan, kvm, Andrey Smetanin, Denis V. Lunev, Marcelo Tosatti



On 04/02/2016 10:33, Roman Kagan wrote:
>> The bit that I was missing last week was how the guest could perceive
>> updates to the TSC page parameters as atomic.  We actually _can_
>> emulate seqlock-like behavior in Hyper-V by writing 0 to seq during
>> an update.  Instead of looping like seqlocks do, the guest will simply
>> use the time reference counter and take a hypervisor exit.  The result
>> however is still valid, because we want the time reference counter to
>> be perfectly synchronized with the Hyper-V clock and lets you handle
>> any kvmclock update scenario safely.
> 
> That's smart, thanks for the idea!  We'll only need to check that
> Windows goes through all these steps on every clock read, rather than
> remembering the failure and sticking with the MSR for good.

Andrey pointed me to https://lkml.org/lkml/2015/11/2/655, which has the
source for Windows's clock read code.  It should work.

>> The remaining part is _further_ adjusting the offset to
> 
> ...?

Cut and paste. :)  I'll take a look at it if it's not super-high
priority for you.

Paolo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-02-04 10:13       ` Paolo Bonzini
@ 2016-02-04 11:12         ` Roman Kagan
  0 siblings, 0 replies; 36+ messages in thread
From: Roman Kagan @ 2016-02-04 11:12 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, Andrey Smetanin, Denis V. Lunev, Marcelo Tosatti

On Thu, Feb 04, 2016 at 11:13:58AM +0100, Paolo Bonzini wrote:
> On 04/02/2016 10:33, Roman Kagan wrote:
> >> The bit that I was missing last week was how the guest could perceive
> >> updates to the TSC page parameters as atomic.  We actually _can_
> >> emulate seqlock-like behavior in Hyper-V by writing 0 to seq during
> >> an update.  Instead of looping like seqlocks do, the guest will simply
> >> use the time reference counter and take a hypervisor exit.  The result
> >> however is still valid, because we want the time reference counter to
> >> be perfectly synchronized with the Hyper-V clock and lets you handle
> >> any kvmclock update scenario safely.
> > 
> > That's smart, thanks for the idea!  We'll only need to check that
> > Windows goes through all these steps on every clock read, rather than
> > remembering the failure and sticking with the MSR for good.
> 
> Andrey pointed me to https://lkml.org/lkml/2015/11/2/655, which has the
> source for Windows's clock read code.  It should work.

Indeed.  (Shame on me for not remembering that as I participated in that
analysis :)

> I'll take a look at it if it's not super-high priority for you.

Sure.

Thanks,
Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH 0/4] kvmclock: improve accuracy
@ 2016-02-08 15:18 Paolo Bonzini
  2016-02-08 15:18 ` [PATCH 1/4] KVM: x86: rename argument to kvm_set_tsc_khz Paolo Bonzini
                   ` (4 more replies)
  0 siblings, 5 replies; 36+ messages in thread
From: Paolo Bonzini @ 2016-02-08 15:18 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: mtosatti

Currently kvmclock is obtaining the multiplier and shift value from
the TSC kHz.  These however are less accurate than the host kernel's
clock, which includes corrections made through NTP.

These patches change kvmclock to tick at the same frequency as the
host kernel's clocksource (as obtained through the pvclock_gtod
notifier).  This is precise enough that the Hyper-V clock can be
implemented on top of it.

Paolo Bonzini (4):
  KVM: x86: rename argument to kvm_set_tsc_khz
  KVM: x86: rewrite handling of scaled TSC for kvmclock
  KVM: x86: pass kvm_get_time_scale arguments in hertz
  KVM: x86: track actual TSC frequency from the timekeeper struct

 arch/x86/include/asm/kvm_host.h |  3 +-
 arch/x86/kvm/x86.c              | 73 +++++++++++++++++++++++++----------------
 2 files changed, 47 insertions(+), 29 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH 1/4] KVM: x86: rename argument to kvm_set_tsc_khz
  2016-02-08 15:18 [PATCH 0/4] kvmclock: improve accuracy Paolo Bonzini
@ 2016-02-08 15:18 ` Paolo Bonzini
  2016-02-11 15:01   ` Marcelo Tosatti
  2016-02-08 15:18 ` [PATCH 2/4] KVM: x86: rewrite handling of scaled TSC for kvmclock Paolo Bonzini
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 36+ messages in thread
From: Paolo Bonzini @ 2016-02-08 15:18 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: mtosatti

This refers to the desired (scaled) frequency, which is called
user_tsc_khz in the rest of the file.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index aafbcf9f9776..1ca2b44f70bb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1290,23 +1290,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 	return 0;
 }
 
-static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 {
 	u32 thresh_lo, thresh_hi;
 	int use_scaling = 0;
 
 	/* tsc_khz can be zero if TSC calibration fails */
-	if (this_tsc_khz == 0) {
+	if (user_tsc_khz == 0) {
 		/* set tsc_scaling_ratio to a safe value */
 		vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
 		return -1;
 	}
 
 	/* Compute a scale to convert nanoseconds in TSC cycles */
-	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+	kvm_get_time_scale(user_tsc_khz, NSEC_PER_SEC / 1000,
 			   &vcpu->arch.virtual_tsc_shift,
 			   &vcpu->arch.virtual_tsc_mult);
-	vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 
 	/*
 	 * Compute the variation in TSC rate which is acceptable
@@ -1316,11 +1316,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
 	 */
 	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
 	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
-	if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
-		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+	if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
 		use_scaling = 1;
 	}
-	return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
 }
 
 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 2/4] KVM: x86: rewrite handling of scaled TSC for kvmclock
  2016-02-08 15:18 [PATCH 0/4] kvmclock: improve accuracy Paolo Bonzini
  2016-02-08 15:18 ` [PATCH 1/4] KVM: x86: rename argument to kvm_set_tsc_khz Paolo Bonzini
@ 2016-02-08 15:18 ` Paolo Bonzini
  2016-02-11 15:23   ` Marcelo Tosatti
  2016-02-08 15:18 ` [PATCH 3/4] KVM: x86: pass kvm_get_time_scale arguments in hertz Paolo Bonzini
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 36+ messages in thread
From: Paolo Bonzini @ 2016-02-08 15:18 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: mtosatti

This is the same as before:

    kvm_scale_tsc(tgt_tsc_khz)
        = tgt_tsc_khz * ratio
        = tgt_tsc_khz * user_tsc_khz / tsc_khz   (see set_tsc_khz)
        = user_tsc_khz                           (see kvm_guest_time_update)
        = vcpu->arch.virtual_tsc_khz             (see kvm_set_tsc_khz)

However, computing it through kvm_scale_tsc makes it possible
to include the NTP correction in tgt_tsc_khz, as done in the
next patch.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1ca2b44f70bb..6db3c219795b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1713,7 +1713,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+	unsigned long flags, tgt_tsc_khz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	struct kvm_arch *ka = &v->kvm->arch;
 	s64 kernel_ns;
@@ -1739,8 +1739,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-	if (unlikely(this_tsc_khz == 0)) {
+	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+	if (unlikely(tgt_tsc_khz == 0)) {
 		local_irq_restore(flags);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
 		return 1;
@@ -1775,13 +1775,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	if (!vcpu->pv_time_enabled)
 		return 0;
 
-	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
-		tgt_tsc_khz = kvm_has_tsc_control ?
-			vcpu->virtual_tsc_khz : this_tsc_khz;
+	if (kvm_has_tsc_control)
+		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+
+	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
 		kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
 				   &vcpu->hv_clock.tsc_shift,
 				   &vcpu->hv_clock.tsc_to_system_mul);
-		vcpu->hw_tsc_khz = this_tsc_khz;
+		vcpu->hw_tsc_khz = tgt_tsc_khz;
 	}
 
 	/* With all the info we got, fill in the values */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 3/4] KVM: x86: pass kvm_get_time_scale arguments in hertz
  2016-02-08 15:18 [PATCH 0/4] kvmclock: improve accuracy Paolo Bonzini
  2016-02-08 15:18 ` [PATCH 1/4] KVM: x86: rename argument to kvm_set_tsc_khz Paolo Bonzini
  2016-02-08 15:18 ` [PATCH 2/4] KVM: x86: rewrite handling of scaled TSC for kvmclock Paolo Bonzini
@ 2016-02-08 15:18 ` Paolo Bonzini
  2016-02-08 15:18 ` [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct Paolo Bonzini
  2016-02-16 14:00 ` [PATCH 0/4] kvmclock: improve accuracy Marcelo Tosatti
  4 siblings, 0 replies; 36+ messages in thread
From: Paolo Bonzini @ 2016-02-08 15:18 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: mtosatti

Prepare for improving the precision in the next patch.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6db3c219795b..e0bf8f10dc22 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1203,7 +1203,7 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 	return dividend;
 }
 
-static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
 			       s8 *pshift, u32 *pmultiplier)
 {
 	uint64_t scaled64;
@@ -1211,8 +1211,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
 	uint64_t tps64;
 	uint32_t tps32;
 
-	tps64 = base_khz * 1000LL;
-	scaled64 = scaled_khz * 1000LL;
+	tps64 = base_hz;
+	scaled64 = scaled_hz;
 	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
 		tps64 >>= 1;
 		shift--;
@@ -1230,8 +1230,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
 	*pshift = shift;
 	*pmultiplier = div_frac(scaled64, tps32);
 
-	pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
-		 __func__, base_khz, scaled_khz, shift, *pmultiplier);
+	pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+		 __func__, base_hz, scaled_hz, shift, *pmultiplier);
 }
 
 #ifdef CONFIG_X86_64
@@ -1303,7 +1303,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 	}
 
 	/* Compute a scale to convert nanoseconds in TSC cycles */
-	kvm_get_time_scale(user_tsc_khz, NSEC_PER_SEC / 1000,
+	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
 			   &vcpu->arch.virtual_tsc_shift,
 			   &vcpu->arch.virtual_tsc_mult);
 	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
@@ -1779,7 +1779,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
 
 	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
-		kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
 				   &vcpu->hv_clock.tsc_shift,
 				   &vcpu->hv_clock.tsc_to_system_mul);
 		vcpu->hw_tsc_khz = tgt_tsc_khz;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct
  2016-02-08 15:18 [PATCH 0/4] kvmclock: improve accuracy Paolo Bonzini
                   ` (2 preceding siblings ...)
  2016-02-08 15:18 ` [PATCH 3/4] KVM: x86: pass kvm_get_time_scale arguments in hertz Paolo Bonzini
@ 2016-02-08 15:18 ` Paolo Bonzini
  2016-02-09 18:41   ` Owen Hofmann
  2016-02-16 13:48   ` Marcelo Tosatti
  2016-02-16 14:00 ` [PATCH 0/4] kvmclock: improve accuracy Marcelo Tosatti
  4 siblings, 2 replies; 36+ messages in thread
From: Paolo Bonzini @ 2016-02-08 15:18 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: mtosatti

When an NTP server is running, it may adjust the time substantially
compared to the "official" frequency of the TSC.  A 12 ppm change
sums up to one second per day.

This already shows up if the guest compares kvmclock with e.g. the
PM timer.  It shows up even more once we add support for the Hyper-V
TSC page, because the guest expects it to be in sync with the time
reference counter; effectively the time reference counter is just a
slow path to access the same clock that is in the TSC page.

Therefore, we want kvmclock to provide the host kernel's
ktime_get_boot_ns() value, at least if the master clock is active.
To do so, reverse-compute the host's "actual" TSC frequency from
pvclock_gtod_data and return it from kvm_get_time_and_clockread.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 ++-
 arch/x86/kvm/x86.c              | 46 +++++++++++++++++++++++++++--------------
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7b5459982433..7dd6d55b4868 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -538,7 +538,7 @@ struct kvm_vcpu_arch {
 
 	gpa_t time;
 	struct pvclock_vcpu_time_info hv_clock;
-	unsigned int hw_tsc_khz;
+	u64 hv_clock_tsc_hz;
 	struct gfn_to_hva_cache pv_time;
 	bool pv_time_enabled;
 	/* set guest stopped flag in pvclock flags field */
@@ -731,6 +731,7 @@ struct kvm_arch {
 
 	spinlock_t pvclock_gtod_sync_lock;
 	bool use_master_clock;
+	u64 master_tsc_hz;
 	u64 master_kernel_ns;
 	cycle_t master_cycle_now;
 	struct delayed_work kvmclock_update_work;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e0bf8f10dc22..c99101d4cc5e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1580,7 +1580,16 @@ static inline u64 vgettsc(cycle_t *cycle_now)
 	return v * gtod->clock.mult;
 }
 
-static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
+static u64 pvclock_gtod_tsc_hz(void)
+{
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+        u64 gtod_tsc_hz = 1000000000ULL << gtod->clock.shift;
+        do_div(gtod_tsc_hz, gtod->clock.mult);
+        return gtod_tsc_hz;
+}
+
+static int do_monotonic_boot(s64 *t, cycle_t *cycle_now, u64 *tsc_hz)
 {
 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 	unsigned long seq;
@@ -1589,6 +1598,7 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
 
 	do {
 		seq = read_seqcount_begin(&gtod->seq);
+		*tsc_hz = pvclock_gtod_tsc_hz();
 		mode = gtod->clock.vclock_mode;
 		ns = gtod->nsec_base;
 		ns += vgettsc(cycle_now);
@@ -1601,13 +1611,14 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
 }
 
 /* returns true if host is using tsc clocksource */
-static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now,
+				       u64 *tsc_hz)
 {
 	/* checked again under seqlock below */
 	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
 		return false;
 
-	return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
+	return do_monotonic_boot(kernel_ns, cycle_now, tsc_hz) == VCLOCK_TSC;
 }
 #endif
 
@@ -1668,7 +1679,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
 	 */
 	host_tsc_clocksource = kvm_get_time_and_clockread(
 					&ka->master_kernel_ns,
-					&ka->master_cycle_now);
+					&ka->master_cycle_now,
+					&ka->master_tsc_hz);
 
 	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
 				&& !backwards_tsc_observed
@@ -1713,7 +1725,8 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	unsigned long flags, tgt_tsc_khz;
+	unsigned long flags;
+	u64 tgt_tsc_hz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	struct kvm_arch *ka = &v->kvm->arch;
 	s64 kernel_ns;
@@ -1724,6 +1737,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	kernel_ns = 0;
 	host_tsc = 0;
+	tgt_tsc_hz = 0;
 
 	/*
 	 * If the host uses TSC clock, then passthrough TSC as stable
@@ -1734,20 +1748,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	if (use_master_clock) {
 		host_tsc = ka->master_cycle_now;
 		kernel_ns = ka->master_kernel_ns;
+		tgt_tsc_hz = ka->master_tsc_hz;
 	}
 	spin_unlock(&ka->pvclock_gtod_sync_lock);
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-	if (unlikely(tgt_tsc_khz == 0)) {
-		local_irq_restore(flags);
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
-		return 1;
-	}
 	if (!use_master_clock) {
 		host_tsc = rdtsc();
 		kernel_ns = get_kernel_ns();
+		tgt_tsc_hz = __this_cpu_read(cpu_tsc_khz) * 1000;
+	}
+
+	if (unlikely(tgt_tsc_hz == 0)) {
+		local_irq_restore(flags);
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+		return 1;
 	}
 
 	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -1776,13 +1792,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		return 0;
 
 	if (kvm_has_tsc_control)
-		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+		tgt_tsc_hz = kvm_scale_tsc(v, tgt_tsc_hz);
 
-	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
-		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
+	if (unlikely(vcpu->hv_clock_tsc_hz != tgt_tsc_hz)) {
+		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_hz,
 				   &vcpu->hv_clock.tsc_shift,
 				   &vcpu->hv_clock.tsc_to_system_mul);
-		vcpu->hw_tsc_khz = tgt_tsc_khz;
+		vcpu->hv_clock_tsc_hz = tgt_tsc_hz;
 	}
 
 	/* With all the info we got, fill in the values */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct
  2016-02-08 15:18 ` [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct Paolo Bonzini
@ 2016-02-09 18:41   ` Owen Hofmann
  2016-02-10 13:57     ` Paolo Bonzini
  2016-02-16 13:48   ` Marcelo Tosatti
  1 sibling, 1 reply; 36+ messages in thread
From: Owen Hofmann @ 2016-02-09 18:41 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: LKML, KVM General, Marcelo Tosatti

Hi,
Should this patch change the condition in pvclock_gtod_notify?
Currently it looks like we'll only request a masterclock update when
tsc is no longer a good clocksource.

On Mon, Feb 8, 2016 at 7:18 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> When an NTP server is running, it may adjust the time substantially
> compared to the "official" frequency of the TSC.  A 12 ppm change
> sums up to one second per day.
>
> This already shows up if the guest compares kvmclock with e.g. the
> PM timer.  It shows up even more once we add support for the Hyper-V
> TSC page, because the guest expects it to be in sync with the time
> reference counter; effectively the time reference counter is just a
> slow path to access the same clock that is in the TSC page.
>
> Therefore, we want kvmclock to provide the host kernel's
> ktime_get_boot_ns() value, at least if the master clock is active.
> To do so, reverse-compute the host's "actual" TSC frequency from
> pvclock_gtod_data and return it from kvm_get_time_and_clockread.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  3 ++-
>  arch/x86/kvm/x86.c              | 46 +++++++++++++++++++++++++++--------------
>  2 files changed, 33 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7b5459982433..7dd6d55b4868 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -538,7 +538,7 @@ struct kvm_vcpu_arch {
>
>         gpa_t time;
>         struct pvclock_vcpu_time_info hv_clock;
> -       unsigned int hw_tsc_khz;
> +       u64 hv_clock_tsc_hz;
>         struct gfn_to_hva_cache pv_time;
>         bool pv_time_enabled;
>         /* set guest stopped flag in pvclock flags field */
> @@ -731,6 +731,7 @@ struct kvm_arch {
>
>         spinlock_t pvclock_gtod_sync_lock;
>         bool use_master_clock;
> +       u64 master_tsc_hz;
>         u64 master_kernel_ns;
>         cycle_t master_cycle_now;
>         struct delayed_work kvmclock_update_work;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e0bf8f10dc22..c99101d4cc5e 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1580,7 +1580,16 @@ static inline u64 vgettsc(cycle_t *cycle_now)
>         return v * gtod->clock.mult;
>  }
>
> -static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
> +static u64 pvclock_gtod_tsc_hz(void)
> +{
> +       struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
> +
> +        u64 gtod_tsc_hz = 1000000000ULL << gtod->clock.shift;
> +        do_div(gtod_tsc_hz, gtod->clock.mult);
> +        return gtod_tsc_hz;
> +}
> +
> +static int do_monotonic_boot(s64 *t, cycle_t *cycle_now, u64 *tsc_hz)
>  {
>         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
>         unsigned long seq;
> @@ -1589,6 +1598,7 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
>
>         do {
>                 seq = read_seqcount_begin(&gtod->seq);
> +               *tsc_hz = pvclock_gtod_tsc_hz();
>                 mode = gtod->clock.vclock_mode;
>                 ns = gtod->nsec_base;
>                 ns += vgettsc(cycle_now);
> @@ -1601,13 +1611,14 @@ static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
>  }
>
>  /* returns true if host is using tsc clocksource */
> -static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
> +static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now,
> +                                      u64 *tsc_hz)
>  {
>         /* checked again under seqlock below */
>         if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
>                 return false;
>
> -       return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
> +       return do_monotonic_boot(kernel_ns, cycle_now, tsc_hz) == VCLOCK_TSC;
>  }
>  #endif
>
> @@ -1668,7 +1679,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
>          */
>         host_tsc_clocksource = kvm_get_time_and_clockread(
>                                         &ka->master_kernel_ns,
> -                                       &ka->master_cycle_now);
> +                                       &ka->master_cycle_now,
> +                                       &ka->master_tsc_hz);
>
>         ka->use_master_clock = host_tsc_clocksource && vcpus_matched
>                                 && !backwards_tsc_observed
> @@ -1713,7 +1725,8 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>
>  static int kvm_guest_time_update(struct kvm_vcpu *v)
>  {
> -       unsigned long flags, tgt_tsc_khz;
> +       unsigned long flags;
> +       u64 tgt_tsc_hz;
>         struct kvm_vcpu_arch *vcpu = &v->arch;
>         struct kvm_arch *ka = &v->kvm->arch;
>         s64 kernel_ns;
> @@ -1724,6 +1737,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>
>         kernel_ns = 0;
>         host_tsc = 0;
> +       tgt_tsc_hz = 0;
>
>         /*
>          * If the host uses TSC clock, then passthrough TSC as stable
> @@ -1734,20 +1748,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>         if (use_master_clock) {
>                 host_tsc = ka->master_cycle_now;
>                 kernel_ns = ka->master_kernel_ns;
> +               tgt_tsc_hz = ka->master_tsc_hz;
>         }
>         spin_unlock(&ka->pvclock_gtod_sync_lock);
>
>         /* Keep irq disabled to prevent changes to the clock */
>         local_irq_save(flags);
> -       tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
> -       if (unlikely(tgt_tsc_khz == 0)) {
> -               local_irq_restore(flags);
> -               kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
> -               return 1;
> -       }
>         if (!use_master_clock) {
>                 host_tsc = rdtsc();
>                 kernel_ns = get_kernel_ns();
> +               tgt_tsc_hz = __this_cpu_read(cpu_tsc_khz) * 1000;
> +       }
> +
> +       if (unlikely(tgt_tsc_hz == 0)) {
> +               local_irq_restore(flags);
> +               kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
> +               return 1;
>         }
>
>         tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
> @@ -1776,13 +1792,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>                 return 0;
>
>         if (kvm_has_tsc_control)
> -               tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
> +               tgt_tsc_hz = kvm_scale_tsc(v, tgt_tsc_hz);
>
> -       if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
> -               kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
> +       if (unlikely(vcpu->hv_clock_tsc_hz != tgt_tsc_hz)) {
> +               kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_hz,
>                                    &vcpu->hv_clock.tsc_shift,
>                                    &vcpu->hv_clock.tsc_to_system_mul);
> -               vcpu->hw_tsc_khz = tgt_tsc_khz;
> +               vcpu->hv_clock_tsc_hz = tgt_tsc_hz;
>         }
>
>         /* With all the info we got, fill in the values */
> --
> 1.8.3.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct
  2016-02-09 18:41   ` Owen Hofmann
@ 2016-02-10 13:57     ` Paolo Bonzini
  0 siblings, 0 replies; 36+ messages in thread
From: Paolo Bonzini @ 2016-02-10 13:57 UTC (permalink / raw)
  To: Owen Hofmann; +Cc: LKML, KVM General, Marcelo Tosatti



On 09/02/2016 19:41, Owen Hofmann wrote:
> Hi,
> Should this patch change the condition in pvclock_gtod_notify?
> Currently it looks like we'll only request a masterclock update when
> tsc is no longer a good clocksource.

Yes, you're right.

Paolo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 1/4] KVM: x86: rename argument to kvm_set_tsc_khz
  2016-02-08 15:18 ` [PATCH 1/4] KVM: x86: rename argument to kvm_set_tsc_khz Paolo Bonzini
@ 2016-02-11 15:01   ` Marcelo Tosatti
  0 siblings, 0 replies; 36+ messages in thread
From: Marcelo Tosatti @ 2016-02-11 15:01 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm

On Mon, Feb 08, 2016 at 04:18:28PM +0100, Paolo Bonzini wrote:
> This refers to the desired (scaled) frequency, which is called
> user_tsc_khz in the rest of the file.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/x86.c | 14 +++++++-------
>  1 file changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index aafbcf9f9776..1ca2b44f70bb 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1290,23 +1290,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
>  	return 0;
>  }
>  
> -static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
> +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
>  {
>  	u32 thresh_lo, thresh_hi;
>  	int use_scaling = 0;
>  
>  	/* tsc_khz can be zero if TSC calibration fails */
> -	if (this_tsc_khz == 0) {
> +	if (user_tsc_khz == 0) {
>  		/* set tsc_scaling_ratio to a safe value */
>  		vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
>  		return -1;
>  	}
>  
>  	/* Compute a scale to convert nanoseconds in TSC cycles */
> -	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
> +	kvm_get_time_scale(user_tsc_khz, NSEC_PER_SEC / 1000,
>  			   &vcpu->arch.virtual_tsc_shift,
>  			   &vcpu->arch.virtual_tsc_mult);
> -	vcpu->arch.virtual_tsc_khz = this_tsc_khz;
> +	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
>  
>  	/*
>  	 * Compute the variation in TSC rate which is acceptable
> @@ -1316,11 +1316,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
>  	 */
>  	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
>  	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
> -	if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
> -		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
> +	if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
> +		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
>  		use_scaling = 1;
>  	}
> -	return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
> +	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
>  }
>  
>  static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
> -- 
> 1.8.3.1
> 

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 2/4] KVM: x86: rewrite handling of scaled TSC for kvmclock
  2016-02-08 15:18 ` [PATCH 2/4] KVM: x86: rewrite handling of scaled TSC for kvmclock Paolo Bonzini
@ 2016-02-11 15:23   ` Marcelo Tosatti
  0 siblings, 0 replies; 36+ messages in thread
From: Marcelo Tosatti @ 2016-02-11 15:23 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm

On Mon, Feb 08, 2016 at 04:18:29PM +0100, Paolo Bonzini wrote:
> This is the same as before:
> 
>     kvm_scale_tsc(tgt_tsc_khz)
>         = tgt_tsc_khz * ratio
>         = tgt_tsc_khz * user_tsc_khz / tsc_khz   (see set_tsc_khz)
>         = user_tsc_khz                           (see kvm_guest_time_update)
>         = vcpu->arch.virtual_tsc_khz             (see kvm_set_tsc_khz)
> 
> However, computing it through kvm_scale_tsc makes it possible
> to include the NTP correction in tgt_tsc_khz, as done in the
> next patch.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/x86.c | 15 ++++++++-------
>  1 file changed, 8 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1ca2b44f70bb..6db3c219795b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1713,7 +1713,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  
>  static int kvm_guest_time_update(struct kvm_vcpu *v)
>  {
> -	unsigned long flags, this_tsc_khz, tgt_tsc_khz;
> +	unsigned long flags, tgt_tsc_khz;
>  	struct kvm_vcpu_arch *vcpu = &v->arch;
>  	struct kvm_arch *ka = &v->kvm->arch;
>  	s64 kernel_ns;
> @@ -1739,8 +1739,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>  
>  	/* Keep irq disabled to prevent changes to the clock */
>  	local_irq_save(flags);
> -	this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
> -	if (unlikely(this_tsc_khz == 0)) {
> +	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
> +	if (unlikely(tgt_tsc_khz == 0)) {
>  		local_irq_restore(flags);
>  		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
>  		return 1;
> @@ -1775,13 +1775,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>  	if (!vcpu->pv_time_enabled)
>  		return 0;
>  
> -	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
> -		tgt_tsc_khz = kvm_has_tsc_control ?
> -			vcpu->virtual_tsc_khz : this_tsc_khz;
> +	if (kvm_has_tsc_control)
> +		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
> +
> +	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
>  		kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
>  				   &vcpu->hv_clock.tsc_shift,
>  				   &vcpu->hv_clock.tsc_to_system_mul);
> -		vcpu->hw_tsc_khz = this_tsc_khz;
> +		vcpu->hw_tsc_khz = tgt_tsc_khz;
>  	}
>  
>  	/* With all the info we got, fill in the values */
> -- 
> 1.8.3.1
> 

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct
  2016-02-08 15:18 ` [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct Paolo Bonzini
  2016-02-09 18:41   ` Owen Hofmann
@ 2016-02-16 13:48   ` Marcelo Tosatti
  2016-02-16 14:25     ` Marcelo Tosatti
  1 sibling, 1 reply; 36+ messages in thread
From: Marcelo Tosatti @ 2016-02-16 13:48 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm

On Mon, Feb 08, 2016 at 04:18:31PM +0100, Paolo Bonzini wrote:
> When an NTP server is running, it may adjust the time substantially
> compared to the "official" frequency of the TSC.  A 12 ppm change
> sums up to one second per day.
> 
> This already shows up if the guest compares kvmclock with e.g. the
> PM timer.  It shows up even more once we add support for the Hyper-V
> TSC page, because the guest expects it to be in sync with the time
> reference counter; effectively the time reference counter is just a
> slow path to access the same clock that is in the TSC page.
> 
> Therefore, we want kvmclock to provide the host kernel's
> ktime_get_boot_ns() value, at least if the master clock is active.
> To do so, reverse-compute the host's "actual" TSC frequency from
> pvclock_gtod_data and return it from kvm_get_time_and_clockread.

Paolo,

You'd have to generate an update to the guest structures as well, 
to reflect the new {mult,shift} values calculated by the host. 
Here:

        /* disable master clock if host does not trust, or does not
         * use, TSC clocksource
         */
        if (gtod->clock.vclock_mode != VCLOCK_TSC &&
            atomic_read(&kvm_guest_has_master_clock) != 0)
                queue_work(system_long_wq, &pvclock_gtod_work);

No?

At first, i'm afraid this might be heavy, so it might be interesting
to rate limit the update operation.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 0/4] kvmclock: improve accuracy
  2016-02-08 15:18 [PATCH 0/4] kvmclock: improve accuracy Paolo Bonzini
                   ` (3 preceding siblings ...)
  2016-02-08 15:18 ` [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct Paolo Bonzini
@ 2016-02-16 14:00 ` Marcelo Tosatti
  4 siblings, 0 replies; 36+ messages in thread
From: Marcelo Tosatti @ 2016-02-16 14:00 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm

On Mon, Feb 08, 2016 at 04:18:27PM +0100, Paolo Bonzini wrote:
> Currently kvmclock is obtaining the multiplier and shift value from
> the TSC kHz.  These however are less accurate than the host kernel's
> clock, which includes corrections made through NTP.
> 
> These patches change kvmclock to tick at the same frequency as the
> host kernel's clocksource (as obtained through the pvclock_gtod
> notifier).  This is precise enough that the Hyper-V clock can be
> implemented on top of it.

Note the current option to sync kvmclock periodically every 300 seconds:

#define KVMCLOCK_SYNC_PERIOD (300 * HZ)

static void kvmclock_sync_fn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
                                           kvmclock_sync_work);
        struct kvm *kvm = container_of(ka, struct kvm, arch);

        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
                                        KVMCLOCK_SYNC_PERIOD);
}

How large is the error when the corrections are applied every 5 minutes? 

Does this match the errors you are seeing?

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct
  2016-02-16 13:48   ` Marcelo Tosatti
@ 2016-02-16 14:25     ` Marcelo Tosatti
  2016-02-16 16:59       ` Paolo Bonzini
  0 siblings, 1 reply; 36+ messages in thread
From: Marcelo Tosatti @ 2016-02-16 14:25 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm

On Tue, Feb 16, 2016 at 02:48:16PM +0100, Marcelo Tosatti wrote:
> On Mon, Feb 08, 2016 at 04:18:31PM +0100, Paolo Bonzini wrote:
> > When an NTP server is running, it may adjust the time substantially
> > compared to the "official" frequency of the TSC.  A 12 ppm change
> > sums up to one second per day.
> > 
> > This already shows up if the guest compares kvmclock with e.g. the
> > PM timer.  It shows up even more once we add support for the Hyper-V
> > TSC page, because the guest expects it to be in sync with the time
> > reference counter; effectively the time reference counter is just a
> > slow path to access the same clock that is in the TSC page.
> > 
> > Therefore, we want kvmclock to provide the host kernel's
> > ktime_get_boot_ns() value, at least if the master clock is active.
> > To do so, reverse-compute the host's "actual" TSC frequency from
> > pvclock_gtod_data and return it from kvm_get_time_and_clockread.
> 
> Paolo,
> 
> You'd have to generate an update to the guest structures as well, 
> to reflect the new {mult,shift} values calculated by the host. 
> Here:
> 
>         /* disable master clock if host does not trust, or does not
>          * use, TSC clocksource
>          */
>         if (gtod->clock.vclock_mode != VCLOCK_TSC &&
>             atomic_read(&kvm_guest_has_master_clock) != 0)
>                 queue_work(system_long_wq, &pvclock_gtod_work);
> 
> No?
> 
> At first, i'm afraid this might be heavy, so it might be interesting
> to rate limit the update operation.
> 

Paolo,

I suppose it's not sufficient: 

500ppm of 300 seconds = .0005*300 = 0.15 seconds.

Should aim at avoiding time backwards event in the following situation:


T1) t1_kvmclock_read = get_nanoseconds(); 
    /* NTP correction to kernel clock = 500ppm */
    /* TSC correction via mult,shift = 0ppm */

    VM-exit, update kvmclock (or Hyper-V) clock data with 
    new values 

T2) t2_kvmclock_read = get_nanoseconds();
    /* NTP correction to kernel clock = 500ppm */
    /* TSC correction via mult,shift = 500ppm */


So should not allow the host clock (or system_timestamp) to diverge 
from (TSC based calculation) more than the duration of the event:

    VM-exit, update kvmclock (or Hyper-V) with new data.

To avoid t2_kvmclock_read < t1_kvmclock_read

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct
  2016-02-16 14:25     ` Marcelo Tosatti
@ 2016-02-16 16:59       ` Paolo Bonzini
  2016-02-19 14:12         ` Marcelo Tosatti
  0 siblings, 1 reply; 36+ messages in thread
From: Paolo Bonzini @ 2016-02-16 16:59 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-kernel, kvm



On 16/02/2016 15:25, Marcelo Tosatti wrote:
> On Tue, Feb 16, 2016 at 02:48:16PM +0100, Marcelo Tosatti wrote:
>> On Mon, Feb 08, 2016 at 04:18:31PM +0100, Paolo Bonzini wrote:
>>> When an NTP server is running, it may adjust the time substantially
>>> compared to the "official" frequency of the TSC.  A 12 ppm change
>>> sums up to one second per day.
>>>
>>> This already shows up if the guest compares kvmclock with e.g. the
>>> PM timer.  It shows up even more once we add support for the Hyper-V
>>> TSC page, because the guest expects it to be in sync with the time
>>> reference counter; effectively the time reference counter is just a
>>> slow path to access the same clock that is in the TSC page.
>>>
>>> Therefore, we want kvmclock to provide the host kernel's
>>> ktime_get_boot_ns() value, at least if the master clock is active.
>>> To do so, reverse-compute the host's "actual" TSC frequency from
>>> pvclock_gtod_data and return it from kvm_get_time_and_clockread.
>>
>> Paolo,
>>
>> You'd have to generate an update to the guest structures as well, 
>> to reflect the new {mult,shift} values calculated by the host. 
>> Here:
>>
>>         /* disable master clock if host does not trust, or does not
>>          * use, TSC clocksource
>>          */
>>         if (gtod->clock.vclock_mode != VCLOCK_TSC &&
>>             atomic_read(&kvm_guest_has_master_clock) != 0)
>>                 queue_work(system_long_wq, &pvclock_gtod_work);
>>
>> No?
>>
>> At first, i'm afraid this might be heavy, so it might be interesting
>> to rate limit the update operation.
>>
> 
> Paolo,
> 
> I suppose it's not sufficient: 
> 
> 500ppm of 300 seconds = .0005*300 = 0.15 seconds.
> 
> Should aim at avoiding time backwards event in the following situation:
> 
> 
> T1) t1_kvmclock_read = get_nanoseconds(); 
>     /* NTP correction to kernel clock = 500ppm */
>     /* TSC correction via mult,shift = 0ppm */
> 
>     VM-exit, update kvmclock (or Hyper-V) clock data with 
>     new values 
> 
> T2) t2_kvmclock_read = get_nanoseconds();
>     /* NTP correction to kernel clock = 500ppm */
>     /* TSC correction via mult,shift = 500ppm */
> 
> 
> So should not allow the host clock (or system_timestamp) to diverge 
> from (TSC based calculation) more than the duration of the event:
> 
>     VM-exit, update kvmclock (or Hyper-V) with new data.
> 
> To avoid t2_kvmclock_read < t1_kvmclock_read

If I don't do rate limiting, that would not be a problem I think.  The
host timekeeper code should take care of updating the base timestamps
(TSC and nanoseconds) in a way that doesn't cause a clock-goes-backwards
event?  I need to check how often the timekeeper updates the parameters.

Paolo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct
  2016-02-16 16:59       ` Paolo Bonzini
@ 2016-02-19 14:12         ` Marcelo Tosatti
  2016-02-19 15:53           ` Paolo Bonzini
  0 siblings, 1 reply; 36+ messages in thread
From: Marcelo Tosatti @ 2016-02-19 14:12 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm

On Tue, Feb 16, 2016 at 05:59:57PM +0100, Paolo Bonzini wrote:
> 
> 
> On 16/02/2016 15:25, Marcelo Tosatti wrote:
> > On Tue, Feb 16, 2016 at 02:48:16PM +0100, Marcelo Tosatti wrote:
> >> On Mon, Feb 08, 2016 at 04:18:31PM +0100, Paolo Bonzini wrote:
> >>> When an NTP server is running, it may adjust the time substantially
> >>> compared to the "official" frequency of the TSC.  A 12 ppm change
> >>> sums up to one second per day.
> >>>
> >>> This already shows up if the guest compares kvmclock with e.g. the
> >>> PM timer.  It shows up even more once we add support for the Hyper-V
> >>> TSC page, because the guest expects it to be in sync with the time
> >>> reference counter; effectively the time reference counter is just a
> >>> slow path to access the same clock that is in the TSC page.
> >>>
> >>> Therefore, we want kvmclock to provide the host kernel's
> >>> ktime_get_boot_ns() value, at least if the master clock is active.
> >>> To do so, reverse-compute the host's "actual" TSC frequency from
> >>> pvclock_gtod_data and return it from kvm_get_time_and_clockread.
> >>
> >> Paolo,
> >>
> >> You'd have to generate an update to the guest structures as well, 
> >> to reflect the new {mult,shift} values calculated by the host. 
> >> Here:
> >>
> >>         /* disable master clock if host does not trust, or does not
> >>          * use, TSC clocksource
> >>          */
> >>         if (gtod->clock.vclock_mode != VCLOCK_TSC &&
> >>             atomic_read(&kvm_guest_has_master_clock) != 0)
> >>                 queue_work(system_long_wq, &pvclock_gtod_work);
> >>
> >> No?
> >>
> >> At first, i'm afraid this might be heavy, so it might be interesting
> >> to rate limit the update operation.
> >>
> > 
> > Paolo,
> > 
> > I suppose it's not sufficient: 
> > 
> > 500ppm of 300 seconds = .0005*300 = 0.15 seconds.
> > 
> > Should aim at avoiding time backwards event in the following situation:
> > 
> > 
> > T1) t1_kvmclock_read = get_nanoseconds(); 
> >     /* NTP correction to kernel clock = 500ppm */
> >     /* TSC correction via mult,shift = 0ppm */
> > 
> >     VM-exit, update kvmclock (or Hyper-V) clock data with 
> >     new values 
> > 
> > T2) t2_kvmclock_read = get_nanoseconds();
> >     /* NTP correction to kernel clock = 500ppm */
> >     /* TSC correction via mult,shift = 500ppm */
> > 
> > 
> > So should not allow the host clock (or system_timestamp) to diverge 
> > from (TSC based calculation) more than the duration of the event:
> > 
> >     VM-exit, update kvmclock (or Hyper-V) with new data.
> > 
> > To avoid t2_kvmclock_read < t1_kvmclock_read
> 
> If I don't do rate limiting, that would not be a problem I think. 

Correct.

>  The
> host timekeeper code should take care of updating the base timestamps
> (TSC and nanoseconds) in a way that doesn't cause a clock-goes-backwards
> event?

Yes.

>  I need to check how often the timekeeper updates the parameters.

I'd assume once every tick, the function is called (the notifier).

But you can optimize that away by only updating the TSC frequency
when mult/shift are updated, which should be much rarer.

(Note this issue is also a problem for Linux based kvmclock today).

> 
> Paolo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct
  2016-02-19 14:12         ` Marcelo Tosatti
@ 2016-02-19 15:53           ` Paolo Bonzini
  0 siblings, 0 replies; 36+ messages in thread
From: Paolo Bonzini @ 2016-02-19 15:53 UTC (permalink / raw)
  To: Marcelo Tosatti, KVM list, linux-kernel



On 19/02/2016 15:12, Marcelo Tosatti wrote:
> 
>> >  I need to check how often the timekeeper updates the parameters.
> I'd assume once every tick, the function is called (the notifier).
> 
> But you can optimize that away by only updating the TSC frequency
> when mult/shift are updated, which should be much rarer.

Yes, exactly.  That would still not be rate limiting.

But worst case it can be a lot of adjustments, up to HZ per second...

Paolo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-02-03 16:37   ` Paolo Bonzini
  2016-02-04  9:33     ` Roman Kagan
@ 2016-04-21 17:01     ` Roman Kagan
  2016-04-22 13:32       ` Roman Kagan
  1 sibling, 1 reply; 36+ messages in thread
From: Roman Kagan @ 2016-04-21 17:01 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, Andrey Smetanin, Denis V. Lunev, Marcelo Tosatti

On Wed, Feb 03, 2016 at 05:37:08PM +0100, Paolo Bonzini wrote:
> On 28/01/2016 17:22, Roman Kagan wrote:
> > On Thu, Jan 28, 2016 at 03:04:58PM +0100, Paolo Bonzini wrote:
> >> The test checks the relative precision of the reference TSC page
> >> and the time reference counter.
> >>
> >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> >> ---
> >> 	Andrey, the test has a relative error of approximately 3 parts
> >> 	per million on my machine.  In other words it drifts by 3
> >> 	microseconds every second, which I don't think is acceptable.
> >> 	The problem is that virtual_tsc_khz is 2593993 while the actual
> >> 	frequency is more like 2594001 kHz.
> > 
> > Hmm, how come?  I thought virtual_tsc_khz could only diverge from
> > tsc_khz on migration.
> > 
> > [Or maybe I just misunderstand what you mean by "the actual frequency".]
> 
> Talking to Marcelo helped me understanding it. :)
> 
> The issue is that the TSC kHz is not the correct value to use for the  
> the TSC page.  Instead, we want to pass to the guest a scale value 
> corresponding to the kernel's timekeeping multiplier.  This is what 
> both kvmclock and the time reference counter use.
> 
> The bit that I was missing last week was how the guest could perceive
> updates to the TSC page parameters as atomic.  We actually _can_
> emulate seqlock-like behavior in Hyper-V by writing 0 to seq during
> an update.  Instead of looping like seqlocks do, the guest will simply
> use the time reference counter and take a hypervisor exit.  The result
> however is still valid, because we want the time reference counter to
> be perfectly synchronized with the Hyper-V clock and lets you handle
> any kvmclock update scenario safely.
> 
> Therefore the algorithm would be like patch 2/2 I sent, but with
> important differences.  You'd still
> 
> 	if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
> 		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
> 
> on MSR writes, but then hook into kvm_guest_time_update rather than
> kvm_gen_update_masterclock, like:
> 
> 	if (v == kvm_get_vcpu(v->kvm, 0))
> 	        kvm_write_hyperv_tsc_page(v->kvm, &vcpu->hv_clock);
> 
> and in kvm_write_hyperv_tsc_page the calculation would be based on
> the kvmclock parameters:
> 
>         {
>                 write 0 to seq;
>                 if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
>                         return;
> 
>                 compute scale and offset from hv_clock mul and shift;
>                 write scale and offset;
> 		write sequence
>         }
> 
> all of scale, offset and sequence can be computed from kvmclock parameters.
> 
> For sequence we have to convert "even values between 0 and 0xFFFFFFFE"
> into "values between "1 and 0xFFFFFFFE".  For example:
> 
> 	hyper-v sequence = (kvmclock sequence >> 1) + 1
> 
> will do it.  Computing the scale and offset is something like:
> 
>         // KVM formula:
> 	//    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul << tsc_shift + system_time
> 	//
> 	// hyper-v formula:
> 	//    nsec/100 = ticks * scale / 2^32 + offset
> 	//
> 	// when tsc_timestamp, offset is zero so remove them both:
>         //    ticks * tsc_to_system_mul << tsc_shift / 100 = ticks * scale / 2^32
> 	//
> 	// multiply both sides by 2^32 / ticks and you get scale:
> 	//    scale = tsc_to_system_mul << (32 + tsc_shift) / 100
> 	//
> 	// check if it would overflow, and then use the time ref counter
> 	//    tsc_to_system_mul << (32 + tsc_shift) / 100 >= 2^32
> 	//    tsc_to_system_mul << 32 >= 100 * 2^32 << -tsc_shift
> 	//    tsc_to_system_mul >= 100 << -tsc_shift
> 
> 	if (shift < 0)
> 		rhs = 100 << -shift;
> 	else
> 		rhs = 100 >> shift;
> 
> 	if (tsc_to_system_mul >= rhs)
> 		return;
> 
>         scale = mul_u64_u32_div(1ULL << (32 + tsc_shift), tsc_to_system_mul, 100);
> 
>         // now expand the kvmclock formula:
>         //    nsec = ticks * tsc_to_system_mul << tsc_shift - (tsc_timestamp * tsc_to_system_mul << tsc_shift) + system_time 
> 	// divide by 100:
> 	//    nsec/100 = ticks * tsc_to_system_mul << tsc_shift /100 - (tsc_timestamp * tsc_to_system_mul << tsc_shift) /100 + system_time /100
> 	// replace tsc_to_system_mul << tsc_shift /100 by scale / 2^32:
>         //    nsec/100 = ticks * scale / 2^32 - (tsc_timestamp * scale / 2^32) + system_time / 100
> 	// comparing with the Hyper-V formula:
> 	//    offset = system_time / 100 - (tsc_timestamp * scale / 2^32)
> 
> 	offset = system_time;
> 	do_div(offset, 100);
> 
> 	timestamp_offset = tsc_timestamp;
> 	do_div32_shl32(timestamp_offset, scale);
>         offset = offset - timestamp_offset;
> 
> The remaining part is _further_ adjusting the offset to
> Would you like to finish implementing this and test it with the unit test?
> kvm/queue already has the patch to introduce do_div32_shl32.

At last I've got some time to tackle this.  I cooked up a patch to do
approximately this (fixing the math and the logic in a few places), and
adjusted the test to build, and now I'm about to submit both of them.

As to the drift problem, I reliably reproduce it, too, but it's a bug
somewhere in kvm_clock, and the new hyperv ref tsc page now demonstrates
it by virtue of relying on kvm_clock's values.  On my machine I observe
as much as +14 ppm and I'll try to chase it down, but this is
independent of the hyperv ref tsc page patchset.

Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-04-21 17:01     ` Roman Kagan
@ 2016-04-22 13:32       ` Roman Kagan
  2016-04-22 18:08         ` Paolo Bonzini
  0 siblings, 1 reply; 36+ messages in thread
From: Roman Kagan @ 2016-04-22 13:32 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, Denis V. Lunev, Marcelo Tosatti

On Thu, Apr 21, 2016 at 08:01:58PM +0300, Roman Kagan wrote:
> As to the drift problem, I reliably reproduce it, too, but it's a bug
> somewhere in kvm_clock, and the new hyperv ref tsc page now demonstrates
> it by virtue of relying on kvm_clock's values.  On my machine I observe
> as much as +14 ppm and I'll try to chase it down, but this is
> independent of the hyperv ref tsc page patchset.

A simple demonstration of it: in a CentOS 7.2 x86_64 guest:

# dmesg | grep -Fw tsc:
[    0.000000] tsc: Detected 3411.482 MHz processor
[    2.192473] tsc: Refined TSC clocksource calibration: 3411.532 MHz

The first value is derived from the kvm_clock's tsc_to_system_mul and
tsc_shift, and matches the host's vcpu->hw_tsc_khz.  The second is
calibrated using emulated HPET.  The difference accounts for those +14 ppm.

This is on i7-2600, invariant TSC present, TSC scaling not present.

I'll dig further but I'd appreciate any comment on whether it was within
tolerance or not.

Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-04-22 13:32       ` Roman Kagan
@ 2016-04-22 18:08         ` Paolo Bonzini
  2016-04-25  8:47           ` Roman Kagan
  0 siblings, 1 reply; 36+ messages in thread
From: Paolo Bonzini @ 2016-04-22 18:08 UTC (permalink / raw)
  To: Roman Kagan, kvm, Denis V. Lunev, Marcelo Tosatti



On 22/04/2016 15:32, Roman Kagan wrote:
> The first value is derived from the kvm_clock's tsc_to_system_mul and
> tsc_shift, and matches hosts's vcpu->hw_tsc_khz.  The second is
> calibrated using emulated HPET.  The difference is those +14 ppm.
> 
> This is on i7-2600, invariant TSC present, TSC scaling not present.
> 
> I'll dig further but I'd appreciate any comment on whether it was within
> tolerance or not.

The solution to the bug is to change the Hyper-V reference time MSR to
use the same formula as the Hyper-V TSC-based clock.  Likewise,
KVM_GET_CLOCK and KVM_SET_CLOCK should not use ktime_get_ns().

Paolo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-04-22 18:08         ` Paolo Bonzini
@ 2016-04-25  8:47           ` Roman Kagan
  2016-04-26 10:34             ` Roman Kagan
  0 siblings, 1 reply; 36+ messages in thread
From: Roman Kagan @ 2016-04-25  8:47 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, Denis V. Lunev, Marcelo Tosatti

On Fri, Apr 22, 2016 at 08:08:47PM +0200, Paolo Bonzini wrote:
> On 22/04/2016 15:32, Roman Kagan wrote:
> > The first value is derived from the kvm_clock's tsc_to_system_mul and
> > tsc_shift, and matches hosts's vcpu->hw_tsc_khz.  The second is
> > calibrated using emulated HPET.  The difference is those +14 ppm.
> > 
> > This is on i7-2600, invariant TSC present, TSC scaling not present.
> > 
> > I'll dig further but I'd appreciate any comment on whether it was within
> > tolerance or not.
> 
> The solution to the bug is to change the Hyper-V reference time MSR to
> use the same formula as the Hyper-V TSC-based clock.  Likewise,
> KVM_GET_CLOCK and KVM_SET_CLOCK should not use ktime_get_ns().

Umm, I'm not sure it's a good idea...

E.g. virtualized HPET sits in userspace and thus uses
clock_gettime(CLOCK_MONOTONIC), so the drift will remain.

AFAICT the root cause is the following: KVM master clock uses the same
multiplier/shift as the vsyscall time in host userspace.  However, the
offsets in vsyscall_gtod_data get updated all the time with corrections
from NTP and so on.  Therefore even if the TSC rate is somewhat
miscalibrated, the error is kept small in vsyscall time functions.  OTOH
the offsets in KVM clock are basically never updated, so the error keeps
linearly growing over time.

Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-04-25  8:47           ` Roman Kagan
@ 2016-04-26 10:34             ` Roman Kagan
  2016-05-25 18:33               ` Roman Kagan
  0 siblings, 1 reply; 36+ messages in thread
From: Roman Kagan @ 2016-04-26 10:34 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, Denis V. Lunev, Marcelo Tosatti

On Mon, Apr 25, 2016 at 11:47:23AM +0300, Roman Kagan wrote:
> On Fri, Apr 22, 2016 at 08:08:47PM +0200, Paolo Bonzini wrote:
> > On 22/04/2016 15:32, Roman Kagan wrote:
> > > The first value is derived from the kvm_clock's tsc_to_system_mul and
> > > tsc_shift, and matches hosts's vcpu->hw_tsc_khz.  The second is
> > > calibrated using emulated HPET.  The difference is those +14 ppm.
> > > 
> > > This is on i7-2600, invariant TSC present, TSC scaling not present.
> > > 
> > > I'll dig further but I'd appreciate any comment on whether it was within
> > > tolerance or not.
> > 
> > The solution to the bug is to change the Hyper-V reference time MSR to
> > use the same formula as the Hyper-V TSC-based clock.  Likewise,
> > KVM_GET_CLOCK and KVM_SET_CLOCK should not use ktime_get_ns().
> 
> Umm, I'm not sure it's a good idea...
> 
> E.g. virtualized HPET sits in userspace and thus uses
> clock_gettime(CLOCK_MONOTONIC), so the drift will remain.
> 
> AFAICT the root cause is the following: KVM master clock uses the same
> multiplier/shift as the vsyscall time in host userspace.  However, the
> offsets in vsyscall_gtod_data get updated all the time with corrections
> from NTP and so on.  Therefore even if the TSC rate is somewhat
> miscalibrated, the error is kept small in vsyscall time functions.  OTOH
> the offsets in KVM clock are basically never updated, so the error keeps
> linearly growing over time.

This seems to be due to a typo:

--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5819,7 +5819,7 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
        /* disable master clock if host does not trust, or does not
         * use, TSC clocksource
         */
-       if (gtod->clock.vclock_mode != VCLOCK_TSC &&
+       if (gtod->clock.vclock_mode == VCLOCK_TSC &&
            atomic_read(&kvm_guest_has_master_clock) != 0)
                queue_work(system_long_wq, &pvclock_gtod_work);


as a result, the global pvclock_gtod_data was kept up to date, but the
requests to update per-vm copies were never issued.

With the patch I'm now seeing different test failures which I'm looking
into.

Meanwhile I'm wondering if this scheme is not too costly: on my machine
pvclock_gtod_notify() is called at kHz rate, and the work it schedules
does

static void pvclock_gtod_update_fn(struct work_struct *work)
{
[...]
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                kvm_for_each_vcpu(i, vcpu, kvm)
                        kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
        atomic_set(&kvm_guest_has_master_clock, 0);
        spin_unlock(&kvm_lock);
}

KVM_REQ_MASTERCLOCK_UPDATE makes all VCPUs synchronize:

static void kvm_gen_update_masterclock(struct kvm *kvm)
{
[...]
        spin_lock(&ka->pvclock_gtod_sync_lock);
        kvm_make_mclock_inprogress_request(kvm);
        /* no guest entries from this point */
        pvclock_update_vm_gtod_copy(kvm);

        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

        /* guest entries allowed */
        kvm_for_each_vcpu(i, vcpu, kvm)
                clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);

        spin_unlock(&ka->pvclock_gtod_sync_lock);
[...]
}

so on a host with many VMs it may become an issue.

Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-04-26 10:34             ` Roman Kagan
@ 2016-05-25 18:33               ` Roman Kagan
  2016-05-26 14:47                 ` Roman Kagan
  2016-05-29 22:34                 ` Marcelo Tosatti
  0 siblings, 2 replies; 36+ messages in thread
From: Roman Kagan @ 2016-05-25 18:33 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, Denis V. Lunev, Marcelo Tosatti

On Tue, Apr 26, 2016 at 01:34:56PM +0300, Roman Kagan wrote:
> On Mon, Apr 25, 2016 at 11:47:23AM +0300, Roman Kagan wrote:
> > On Fri, Apr 22, 2016 at 08:08:47PM +0200, Paolo Bonzini wrote:
> > > On 22/04/2016 15:32, Roman Kagan wrote:
> > > > The first value is derived from the kvm_clock's tsc_to_system_mul and
> > > > tsc_shift, and matches hosts's vcpu->hw_tsc_khz.  The second is
> > > > calibrated using emulated HPET.  The difference is those +14 ppm.
> > > > 
> > > > This is on i7-2600, invariant TSC present, TSC scaling not present.
> > > > 
> > > > I'll dig further but I'd appreciate any comment on whether it was within
> > > > tolerance or not.
> > > 
> > > The solution to the bug is to change the Hyper-V reference time MSR to
> > > use the same formula as the Hyper-V TSC-based clock.  Likewise,
> > > KVM_GET_CLOCK and KVM_SET_CLOCK should not use ktime_get_ns().
> > 
> > Umm, I'm not sure it's a good idea...
> > 
> > E.g. virtualized HPET sits in userspace and thus uses
> > clock_gettime(CLOCK_MONOTONIC), so the drift will remain.
> > 
> > AFAICT the root cause is the following: KVM master clock uses the same
> > multiplier/shift as the vsyscall time in host userspace.  However, the
> > offsets in vsyscall_gtod_data get updated all the time with corrections
> > from NTP and so on.  Therefore even if the TSC rate is somewhat
> > miscalibrated, the error is kept small in vsyscall time functions.  OTOH
> > the offsets in KVM clock are basically never updated, so the error keeps
> > linearly growing over time.
> 
> This seems to be due to a typo:
> 
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5819,7 +5819,7 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
>         /* disable master clock if host does not trust, or does not
>          * use, TSC clocksource
>          */
> -       if (gtod->clock.vclock_mode != VCLOCK_TSC &&
> +       if (gtod->clock.vclock_mode == VCLOCK_TSC &&
>             atomic_read(&kvm_guest_has_master_clock) != 0)
>                 queue_work(system_long_wq, &pvclock_gtod_work);
> 
> 
> as a result, the global pvclock_gtod_data was kept up to date, but the
> requests to update per-vm copies were never issued.
> 
> With the patch I'm now seeing different test failures which I'm looking
> into.
> 
> Meanwhile I'm wondering if this scheme is not too costly: on my machine
> pvclock_gtod_notify() is called at kHz rate, and the work it schedules
> does
> 
> static void pvclock_gtod_update_fn(struct work_struct *work)
> {
> [...]
>         spin_lock(&kvm_lock);
>         list_for_each_entry(kvm, &vm_list, vm_list)
>                 kvm_for_each_vcpu(i, vcpu, kvm)
>                         kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
>         atomic_set(&kvm_guest_has_master_clock, 0);
>         spin_unlock(&kvm_lock);
> }
> 
> KVM_REQ_MASTERCLOCK_UPDATE makes all VCPUs synchronize:
> 
> static void kvm_gen_update_masterclock(struct kvm *kvm)
> {
> [...]
>         spin_lock(&ka->pvclock_gtod_sync_lock);
>         kvm_make_mclock_inprogress_request(kvm);
>         /* no guest entries from this point */
>         pvclock_update_vm_gtod_copy(kvm);
> 
>         kvm_for_each_vcpu(i, vcpu, kvm)
>                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> 
>         /* guest entries allowed */
>         kvm_for_each_vcpu(i, vcpu, kvm)
>                 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
> 
>         spin_unlock(&ka->pvclock_gtod_sync_lock);
> [...]
> }
> 
> so on a host with many VMs it may become an issue.

Ping

Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-05-25 18:33               ` Roman Kagan
@ 2016-05-26 14:47                 ` Roman Kagan
  2016-05-29 22:34                 ` Marcelo Tosatti
  1 sibling, 0 replies; 36+ messages in thread
From: Roman Kagan @ 2016-05-26 14:47 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, Denis V. Lunev, Marcelo Tosatti, Owen Hofmann

On Wed, May 25, 2016 at 09:33:07PM +0300, Roman Kagan wrote:
> On Tue, Apr 26, 2016 at 01:34:56PM +0300, Roman Kagan wrote:
> > On Mon, Apr 25, 2016 at 11:47:23AM +0300, Roman Kagan wrote:
> > > AFAICT the root cause is the following: KVM master clock uses the same
> > > multiplier/shift as the vsyscall time in host userspace.  However, the
> > > offsets in vsyscall_gtod_data get updated all the time with corrections
> > > from NTP and so on.  Therefore even if the TSC rate is somewhat
> > > miscalibrated, the error is kept small in vsyscall time functions.  OTOH
> > > the offsets in KVM clock are basically never updated, so the error keeps
> > > linearly growing over time.
> > 
> > This seems to be due to a typo:
> > 
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5819,7 +5819,7 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
> >         /* disable master clock if host does not trust, or does not
> >          * use, TSC clocksource
> >          */
> > -       if (gtod->clock.vclock_mode != VCLOCK_TSC &&
> > +       if (gtod->clock.vclock_mode == VCLOCK_TSC &&
> >             atomic_read(&kvm_guest_has_master_clock) != 0)
> >                 queue_work(system_long_wq, &pvclock_gtod_work);
> > 
> > 
> > as a result, the global pvclock_gtod_data was kept up to date, but the
> > requests to update per-vm copies were never issued.

Looks like it has already been noticed and acknowledged as a bug during
review, but made it into the upstream tree nonetheless:

On Wed, Feb 10, 2016 at 02:57:31PM +0100, Paolo Bonzini wrote:
> On 09/02/2016 19:41, Owen Hofmann wrote:
> > Should this patch change the condition in pvclock_gtod_notify?
> > Currently it looks like we'll only request a masterclock update when
> > tsc is no longer a good clocksource.
> 
> Yes, you're right.
> 
> Paolo

I'm gonna post a patch to fix it.

Roman.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case
  2016-05-25 18:33               ` Roman Kagan
  2016-05-26 14:47                 ` Roman Kagan
@ 2016-05-29 22:34                 ` Marcelo Tosatti
  1 sibling, 0 replies; 36+ messages in thread
From: Marcelo Tosatti @ 2016-05-29 22:34 UTC (permalink / raw)
  To: Roman Kagan, Paolo Bonzini, kvm, Denis V. Lunev

On Wed, May 25, 2016 at 09:33:07PM +0300, Roman Kagan wrote:
> On Tue, Apr 26, 2016 at 01:34:56PM +0300, Roman Kagan wrote:
> > On Mon, Apr 25, 2016 at 11:47:23AM +0300, Roman Kagan wrote:
> > > On Fri, Apr 22, 2016 at 08:08:47PM +0200, Paolo Bonzini wrote:
> > > > On 22/04/2016 15:32, Roman Kagan wrote:
> > > > > The first value is derived from the kvm_clock's tsc_to_system_mul and
> > > > > tsc_shift, and matches hosts's vcpu->hw_tsc_khz.  The second is
> > > > > calibrated using emulated HPET.  The difference is those +14 ppm.
> > > > > 
> > > > > This is on i7-2600, invariant TSC present, TSC scaling not present.
> > > > > 
> > > > > I'll dig further but I'd appreciate any comment on whether it was within
> > > > > tolerance or not.
> > > > 
> > > > The solution to the bug is to change the Hyper-V reference time MSR to
> > > > use the same formula as the Hyper-V TSC-based clock.  Likewise,
> > > > KVM_GET_CLOCK and KVM_SET_CLOCK should not use ktime_get_ns().
> > > 
> > > Umm, I'm not sure it's a good idea...
> > > 
> > > E.g. virtualized HPET sits in userspace and thus uses
> > > clock_gettime(CLOCK_MONOTONIC), so the drift will remain.
> > > 
> > > AFAICT the root cause is the following: KVM master clock uses the same
> > > multiplier/shift as the vsyscall time in host userspace.  However, the
> > > offsets in vsyscall_gtod_data get updated all the time with corrections
> > > from NTP and so on.  Therefore even if the TSC rate is somewhat
> > > miscalibrated, the error is kept small in vsyscall time functions.  OTOH
> > > the offsets in KVM clock are basically never updated, so the error keeps
> > > linearly growing over time.
> > 
> > This seems to be due to a typo:

It's not a typo: the code only updated the notifier on a
VCLOCK_TSC -> !VCLOCK_TSC transition.

> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5819,7 +5819,7 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
> >         /* disable master clock if host does not trust, or does not
> >          * use, TSC clocksource
> >          */
> > -       if (gtod->clock.vclock_mode != VCLOCK_TSC &&
> > +       if (gtod->clock.vclock_mode == VCLOCK_TSC &&
> >             atomic_read(&kvm_guest_has_master_clock) != 0)
> >                 queue_work(system_long_wq, &pvclock_gtod_work);
> > 
> > 
> > as a result, the global pvclock_gtod_data was kept up to date, but the
> > requests to update per-vm copies were never issued.
> > 
> > With the patch I'm now seeing different test failures which I'm looking
> > into.

The queue_work is not enough: it opens a window where guest clock
(via shared memory) and CLOCK_GETTIME can go out of sync.

> > 
> > Meanwhile I'm wondering if this scheme is not too costly: on my machine
> > pvclock_gtod_notify() is called at kHz rate, and the work it schedules
> > does
> > 
> > static void pvclock_gtod_update_fn(struct work_struct *work)
> > {
> > [...]
> >         spin_lock(&kvm_lock);
> >         list_for_each_entry(kvm, &vm_list, vm_list)
> >                 kvm_for_each_vcpu(i, vcpu, kvm)
> >                         kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
> >         atomic_set(&kvm_guest_has_master_clock, 0);
> >         spin_unlock(&kvm_lock);
> > }
> > 
> > KVM_REQ_MASTERCLOCK_UPDATE makes all VCPUs synchronize:
> > 
> > static void kvm_gen_update_masterclock(struct kvm *kvm)
> > {
> > [...]
> >         spin_lock(&ka->pvclock_gtod_sync_lock);
> >         kvm_make_mclock_inprogress_request(kvm);
> >         /* no guest entries from this point */
> >         pvclock_update_vm_gtod_copy(kvm);
> > 
> >         kvm_for_each_vcpu(i, vcpu, kvm)
> >                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> > 
> >         /* guest entries allowed */
> >         kvm_for_each_vcpu(i, vcpu, kvm)
> >                 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
> > 
> >         spin_unlock(&ka->pvclock_gtod_sync_lock);
> > [...]
> > }
> > 
> > so on a host with many VMs it may become an issue.
> 
> Ping
> 
> Roman.

1) Can call the notifier only when the frequency changes.
2) Can calculate how much drift there is between the clocks and, if it is
too large, not allow guest entry.

Will post a patch soon, one or two weeks max (again, independent of your patchset).


^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2016-05-30 10:41 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-02-08 15:18 [PATCH 0/4] kvmclock: improve accuracy Paolo Bonzini
2016-02-08 15:18 ` [PATCH 1/4] KVM: x86: rename argument to kvm_set_tsc_khz Paolo Bonzini
2016-02-11 15:01   ` Marcelo Tosatti
2016-02-08 15:18 ` [PATCH 2/4] KVM: x86: rewrite handling of scaled TSC for kvmclock Paolo Bonzini
2016-02-11 15:23   ` Marcelo Tosatti
2016-02-08 15:18 ` [PATCH 3/4] KVM: x86: pass kvm_get_time_scale arguments in hertz Paolo Bonzini
2016-02-08 15:18 ` [PATCH 4/4] KVM: x86: track actual TSC frequency from the timekeeper struct Paolo Bonzini
2016-02-09 18:41   ` Owen Hofmann
2016-02-10 13:57     ` Paolo Bonzini
2016-02-16 13:48   ` Marcelo Tosatti
2016-02-16 14:25     ` Marcelo Tosatti
2016-02-16 16:59       ` Paolo Bonzini
2016-02-19 14:12         ` Marcelo Tosatti
2016-02-19 15:53           ` Paolo Bonzini
2016-02-16 14:00 ` [PATCH 0/4] kvmclock: improve accuracy Marcelo Tosatti
  -- strict thread matches above, loose matches on Subject: below --
2016-01-28 14:04 [PATCH kvm-unit-tests] KVM: x86: add hyperv clock test case Paolo Bonzini
2016-01-28 14:04 ` Paolo Bonzini
2016-01-28 14:25 ` Andrey Smetanin
2016-01-28 14:50   ` Paolo Bonzini
2016-01-28 15:53     ` Paolo Bonzini
2016-01-28 18:45       ` Roman Kagan
2016-01-28 18:53     ` Roman Kagan
2016-01-28 21:28       ` Paolo Bonzini
2016-01-28 16:22 ` Roman Kagan
2016-02-03 16:37   ` Paolo Bonzini
2016-02-04  9:33     ` Roman Kagan
2016-02-04 10:13       ` Paolo Bonzini
2016-02-04 11:12         ` Roman Kagan
2016-04-21 17:01     ` Roman Kagan
2016-04-22 13:32       ` Roman Kagan
2016-04-22 18:08         ` Paolo Bonzini
2016-04-25  8:47           ` Roman Kagan
2016-04-26 10:34             ` Roman Kagan
2016-05-25 18:33               ` Roman Kagan
2016-05-26 14:47                 ` Roman Kagan
2016-05-29 22:34                 ` Marcelo Tosatti

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.