[PATCH 0/3] x86/kvm: implement Hyper-V reference TSC page clock

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/3] x86/kvm: implement Hyper-V reference TSC page clock
@ 2016-04-21 17:11 Roman Kagan
  2016-04-21 17:11 ` [PATCH 1/3] hyperv: add define for time unit Roman Kagan
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Roman Kagan @ 2016-04-21 17:11 UTC (permalink / raw)
  To: kvm; +Cc: Paolo Bonzini, Marcelo Tosatti, Denis V. Lunev, Roman Kagan

Implement Hyper-V paravirtualized clock -- reference TSC page -- in a
similar manner to kvm_clock.

The bulk of the change is in the last patch in the series; the preceding
ones are cleanups and refactorings paving the way to it.

Based on the patches and ideas by Andrey Smetanin and Paolo Bonzini.

Roman Kagan (3):
  hyperv: add define for time unit
  x86/kvm: factor out updating kvm_clock in guest
  x86/kvm: implement Hyper-V reference TSC page clock

 arch/x86/include/uapi/asm/hyperv.h |   3 +
 arch/x86/kernel/cpu/mshyperv.c     |   3 +-
 arch/x86/kvm/hyperv.c              | 135 +++++++++++++++++++++++++++++++------
 arch/x86/kvm/hyperv.h              |   3 +
 arch/x86/kvm/trace.h               |  25 +++++++
 arch/x86/kvm/x86.c                 | 122 ++++++++++++++++++---------------
 drivers/hv/hv.c                    |   3 +-
 drivers/hv/hv_util.c               |   2 +-
 8 files changed, 218 insertions(+), 78 deletions(-)

-- 
2.5.5


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/3] hyperv: add define for time unit
  2016-04-21 17:11 [PATCH 0/3] x86/kvm: implement Hyper-V reference TSC page clock Roman Kagan
@ 2016-04-21 17:11 ` Roman Kagan
  2016-04-21 17:11 ` [PATCH 2/3] x86/kvm: factor out updating kvm_clock in guest Roman Kagan
  2016-04-21 17:11 ` [PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock Roman Kagan
  2 siblings, 0 replies; 9+ messages in thread
From: Roman Kagan @ 2016-04-21 17:11 UTC (permalink / raw)
  To: kvm; +Cc: Paolo Bonzini, Marcelo Tosatti, Denis V. Lunev, Roman Kagan

Hyper-V uses 100ns time units in all time-related places (clocks,
timers, etc.).  Define a constant for it and use it throughout the code.

Signed-off-by: Roman Kagan <rkagan@virtuozzo.com>
---
 arch/x86/include/uapi/asm/hyperv.h |  3 +++
 arch/x86/kernel/cpu/mshyperv.c     |  3 ++-
 arch/x86/kvm/hyperv.c              | 12 ++++++++----
 drivers/hv/hv.c                    |  3 ++-
 drivers/hv/hv_util.c               |  2 +-
 5 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 9b1a918..bd0d629 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -18,6 +18,9 @@
 #define HYPERV_CPUID_MIN			0x40000005
 #define HYPERV_CPUID_MAX			0x4000ffff
 
+/* time unit in Hyper-V clocks and timers */
+#define HV_NSEC_PER_TICK			100
+
 /*
  * Feature identification. EAX indicates which features are available
  * to the partition based upon the current partition privileges.
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 4e7c693..ae9421f 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -180,7 +180,8 @@ static void __init ms_hyperv_init_platform(void)
 #endif
 
 	if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
-		clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
+		clocksource_register_hz(&hyperv_cs,
+					NSEC_PER_SEC / HV_NSEC_PER_TICK);
 
 #ifdef CONFIG_X86_IO_APIC
 	no_timer_check = 1;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 01bd7b7..2641907 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -386,7 +386,8 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
 
 static u64 get_time_ref_counter(struct kvm *kvm)
 {
-	return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+	return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset,
+		       HV_NSEC_PER_TICK);
 }
 
 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -460,7 +461,8 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
 
 		hrtimer_start(&stimer->timer,
 			      ktime_add_ns(ktime_now,
-					   100 * (stimer->exp_time - time_now)),
+					   HV_NSEC_PER_TICK *
+					   (stimer->exp_time - time_now)),
 			      HRTIMER_MODE_ABS);
 		return 0;
 	}
@@ -481,7 +483,9 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
 					   time_now, stimer->count);
 
 	hrtimer_start(&stimer->timer,
-		      ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)),
+		      ktime_add_ns(ktime_now,
+				   HV_NSEC_PER_TICK *
+				   (stimer->count - time_now)),
 		      HRTIMER_MODE_ABS);
 	return 0;
 }
@@ -836,7 +840,7 @@ static u64 current_task_runtime_100ns(void)
 	cputime_t utime, stime;
 
 	task_cputime_adjusted(current, &utime, &stime);
-	return div_u64(cputime_to_nsecs(utime + stime), 100);
+	return div_u64(cputime_to_nsecs(utime + stime), HV_NSEC_PER_TICK);
 }
 
 static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index a1c086b..46b5f8c 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -255,7 +255,8 @@ int hv_init(void)
 		tsc_msr.guest_physical_address = vmalloc_to_pfn(va_tsc);
 
 		wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
-		clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
+		clocksource_register_hz(&hyperv_cs_tsc,
+					NSEC_PER_SEC / HV_NSEC_PER_TICK);
 	}
 #endif
 	return 0;
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index d5acaa2..fe19605 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -168,7 +168,7 @@ static inline void do_adj_guesttime(u64 hosttime)
 	s64 host_tns;
 	struct timespec host_ts;
 
-	host_tns = (hosttime - WLTIMEDELTA) * 100;
+	host_tns = (hosttime - WLTIMEDELTA) * HV_NSEC_PER_TICK;
 	host_ts = ns_to_timespec(host_tns);
 
 	do_settimeofday(&host_ts);
-- 
2.5.5


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/3] x86/kvm: factor out updating kvm_clock in guest
  2016-04-21 17:11 [PATCH 0/3] x86/kvm: implement Hyper-V reference TSC page clock Roman Kagan
  2016-04-21 17:11 ` [PATCH 1/3] hyperv: add define for time unit Roman Kagan
@ 2016-04-21 17:11 ` Roman Kagan
  2016-04-21 17:11 ` [PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock Roman Kagan
  2 siblings, 0 replies; 9+ messages in thread
From: Roman Kagan @ 2016-04-21 17:11 UTC (permalink / raw)
  To: kvm; +Cc: Paolo Bonzini, Marcelo Tosatti, Denis V. Lunev, Roman Kagan

Factor out the actual updating of the kvm_clock in the guest memory from
preparing the values to be put there.

This paves the way to adding support for another kind of shared memory
clock (Hyper-V reference TSC page) that will use the same data.

Signed-off-by: Roman Kagan <rkagan@virtuozzo.com>
---
 arch/x86/kvm/x86.c | 117 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 62 insertions(+), 55 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9b7798c..a4a32ab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1713,6 +1713,67 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
+static void kvm_pvclock_update(struct kvm_vcpu *v, bool use_master_clock)
+{
+	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct pvclock_vcpu_time_info guest_hv_clock;
+	u8 pvclock_flags;
+
+	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+		&guest_hv_clock, sizeof(guest_hv_clock))))
+		return;
+
+	/* This VCPU is paused, but it's legal for a guest to read another
+	 * VCPU's kvmclock, so we really have to follow the specification where
+	 * it says that version is odd if data is being modified, and even after
+	 * it is consistent.
+	 *
+	 * Version field updates must be kept separate.  This is because
+	 * kvm_write_guest_cached might use a "rep movs" instruction, and
+	 * writes within a string instruction are weakly ordered.  So there
+	 * are three writes overall.
+	 *
+	 * As a small optimization, only write the version field in the first
+	 * and third write.  The vcpu->pv_time cache is still valid, because the
+	 * version field is the first in the struct.
+	 */
+	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+	vcpu->hv_clock.version = guest_hv_clock.version + 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+
+	smp_wmb();
+
+	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+	if (vcpu->pvclock_set_guest_stopped_request) {
+		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+		vcpu->pvclock_set_guest_stopped_request = false;
+	}
+
+	/* If the host uses TSC clocksource, then it is stable */
+	if (use_master_clock)
+		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
+	vcpu->hv_clock.flags = pvclock_flags;
+
+	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock));
+
+	smp_wmb();
+
+	vcpu->hv_clock.version++;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
 	unsigned long flags, tgt_tsc_khz;
@@ -1720,8 +1781,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	struct kvm_arch *ka = &v->kvm->arch;
 	s64 kernel_ns;
 	u64 tsc_timestamp, host_tsc;
-	struct pvclock_vcpu_time_info guest_hv_clock;
-	u8 pvclock_flags;
 	bool use_master_clock;
 
 	kernel_ns = 0;
@@ -1792,59 +1851,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_guest_tsc = tsc_timestamp;
 
-	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
-		&guest_hv_clock, sizeof(guest_hv_clock))))
-		return 0;
-
-	/* This VCPU is paused, but it's legal for a guest to read another
-	 * VCPU's kvmclock, so we really have to follow the specification where
-	 * it says that version is odd if data is being modified, and even after
-	 * it is consistent.
-	 *
-	 * Version field updates must be kept separate.  This is because
-	 * kvm_write_guest_cached might use a "rep movs" instruction, and
-	 * writes within a string instruction are weakly ordered.  So there
-	 * are three writes overall.
-	 *
-	 * As a small optimization, only write the version field in the first
-	 * and third write.  The vcpu->pv_time cache is still valid, because the
-	 * version field is the first in the struct.
-	 */
-	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
-	vcpu->hv_clock.version = guest_hv_clock.version + 1;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
-
-	smp_wmb();
-
-	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
-	if (vcpu->pvclock_set_guest_stopped_request) {
-		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-		vcpu->pvclock_set_guest_stopped_request = false;
-	}
-
-	/* If the host uses TSC clocksource, then it is stable */
-	if (use_master_clock)
-		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
-
-	vcpu->hv_clock.flags = pvclock_flags;
-
-	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock));
-
-	smp_wmb();
-
-	vcpu->hv_clock.version++;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
+	kvm_pvclock_update(v, use_master_clock);
 	return 0;
 }
 
-- 
2.5.5


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock
  2016-04-21 17:11 [PATCH 0/3] x86/kvm: implement Hyper-V reference TSC page clock Roman Kagan
  2016-04-21 17:11 ` [PATCH 1/3] hyperv: add define for time unit Roman Kagan
  2016-04-21 17:11 ` [PATCH 2/3] x86/kvm: factor out updating kvm_clock in guest Roman Kagan
@ 2016-04-21 17:11 ` Roman Kagan
  2016-04-25 20:54   ` Radim Krčmář
  2 siblings, 1 reply; 9+ messages in thread
From: Roman Kagan @ 2016-04-21 17:11 UTC (permalink / raw)
  To: kvm; +Cc: Paolo Bonzini, Marcelo Tosatti, Denis V. Lunev, Roman Kagan

Hyper-V's paravirtualized clock -- reference TSC page -- is similar to
kvm_clock with a few important differences:

 - it uses slightly different arithmetic expression for the time in the
   guest, and therefore needs different values to be stored in the guest
   memory

 - the sequence number doesn't have the seqlock semantics; however, its
   value of zero indicates that the clock is in invalid state and the
   guest should proceed to alternative clock sources.  This is used to
   make the guest skip this clock source when it's being updated by the
   host [suggested by Paolo Bonzini].

 - there's a single reference TSC page per VM, therefore the update of
   its contents is bound to VCPU #0 only

In all other respects the clocks are similar, so implement Hyper-V
reference TSC page by updating its contents at the same point and using
the same data as that for kvm_clock.

Based on the patches and ideas by Andrey Smetanin and Paolo Bonzini.

Signed-off-by: Roman Kagan <rkagan@virtuozzo.com>
---
Note: the new clocksource demonstrates a drift of ~15 ppm on my test
machine; it's identical to the drift of kvm_clock, and is a bug in the
calculation of kvm_clock parameters which I'm still chasing, and which
affects both clocks.

 arch/x86/kvm/hyperv.c | 123 ++++++++++++++++++++++++++++++++++++++++++++------
 arch/x86/kvm/hyperv.h |   3 ++
 arch/x86/kvm/trace.h  |  25 ++++++++++
 arch/x86/kvm/x86.c    |   7 ++-
 4 files changed, 141 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 2641907..dd453a7 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -30,6 +30,7 @@
 #include <linux/highmem.h>
 #include <asm/apicdef.h>
 #include <trace/events/kvm.h>
+#include <asm/pvclock.h>
 
 #include "trace.h"
 
@@ -797,23 +798,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 		mark_page_dirty(kvm, gfn);
 		break;
 	}
-	case HV_X64_MSR_REFERENCE_TSC: {
-		u64 gfn;
-		HV_REFERENCE_TSC_PAGE tsc_ref;
-
-		memset(&tsc_ref, 0, sizeof(tsc_ref));
+	case HV_X64_MSR_REFERENCE_TSC:
 		hv->hv_tsc_page = data;
-		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-			break;
-		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-		if (kvm_write_guest(
-				kvm,
-				gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-				&tsc_ref, sizeof(tsc_ref)))
-			return 1;
-		mark_page_dirty(kvm, gfn);
+		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 		break;
-	}
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 		return kvm_hv_msr_set_crash_data(vcpu,
 						 msr - HV_X64_MSR_CRASH_P0,
@@ -1143,3 +1132,107 @@ set_result:
 	kvm_hv_hypercall_set_result(vcpu, ret);
 	return 1;
 }
+
+bool kvm_hv_tscpage_need_update(struct kvm_vcpu *v)
+{
+	return v == kvm_get_vcpu(v->kvm, 0) &&
+		(v->kvm->arch.hyperv.hv_tsc_page &
+		 HV_X64_MSR_TSC_REFERENCE_ENABLE);
+}
+
+static int pvclock_to_tscpage(struct pvclock_vcpu_time_info *hv_clock,
+			       HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+#ifdef CONFIG_X86_64
+	/*
+	 * kvm_clock:
+	 * nsec = ((((rdtsc - tsc_timestamp) << tsc_shift) *
+	 *          tsc_to_system_mul) >> 32) + system_time
+	 *
+	 * hyperv ref tsc page:
+	 * nsec / 100 = ((rdtsc * tsc_scale) >> 64) + tsc_offset
+	 *
+	 * this gives:
+	 *
+	 * tsc_scale = (tsc_to_system_mul << (tsc_shift + 32)) / 100
+	 *
+	 * tsc_offset = (system_time -
+	 *               pvclock_scale_delta(tsc_timestamp, tsc_to_system_mul
+	 *                                   tsc_shift)) / 100
+	 *
+	 * Note that although tsc_to_system_mul is 32 bit, we may need 128 bit
+	 * division to calculate tsc_scale
+	 */
+	u64 tsc_scale_hi, tsc_scale_lo;
+	s64 tsc_offset;
+
+	/* impossible by definition of tsc_shift but we'd better check */
+	if (hv_clock->tsc_shift >= 32 || hv_clock->tsc_shift <= -32)
+		return -ERANGE;
+
+	tsc_scale_lo = hv_clock->tsc_to_system_mul;
+	tsc_scale_hi = tsc_scale_lo >> (32 - hv_clock->tsc_shift);
+	/* check if division will overflow */
+	if (tsc_scale_hi >= HV_NSEC_PER_TICK)
+		return -ERANGE;
+	tsc_scale_lo <<= (32 + hv_clock->tsc_shift);
+
+	__asm__ ("divq %[divisor]"
+		 : "+a"(tsc_scale_lo), "+d"(tsc_scale_hi)
+		 : [divisor]"rm"((u64)HV_NSEC_PER_TICK));
+	tsc_ref->tsc_scale = tsc_scale_lo;
+
+	tsc_offset = hv_clock->system_time;
+	tsc_offset -= pvclock_scale_delta(hv_clock->tsc_timestamp,
+					  hv_clock->tsc_to_system_mul,
+					  hv_clock->tsc_shift);
+	tsc_ref->tsc_offset = div_s64(tsc_offset, HV_NSEC_PER_TICK);
+
+	return 0;
+#else
+	return -ERANGE;
+#endif
+}
+
+void kvm_hv_tscpage_update(struct kvm_vcpu *v, bool use_master_clock)
+{
+	struct kvm *kvm = v->kvm;
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	HV_REFERENCE_TSC_PAGE tsc_ref;
+	u32 tsc_seq;
+	gpa_t gpa;
+	u64 gfn;
+
+	gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+	gpa = gfn_to_gpa(gfn);
+
+	if (kvm_read_guest(kvm, gpa, &tsc_ref, sizeof(tsc_ref)))
+		return;
+
+	/*
+	 * mark tsc page invalid either for the duration of contents update or
+	 * for good if it can't be used
+	 */
+	tsc_seq = tsc_ref.tsc_sequence;
+	tsc_ref.tsc_sequence = 0;
+
+	BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+	kvm_write_guest(kvm, gpa, &tsc_ref, sizeof(tsc_ref.tsc_sequence));
+	smp_wmb();
+
+	if (!use_master_clock)
+		return;
+
+	if (pvclock_to_tscpage(&v->arch.hv_clock, &tsc_ref))
+		return;
+
+	kvm_write_guest(kvm, gpa, &tsc_ref, sizeof(tsc_ref));
+	smp_wmb();
+
+	tsc_ref.tsc_sequence = tsc_seq + 1;
+	if (tsc_ref.tsc_sequence == 0xffffffff || tsc_ref.tsc_sequence == 0)
+		tsc_ref.tsc_sequence = 1;
+	trace_kvm_hv_tscpage_update(&tsc_ref);
+
+	kvm_write_guest(kvm, gpa, &tsc_ref, sizeof(tsc_ref.tsc_sequence));
+}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60eccd4..c2fd494 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
 
 void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
 
+bool kvm_hv_tscpage_need_update(struct kvm_vcpu *v);
+void kvm_hv_tscpage_update(struct kvm_vcpu *v, bool use_master_clock);
+
 #endif
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 2f1ea2f..7f76c87 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1292,6 +1292,31 @@ TRACE_EVENT(kvm_hv_stimer_cleanup,
 		  __entry->vcpu_id, __entry->timer_index)
 );
 
+/*
+ * Tracepoint for kvm_hv_tscpage_update.
+ */
+TRACE_EVENT(kvm_hv_tscpage_update,
+	TP_PROTO(HV_REFERENCE_TSC_PAGE *tsc_ref),
+	TP_ARGS(tsc_ref),
+
+	TP_STRUCT__entry(
+		__field(u32, tsc_sequence)
+		__field(u64, tsc_scale)
+		__field(s64, tsc_offset)
+	),
+
+	TP_fast_assign(
+		__entry->tsc_sequence = tsc_ref->tsc_sequence;
+		__entry->tsc_scale = tsc_ref->tsc_scale;
+		__entry->tsc_offset = tsc_ref->tsc_offset;
+	),
+
+	TP_printk("tsc_ref_page { sequence %u, scale 0x%llx, offset %lld }",
+		  __entry->tsc_sequence,
+		  __entry->tsc_scale,
+		  __entry->tsc_offset)
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a4a32ab..f12130a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1833,7 +1833,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	local_irq_restore(flags);
 
-	if (!vcpu->pv_time_enabled)
+	if (!vcpu->pv_time_enabled && !kvm_hv_tscpage_need_update(v))
 		return 0;
 
 	if (kvm_has_tsc_control)
@@ -1851,7 +1851,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_guest_tsc = tsc_timestamp;
 
-	kvm_pvclock_update(v, use_master_clock);
+	if (vcpu->pv_time_enabled)
+		kvm_pvclock_update(v, use_master_clock);
+	if (kvm_hv_tscpage_need_update(v))
+		kvm_hv_tscpage_update(v, use_master_clock);
 	return 0;
 }
 
-- 
2.5.5


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock
  2016-04-21 17:11 ` [PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock Roman Kagan
@ 2016-04-25 20:54   ` Radim Krčmář
  2016-04-26  9:02     ` Roman Kagan
  0 siblings, 1 reply; 9+ messages in thread
From: Radim Krčmář @ 2016-04-25 20:54 UTC (permalink / raw)
  To: Roman Kagan; +Cc: kvm, Paolo Bonzini, Marcelo Tosatti, Denis V. Lunev

2016-04-21 20:11+0300, Roman Kagan:
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> @@ -797,23 +798,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
>  		mark_page_dirty(kvm, gfn);
>  		break;
>  	}
> +	case HV_X64_MSR_REFERENCE_TSC:

(Would be nicer to check for HV_X64_MSR_REFERENCE_TSC_AVAILABLE.)

>  		hv->hv_tsc_page = data;
> +		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
> +			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

The MSR value is global and will be seen by other VCPUs before we write
the page for the first time, which means there is an extremely unlikely
race that could read random data from a guest page and interpret it as
time.  Initialization before setting hv_tsc_page would be fine.

(Also, TLFS 4.0b says that the guest can pick any frame in the GPA
 space.  The guest could specify a frame that wouldn't be mapped in KVM
 and the guest would fail for no good reason.  HyperV's "overlay pages"
 likely don't read or overwrite content of mapped frames either.
 I think it would be safer to create a new mapping for the page ...)

> @@ -1143,3 +1132,107 @@ set_result:
> +static int pvclock_to_tscpage(struct pvclock_vcpu_time_info *hv_clock,
> +			       HV_REFERENCE_TSC_PAGE *tsc_ref)
> +{
| [...]
> +	 * tsc_scale = (tsc_to_system_mul << (tsc_shift + 32)) / 100
| [...]
> +	 * Note that although tsc_to_system_mul is 32 bit, we may need 128 bit
> +	 * division to calculate tsc_scale

Linux has a function that handles u96/u64 with 64 and even 32 bit
arithmetics, mul_u64_u32_div().  Do you think this would be ok?

  tsc_ref->tsc_scale = mul_u64_u32_div(1ULL << hv_clock->tsc_shift + 32,
                                       tsc_to_system_mul, 100);

Thanks.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock
  2016-04-25 20:54   ` Radim Krčmář
@ 2016-04-26  9:02     ` Roman Kagan
  2016-04-26 13:00       ` Radim Krčmář
  0 siblings, 1 reply; 9+ messages in thread
From: Roman Kagan @ 2016-04-26  9:02 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: kvm, Paolo Bonzini, Marcelo Tosatti, Denis V. Lunev

On Mon, Apr 25, 2016 at 10:54:12PM +0200, Radim Krčmář wrote:
> 2016-04-21 20:11+0300, Roman Kagan:
> > diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> > @@ -797,23 +798,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
> >  		mark_page_dirty(kvm, gfn);
> >  		break;
> >  	}
> > +	case HV_X64_MSR_REFERENCE_TSC:
> 
> (Would be nicer to check for HV_X64_MSR_REFERENCE_TSC_AVAILABLE.)

Hmm, interesting point.  This is a jugdement call, whether we should
refuse processing this MSR if we didn't announce its support to the
guest in the respective cpuid leaf (I personally don't think so).  We
don't do it for a number of other MSRs, if we should then it probably
has to be a separate patch fixing all of them.

> >  		hv->hv_tsc_page = data;
> > +		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
> > +			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
> 
> The MSR value is global and will be seen by other VCPUs before we write
> the page for the first time, which means there is an extremely unlikely
> race that could read random data from a guest page and interpret it as
> time.  Initialization before setting hv_tsc_page would be fine.

KVM_REQ_MASTERCLOCK_UPDATE will make sure the page has valid contents
before returning to the guest.  As for other VCPUs it's up to the guest
to synchronize access to the page with this VCPU; we can't prevent them
from reading it before we return to the guest.

> (Also, TLFS 4.0b says that the guest can pick any frame in the GPA
>  space.  The guest could specify a frame that wouldn't be mapped in KVM
>  and the guest would fail for no good reason.  HyperV's "overlay pages"
>  likely don't read or overwrite content of mapped frames either.
>  I think it would be safer to create a new mapping for the page ...)

I've never seen this happen; if this is really possible we'll have to do
more (e.g. the migration of the contents of this page won't happen
automatically).  I'll double-check with the spec, thanks.

> > @@ -1143,3 +1132,107 @@ set_result:
> > +static int pvclock_to_tscpage(struct pvclock_vcpu_time_info *hv_clock,
> > +			       HV_REFERENCE_TSC_PAGE *tsc_ref)
> > +{
> | [...]
> > +	 * tsc_scale = (tsc_to_system_mul << (tsc_shift + 32)) / 100
> | [...]
> > +	 * Note that although tsc_to_system_mul is 32 bit, we may need 128 bit
> > +	 * division to calculate tsc_scale
> 
> Linux has a function that handles u96/u64 with 64 and even 32 bit
> arithmetics, mul_u64_u32_div().  Do you think this would be ok?
> 
>   tsc_ref->tsc_scale = mul_u64_u32_div(1ULL << hv_clock->tsc_shift + 32,
>                                        tsc_to_system_mul, 100);

Indeed, thanks (shame on me for missing it in math64.h).

Roman.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock
  2016-04-26  9:02     ` Roman Kagan
@ 2016-04-26 13:00       ` Radim Krčmář
  2016-04-26 14:16         ` Roman Kagan
  0 siblings, 1 reply; 9+ messages in thread
From: Radim Krčmář @ 2016-04-26 13:00 UTC (permalink / raw)
  To: Roman Kagan, kvm, Paolo Bonzini, Marcelo Tosatti, Denis V. Lunev

2016-04-26 12:02+0300, Roman Kagan:
> On Mon, Apr 25, 2016 at 10:54:12PM +0200, Radim Krčmář wrote:
>> 2016-04-21 20:11+0300, Roman Kagan:
>> > diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
>> > @@ -797,23 +798,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
>> >  		mark_page_dirty(kvm, gfn);
>> >  		break;
>> >  	}
>> > +	case HV_X64_MSR_REFERENCE_TSC:
>> 
>> (Would be nicer to check for HV_X64_MSR_REFERENCE_TSC_AVAILABLE.)
> 
> Hmm, interesting point.  This is a jugdement call, whether we should
> refuse processing this MSR if we didn't announce its support to the
> guest in the respective cpuid leaf (I personally don't think so).  We
> don't do it for a number of other MSRs, if we should then it probably
> has to be a separate patch fixing all of them.

Ok.

>> >  		hv->hv_tsc_page = data;
>> > +		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
>> > +			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
>> 
>> The MSR value is global and will be seen by other VCPUs before we write
>> the page for the first time, which means there is an extremely unlikely
>> race that could read random data from a guest page and interpret it as
>> time.  Initialization before setting hv_tsc_page would be fine.
> 
> KVM_REQ_MASTERCLOCK_UPDATE will make sure the page has valid contents
> before returning to the guest.

Yes.

> before returning to the guest.  As for other VCPUs it's up to the guest
> to synchronize access to the page with this VCPU;

One method of synchronization is checking whether the other vcpu already
enabled HV_X64_MSR_REFERENCE_TSC by reading the MSR ... the method is
not a clear guest error (though people capable of doing it are going to
bug) and we'd have this race

    vcpu0                      |   vcpu1
  hv->hv_tsc_page = data;      | *guest rdmsr HV_X64_MSR_REFERENCE_TSC*
                               | data = hv->hv_tsc_page;
                               | kvm_x86_ops->run(vcpu);
                               | *guest reads the page*
  kvm_gen_update_masterclock() |

Another minor benefit of zeroing TscSequence before writing data is that
counting always starts at 0.
(The code doesn't handle remapping anyway.)

>                                                   we can't prevent them
> from reading it before we return to the guest.

(Yeah, it's not impossible, but we don't want to.)

>> (Also, TLFS 4.0b says that the guest can pick any frame in the GPA
>>  space.  The guest could specify a frame that wouldn't be mapped in KVM
>>  and the guest would fail for no good reason.  HyperV's "overlay pages"
>>  likely don't read or overwrite content of mapped frames either.
>>  I think it would be safer to create a new mapping for the page ...)
> 
> I've never seen this happen; if this is really possible we'll have to do
> more (e.g. the migration of the contents of this page won't happen
> automatically).  I'll double-check with the spec, thanks.

Thanks, I was reading mainly 8.1.3 GPA Overlay Pages.
Guests probably don't utilize that, but all overlay pages would have
this bug, so I'm ok with ignoring it for now too.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock
  2016-04-26 13:00       ` Radim Krčmář
@ 2016-04-26 14:16         ` Roman Kagan
  2016-04-26 14:36           ` Radim Krčmář
  0 siblings, 1 reply; 9+ messages in thread
From: Roman Kagan @ 2016-04-26 14:16 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: kvm, Paolo Bonzini, Marcelo Tosatti, Denis V. Lunev

On Tue, Apr 26, 2016 at 03:00:45PM +0200, Radim Krčmář wrote:
> 2016-04-26 12:02+0300, Roman Kagan:
> > On Mon, Apr 25, 2016 at 10:54:12PM +0200, Radim Krčmář wrote:
> >> 2016-04-21 20:11+0300, Roman Kagan:
> >> >  		hv->hv_tsc_page = data;
> >> > +		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
> >> > +			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
> >> 
> >> The MSR value is global and will be seen by other VCPUs before we write
> >> the page for the first time, which means there is an extremely unlikely
> >> race that could read random data from a guest page and interpret it as
> >> time.  Initialization before setting hv_tsc_page would be fine.
> > 
> > KVM_REQ_MASTERCLOCK_UPDATE will make sure the page has valid contents
> > before returning to the guest.
> 
> Yes.
> 
> > before returning to the guest.  As for other VCPUs it's up to the guest
> > to synchronize access to the page with this VCPU;
> 
> One method of synchronization is checking whether the other vcpu already
> enabled HV_X64_MSR_REFERENCE_TSC by reading the MSR ... the method is
> not a clear guest error (though people capable of doing it are going to
> bug) and we'd have this race
> 
>     vcpu0                      |   vcpu1
>   hv->hv_tsc_page = data;      | *guest rdmsr HV_X64_MSR_REFERENCE_TSC*
>                                | data = hv->hv_tsc_page;
>                                | kvm_x86_ops->run(vcpu);
>                                | *guest reads the page*
>   kvm_gen_update_masterclock() |

I can hardly imagine introducing a clocksource to avoid MSR reads, and
synchronizing access to it by reads of another MSR ;)

> Another minor benefit of zeroing TscSequence before writing data is that
> counting always starts at 0.

... which is arguably a minor disadvantage as it would reset the
sequence number on migration.

That said, I don't really feel strong about it, and I'm OK zeroing the
tsc page out if you think it worth while.

Roman.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock
  2016-04-26 14:16         ` Roman Kagan
@ 2016-04-26 14:36           ` Radim Krčmář
  0 siblings, 0 replies; 9+ messages in thread
From: Radim Krčmář @ 2016-04-26 14:36 UTC (permalink / raw)
  To: Roman Kagan, kvm, Paolo Bonzini, Marcelo Tosatti, Denis V. Lunev

2016-04-26 17:16+0300, Roman Kagan:
> On Tue, Apr 26, 2016 at 03:00:45PM +0200, Radim Krčmář wrote:
>> 2016-04-26 12:02+0300, Roman Kagan:
>> > On Mon, Apr 25, 2016 at 10:54:12PM +0200, Radim Krčmář wrote:
>> >> 2016-04-21 20:11+0300, Roman Kagan:
>> >> >  		hv->hv_tsc_page = data;
>> >> > +		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
>> >> > +			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
>> >> 
>> >> The MSR value is global and will be seen by other VCPUs before we write
>> >> the page for the first time, which means there is an extremely unlikely
>> >> race that could read random data from a guest page and interpret it as
>> >> time.  Initialization before setting hv_tsc_page would be fine.
>> > 
>> > KVM_REQ_MASTERCLOCK_UPDATE will make sure the page has valid contents
>> > before returning to the guest.  As for other VCPUs it's up to the guest
>> > to synchronize access to the page with this VCPU;
>> 
>> One method of synchronization is checking whether the other vcpu already
>> enabled HV_X64_MSR_REFERENCE_TSC by reading the MSR ... the method is
>> not a clear guest error (though people capable of doing it are going to
>> bug) and we'd have this race
>> 
>>     vcpu0                      |   vcpu1
>>   hv->hv_tsc_page = data;      | *guest rdmsr HV_X64_MSR_REFERENCE_TSC*
>>                                | data = hv->hv_tsc_page;
>>                                | kvm_x86_ops->run(vcpu);
>>                                | *guest reads the page*
>>   kvm_gen_update_masterclock() |
> 
> I can hardly imagine introducing a clocksource to avoid MSR reads, and
> synchronizing access to it by reads of another MSR ;)

Yes, any sane guest would definitely use memory for that, but
synchronization (= letting all VCPUs know where the TSC page is present)
is a boot-time only operation and doesn't ruin the idea.  Multiple OSes
inside one partitioned VM are harder to synchronize, but luckily no-one
does that. :)

>> Another minor benefit of zeroing TscSequence before writing data is that
>> counting always starts at 0.
> 
> ... which is arguably a minor disadvantage as it would reset the
> sequence number on migration.

Migration (= save/restore) shouldn't write to the MSR.  I can see that
other VCPUs might write the same value at/after boot time and expect
that the counter wouldn't reset, though ...

> That said, I don't really feel strong about it, and I'm OK zeroing the
> tsc page out if you think it worth while.

Nah, it turned out that guests can bug both ways, so let's keep it
uninitialized.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2016-04-26 14:36 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-21 17:11 [PATCH 0/3] x86/kvm: implement Hyper-V reference TSC page clock Roman Kagan
2016-04-21 17:11 ` [PATCH 1/3] hyperv: add define for time unit Roman Kagan
2016-04-21 17:11 ` [PATCH 2/3] x86/kvm: factor out updating kvm_clock in guest Roman Kagan
2016-04-21 17:11 ` [PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock Roman Kagan
2016-04-25 20:54   ` Radim Krčmář
2016-04-26  9:02     ` Roman Kagan
2016-04-26 13:00       ` Radim Krčmář
2016-04-26 14:16         ` Roman Kagan
2016-04-26 14:36           ` Radim Krčmář

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.