* [PATCH v2 0/4] KVM: x86: kvmclock cleanups for Hyper-V TSC page
@ 2016-09-19 11:39 Paolo Bonzini
  2016-09-19 11:39 ` [PATCH 1/4] KVM: x86: always fill in vcpu->arch.hv_clock Paolo Bonzini
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Paolo Bonzini @ 2016-09-19 11:39 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: rkrcmar, den, rkagan, peterhornyack

This implements the plan that we discussed at KVM Forum.  Patch 3
implements a new function that returns the current value of kvmclock as
read by the guest.  The function is then used to provide the Hyper-V
time reference counter.  Now that the reference counter is synchronized
with kvmclock, the TSC page can be supported nicely.

Paolo

v1->v2: use the TSC page information to compute the time reference counter value
	fixed the case where the TSC sequence is 0xFFFFFFFE

Paolo Bonzini (4):
  KVM: x86: always fill in vcpu->arch.hv_clock
  KVM: x86: initialize kvmclock_offset
  KVM: x86: introduce get_kvmclock_ns
  KVM: x86: Hyper-V tsc page setup

 arch/x86/entry/vdso/vclock_gettime.c |   2 +-
 arch/x86/include/asm/kvm_host.h      |   2 +
 arch/x86/include/asm/pvclock.h       |   5 +-
 arch/x86/kernel/pvclock.c            |   2 +-
 arch/x86/kvm/hyperv.c                | 162 +++++++++++++++++++++++++++++++----
 arch/x86/kvm/hyperv.h                |   3 +
 arch/x86/kvm/x86.c                   | 158 ++++++++++++++++++++--------------
 arch/x86/kvm/x86.h                   |   6 +-
 8 files changed, 253 insertions(+), 87 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 1/4] KVM: x86: always fill in vcpu->arch.hv_clock
  2016-09-19 11:39 [PATCH v2 0/4] KVM: x86: kvmclock cleanups for Hyper-V TSC page Paolo Bonzini
@ 2016-09-19 11:39 ` Paolo Bonzini
  2016-09-19 16:33   ` Roman Kagan
  2016-09-19 11:39 ` [PATCH 2/4] KVM: x86: initialize kvmclock_offset Paolo Bonzini
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Paolo Bonzini @ 2016-09-19 11:39 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: rkrcmar, den, rkagan, peterhornyack

We will use it in the next patches for KVM_GET_CLOCK and as a basis for the
contents of the Hyper-V TSC page.  Get the values from the Linux
timekeeper even if kvmclock is not enabled.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 109 +++++++++++++++++++++++++++++------------------------
 1 file changed, 59 insertions(+), 50 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4ae92599d34..d1e830715e40 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1722,6 +1722,60 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct pvclock_vcpu_time_info guest_hv_clock;
+
+	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+		&guest_hv_clock, sizeof(guest_hv_clock))))
+		return;
+
+	/* This VCPU is paused, but it's legal for a guest to read another
+	 * VCPU's kvmclock, so we really have to follow the specification where
+	 * it says that version is odd if data is being modified, and even after
+	 * it is consistent.
+	 *
+	 * Version field updates must be kept separate.  This is because
+	 * kvm_write_guest_cached might use a "rep movs" instruction, and
+	 * writes within a string instruction are weakly ordered.  So there
+	 * are three writes overall.
+	 *
+	 * As a small optimization, only write the version field in the first
+	 * and third write.  The vcpu->pv_time cache is still valid, because the
+	 * version field is the first in the struct.
+	 */
+	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+	vcpu->hv_clock.version = guest_hv_clock.version + 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+
+	smp_wmb();
+
+	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+	vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+	if (vcpu->pvclock_set_guest_stopped_request) {
+		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+		vcpu->pvclock_set_guest_stopped_request = false;
+	}
+
+	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock));
+
+	smp_wmb();
+
+	vcpu->hv_clock.version++;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
 	unsigned long flags, tgt_tsc_khz;
@@ -1729,7 +1783,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	struct kvm_arch *ka = &v->kvm->arch;
 	s64 kernel_ns;
 	u64 tsc_timestamp, host_tsc;
-	struct pvclock_vcpu_time_info guest_hv_clock;
 	u8 pvclock_flags;
 	bool use_master_clock;
 
@@ -1783,8 +1836,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	local_irq_restore(flags);
 
-	if (!vcpu->pv_time_enabled)
-		return 0;
+	/* With all the info we got, fill in the values */
 
 	if (kvm_has_tsc_control)
 		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
@@ -1796,64 +1848,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = tgt_tsc_khz;
 	}
 
-	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_guest_tsc = tsc_timestamp;
 
-	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
-		&guest_hv_clock, sizeof(guest_hv_clock))))
-		return 0;
-
-	/* This VCPU is paused, but it's legal for a guest to read another
-	 * VCPU's kvmclock, so we really have to follow the specification where
-	 * it says that version is odd if data is being modified, and even after
-	 * it is consistent.
-	 *
-	 * Version field updates must be kept separate.  This is because
-	 * kvm_write_guest_cached might use a "rep movs" instruction, and
-	 * writes within a string instruction are weakly ordered.  So there
-	 * are three writes overall.
-	 *
-	 * As a small optimization, only write the version field in the first
-	 * and third write.  The vcpu->pv_time cache is still valid, because the
-	 * version field is the first in the struct.
-	 */
-	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
-	vcpu->hv_clock.version = guest_hv_clock.version + 1;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
-
-	smp_wmb();
-
-	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
-	if (vcpu->pvclock_set_guest_stopped_request) {
-		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-		vcpu->pvclock_set_guest_stopped_request = false;
-	}
-
 	/* If the host uses TSC clocksource, then it is stable */
+	pvclock_flags = 0;
 	if (use_master_clock)
 		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
 
 	vcpu->hv_clock.flags = pvclock_flags;
 
-	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock));
-
-	smp_wmb();
+	if (!vcpu->pv_time_enabled)
+		return 0;
 
-	vcpu->hv_clock.version++;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
+	kvm_setup_pvclock_page(v);
 	return 0;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread
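
The three-write sequence in kvm_setup_pvclock_page() above exists to
support a seqlock-style reader in the guest.  A minimal sketch of that
reader, assuming the guest has its struct pvclock_vcpu_time_info
mapped (the function name is illustrative; the real guest-side code is
pvclock_clocksource_read()):

/*
 * Retry while the version is odd (an update is in progress) or has
 * changed between the two reads.
 */
static u64 pvclock_read_ns(const volatile struct pvclock_vcpu_time_info *pvti)
{
	u32 version;
	u64 delta, ns;

	do {
		version = pvti->version & ~1;	/* odd means mid-update */
		rmb();				/* version before payload */
		delta = rdtsc_ordered() - pvti->tsc_timestamp;
		ns = pvti->system_time +
		     pvclock_scale_delta(delta, pvti->tsc_to_system_mul,
					 pvti->tsc_shift);
		rmb();				/* payload before re-check */
	} while (pvti->version != version);

	return ns;
}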

* [PATCH 2/4] KVM: x86: initialize kvmclock_offset
  2016-09-19 11:39 [PATCH v2 0/4] KVM: x86: kvmclock cleanups for Hyper-V TSC page Paolo Bonzini
  2016-09-19 11:39 ` [PATCH 1/4] KVM: x86: always fill in vcpu->arch.hv_clock Paolo Bonzini
@ 2016-09-19 11:39 ` Paolo Bonzini
  2016-09-19 11:39 ` [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns Paolo Bonzini
  2016-09-19 11:39 ` [PATCH 4/4] KVM: x86: Hyper-V tsc page setup Paolo Bonzini
  3 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2016-09-19 11:39 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: rkrcmar, den, rkagan, peterhornyack

Make the guest's kvmclock count up from zero, not from the host boot
time.  The guest cannot rely on the host boot time anyway, because it
changes on migration.  In addition, the numbers are easier on the eye,
and counting from zero matches the desired semantics of the Hyper-V
time reference counter.

Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d1e830715e40..00e569c3ca71 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7779,6 +7779,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	mutex_init(&kvm->arch.apic_map_lock);
 	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
+	kvm->arch.kvmclock_offset = -get_kernel_ns();
 	pvclock_update_vm_gtod_copy(kvm);
 
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread
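
The offset arithmetic is easy to check with hypothetical numbers.
kvmclock reports ktime_get_boot_ns() + kvmclock_offset, so if the host
has been up exactly 5 seconds when the VM is created:

	kvmclock_offset = -ktime_get_boot_ns() = -5,000,000,000 ns

	guest clock = ktime_get_boot_ns() + kvmclock_offset
	            = 5,000,000,000 ns - 5,000,000,000 ns
	            = 0 ns

and the guest counts up from there.  KVM_SET_CLOCK adjusts the same
offset later, for example when the clock is restored after migration.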

* [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns
  2016-09-19 11:39 [PATCH v2 0/4] KVM: x86: kvmclock cleanups for Hyper-V TSC page Paolo Bonzini
  2016-09-19 11:39 ` [PATCH 1/4] KVM: x86: always fill in vcpu->arch.hv_clock Paolo Bonzini
  2016-09-19 11:39 ` [PATCH 2/4] KVM: x86: initialize kvmclock_offset Paolo Bonzini
@ 2016-09-19 11:39 ` Paolo Bonzini
  2016-09-19 16:33   ` Roman Kagan
  2016-09-19 11:39 ` [PATCH 4/4] KVM: x86: Hyper-V tsc page setup Paolo Bonzini
  3 siblings, 1 reply; 10+ messages in thread
From: Paolo Bonzini @ 2016-09-19 11:39 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: rkrcmar, den, rkagan, peterhornyack

Introduce a function that reads the exact nanoseconds value that is
provided to the guest in kvmclock.  This crystallizes the notion of
kvmclock as a thin veneer over a stable TSC, that the guest will
(hopefully) convert with NTP.  In other words, kvmclock is *not* a
paravirtualized host-to-guest NTP.

Drop the get_kernel_ns() function, which was used both to get the base
value of the master clock and to get the current value of kvmclock.
The former use is replaced by ktime_get_boot_ns(); the latter is the
purpose of the new get_kvmclock_ns().

This also allows KVM to provide a Hyper-V time reference counter that
is synchronized with the time that is computed from the TSC page.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/entry/vdso/vclock_gettime.c |  2 +-
 arch/x86/include/asm/pvclock.h       |  5 ++--
 arch/x86/kernel/pvclock.c            |  2 +-
 arch/x86/kvm/hyperv.c                |  2 +-
 arch/x86/kvm/x86.c                   | 48 +++++++++++++++++++++++++++---------
 arch/x86/kvm/x86.h                   |  6 +----
 6 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 94d54d0defa7..02223cb4bcfd 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode)
 			return 0;
 		}
 
-		ret = __pvclock_read_cycles(pvti);
+		ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
 	} while (pvclock_read_retry(pvti, version));
 
 	/* refer to vread_tsc() comment for rationale */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d019f0cc80ec..3ad741b84072 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 }
 
 static __always_inline
-cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+			      u64 tsc)
 {
-	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
+	u64 delta = tsc - src->tsc_timestamp;
 	cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
 					     src->tsc_shift);
 	return src->system_time + offset;
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 3599404e3089..5b2cc889ce34 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 
 	do {
 		version = pvclock_read_begin(src);
-		ret = __pvclock_read_cycles(src);
+		ret = __pvclock_read_cycles(src, rdtsc_ordered());
 		flags = src->flags;
 	} while (pvclock_read_retry(src, version));
 
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 01bd7b7a6866..ed5b77f39ffb 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -386,7 +386,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
 
 static u64 get_time_ref_counter(struct kvm *kvm)
 {
-	return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+	return div_u64(get_kvmclock_ns(kvm), 100);
 }
 
 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00e569c3ca71..81e9945cdf28 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1431,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_compute_tsc_offset(vcpu, data);
-	ns = get_kernel_ns();
+	ns = ktime_get_boot_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	if (vcpu->arch.virtual_tsc_khz) {
@@ -1722,6 +1722,34 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
+static u64 __get_kvmclock_ns(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+	struct kvm_arch *ka = &kvm->arch;
+	s64 ns;
+
+	if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+		u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+		ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+	} else {
+		ns = ktime_get_boot_ns() + ka->kvmclock_offset;
+	}
+
+	return ns;
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+	unsigned long flags;
+	s64 ns;
+
+	local_irq_save(flags);
+	ns = __get_kvmclock_ns(kvm);
+	local_irq_restore(flags);
+
+	return ns;
+}
+
 static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
 {
 	struct kvm_vcpu_arch *vcpu = &v->arch;
@@ -1811,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 	if (!use_master_clock) {
 		host_tsc = rdtsc();
-		kernel_ns = get_kernel_ns();
+		kernel_ns = ktime_get_boot_ns();
 	}
 
 	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -4054,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_SET_CLOCK: {
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
-		s64 delta;
 
 		r = -EFAULT;
 		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
@@ -4066,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		r = 0;
 		local_irq_disable();
-		now_ns = get_kernel_ns();
-		delta = user_ns.clock - now_ns;
+		now_ns = __get_kvmclock_ns(kvm);
+		kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
 		local_irq_enable();
-		kvm->arch.kvmclock_offset = delta;
 		kvm_gen_update_masterclock(kvm);
 		break;
 	}
@@ -4077,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 
-		local_irq_disable();
-		now_ns = get_kernel_ns();
-		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
-		local_irq_enable();
+		now_ns = get_kvmclock_ns(kvm);
+		user_ns.clock = now_ns;
 		user_ns.flags = 0;
 		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
 
@@ -7544,7 +7568,7 @@ int kvm_arch_hardware_enable(void)
 	 * before any KVM threads can be running.  Unfortunately, we can't
 	 * bring the TSCs fully up to date with real time, as we aren't yet far
 	 * enough into CPU bringup that we know how much real time has actually
-	 * elapsed; our helper function, get_kernel_ns() will be using boot
+	 * elapsed; our helper function, ktime_get_boot_ns() will be using boot
 	 * variables that haven't been updated yet.
 	 *
 	 * So we simply find the maximum observed TSC above, then record the
@@ -7779,7 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	mutex_init(&kvm->arch.apic_map_lock);
 	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
-	kvm->arch.kvmclock_offset = -get_kernel_ns();
+	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
 	pvclock_update_vm_gtod_copy(kvm);
 
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a82ca466b62e..e8ff3e4ce38a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
 	return kvm_register_write(vcpu, reg, val);
 }
 
-static inline u64 get_kernel_ns(void)
-{
-	return ktime_get_boot_ns();
-}
-
 static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 {
 	return !(kvm->arch.disabled_quirks & quirk);
@@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread
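
With get_kvmclock_ns() backing the clock ioctls, KVM_GET_CLOCK and
KVM_SET_CLOCK give userspace an exact save/restore of the guest clock
across migration.  A minimal sketch of the pattern, assuming open VM
file descriptors and eliding error handling (the function name is
illustrative):

#include <linux/kvm.h>
#include <sys/ioctl.h>

static void migrate_guest_clock(int src_vm_fd, int dst_vm_fd)
{
	struct kvm_clock_data data = { 0 };

	/* On the source, data.clock is filled from get_kvmclock_ns(). */
	ioctl(src_vm_fd, KVM_GET_CLOCK, &data);

	/*
	 * On the destination, KVM folds the difference between
	 * data.clock and the current kvmclock into kvmclock_offset.
	 */
	ioctl(dst_vm_fd, KVM_SET_CLOCK, &data);
}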

* [PATCH 4/4] KVM: x86: Hyper-V tsc page setup
  2016-09-19 11:39 [PATCH v2 0/4] KVM: x86: kvmclock cleanups for Hyper-V TSC page Paolo Bonzini
                   ` (2 preceding siblings ...)
  2016-09-19 11:39 ` [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns Paolo Bonzini
@ 2016-09-19 11:39 ` Paolo Bonzini
  2016-09-19 16:38   ` Roman Kagan
  3 siblings, 1 reply; 10+ messages in thread
From: Paolo Bonzini @ 2016-09-19 11:39 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: rkrcmar, den, rkagan, peterhornyack, Andrey Smetanin

Until now the Hyper-V TSC page was implemented, but it was filled with
empty values.  This patch sets up the TSC page scale and offset based
on the vCPU TSC, tsc_khz, and the HV_X64_MSR_TIME_REF_COUNT value.

A valid TSC page drops the count of HV_X64_MSR_TIME_REF_COUNT MSR
reads to zero, which potentially improves performance.

Signed-off-by: Andrey Smetanin <asmetanin@virtuozzo.com>
Reviewed-by: Peter Hornyack <peterhornyack@google.com>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Radim Krčmář <rkrcmar@redhat.com>
CC: Roman Kagan <rkagan@virtuozzo.com>
CC: Denis V. Lunev <den@openvz.org>
[Computation of TSC page parameters rewritten to use the Linux timekeeper
 parameters. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/kvm/hyperv.c           | 162 ++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/hyperv.h           |   3 +
 arch/x86/kvm/x86.c              |   8 +-
 4 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 32a43a25d415..4b20f7304b9c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -702,6 +702,8 @@ struct kvm_hv {
 	/* Hyper-v based guest crash (NT kernel bugcheck) parameters */
 	u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
 	u64 hv_crash_ctl;
+
+	HV_REFERENCE_TSC_PAGE tsc_ref;
 };
 
 struct kvm_arch {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index ed5b77f39ffb..555951625350 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
 
 static u64 get_time_ref_counter(struct kvm *kvm)
 {
-	return div_u64(get_kvmclock_ns(kvm), 100);
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	struct kvm_vcpu *vcpu;
+	u64 tsc;
+
+	/*
+	 * If the guest has not set up the TSC page, or the clock isn't
+	 * stable, fall back to get_kvmclock_ns.
+	 */
+	if (!hv->tsc_ref.tsc_sequence)
+		return div_u64(get_kvmclock_ns(kvm), 100);
+
+	vcpu = kvm_get_vcpu(kvm, 0);
+	tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+	return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+		+ hv->tsc_ref.tsc_offset;
 }
 
 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -756,6 +774,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ *    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *
+ * Hyper-V formula:
+ *    nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ *    ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale / 2^64 =         tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale        =         tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ *    nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *    nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ *    nsec/100 = ticks * scale / 2^64
+ *               - tsc_timestamp * scale / 2^64
+ *               + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ *    offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+					HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+	u64 max_mul;
+
+	if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+		return false;
+
+	/*
+	 * Check whether scale would overflow; if so, we use the time ref counter:
+	 *    tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+	 *    tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+	 *    tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+	 */
+	max_mul = 100ull << (32 - hv_clock->tsc_shift);
+	if (hv_clock->tsc_to_system_mul >= max_mul)
+		return false;
+
+	/*
+	 * Otherwise compute the scale and offset according to the formulas
+	 * derived above.
+	 */
+	tsc_ref->tsc_scale =
+		mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+				hv_clock->tsc_to_system_mul,
+				100);
+
+	tsc_ref->tsc_offset = hv_clock->system_time;
+	do_div(tsc_ref->tsc_offset, 100);
+	tsc_ref->tsc_offset -=
+		mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+	return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock)
+{
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	u32 tsc_seq;
+	u64 gfn;
+
+	BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+	BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+	if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+		return;
+
+	gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+	/*
+	 * Because the TSC parameters only vary when there is a
+	 * change in the master clock, do not bother with caching.
+	 */
+	if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+				    &tsc_seq, sizeof(tsc_seq))))
+		return;
+
+	/*
+	 * While we're computing and writing the parameters, force the
+	 * guest to use the time reference count MSR.
+	 */
+	hv->tsc_ref.tsc_sequence = 0;
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			    &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+		return;
+
+	if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+		return;
+
+	/* Ensure sequence is zero before writing the rest of the struct.  */
+	smp_wmb();
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+		return;
+
+	/*
+	 * Now switch to the TSC page mechanism by writing the sequence.
+	 */
+	tsc_seq++;
+	if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+		tsc_seq = 1;
+
+	/* Write the struct entirely before the non-zero sequence.  */
+	smp_wmb();
+
+	hv->tsc_ref.tsc_sequence = tsc_seq;
+	kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+}
+
 static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 			     bool host)
 {
@@ -793,23 +935,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 		mark_page_dirty(kvm, gfn);
 		break;
 	}
-	case HV_X64_MSR_REFERENCE_TSC: {
-		u64 gfn;
-		HV_REFERENCE_TSC_PAGE tsc_ref;
-
-		memset(&tsc_ref, 0, sizeof(tsc_ref));
+	case HV_X64_MSR_REFERENCE_TSC:
 		hv->hv_tsc_page = data;
-		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-			break;
-		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-		if (kvm_write_guest(
-				kvm,
-				gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-				&tsc_ref, sizeof(tsc_ref)))
-			return 1;
-		mark_page_dirty(kvm, gfn);
+		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 		break;
-	}
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 		return kvm_hv_msr_set_crash_data(vcpu,
 						 msr - HV_X64_MSR_CRASH_P0,
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60eccd4bd1d3..cd1119538add 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
 
 void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
 
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock);
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 81e9945cdf28..3ee8a91a78c3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1887,10 +1887,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	vcpu->hv_clock.flags = pvclock_flags;
 
-	if (!vcpu->pv_time_enabled)
-		return 0;
-
-	kvm_setup_pvclock_page(v);
+	if (vcpu->pv_time_enabled)
+		kvm_setup_pvclock_page(v);
+	if (v == kvm_get_vcpu(v->kvm, 0))
+		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
 	return 0;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread
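
To make the conversion concrete, take a hypothetical guest with a
1 GHz TSC, for which one consistent kvmclock parameterization is
tsc_shift = 1 and tsc_to_system_mul = 2^31 (one nanosecond per tick,
since 2^31 * 2^(1-32) = 1).  The overflow check passes, because
2^31 < max_mul = 100 * 2^(32-1), and the scale formula gives:

	scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100
	      = 2^31 * 2^33 / 100
	      = 2^64 / 100

so nsec/100 = ticks * scale / 2^64 = ticks / 100: one 100 ns unit per
100 ticks of a 1 GHz TSC, as expected.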

* Re: [PATCH 1/4] KVM: x86: always fill in vcpu->arch.hv_clock
  2016-09-19 11:39 ` [PATCH 1/4] KVM: x86: always fill in vcpu->arch.hv_clock Paolo Bonzini
@ 2016-09-19 16:33   ` Roman Kagan
  0 siblings, 0 replies; 10+ messages in thread
From: Roman Kagan @ 2016-09-19 16:33 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, rkrcmar, den, peterhornyack

On Mon, Sep 19, 2016 at 01:39:10PM +0200, Paolo Bonzini wrote:
> We will use it in the next patches for KVM_GET_CLOCK and as a basis for the
> contents of the Hyper-V TSC page.  Get the values from the Linux
> timekeeper even if kvmclock is not enabled.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/x86.c | 109 +++++++++++++++++++++++++++++------------------------
>  1 file changed, 59 insertions(+), 50 deletions(-)

Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns
  2016-09-19 11:39 ` [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns Paolo Bonzini
@ 2016-09-19 16:33   ` Roman Kagan
  0 siblings, 0 replies; 10+ messages in thread
From: Roman Kagan @ 2016-09-19 16:33 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, rkrcmar, den, peterhornyack

On Mon, Sep 19, 2016 at 01:39:12PM +0200, Paolo Bonzini wrote:
> Introduce a function that reads the exact nanoseconds value that is
> provided to the guest in kvmclock.  This crystallizes the notion of
> kvmclock as a thin veneer over a stable TSC, that the guest will
> (hopefully) convert with NTP.  In other words, kvmclock is *not* a
> paravirtualized host-to-guest NTP.
> 
> Drop the get_kernel_ns() function, which was used both to get the base
> value of the master clock and to get the current value of kvmclock.
> The former use is replaced by ktime_get_boot_ns(); the latter is the
> purpose of the new get_kvmclock_ns().
> 
> This also allows KVM to provide a Hyper-V time reference counter that
> is synchronized with the time that is computed from the TSC page.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/entry/vdso/vclock_gettime.c |  2 +-
>  arch/x86/include/asm/pvclock.h       |  5 ++--
>  arch/x86/kernel/pvclock.c            |  2 +-
>  arch/x86/kvm/hyperv.c                |  2 +-
>  arch/x86/kvm/x86.c                   | 48 +++++++++++++++++++++++++++---------
>  arch/x86/kvm/x86.h                   |  6 +----
>  6 files changed, 43 insertions(+), 22 deletions(-)

Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/4] KVM: x86: Hyper-V tsc page setup
  2016-09-19 11:39 ` [PATCH 4/4] KVM: x86: Hyper-V tsc page setup Paolo Bonzini
@ 2016-09-19 16:38   ` Roman Kagan
  0 siblings, 0 replies; 10+ messages in thread
From: Roman Kagan @ 2016-09-19 16:38 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: linux-kernel, kvm, rkrcmar, den, peterhornyack, Andrey Smetanin

On Mon, Sep 19, 2016 at 01:39:13PM +0200, Paolo Bonzini wrote:
> Until now the Hyper-V TSC page was implemented, but it was filled with
> empty values.  This patch sets up the TSC page scale and offset based
> on the vCPU TSC, tsc_khz, and the HV_X64_MSR_TIME_REF_COUNT value.
> 
> A valid TSC page drops the count of HV_X64_MSR_TIME_REF_COUNT MSR
> reads to zero, which potentially improves performance.
> 
> Signed-off-by: Andrey Smetanin <asmetanin@virtuozzo.com>
> Reviewed-by: Peter Hornyack <peterhornyack@google.com>
> CC: Paolo Bonzini <pbonzini@redhat.com>
> CC: Radim Krčmář <rkrcmar@redhat.com>
> CC: Roman Kagan <rkagan@virtuozzo.com>
> CC: Denis V. Lunev <den@openvz.org>
> [Computation of TSC page parameters rewritten to use the Linux timekeeper
>  parameters. - Paolo]
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   2 +
>  arch/x86/kvm/hyperv.c           | 162 ++++++++++++++++++++++++++++++++++++----
>  arch/x86/kvm/hyperv.h           |   3 +
>  arch/x86/kvm/x86.c              |   8 +-
>  4 files changed, 155 insertions(+), 20 deletions(-)

Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/4] KVM: x86: Hyper-V tsc page setup
  2016-09-01 15:26 ` [PATCH 4/4] KVM: x86: Hyper-V tsc page setup Paolo Bonzini
@ 2016-09-02 14:17   ` Roman Kagan
  0 siblings, 0 replies; 10+ messages in thread
From: Roman Kagan @ 2016-09-02 14:17 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: linux-kernel, kvm, peterhornyack, rkrcmar, den, Andrey Smetanin

On Thu, Sep 01, 2016 at 05:26:15PM +0200, Paolo Bonzini wrote:
> Until now the Hyper-V TSC page was implemented, but it was filled with
> empty values.  This patch sets up the TSC page scale and offset based
> on the vCPU TSC, tsc_khz, and the HV_X64_MSR_TIME_REF_COUNT value.
> 
> A valid TSC page drops the count of HV_X64_MSR_TIME_REF_COUNT MSR
> reads to zero, which potentially improves performance.
> 
> Signed-off-by: Andrey Smetanin <asmetanin@virtuozzo.com>
> Reviewed-by: Peter Hornyack <peterhornyack@google.com>
> CC: Paolo Bonzini <pbonzini@redhat.com>
> CC: Radim Krčmář <rkrcmar@redhat.com>
> CC: Roman Kagan <rkagan@virtuozzo.com>
> CC: Denis V. Lunev <den@openvz.org>
> [Computation of TSC page parameters rewritten to use the Linux timekeeper
>  parameters. - Paolo]
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/hyperv.c | 141 ++++++++++++++++++++++++++++++++++++++++++++------
>  arch/x86/kvm/hyperv.h |   3 ++
>  arch/x86/kvm/x86.c    |   8 +--
>  3 files changed, 133 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index ed5b77f39ffb..e089d1f52dc0 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -756,6 +756,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
>  	return 0;
>  }
>  
> +/*
> + * The kvmclock and Hyper-V TSC page use similar formulas.  Because the KVM
> + * one is more precise, it is a little more complex.  However, converting

I'm not sure you're right regarding which one is more precise :)
Hyper-V uses a right shift of 64 which is higher precision than typical
kvmclock shift of around 22.

> + * between them is possible:
> + *
> + * kvmclock formula:
> + *    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
> + *           + system_time
> + *
> + * Hyper-V formula:
> + *    nsec/100 = ticks * scale / 2^64 + offset
> + *
> + * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
> + * By dividing the kvmclock formula by 100 and equating what's left we get:
> + *    ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + *            scale / 2^64 =         tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + *            scale        =         tsc_to_system_mul * 2^(32+tsc_shift) / 100
> + *
> + * Now expand the kvmclock formula and divide by 100:
> + *    nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
> + *           - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
> + *           + system_time
> + *    nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + *               - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + *               + system_time / 100
> + *
> + * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
> + *    nsec/100 = ticks * scale / 2^64
> + *               - tsc_timestamp * scale / 2^64
> + *               + system_time / 100
> + *
> + * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
> + *    offset = system_time / 100 - tsc_timestamp * scale / 2^64
> + *
> + * These two equivalencies are implemented in this function.
> + */
> +static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
> +					HV_REFERENCE_TSC_PAGE *tsc_ref)
> +{
> +	u64 max_mul;
> +
> +	if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
> +		return false;
> +
> +	/*
> +	 * Check whether scale would overflow; if so, we use the time ref counter:
> +	 *    tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
> +	 *    tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
> +	 *    tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
> +	 */
> +	max_mul = 100ull << (32 - hv_clock->tsc_shift);
> +	if (hv_clock->tsc_to_system_mul >= max_mul)
> +		return false;
> +
> +	/*
> +	 * Otherwise compute the scale and offset according to the formulas
> +	 * derived above.
> +	 */
> +	tsc_ref->tsc_scale =
> +		mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
> +				hv_clock->tsc_to_system_mul,
> +				100);
> +
> +	tsc_ref->tsc_offset = hv_clock->system_time;
> +	do_div(tsc_ref->tsc_offset, 100);
> +	tsc_ref->tsc_offset -=
> +		mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
> +	return true;

Although this is correct, we may want to make it slightly easier to
follow: note that the hv_clock contents is essentially populated using
vcpu->hw_tsc_khz and vcpu->last_guest_tsc, so at this point we can just
directly calculate ->tsc_scale and ->tsc_offset from them.  If we also
stash them somewhere on vcpu we can make the reference counter use
exactly the same procedure as the guest would with tsc page, and
guarantee against precision errors.

Dunno if that really matters much, though.

> +}
> +
> +void kvm_hv_setup_tsc_page(struct kvm *kvm,
> +			   struct pvclock_vcpu_time_info *hv_clock)
> +{
> +	struct kvm_hv *hv = &kvm->arch.hyperv;
> +	HV_REFERENCE_TSC_PAGE tsc_ref = { 0 };
> +	u32 tsc_seq;
> +	u64 gfn;
> +
> +	BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(tsc_ref.tsc_sequence));
> +	BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
> +
> +	if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
> +		return;
> +
> +	gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
> +	/*
> +	 * Because the TSC parameters only vary when there is a
> +	 * change in the master clock, do not bother with caching.
> +	 */
> +	if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
> +				    &tsc_seq, sizeof(tsc_seq))))
> +		return;
> +
> +	/*
> +	 * While we're computing and writing the parameters, force the
> +	 * guest to use the time reference count MSR.
> +	 */
> +	if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
> +			    &tsc_ref, sizeof(tsc_ref.tsc_sequence)))
> +		return;
> +
> +	if (!compute_tsc_page_parameters(hv_clock, &tsc_ref))
> +		return;
> +
> +	/* Ensure sequence is zero before writing the rest of the struct.  */
> +	smp_wmb();
> +	if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &tsc_ref, sizeof(tsc_ref)))
> +		return;
> +
> +	/*
> +	 * Now switch to the TSC page mechanism by writing the sequence.
> +	 */
> +	if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
> +		tsc_seq = 1;
> +	else
> +		tsc_seq++;

If tsc_seq == 0xFFFFFFFE you end up with an invalid tsc_seq.
You rather need to do the increment unconditionally first, and compare
the result to the invalid values after that.
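
Concretely: in the v1 code, tsc_seq == 0xFFFFFFFE matches neither
0xFFFFFFFF nor 0, so tsc_seq++ yields 0xFFFFFFFF, a value guests treat
as an invalid sequence.  Incrementing first and only then remapping
0xFFFFFFFF and 0 to 1 avoids this, which is what v2 of the patch does.
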

> +
> +	/* Write the struct entirely before the non-zero sequence.  */
> +	smp_wmb();
> +
> +	kvm_write_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq));
> +}
> +
>  static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
>  			     bool host)
>  {
> @@ -793,23 +916,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
>  		mark_page_dirty(kvm, gfn);
>  		break;
>  	}
> -	case HV_X64_MSR_REFERENCE_TSC: {
> -		u64 gfn;
> -		HV_REFERENCE_TSC_PAGE tsc_ref;
> -
> -		memset(&tsc_ref, 0, sizeof(tsc_ref));
> +	case HV_X64_MSR_REFERENCE_TSC:
>  		hv->hv_tsc_page = data;
> -		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
> -			break;
> -		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
> -		if (kvm_write_guest(
> -				kvm,
> -				gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
> -				&tsc_ref, sizeof(tsc_ref)))
> -			return 1;
> -		mark_page_dirty(kvm, gfn);
> +		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
> +			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
>  		break;
> -	}
>  	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
>  		return kvm_hv_msr_set_crash_data(vcpu,
>  						 msr - HV_X64_MSR_CRASH_P0,
> diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
> index 60eccd4bd1d3..cd1119538add 100644
> --- a/arch/x86/kvm/hyperv.h
> +++ b/arch/x86/kvm/hyperv.h
> @@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
>  
>  void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
>  
> +void kvm_hv_setup_tsc_page(struct kvm *kvm,
> +			   struct pvclock_vcpu_time_info *hv_clock);
> +
>  #endif
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 2edcfa054cbe..2e792bf80912 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1881,10 +1881,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>  
>  	vcpu->hv_clock.flags = pvclock_flags;
>  
> -	if (!vcpu->pv_time_enabled)
> -		return 0;
> -
> -	kvm_setup_pvclock_page(v);
> +	if (vcpu->pv_time_enabled)
> +		kvm_setup_pvclock_page(v);
> +	if (v == kvm_get_vcpu(v->kvm, 0))
> +		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
>  	return 0;
>  }
>  
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread
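
For reference, the sequence protocol under discussion protects a
guest-side reader along these lines (a sketch; the field names follow
the HV_REFERENCE_TSC_PAGE definition used above, the helper name is
illustrative, and a sequence of 0, or 0xFFFFFFFF for older guests,
means the page is invalid and the guest must fall back to the
HV_X64_MSR_TIME_REF_COUNT MSR):

static bool hv_read_tsc_page(const volatile HV_REFERENCE_TSC_PAGE *tsc_pg,
			     u64 *time_100ns)
{
	u32 sequence;
	u64 scale, offset;

	do {
		sequence = tsc_pg->tsc_sequence;
		if (sequence == 0 || sequence == 0xFFFFFFFF)
			return false;	/* fall back to the MSR */

		rmb();			/* sequence before payload */
		scale = tsc_pg->tsc_scale;
		offset = tsc_pg->tsc_offset;
		rmb();			/* payload before re-check */
	} while (tsc_pg->tsc_sequence != sequence);

	*time_100ns = mul_u64_u64_shr(rdtsc_ordered(), scale, 64) + offset;
	return true;
}

This mirrors the host-side computation in get_time_ref_counter():
mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64) + hv->tsc_ref.tsc_offset.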

* [PATCH 4/4] KVM: x86: Hyper-V tsc page setup
  2016-09-01 15:26 [PATCH 0/4] KVM: x86: kvmclock cleanups for Hyper-V TSC page Paolo Bonzini
@ 2016-09-01 15:26 ` Paolo Bonzini
  2016-09-02 14:17   ` Roman Kagan
  0 siblings, 1 reply; 10+ messages in thread
From: Paolo Bonzini @ 2016-09-01 15:26 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: peterhornyack, rkrcmar, den, rkagan, Andrey Smetanin

Until now the Hyper-V TSC page was implemented, but it was filled with
empty values.  This patch sets up the TSC page scale and offset based
on the vCPU TSC, tsc_khz, and the HV_X64_MSR_TIME_REF_COUNT value.

A valid TSC page drops the count of HV_X64_MSR_TIME_REF_COUNT MSR
reads to zero, which potentially improves performance.

Signed-off-by: Andrey Smetanin <asmetanin@virtuozzo.com>
Reviewed-by: Peter Hornyack <peterhornyack@google.com>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Radim Krčmář <rkrcmar@redhat.com>
CC: Roman Kagan <rkagan@virtuozzo.com>
CC: Denis V. Lunev <den@openvz.org>
[Computation of TSC page parameters rewritten to use the Linux timekeeper
 parameters. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/hyperv.c | 141 ++++++++++++++++++++++++++++++++++++++++++++------
 arch/x86/kvm/hyperv.h |   3 ++
 arch/x86/kvm/x86.c    |   8 +--
 3 files changed, 133 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index ed5b77f39ffb..e089d1f52dc0 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -756,6 +756,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas.  Because the KVM
+ * one is more precise, it is a little more complex.  However, converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ *    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *
+ * Hyper-V formula:
+ *    nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ *    ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale / 2^64 =         tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale        =         tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ *    nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *    nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ *    nsec/100 = ticks * scale / 2^64
+ *               - tsc_timestamp * scale / 2^64
+ *               + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ *    offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+					HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+	u64 max_mul;
+
+	if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+		return false;
+
+	/*
+	 * Check whether scale would overflow; if so, we use the time ref counter:
+	 *    tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+	 *    tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+	 *    tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+	 */
+	max_mul = 100ull << (32 - hv_clock->tsc_shift);
+	if (hv_clock->tsc_to_system_mul >= max_mul)
+		return false;
+
+	/*
+	 * Otherwise compute the scale and offset according to the formulas
+	 * derived above.
+	 */
+	tsc_ref->tsc_scale =
+		mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+				hv_clock->tsc_to_system_mul,
+				100);
+
+	tsc_ref->tsc_offset = hv_clock->system_time;
+	do_div(tsc_ref->tsc_offset, 100);
+	tsc_ref->tsc_offset -=
+		mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+	return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock)
+{
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	HV_REFERENCE_TSC_PAGE tsc_ref = { 0 };
+	u32 tsc_seq;
+	u64 gfn;
+
+	BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(tsc_ref.tsc_sequence));
+	BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+	if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+		return;
+
+	gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+	/*
+	 * Because the TSC parameters only vary when there is a
+	 * change in the master clock, do not bother with caching.
+	 */
+	if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+				    &tsc_seq, sizeof(tsc_seq))))
+		return;
+
+	/*
+	 * While we're computing and writing the parameters, force the
+	 * guest to use the time reference count MSR.
+	 */
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			    &tsc_ref, sizeof(tsc_ref.tsc_sequence)))
+		return;
+
+	if (!compute_tsc_page_parameters(hv_clock, &tsc_ref))
+		return;
+
+	/* Ensure sequence is zero before writing the rest of the struct.  */
+	smp_wmb();
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &tsc_ref, sizeof(tsc_ref)))
+		return;
+
+	/*
+	 * Now switch to the TSC page mechanism by writing the sequence.
+	 */
+	if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+		tsc_seq = 1;
+	else
+		tsc_seq++;
+
+	/* Write the struct entirely before the non-zero sequence.  */
+	smp_wmb();
+
+	kvm_write_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq));
+}
+
 static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 			     bool host)
 {
@@ -793,23 +916,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 		mark_page_dirty(kvm, gfn);
 		break;
 	}
-	case HV_X64_MSR_REFERENCE_TSC: {
-		u64 gfn;
-		HV_REFERENCE_TSC_PAGE tsc_ref;
-
-		memset(&tsc_ref, 0, sizeof(tsc_ref));
+	case HV_X64_MSR_REFERENCE_TSC:
 		hv->hv_tsc_page = data;
-		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-			break;
-		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-		if (kvm_write_guest(
-				kvm,
-				gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-				&tsc_ref, sizeof(tsc_ref)))
-			return 1;
-		mark_page_dirty(kvm, gfn);
+		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 		break;
-	}
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 		return kvm_hv_msr_set_crash_data(vcpu,
 						 msr - HV_X64_MSR_CRASH_P0,
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60eccd4bd1d3..cd1119538add 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
 
 void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
 
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock);
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2edcfa054cbe..2e792bf80912 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1881,10 +1881,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	vcpu->hv_clock.flags = pvclock_flags;
 
-	if (!vcpu->pv_time_enabled)
-		return 0;
-
-	kvm_setup_pvclock_page(v);
+	if (vcpu->pv_time_enabled)
+		kvm_setup_pvclock_page(v);
+	if (v == kvm_get_vcpu(v->kvm, 0))
+		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
 	return 0;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread

Thread overview: 10+ messages
2016-09-19 11:39 [PATCH v2 0/4] KVM: x86: kvmclock cleanups for Hyper-V TSC page Paolo Bonzini
2016-09-19 11:39 ` [PATCH 1/4] KVM: x86: always fill in vcpu->arch.hv_clock Paolo Bonzini
2016-09-19 16:33   ` Roman Kagan
2016-09-19 11:39 ` [PATCH 2/4] KVM: x86: initialize kvmclock_offset Paolo Bonzini
2016-09-19 11:39 ` [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns Paolo Bonzini
2016-09-19 16:33   ` Roman Kagan
2016-09-19 11:39 ` [PATCH 4/4] KVM: x86: Hyper-V tsc page setup Paolo Bonzini
2016-09-19 16:38   ` Roman Kagan
  -- strict thread matches above, loose matches on Subject: below --
2016-09-01 15:26 [PATCH 0/4] KVM: x86: kvmclock cleanups for Hyper-V TSC page Paolo Bonzini
2016-09-01 15:26 ` [PATCH 4/4] KVM: x86: Hyper-V tsc page setup Paolo Bonzini
2016-09-02 14:17   ` Roman Kagan
