* [PATCH v2 0/2] if running under KVM, use kvmclock to compute TSC deadline value
@ 2016-09-06 22:29 Paolo Bonzini
  2016-09-06 22:29 ` [PATCH 1/2] x86: paravirt: add local_apic_timer_interrupt to pv_ops Paolo Bonzini
  2016-09-06 22:29 ` [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value Paolo Bonzini
  0 siblings, 2 replies; 16+ messages in thread

From: Paolo Bonzini @ 2016-09-06 22:29 UTC (permalink / raw)
To: linux-kernel, kvm; +Cc: rkrcmar, dmatlack, luto, peterhornyack, x86

v1 posted here: https://patchwork.kernel.org/patch/9214993/

The motivation for this patch is in patch 2 (or you can read it from v1).

This version is larger but the hooks into apic.c are cleaner than in v1.
Instead of arranging for kvmclock to replace only a small part of
setup_apic_timer, it registers its own clockevent.  The downside is that
kvmclock now needs to hook into the LAPIC timer interrupt to invoke the
event_handler of the new clockevent, but this is pretty straightforward
with a new pvop (assuming the introduction of new pvops is
straightforward at all).

Thanks,

Paolo

Paolo Bonzini (2):
  x86: paravirt: add local_apic_timer_interrupt to pv_ops
  x86, kvm: use kvmclock to compute TSC deadline value

 arch/x86/include/asm/apic.h           |   2 +
 arch/x86/include/asm/paravirt.h       |   5 ++
 arch/x86/include/asm/paravirt_types.h |   1 +
 arch/x86/kernel/apic/apic.c           |   4 +-
 arch/x86/kernel/kvmclock.c            | 156 ++++++++++++++++++++++++++++++++++
 arch/x86/kernel/paravirt.c            |   1 +
 6 files changed, 167 insertions(+), 2 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 16+ messages in thread
* [PATCH 1/2] x86: paravirt: add local_apic_timer_interrupt to pv_ops
  2016-09-06 22:29 [PATCH v2 0/2] if running under KVM, use kvmclock to compute TSC deadline value Paolo Bonzini
@ 2016-09-06 22:29 ` Paolo Bonzini
  2016-09-07  6:25   ` kbuild test robot
  2016-09-07  6:33   ` kbuild test robot
  2016-09-06 22:29 ` [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value Paolo Bonzini
  1 sibling, 2 replies; 16+ messages in thread

From: Paolo Bonzini @ 2016-09-06 22:29 UTC (permalink / raw)
To: linux-kernel, kvm; +Cc: rkrcmar, dmatlack, luto, peterhornyack, x86

local_apic_timer_interrupt refers to a static clockevent in apic.c.
Paravirtualized implementations need to check a different clockevent,
so allow customizing the guts of the APIC timer interrupt.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/apic.h           | 1 +
 arch/x86/include/asm/paravirt.h       | 5 +++++
 arch/x86/include/asm/paravirt_types.h | 1 +
 arch/x86/kernel/apic/apic.c           | 2 +-
 arch/x86/kernel/paravirt.c            | 1 +
 5 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 124357773ffa..f6e0bad1cde2 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -109,6 +109,7 @@ extern void native_apic_wait_icr_idle(void);
 extern u32 native_safe_apic_wait_icr_idle(void);
 extern void native_apic_icr_write(u32 low, u32 id);
 extern u64 native_apic_icr_read(void);
+extern void native_local_apic_timer_interrupt(void);
 
 static inline bool apic_is_x2apic_enabled(void)
 {
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 2970d22d7766..71ec3a73745f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -197,6 +197,11 @@ static inline u64 paravirt_steal_clock(int cpu)
 	return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
 }
 
+static inline void local_apic_timer_interrupt(void)
+{
+	PVOP_VCALL0(pv_time_ops.local_apic_timer_interrupt);
+}
+
 static inline unsigned long long paravirt_read_pmc(int counter)
 {
 	return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7fa9e7740ba3..ea1f27400098 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -96,6 +96,7 @@ struct pv_lazy_ops {
 struct pv_time_ops {
 	unsigned long long (*sched_clock)(void);
 	unsigned long long (*steal_clock)(int cpu);
+	void (*local_apic_timer_interrupt)(void);
 };
 
 struct pv_cpu_ops {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index cea4fc19e844..5b63bec7d0af 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -902,7 +902,7 @@ void setup_secondary_APIC_clock(void)
 /*
  * The guts of the apic timer interrupt
  */
-static void local_apic_timer_interrupt(void)
+void native_local_apic_timer_interrupt(void)
 {
 	int cpu = smp_processor_id();
 	struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ad5bc9578a73..0e056271d714 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -310,6 +310,7 @@ struct pv_init_ops pv_init_ops = {
 struct pv_time_ops pv_time_ops = {
 	.sched_clock = native_sched_clock,
 	.steal_clock = native_steal_clock,
+	.local_apic_timer_interrupt = native_local_apic_timer_interrupt,
 };
 
 __visible struct pv_irq_ops pv_irq_ops = {
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 16+ messages in thread
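Patch 1 uses the standard paravirt pattern: a struct of function pointers statically initialized to the native implementation, with the call site dispatching through the struct so a guest can override the entry at boot. A minimal user-space model of that pattern (names and counters are mine, simplified from pv_time_ops; a sketch, not kernel code):

```c
#include <assert.h>

/* Counters standing in for the real handlers' work. */
static int native_count, kvm_count;

static void native_timer_interrupt(void) { native_count++; }
static void kvm_timer_interrupt(void)    { kvm_count++; }

/* Miniature pv_time_ops: defaults to the native handler, just as
 * paravirt.c initializes .local_apic_timer_interrupt in the patch. */
struct time_ops {
	void (*timer_interrupt)(void);
};

static struct time_ops time_ops = {
	.timer_interrupt = native_timer_interrupt,
};

/* The generic IRQ path calls through the ops struct; the kernel's
 * PVOP_VCALL0 macro amounts to this indirect call (plus patching). */
static void local_apic_timer_interrupt(void)
{
	time_ops.timer_interrupt();
}

/* Returns 1 if dispatch goes native by default and then follows the
 * override once a "guest" installs its own handler. */
static int run_demo(void)
{
	local_apic_timer_interrupt();            /* default: native */
	if (native_count != 1 || kvm_count != 0)
		return 0;

	time_ops.timer_interrupt = kvm_timer_interrupt;  /* boot-time override */
	local_apic_timer_interrupt();            /* now: kvm */
	return native_count == 1 && kvm_count == 1;
}
```

The real PVOP machinery additionally patches the call site at runtime to avoid the indirect call, but the override semantics are the same.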
* Re: [PATCH 1/2] x86: paravirt: add local_apic_timer_interrupt to pv_ops
  2016-09-06 22:29 ` [PATCH 1/2] x86: paravirt: add local_apic_timer_interrupt to pv_ops Paolo Bonzini
@ 2016-09-07  6:25   ` kbuild test robot
  2016-09-07  6:33   ` kbuild test robot
  1 sibling, 0 replies; 16+ messages in thread

From: kbuild test robot @ 2016-09-07 6:25 UTC (permalink / raw)
To: Paolo Bonzini
Cc: kbuild-all, linux-kernel, kvm, rkrcmar, dmatlack, luto, peterhornyack, x86

[-- Attachment #1: Type: text/plain, Size: 2595 bytes --]

Hi Paolo,

[auto build test ERROR on tip/x86/core]
[also build test ERROR on v4.8-rc5 next-20160906]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Paolo-Bonzini/if-running-under-KVM-use-kvmclock-to-compute-TSC-deadline-value/20160907-131553
config: i386-defconfig (attached as .config)
compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386

All errors (new ones prefixed by >>):

   arch/x86/kernel/apic/apic.c: In function 'smp_apic_timer_interrupt':
>> arch/x86/kernel/apic/apic.c:931:2: error: implicit declaration of function 'local_apic_timer_interrupt' [-Werror=implicit-function-declaration]
     local_apic_timer_interrupt();
     ^~~~~~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +/local_apic_timer_interrupt +931 arch/x86/kernel/apic/apic.c

eddc0e92 arch/x86/kernel/apic/apic.c Seiji Aguchi    2013-06-20  925   *
0e078e2f arch/x86/kernel/apic_64.c   Thomas Gleixner 2008-01-30  926   * update_process_times() expects us to have done irq_enter().
0e078e2f arch/x86/kernel/apic_64.c   Thomas Gleixner 2008-01-30  927   * Besides, if we don't timer interrupts ignore the global
0e078e2f arch/x86/kernel/apic_64.c   Thomas Gleixner 2008-01-30  928   * interrupt lock, which is the WrongThing (tm) to do.
0e078e2f arch/x86/kernel/apic_64.c   Thomas Gleixner 2008-01-30  929   */
eddc0e92 arch/x86/kernel/apic/apic.c Seiji Aguchi    2013-06-20  930  	entering_ack_irq();
0e078e2f arch/x86/kernel/apic_64.c   Thomas Gleixner 2008-01-30 @931  	local_apic_timer_interrupt();
eddc0e92 arch/x86/kernel/apic/apic.c Seiji Aguchi    2013-06-20  932  	exiting_irq();
274cfe59 arch/x86/kernel/apic_64.c   Cyrill Gorcunov 2008-08-16  933  
0e078e2f arch/x86/kernel/apic_64.c   Thomas Gleixner 2008-01-30  934  	set_irq_regs(old_regs);

:::::: The code at line 931 was first introduced by commit
:::::: 0e078e2f5060e06f9b3f32e55665ea835343440e x86: prepare merging arch/x86/kernel/apic_32/64.c

:::::: TO: Thomas Gleixner <tglx@linutronix.de>
:::::: CC: Ingo Molnar <mingo@elte.hu>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 24374 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread
* Re: [PATCH 1/2] x86: paravirt: add local_apic_timer_interrupt to pv_ops
  2016-09-06 22:29 ` [PATCH 1/2] x86: paravirt: add local_apic_timer_interrupt to pv_ops Paolo Bonzini
  2016-09-07  6:25   ` kbuild test robot
@ 2016-09-07  6:33   ` kbuild test robot
  1 sibling, 0 replies; 16+ messages in thread

From: kbuild test robot @ 2016-09-07 6:33 UTC (permalink / raw)
To: Paolo Bonzini
Cc: kbuild-all, linux-kernel, kvm, rkrcmar, dmatlack, luto, peterhornyack, x86

[-- Attachment #1: Type: text/plain, Size: 1625 bytes --]

Hi Paolo,

[auto build test ERROR on tip/x86/core]
[also build test ERROR on v4.8-rc5 next-20160906]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Paolo-Bonzini/if-running-under-KVM-use-kvmclock-to-compute-TSC-deadline-value/20160907-131553
config: i386-randconfig-s1-201636 (attached as .config)
compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386

All errors (new ones prefixed by >>):

>> arch/x86/kernel/paravirt.c:313:32: error: 'native_local_apic_timer_interrupt' undeclared here (not in a function)
     .local_apic_timer_interrupt = native_local_apic_timer_interrupt,
                                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vim +/native_local_apic_timer_interrupt +313 arch/x86/kernel/paravirt.c

   307		.patch = native_patch,
   308	};
   309	
   310	struct pv_time_ops pv_time_ops = {
   311		.sched_clock = native_sched_clock,
   312		.steal_clock = native_steal_clock,
 > 313		.local_apic_timer_interrupt = native_local_apic_timer_interrupt,
   314	};
   315	
   316	__visible struct pv_irq_ops pv_irq_ops = {

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 26872 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread
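Both robot failures are visibility problems rather than logic bugs: with the patch applied, apic.c's caller only sees a definition of local_apic_timer_interrupt() when CONFIG_PARAVIRT pulls in paravirt.h, and paravirt.c references native_local_apic_timer_interrupt without its prototype in scope. One conventional shape for such fixes is a non-paravirt fallback next to the prototype; the sketch below models that shape in user space (this is my guess at an eventual fix, not code from the thread):

```c
#include <assert.h>

static int native_calls;

/* Prototype that apic.h would export (patch 1 adds it there). */
void native_local_apic_timer_interrupt(void);

void native_local_apic_timer_interrupt(void)
{
	native_calls++;       /* stands in for the real timer handling */
}

#ifdef CONFIG_PARAVIRT
/* Real kernel: paravirt.h defines local_apic_timer_interrupt() as a
 * PVOP_VCALL0 through pv_time_ops (elided in this model). */
#else
/* Fallback so the apic.c call site still compiles without paravirt: */
static inline void local_apic_timer_interrupt(void)
{
	native_local_apic_timer_interrupt();
}
#endif

/* Models smp_apic_timer_interrupt(): the call resolves in both
 * configurations, which is exactly what the robot builds lacked. */
int model_timer_irq(void)
{
	local_apic_timer_interrupt();
	return native_calls;
}
```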
* [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value
  2016-09-06 22:29 [PATCH v2 0/2] if running under KVM, use kvmclock to compute TSC deadline value Paolo Bonzini
  2016-09-06 22:29 ` [PATCH 1/2] x86: paravirt: add local_apic_timer_interrupt to pv_ops Paolo Bonzini
@ 2016-09-06 22:29 ` Paolo Bonzini
  2016-09-08 22:13   ` David Matlack
  2016-09-15 15:09   ` Radim Krčmář
  1 sibling, 2 replies; 16+ messages in thread

From: Paolo Bonzini @ 2016-09-06 22:29 UTC (permalink / raw)
To: linux-kernel, kvm; +Cc: rkrcmar, dmatlack, luto, peterhornyack, x86

Bad things happen if a guest using the TSC deadline timer is migrated.
The guest doesn't re-calibrate the TSC after migration, and the
TSC frequency can and will change unless your processor supports TSC
scaling (on Intel this is only Skylake) or your data center is perfectly
homogeneous.

The solution in this patch is to skip tsc_khz, and instead derive the
frequency from kvmclock's (mult, shift) pair.  Because kvmclock
parameters convert from tsc to nanoseconds, this needs a division
but that's the least of our problems when the TSC_DEADLINE_MSR write
costs 2000 clock cycles.  Luckily tsc_khz is really used by very little
outside the tsc clocksource (which kvmclock replaces) and the TSC
deadline timer.  Because KVM's local APIC doesn't need quirks, we
provide a paravirt clockevent that still uses the deadline timer
under the hood (as suggested by Andy Lutomirski).

This patch does not handle the very first deadline, hoping that it
is masked by the migration downtime (i.e. that the timer fires late
anyway).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/apic.h |   1 +
 arch/x86/kernel/apic/apic.c |   2 +-
 arch/x86/kernel/kvmclock.c  | 156 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f6e0bad1cde2..c88b0dcfdf3a 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -53,6 +53,7 @@ extern unsigned int apic_verbosity;
 extern int local_apic_timer_c2_ok;
 
 extern int disable_apic;
+extern int disable_apic_timer;
 extern unsigned int lapic_timer_frequency;
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 5b63bec7d0af..d0c6d1e3d627 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -169,7 +169,7 @@ __setup("apicpmtimer", setup_apicpmtimer);
 unsigned long mp_lapic_addr;
 int disable_apic;
 /* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int disable_apic_timer __initdata;
+int disable_apic_timer __initdata;
 /* Local APIC timer works in C2 */
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1d39bfbd26bb..365fa6494dd3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -17,6 +17,7 @@
 */
 
 #include <linux/clocksource.h>
+#include <linux/clockchips.h>
 #include <linux/kvm_para.h>
 #include <asm/pvclock.h>
 #include <asm/msr.h>
@@ -245,6 +246,155 @@ static void kvm_shutdown(void)
 	native_machine_shutdown();
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * kvmclock-based clock event implementation, used only together with the
+ * TSC deadline timer.  A subset of the normal LAPIC clockevent, but it
+ * uses kvmclock to convert nanoseconds to TSC.  This is necessary to
+ * handle changes to the TSC frequency, e.g. from live migration.
+ */
+
+static void kvmclock_lapic_timer_setup(unsigned lvtt_value)
+{
+	lvtt_value |= LOCAL_TIMER_VECTOR | APIC_LVT_TIMER_TSCDEADLINE;
+	apic_write(APIC_LVTT, lvtt_value);
+}
+
+static int kvmclock_lapic_timer_set_oneshot(struct clock_event_device *evt)
+{
+	kvmclock_lapic_timer_setup(0);
+	printk_once(KERN_DEBUG "kvmclock: TSC deadline timer enabled\n");
+
+	/*
+	 * See Intel SDM: TSC-Deadline Mode chapter.  In xAPIC mode,
+	 * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
+	 * According to Intel, MFENCE can do the serialization here.
+	 */
+	asm volatile("mfence" : : : "memory");
+	return 0;
+}
+
+static int kvmclock_lapic_timer_stop(struct clock_event_device *evt)
+{
+	kvmclock_lapic_timer_setup(APIC_LVT_MASKED);
+	wrmsrl(MSR_IA32_TSC_DEADLINE, -1);
+	return 0;
+}
+
+/*
+ * We already have the inverse of the (mult,shift) pair, though this means
+ * we need a division.  To avoid it we could compute a multiplicative inverse
+ * every time src->version changes.
+ */
+#define KVMCLOCK_TSC_DEADLINE_MAX_BITS	38
+#define KVMCLOCK_TSC_DEADLINE_MAX	((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1)
+
+static int kvmclock_lapic_next_ktime(ktime_t expires,
+				     struct clock_event_device *evt)
+{
+	u64 ns, tsc;
+	u32 version;
+	int cpu;
+	struct pvclock_vcpu_time_info *src;
+
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+	ns = ktime_to_ns(expires);
+
+	do {
+		u64 delta_ns;
+		int shift;
+
+		version = pvclock_read_begin(src);
+		if (unlikely(ns < src->system_time)) {
+			tsc = src->tsc_timestamp;
+			virt_rmb();
+			continue;
+		}
+
+		delta_ns = ns - src->system_time;
+
+		/* Cap the wait to avoid overflow. */
+		if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX))
+			delta_ns = KVMCLOCK_TSC_DEADLINE_MAX;
+
+		/*
+		 * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul.
+		 * The shift is split in two steps so that a 38 bits (275 s)
+		 * deadline fits into the 64-bit dividend.
+		 */
+		shift = 32 - src->tsc_shift;
+
+		/* First shift step... */
+		delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
+		shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
+
+		/* ... division... */
+		tsc = div_u64(delta_ns, src->tsc_to_system_mul);
+
+		/* ... and second shift step for the remaining bits. */
+		if (shift >= 0)
+			tsc <<= shift;
+		else
+			tsc >>= -shift;
+
+		tsc += src->tsc_timestamp;
+	} while (pvclock_read_retry(src, version));
+
+	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc);
+	return 0;
+}
+
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device kvm_clockevent = {
+	.name			= "lapic",
+	/* Under KVM the LAPIC timer always runs in deep C-states. */
+	.features		= CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME,
+	.set_state_shutdown	= kvmclock_lapic_timer_stop,
+	.set_state_oneshot	= kvmclock_lapic_timer_set_oneshot,
+	.set_next_ktime		= kvmclock_lapic_next_ktime,
+	.mult			= 1,
+	/* Make LAPIC timer preferrable over percpu HPET */
+	.rating			= 150,
+	.irq			= -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, kvm_events);
+
+static void kvmclock_local_apic_timer_interrupt(void)
+{
+	int cpu = smp_processor_id();
+	struct clock_event_device *evt = &per_cpu(kvm_events, cpu);
+
+	/*
+	 * Defer to the native clockevent if ours hasn't been setup yet.
+	 */
+	if (!evt->event_handler) {
+		native_local_apic_timer_interrupt();
+		return;
+	}
+
+	inc_irq_stat(apic_timer_irqs);
+	evt->event_handler(evt);
+}
+
+/*
+ * Setup the local APIC timer for this CPU.  Copy the initialized values
+ * of the boot CPU and register the clock event in the framework.
+ */
+static void setup_kvmclock_timer(void)
+{
+	struct clock_event_device *evt = this_cpu_ptr(&kvm_events);
+
+	kvmclock_lapic_timer_stop(evt);
+
+	memcpy(evt, &kvm_clockevent, sizeof(*evt));
+	evt->cpumask = cpumask_of(smp_processor_id());
+	clockevents_register_device(evt);
+}
+#endif
+
 void __init kvmclock_init(void)
 {
 	struct pvclock_vcpu_time_info *vcpu_time;
@@ -292,6 +442,12 @@ void __init kvmclock_init(void)
 	x86_platform.get_wallclock = kvm_get_wallclock;
 	x86_platform.set_wallclock = kvm_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
+	if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) &&
+	    !disable_apic && !disable_apic_timer) {
+		pv_time_ops.local_apic_timer_interrupt = kvmclock_local_apic_timer_interrupt;
+		x86_init.timers.setup_percpu_clockev = setup_kvmclock_timer;
+		x86_cpuinit.setup_percpu_clockev = setup_kvmclock_timer;
+	}
 	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
 #endif
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 16+ messages in thread
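The trickiest part of the patch is the fixed-point inversion in kvmclock_lapic_next_ktime(). pvclock converts TSC ticks to nanoseconds roughly as ns = ((delta_tsc << tsc_shift) * tsc_to_system_mul) >> 32, so the inverse needs delta_tsc = (delta_ns << (32 - tsc_shift)) / tsc_to_system_mul, with the left shift split around the division so a 38-bit nanosecond delta never overflows the 64-bit dividend. A standalone user-space model of just that arithmetic (function name and test parameters are mine; a sketch, not the kernel code):

```c
#include <assert.h>
#include <stdint.h>

#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38

/* Invert the pvclock (tsc_to_system_mul, tsc_shift) pair: convert a
 * nanosecond delta (already capped to 38 bits) into a TSC-tick delta,
 * mirroring the two-step shift of kvmclock_lapic_next_ktime(). */
static uint64_t ns_to_tsc_delta(uint64_t delta_ns,
				uint32_t tsc_to_system_mul,
				int tsc_shift)
{
	int shift = 32 - tsc_shift;
	uint64_t tsc;

	/* First shift step: consume the headroom above the 38-bit cap,
	 * keeping the dividend within 64 bits. */
	delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;
	shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS;

	/* ... the division by the 32-bit multiplier ... */
	tsc = delta_ns / tsc_to_system_mul;

	/* ... and the remaining shift, which may be negative. */
	if (shift >= 0)
		tsc <<= shift;
	else
		tsc >>= -shift;
	return tsc;
}
```

For example, a 1 GHz guest TSC corresponds to roughly mult = 0x80000000, tsc_shift = 1 (one tick per nanosecond), so a 1 ms delta converts back to 1000000 ticks; at 2 GHz (mult = 0x80000000, tsc_shift = 0) the same delta yields 2000000 ticks.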
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value 2016-09-06 22:29 ` [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value Paolo Bonzini @ 2016-09-08 22:13 ` David Matlack 2016-09-09 16:38 ` Paolo Bonzini 2016-09-15 15:09 ` Radim Krčmář 1 sibling, 1 reply; 16+ messages in thread From: David Matlack @ 2016-09-08 22:13 UTC (permalink / raw) To: Paolo Bonzini Cc: linux-kernel, kvm list, Radim Krčmář, Andy Lutomirski, Peter Hornyack, X86 ML Hi Paolo, On Tue, Sep 6, 2016 at 3:29 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: > Bad things happen if a guest using the TSC deadline timer is migrated. > The guest doesn't re-calibrate the TSC after migration, and the > TSC frequency can and will change unless your processor supports TSC > scaling (on Intel this is only Skylake) or your data center is perfectly > homogeneous. Sorry, I forgot to follow up on our discussion in v1. One thing we discussed there was using the APIC Timer to workaround a changing TSC rate. You pointed out KVM's TSC deadline timer got a nice performance boost recently, which makes it preferable. Does it makes sense to apply the same optimization (using the VMX preemption timer) to the APIC Timer, instead of creating a new dependency on kvmclock? > > The solution in this patch is to skip tsc_khz, and instead derive the > frequency from kvmclock's (mult, shift) pair. Because kvmclock > parameters convert from tsc to nanoseconds, this needs a division > but that's the least of our problems when the TSC_DEADLINE_MSR write > costs 2000 clock cycles. Luckily tsc_khz is really used by very little > outside the tsc clocksource (which kvmclock replaces) and the TSC > deadline timer. Because KVM's local APIC doesn't need quirks, we > provide a paravirt clockevent that still uses the deadline timer > under the hood (as suggested by Andy Lutomirski). > > This patch does not handle the very first deadline, hoping that it > is masked by the migration downtime (i.e. 
that the timer fires late > anyway). > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > --- > arch/x86/include/asm/apic.h | 1 + > arch/x86/kernel/apic/apic.c | 2 +- > arch/x86/kernel/kvmclock.c | 156 ++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 158 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h > index f6e0bad1cde2..c88b0dcfdf3a 100644 > --- a/arch/x86/include/asm/apic.h > +++ b/arch/x86/include/asm/apic.h > @@ -53,6 +53,7 @@ extern unsigned int apic_verbosity; > extern int local_apic_timer_c2_ok; > > extern int disable_apic; > +extern int disable_apic_timer; > extern unsigned int lapic_timer_frequency; > > #ifdef CONFIG_SMP > diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c > index 5b63bec7d0af..d0c6d1e3d627 100644 > --- a/arch/x86/kernel/apic/apic.c > +++ b/arch/x86/kernel/apic/apic.c > @@ -169,7 +169,7 @@ __setup("apicpmtimer", setup_apicpmtimer); > unsigned long mp_lapic_addr; > int disable_apic; > /* Disable local APIC timer from the kernel commandline or via dmi quirk */ > -static int disable_apic_timer __initdata; > +int disable_apic_timer __initdata; > /* Local APIC timer works in C2 */ > int local_apic_timer_c2_ok; > EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > index 1d39bfbd26bb..365fa6494dd3 100644 > --- a/arch/x86/kernel/kvmclock.c > +++ b/arch/x86/kernel/kvmclock.c > @@ -17,6 +17,7 @@ > */ > > #include <linux/clocksource.h> > +#include <linux/clockchips.h> > #include <linux/kvm_para.h> > #include <asm/pvclock.h> > #include <asm/msr.h> > @@ -245,6 +246,155 @@ static void kvm_shutdown(void) > native_machine_shutdown(); > } > > +#ifdef CONFIG_X86_LOCAL_APIC > +/* > + * kvmclock-based clock event implementation, used only together with the > + * TSC deadline timer. A subset of the normal LAPIC clockevent, but it > + * uses kvmclock to convert nanoseconds to TSC. 
This is necessary to > + * handle changes to the TSC frequency, e.g. from live migration. > + */ > + > +static void kvmclock_lapic_timer_setup(unsigned lvtt_value) > +{ > + lvtt_value |= LOCAL_TIMER_VECTOR | APIC_LVT_TIMER_TSCDEADLINE; > + apic_write(APIC_LVTT, lvtt_value); > +} > + > +static int kvmclock_lapic_timer_set_oneshot(struct clock_event_device *evt) > +{ > + kvmclock_lapic_timer_setup(0); > + printk_once(KERN_DEBUG "kvmclock: TSC deadline timer enabled\n"); > + > + /* > + * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode, > + * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized. > + * According to Intel, MFENCE can do the serialization here. > + */ > + asm volatile("mfence" : : : "memory"); > + return 0; > +} > + > +static int kvmclock_lapic_timer_stop(struct clock_event_device *evt) > +{ > + kvmclock_lapic_timer_setup(APIC_LVT_MASKED); > + wrmsrl(MSR_IA32_TSC_DEADLINE, -1); > + return 0; > +} > + > +/* > + * We already have the inverse of the (mult,shift) pair, though this means > + * we need a division. To avoid it we could compute a multiplicative inverse > + * every time src->version changes. > + */ > +#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38 > +#define KVMCLOCK_TSC_DEADLINE_MAX ((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1) > + > +static int kvmclock_lapic_next_ktime(ktime_t expires, > + struct clock_event_device *evt) > +{ > + u64 ns, tsc; > + u32 version; > + int cpu; > + struct pvclock_vcpu_time_info *src; > + > + cpu = smp_processor_id(); > + src = &hv_clock[cpu].pvti; > + ns = ktime_to_ns(expires); > + > + do { > + u64 delta_ns; > + int shift; > + > + version = pvclock_read_begin(src); > + if (unlikely(ns < src->system_time)) { > + tsc = src->tsc_timestamp; > + virt_rmb(); > + continue; > + } > + > + delta_ns = ns - src->system_time; > + > + /* Cap the wait to avoid overflow. 
*/ > + if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX)) > + delta_ns = KVMCLOCK_TSC_DEADLINE_MAX; > + > + /* > + * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul. > + * The shift is split in two steps so that a 38 bits (275 s) > + * deadline fits into the 64-bit dividend. > + */ > + shift = 32 - src->tsc_shift; > + > + /* First shift step... */ > + delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; > + shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; > + > + /* ... division... */ > + tsc = div_u64(delta_ns, src->tsc_to_system_mul); > + > + /* ... and second shift step for the remaining bits. */ > + if (shift >= 0) > + tsc <<= shift; > + else > + tsc >>= -shift; > + > + tsc += src->tsc_timestamp; > + } while (pvclock_read_retry(src, version)); > + > + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc); > + return 0; > +} > + > +/* > + * The local apic timer can be used for any function which is CPU local. > + */ > +static struct clock_event_device kvm_clockevent = { > + .name = "lapic", Should we encode "kvm" or "kvmclock" in the name? I'm not sure how this name gets used, but it might be nice to distinguish it from the native TSC deadline timer clock_event_device. > + /* Under KVM the LAPIC timer always runs in deep C-states. */ > + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME, > + .set_state_shutdown = kvmclock_lapic_timer_stop, > + .set_state_oneshot = kvmclock_lapic_timer_set_oneshot, > + .set_next_ktime = kvmclock_lapic_next_ktime, > + .mult = 1, > + /* Make LAPIC timer preferrable over percpu HPET */ > + .rating = 150, > + .irq = -1, > +}; > +static DEFINE_PER_CPU(struct clock_event_device, kvm_events); > + > +static void kvmclock_local_apic_timer_interrupt(void) > +{ > + int cpu = smp_processor_id(); > + struct clock_event_device *evt = &per_cpu(kvm_events, cpu); > + > + /* > + * Defer to the native clockevent if ours hasn't been setup yet. 
> + */ > + if (!evt->event_handler) { > + native_local_apic_timer_interrupt(); > + return; > + } > + > + inc_irq_stat(apic_timer_irqs); > + evt->event_handler(evt); > +} > + > +/* > + * Setup the local APIC timer for this CPU. Copy the initialized values > + * of the boot CPU and register the clock event in the framework. > + */ > +static void setup_kvmclock_timer(void) > +{ > + struct clock_event_device *evt = this_cpu_ptr(&kvm_events); > + > + kvmclock_lapic_timer_stop(evt); > + > + memcpy(evt, &kvm_clockevent, sizeof(*evt)); > + evt->cpumask = cpumask_of(smp_processor_id()); > + clockevents_register_device(evt); > +} > +#endif > + > void __init kvmclock_init(void) > { > struct pvclock_vcpu_time_info *vcpu_time; > @@ -292,6 +442,12 @@ void __init kvmclock_init(void) > x86_platform.get_wallclock = kvm_get_wallclock; > x86_platform.set_wallclock = kvm_set_wallclock; > #ifdef CONFIG_X86_LOCAL_APIC > + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) && > + !disable_apic && !disable_apic_timer) { Request that this be a hypervisor-controllable feature. e.g. we could add a new bit to KVM's CPUID leaf to indicate kvmclock is the definitive source of TSC rate. > + pv_time_ops.local_apic_timer_interrupt = kvmclock_local_apic_timer_interrupt; > + x86_init.timers.setup_percpu_clockev = setup_kvmclock_timer; > + x86_cpuinit.setup_percpu_clockev = setup_kvmclock_timer; > + } > x86_cpuinit.early_percpu_clock_init = > kvm_setup_secondary_clock; > #endif > -- > 1.8.3.1 > ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value 2016-09-08 22:13 ` David Matlack @ 2016-09-09 16:38 ` Paolo Bonzini 2016-09-09 20:05 ` David Matlack 2016-10-11 4:05 ` Wanpeng Li 0 siblings, 2 replies; 16+ messages in thread From: Paolo Bonzini @ 2016-09-09 16:38 UTC (permalink / raw) To: David Matlack Cc: linux-kernel, kvm list, Radim Krčmář, Andy Lutomirski, Peter Hornyack, X86 ML On 09/09/2016 00:13, David Matlack wrote: > Hi Paolo, > > On Tue, Sep 6, 2016 at 3:29 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: >> Bad things happen if a guest using the TSC deadline timer is migrated. >> The guest doesn't re-calibrate the TSC after migration, and the >> TSC frequency can and will change unless your processor supports TSC >> scaling (on Intel this is only Skylake) or your data center is perfectly >> homogeneous. > > Sorry, I forgot to follow up on our discussion in v1. One thing we > discussed there was using the APIC Timer to workaround a changing TSC > rate. You pointed out KVM's TSC deadline timer got a nice performance > boost recently, which makes it preferable. Does it makes sense to > apply the same optimization (using the VMX preemption timer) to the > APIC Timer, instead of creating a new dependency on kvmclock? Hi, yes it does. If we go that way kvmclock.c should be patched to blacklist the TSC deadline timer. However, I won't have time to work on it anytime soon, so _if I get reviews_ I'll take this patch first. Thanks, Paolo >> >> The solution in this patch is to skip tsc_khz, and instead derive the >> frequency from kvmclock's (mult, shift) pair. Because kvmclock >> parameters convert from tsc to nanoseconds, this needs a division >> but that's the least of our problems when the TSC_DEADLINE_MSR write >> costs 2000 clock cycles. Luckily tsc_khz is really used by very little >> outside the tsc clocksource (which kvmclock replaces) and the TSC >> deadline timer. 
Because KVM's local APIC doesn't need quirks, we >> provide a paravirt clockevent that still uses the deadline timer >> under the hood (as suggested by Andy Lutomirski). >> >> This patch does not handle the very first deadline, hoping that it >> is masked by the migration downtime (i.e. that the timer fires late >> anyway). >> >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> >> --- >> arch/x86/include/asm/apic.h | 1 + >> arch/x86/kernel/apic/apic.c | 2 +- >> arch/x86/kernel/kvmclock.c | 156 ++++++++++++++++++++++++++++++++++++++++++++ >> 3 files changed, 158 insertions(+), 1 deletion(-) >> >> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h >> index f6e0bad1cde2..c88b0dcfdf3a 100644 >> --- a/arch/x86/include/asm/apic.h >> +++ b/arch/x86/include/asm/apic.h >> @@ -53,6 +53,7 @@ extern unsigned int apic_verbosity; >> extern int local_apic_timer_c2_ok; >> >> extern int disable_apic; >> +extern int disable_apic_timer; >> extern unsigned int lapic_timer_frequency; >> >> #ifdef CONFIG_SMP >> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c >> index 5b63bec7d0af..d0c6d1e3d627 100644 >> --- a/arch/x86/kernel/apic/apic.c >> +++ b/arch/x86/kernel/apic/apic.c >> @@ -169,7 +169,7 @@ __setup("apicpmtimer", setup_apicpmtimer); >> unsigned long mp_lapic_addr; >> int disable_apic; >> /* Disable local APIC timer from the kernel commandline or via dmi quirk */ >> -static int disable_apic_timer __initdata; >> +int disable_apic_timer __initdata; >> /* Local APIC timer works in C2 */ >> int local_apic_timer_c2_ok; >> EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); >> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c >> index 1d39bfbd26bb..365fa6494dd3 100644 >> --- a/arch/x86/kernel/kvmclock.c >> +++ b/arch/x86/kernel/kvmclock.c >> @@ -17,6 +17,7 @@ >> */ >> >> #include <linux/clocksource.h> >> +#include <linux/clockchips.h> >> #include <linux/kvm_para.h> >> #include <asm/pvclock.h> >> #include <asm/msr.h> >> @@ -245,6 
+246,155 @@ static void kvm_shutdown(void) >> native_machine_shutdown(); >> } >> >> +#ifdef CONFIG_X86_LOCAL_APIC >> +/* >> + * kvmclock-based clock event implementation, used only together with the >> + * TSC deadline timer. A subset of the normal LAPIC clockevent, but it >> + * uses kvmclock to convert nanoseconds to TSC. This is necessary to >> + * handle changes to the TSC frequency, e.g. from live migration. >> + */ >> + >> +static void kvmclock_lapic_timer_setup(unsigned lvtt_value) >> +{ >> + lvtt_value |= LOCAL_TIMER_VECTOR | APIC_LVT_TIMER_TSCDEADLINE; >> + apic_write(APIC_LVTT, lvtt_value); >> +} >> + >> +static int kvmclock_lapic_timer_set_oneshot(struct clock_event_device *evt) >> +{ >> + kvmclock_lapic_timer_setup(0); >> + printk_once(KERN_DEBUG "kvmclock: TSC deadline timer enabled\n"); >> + >> + /* >> + * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode, >> + * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized. >> + * According to Intel, MFENCE can do the serialization here. >> + */ >> + asm volatile("mfence" : : : "memory"); >> + return 0; >> +} >> + >> +static int kvmclock_lapic_timer_stop(struct clock_event_device *evt) >> +{ >> + kvmclock_lapic_timer_setup(APIC_LVT_MASKED); >> + wrmsrl(MSR_IA32_TSC_DEADLINE, -1); >> + return 0; >> +} >> + >> +/* >> + * We already have the inverse of the (mult,shift) pair, though this means >> + * we need a division. To avoid it we could compute a multiplicative inverse >> + * every time src->version changes. 
>> + */ >> +#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38 >> +#define KVMCLOCK_TSC_DEADLINE_MAX ((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1) >> + >> +static int kvmclock_lapic_next_ktime(ktime_t expires, >> + struct clock_event_device *evt) >> +{ >> + u64 ns, tsc; >> + u32 version; >> + int cpu; >> + struct pvclock_vcpu_time_info *src; >> + >> + cpu = smp_processor_id(); >> + src = &hv_clock[cpu].pvti; >> + ns = ktime_to_ns(expires); >> + >> + do { >> + u64 delta_ns; >> + int shift; >> + >> + version = pvclock_read_begin(src); >> + if (unlikely(ns < src->system_time)) { >> + tsc = src->tsc_timestamp; >> + virt_rmb(); >> + continue; >> + } >> + >> + delta_ns = ns - src->system_time; >> + >> + /* Cap the wait to avoid overflow. */ >> + if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX)) >> + delta_ns = KVMCLOCK_TSC_DEADLINE_MAX; >> + >> + /* >> + * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul. >> + * The shift is split in two steps so that a 38 bits (275 s) >> + * deadline fits into the 64-bit dividend. >> + */ >> + shift = 32 - src->tsc_shift; >> + >> + /* First shift step... */ >> + delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; >> + shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; >> + >> + /* ... division... */ >> + tsc = div_u64(delta_ns, src->tsc_to_system_mul); >> + >> + /* ... and second shift step for the remaining bits. */ >> + if (shift >= 0) >> + tsc <<= shift; >> + else >> + tsc >>= -shift; >> + >> + tsc += src->tsc_timestamp; >> + } while (pvclock_read_retry(src, version)); >> + >> + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc); >> + return 0; >> +} >> + >> +/* >> + * The local apic timer can be used for any function which is CPU local. >> + */ >> +static struct clock_event_device kvm_clockevent = { >> + .name = "lapic", > > Should we encode "kvm" or "kvmclock" in the name? I'm not sure how > this name gets used, but it might be nice to distinguish it from the > native TSC deadline timer clock_event_device. 
> >> + /* Under KVM the LAPIC timer always runs in deep C-states. */ >> + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME, >> + .set_state_shutdown = kvmclock_lapic_timer_stop, >> + .set_state_oneshot = kvmclock_lapic_timer_set_oneshot, >> + .set_next_ktime = kvmclock_lapic_next_ktime, >> + .mult = 1, >> + /* Make LAPIC timer preferrable over percpu HPET */ >> + .rating = 150, >> + .irq = -1, >> +}; >> +static DEFINE_PER_CPU(struct clock_event_device, kvm_events); >> + >> +static void kvmclock_local_apic_timer_interrupt(void) >> +{ >> + int cpu = smp_processor_id(); >> + struct clock_event_device *evt = &per_cpu(kvm_events, cpu); >> + >> + /* >> + * Defer to the native clockevent if ours hasn't been setup yet. >> + */ >> + if (!evt->event_handler) { >> + native_local_apic_timer_interrupt(); >> + return; >> + } >> + >> + inc_irq_stat(apic_timer_irqs); >> + evt->event_handler(evt); >> +} >> + >> +/* >> + * Setup the local APIC timer for this CPU. Copy the initialized values >> + * of the boot CPU and register the clock event in the framework. >> + */ >> +static void setup_kvmclock_timer(void) >> +{ >> + struct clock_event_device *evt = this_cpu_ptr(&kvm_events); >> + >> + kvmclock_lapic_timer_stop(evt); >> + >> + memcpy(evt, &kvm_clockevent, sizeof(*evt)); >> + evt->cpumask = cpumask_of(smp_processor_id()); >> + clockevents_register_device(evt); >> +} >> +#endif >> + >> void __init kvmclock_init(void) >> { >> struct pvclock_vcpu_time_info *vcpu_time; >> @@ -292,6 +442,12 @@ void __init kvmclock_init(void) >> x86_platform.get_wallclock = kvm_get_wallclock; >> x86_platform.set_wallclock = kvm_set_wallclock; >> #ifdef CONFIG_X86_LOCAL_APIC >> + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) && >> + !disable_apic && !disable_apic_timer) { > > Request that this be a hypervisor-controllable feature. e.g. we could > add a new bit to KVM's CPUID leaf to indicate kvmclock is the > definitive source of TSC rate. 
> >> + pv_time_ops.local_apic_timer_interrupt = kvmclock_local_apic_timer_interrupt; >> + x86_init.timers.setup_percpu_clockev = setup_kvmclock_timer; >> + x86_cpuinit.setup_percpu_clockev = setup_kvmclock_timer; >> + } >> x86_cpuinit.early_percpu_clock_init = >> kvm_setup_secondary_clock; >> #endif >> -- >> 1.8.3.1 >> ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value 2016-09-09 16:38 ` Paolo Bonzini @ 2016-09-09 20:05 ` David Matlack 2016-10-11 4:05 ` Wanpeng Li 1 sibling, 0 replies; 16+ messages in thread From: David Matlack @ 2016-09-09 20:05 UTC (permalink / raw) To: Paolo Bonzini Cc: linux-kernel, kvm list, Radim Krčmář, Andy Lutomirski, Peter Hornyack, X86 ML On Fri, Sep 9, 2016 at 9:38 AM, Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 09/09/2016 00:13, David Matlack wrote: >> Hi Paolo, >> >> On Tue, Sep 6, 2016 at 3:29 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: >>> Bad things happen if a guest using the TSC deadline timer is migrated. >>> The guest doesn't re-calibrate the TSC after migration, and the >>> TSC frequency can and will change unless your processor supports TSC >>> scaling (on Intel this is only Skylake) or your data center is perfectly >>> homogeneous. >> >> Sorry, I forgot to follow up on our discussion in v1. One thing we >> discussed there was using the APIC Timer to work around a changing TSC >> rate. You pointed out KVM's TSC deadline timer got a nice performance >> boost recently, which makes it preferable. Does it make sense to >> apply the same optimization (using the VMX preemption timer) to the >> APIC Timer, instead of creating a new dependency on kvmclock? > > Hi, yes it does. If we go that way kvmclock.c should be patched to > blacklist the TSC deadline timer. However, I won't have time to work on > it anytime soon, so _if I get reviews_ I'll take this patch first. Got it, thanks for the context. ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value 2016-09-09 16:38 ` Paolo Bonzini 2016-09-09 20:05 ` David Matlack @ 2016-10-11 4:05 ` Wanpeng Li 1 sibling, 0 replies; 16+ messages in thread From: Wanpeng Li @ 2016-10-11 4:05 UTC (permalink / raw) To: Paolo Bonzini Cc: David Matlack, linux-kernel, kvm list, Radim Krčmář, Andy Lutomirski, Peter Hornyack, X86 ML 2016-09-10 0:38 GMT+08:00 Paolo Bonzini <pbonzini@redhat.com>: > > > On 09/09/2016 00:13, David Matlack wrote: >> Hi Paolo, >> >> On Tue, Sep 6, 2016 at 3:29 PM, Paolo Bonzini <pbonzini@redhat.com> wrote: >>> Bad things happen if a guest using the TSC deadline timer is migrated. >>> The guest doesn't re-calibrate the TSC after migration, and the >>> TSC frequency can and will change unless your processor supports TSC >>> scaling (on Intel this is only Skylake) or your data center is perfectly >>> homogeneous. >> >> Sorry, I forgot to follow up on our discussion in v1. One thing we >> discussed there was using the APIC Timer to work around a changing TSC >> rate. You pointed out KVM's TSC deadline timer got a nice performance >> boost recently, which makes it preferable. Does it make sense to >> apply the same optimization (using the VMX preemption timer) to the >> APIC Timer, instead of creating a new dependency on kvmclock? > > Hi, yes it does. Windows 2008 server, 2012 server, etc. are still using APIC Timer periodic mode, so I am adding VMX preemption timer support to APIC Timer one-shot/periodic mode currently. Regards, Wanpeng Li ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value 2016-09-06 22:29 ` [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value Paolo Bonzini 2016-09-08 22:13 ` David Matlack @ 2016-09-15 15:09 ` Radim Krčmář 2016-09-15 16:00 ` Paolo Bonzini 1 sibling, 1 reply; 16+ messages in thread From: Radim Krčmář @ 2016-09-15 15:09 UTC (permalink / raw) To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack, luto, peterhornyack, x86 2016-09-07 00:29+0200, Paolo Bonzini: > Bad things happen if a guest using the TSC deadline timer is migrated. > The guest doesn't re-calibrate the TSC after migration, and the > TSC frequency can and will change unless your processor supports TSC > scaling (on Intel this is only Skylake) or your data center is perfectly > homogeneous. KVM has software scaling thanks to tsc_catchup and virtual_tsc_khz to allow the guest to use hardware TSC directly. The software scaling should recalibrate TSC on vmexits and is therefore losing precision in between -- are you working around the imprecision? Host should translate the requested tsc deadline into nanoseconds based on vcpu->arch.virtual_tsc_khz, which doesn't change on migration, so the solution shouldn't work, because we'd be scaling twice. Don't we have a bug in the TSC recalibration/scaling on KVM side? > The solution in this patch is to skip tsc_khz, and instead derive the > frequency from kvmclock's (mult, shift) pair. Because kvmclock > parameters convert from tsc to nanoseconds, this needs a division > but that's the least of our problems when the TSC_DEADLINE_MSR write > costs 2000 clock cycles. Luckily tsc_khz is really used by very little > outside the tsc clocksource (which kvmclock replaces) and the TSC > deadline timer. Because KVM's local APIC doesn't need quirks, we > provide a paravirt clockevent that still uses the deadline timer > under the hood (as suggested by Andy Lutomirski). I don't like the general idea.
TSC and TSC_DEADLINE_TIMER are standard features; if we don't emulate them, then we should not expose them in CPUID. rdtsc and wrmsr would still be available for kvmclock, but naive operating systems would not be misled. When we are already going the paravirtual route, we could add an interface that accepts the deadline in kvmclock nanoseconds. It would be much more maintainable than adding a fragile paravirtual layer on top of random interfaces. > This patch does not handle the very first deadline, hoping that it > is masked by the migration downtime (i.e. that the timer fires late > anyway). > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > --- Nonetheless, I also did a code review ... > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > @@ -245,6 +246,155 @@ static void kvm_shutdown(void) > + > +static int kvmclock_lapic_timer_stop(struct clock_event_device *evt) > +{ > + kvmclock_lapic_timer_setup(APIC_LVT_MASKED); > + wrmsrl(MSR_IA32_TSC_DEADLINE, -1); This wrmsrl() can't inject an interrupt, because a later switch to TSCDEADLINE mode disarms the timer, but it would be better to remove it. > + return 0; > +} > + > +/* > + * We already have the inverse of the (mult,shift) pair, though this means > + * we need a division. To avoid it we could compute a multiplicative inverse > + * every time src->version changes. 
> + */ > +#define KVMCLOCK_TSC_DEADLINE_MAX_BITS 38 > +#define KVMCLOCK_TSC_DEADLINE_MAX ((1ull << KVMCLOCK_TSC_DEADLINE_MAX_BITS) - 1) > + > +static int kvmclock_lapic_next_ktime(ktime_t expires, > + struct clock_event_device *evt) > +{ > + u64 ns, tsc; > + u32 version; > + int cpu; > + struct pvclock_vcpu_time_info *src; > + > + cpu = smp_processor_id(); > + src = &hv_clock[cpu].pvti; > + ns = ktime_to_ns(expires); > + > + do { > + u64 delta_ns; > + int shift; > + > + version = pvclock_read_begin(src); > + if (unlikely(ns < src->system_time)) { > + tsc = src->tsc_timestamp; > + virt_rmb(); > + continue; > + } > + > + delta_ns = ns - src->system_time; > + > + /* Cap the wait to avoid overflow. */ > + if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX)) > + delta_ns = KVMCLOCK_TSC_DEADLINE_MAX; > + > + /* > + * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul. > + * The shift is split in two steps so that a 38 bits (275 s) > + * deadline fits into the 64-bit dividend. > + */ > + shift = 32 - src->tsc_shift; > + > + /* First shift step... */ > + delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; > + shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; Considering that the usual shift seems to be -1, we'd be losing 7 bits of precision. The nice thing is that the precision is bounded up to the cap ... I dislike the cap and LOC count, though, so the following, although not as tightly bounded seems a bit better: u64 mult = div_u64(1ULL << 63, tsc_to_system_mul); int shift = 63 - (32 - tsc_shift)); tsc = src->tsc_timestamp + mul_u64_u64_shr(delta_ns, mult, shift); mult should be quite stable, so we could cache it. (And if we didn't care about performance loss from 4(?) divisions, we could have much nicer shl_div().) > + > + /* ... division... */ > + tsc = div_u64(delta_ns, src->tsc_to_system_mul); > + > + /* ... and second shift step for the remaining bits. 
*/ > + if (shift >= 0) > + tsc <<= shift; > + else > + tsc >>= -shift; > + > + tsc += src->tsc_timestamp; > + } while (pvclock_read_retry(src, version)); > + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc); > + return 0; > +} > + > +/* > + * The local apic timer can be used for any function which is CPU local. > + */ > +static struct clock_event_device kvm_clockevent = { > + .name = "lapic", "lapic" is already used -- what about "kvm-lapic" or "kvm-tsc-deadline"? > + /* Under KVM the LAPIC timer always runs in deep C-states. */ > + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME, > + .set_state_shutdown = kvmclock_lapic_timer_stop, > + .set_state_oneshot = kvmclock_lapic_timer_set_oneshot, > + .set_next_ktime = kvmclock_lapic_next_ktime, > + .mult = 1, > + /* Make LAPIC timer preferrable over percpu HPET */ > + .rating = 150, > + .irq = -1, > +}; > +static DEFINE_PER_CPU(struct clock_event_device, kvm_events); > + > +static void kvmclock_local_apic_timer_interrupt(void) > +{ > + int cpu = smp_processor_id(); > + struct clock_event_device *evt = &per_cpu(kvm_events, cpu); > + > + /* > + * Defer to the native clockevent if ours hasn't been setup yet. > + */ > + if (!evt->event_handler) { > + native_local_apic_timer_interrupt(); Don't we completely replace the native apic timer, so it can't even be registered? The comment doesn't explain what we expect from the call, so it's a bit confusing. I think the call expects that per_cpu(lapic_events, cpu).event_handler == NULL, so we do it to print the warning and disable the LAPIC timer. > + return; > + } > + > + inc_irq_stat(apic_timer_irqs); > + evt->event_handler(evt); > +} > + > +/* > + * Setup the local APIC timer for this CPU. Copy the initialized values > + * of the boot CPU and register the clock event in the framework. > + */ > +static void setup_kvmclock_timer(void) > +{ > + struct clock_event_device *evt = this_cpu_ptr(&kvm_events); > + > + kvmclock_lapic_timer_stop(evt); Why stop the timer here? 
We don't even know if the clockevent device will be used, so it seems out of order. > + memcpy(evt, &kvm_clockevent, sizeof(*evt)); > + evt->cpumask = cpumask_of(smp_processor_id()); > + clockevents_register_device(evt); > +} > +#endif > + > void __init kvmclock_init(void) > { > struct pvclock_vcpu_time_info *vcpu_time; ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value 2016-09-15 15:09 ` Radim Krčmář @ 2016-09-15 16:00 ` Paolo Bonzini 2016-09-15 19:59 ` Radim Krčmář 0 siblings, 1 reply; 16+ messages in thread From: Paolo Bonzini @ 2016-09-15 16:00 UTC (permalink / raw) To: Radim Krčmář Cc: linux-kernel, kvm, dmatlack, luto, peterhornyack, x86 On 15/09/2016 17:09, Radim Krčmář wrote: > 2016-09-07 00:29+0200, Paolo Bonzini: >> Bad things happen if a guest using the TSC deadline timer is migrated. >> The guest doesn't re-calibrate the TSC after migration, and the >> TSC frequency can and will change unless your processor supports TSC >> scaling (on Intel this is only Skylake) or your data center is perfectly >> homogeneous. > > KVM has software scaling thanks to tsc_catchup and virtual_tsc_khz to > allow the guest to use hardware TSC directly. The software scaling > should recalibrate TSC on vmexits and is therefore losing precision in > between -- are you working around the imprecision? > > Host should translate the requested tsc deadline into nanoseconds based > on vcpu->arch.virtual_tsc_khz, which doesn't change on migration, so the > solution shouldn't work, because we'd be scaling twice. tsc_catchup is pretty much broken and not really fixable; IIRC it causes a huge performance loss because it updates kvmclock on every vmexit. So virtual_tsc_khz does change on migration, at least if your host doesn't have TSC scaling (which is pretty much all Intel hosts in existence). Safety against TSC rate changes is pretty much the reason why kvmclock exists: it used to be that TSC rate changed on pCPU migration, now it's only host migration but the idea is the same. :) > I don't like the general idea. TSC and TSC_DEADLINE_TIMER are standard > features; if we don't emulate them, then we should not expose them in CPUID. > rdtsc and wrmsr would still be available for kvmclock, but naive > operating systems would not be misled. We do emulate TSC_DEADLINE_TIMER correctly. 
The problem is that the kernel sets a deadline in nanoseconds and the nanoseconds->TSC relation can change. It's exactly the same issue that kvmclock is handling, except that kvmclock handles TSC->nanoseconds and this one is the other way round. (Another small benefit of this patch vs. the native clockevent is that it doesn't waste time calibrating the LAPIC timer when we know the frequency from kvmclock). > When we are already going the paravirtual route, we could add an > interface that accepts the deadline in kvmclock nanoseconds. > It would be much more maintanable than adding a fragile paravirtual > layer on top of random interfaces. Good idea. This still wouldn't handle old hosts of course. >> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c >> @@ -245,6 +246,155 @@ static void kvm_shutdown(void) >> + >> +static int kvmclock_lapic_timer_stop(struct clock_event_device *evt) >> +{ >> + kvmclock_lapic_timer_setup(APIC_LVT_MASKED); >> + wrmsrl(MSR_IA32_TSC_DEADLINE, -1); > > This wrmsrl() can't inject an interrupt, because later switch to > TSCDEADLINE mode disarms the timer, but it would be better to remove it. You mean leave only kvmclock_lapic_timer_setup(APIC_LVT_MASKED)? >> + /* Cap the wait to avoid overflow. */ >> + if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX)) >> + delta_ns = KVMCLOCK_TSC_DEADLINE_MAX; >> + >> + /* >> + * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul. >> + * The shift is split in two steps so that a 38 bits (275 s) >> + * deadline fits into the 64-bit dividend. >> + */ >> + shift = 32 - src->tsc_shift; >> + >> + /* First shift step... */ >> + delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; >> + shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; > > Considering that the usual shift seems to be -1, we'd be losing 7 bits > of precision. The nice thing is that the precision is bounded up to the > cap ... 
I dislike the cap and LOC count, though, so the following, > although not as tightly bounded seems a bit better: > > u64 mult = div_u64(1ULL << 63, tsc_to_system_mul); > int shift = 63 - (32 - tsc_shift)); > > tsc = src->tsc_timestamp + mul_u64_u64_shr(delta_ns, mult, shift); > > mult should be quite stable, so we could cache it. > (And if we didn't care about performance loss from 4(?) divisions, we > could have much nicer shl_div().) What are the four divisions? (And yes, I agree this is quite nice). >> +/* >> + * The local apic timer can be used for any function which is CPU local. >> + */ >> +static struct clock_event_device kvm_clockevent = { >> + .name = "lapic", > > "lapic" is already used -- what about "kvm-lapic" or "kvm-tsc-deadline"? Of course. >> + /* Under KVM the LAPIC timer always runs in deep C-states. */ >> + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME, >> + .set_state_shutdown = kvmclock_lapic_timer_stop, >> + .set_state_oneshot = kvmclock_lapic_timer_set_oneshot, >> + .set_next_ktime = kvmclock_lapic_next_ktime, >> + .mult = 1, >> + /* Make LAPIC timer preferrable over percpu HPET */ >> + .rating = 150, >> + .irq = -1, >> +}; >> +static DEFINE_PER_CPU(struct clock_event_device, kvm_events); >> + >> +static void kvmclock_local_apic_timer_interrupt(void) >> +{ >> + int cpu = smp_processor_id(); >> + struct clock_event_device *evt = &per_cpu(kvm_events, cpu); >> + >> + /* >> + * Defer to the native clockevent if ours hasn't been setup yet. >> + */ >> + if (!evt->event_handler) { >> + native_local_apic_timer_interrupt(); > > Don't we completely replace the native apic timer, so it can't even be > registered? The comment doesn't explain what we expect from the call, > so it's a bit confusing. > > I think the call expects that per_cpu(lapic_events, cpu).event_handler > == NULL, so we do it to print the warning and disable the LAPIC timer. This pvop is always called instead of native_local_apic_timer_interrupt. 
We need to defer to the native implementation if the kvmclock clocksource is not in use, for example if there's no TSC deadline timer in the guest. So I should do s/hasn't been setup yet/isn't in use/. >> + return; >> + } >> + >> + inc_irq_stat(apic_timer_irqs); >> + evt->event_handler(evt); >> +} >> + >> +/* >> + * Setup the local APIC timer for this CPU. Copy the initialized values >> + * of the boot CPU and register the clock event in the framework. >> + */ >> +static void setup_kvmclock_timer(void) >> +{ >> + struct clock_event_device *evt = this_cpu_ptr(&kvm_events); >> + >> + kvmclock_lapic_timer_stop(evt); > > Why stop the timer here? We don't even know if the clockevent device > will be used, so it seems out of order. Yeah, you're right. And as you noticed we never switch from native to pv clockevent, because setup_percpu_clockev is replaced completely. Paolo >> + memcpy(evt, &kvm_clockevent, sizeof(*evt)); >> + evt->cpumask = cpumask_of(smp_processor_id()); >> + clockevents_register_device(evt); >> +} >> +#endif >> + >> void __init kvmclock_init(void) >> { >> struct pvclock_vcpu_time_info *vcpu_time; ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value 2016-09-15 16:00 ` Paolo Bonzini @ 2016-09-15 19:59 ` Radim Krčmář 2016-09-15 21:02 ` Paolo Bonzini 0 siblings, 1 reply; 16+ messages in thread From: Radim Krčmář @ 2016-09-15 19:59 UTC (permalink / raw) To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack, luto, peterhornyack, x86 2016-09-15 18:00+0200, Paolo Bonzini: > On 15/09/2016 17:09, Radim Krčmář wrote: >> 2016-09-07 00:29+0200, Paolo Bonzini: >>> Bad things happen if a guest using the TSC deadline timer is migrated. >>> The guest doesn't re-calibrate the TSC after migration, and the >>> TSC frequency can and will change unless your processor supports TSC >>> scaling (on Intel this is only Skylake) or your data center is perfectly >>> homogeneous. >> >> KVM has software scaling thanks to tsc_catchup and virtual_tsc_khz to >> allow the guest to use hardware TSC directly. The software scaling >> should recalibrate TSC on vmexits and is therefore losing precision in >> between -- are you working around the imprecision? >> >> Host should translate the requested tsc deadline into nanoseconds based >> on vcpu->arch.virtual_tsc_khz, which doesn't change on migration, so the >> solution shouldn't work, because we'd be scaling twice. > > tsc_catchup is pretty much broken and not really fixable; IIRC it causes > a huge performance loss because it updates kvmclock on every vmexit. Yeah ... catchup needs to update TSC_OFFSET, which can't be done without updating kvmclock. > So virtual_tsc_khz does change on migration, at least if your host > doesn't have TSC scaling (which is pretty much all Intel hosts in > existence). Ugh, I'd consider exposing constant TSC to the guest (which depends solely on CPUID -- modern models have it), allowing migration, and not preserving the TSC rate as a userspace bug. 
Invariant TSC seems to prevent migration and the only thing that changed between constant and invariant TSC is that invariant TSC doesn't stop in guest-controlled deep C states. Hm, and Linux uses TSC_DEADLINE_TIMER without invariant TSC: setting a timer and halting could mean that the timer never fires ... maybe real hardware always has both, so it is an implicit condition? > Safety against TSC rate changes is pretty much the reason > why kvmclock exists: it used to be that TSC rate changed on pCPU > migration, now it's only host migration but the idea is the same. :) Yep. I think that TSC shouldn't have been allowed outside of kvmclock. >> I don't the general idea. TSC and TSC_DEADLINE_TIMER is a standard >> feature; If we don't emulate it, then we should not expose it in CPUID. >> rdtsc and wrmsr would still be available for kvmclock, but naive >> operating systems would not be misled. > > We do emulate it TSC_DEADLINE_TIMER correctly. The problem is that the > kernel sets a deadline in nanoseconds and the nanoseconds->TSC relation > can change. > > It's exactly the same issue that kvmclock is handling, except that > kvmclock handles TSC->nanoseconds and this one is the other way round. True, it is the TSC that is not emulated correctly. TSC_DEADLINE_TIMER is a victim of past decisions. > (Another small benefit of this patch vs. the native clockevent is that > it doesn't waste time calibrating the LAPIC timer when we know the > frequency from kvmclock). calibrate_APIC_clock() just returns 0 early when using the TSC deadline timer and setup re-uses the TSC calibration that was done earlier, so it's even smaller benefit. >> When we are already going the paravirtual route, we could add an >> interface that accepts the deadline in kvmclock nanoseconds. >> It would be much more maintanable than adding a fragile paravirtual >> layer on top of random interfaces. > > Good idea. I'll prepare a prototype. > This still wouldn't handle old hosts of course. 
The question is whether we want to carry around 150 LOC because of old hosts. I'd just fix Linux to avoid deadline TSC without invariant TSC. :) >>> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c >>> @@ -245,6 +246,155 @@ static void kvm_shutdown(void) >>> + >>> +static int kvmclock_lapic_timer_stop(struct clock_event_device *evt) >>> +{ >>> + kvmclock_lapic_timer_setup(APIC_LVT_MASKED); >>> + wrmsrl(MSR_IA32_TSC_DEADLINE, -1); >> >> This wrmsrl() can't inject an interrupt, because later switch to >> TSCDEADLINE mode disarms the timer, but it would be better to remove it. > >You mean leave only kvmclock_lapic_timer_setup(APIC_LVT_MASKED)? Yes. I didn't understand the -1 write at all, when 0 is the one that disarms the timer and even that isn't needed. >>> + /* Cap the wait to avoid overflow. */ >>> + if (unlikely(delta_ns > KVMCLOCK_TSC_DEADLINE_MAX)) >>> + delta_ns = KVMCLOCK_TSC_DEADLINE_MAX; >>> + >>> + /* >>> + * delta_tsc = delta_ns << (32-tsc_shift) / tsc_to_system_mul. >>> + * The shift is split in two steps so that a 38 bits (275 s) >>> + * deadline fits into the 64-bit dividend. >>> + */ >>> + shift = 32 - src->tsc_shift; >>> + >>> + /* First shift step... */ >>> + delta_ns <<= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; >>> + shift -= 64 - KVMCLOCK_TSC_DEADLINE_MAX_BITS; >> >> Considering that the usual shift seems to be -1, we'd be losing 7 bits >> of precision. The nice thing is that the precision is bounded up to the >> cap ... I dislike the cap and LOC count, though, so the following, >> although not as tightly bounded seems a bit better: >> >> u64 mult = div_u64(1ULL << 63, tsc_to_system_mul); >> int shift = 63 - (32 - tsc_shift)); >> >> tsc = src->tsc_timestamp + mul_u64_u64_shr(delta_ns, mult, shift); >> >> mult should be quite stable, so we could cache it. >> (And if we didn't care about performance loss from 4(?) divisions, we >> could have much nicer shl_div().) > > What are the four divisions? 
Ah, sorry, just a guess of how many divisions would it take to do "u128 / u32", resp. "(u64 << u6) / u32" (the shl_div() in question), with full precision. I don't know big number arithmetic well enough. >>> + /* Under KVM the LAPIC timer always runs in deep C-states. */ >>> + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME, >>> + .set_state_shutdown = kvmclock_lapic_timer_stop, >>> + .set_state_oneshot = kvmclock_lapic_timer_set_oneshot, >>> + .set_next_ktime = kvmclock_lapic_next_ktime, >>> + .mult = 1, >>> + /* Make LAPIC timer preferrable over percpu HPET */ >>> + .rating = 150, >>> + .irq = -1, >>> +}; >>> +static DEFINE_PER_CPU(struct clock_event_device, kvm_events); >>> + >>> +static void kvmclock_local_apic_timer_interrupt(void) >>> +{ >>> + int cpu = smp_processor_id(); >>> + struct clock_event_device *evt = &per_cpu(kvm_events, cpu); >>> + >>> + /* >>> + * Defer to the native clockevent if ours hasn't been setup yet. >>> + */ >>> + if (!evt->event_handler) { >>> + native_local_apic_timer_interrupt(); >> >> Don't we completely replace the native apic timer, so it can't even be >> registered? The comment doesn't explain what we expect from the call, >> so it's a bit confusing. >> >> I think the call expects that per_cpu(lapic_events, cpu).event_handler >> == NULL, so we do it to print the warning and disable the LAPIC timer. > > This pvop is always called instead of native_local_apic_timer_interrupt. > We need to defer to the native implementation if the kvmclock > clocksource is not in use, for example if there's no TSC deadline timer > in the guest. No, the pvop is for that. 
If there is no TSC deadline timer, then the pvop will call native_local_apic_timer_interrupt(), because we don't overwrite it: >>> + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) && >>> + !disable_apic && !disable_apic_timer) { >>> + pv_time_ops.local_apic_timer_interrupt = kvmclock_local_apic_timer_interrupt; >>> + x86_init.timers.setup_percpu_clockev = setup_kvmclock_timer; >>> + x86_cpuinit.setup_percpu_clockev = setup_kvmclock_timer; >>> + } > > So I should do s/hasn't been setup yet/isn't in use/. Worse than no comment, IMO. ;) I still think it is only there to call this block in native_local_apic_timer_interrupt(): if (!evt->event_handler) { pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); /* Switch it off */ lapic_timer_shutdown(evt); return; } Those dependencies make it confusing. ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value 2016-09-15 19:59 ` Radim Krčmář @ 2016-09-15 21:02 ` Paolo Bonzini 2016-09-16 14:59 ` Radim Krčmář 0 siblings, 1 reply; 16+ messages in thread From: Paolo Bonzini @ 2016-09-15 21:02 UTC (permalink / raw) To: Radim Krčmář Cc: linux-kernel, kvm, dmatlack, luto, peterhornyack, x86 On 15/09/2016 21:59, Radim Krčmář wrote: > 2016-09-15 18:00+0200, Paolo Bonzini: >> So virtual_tsc_khz does change on migration, at least if your host >> doesn't have TSC scaling (which is pretty much all Intel hosts in >> existence). > > Ugh, I'd consider exposing constant TSC to the guest (which depends > solely on CPUID -- modern models have it), allowing migration, and not > preserving the TSC rate as a userspace bug. Technically it is, but without hardware help I guess it's justified... > Invariant TSC seems to prevent migration and the only thing that changed > between constant and invariant TSC is that invariant TSC doesn't stop in > guest-controlled deep C states. > > Hm, and Linux uses TSC_DEADLINE_TIMER without invariant TSC: setting a > timer and halting could mean that the timer never fires ... > maybe real hardware always has both, so it is an implicit condition? Yeah, I guess so. And the kvmclock-based clockevent would not be able to fix this. So we also need to blacklist the TSC deadline timer if kvmclock doesn't set PVCLOCK_TSC_STABLE_BIT. >> Safety against TSC rate changes is pretty much the reason >> why kvmclock exists: it used to be that TSC rate changed on pCPU >> migration, now it's only host migration but the idea is the same. :) > > Yep. I think that TSC shouldn't have been allowed outside of kvmclock. Luckily TSC is only used by kvmclock (which is okay) and... the TSC deadline timer. Also by nested KVM's kvmclock, but that's the last of our worries I guess. So we're close. 
>>> When we are already going the paravirtual route, we could add an
>>> interface that accepts the deadline in kvmclock nanoseconds.
>>> It would be much more maintainable than adding a fragile paravirtual
>>> layer on top of random interfaces.
>>
>> Good idea.
>
> I'll prepare a prototype.

So how would this work?  A single MSR, used after setting TSC deadline
mode in LVTT?  Could you write it and read TSC deadline or vice versa?

My idea would be "yes" for writing nsec deadline and reading TSC
deadline, but "no" for writing TSC deadline and reading nsec deadline.
In the latter case, reading nsec deadline might return an impossible
value such as -1; this lets userspace decide whether to set a nsec-based
deadline or a TSC-based deadline after migration.

If you have other ideas, I'm all ears!

>> This still wouldn't handle old hosts of course.
>
> The question is whether we want to carry around 150 LOC because of old
> hosts.  I'd just fix Linux to avoid deadline TSC without invariant TSC.
> :)

Yes, that would automatically blacklist it on KVM.  You'd also need to
update the recent optimization to the TSC deadline timer, to also work
on other APIC timer modes or at least in your new PV mode.

>>> Don't we completely replace the native apic timer, so it can't even be
>>> registered?  The comment doesn't explain what we expect from the call,
>>> so it's a bit confusing.
>>>
>>> I think the call expects that per_cpu(lapic_events, cpu).event_handler
>>> == NULL, so we do it to print the warning and disable the LAPIC timer.
>>
>> This pvop is always called instead of native_local_apic_timer_interrupt.
>> We need to defer to the native implementation if the kvmclock
>> clocksource is not in use, for example if there's no TSC deadline timer
>> in the guest.
>
> No, the pvop is for that.
> If there is no TSC deadline timer, then the pvop will call
> native_local_apic_timer_interrupt(), because we don't overwrite it:
>
>>>> +	if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) &&
>>>> +	    !disable_apic && !disable_apic_timer) {
>>>> +		pv_time_ops.local_apic_timer_interrupt = kvmclock_local_apic_timer_interrupt;
>>>> +		x86_init.timers.setup_percpu_clockev = setup_kvmclock_timer;
>>>> +		x86_cpuinit.setup_percpu_clockev = setup_kvmclock_timer;
>>>> +	}
>>
>> So I should do s/hasn't been setup yet/isn't in use/.
>
> Worse than no comment, IMO. ;)
>
> I still think it is only there to call this block in
> native_local_apic_timer_interrupt():
>
>	if (!evt->event_handler) {
>		pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
>		/* Switch it off */
>		lapic_timer_shutdown(evt);
>		return;
>	}

No, it was needed for something else. :)  I just don't recall for what
anymore, since your observation on pv_time_ops.local_apic_timer_interrupt
is obviously correct.

Paolo

^ permalink raw reply	[flat|nested] 16+ messages in thread
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value
  2016-09-15 21:02         ` Paolo Bonzini
@ 2016-09-16 14:59           ` Radim Krčmář
  2016-09-16 15:06             ` Paolo Bonzini
  0 siblings, 1 reply; 16+ messages in thread
From: Radim Krčmář @ 2016-09-16 14:59 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack, luto, peterhornyack, x86

2016-09-15 23:02+0200, Paolo Bonzini:
> On 15/09/2016 21:59, Radim Krčmář wrote:
>> 2016-09-15 18:00+0200, Paolo Bonzini:
>>>> When we are already going the paravirtual route, we could add an
>>>> interface that accepts the deadline in kvmclock nanoseconds.
>>>> It would be much more maintainable than adding a fragile paravirtual
>>>> layer on top of random interfaces.
>>>
>>> Good idea.
>>
>> I'll prepare a prototype.
>
> So how would this work?  A single MSR, used after setting TSC deadline
> mode in LVTT?  Could you write it and read TSC deadline or vice versa?

So far, I think that adding KVM_MSR_DEADLINE (probably a more
descriptive name in the end) that works only in LVTT mode seems
reasonable.  I am tempted to add a second LVTT-like MSR to completely
isolate it from LAPIC timers, but sharing the VMX_PREEMPTION_TIMER would
be needlessly complicated.

> Could you write it and read TSC deadline or vice versa?

KVM_MSR_DEADLINE would be an interface in kvmclock nanosecond values and
MSR_IA32_TSCDEADLINE in TSC values.  KVM_MSR_DEADLINE would follow
similar rules as MSR_IA32_TSCDEADLINE -- the interrupt fires when
kvmclock reaches the value, you read what you write, and 0 disarms it.

If the TSC deadline timer was enabled, then the guest could write to
both MSR_IA32_TSCDEADLINE and KVM_MSR_DEADLINE, but only one could be
armed at any time (non-zero write to one will set the other to 0).

The dual interface would allow unconditional addition of the PV feature
without regressing users that currently use MSR_IA32_TSCDEADLINE and
adapted their stack to handle KVM's TSC shortcomings ...
> My idea would be "yes" for writing nsec deadline and reading TSC
> deadline, but "no" for writing TSC deadline and reading nsec deadline.
> In the latter case, reading nsec deadline might return an impossible
> value such as -1;

Both MSRs would read what was written or 0 if fired/disarmed in between.
I'm not sure if I understood what you meant, though.

> this lets userspace decide whether to set a nsec-based
> deadline or a TSC-based deadline after migration.

Hm, isn't switching to TSC-based deadline after migration pointless?
We don't have any migration notifiers, so the guest interface would have
to always check what interface to use.

>>> This still wouldn't handle old hosts of course.
>>
>> The question is whether we want to carry around 150 LOC because of old
>> hosts.  I'd just fix Linux to avoid deadline TSC without invariant TSC.
>> :)
>
> Yes, that would automatically blacklist it on KVM.  You'd also need to
> update the recent optimization to the TSC deadline timer, to also work
> on other APIC timer modes or at least in your new PV mode.

All modes shouldn't be much harder than just the PV mode.

Thanks.

^ permalink raw reply	[flat|nested] 16+ messages in thread
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value
  2016-09-16 14:59           ` Radim Krčmář
@ 2016-09-16 15:06             ` Paolo Bonzini
  2016-09-16 15:24               ` Radim Krčmář
  0 siblings, 1 reply; 16+ messages in thread
From: Paolo Bonzini @ 2016-09-16 15:06 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: linux-kernel, kvm, dmatlack, luto, peterhornyack, x86

On 16/09/2016 16:59, Radim Krčmář wrote:
> KVM_MSR_DEADLINE would be an interface in kvmclock nanosecond values and
> MSR_IA32_TSCDEADLINE in TSC values.  KVM_MSR_DEADLINE would follow
> similar rules as MSR_IA32_TSCDEADLINE -- the interrupt fires when
> kvmclock reaches the value, you read what you write, and 0 disarms it.
>
> If the TSC deadline timer was enabled, then the guest could write to
> both MSR_IA32_TSCDEADLINE and KVM_MSR_DEADLINE, but only one could be
> armed at any time (non-zero write to one will set the other to 0).
>
> The dual interface would allow unconditional addition of the PV feature
> without regressing users that currently use MSR_IA32_TSCDEADLINE and
> adapted their stack to handle KVM's TSC shortcomings ...

So far so good.  My question is: what happens if you write to
KVM_MSR_DEADLINE and read from MSR_IA32_TSCDEADLINE, or vice versa?
The possibilities are:

a) you read a 0
b) you read the value converted to the other unit
c) you read another value such as -1

(a) and (c) are the simplest of course.  (c) may make sense when writing
to MSR_IA32_TSCDEADLINE and reading from KVM_MSR_DEADLINE, since we can
decide which values are valid or not; -1 is technically a valid TSC
deadline.

I'm not sure about whether to allow (b).  In the end KVM is going to
convert a nsec deadline to a TSC value internally, and vice versa.  On
the other hand, if we do, userspace needs to figure out (on migration)
whether the guest set up a TSC or a nanosecond deadline.

>> this lets userspace decide whether to set a nsec-based
>> deadline or a TSC-based deadline after migration.
>
> Hm, isn't switching to TSC-based deadline after migration pointless?

Yes, but I didn't mean that.  I meant preserving which MSR was written
to arm the timer, and redoing the same on the destination.

>>>> This still wouldn't handle old hosts of course.
>>>
>>> The question is whether we want to carry around 150 LOC because of old
>>> hosts.  I'd just fix Linux to avoid deadline TSC without invariant TSC.
>>> :)
>>
>> Yes, that would automatically blacklist it on KVM.  You'd also need to
>> update the recent optimization to the TSC deadline timer, to also work
>> on other APIC timer modes or at least in your new PV mode.
>
> All modes shouldn't be much harder than just the PV mode.

The PV mode would still be a bit easier since it's still the TSC
deadline timer just with a nicer interface that is not based on the TSC.
Depends on how you code it though, I guess.

Paolo

^ permalink raw reply	[flat|nested] 16+ messages in thread
* Re: [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value
  2016-09-16 15:06             ` Paolo Bonzini
@ 2016-09-16 15:24               ` Radim Krčmář
  0 siblings, 0 replies; 16+ messages in thread
From: Radim Krčmář @ 2016-09-16 15:24 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack, luto, peterhornyack, x86

2016-09-16 17:06+0200, Paolo Bonzini:
> On 16/09/2016 16:59, Radim Krčmář wrote:
>> KVM_MSR_DEADLINE would be an interface in kvmclock nanosecond values and
>> MSR_IA32_TSCDEADLINE in TSC values.  KVM_MSR_DEADLINE would follow
>> similar rules as MSR_IA32_TSCDEADLINE -- the interrupt fires when
>> kvmclock reaches the value, you read what you write, and 0 disarms it.
>>
>> If the TSC deadline timer was enabled, then the guest could write to
>> both MSR_IA32_TSCDEADLINE and KVM_MSR_DEADLINE, but only one could be
>> armed at any time (non-zero write to one will set the other to 0).
>>
>> The dual interface would allow unconditional addition of the PV feature
>> without regressing users that currently use MSR_IA32_TSCDEADLINE and
>> adapted their stack to handle KVM's TSC shortcomings ...
>
> So far so good.  My question is: what happens if you write to
> KVM_MSR_DEADLINE and read from MSR_IA32_TSCDEADLINE, or vice versa?

(The second paragraph covered it ;])

> The possibilities are:
>
> a) you read a 0

This one.

> b) you read the value converted to the other unit

Too much hassle. :)

> c) you read another value such as -1

Having a common "disarmed" value is nicer and MSR_IA32_TSCDEADLINE has 0.

> (a) and (c) are the simplest of course.  (c) may make sense when writing
> to MSR_IA32_TSCDEADLINE and reading from KVM_MSR_DEADLINE, since we can
> decide which values are valid or not; -1 is technically a valid TSC
> deadline.
>
> I'm not sure about whether to allow (b).  In the end KVM is going to
> convert a nsec deadline to a TSC value internally, and vice versa.
It is not necessary to convert the nsec deadline to guest-TSC, only to
host-TSC in case the VMX_PREEMPTION_TIMER is used.  I would only have
the host-TSC internal representation, which is not exportable to the
guest or migratable.

> On the other hand, if we do, userspace needs to figure out (on migration)
> whether the guest set up a TSC or a nanosecond deadline.

Yeah, I think the solution described below (writing 0 doesn't disarm the
other one) is not bad.

>>> this lets userspace decide whether to set a nsec-based
>>> deadline or a TSC-based deadline after migration.
>>
>> Hm, isn't switching to TSC-based deadline after migration pointless?
>
> Yes, but I didn't mean that.  I meant preserving which MSR was written
> to arm the timer, and redoing the same on the destination.

Ah, I see.  Both MSRs read back whatever deadline was written to them
(if they are armed) and at most one can be non-zero.  KVM will add
MSR_IA32_TSCDEADLINE to the list of emulated MSRs, so userspace will
save/restore both deadline MSRs, and zero writes will not disarm the
other timer, so the correct timer will be armed.  No special logic to
try to avoid TSC-related bugs.

>>>>> This still wouldn't handle old hosts of course.
>>>>
>>>> The question is whether we want to carry around 150 LOC because of old
>>>> hosts.  I'd just fix Linux to avoid deadline TSC without invariant TSC.
>>>> :)
>>>
>>> Yes, that would automatically blacklist it on KVM.  You'd also need to
>>> update the recent optimization to the TSC deadline timer, to also work
>>> on other APIC timer modes or at least in your new PV mode.
>>
>> All modes shouldn't be much harder than just the PV mode.
>
> The PV mode would still be a bit easier since it's still the TSC
> deadline timer just with a nicer interface that is not based on the TSC.
> Depends on how you code it though, I guess.

Yeah, we'll see.  I am planning to carry around the deadline value in
nanoseconds (to avoid needless conversions), so it would have similar
requirements as the APIC timer.
^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2016-10-11  4:05 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-06 22:29 [PATCH v2 0/2] if running under KVM, use kvmclock to compute TSC deadline value Paolo Bonzini
2016-09-06 22:29 ` [PATCH 1/2] x86: paravirt: add local_apic_timer_interrupt to pv_ops Paolo Bonzini
2016-09-07  6:25   ` kbuild test robot
2016-09-07  6:33   ` kbuild test robot
2016-09-06 22:29 ` [PATCH 2/2] x86, kvm: use kvmclock to compute TSC deadline value Paolo Bonzini
2016-09-08 22:13   ` David Matlack
2016-09-09 16:38     ` Paolo Bonzini
2016-09-09 20:05       ` David Matlack
2016-10-11  4:05         ` Wanpeng Li
2016-09-15 15:09   ` Radim Krčmář
2016-09-15 16:00     ` Paolo Bonzini
2016-09-15 19:59       ` Radim Krčmář
2016-09-15 21:02         ` Paolo Bonzini
2016-09-16 14:59           ` Radim Krčmář
2016-09-16 15:06             ` Paolo Bonzini
2016-09-16 15:24               ` Radim Krčmář