linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM
@ 2021-02-05 10:03 Zhimin Feng
  2021-02-05 10:03 ` [RFC: timer passthrough 1/9] KVM: vmx: hook set_next_event for getting the host tscd Zhimin Feng
                   ` (10 more replies)
  0 siblings, 11 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-05 10:03 UTC (permalink / raw)
  To: x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

The main motivation for this patch series is to improve VM performance.
The series shows how to enable timer passthrough in non-root mode.

The main idea is to offload the host timer to the preemption timer in
non-root mode. By doing this, the guest can write the tscdeadline MSR
directly in non-root mode and the host timer isn't lost. If the CPU is in
root mode, the guest timer is switched to a software timer.

Testing on Intel(R) Xeon(R) Platinum 8260 server.

The guest OS is Debian (kernel: 4.19.28). The specific configuration
is as follows: 8 CPUs, 16GB memory, guest idle=poll,
memcached in guest (memcached -d -t 8 -u root)

I use the memtier_benchmark tool to test performance
(memtier_benchmark -P memcache_text -s guest_ip -c 16 -t 32
 --key-maximum=10000000000 --random-data --data-size-range=64-128 -p 11211
 --generate-keys --ratio 5:1 --test-time=500)

Total Ops/sec improves by about 25% and Avg.Latency by about 20% when
timer passthrough is enabled.

=============================================================
               | Enable timer-passth | Disable timer-passth |
=============================================================
Totals Ops/sec |    514869.67        |     411766.67        |
-------------------------------------------------------------
Avg.Latency    |    0.99483          |     1.24294          |
=============================================================


Zhimin Feng (9):
  KVM: vmx: hook set_next_event for getting the host tscd
  KVM: vmx: enable host lapic timer offload preemtion timer
  KVM: vmx: enable passthrough timer to guest
  KVM: vmx: enable passth timer switch to sw timer
  KVM: vmx: use tsc_adjust to enable tsc_offset timer passthrough
  KVM: vmx: check enable_timer_passth strictly
  KVM: vmx: save the initial value of host tscd
  KVM: vmx: Dynamically open or close the timer-passthrough for pre-vm
  KVM: vmx: query the state of timer-passth for vm

 arch/x86/include/asm/kvm_host.h |  27 ++++
 arch/x86/kvm/lapic.c            |   1 +
 arch/x86/kvm/vmx/vmx.c          | 331 +++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              |  26 +++-
 include/linux/kvm_host.h        |   1 +
 include/uapi/linux/kvm.h        |   3 +
 kernel/time/tick-common.c       |   1 +
 tools/include/uapi/linux/kvm.h  |   3 +
 virt/kvm/kvm_main.c             |   1 +
 9 files changed, 389 insertions(+), 5 deletions(-)

-- 
2.11.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC: timer passthrough 1/9] KVM: vmx: hook set_next_event for getting the host tscd
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
@ 2021-02-05 10:03 ` Zhimin Feng
  2021-02-05 12:28   ` Peter Zijlstra
  2021-02-05 18:11   ` Thomas Gleixner
  2021-02-05 10:03 ` [RFC: timer passthrough 2/9] KVM: vmx: enable host lapic timer offload preemtion timer Zhimin Feng
                   ` (9 subsequent siblings)
  10 siblings, 2 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-05 10:03 UTC (permalink / raw)
  To: x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

In order to get the host tscd (TSC deadline) value,
we need to hook the set_next_event function.

Signed-off-by: Zhimin Feng <fengzhimin@bytedance.com>
---
 arch/x86/include/asm/kvm_host.h | 21 +++++++++++++++++
 arch/x86/kvm/vmx/vmx.c          | 51 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |  1 +
 kernel/time/tick-common.c       |  1 +
 4 files changed, 74 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7e5f33a0d0e2..eb6a611963b7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -34,6 +34,7 @@
 #include <asm/kvm_page_track.h>
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/hyperv-tlfs.h>
+#include <linux/clockchips.h>
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
 
@@ -520,6 +521,24 @@ struct kvm_vcpu_hv {
 	cpumask_t tlb_flush;
 };
 
+enum tick_device_mode {
+	TICKDEV_MODE_PERIODIC,
+	TICKDEV_MODE_ONESHOT,
+};
+
+struct tick_device {
+	struct clock_event_device *evtdev;
+	enum tick_device_mode mode;
+};
+
+struct timer_passth_info {
+	u64 host_tscd;
+	struct clock_event_device *curr_dev;
+
+	void (*orig_event_handler)(struct clock_event_device *dev);
+	int (*orig_set_next_event)(unsigned long evt, struct clock_event_device *dev);
+};
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
@@ -805,6 +824,8 @@ struct kvm_vcpu_arch {
 		 */
 		bool enforce;
 	} pv_cpuid;
+
+	bool timer_passth_enable;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 47b8357b9751..38b8d80fa157 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -128,6 +128,11 @@ static bool __read_mostly enable_preemption_timer = 1;
 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 #endif
 
+static bool __read_mostly enable_timer_passth;
+#ifdef CONFIG_X86_64
+module_param_named(timer_passth, enable_timer_passth, bool, 0444);
+#endif
+
 extern bool __read_mostly allow_smaller_maxphyaddr;
 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
 
@@ -220,6 +225,46 @@ static const struct {
 	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
 };
 
+#define TSC_DIVISOR  8
+static DEFINE_PER_CPU(struct timer_passth_info, passth_info);
+
+static int override_lapic_next_event(unsigned long delta,
+		struct clock_event_device *evt)
+{
+	struct timer_passth_info *local_timer_info;
+	u64 tsc;
+	u64 tscd;
+
+	local_timer_info = &per_cpu(passth_info, smp_processor_id());
+	tsc = rdtsc();
+	tscd = tsc + (((u64) delta) * TSC_DIVISOR);
+	local_timer_info->host_tscd = tscd;
+	wrmsrl(MSR_IA32_TSCDEADLINE, tscd);
+
+	return 0;
+}
+
+static void vmx_host_timer_passth_init(void *junk)
+{
+	struct timer_passth_info *local_timer_info;
+	int cpu = smp_processor_id();
+
+	local_timer_info = &per_cpu(passth_info, cpu);
+	local_timer_info->curr_dev = per_cpu(tick_cpu_device, cpu).evtdev;
+	local_timer_info->orig_set_next_event =
+		local_timer_info->curr_dev->set_next_event;
+	local_timer_info->curr_dev->set_next_event = override_lapic_next_event;
+}
+
+static void vmx_host_timer_restore(void *junk)
+{
+	struct timer_passth_info *local_timer_info;
+
+	local_timer_info = &per_cpu(passth_info, smp_processor_id());
+	local_timer_info->curr_dev->set_next_event =
+		local_timer_info->orig_set_next_event;
+}
+
 #define L1D_CACHE_ORDER 4
 static void *vmx_l1d_flush_pages;
 
@@ -7573,6 +7618,9 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
 
 static void hardware_unsetup(void)
 {
+	if (enable_timer_passth)
+		on_each_cpu(vmx_host_timer_restore, NULL, 1);
+
 	if (nested)
 		nested_vmx_hardware_unsetup();
 
@@ -7884,6 +7932,9 @@ static __init int hardware_setup(void)
 
 	vmx_set_cpu_caps();
 
+	if (enable_timer_passth)
+		on_each_cpu(vmx_host_timer_passth_init, NULL, 1);
+
 	r = alloc_kvm_area();
 	if (r)
 		nested_vmx_hardware_unsetup();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e545a8a613b1..5d353a9c9881 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9911,6 +9911,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.pending_external_vector = -1;
 	vcpu->arch.preempted_in_kernel = false;
+	vcpu->arch.timer_passth_enable = false;
 
 	kvm_hv_vcpu_init(vcpu);
 
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 6c9c342dd0e5..bc50f4a1a7c0 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -26,6 +26,7 @@
  * Tick devices
  */
 DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
+EXPORT_SYMBOL_GPL(tick_cpu_device);
 /*
  * Tick next event: keeps track of the tick time
  */
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC: timer passthrough 2/9] KVM: vmx: enable host lapic timer offload preemtion timer
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
  2021-02-05 10:03 ` [RFC: timer passthrough 1/9] KVM: vmx: hook set_next_event for getting the host tscd Zhimin Feng
@ 2021-02-05 10:03 ` Zhimin Feng
  2021-02-05 10:03 ` [RFC: timer passthrough 3/9] KVM: vmx: enable passthrough timer to guest Zhimin Feng
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-05 10:03 UTC (permalink / raw)
  To: x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

Use preemption timer to handle host timer

Signed-off-by: Zhimin Feng <fengzhimin@bytedance.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx/vmx.c          | 54 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |  1 +
 3 files changed, 56 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eb6a611963b7..82a51f0d01a2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -826,6 +826,7 @@ struct kvm_vcpu_arch {
 	} pv_cpuid;
 
 	bool timer_passth_enable;
+	u64 tscd;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 38b8d80fa157..0bf9941df842 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5629,6 +5629,13 @@ static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (vcpu->arch.timer_passth_enable) {
+		local_irq_disable();
+		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), LOCAL_TIMER_VECTOR);
+		local_irq_enable();
+
+		return EXIT_FASTPATH_NONE;
+	}
 	if (!vmx->req_immediate_exit &&
 	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
 		kvm_lapic_expired_hv_timer(vcpu);
@@ -6640,6 +6647,51 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 
 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 
+static void vmx_host_lapic_timer_offload(struct kvm_vcpu *vcpu)
+{
+	struct timer_passth_info *local_timer_info;
+	u64 tscl;
+	u64 guest_tscl;
+	u64 delta_tsc;
+	struct hrtimer *timer;
+
+	if (!vcpu->arch.timer_passth_enable)
+		return;
+
+	local_timer_info = &per_cpu(passth_info, smp_processor_id());
+
+	tscl = rdtsc();
+	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+
+	timer = &vcpu->arch.apic->lapic_timer.timer;
+	if (hrtimer_active(timer))
+		hrtimer_cancel(timer);
+
+	if (local_timer_info->host_tscd > tscl) {
+		delta_tsc = (u32)((local_timer_info->host_tscd - tscl) >>
+				cpu_preemption_timer_multi);
+		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+		vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+				PIN_BASED_VMX_PREEMPTION_TIMER);
+	} else {
+		vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+				PIN_BASED_VMX_PREEMPTION_TIMER);
+	}
+
+	wrmsrl(MSR_IA32_TSCDEADLINE, 0);
+	if (vcpu->arch.tscd > guest_tscl) {
+		wrmsrl(MSR_IA32_TSCDEADLINE, vcpu->arch.tscd);
+	} else {
+		if (vcpu->arch.tscd > 0) {
+			if (!atomic_read(&vcpu->arch.apic->lapic_timer.pending)) {
+				atomic_inc(&vcpu->arch.apic->lapic_timer.pending);
+				kvm_inject_pending_timer_irqs(vcpu);
+				kvm_x86_ops.sync_pir_to_irr(vcpu);
+			}
+		}
+	}
+}
+
 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 					struct vcpu_vmx *vmx)
 {
@@ -6761,6 +6813,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	kvm_wait_lapic_expire(vcpu);
 
+	vmx_host_lapic_timer_offload(vcpu);
+
 	/*
 	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
 	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5d353a9c9881..e51fd52a4862 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9912,6 +9912,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	vcpu->arch.pending_external_vector = -1;
 	vcpu->arch.preempted_in_kernel = false;
 	vcpu->arch.timer_passth_enable = false;
+	vcpu->arch.tscd = 0;
 
 	kvm_hv_vcpu_init(vcpu);
 
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC: timer passthrough 3/9] KVM: vmx: enable passthrough timer to guest
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
  2021-02-05 10:03 ` [RFC: timer passthrough 1/9] KVM: vmx: hook set_next_event for getting the host tscd Zhimin Feng
  2021-02-05 10:03 ` [RFC: timer passthrough 2/9] KVM: vmx: enable host lapic timer offload preemtion timer Zhimin Feng
@ 2021-02-05 10:03 ` Zhimin Feng
  2021-02-05 10:03 ` [RFC: timer passthrough 4/9] KVM: vmx: enable passth timer switch to sw timer Zhimin Feng
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-05 10:03 UTC (permalink / raw)
  To: x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

Allow Guest to write tscdeadline msr directly.

Signed-off-by: Zhimin Feng <fengzhimin@bytedance.com>
---
 arch/x86/include/asm/kvm_host.h |  3 +++
 arch/x86/kvm/lapic.c            |  9 +++++++
 arch/x86/kvm/vmx/vmx.c          | 56 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 82a51f0d01a2..500fa031297d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -533,6 +533,7 @@ struct tick_device {
 
 struct timer_passth_info {
 	u64 host_tscd;
+	bool host_in_tscdeadline;
 	struct clock_event_device *curr_dev;
 
 	void (*orig_event_handler)(struct clock_event_device *dev);
@@ -1302,6 +1303,8 @@ struct kvm_x86_ops {
 
 	void (*migrate_timers)(struct kvm_vcpu *vcpu);
 	void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
+	void (*set_timer_passthrough)(struct kvm_vcpu *vcpu, bool enable);
+	int (*host_timer_can_passth)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 86c33d53c90a..9b2f8b99fbf6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1508,6 +1508,15 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
 		}
 		apic->lapic_timer.timer_mode = timer_mode;
 		limit_periodic_timer_frequency(apic);
+
+		if (kvm_x86_ops.host_timer_can_passth(apic->vcpu)) {
+			if (apic_lvtt_tscdeadline(apic)) {
+				kvm_x86_ops.set_timer_passthrough(apic->vcpu, true);
+			} else {
+				if (apic->vcpu->arch.timer_passth_enable)
+					kvm_x86_ops.set_timer_passthrough(apic->vcpu, false);
+			}
+		}
 	}
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0bf9941df842..0c1b5ee4bb8e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -47,6 +47,7 @@
 #include <asm/spec-ctrl.h>
 #include <asm/virtext.h>
 #include <asm/vmx.h>
+#include <asm/apicdef.h>
 
 #include "capabilities.h"
 #include "cpuid.h"
@@ -705,6 +706,8 @@ static bool is_valid_passthrough_msr(u32 msr)
 	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
 		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
 		return true;
+	case MSR_IA32_TSC_DEADLINE:
+		return true;
 	}
 
 	r = possible_passthrough_msr_slot(msr) != -ENOENT;
@@ -7670,6 +7673,54 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void host_lapic_timer_in_deadline(void *junk)
+{
+	unsigned int v;
+	struct timer_passth_info *local_timer_info;
+	int cpu = smp_processor_id();
+
+	local_timer_info = &per_cpu(passth_info, cpu);
+	v = apic_read(APIC_LVTT);
+	local_timer_info->host_in_tscdeadline = (v & APIC_LVT_TIMER_TSCDEADLINE);
+}
+
+static bool host_all_cpu_in_tscdeadline(void)
+{
+	int cpu;
+	struct timer_passth_info *local_timer_info;
+
+	for_each_online_cpu(cpu) {
+		local_timer_info = &per_cpu(passth_info, cpu);
+		if (!local_timer_info->host_in_tscdeadline)
+			return false;
+	}
+
+	return true;
+}
+
+static int vmx_host_timer_can_passth(struct kvm_vcpu *vcpu)
+{
+	if (!enable_timer_passth || !cpu_has_vmx_msr_bitmap() ||
+			!host_all_cpu_in_tscdeadline())
+		return 0;
+	return 1;
+}
+
+static void vmx_set_timer_passthrough(struct kvm_vcpu *vcpu, bool enable)
+{
+	if (enable) {
+		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC_DEADLINE,
+									  MSR_TYPE_RW);
+		vcpu->arch.timer_passth_enable = 1;
+	} else {
+		vmx_enable_intercept_for_msr(vcpu, MSR_IA32_TSC_DEADLINE,
+									 MSR_TYPE_RW);
+		vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+				PIN_BASED_VMX_PREEMPTION_TIMER);
+		vcpu->arch.timer_passth_enable = 0;
+	}
+}
+
 static void hardware_unsetup(void)
 {
 	if (enable_timer_passth)
@@ -7817,6 +7868,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.migrate_timers = vmx_migrate_timers,
 
 	.msr_filter_changed = vmx_msr_filter_changed,
+	.set_timer_passthrough = vmx_set_timer_passthrough,
+	.host_timer_can_passth = vmx_host_timer_can_passth,
 };
 
 static __init int hardware_setup(void)
@@ -7987,6 +8040,9 @@ static __init int hardware_setup(void)
 	vmx_set_cpu_caps();
 
 	if (enable_timer_passth)
+		on_each_cpu(host_lapic_timer_in_deadline, NULL, 1);
+
+	if (enable_timer_passth)
 		on_each_cpu(vmx_host_timer_passth_init, NULL, 1);
 
 	r = alloc_kvm_area();
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC: timer passthrough 4/9] KVM: vmx: enable passth timer switch to sw timer
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
                   ` (2 preceding siblings ...)
  2021-02-05 10:03 ` [RFC: timer passthrough 3/9] KVM: vmx: enable passthrough timer to guest Zhimin Feng
@ 2021-02-05 10:03 ` Zhimin Feng
  2021-02-05 10:03 ` [RFC: timer passthrough 5/9] KVM: vmx: use tsc_adjust to enable tsc_offset timer passthrough Zhimin Feng
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-05 10:03 UTC (permalink / raw)
  To: x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

Switch the guest timer to a software timer when the
vCPU blocks or leaves the run loop.

Signed-off-by: Zhimin Feng <fengzhimin@bytedance.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx/vmx.c          | 65 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              | 12 ++++++--
 3 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 500fa031297d..be8fc230f7c4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1305,6 +1305,7 @@ struct kvm_x86_ops {
 	void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
 	void (*set_timer_passthrough)(struct kvm_vcpu *vcpu, bool enable);
 	int (*host_timer_can_passth)(struct kvm_vcpu *vcpu);
+	void (*switch_to_sw_timer)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 42cf0a3ad493..f824ee46e2d3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -28,6 +28,8 @@
 #include <linux/tboot.h>
 #include <linux/trace_events.h>
 #include <linux/entry-kvm.h>
+#include <linux/percpu.h>
+#include <linux/tick.h>
 
 #include <asm/apic.h>
 #include <asm/asm.h>
@@ -6702,6 +6704,27 @@ static void vmx_host_lapic_timer_offload(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void vmx_restore_passth_timer(struct kvm_vcpu *vcpu)
+{
+	struct timer_passth_info *local_timer_info;
+	u64 host_tscd;
+	u64 guest_tscd;
+
+	if (vcpu->arch.timer_passth_enable) {
+		local_timer_info = &per_cpu(passth_info, smp_processor_id());
+		host_tscd = local_timer_info->host_tscd;
+		rdmsrl(MSR_IA32_TSC_DEADLINE, guest_tscd);
+
+		if (guest_tscd != 0 &&
+			guest_tscd != host_tscd) {
+			vcpu->arch.tscd = guest_tscd;
+		}
+
+		if (host_tscd > rdtsc())
+			wrmsrl(MSR_IA32_TSC_DEADLINE, host_tscd);
+	}
+}
+
 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 					struct vcpu_vmx *vmx)
 {
@@ -6836,6 +6859,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
 	vmx_vcpu_enter_exit(vcpu, vmx);
 
+	vmx_restore_passth_timer(vcpu);
 	/*
 	 * We do not use IBRS in the kernel. If this vCPU has used the
 	 * SPEC_CTRL MSR it may have left it on; save the value and
@@ -7589,11 +7613,50 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 }
 
+static void vmx_passth_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_timer *ktimer;
+	ktime_t expire;
+	u64 guest_tscl;
+	ktime_t now;
+	u64 ns;
+	unsigned long flags;
+	unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+
+	ktimer = &vcpu->arch.apic->lapic_timer;
+	if (hrtimer_active(&ktimer->timer))
+		return;
+
+	local_irq_save(flags);
+	now = ktime_get();
+
+	guest_tscl = kvm_read_l1_tsc(vcpu, rdtsc());
+	ns = (vcpu->arch.tscd - guest_tscl) * 1000000ULL;
+	do_div(ns, this_tsc_khz);
+	if (likely(vcpu->arch.tscd > guest_tscl) &&
+		likely(ns > ktimer->timer_advance_ns)) {
+		expire = ktime_add_ns(now, ns);
+		expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
+		hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED);
+	} else {
+		if (vcpu->arch.tscd > 0) {
+			if (!atomic_read(&vcpu->arch.apic->lapic_timer.pending)) {
+				atomic_inc(&vcpu->arch.apic->lapic_timer.pending);
+				kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+			}
+		}
+	}
+
+	local_irq_restore(flags);
+}
+
 static int vmx_pre_block(struct kvm_vcpu *vcpu)
 {
 	if (pi_pre_block(vcpu))
 		return 1;
 
+	vmx_passth_switch_to_sw_timer(vcpu);
+
 	if (kvm_lapic_hv_timer_in_use(vcpu))
 		kvm_lapic_switch_to_sw_timer(vcpu);
 
@@ -7722,6 +7785,7 @@ static void vmx_set_timer_passthrough(struct kvm_vcpu *vcpu, bool enable)
 	} else {
 		vmx_enable_intercept_for_msr(vcpu, MSR_IA32_TSC_DEADLINE,
 									 MSR_TYPE_RW);
+		vmx_passth_switch_to_sw_timer(vcpu);
 		vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
 				PIN_BASED_VMX_PREEMPTION_TIMER);
 		vcpu->arch.timer_passth_enable = 0;
@@ -7877,6 +7941,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.msr_filter_changed = vmx_msr_filter_changed,
 	.set_timer_passthrough = vmx_set_timer_passthrough,
 	.host_timer_can_passth = vmx_host_timer_can_passth,
+	.switch_to_sw_timer = vmx_passth_switch_to_sw_timer,
 };
 
 static __init int hardware_setup(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e51fd52a4862..2b4aa925d6d9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9094,8 +9094,11 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 			r = vcpu_block(kvm, vcpu);
 		}
 
-		if (r <= 0)
+		if (r <= 0) {
+			if (kvm_x86_ops.switch_to_sw_timer)
+				kvm_x86_ops.switch_to_sw_timer(vcpu);
 			break;
+		}
 
 		kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
 		if (kvm_cpu_has_pending_timer(vcpu))
@@ -9106,14 +9109,19 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 			r = 0;
 			vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 			++vcpu->stat.request_irq_exits;
+			if (kvm_x86_ops.switch_to_sw_timer)
+				kvm_x86_ops.switch_to_sw_timer(vcpu);
 			break;
 		}
 
 		if (__xfer_to_guest_mode_work_pending()) {
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 			r = xfer_to_guest_mode_handle_work(vcpu);
-			if (r)
+			if (r) {
+				if (kvm_x86_ops.switch_to_sw_timer)
+					kvm_x86_ops.switch_to_sw_timer(vcpu);
 				return r;
+			}
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 		}
 	}
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC: timer passthrough 5/9] KVM: vmx: use tsc_adjust to enable tsc_offset timer passthrough
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
                   ` (3 preceding siblings ...)
  2021-02-05 10:03 ` [RFC: timer passthrough 4/9] KVM: vmx: enable passth timer switch to sw timer Zhimin Feng
@ 2021-02-05 10:03 ` Zhimin Feng
  2021-02-05 12:43   ` Peter Zijlstra
  2021-02-05 10:03 ` [RFC: timer passthrough 6/9] KVM: vmx: check enable_timer_passth strictly Zhimin Feng
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 18+ messages in thread
From: Zhimin Feng @ 2021-02-05 10:03 UTC (permalink / raw)
  To: x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

When running in the VM:
rdtsc = host_tsc * (TSC multiplier) + tsc_offset (< 0)
So when the VM writes the TSC deadline MSR, the value is always less than
the time-stamp counter MSR value, and the IRQ is never triggered.

The TSC_ADJUST MSR is used as follows. When the host executes rdtsc:
rdtsc = host_tsc + tsc_adjust

On VM entry, we set tsc_adjust equal to tsc_offset and the VMCS
TSC offset field to 0, so when the VM executes rdtsc the result is:
rdtsc = host_tsc + tsc_adjust + 0
The TSC deadline MSR value is then in the same time base as the
time-stamp counter MSR, and the IRQ triggers successfully.

Signed-off-by: Zhimin Feng <fengzhimin@bytedance.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx/vmx.c          | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index be8fc230f7c4..7971c9e755a4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -534,6 +534,7 @@ struct tick_device {
 struct timer_passth_info {
 	u64 host_tscd;
 	bool host_in_tscdeadline;
+	u64 host_tsc_adjust;
 	struct clock_event_device *curr_dev;
 
 	void (*orig_event_handler)(struct clock_event_device *dev);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f824ee46e2d3..44b2fd59587e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6659,6 +6659,27 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 
 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, bool to_host)
+{
+	u64 tsc_adjust;
+	struct timer_passth_info *local_timer_info;
+
+	local_timer_info = &per_cpu(passth_info, smp_processor_id());
+
+	if (to_host) {
+		tsc_adjust = local_timer_info->host_tsc_adjust;
+		wrmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust);
+		vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+	} else {
+		rdmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust);
+		local_timer_info->host_tsc_adjust = tsc_adjust;
+
+		wrmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust + vcpu->arch.tsc_offset);
+		vmcs_write64(TSC_OFFSET, 0);
+
+	}
+}
+
 static void vmx_host_lapic_timer_offload(struct kvm_vcpu *vcpu)
 {
 	struct timer_passth_info *local_timer_info;
@@ -6690,6 +6711,7 @@ static void vmx_host_lapic_timer_offload(struct kvm_vcpu *vcpu)
 				PIN_BASED_VMX_PREEMPTION_TIMER);
 	}
 
+	vmx_adjust_tsc_offset(vcpu, false);
 	wrmsrl(MSR_IA32_TSCDEADLINE, 0);
 	if (vcpu->arch.tscd > guest_tscl) {
 		wrmsrl(MSR_IA32_TSCDEADLINE, vcpu->arch.tscd);
@@ -6711,6 +6733,7 @@ static void vmx_restore_passth_timer(struct kvm_vcpu *vcpu)
 	u64 guest_tscd;
 
 	if (vcpu->arch.timer_passth_enable) {
+		vmx_adjust_tsc_offset(vcpu, true);
 		local_timer_info = &per_cpu(passth_info, smp_processor_id());
 		host_tscd = local_timer_info->host_tscd;
 		rdmsrl(MSR_IA32_TSC_DEADLINE, guest_tscd);
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC: timer passthrough 6/9] KVM: vmx: check enable_timer_passth strictly
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
                   ` (4 preceding siblings ...)
  2021-02-05 10:03 ` [RFC: timer passthrough 5/9] KVM: vmx: use tsc_adjust to enable tsc_offset timer passthrough Zhimin Feng
@ 2021-02-05 10:03 ` Zhimin Feng
  2021-02-05 10:03 ` [RFC: timer passthrough 7/9] KVM: vmx: save the initial value of host tscd Zhimin Feng
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-05 10:03 UTC (permalink / raw)
  To: x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

The preemption timer is disabled by default.
Timer passthrough is enabled by default.

Signed-off-by: Zhimin Feng <fengzhimin@bytedance.com>
---
 arch/x86/kvm/vmx/vmx.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 44b2fd59587e..a12da3cef86d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -126,12 +126,12 @@ module_param(dump_invalid_vmcs, bool, 0644);
 
 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
 static int __read_mostly cpu_preemption_timer_multi;
-static bool __read_mostly enable_preemption_timer = 1;
+static bool __read_mostly enable_preemption_timer;
 #ifdef CONFIG_X86_64
 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 #endif
 
-static bool __read_mostly enable_timer_passth;
+static bool __read_mostly enable_timer_passth = 1;
 #ifdef CONFIG_X86_64
 module_param_named(timer_passth, enable_timer_passth, bool, 0444);
 #endif
@@ -8108,12 +8108,17 @@ static __init int hardware_setup(void)
 			enable_preemption_timer = false;
 	}
 
-	if (!enable_preemption_timer) {
+	if (!enable_preemption_timer || enable_timer_passth) {
 		vmx_x86_ops.set_hv_timer = NULL;
 		vmx_x86_ops.cancel_hv_timer = NULL;
 		vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
 	}
 
+	if (enable_preemption_timer && enable_timer_passth) {
+		pr_err("cannot enable timer passthrough and preemption timer same timer\n");
+		return -EINVAL;
+	}
+
 	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
 
 	kvm_mce_cap_supported |= MCG_LMCE_P;
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC: timer passthrough 7/9] KVM: vmx: save the initial value of host tscd
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
                   ` (5 preceding siblings ...)
  2021-02-05 10:03 ` [RFC: timer passthrough 6/9] KVM: vmx: check enable_timer_passth strictly Zhimin Feng
@ 2021-02-05 10:03 ` Zhimin Feng
  2021-02-05 10:03 ` [RFC: timer passthrough 8/9] KVM: vmx: Dynamically open or close the timer-passthrough for pre-vm Zhimin Feng
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-05 10:03 UTC (permalink / raw)
  To: x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

Record the host tscd value.

Signed-off-by: Zhimin Feng <fengzhimin@bytedance.com>
---
 arch/x86/kvm/vmx/vmx.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a12da3cef86d..98eca70d4251 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -251,8 +251,11 @@ static void vmx_host_timer_passth_init(void *junk)
 {
 	struct timer_passth_info *local_timer_info;
 	int cpu = smp_processor_id();
+	u64 tscd;
 
 	local_timer_info = &per_cpu(passth_info, cpu);
+	rdmsrl(MSR_IA32_TSC_DEADLINE, tscd);
+	local_timer_info->host_tscd = tscd;
 	local_timer_info->curr_dev = per_cpu(tick_cpu_device, cpu).evtdev;
 	local_timer_info->orig_set_next_event =
 		local_timer_info->curr_dev->set_next_event;
@@ -266,6 +269,7 @@ static void vmx_host_timer_restore(void *junk)
 	local_timer_info = &per_cpu(passth_info, smp_processor_id());
 	local_timer_info->curr_dev->set_next_event =
 		local_timer_info->orig_set_next_event;
+	local_timer_info->host_tscd = 0;
 }
 
 #define L1D_CACHE_ORDER 4
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC: timer passthrough 8/9] KVM: vmx: Dynamically open or close the timer-passthrough for pre-vm
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
                   ` (6 preceding siblings ...)
  2021-02-05 10:03 ` [RFC: timer passthrough 7/9] KVM: vmx: save the initial value of host tscd Zhimin Feng
@ 2021-02-05 10:03 ` Zhimin Feng
  2021-02-05 10:03 ` [RFC: timer passthrough 9/9] KVM: vmx: query the state of timer-passth for vm Zhimin Feng
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-05 10:03 UTC (permalink / raw)
  To: x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

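Timer passthrough is disabled by default. It can be switched on or off
per VM at runtime through the new KVM_SET_TIMER_PASSTH_STATE ioctl.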

Signed-off-by: Zhimin Feng <fengzhimin@bytedance.com>
---
 arch/x86/include/asm/kvm_host.h |  3 +--
 arch/x86/kvm/lapic.c            | 10 +-------
 arch/x86/kvm/vmx/vmx.c          | 52 +++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/x86.c              |  6 +++++
 include/linux/kvm_host.h        |  1 +
 include/uapi/linux/kvm.h        |  2 ++
 tools/include/uapi/linux/kvm.h  |  2 ++
 virt/kvm/kvm_main.c             |  1 +
 8 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7971c9e755a4..9855ef419793 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1304,9 +1304,8 @@ struct kvm_x86_ops {
 
 	void (*migrate_timers)(struct kvm_vcpu *vcpu);
 	void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
-	void (*set_timer_passthrough)(struct kvm_vcpu *vcpu, bool enable);
-	int (*host_timer_can_passth)(struct kvm_vcpu *vcpu);
 	void (*switch_to_sw_timer)(struct kvm_vcpu *vcpu);
+	int (*set_timer_passth_state)(struct kvm *kvm, void *argp);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9b2f8b99fbf6..9ba4157f9b81 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1508,15 +1508,6 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
 		}
 		apic->lapic_timer.timer_mode = timer_mode;
 		limit_periodic_timer_frequency(apic);
-
-		if (kvm_x86_ops.host_timer_can_passth(apic->vcpu)) {
-			if (apic_lvtt_tscdeadline(apic)) {
-				kvm_x86_ops.set_timer_passthrough(apic->vcpu, true);
-			} else {
-				if (apic->vcpu->arch.timer_passth_enable)
-					kvm_x86_ops.set_timer_passthrough(apic->vcpu, false);
-			}
-		}
 	}
 }
 
@@ -2219,6 +2210,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
 
 	hrtimer_cancel(&apic->lapic_timer.timer);
 	apic->lapic_timer.tscdeadline = data;
+	vcpu->arch.tscd = data;
 	start_apic_timer(apic);
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 98eca70d4251..b88f744478e9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -216,6 +216,8 @@ static DEFINE_MUTEX(vmx_l1d_flush_mutex);
 /* Storage for pre module init parameter parsing */
 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
 
+static void vmx_set_timer_passthrough(struct kvm_vcpu *vcpu, bool enable);
+
 static const struct {
 	const char *option;
 	bool for_parse;
@@ -6742,9 +6744,9 @@ static void vmx_restore_passth_timer(struct kvm_vcpu *vcpu)
 		host_tscd = local_timer_info->host_tscd;
 		rdmsrl(MSR_IA32_TSC_DEADLINE, guest_tscd);
 
-		if (guest_tscd != 0 &&
-			guest_tscd != host_tscd) {
+		if (guest_tscd != 0 && guest_tscd != host_tscd) {
 			vcpu->arch.tscd = guest_tscd;
+			vcpu->arch.apic->lapic_timer.tscdeadline = vcpu->arch.tscd;
 		}
 
 		if (host_tscd > rdtsc())
@@ -6873,6 +6875,15 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	kvm_wait_lapic_expire(vcpu);
 
+	if (vcpu->arch.timer_passth_enable) {
+		if (!atomic_read(&vcpu->kvm->timer_passth_state)) {
+			vcpu->arch.apic->lapic_timer.tscdeadline =
+				vcpu->arch.tscd;
+			vmx_set_timer_passthrough(vcpu, false);
+		}
+	} else if (atomic_read(&vcpu->kvm->timer_passth_state)) {
+		vmx_set_timer_passthrough(vcpu, true);
+	}
 	vmx_host_lapic_timer_offload(vcpu);
 
 	/*
@@ -7838,6 +7849,40 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
 	return supported & BIT(bit);
 }
 
+static int vmx_set_timer_passth_state(struct kvm *kvm, void *argp)
+{
+	int r = -1;
+	int i;
+	struct kvm_vcpu *vcpu;
+	int state;
+
+	if (copy_from_user(&state, argp, sizeof(int)))
+		goto out;
+
+	if (!!state) {
+		/* judge whether support timer-pasth */
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (!vmx_host_timer_can_passth(vcpu) ||
+				(vcpu->arch.apic->lapic_timer.timer_mode !=
+				APIC_LVT_TIMER_TSCDEADLINE)) {
+				pr_err("host don't support timer passthrough\n");
+				goto out;
+			}
+		}
+	}
+
+	if (kvm->timer_passth_state.counter != (!!state)) {
+		atomic_set(&kvm->timer_passth_state, !!state);
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			kvm_vcpu_kick(vcpu);
+		}
+	}
+	r = 0;
+
+out:
+	return r;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.hardware_unsetup = hardware_unsetup,
 
@@ -7966,9 +8011,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.migrate_timers = vmx_migrate_timers,
 
 	.msr_filter_changed = vmx_msr_filter_changed,
-	.set_timer_passthrough = vmx_set_timer_passthrough,
-	.host_timer_can_passth = vmx_host_timer_can_passth,
 	.switch_to_sw_timer = vmx_passth_switch_to_sw_timer,
+	.set_timer_passth_state = vmx_set_timer_passth_state,
 };
 
 static __init int hardware_setup(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b4aa925d6d9..7db74bd9d362 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5692,6 +5692,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_X86_SET_MSR_FILTER:
 		r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
 		break;
+	case KVM_SET_TIMER_PASSTH_STATE: {
+		r = -EFAULT;
+		if (kvm_x86_ops.set_timer_passth_state)
+			r = kvm_x86_ops.set_timer_passth_state(kvm, argp);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7f2e2a09ebbd..b3de12c3f473 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -505,6 +505,7 @@ struct kvm {
 	struct srcu_struct irq_srcu;
 	pid_t userspace_pid;
 	unsigned int max_halt_poll_ns;
+	atomic_t timer_passth_state;
 };
 
 #define kvm_err(fmt, ...) \
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ca41220b40b8..6e26bc342599 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1557,6 +1557,8 @@ struct kvm_pv_cmd {
 /* Available with KVM_CAP_X86_MSR_FILTER */
 #define KVM_X86_SET_MSR_FILTER	_IOW(KVMIO,  0xc6, struct kvm_msr_filter)
 
+#define KVM_SET_TIMER_PASSTH_STATE  _IO(KVMIO,   0xc7)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index ca41220b40b8..6e26bc342599 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1557,6 +1557,8 @@ struct kvm_pv_cmd {
 /* Available with KVM_CAP_X86_MSR_FILTER */
 #define KVM_X86_SET_MSR_FILTER	_IOW(KVMIO,  0xc6, struct kvm_msr_filter)
 
+#define KVM_SET_TIMER_PASSTH_STATE  _IO(KVMIO,   0xc7)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2541a17ff1c4..7e7a3adede62 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -751,6 +751,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
 	INIT_LIST_HEAD(&kvm->devices);
+	atomic_set(&kvm->timer_passth_state, 0);
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC: timer passthrough 9/9] KVM: vmx: query the state of timer-passth for vm
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
                   ` (7 preceding siblings ...)
  2021-02-05 10:03 ` [RFC: timer passthrough 8/9] KVM: vmx: Dynamically open or close the timer-passthrough for pre-vm Zhimin Feng
@ 2021-02-05 10:03 ` Zhimin Feng
  2021-02-05 13:34 ` [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Paolo Bonzini
  2021-02-08 18:13 ` Konrad Rzeszutek Wilk
  10 siblings, 0 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-05 10:03 UTC (permalink / raw)
  To: x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, tglx,
	mingo, bp, hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

Query the timer passthrough state of a specific VM.

Signed-off-by: Zhimin Feng <fengzhimin@bytedance.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx/vmx.c          | 11 +++++++++++
 arch/x86/kvm/x86.c              |  6 ++++++
 include/uapi/linux/kvm.h        |  1 +
 tools/include/uapi/linux/kvm.h  |  1 +
 5 files changed, 20 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9855ef419793..189c4f6f9d5d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1306,6 +1306,7 @@ struct kvm_x86_ops {
 	void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
 	void (*switch_to_sw_timer)(struct kvm_vcpu *vcpu);
 	int (*set_timer_passth_state)(struct kvm *kvm, void *argp);
+	int (*get_timer_passth_state)(struct kvm *kvm, void __user *argp);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b88f744478e9..b760aa7bc6d5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7883,6 +7883,16 @@ static int vmx_set_timer_passth_state(struct kvm *kvm, void *argp)
 	return r;
 }
 
+static int vmx_get_timer_passth_state(struct kvm *kvm, void __user *argp)
+{
+	int state = atomic_read(&kvm->timer_passth_state);
+
+	if (copy_to_user(argp, &state, sizeof(state)))
+		return -1;
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.hardware_unsetup = hardware_unsetup,
 
@@ -8013,6 +8023,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.msr_filter_changed = vmx_msr_filter_changed,
 	.switch_to_sw_timer = vmx_passth_switch_to_sw_timer,
 	.set_timer_passth_state = vmx_set_timer_passth_state,
+	.get_timer_passth_state = vmx_get_timer_passth_state,
 };
 
 static __init int hardware_setup(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7db74bd9d362..a32927697e82 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5698,6 +5698,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			r = kvm_x86_ops.set_timer_passth_state(kvm, argp);
 		break;
 	}
+	case KVM_GET_TIMER_PASSTH_STATE: {
+		r = -EFAULT;
+		if (kvm_x86_ops.get_timer_passth_state)
+			r = kvm_x86_ops.get_timer_passth_state(kvm, argp);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 6e26bc342599..2c0cefb8ffe2 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1558,6 +1558,7 @@ struct kvm_pv_cmd {
 #define KVM_X86_SET_MSR_FILTER	_IOW(KVMIO,  0xc6, struct kvm_msr_filter)
 
 #define KVM_SET_TIMER_PASSTH_STATE  _IO(KVMIO,   0xc7)
+#define KVM_GET_TIMER_PASSTH_STATE  _IO(KVMIO,   0xc8)
 
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 6e26bc342599..2c0cefb8ffe2 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1558,6 +1558,7 @@ struct kvm_pv_cmd {
 #define KVM_X86_SET_MSR_FILTER	_IOW(KVMIO,  0xc6, struct kvm_msr_filter)
 
 #define KVM_SET_TIMER_PASSTH_STATE  _IO(KVMIO,   0xc7)
+#define KVM_GET_TIMER_PASSTH_STATE  _IO(KVMIO,   0xc8)
 
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC: timer passthrough 1/9] KVM: vmx: hook set_next_event for getting the host tscd
  2021-02-05 10:03 ` [RFC: timer passthrough 1/9] KVM: vmx: hook set_next_event for getting the host tscd Zhimin Feng
@ 2021-02-05 12:28   ` Peter Zijlstra
  2021-02-05 18:11   ` Thomas Gleixner
  1 sibling, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2021-02-05 12:28 UTC (permalink / raw)
  To: Zhimin Feng
  Cc: x86, kvm, linux-kernel, pbonzini, seanjc, vkuznets, wanpengli,
	jmattson, joro, tglx, mingo, bp, hpa, fweisbec, zhouyibo,
	zhanghaozhong

On Fri, Feb 05, 2021 at 06:03:09PM +0800, Zhimin Feng wrote:
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index 6c9c342dd0e5..bc50f4a1a7c0 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -26,6 +26,7 @@
>   * Tick devices
>   */
>  DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
> +EXPORT_SYMBOL_GPL(tick_cpu_device);
>  /*
>   * Tick next event: keeps track of the tick time
>   */

Oh heck no. Modules have no business what so ever accessing this.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC: timer passthrough 5/9] KVM: vmx: use tsc_adjust to enable tsc_offset timer passthrough
  2021-02-05 10:03 ` [RFC: timer passthrough 5/9] KVM: vmx: use tsc_adjust to enable tsc_offset timer passthrough Zhimin Feng
@ 2021-02-05 12:43   ` Peter Zijlstra
  2021-02-05 18:22     ` Thomas Gleixner
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2021-02-05 12:43 UTC (permalink / raw)
  To: Zhimin Feng
  Cc: x86, kvm, linux-kernel, pbonzini, seanjc, vkuznets, wanpengli,
	jmattson, joro, tglx, mingo, bp, hpa, fweisbec, zhouyibo,
	zhanghaozhong

On Fri, Feb 05, 2021 at 06:03:13PM +0800, Zhimin Feng wrote:
> when in vm:
> rdtsc = host_tsc * (TSC multiplier) + tsc_offset(<0)
> so when vm write tsc_deadline_msr the value always less than
> tsc stampcounter msr value, the irq never be triggered.
> 
> the tsc_adjust msr use as below, host execute
> rdtsc = host_tsc + tsc_adjust
> 
> when vmentry, we set the tsc_adjust equal tsc_offset and vmcs
> tsc offset filed equal 0, so the vm execute rdtsc the result like this:
> rdtsc = host_tsc + tsc_adjust + 0
> the tsc_deadline_msr value will equal tsc stampcounter msr and
> the irq will trigger success.

That above is unintelligible..

> +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, bool to_host)
> +{
> +	u64 tsc_adjust;
> +	struct timer_passth_info *local_timer_info;
> +
> +	local_timer_info = &per_cpu(passth_info, smp_processor_id());
> +
> +	if (to_host) {
> +		tsc_adjust = local_timer_info->host_tsc_adjust;
> +		wrmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust);
> +		vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
> +	} else {
> +		rdmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust);
> +		local_timer_info->host_tsc_adjust = tsc_adjust;
> +
> +		wrmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust + vcpu->arch.tsc_offset);
> +		vmcs_write64(TSC_OFFSET, 0);
> +	}
> +}

NAK

This wrecks the host TSC value, any host code between this and actually
entering that VM will observe batshit time.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
                   ` (8 preceding siblings ...)
  2021-02-05 10:03 ` [RFC: timer passthrough 9/9] KVM: vmx: query the state of timer-passth for vm Zhimin Feng
@ 2021-02-05 13:34 ` Paolo Bonzini
  2021-02-08 18:13 ` Konrad Rzeszutek Wilk
  10 siblings, 0 replies; 18+ messages in thread
From: Paolo Bonzini @ 2021-02-05 13:34 UTC (permalink / raw)
  To: Zhimin Feng, x86, kvm, linux-kernel
  Cc: seanjc, vkuznets, wanpengli, jmattson, joro, tglx, mingo, bp,
	hpa, fweisbec, zhouyibo, zhanghaozhong

On 05/02/21 11:03, Zhimin Feng wrote:
> The main motivation for this patch is to improve the performance of VM.
> This patch series introduces how to enable the timer passthrough in
> non-root mode.
> 
> The main idea is to offload the host timer to the preemtion timer in
> non-root mode. Through doing this, guest can write tscdeadline msr directly
> in non-root mode and host timer isn't lost. If CPU is in root mode,
> guest timer is switched to software timer.
> 
> Testing on Intel(R) Xeon(R) Platinum 8260 server.
> 
> The guest OS is Debian(kernel: 4.19.28). The specific configuration is
>   is as follows: 8 cpu, 16GB memory, guest idle=poll
> memcached in guest(memcached -d -t 8 -u root)
> 
> I use the memtier_benchmark tool to test performance
> (memtier_benchmark -P memcache_text -s guest_ip -c 16 -t 32
>   --key-maximum=10000000000 --random-data --data-size-range=64-128 -p 11211
>   --generate-keys --ratio 5:1 --test-time=500)
> 
> Total Ops can be improved 25% and Avg.Latency can be improved 20% when
> the timer-passthrough is enabled.

As Peter noticed, this is very invasive.  Perhaps you could try 
organizing the code like this:

1) just for the sake of these patches, completely disable the usage of 
the preemption timer

2) add a module parameter that:

- reads the TSC deadline MSR on vmentry and uses it to program the VMX 
preemption timer

- disables the host APIC timer while the guest runs

- injects a timer interrupt on preemption timer vmexits

3) also if the module parameter is 1, use the MSR autoload feature to 
use TSC_ADJUST instead of the VMCS TSC offset

4) also if the module parameter is 1 reintroduce the hv_timer callbacks, 
but this time to program the TSC deadline timer via MSR autoload

5) only when everything else is in place, figure out how to avoid the 
RDMSR for the TSC deadline MSR.

Thanks,

Paolo


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC: timer passthrough 1/9] KVM: vmx: hook set_next_event for getting the host tscd
  2021-02-05 10:03 ` [RFC: timer passthrough 1/9] KVM: vmx: hook set_next_event for getting the host tscd Zhimin Feng
  2021-02-05 12:28   ` Peter Zijlstra
@ 2021-02-05 18:11   ` Thomas Gleixner
  2021-02-23 13:07     ` [External] " Zhimin Feng
  1 sibling, 1 reply; 18+ messages in thread
From: Thomas Gleixner @ 2021-02-05 18:11 UTC (permalink / raw)
  To: Zhimin Feng, x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, mingo, bp,
	hpa, fweisbec, zhouyibo, zhanghaozhong, Zhimin Feng

On Fri, Feb 05 2021 at 18:03, Zhimin Feng wrote:
> @@ -520,6 +521,24 @@ struct kvm_vcpu_hv {
>  	cpumask_t tlb_flush;
>  };
>  
> +enum tick_device_mode {
> +	TICKDEV_MODE_PERIODIC,
> +	TICKDEV_MODE_ONESHOT,
> +};
> +
> +struct tick_device {
> +	struct clock_event_device *evtdev;
> +	enum tick_device_mode mode;
> +};

There is a reason why these things are defined in a header file which is
not public. Nothing outside of kernel/time/ has to fiddle with
this. Aside of that how are these things supposed to stay in sync?

> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index 6c9c342dd0e5..bc50f4a1a7c0 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -26,6 +26,7 @@
>   * Tick devices
>   */
>  DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
> +EXPORT_SYMBOL_GPL(tick_cpu_device);

Not going to happen ever.

> +#define TSC_DIVISOR  8
> +static DEFINE_PER_CPU(struct timer_passth_info, passth_info);
> +
> +static int override_lapic_next_event(unsigned long delta,
> +		struct clock_event_device *evt)
> +{
> +	struct timer_passth_info *local_timer_info;
> +	u64 tsc;
> +	u64 tscd;
> +
> +	local_timer_info = &per_cpu(passth_info, smp_processor_id());
> +	tsc = rdtsc();
> +	tscd = tsc + (((u64) delta) * TSC_DIVISOR);
> +	local_timer_info->host_tscd = tscd;
> +	wrmsrl(MSR_IA32_TSCDEADLINE, tscd);
> +	return 0;
> +}
> +
> +static void vmx_host_timer_passth_init(void *junk)
> +{
> +	struct timer_passth_info *local_timer_info;
> +	int cpu = smp_processor_id();
> +
> +	local_timer_info = &per_cpu(passth_info, cpu);
> +	local_timer_info->curr_dev = per_cpu(tick_cpu_device, cpu).evtdev;
> +	local_timer_info->orig_set_next_event =
> +		local_timer_info->curr_dev->set_next_event;
> +	local_timer_info->curr_dev->set_next_event = override_lapic_next_event;

So when loading the KVM module you steal the set_next_event pointer of
the clock event device which is currently active. What guarantees that

    1) The current active device is the tsc deadline timer
    2) The active device does not change

Nothing.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC: timer passthrough 5/9] KVM: vmx: use tsc_adjust to enable tsc_offset timer passthrough
  2021-02-05 12:43   ` Peter Zijlstra
@ 2021-02-05 18:22     ` Thomas Gleixner
  0 siblings, 0 replies; 18+ messages in thread
From: Thomas Gleixner @ 2021-02-05 18:22 UTC (permalink / raw)
  To: Peter Zijlstra, Zhimin Feng
  Cc: x86, kvm, linux-kernel, pbonzini, seanjc, vkuznets, wanpengli,
	jmattson, joro, mingo, bp, hpa, fweisbec, zhouyibo,
	zhanghaozhong

On Fri, Feb 05 2021 at 13:43, Peter Zijlstra wrote:
> On Fri, Feb 05, 2021 at 06:03:13PM +0800, Zhimin Feng wrote:
>> +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, bool to_host)
>> +{
>> +	u64 tsc_adjust;
>> +	struct timer_passth_info *local_timer_info;
>> +
>> +	local_timer_info = &per_cpu(passth_info, smp_processor_id());
>> +
>> +	if (to_host) {
>> +		tsc_adjust = local_timer_info->host_tsc_adjust;
>> +		wrmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust);
>> +		vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
>> +	} else {
>> +		rdmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust);
>> +		local_timer_info->host_tsc_adjust = tsc_adjust;
>> +
>> +		wrmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust + vcpu->arch.tsc_offset);
>> +		vmcs_write64(TSC_OFFSET, 0);
>> +	}
>> +}
>
> NAK
>
> This wrecks the host TSC value, any host code between this and actually
> entering that VM will observe batshit time.

VMCS TSC offset is there for a reason...

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM
  2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
                   ` (9 preceding siblings ...)
  2021-02-05 13:34 ` [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Paolo Bonzini
@ 2021-02-08 18:13 ` Konrad Rzeszutek Wilk
  2021-02-23 13:31   ` [External] " Zhimin Feng
  10 siblings, 1 reply; 18+ messages in thread
From: Konrad Rzeszutek Wilk @ 2021-02-08 18:13 UTC (permalink / raw)
  To: Zhimin Feng
  Cc: x86, kvm, linux-kernel, pbonzini, seanjc, vkuznets, wanpengli,
	jmattson, joro, tglx, mingo, bp, hpa, fweisbec, zhouyibo,
	zhanghaozhong

On Fri, Feb 05, 2021 at 06:03:08PM +0800, Zhimin Feng wrote:
> The main motivation for this patch is to improve the performance of VM.
> This patch series introduces how to enable the timer passthrough in
> non-root mode.

Nice! Those are impressive numbers!

> 
> The main idea is to offload the host timer to the preemtion timer in
> non-root mode. Through doing this, guest can write tscdeadline msr directly
> in non-root mode and host timer isn't lost. If CPU is in root mode,
> guest timer is switched to software timer.

I am sorry - but I am having a hard time understanding the sentence
above so let me ask some specific questions.

- How do you protect against the guest DoS-ing the host and mucking with
  the host timer?

- As in can you explain how the host can still continue scheduling its
  own quanta?

And one more - what happens with Live Migration? I would assume that
becomes a no-go anymore unless you swap in the guest timer back in? So
we end up emulating the MSR again?

Thanks!

> 
> Testing on Intel(R) Xeon(R) Platinum 8260 server.
> 
> The guest OS is Debian(kernel: 4.19.28). The specific configuration is
>  is as follows: 8 cpu, 16GB memory, guest idle=poll
> memcached in guest(memcached -d -t 8 -u root)
> 
> I use the memtier_benchmark tool to test performance
> (memtier_benchmark -P memcache_text -s guest_ip -c 16 -t 32
>  --key-maximum=10000000000 --random-data --data-size-range=64-128 -p 11211
>  --generate-keys --ratio 5:1 --test-time=500)
> 
> Total Ops can be improved 25% and Avg.Latency can be improved 20% when
> the timer-passthrough is enabled.
> 
> =============================================================
>                | Enable timer-passth | Disable timer-passth |
> =============================================================
> Totals Ops/sec |    514869.67        |     411766.67        |
> -------------------------------------------------------------
> Avg.Latency    |    0.99483          |     1.24294          |
> =============================================================
> 
> 
> Zhimin Feng (9):
>   KVM: vmx: hook set_next_event for getting the host tscd
>   KVM: vmx: enable host lapic timer offload preemtion timer
>   KVM: vmx: enable passthrough timer to guest
>   KVM: vmx: enable passth timer switch to sw timer
>   KVM: vmx: use tsc_adjust to enable tsc_offset timer passthrough
>   KVM: vmx: check enable_timer_passth strictly
>   KVM: vmx: save the initial value of host tscd
>   KVM: vmx: Dynamically open or close the timer-passthrough for pre-vm
>   KVM: vmx: query the state of timer-passth for vm
> 
>  arch/x86/include/asm/kvm_host.h |  27 ++++
>  arch/x86/kvm/lapic.c            |   1 +
>  arch/x86/kvm/vmx/vmx.c          | 331 +++++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c              |  26 +++-
>  include/linux/kvm_host.h        |   1 +
>  include/uapi/linux/kvm.h        |   3 +
>  kernel/time/tick-common.c       |   1 +
>  tools/include/uapi/linux/kvm.h  |   3 +
>  virt/kvm/kvm_main.c             |   1 +
>  9 files changed, 389 insertions(+), 5 deletions(-)
> 
> -- 
> 2.11.0
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [External] Re: [RFC: timer passthrough 1/9] KVM: vmx: hook set_next_event for getting the host tscd
  2021-02-05 18:11   ` Thomas Gleixner
@ 2021-02-23 13:07     ` Zhimin Feng
  0 siblings, 0 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-23 13:07 UTC (permalink / raw)
  To: Thomas Gleixner, x86, kvm, linux-kernel
  Cc: pbonzini, seanjc, vkuznets, wanpengli, jmattson, joro, mingo, bp,
	hpa, fweisbec, zhouyibo, zhanghaozhong

Hi tglx

That is a good question. We should check whether the currently active
device is the TSC deadline timer. I will fix this in V2.

Thanks

Zhimin

在 2021/2/6 上午2:11, Thomas Gleixner 写道:
> On Fri, Feb 05 2021 at 18:03, Zhimin Feng wrote:
>> @@ -520,6 +521,24 @@ struct kvm_vcpu_hv {
>>   	cpumask_t tlb_flush;
>>   };
>>   
>> +enum tick_device_mode {
>> +	TICKDEV_MODE_PERIODIC,
>> +	TICKDEV_MODE_ONESHOT,
>> +};
>> +
>> +struct tick_device {
>> +	struct clock_event_device *evtdev;
>> +	enum tick_device_mode mode;
>> +};
> There is a reason why these things are defined in a header file which is
> not public. Nothing outside of kernel/time/ has to fiddle with
> this. Aside of that how are these things supposed to stay in sync?
>
>> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
>> index 6c9c342dd0e5..bc50f4a1a7c0 100644
>> --- a/kernel/time/tick-common.c
>> +++ b/kernel/time/tick-common.c
>> @@ -26,6 +26,7 @@
>>    * Tick devices
>>    */
>>   DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
>> +EXPORT_SYMBOL_GPL(tick_cpu_device);
> Not going to happen ever.
>
>> +#define TSC_DIVISOR  8
>> +static DEFINE_PER_CPU(struct timer_passth_info, passth_info);
>> +
>> +static int override_lapic_next_event(unsigned long delta,
>> +		struct clock_event_device *evt)
>> +{
>> +	struct timer_passth_info *local_timer_info;
>> +	u64 tsc;
>> +	u64 tscd;
>> +
>> +	local_timer_info = &per_cpu(passth_info, smp_processor_id());
>> +	tsc = rdtsc();
>> +	tscd = tsc + (((u64) delta) * TSC_DIVISOR);
>> +	local_timer_info->host_tscd = tscd;
>> +	wrmsrl(MSR_IA32_TSCDEADLINE, tscd);
>> +	return 0;
>> +}
>> +
>> +static void vmx_host_timer_passth_init(void *junk)
>> +{
>> +	struct timer_passth_info *local_timer_info;
>> +	int cpu = smp_processor_id();
>> +
>> +	local_timer_info = &per_cpu(passth_info, cpu);
>> +	local_timer_info->curr_dev = per_cpu(tick_cpu_device, cpu).evtdev;
>> +	local_timer_info->orig_set_next_event =
>> +		local_timer_info->curr_dev->set_next_event;
>> +	local_timer_info->curr_dev->set_next_event = override_lapic_next_event;
> So when loading the KVM module you steal the set_next_event pointer of
> the clock event device which is currently active. What guarantees that
>
>      1) The current active device is the tsc deadline timer
>      2) The active device does not change
>
> Nothing.
>
> Thanks,
>
>          tglx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [External] Re: [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM
  2021-02-08 18:13 ` Konrad Rzeszutek Wilk
@ 2021-02-23 13:31   ` Zhimin Feng
  0 siblings, 0 replies; 18+ messages in thread
From: Zhimin Feng @ 2021-02-23 13:31 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: x86, kvm, linux-kernel, pbonzini, seanjc, vkuznets, wanpengli,
	jmattson, joro, tglx, mingo, bp, hpa, fweisbec, zhouyibo,
	zhanghaozhong

Hi

The host timer is saved when the CPU enters non-root mode, and it is
restored when the CPU enters root mode. So the guest doesn't affect the
host timer.

In non-root mode, the host timer is written to the preemption timer.
When the host timer expires (preemption timer value is '0'), the
preemption timer triggers an immediate VMExit, so the host timer is
handled in the preemption timer handler.

Thanks!

Zhimin

在 2021/2/9 上午2:13, Konrad Rzeszutek Wilk 写道:
> On Fri, Feb 05, 2021 at 06:03:08PM +0800, Zhimin Feng wrote:
>> The main motivation for this patch is to improve the performance of VM.
>> This patch series introduces how to enable the timer passthrough in
>> non-root mode.
> Nice! Those are impressive numbers!
>
>> The main idea is to offload the host timer to the preemption timer in
>> non-root mode. Through doing this, guest can write tscdeadline msr directly
>> in non-root mode and host timer isn't lost. If CPU is in root mode,
>> guest timer is switched to software timer.
> I am sorry - but I am having a hard time understanding the sentence
> above so let me ask some specific questions.
>
> - How do you protect against the guest DoS-ing the host and mucking with
>    the host timer?
>
> - As in can you explain how the host can still continue scheduling its
>    own quanta?
>
> And one more - what happens with Live Migration? I would assume that
> becomes a no-go unless you swap the guest timer back in? So
> we end up emulating the MSR again?
>
> Thanks!
>
>> Testing on Intel(R) Xeon(R) Platinum 8260 server.
>>
>> The guest OS is Debian(kernel: 4.19.28). The specific configuration is
>>   as follows: 8 cpu, 16GB memory, guest idle=poll
>> memcached in guest(memcached -d -t 8 -u root)
>>
>> I use the memtier_benchmark tool to test performance
>> (memtier_benchmark -P memcache_text -s guest_ip -c 16 -t 32
>>   --key-maximum=10000000000 --random-data --data-size-range=64-128 -p 11211
>>   --generate-keys --ratio 5:1 --test-time=500)
>>
>> Total Ops can be improved 25% and Avg.Latency can be improved 20% when
>> the timer-passthrough is enabled.
>>
>> =============================================================
>>                 | Enable timer-passth | Disable timer-passth |
>> =============================================================
>> Totals Ops/sec |    514869.67        |     411766.67        |
>> -------------------------------------------------------------
>> Avg.Latency    |    0.99483          |     1.24294          |
>> =============================================================
>>
>>
>> Zhimin Feng (9):
>>    KVM: vmx: hook set_next_event for getting the host tscd
>>    KVM: vmx: enable host lapic timer offload preemtion timer
>>    KVM: vmx: enable passthrough timer to guest
>>    KVM: vmx: enable passth timer switch to sw timer
>>    KVM: vmx: use tsc_adjust to enable tsc_offset timer passthrough
>>    KVM: vmx: check enable_timer_passth strictly
>>    KVM: vmx: save the initial value of host tscd
>>    KVM: vmx: Dynamically open or close the timer-passthrough for pre-vm
>>    KVM: vmx: query the state of timer-passth for vm
>>
>>   arch/x86/include/asm/kvm_host.h |  27 ++++
>>   arch/x86/kvm/lapic.c            |   1 +
>>   arch/x86/kvm/vmx/vmx.c          | 331 +++++++++++++++++++++++++++++++++++++++-
>>   arch/x86/kvm/x86.c              |  26 +++-
>>   include/linux/kvm_host.h        |   1 +
>>   include/uapi/linux/kvm.h        |   3 +
>>   kernel/time/tick-common.c       |   1 +
>>   tools/include/uapi/linux/kvm.h  |   3 +
>>   virt/kvm/kvm_main.c             |   1 +
>>   9 files changed, 389 insertions(+), 5 deletions(-)
>>
>> -- 
>> 2.11.0
>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2021-02-23 13:33 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-05 10:03 [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Zhimin Feng
2021-02-05 10:03 ` [RFC: timer passthrough 1/9] KVM: vmx: hook set_next_event for getting the host tscd Zhimin Feng
2021-02-05 12:28   ` Peter Zijlstra
2021-02-05 18:11   ` Thomas Gleixner
2021-02-23 13:07     ` [External] " Zhimin Feng
2021-02-05 10:03 ` [RFC: timer passthrough 2/9] KVM: vmx: enable host lapic timer offload preemtion timer Zhimin Feng
2021-02-05 10:03 ` [RFC: timer passthrough 3/9] KVM: vmx: enable passthrough timer to guest Zhimin Feng
2021-02-05 10:03 ` [RFC: timer passthrough 4/9] KVM: vmx: enable passth timer switch to sw timer Zhimin Feng
2021-02-05 10:03 ` [RFC: timer passthrough 5/9] KVM: vmx: use tsc_adjust to enable tsc_offset timer passthrough Zhimin Feng
2021-02-05 12:43   ` Peter Zijlstra
2021-02-05 18:22     ` Thomas Gleixner
2021-02-05 10:03 ` [RFC: timer passthrough 6/9] KVM: vmx: check enable_timer_passth strictly Zhimin Feng
2021-02-05 10:03 ` [RFC: timer passthrough 7/9] KVM: vmx: save the initial value of host tscd Zhimin Feng
2021-02-05 10:03 ` [RFC: timer passthrough 8/9] KVM: vmx: Dynamically open or close the timer-passthrough for pre-vm Zhimin Feng
2021-02-05 10:03 ` [RFC: timer passthrough 9/9] KVM: vmx: query the state of timer-passth for vm Zhimin Feng
2021-02-05 13:34 ` [RESEND RFC: timer passthrough 0/9] Support timer passthrough for VM Paolo Bonzini
2021-02-08 18:13 ` Konrad Rzeszutek Wilk
2021-02-23 13:31   ` [External] " Zhimin Feng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).