linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 0/4] KVM: LAPIC: Optimize timer latency further
@ 2019-05-15  4:11 Wanpeng Li
  2019-05-15  4:11 ` [PATCH v2 1/4] KVM: LAPIC: Extract adaptive tune timer advancement logic Wanpeng Li
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Wanpeng Li @ 2019-05-15  4:11 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: Paolo Bonzini, Radim Krčmář

Advance lapic timer tries to hidden the hypervisor overhead between the 
host emulated timer fires and the guest awares the timer is fired. However, 
it just hidden the time between apic_timer_fn/handle_preemption_timer -> 
wait_lapic_expire, instead of the real position of vmentry which is 
mentioned in the orignial commit d0659d946be0 ("KVM: x86: add option to 
advance tscdeadline hrtimer expiration"). There is 700+ cpu cycles between 
the end of wait_lapic_expire and before world switch on my haswell desktop, 
it will be 2400+ cycles if vmentry_l1d_flush is tuned to always. 

This patchset tries to narrow the last gap(wait_lapic_expire -> world switch), 
it takes the real overhead time between apic_timer_fn/handle_preemption_timer
and before world switch into consideration when adaptively tuning timer 
advancement. The patchset can reduce 40% latency (~1600+ cycles to ~1000+ 
cycles on a haswell desktop) for kvm-unit-tests/tscdeadline_latency when 
testing busy waits.

v1 -> v2:
 * fix indent in patch 1/4
 * remove the wait_lapic_expire() tracepoint and expose by debugfs
 * move the call to wait_lapic_expire() into vmx.c and svm.c

Wanpeng Li (4):
  KVM: LAPIC: Extract adaptive tune timer advancement logic
  KVM: LAPIC: Fix lapic_timer_advance_ns parameter overflow
  KVM: LAPIC: Expose per-vCPU timer adavance information to userspace
  KVM: LAPIC: Optimize timer latency further

 arch/x86/kvm/debugfs.c | 16 +++++++++++++
 arch/x86/kvm/lapic.c   | 62 +++++++++++++++++++++++++++++---------------------
 arch/x86/kvm/lapic.h   |  3 ++-
 arch/x86/kvm/svm.c     |  4 ++++
 arch/x86/kvm/trace.h   | 20 ----------------
 arch/x86/kvm/vmx/vmx.c |  4 ++++
 arch/x86/kvm/x86.c     |  5 +---
 7 files changed, 63 insertions(+), 51 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2 1/4] KVM: LAPIC: Extract adaptive tune timer advancement logic
  2019-05-15  4:11 [PATCH v2 0/4] KVM: LAPIC: Optimize timer latency further Wanpeng Li
@ 2019-05-15  4:11 ` Wanpeng Li
  2019-05-15  4:11 ` [PATCH v2 2/4] KVM: LAPIC: Fix lapic_timer_advance_ns parameter overflow Wanpeng Li
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: Wanpeng Li @ 2019-05-15  4:11 UTC (permalink / raw)
  To: linux-kernel, kvm
  Cc: Paolo Bonzini, Radim Krčmář,
	Sean Christopherson, Liran Alon

From: Wanpeng Li <wanpengli@tencent.com>

Extract adaptive tune timer advancement logic to a single function.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
---
 arch/x86/kvm/lapic.c | 57 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index bd13fdd..2f364fe 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1501,11 +1501,40 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
 	}
 }
 
-void wait_lapic_expire(struct kvm_vcpu *vcpu)
+static inline void adaptive_tune_timer_advancement(struct kvm_vcpu *vcpu,
+				u64 guest_tsc, u64 tsc_deadline)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
-	u64 guest_tsc, tsc_deadline, ns;
+	u64 ns;
+
+	/* too early */
+	if (guest_tsc < tsc_deadline) {
+		ns = (tsc_deadline - guest_tsc) * 1000000ULL;
+		do_div(ns, vcpu->arch.virtual_tsc_khz);
+		timer_advance_ns -= min((u32)ns,
+			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+	} else {
+	/* too late */
+		ns = (guest_tsc - tsc_deadline) * 1000000ULL;
+		do_div(ns, vcpu->arch.virtual_tsc_khz);
+		timer_advance_ns += min((u32)ns,
+			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+	}
+
+	if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
+		apic->lapic_timer.timer_advance_adjust_done = true;
+	if (unlikely(timer_advance_ns > 5000)) {
+		timer_advance_ns = 0;
+		apic->lapic_timer.timer_advance_adjust_done = true;
+	}
+	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u64 guest_tsc, tsc_deadline;
 
 	if (apic->lapic_timer.expired_tscdeadline == 0)
 		return;
@@ -1521,28 +1550,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 	if (guest_tsc < tsc_deadline)
 		__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
 
-	if (!apic->lapic_timer.timer_advance_adjust_done) {
-		/* too early */
-		if (guest_tsc < tsc_deadline) {
-			ns = (tsc_deadline - guest_tsc) * 1000000ULL;
-			do_div(ns, vcpu->arch.virtual_tsc_khz);
-			timer_advance_ns -= min((u32)ns,
-				timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
-		} else {
-		/* too late */
-			ns = (guest_tsc - tsc_deadline) * 1000000ULL;
-			do_div(ns, vcpu->arch.virtual_tsc_khz);
-			timer_advance_ns += min((u32)ns,
-				timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
-		}
-		if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
-			apic->lapic_timer.timer_advance_adjust_done = true;
-		if (unlikely(timer_advance_ns > 5000)) {
-			timer_advance_ns = 0;
-			apic->lapic_timer.timer_advance_adjust_done = true;
-		}
-		apic->lapic_timer.timer_advance_ns = timer_advance_ns;
-	}
+	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+		adaptive_tune_timer_advancement(vcpu, guest_tsc, tsc_deadline);
 }
 
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 2/4] KVM: LAPIC: Fix lapic_timer_advance_ns parameter overflow
  2019-05-15  4:11 [PATCH v2 0/4] KVM: LAPIC: Optimize timer latency further Wanpeng Li
  2019-05-15  4:11 ` [PATCH v2 1/4] KVM: LAPIC: Extract adaptive tune timer advancement logic Wanpeng Li
@ 2019-05-15  4:11 ` Wanpeng Li
  2019-05-15  4:11 ` [PATCH v2 3/4] KVM: LAPIC: Expose per-vCPU timer adavance information to userspace Wanpeng Li
  2019-05-15  4:11 ` [PATCH v2 4/4] KVM: LAPIC: Optimize timer latency further Wanpeng Li
  3 siblings, 0 replies; 10+ messages in thread
From: Wanpeng Li @ 2019-05-15  4:11 UTC (permalink / raw)
  To: linux-kernel, kvm
  Cc: Paolo Bonzini, Radim Krčmář,
	Sean Christopherson, Liran Alon

From: Wanpeng Li <wanpengli@tencent.com>

After commit c3941d9e0 (KVM: lapic: Allow user to disable adaptive tuning of
timer advancement), '-1' enables adaptive tuning starting from default
advancment of 1000ns. However, we should expose an int instead of an overflow
uint module parameter.

Before patch:

/sys/module/kvm/parameters/lapic_timer_advance_ns:4294967295

After patch:

/sys/module/kvm/parameters/lapic_timer_advance_ns:-1

Fixes: c3941d9e0 (KVM: lapic: Allow user to disable adaptive tuning of timer advancement)
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d75bb97..1d89cb9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -143,7 +143,7 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
  * tuning, i.e. allows priveleged userspace to set an exact advancement time.
  */
 static int __read_mostly lapic_timer_advance_ns = -1;
-module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
 
 static bool __read_mostly vector_hashing = true;
 module_param(vector_hashing, bool, S_IRUGO);
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 3/4] KVM: LAPIC: Expose per-vCPU timer adavance information to userspace
  2019-05-15  4:11 [PATCH v2 0/4] KVM: LAPIC: Optimize timer latency further Wanpeng Li
  2019-05-15  4:11 ` [PATCH v2 1/4] KVM: LAPIC: Extract adaptive tune timer advancement logic Wanpeng Li
  2019-05-15  4:11 ` [PATCH v2 2/4] KVM: LAPIC: Fix lapic_timer_advance_ns parameter overflow Wanpeng Li
@ 2019-05-15  4:11 ` Wanpeng Li
  2019-05-15 17:21   ` Sean Christopherson
  2019-05-15  4:11 ` [PATCH v2 4/4] KVM: LAPIC: Optimize timer latency further Wanpeng Li
  3 siblings, 1 reply; 10+ messages in thread
From: Wanpeng Li @ 2019-05-15  4:11 UTC (permalink / raw)
  To: linux-kernel, kvm
  Cc: Paolo Bonzini, Radim Krčmář,
	Sean Christopherson, Liran Alon

From: Wanpeng Li <wanpengli@tencent.com>

Expose the per-vCPU advancement information to the user via per-vCPU debugfs 
entry. wait_lapic_expire() call was moved above guest_enter_irqoff() because 
of its tracepoint, which violated the RCU extended quiescent state invoked 
by guest_enter_irqoff()[1][2]. This patch simply removes the tracepoint, 
which would allow moving wait_lapic_expire(). Sean pointed out:

| Now that the advancement time is tracked per-vCPU, realizing a change 
| in the advancement time requires creating a new VM. For all intents 
| and purposes this makes it impractical to hand tune the advancement 
| in real time using the tracepoint as the feedback mechanism.

[1] Commit 8b89fe1f6c43 ("kvm: x86: move tracepoints outside extended quiescent state")
[2] https://patchwork.kernel.org/patch/7821111/

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
---
 arch/x86/kvm/debugfs.c | 16 ++++++++++++++++
 arch/x86/kvm/lapic.c   | 16 ++++++++--------
 arch/x86/kvm/lapic.h   |  1 +
 arch/x86/kvm/trace.h   | 20 --------------------
 4 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index c19c7ed..8cf542e 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -9,12 +9,22 @@
  */
 #include <linux/kvm_host.h>
 #include <linux/debugfs.h>
+#include "lapic.h"
 
 bool kvm_arch_has_vcpu_debugfs(void)
 {
 	return true;
 }
 
+static int vcpu_get_timer_expire_delta(void *data, u64 *val)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+	*val = vcpu->arch.apic->lapic_timer.advance_expire_delta;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_expire_delta_fops, vcpu_get_timer_expire_delta, NULL, "%lld\n");
+
 static int vcpu_get_tsc_offset(void *data, u64 *val)
 {
 	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
@@ -51,6 +61,12 @@ int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 	if (!ret)
 		return -ENOMEM;
 
+	ret = debugfs_create_file("advance_expire_delta", 0444,
+							vcpu->debugfs_dentry,
+							vcpu, &vcpu_timer_expire_delta_fops);
+	if (!ret)
+		return -ENOMEM;
+
 	if (kvm_has_tsc_control) {
 		ret = debugfs_create_file("tsc-scaling-ratio", 0444,
 							vcpu->debugfs_dentry,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2f364fe..af38ece 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1502,27 +1502,27 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
 }
 
 static inline void adaptive_tune_timer_advancement(struct kvm_vcpu *vcpu,
-				u64 guest_tsc, u64 tsc_deadline)
+				s64 advance_expire_delta)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
 	u64 ns;
 
 	/* too early */
-	if (guest_tsc < tsc_deadline) {
-		ns = (tsc_deadline - guest_tsc) * 1000000ULL;
+	if (advance_expire_delta < 0) {
+		ns = -advance_expire_delta * 1000000ULL;
 		do_div(ns, vcpu->arch.virtual_tsc_khz);
 		timer_advance_ns -= min((u32)ns,
 			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
 	} else {
 	/* too late */
-		ns = (guest_tsc - tsc_deadline) * 1000000ULL;
+		ns = advance_expire_delta * 1000000ULL;
 		do_div(ns, vcpu->arch.virtual_tsc_khz);
 		timer_advance_ns += min((u32)ns,
 			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
 	}
 
-	if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
+	if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
 		apic->lapic_timer.timer_advance_adjust_done = true;
 	if (unlikely(timer_advance_ns > 5000)) {
 		timer_advance_ns = 0;
@@ -1545,13 +1545,13 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 	apic->lapic_timer.expired_tscdeadline = 0;
 	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+	apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
 
-	if (guest_tsc < tsc_deadline)
+	if (apic->lapic_timer.advance_expire_delta < 0)
 		__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
 
 	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
-		adaptive_tune_timer_advancement(vcpu, guest_tsc, tsc_deadline);
+		adaptive_tune_timer_advancement(vcpu, apic->lapic_timer.advance_expire_delta);
 }
 
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index d6d049b..3e72a25 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -32,6 +32,7 @@ struct kvm_timer {
 	u64 tscdeadline;
 	u64 expired_tscdeadline;
 	u32 timer_advance_ns;
+	s64 advance_expire_delta;
 	atomic_t pending;			/* accumulated triggered timers */
 	bool hv_timer_in_use;
 	bool timer_advance_adjust_done;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4d47a26..3f9bc62 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -953,26 +953,6 @@ TRACE_EVENT(kvm_pvclock_update,
 		  __entry->flags)
 );
 
-TRACE_EVENT(kvm_wait_lapic_expire,
-	TP_PROTO(unsigned int vcpu_id, s64 delta),
-	TP_ARGS(vcpu_id, delta),
-
-	TP_STRUCT__entry(
-		__field(	unsigned int,	vcpu_id		)
-		__field(	s64,		delta		)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_id	   = vcpu_id;
-		__entry->delta             = delta;
-	),
-
-	TP_printk("vcpu %u: delta %lld (%s)",
-		  __entry->vcpu_id,
-		  __entry->delta,
-		  __entry->delta < 0 ? "early" : "late")
-);
-
 TRACE_EVENT(kvm_enter_smm,
 	TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
 	TP_ARGS(vcpu_id, smbase, entering),
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 4/4] KVM: LAPIC: Optimize timer latency further
  2019-05-15  4:11 [PATCH v2 0/4] KVM: LAPIC: Optimize timer latency further Wanpeng Li
                   ` (2 preceding siblings ...)
  2019-05-15  4:11 ` [PATCH v2 3/4] KVM: LAPIC: Expose per-vCPU timer adavance information to userspace Wanpeng Li
@ 2019-05-15  4:11 ` Wanpeng Li
  2019-05-15 17:42   ` Sean Christopherson
  3 siblings, 1 reply; 10+ messages in thread
From: Wanpeng Li @ 2019-05-15  4:11 UTC (permalink / raw)
  To: linux-kernel, kvm
  Cc: Paolo Bonzini, Radim Krčmář,
	Sean Christopherson, Liran Alon

From: Wanpeng Li <wanpengli@tencent.com>

Advance lapic timer tries to hidden the hypervisor overhead between the 
host emulated timer fires and the guest awares the timer is fired. However, 
it just hidden the time between apic_timer_fn/handle_preemption_timer -> 
wait_lapic_expire, instead of the real position of vmentry which is 
mentioned in the orignial commit d0659d946be0 ("KVM: x86: add option to 
advance tscdeadline hrtimer expiration"). There is 700+ cpu cycles between 
the end of wait_lapic_expire and before world switch on my haswell desktop, 
it will be 2400+ cycles if vmentry_l1d_flush is tuned to always. 

This patch tries to narrow the last gap(wait_lapic_expire -> world switch), 
it takes the real overhead time between apic_timer_fn/handle_preemption_timer
and before world switch into consideration when adaptively tuning timer 
advancement. The patch can reduce 40% latency (~1600+ cycles to ~1000+ cycles 
on a haswell desktop) for kvm-unit-tests/tscdeadline_latency when testing 
busy waits.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
---
 arch/x86/kvm/lapic.c   | 3 ++-
 arch/x86/kvm/lapic.h   | 2 +-
 arch/x86/kvm/svm.c     | 4 ++++
 arch/x86/kvm/vmx/vmx.c | 4 ++++
 arch/x86/kvm/x86.c     | 3 ---
 5 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index af38ece..63513de 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1531,7 +1531,7 @@ static inline void adaptive_tune_timer_advancement(struct kvm_vcpu *vcpu,
 	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
 }
 
-void wait_lapic_expire(struct kvm_vcpu *vcpu)
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u64 guest_tsc, tsc_deadline;
@@ -1553,6 +1553,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
 		adaptive_tune_timer_advancement(vcpu, apic->lapic_timer.advance_expire_delta);
 }
+EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
 
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
 {
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 3e72a25..f974a3d 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -220,7 +220,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
-void wait_lapic_expire(struct kvm_vcpu *vcpu);
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
 
 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 406b558..740fb3f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5646,6 +5646,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	 */
 	x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
 
+	if (lapic_in_kernel(vcpu) &&
+		vcpu->arch.apic->lapic_timer.timer_advance_ns)
+		kvm_wait_lapic_expire(vcpu);
+
 	local_irq_enable();
 
 	asm volatile (
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9663d41..1c49946 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6437,6 +6437,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.cr2 != read_cr2())
 		write_cr2(vcpu->arch.cr2);
 
+	if (lapic_in_kernel(vcpu) &&
+		vcpu->arch.apic->lapic_timer.timer_advance_ns)
+		kvm_wait_lapic_expire(vcpu);
+
 	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
 				   vmx->loaded_vmcs->launched);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1d89cb9..0eb9549 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7894,9 +7894,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	trace_kvm_entry(vcpu->vcpu_id);
-	if (lapic_in_kernel(vcpu) &&
-	    vcpu->arch.apic->lapic_timer.timer_advance_ns)
-		wait_lapic_expire(vcpu);
 	guest_enter_irqoff();
 
 	fpregs_assert_state_consistent();
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 3/4] KVM: LAPIC: Expose per-vCPU timer adavance information to userspace
  2019-05-15  4:11 ` [PATCH v2 3/4] KVM: LAPIC: Expose per-vCPU timer adavance information to userspace Wanpeng Li
@ 2019-05-15 17:21   ` Sean Christopherson
  2019-05-16  3:15     ` Wanpeng Li
  0 siblings, 1 reply; 10+ messages in thread
From: Sean Christopherson @ 2019-05-15 17:21 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: linux-kernel, kvm, Paolo Bonzini, Radim Krčmář,
	Liran Alon

On Wed, May 15, 2019 at 12:11:53PM +0800, Wanpeng Li wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
> 
> Expose the per-vCPU advancement information to the user via per-vCPU debugfs 
> entry. wait_lapic_expire() call was moved above guest_enter_irqoff() because 
> of its tracepoint, which violated the RCU extended quiescent state invoked 
> by guest_enter_irqoff()[1][2]. This patch simply removes the tracepoint, 
> which would allow moving wait_lapic_expire(). Sean pointed out:
> 
> | Now that the advancement time is tracked per-vCPU, realizing a change 
> | in the advancement time requires creating a new VM. For all intents 
> | and purposes this makes it impractical to hand tune the advancement 
> | in real time using the tracepoint as the feedback mechanism.
> 
> [1] Commit 8b89fe1f6c43 ("kvm: x86: move tracepoints outside extended quiescent state")
> [2] https://patchwork.kernel.org/patch/7821111/
> 
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
> Cc: Liran Alon <liran.alon@oracle.com>
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> ---
>  arch/x86/kvm/debugfs.c | 16 ++++++++++++++++
>  arch/x86/kvm/lapic.c   | 16 ++++++++--------
>  arch/x86/kvm/lapic.h   |  1 +
>  arch/x86/kvm/trace.h   | 20 --------------------
>  4 files changed, 25 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
> index c19c7ed..8cf542e 100644
> --- a/arch/x86/kvm/debugfs.c
> +++ b/arch/x86/kvm/debugfs.c
> @@ -9,12 +9,22 @@
>   */
>  #include <linux/kvm_host.h>
>  #include <linux/debugfs.h>
> +#include "lapic.h"
>  
>  bool kvm_arch_has_vcpu_debugfs(void)
>  {
>  	return true;
>  }
>  
> +static int vcpu_get_timer_expire_delta(void *data, u64 *val)
> +{
> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
> +	*val = vcpu->arch.apic->lapic_timer.advance_expire_delta;
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_expire_delta_fops, vcpu_get_timer_expire_delta, NULL, "%lld\n");
> +
>  static int vcpu_get_tsc_offset(void *data, u64 *val)
>  {
>  	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
> @@ -51,6 +61,12 @@ int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
>  	if (!ret)
>  		return -ENOMEM;
>  
> +	ret = debugfs_create_file("advance_expire_delta", 0444,
> +							vcpu->debugfs_dentry,
> +							vcpu, &vcpu_timer_expire_delta_fops);

I was thinking we would expose 'kvm_timer.timer_advance_ns', not the
delta, the idea being that being able to query the auto-adjusted value
is now the desired behavior.  But rethinking things, that enhancement is
orthogonal to removing the tracepoint.

Back to the tracepoint, an alternative solution would be to add
kvm_timer.advance_expire_delta as you did, but rather than add a new
debugfs entry, simply move the tracepoint below guest_exit_irqoff()
in vcpu_enter_guest().  I.e. snapshot the delta before VM-Enter, but
trace it after VM-Exit.

If we want to continue supporting hand tuning the advancement, then a
tracepoint is much easier for userspace to consume, e.g. it allows the
user to monitor the history of the delta while adjusting the advancement
time.  Manually approximating that behavior by sampling the value from
debugfs would be quite cumbersome.

> +	if (!ret)
> +		return -ENOMEM;
> +
>  	if (kvm_has_tsc_control) {
>  		ret = debugfs_create_file("tsc-scaling-ratio", 0444,
>  							vcpu->debugfs_dentry,
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 2f364fe..af38ece 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1502,27 +1502,27 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
>  }
>  
>  static inline void adaptive_tune_timer_advancement(struct kvm_vcpu *vcpu,
> -				u64 guest_tsc, u64 tsc_deadline)
> +				s64 advance_expire_delta)
>  {
>  	struct kvm_lapic *apic = vcpu->arch.apic;
>  	u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
>  	u64 ns;
>  
>  	/* too early */
> -	if (guest_tsc < tsc_deadline) {
> -		ns = (tsc_deadline - guest_tsc) * 1000000ULL;
> +	if (advance_expire_delta < 0) {
> +		ns = -advance_expire_delta * 1000000ULL;
>  		do_div(ns, vcpu->arch.virtual_tsc_khz);
>  		timer_advance_ns -= min((u32)ns,
>  			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
>  	} else {
>  	/* too late */
> -		ns = (guest_tsc - tsc_deadline) * 1000000ULL;
> +		ns = advance_expire_delta * 1000000ULL;
>  		do_div(ns, vcpu->arch.virtual_tsc_khz);
>  		timer_advance_ns += min((u32)ns,
>  			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
>  	}
>  
> -	if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> +	if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
>  		apic->lapic_timer.timer_advance_adjust_done = true;
>  	if (unlikely(timer_advance_ns > 5000)) {
>  		timer_advance_ns = 0;
> @@ -1545,13 +1545,13 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
>  	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
>  	apic->lapic_timer.expired_tscdeadline = 0;
>  	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> -	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
> +	apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
>  
> -	if (guest_tsc < tsc_deadline)
> +	if (apic->lapic_timer.advance_expire_delta < 0)
>  		__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
>  
>  	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
> -		adaptive_tune_timer_advancement(vcpu, guest_tsc, tsc_deadline);
> +		adaptive_tune_timer_advancement(vcpu, apic->lapic_timer.advance_expire_delta);
>  }
>  
>  static void start_sw_tscdeadline(struct kvm_lapic *apic)
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index d6d049b..3e72a25 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -32,6 +32,7 @@ struct kvm_timer {
>  	u64 tscdeadline;
>  	u64 expired_tscdeadline;
>  	u32 timer_advance_ns;
> +	s64 advance_expire_delta;
>  	atomic_t pending;			/* accumulated triggered timers */
>  	bool hv_timer_in_use;
>  	bool timer_advance_adjust_done;
> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> index 4d47a26..3f9bc62 100644
> --- a/arch/x86/kvm/trace.h
> +++ b/arch/x86/kvm/trace.h
> @@ -953,26 +953,6 @@ TRACE_EVENT(kvm_pvclock_update,
>  		  __entry->flags)
>  );
>  
> -TRACE_EVENT(kvm_wait_lapic_expire,
> -	TP_PROTO(unsigned int vcpu_id, s64 delta),
> -	TP_ARGS(vcpu_id, delta),
> -
> -	TP_STRUCT__entry(
> -		__field(	unsigned int,	vcpu_id		)
> -		__field(	s64,		delta		)
> -	),
> -
> -	TP_fast_assign(
> -		__entry->vcpu_id	   = vcpu_id;
> -		__entry->delta             = delta;
> -	),
> -
> -	TP_printk("vcpu %u: delta %lld (%s)",
> -		  __entry->vcpu_id,
> -		  __entry->delta,
> -		  __entry->delta < 0 ? "early" : "late")
> -);
> -
>  TRACE_EVENT(kvm_enter_smm,
>  	TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
>  	TP_ARGS(vcpu_id, smbase, entering),
> -- 
> 2.7.4
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 4/4] KVM: LAPIC: Optimize timer latency further
  2019-05-15  4:11 ` [PATCH v2 4/4] KVM: LAPIC: Optimize timer latency further Wanpeng Li
@ 2019-05-15 17:42   ` Sean Christopherson
  2019-05-16  3:14     ` Wanpeng Li
  0 siblings, 1 reply; 10+ messages in thread
From: Sean Christopherson @ 2019-05-15 17:42 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: linux-kernel, kvm, Paolo Bonzini, Radim Krčmář,
	Liran Alon

On Wed, May 15, 2019 at 12:11:54PM +0800, Wanpeng Li wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
> 
> Advance lapic timer tries to hidden the hypervisor overhead between the 
> host emulated timer fires and the guest awares the timer is fired. However, 
> it just hidden the time between apic_timer_fn/handle_preemption_timer -> 
> wait_lapic_expire, instead of the real position of vmentry which is 
> mentioned in the orignial commit d0659d946be0 ("KVM: x86: add option to 
> advance tscdeadline hrtimer expiration"). There is 700+ cpu cycles between 
> the end of wait_lapic_expire and before world switch on my haswell desktop, 
> it will be 2400+ cycles if vmentry_l1d_flush is tuned to always. 
> 
> This patch tries to narrow the last gap(wait_lapic_expire -> world switch), 
> it takes the real overhead time between apic_timer_fn/handle_preemption_timer
> and before world switch into consideration when adaptively tuning timer 
> advancement. The patch can reduce 40% latency (~1600+ cycles to ~1000+ cycles 
> on a haswell desktop) for kvm-unit-tests/tscdeadline_latency when testing 
> busy waits.
> 
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
> Cc: Liran Alon <liran.alon@oracle.com>
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> ---
>  arch/x86/kvm/lapic.c   | 3 ++-
>  arch/x86/kvm/lapic.h   | 2 +-
>  arch/x86/kvm/svm.c     | 4 ++++
>  arch/x86/kvm/vmx/vmx.c | 4 ++++
>  arch/x86/kvm/x86.c     | 3 ---
>  5 files changed, 11 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index af38ece..63513de 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1531,7 +1531,7 @@ static inline void adaptive_tune_timer_advancement(struct kvm_vcpu *vcpu,
>  	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
>  }
>  
> -void wait_lapic_expire(struct kvm_vcpu *vcpu)
> +void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_lapic *apic = vcpu->arch.apic;
>  	u64 guest_tsc, tsc_deadline;
> @@ -1553,6 +1553,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
>  	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
>  		adaptive_tune_timer_advancement(vcpu, apic->lapic_timer.advance_expire_delta);
>  }
> +EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
>  
>  static void start_sw_tscdeadline(struct kvm_lapic *apic)
>  {
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 3e72a25..f974a3d 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -220,7 +220,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
>  
>  bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
>  
> -void wait_lapic_expire(struct kvm_vcpu *vcpu);
> +void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
>  
>  bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
>  			struct kvm_vcpu **dest_vcpu);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 406b558..740fb3f 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -5646,6 +5646,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
>  	 */
>  	x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
>  
> +	if (lapic_in_kernel(vcpu) &&
> +		vcpu->arch.apic->lapic_timer.timer_advance_ns)
> +		kvm_wait_lapic_expire(vcpu);
> +
>  	local_irq_enable();
>  
>  	asm volatile (
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 9663d41..1c49946 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -6437,6 +6437,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	if (vcpu->arch.cr2 != read_cr2())
>  		write_cr2(vcpu->arch.cr2);
>  
> +	if (lapic_in_kernel(vcpu) &&
> +		vcpu->arch.apic->lapic_timer.timer_advance_ns)
> +		kvm_wait_lapic_expire(vcpu);

One potential hiccup with this approach is that we're now accessing more
data after flushing the L1.  Not sure if that's actually a problem here,
but it probably should be explicitly addressed/considered.

> +
>  	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
>  				   vmx->loaded_vmcs->launched);
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1d89cb9..0eb9549 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7894,9 +7894,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	}
>  
>  	trace_kvm_entry(vcpu->vcpu_id);
> -	if (lapic_in_kernel(vcpu) &&
> -	    vcpu->arch.apic->lapic_timer.timer_advance_ns)
> -		wait_lapic_expire(vcpu);
>  	guest_enter_irqoff();
>  
>  	fpregs_assert_state_consistent();
> -- 
> 2.7.4
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 4/4] KVM: LAPIC: Optimize timer latency further
  2019-05-15 17:42   ` Sean Christopherson
@ 2019-05-16  3:14     ` Wanpeng Li
  0 siblings, 0 replies; 10+ messages in thread
From: Wanpeng Li @ 2019-05-16  3:14 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: LKML, kvm, Paolo Bonzini, Radim Krčmář, Liran Alon

On Thu, 16 May 2019 at 01:42, Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Wed, May 15, 2019 at 12:11:54PM +0800, Wanpeng Li wrote:
> > From: Wanpeng Li <wanpengli@tencent.com>
> >
> > Advance lapic timer tries to hidden the hypervisor overhead between the
> > host emulated timer fires and the guest awares the timer is fired. However,
> > it just hidden the time between apic_timer_fn/handle_preemption_timer ->
> > wait_lapic_expire, instead of the real position of vmentry which is
> > mentioned in the orignial commit d0659d946be0 ("KVM: x86: add option to
> > advance tscdeadline hrtimer expiration"). There is 700+ cpu cycles between
> > the end of wait_lapic_expire and before world switch on my haswell desktop,
> > it will be 2400+ cycles if vmentry_l1d_flush is tuned to always.
> >
> > This patch tries to narrow the last gap(wait_lapic_expire -> world switch),
> > it takes the real overhead time between apic_timer_fn/handle_preemption_timer
> > and before world switch into consideration when adaptively tuning timer
> > advancement. The patch can reduce 40% latency (~1600+ cycles to ~1000+ cycles
> > on a haswell desktop) for kvm-unit-tests/tscdeadline_latency when testing
> > busy waits.
> >
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > Cc: Sean Christopherson <sean.j.christopherson@intel.com>
> > Cc: Liran Alon <liran.alon@oracle.com>
> > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > ---
> >  arch/x86/kvm/lapic.c   | 3 ++-
> >  arch/x86/kvm/lapic.h   | 2 +-
> >  arch/x86/kvm/svm.c     | 4 ++++
> >  arch/x86/kvm/vmx/vmx.c | 4 ++++
> >  arch/x86/kvm/x86.c     | 3 ---
> >  5 files changed, 11 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index af38ece..63513de 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -1531,7 +1531,7 @@ static inline void adaptive_tune_timer_advancement(struct kvm_vcpu *vcpu,
> >       apic->lapic_timer.timer_advance_ns = timer_advance_ns;
> >  }
> >
> > -void wait_lapic_expire(struct kvm_vcpu *vcpu)
> > +void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
> >  {
> >       struct kvm_lapic *apic = vcpu->arch.apic;
> >       u64 guest_tsc, tsc_deadline;
> > @@ -1553,6 +1553,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
> >       if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
> >               adaptive_tune_timer_advancement(vcpu, apic->lapic_timer.advance_expire_delta);
> >  }
> > +EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
> >
> >  static void start_sw_tscdeadline(struct kvm_lapic *apic)
> >  {
> > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> > index 3e72a25..f974a3d 100644
> > --- a/arch/x86/kvm/lapic.h
> > +++ b/arch/x86/kvm/lapic.h
> > @@ -220,7 +220,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
> >
> >  bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
> >
> > -void wait_lapic_expire(struct kvm_vcpu *vcpu);
> > +void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
> >
> >  bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
> >                       struct kvm_vcpu **dest_vcpu);
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index 406b558..740fb3f 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -5646,6 +5646,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
> >        */
> >       x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
> >
> > +     if (lapic_in_kernel(vcpu) &&
> > +             vcpu->arch.apic->lapic_timer.timer_advance_ns)
> > +             kvm_wait_lapic_expire(vcpu);
> > +
> >       local_irq_enable();
> >
> >       asm volatile (
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 9663d41..1c49946 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -6437,6 +6437,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
> >       if (vcpu->arch.cr2 != read_cr2())
> >               write_cr2(vcpu->arch.cr2);
> >
> > +     if (lapic_in_kernel(vcpu) &&
> > +             vcpu->arch.apic->lapic_timer.timer_advance_ns)
> > +             kvm_wait_lapic_expire(vcpu);
>
> One potential hiccup with this approach is that we're now accessing more
> data after flushing the L1.  Not sure if that's actually a problem here,
> but it probably should be explicitly addressed/considered.

Handle it in v3.

Regards,
Wanpeng Li

>
> > +
> >       vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
> >                                  vmx->loaded_vmcs->launched);
> >
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 1d89cb9..0eb9549 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -7894,9 +7894,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >       }
> >
> >       trace_kvm_entry(vcpu->vcpu_id);
> > -     if (lapic_in_kernel(vcpu) &&
> > -         vcpu->arch.apic->lapic_timer.timer_advance_ns)
> > -             wait_lapic_expire(vcpu);
> >       guest_enter_irqoff();
> >
> >       fpregs_assert_state_consistent();
> > --
> > 2.7.4
> >

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 3/4] KVM: LAPIC: Expose per-vCPU timer adavance information to userspace
  2019-05-15 17:21   ` Sean Christopherson
@ 2019-05-16  3:15     ` Wanpeng Li
  0 siblings, 0 replies; 10+ messages in thread
From: Wanpeng Li @ 2019-05-16  3:15 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: LKML, kvm, Paolo Bonzini, Radim Krčmář, Liran Alon

On Thu, 16 May 2019 at 01:21, Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Wed, May 15, 2019 at 12:11:53PM +0800, Wanpeng Li wrote:
> > From: Wanpeng Li <wanpengli@tencent.com>
> >
> > Expose the per-vCPU advancement information to the user via per-vCPU debugfs
> > entry. wait_lapic_expire() call was moved above guest_enter_irqoff() because
> > of its tracepoint, which violated the RCU extended quiescent state invoked
> > by guest_enter_irqoff()[1][2]. This patch simply removes the tracepoint,
> > which would allow moving wait_lapic_expire(). Sean pointed out:
> >
> > | Now that the advancement time is tracked per-vCPU, realizing a change
> > | in the advancement time requires creating a new VM. For all intents
> > | and purposes this makes it impractical to hand tune the advancement
> > | in real time using the tracepoint as the feedback mechanism.
> >
> > [1] Commit 8b89fe1f6c43 ("kvm: x86: move tracepoints outside extended quiescent state")
> > [2] https://patchwork.kernel.org/patch/7821111/
> >
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Radim Krčmář <rkrcmar@redhat.com>
> > Cc: Sean Christopherson <sean.j.christopherson@intel.com>
> > Cc: Liran Alon <liran.alon@oracle.com>
> > Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> > ---
> >  arch/x86/kvm/debugfs.c | 16 ++++++++++++++++
> >  arch/x86/kvm/lapic.c   | 16 ++++++++--------
> >  arch/x86/kvm/lapic.h   |  1 +
> >  arch/x86/kvm/trace.h   | 20 --------------------
> >  4 files changed, 25 insertions(+), 28 deletions(-)
> >
> > diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
> > index c19c7ed..8cf542e 100644
> > --- a/arch/x86/kvm/debugfs.c
> > +++ b/arch/x86/kvm/debugfs.c
> > @@ -9,12 +9,22 @@
> >   */
> >  #include <linux/kvm_host.h>
> >  #include <linux/debugfs.h>
> > +#include "lapic.h"
> >
> >  bool kvm_arch_has_vcpu_debugfs(void)
> >  {
> >       return true;
> >  }
> >
> > +static int vcpu_get_timer_expire_delta(void *data, u64 *val)
> > +{
> > +     struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
> > +     *val = vcpu->arch.apic->lapic_timer.advance_expire_delta;
> > +     return 0;
> > +}
> > +
> > +DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_expire_delta_fops, vcpu_get_timer_expire_delta, NULL, "%lld\n");
> > +
> >  static int vcpu_get_tsc_offset(void *data, u64 *val)
> >  {
> >       struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
> > @@ -51,6 +61,12 @@ int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
> >       if (!ret)
> >               return -ENOMEM;
> >
> > +     ret = debugfs_create_file("advance_expire_delta", 0444,
> > +                                                     vcpu->debugfs_dentry,
> > +                                                     vcpu, &vcpu_timer_expire_delta_fops);
>
> I was thinking we would expose 'kvm_timer.timer_advance_ns', not the
> delta, the idea being that being able to query the auto-adjusted value
> is now the desired behavior.  But rethinking things, that enhancement is
> orthogonal to removing the tracepoint.
>
> Back to the tracepoint, an alternative solution would be to add
> kvm_timer.advance_expire_delta as you did, but rather than add a new
> debugfs entry, simply move the tracepoint below guest_exit_irqoff()
> in vcpu_enter_guest().  I.e. snapshot the delta before VM-Enter, but
> trace it after VM-Exit.
>
> If we want to continue supporting hand tuning the advancement, then a
> tracepoint is much easier for userspace to consume, e.g. it allows the
> user to monitor the history of the delta while adjusting the advancement
> time.  Manually approximating that behavior by sampling the value from
> debugfs would be quite cumbersome.

Good point, handle it in v3.

Regards,
Wanpeng Li

>
> > +     if (!ret)
> > +             return -ENOMEM;
> > +
> >       if (kvm_has_tsc_control) {
> >               ret = debugfs_create_file("tsc-scaling-ratio", 0444,
> >                                                       vcpu->debugfs_dentry,
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 2f364fe..af38ece 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -1502,27 +1502,27 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
> >  }
> >
> >  static inline void adaptive_tune_timer_advancement(struct kvm_vcpu *vcpu,
> > -                             u64 guest_tsc, u64 tsc_deadline)
> > +                             s64 advance_expire_delta)
> >  {
> >       struct kvm_lapic *apic = vcpu->arch.apic;
> >       u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
> >       u64 ns;
> >
> >       /* too early */
> > -     if (guest_tsc < tsc_deadline) {
> > -             ns = (tsc_deadline - guest_tsc) * 1000000ULL;
> > +     if (advance_expire_delta < 0) {
> > +             ns = -advance_expire_delta * 1000000ULL;
> >               do_div(ns, vcpu->arch.virtual_tsc_khz);
> >               timer_advance_ns -= min((u32)ns,
> >                       timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> >       } else {
> >       /* too late */
> > -             ns = (guest_tsc - tsc_deadline) * 1000000ULL;
> > +             ns = advance_expire_delta * 1000000ULL;
> >               do_div(ns, vcpu->arch.virtual_tsc_khz);
> >               timer_advance_ns += min((u32)ns,
> >                       timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> >       }
> >
> > -     if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> > +     if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> >               apic->lapic_timer.timer_advance_adjust_done = true;
> >       if (unlikely(timer_advance_ns > 5000)) {
> >               timer_advance_ns = 0;
> > @@ -1545,13 +1545,13 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
> >       tsc_deadline = apic->lapic_timer.expired_tscdeadline;
> >       apic->lapic_timer.expired_tscdeadline = 0;
> >       guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> > -     trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
> > +     apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
> >
> > -     if (guest_tsc < tsc_deadline)
> > +     if (apic->lapic_timer.advance_expire_delta < 0)
> >               __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
> >
> >       if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
> > -             adaptive_tune_timer_advancement(vcpu, guest_tsc, tsc_deadline);
> > +             adaptive_tune_timer_advancement(vcpu, apic->lapic_timer.advance_expire_delta);
> >  }
> >
> >  static void start_sw_tscdeadline(struct kvm_lapic *apic)
> > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> > index d6d049b..3e72a25 100644
> > --- a/arch/x86/kvm/lapic.h
> > +++ b/arch/x86/kvm/lapic.h
> > @@ -32,6 +32,7 @@ struct kvm_timer {
> >       u64 tscdeadline;
> >       u64 expired_tscdeadline;
> >       u32 timer_advance_ns;
> > +     s64 advance_expire_delta;
> >       atomic_t pending;                       /* accumulated triggered timers */
> >       bool hv_timer_in_use;
> >       bool timer_advance_adjust_done;
> > diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> > index 4d47a26..3f9bc62 100644
> > --- a/arch/x86/kvm/trace.h
> > +++ b/arch/x86/kvm/trace.h
> > @@ -953,26 +953,6 @@ TRACE_EVENT(kvm_pvclock_update,
> >                 __entry->flags)
> >  );
> >
> > -TRACE_EVENT(kvm_wait_lapic_expire,
> > -     TP_PROTO(unsigned int vcpu_id, s64 delta),
> > -     TP_ARGS(vcpu_id, delta),
> > -
> > -     TP_STRUCT__entry(
> > -             __field(        unsigned int,   vcpu_id         )
> > -             __field(        s64,            delta           )
> > -     ),
> > -
> > -     TP_fast_assign(
> > -             __entry->vcpu_id           = vcpu_id;
> > -             __entry->delta             = delta;
> > -     ),
> > -
> > -     TP_printk("vcpu %u: delta %lld (%s)",
> > -               __entry->vcpu_id,
> > -               __entry->delta,
> > -               __entry->delta < 0 ? "early" : "late")
> > -);
> > -
> >  TRACE_EVENT(kvm_enter_smm,
> >       TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
> >       TP_ARGS(vcpu_id, smbase, entering),
> > --
> > 2.7.4
> >

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2 0/4] KVM: LAPIC: Optimize timer latency further
@ 2019-05-15  4:11 Wanpeng Li
  0 siblings, 0 replies; 10+ messages in thread
From: Wanpeng Li @ 2019-05-15  4:11 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: Paolo Bonzini, Radim Krčmář

Advance lapic timer tries to hidden the hypervisor overhead between the 
host emulated timer fires and the guest awares the timer is fired. However, 
it just hidden the time between apic_timer_fn/handle_preemption_timer -> 
wait_lapic_expire, instead of the real position of vmentry which is 
mentioned in the orignial commit d0659d946be0 ("KVM: x86: add option to 
advance tscdeadline hrtimer expiration"). There is 700+ cpu cycles between 
the end of wait_lapic_expire and before world switch on my haswell desktop, 
it will be 2400+ cycles if vmentry_l1d_flush is tuned to always. 

This patchset tries to narrow the last gap(wait_lapic_expire -> world switch), 
it takes the real overhead time between apic_timer_fn/handle_preemption_timer
and before world switch into consideration when adaptively tuning timer 
advancement. The patchset can reduce 40% latency (~1600+ cycles to ~1000+ 
cycles on a haswell desktop) for kvm-unit-tests/tscdeadline_latency when 
testing busy waits.

v1 -> v2:
 * fix indent in patch 1/4
 * remove the wait_lapic_expire() tracepoint and expose by debugfs
 * move the call to wait_lapic_expire() into vmx.c and svm.c

Wanpeng Li (4):
  KVM: LAPIC: Extract adaptive tune timer advancement logic
  KVM: LAPIC: Fix lapic_timer_advance_ns parameter overflow
  KVM: LAPIC: Expose per-vCPU timer adavance information to userspace
  KVM: LAPIC: Optimize timer latency further

 arch/x86/kvm/debugfs.c | 16 +++++++++++++
 arch/x86/kvm/lapic.c   | 62 +++++++++++++++++++++++++++++---------------------
 arch/x86/kvm/lapic.h   |  3 ++-
 arch/x86/kvm/svm.c     |  4 ++++
 arch/x86/kvm/trace.h   | 20 ----------------
 arch/x86/kvm/vmx/vmx.c |  4 ++++
 arch/x86/kvm/x86.c     |  5 +---
 7 files changed, 63 insertions(+), 51 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2019-05-16  3:13 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-15  4:11 [PATCH v2 0/4] KVM: LAPIC: Optimize timer latency further Wanpeng Li
2019-05-15  4:11 ` [PATCH v2 1/4] KVM: LAPIC: Extract adaptive tune timer advancement logic Wanpeng Li
2019-05-15  4:11 ` [PATCH v2 2/4] KVM: LAPIC: Fix lapic_timer_advance_ns parameter overflow Wanpeng Li
2019-05-15  4:11 ` [PATCH v2 3/4] KVM: LAPIC: Expose per-vCPU timer adavance information to userspace Wanpeng Li
2019-05-15 17:21   ` Sean Christopherson
2019-05-16  3:15     ` Wanpeng Li
2019-05-15  4:11 ` [PATCH v2 4/4] KVM: LAPIC: Optimize timer latency further Wanpeng Li
2019-05-15 17:42   ` Sean Christopherson
2019-05-16  3:14     ` Wanpeng Li
  -- strict thread matches above, loose matches on Subject: below --
2019-05-15  4:11 [PATCH v2 0/4] " Wanpeng Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).