* [PATCH v3 0/7] nSVM/SVM features
@ 2022-03-01 14:36 Maxim Levitsky
  2022-03-01 14:36 ` [PATCH v3 1/7] KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running Maxim Levitsky
                   ` (6 more replies)
  0 siblings, 7 replies; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-01 14:36 UTC (permalink / raw)
  To: kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Paolo Bonzini,
	Wanpeng Li, Maxim Levitsky

This is a set of patches for nested SVM features, rebased on top of kvm/queue;
all of them have been posted before.

Best regards,
	Maxim Levitsky

Maxim Levitsky (7):
  KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running
  KVM: x86: nSVM: implement nested LBR virtualization
  KVM: x86: nSVM: implement nested VMLOAD/VMSAVE
  KVM: x86: nSVM: support PAUSE filter threshold and count when
    cpu_pm=on
  KVM: x86: nSVM: implement nested vGIF
  KVM: x86: SVM: allow to force AVIC to be enabled
  KVM: x86: SVM: allow AVIC to co-exist with a nested guest running

 arch/x86/include/asm/kvm-x86-ops.h |   1 +
 arch/x86/include/asm/kvm_host.h    |   7 +-
 arch/x86/kvm/svm/avic.c            |   6 +-
 arch/x86/kvm/svm/nested.c          | 101 +++++++++++++---
 arch/x86/kvm/svm/svm.c             | 177 +++++++++++++++++++++++------
 arch/x86/kvm/svm/svm.h             |  39 ++++++-
 arch/x86/kvm/x86.c                 |  15 ++-
 7 files changed, 290 insertions(+), 56 deletions(-)

-- 
2.26.3




* [PATCH v3 1/7] KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running
  2022-03-01 14:36 [PATCH v3 0/7] nSVM/SVM features Maxim Levitsky
@ 2022-03-01 14:36 ` Maxim Levitsky
  2022-03-09 13:00   ` Paolo Bonzini
  2022-03-01 14:36 ` [PATCH v3 2/7] KVM: x86: nSVM: implement nested LBR virtualization Maxim Levitsky
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-01 14:36 UTC (permalink / raw)
  To: kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Paolo Bonzini,
	Wanpeng Li, Maxim Levitsky

When L2 is running without LBR virtualization, we should ensure
that L1's LBR MSRs continue to be updated as usual.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 11 +++++
 arch/x86/kvm/svm/svm.c    | 98 +++++++++++++++++++++++++++++++--------
 arch/x86/kvm/svm/svm.h    |  2 +
 3 files changed, 92 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 96bab464967f2..d026f89ee94e6 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -564,6 +564,9 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
 		svm->vcpu.arch.dr6  = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
 		vmcb_mark_dirty(svm->vmcb, VMCB_DR);
 	}
+
+	if (unlikely(svm->vmcb01.ptr->control.virt_ext & LBR_CTL_ENABLE_MASK))
+		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
 }
 
 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
@@ -621,6 +624,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 	svm->vmcb->control.event_inj           = svm->nested.ctl.event_inj;
 	svm->vmcb->control.event_inj_err       = svm->nested.ctl.event_inj_err;
 
+	svm->vmcb->control.virt_ext            = svm->vmcb01.ptr->control.virt_ext &
+						 LBR_CTL_ENABLE_MASK;
+
 	nested_svm_transition_tlb_flush(vcpu);
 
 	/* Enter Guest-Mode */
@@ -883,6 +889,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	svm_switch_vmcb(svm, &svm->vmcb01);
 
+	if (unlikely(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+		svm_copy_lbrs(svm->nested.vmcb02.ptr, svm->vmcb);
+		svm_update_lbrv(vcpu);
+	}
+
 	/*
 	 * On vmexit the  GIF is set to false and
 	 * no event can be injected in L1.
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7038c76fa8410..aa6c04d15d2a7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -803,6 +803,17 @@ static void init_msrpm_offsets(void)
 	}
 }
 
+void svm_copy_lbrs(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+	to_vmcb->save.dbgctl		= from_vmcb->save.dbgctl;
+	to_vmcb->save.br_from		= from_vmcb->save.br_from;
+	to_vmcb->save.br_to		= from_vmcb->save.br_to;
+	to_vmcb->save.last_excp_from	= from_vmcb->save.last_excp_from;
+	to_vmcb->save.last_excp_to	= from_vmcb->save.last_excp_to;
+
+	vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+}
+
 static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -812,6 +823,10 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+
+	/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+	if (is_guest_mode(vcpu))
+		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
 }
 
 static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
@@ -823,6 +838,63 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
 	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+
+	/*
+	 * Move the LBR msrs back to the vmcb01 to avoid copying them
+	 * on nested guest entries.
+	 */
+	if (is_guest_mode(vcpu))
+		svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
+}
+
+static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
+{
+	/*
+	 * If the LBR virtualization is disabled, the LBR msrs are always
+	 * kept in the vmcb01 to avoid copying them on nested guest entries.
+	 *
+	 * If nested, and the LBR virtualization is enabled/disabled, the msrs
+	 * are moved between the vmcb01 and vmcb02 as needed.
+	 */
+	struct vmcb *vmcb =
+		(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
+			svm->vmcb : svm->vmcb01.ptr;
+
+	switch (index) {
+	case MSR_IA32_DEBUGCTLMSR:
+		return vmcb->save.dbgctl;
+	case MSR_IA32_LASTBRANCHFROMIP:
+		return vmcb->save.br_from;
+	case MSR_IA32_LASTBRANCHTOIP:
+		return vmcb->save.br_to;
+	case MSR_IA32_LASTINTFROMIP:
+		return vmcb->save.last_excp_from;
+	case MSR_IA32_LASTINTTOIP:
+		return vmcb->save.last_excp_to;
+	default:
+		KVM_BUG(false, svm->vcpu.kvm,
+			"%s: Unknown MSR 0x%x", __func__, index);
+		return 0;
+	}
+}
+
+void svm_update_lbrv(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
+					   DEBUGCTLMSR_LBR;
+
+	bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
+				      LBR_CTL_ENABLE_MASK);
+
+	if (enable_lbrv == current_enable_lbrv)
+		return;
+
+	if (enable_lbrv)
+		svm_enable_lbrv(vcpu);
+	else
+		svm_disable_lbrv(vcpu);
 }
 
 void disable_nmi_singlestep(struct vcpu_svm *svm)
@@ -2588,25 +2660,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_TSC_AUX:
 		msr_info->data = svm->tsc_aux;
 		break;
-	/*
-	 * Nobody will change the following 5 values in the VMCB so we can
-	 * safely return them on rdmsr. They will always be 0 until LBRV is
-	 * implemented.
-	 */
 	case MSR_IA32_DEBUGCTLMSR:
-		msr_info->data = svm->vmcb->save.dbgctl;
-		break;
 	case MSR_IA32_LASTBRANCHFROMIP:
-		msr_info->data = svm->vmcb->save.br_from;
-		break;
 	case MSR_IA32_LASTBRANCHTOIP:
-		msr_info->data = svm->vmcb->save.br_to;
-		break;
 	case MSR_IA32_LASTINTFROMIP:
-		msr_info->data = svm->vmcb->save.last_excp_from;
-		break;
 	case MSR_IA32_LASTINTTOIP:
-		msr_info->data = svm->vmcb->save.last_excp_to;
+		msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
 		break;
 	case MSR_VM_HSAVE_PA:
 		msr_info->data = svm->nested.hsave_msr;
@@ -2837,12 +2896,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		if (data & DEBUGCTL_RESERVED_BITS)
 			return 1;
 
-		svm->vmcb->save.dbgctl = data;
-		vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
-		if (data & (1ULL<<0))
-			svm_enable_lbrv(vcpu);
+		if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
+			svm->vmcb->save.dbgctl = data;
 		else
-			svm_disable_lbrv(vcpu);
+			svm->vmcb01.ptr->save.dbgctl = data;
+
+		svm_update_lbrv(vcpu);
+
 		break;
 	case MSR_VM_HSAVE_PA:
 		/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 70850cbe5bcb5..2d0487968cba4 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -488,6 +488,8 @@ u32 svm_msrpm_offset(u32 msr);
 u32 *svm_vcpu_alloc_msrpm(void);
 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
 void svm_vcpu_free_msrpm(u32 *msrpm);
+void svm_copy_lbrs(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
+void svm_update_lbrv(struct kvm_vcpu *vcpu);
 
 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-- 
2.26.3



* [PATCH v3 2/7] KVM: x86: nSVM: implement nested LBR virtualization
  2022-03-01 14:36 [PATCH v3 0/7] nSVM/SVM features Maxim Levitsky
  2022-03-01 14:36 ` [PATCH v3 1/7] KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running Maxim Levitsky
@ 2022-03-01 14:36 ` Maxim Levitsky
  2022-03-09 13:00   ` Paolo Bonzini
  2022-03-01 14:36 ` [PATCH v3 3/7] KVM: x86: nSVM: implement nested VMLOAD/VMSAVE Maxim Levitsky
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-01 14:36 UTC (permalink / raw)
  To: kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Paolo Bonzini,
	Wanpeng Li, Maxim Levitsky

This was tested with a kvm-unit-test that was developed
for this purpose.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 21 +++++++++++++++++++--
 arch/x86/kvm/svm/svm.c    |  8 ++++++++
 arch/x86/kvm/svm/svm.h    |  1 +
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index d026f89ee94e6..12e856bfce408 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -565,8 +565,19 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
 		vmcb_mark_dirty(svm->vmcb, VMCB_DR);
 	}
 
-	if (unlikely(svm->vmcb01.ptr->control.virt_ext & LBR_CTL_ENABLE_MASK))
+	if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+
+		/* Copy LBR related registers from vmcb12,
+		 * but make sure that we only pick LBR enable bit from the guest.
+		 */
+
+		svm_copy_lbrs(vmcb12, svm->vmcb);
+		svm->vmcb->save.dbgctl &= LBR_CTL_ENABLE_MASK;
+		svm_update_lbrv(&svm->vcpu);
+
+	} else if (unlikely(svm->vmcb01.ptr->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
 		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
+	}
 }
 
 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
@@ -626,6 +637,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 
 	svm->vmcb->control.virt_ext            = svm->vmcb01.ptr->control.virt_ext &
 						 LBR_CTL_ENABLE_MASK;
+	if (svm->lbrv_enabled)
+		svm->vmcb->control.virt_ext  |=
+			(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
 
 	nested_svm_transition_tlb_flush(vcpu);
 
@@ -889,7 +903,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	svm_switch_vmcb(svm, &svm->vmcb01);
 
-	if (unlikely(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+	if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+		svm_copy_lbrs(svm->nested.vmcb02.ptr, vmcb12);
+		svm_update_lbrv(vcpu);
+	} else if (unlikely(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
 		svm_copy_lbrs(svm->nested.vmcb02.ptr, svm->vmcb);
 		svm_update_lbrv(vcpu);
 	}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index aa6c04d15d2a7..8d87f84f93f30 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -888,6 +888,10 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu)
 	bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
 				      LBR_CTL_ENABLE_MASK);
 
+	if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
+		if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
+			enable_lbrv = true;
+
 	if (enable_lbrv == current_enable_lbrv)
 		return;
 
@@ -3998,6 +4002,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 			     guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
 
 	svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
+	svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
 
 	svm_recalc_instruction_intercepts(vcpu, svm);
 
@@ -4748,6 +4753,9 @@ static __init void svm_set_cpu_caps(void)
 		if (tsc_scaling)
 			kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
 
+		if (lbrv)
+			kvm_cpu_cap_set(X86_FEATURE_LBRV);
+
 		/* Nested VM can receive #VMEXIT instead of triggering #GP */
 		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
 	}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 2d0487968cba4..86dc1f997e01d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -232,6 +232,7 @@ struct vcpu_svm {
 	/* cached guest cpuid flags for faster access */
 	bool nrips_enabled                : 1;
 	bool tsc_scaling_enabled          : 1;
+	bool lbrv_enabled                 : 1;
 
 	u32 ldr_reg;
 	u32 dfr_reg;
-- 
2.26.3



* [PATCH v3 3/7] KVM: x86: nSVM: implement nested VMLOAD/VMSAVE
  2022-03-01 14:36 [PATCH v3 0/7] nSVM/SVM features Maxim Levitsky
  2022-03-01 14:36 ` [PATCH v3 1/7] KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running Maxim Levitsky
  2022-03-01 14:36 ` [PATCH v3 2/7] KVM: x86: nSVM: implement nested LBR virtualization Maxim Levitsky
@ 2022-03-01 14:36 ` Maxim Levitsky
  2022-03-01 14:36 ` [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on Maxim Levitsky
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-01 14:36 UTC (permalink / raw)
  To: kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Paolo Bonzini,
	Wanpeng Li, Maxim Levitsky

This was tested by booting L1, L2 and L3 (all Linux) and checking
that no VMLOAD/VMSAVE vmexits happened.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 35 +++++++++++++++++++++++++++++------
 arch/x86/kvm/svm/svm.c    |  7 +++++++
 arch/x86/kvm/svm/svm.h    |  8 +++++++-
 3 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 12e856bfce408..37510cb206190 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -121,6 +121,20 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 }
 
+static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
+{
+	if (!svm->v_vmload_vmsave_enabled)
+		return true;
+
+	if (!nested_npt_enabled(svm))
+		return true;
+
+	if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+		return true;
+
+	return false;
+}
+
 void recalc_intercepts(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *c, *h;
@@ -162,8 +176,17 @@ void recalc_intercepts(struct vcpu_svm *svm)
 	if (!intercept_smi)
 		vmcb_clr_intercept(c, INTERCEPT_SMI);
 
-	vmcb_set_intercept(c, INTERCEPT_VMLOAD);
-	vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+	if (nested_vmcb_needs_vls_intercept(svm)) {
+		/*
+		 * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
+		 * we must intercept these instructions to correctly
+		 * emulate them in case L1 doesn't intercept them.
+		 */
+		vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+		vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+	} else {
+		WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+	}
 }
 
 /*
@@ -454,10 +477,7 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
 	vmcb12->control.exit_int_info = exit_int_info;
 }
 
-static inline bool nested_npt_enabled(struct vcpu_svm *svm)
-{
-	return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
-}
+
 
 static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
 {
@@ -641,6 +661,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 		svm->vmcb->control.virt_ext  |=
 			(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
 
+	if (!nested_vmcb_needs_vls_intercept(svm))
+		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
 	nested_svm_transition_tlb_flush(vcpu);
 
 	/* Enter Guest-Mode */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 8d87f84f93f30..6a571eed32ef4 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1049,6 +1049,8 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
 
 		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
 		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+
+		svm->v_vmload_vmsave_enabled = false;
 	} else {
 		/*
 		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -4004,6 +4006,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
 	svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
 
+	svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
+
 	svm_recalc_instruction_intercepts(vcpu, svm);
 
 	/* For sev guests, the memory encryption bit is not reserved in CR3.  */
@@ -4756,6 +4760,9 @@ static __init void svm_set_cpu_caps(void)
 		if (lbrv)
 			kvm_cpu_cap_set(X86_FEATURE_LBRV);
 
+		if (vls)
+			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
+
 		/* Nested VM can receive #VMEXIT instead of triggering #GP */
 		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
 	}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 86dc1f997e01d..a3c93f9c02847 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -229,10 +229,11 @@ struct vcpu_svm {
 	unsigned int3_injected;
 	unsigned long int3_rip;
 
-	/* cached guest cpuid flags for faster access */
+	/* optional nested SVM features that are enabled for this guest  */
 	bool nrips_enabled                : 1;
 	bool tsc_scaling_enabled          : 1;
 	bool lbrv_enabled                 : 1;
+	bool v_vmload_vmsave_enabled      : 1;
 
 	u32 ldr_reg;
 	u32 dfr_reg;
@@ -480,6 +481,11 @@ static inline bool gif_set(struct vcpu_svm *svm)
 		return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
+static inline bool nested_npt_enabled(struct vcpu_svm *svm)
+{
+	return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+}
+
 /* svm.c */
 #define MSR_INVALID				0xffffffffU
 
-- 
2.26.3



* [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-01 14:36 [PATCH v3 0/7] nSVM/SVM features Maxim Levitsky
                   ` (2 preceding siblings ...)
  2022-03-01 14:36 ` [PATCH v3 3/7] KVM: x86: nSVM: implement nested VMLOAD/VMSAVE Maxim Levitsky
@ 2022-03-01 14:36 ` Maxim Levitsky
  2022-03-09 13:12   ` Paolo Bonzini
  2022-03-09 18:35   ` Jim Mattson
  2022-03-01 14:36 ` [PATCH v3 5/7] KVM: x86: nSVM: implement nested vGIF Maxim Levitsky
                   ` (2 subsequent siblings)
  6 siblings, 2 replies; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-01 14:36 UTC (permalink / raw)
  To: kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Paolo Bonzini,
	Wanpeng Li, Maxim Levitsky

Allow L1 to use these settings if L0 disables PAUSE interception
(i.e. when running with cpu_pm=on).

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/nested.c |  6 ++++++
 arch/x86/kvm/svm/svm.c    | 17 +++++++++++++++++
 arch/x86/kvm/svm/svm.h    |  2 ++
 3 files changed, 25 insertions(+)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 37510cb206190..4cb0bc49986d5 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -664,6 +664,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 	if (!nested_vmcb_needs_vls_intercept(svm))
 		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
 
+	if (svm->pause_filter_enabled)
+		svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count;
+
+	if (svm->pause_threshold_enabled)
+		svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;
+
 	nested_svm_transition_tlb_flush(vcpu);
 
 	/* Enter Guest-Mode */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 6a571eed32ef4..52198e63c5fc4 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4008,6 +4008,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
 	svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
 
+	if (kvm_pause_in_guest(vcpu->kvm)) {
+		svm->pause_filter_enabled = pause_filter_count > 0 &&
+					    guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+
+		svm->pause_threshold_enabled = pause_filter_thresh > 0 &&
+					    guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+	} else {
+		svm->pause_filter_enabled = false;
+		svm->pause_threshold_enabled = false;
+	}
+
 	svm_recalc_instruction_intercepts(vcpu, svm);
 
 	/* For sev guests, the memory encryption bit is not reserved in CR3.  */
@@ -4763,6 +4774,12 @@ static __init void svm_set_cpu_caps(void)
 		if (vls)
 			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
 
+		if (pause_filter_count)
+			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
+
+		if (pause_filter_thresh)
+			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+
 		/* Nested VM can receive #VMEXIT instead of triggering #GP */
 		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
 	}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a3c93f9c02847..6fa81eb3ffb78 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -234,6 +234,8 @@ struct vcpu_svm {
 	bool tsc_scaling_enabled          : 1;
 	bool lbrv_enabled                 : 1;
 	bool v_vmload_vmsave_enabled      : 1;
+	bool pause_filter_enabled         : 1;
+	bool pause_threshold_enabled      : 1;
 
 	u32 ldr_reg;
 	u32 dfr_reg;
-- 
2.26.3



* [PATCH v3 5/7] KVM: x86: nSVM: implement nested vGIF
  2022-03-01 14:36 [PATCH v3 0/7] nSVM/SVM features Maxim Levitsky
                   ` (3 preceding siblings ...)
  2022-03-01 14:36 ` [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on Maxim Levitsky
@ 2022-03-01 14:36 ` Maxim Levitsky
  2022-03-09 13:40   ` Paolo Bonzini
  2022-03-01 14:36 ` [PATCH v3 6/7] KVM: x86: SVM: allow to force AVIC to be enabled Maxim Levitsky
  2022-03-01 14:36 ` [PATCH v3 7/7] KVM: x86: SVM: allow AVIC to co-exist with a nested guest running Maxim Levitsky
  6 siblings, 1 reply; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-01 14:36 UTC (permalink / raw)
  To: kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Paolo Bonzini,
	Wanpeng Li, Maxim Levitsky

When L1 enables vGIF for L2, L2 cannot affect L1's GIF, regardless
of the STGI/CLGI intercepts, and since VM entry enables GIF, this means
that L1's GIF is always 1 while L2 is running.

Thus, in this case, keep L1's vGIF in vmcb01, while letting L2
control vGIF in vmcb02, thereby implementing nested vGIF.

Also allow KVM to toggle L1's GIF during nested entry/exit
by always using vmcb01.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 17 +++++++++++++----
 arch/x86/kvm/svm/svm.c    |  5 +++++
 arch/x86/kvm/svm/svm.h    | 25 +++++++++++++++++++++----
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4cb0bc49986d5..4f8b5a330096e 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -436,6 +436,10 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
 		 */
 		mask &= ~V_IRQ_MASK;
 	}
+
+	if (nested_vgif_enabled(svm))
+		mask |= V_GIF_MASK;
+
 	svm->nested.ctl.int_ctl        &= ~mask;
 	svm->nested.ctl.int_ctl        |= svm->vmcb->control.int_ctl & mask;
 }
@@ -602,10 +606,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
 
 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 {
-	const u32 int_ctl_vmcb01_bits =
-		V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK;
-
-	const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
+	u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
+	u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
 
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 
@@ -620,6 +622,13 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 	 */
 	WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
 
+
+
+	if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+		int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+	else
+		int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+
 	/* Copied from vmcb01.  msrpm_base can be overwritten later.  */
 	svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
 	svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 52198e63c5fc4..776585dd77769 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4019,6 +4019,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 		svm->pause_threshold_enabled = false;
 	}
 
+	svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+
 	svm_recalc_instruction_intercepts(vcpu, svm);
 
 	/* For sev guests, the memory encryption bit is not reserved in CR3.  */
@@ -4780,6 +4782,9 @@ static __init void svm_set_cpu_caps(void)
 		if (pause_filter_thresh)
 			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
 
+		if (vgif)
+			kvm_cpu_cap_set(X86_FEATURE_VGIF);
+
 		/* Nested VM can receive #VMEXIT instead of triggering #GP */
 		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
 	}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 6fa81eb3ffb78..7e2f9bddf47dd 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -236,6 +236,7 @@ struct vcpu_svm {
 	bool v_vmload_vmsave_enabled      : 1;
 	bool pause_filter_enabled         : 1;
 	bool pause_threshold_enabled      : 1;
+	bool vgif_enabled                 : 1;
 
 	u32 ldr_reg;
 	u32 dfr_reg;
@@ -454,31 +455,47 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
 	return vmcb_is_intercept(&svm->vmcb->control, bit);
 }
 
+static bool nested_vgif_enabled(struct vcpu_svm *svm)
+{
+	if (!is_guest_mode(&svm->vcpu) || !svm->vgif_enabled)
+		return false;
+	return svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK;
+}
+
 static inline bool vgif_enabled(struct vcpu_svm *svm)
 {
-	return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
+	struct vmcb *vmcb = nested_vgif_enabled(svm) ? svm->vmcb01.ptr : svm->vmcb;
+
+	return !!(vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
 }
 
 static inline void enable_gif(struct vcpu_svm *svm)
 {
+	struct vmcb *vmcb = nested_vgif_enabled(svm) ? svm->vmcb01.ptr : svm->vmcb;
+
 	if (vgif_enabled(svm))
-		svm->vmcb->control.int_ctl |= V_GIF_MASK;
+		vmcb->control.int_ctl |= V_GIF_MASK;
 	else
 		svm->vcpu.arch.hflags |= HF_GIF_MASK;
 }
 
 static inline void disable_gif(struct vcpu_svm *svm)
 {
+	struct vmcb *vmcb = nested_vgif_enabled(svm) ? svm->vmcb01.ptr : svm->vmcb;
+
 	if (vgif_enabled(svm))
-		svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
+		vmcb->control.int_ctl &= ~V_GIF_MASK;
 	else
 		svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+
 }
 
 static inline bool gif_set(struct vcpu_svm *svm)
 {
+	struct vmcb *vmcb = nested_vgif_enabled(svm) ? svm->vmcb01.ptr : svm->vmcb;
+
 	if (vgif_enabled(svm))
-		return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
+		return !!(vmcb->control.int_ctl & V_GIF_MASK);
 	else
 		return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
-- 
2.26.3



* [PATCH v3 6/7] KVM: x86: SVM: allow to force AVIC to be enabled
  2022-03-01 14:36 [PATCH v3 0/7] nSVM/SVM features Maxim Levitsky
                   ` (4 preceding siblings ...)
  2022-03-01 14:36 ` [PATCH v3 5/7] KVM: x86: nSVM: implement nested vGIF Maxim Levitsky
@ 2022-03-01 14:36 ` Maxim Levitsky
  2022-03-09 13:41   ` Paolo Bonzini
  2022-03-01 14:36 ` [PATCH v3 7/7] KVM: x86: SVM: allow AVIC to co-exist with a nested guest running Maxim Levitsky
  6 siblings, 1 reply; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-01 14:36 UTC (permalink / raw)
  To: kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Paolo Bonzini,
	Wanpeng Li, Maxim Levitsky

Apparently on some systems AVIC is disabled in CPUID but still usable.

Allow the user to override the CPUID if the user is willing to
take the risk.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 776585dd77769..a26b4c899899e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -202,6 +202,9 @@ module_param(tsc_scaling, int, 0444);
 static bool avic;
 module_param(avic, bool, 0444);
 
+static bool force_avic;
+module_param_unsafe(force_avic, bool, 0444);
+
 bool __read_mostly dump_invalid_vmcb;
 module_param(dump_invalid_vmcb, bool, 0644);
 
@@ -4896,10 +4899,14 @@ static __init int svm_hardware_setup(void)
 			nrips = false;
 	}
 
-	enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
+	enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
 
 	if (enable_apicv) {
-		pr_info("AVIC enabled\n");
+		if (!boot_cpu_has(X86_FEATURE_AVIC)) {
+			pr_warn("AVIC is not supported in CPUID but force enabled");
+			pr_warn("Your system might crash and burn");
+		} else
+			pr_info("AVIC enabled\n");
 
 		amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
 	} else {
-- 
2.26.3



* [PATCH v3 7/7] KVM: x86: SVM: allow AVIC to co-exist with a nested guest running
  2022-03-01 14:36 [PATCH v3 0/7] nSVM/SVM features Maxim Levitsky
                   ` (5 preceding siblings ...)
  2022-03-01 14:36 ` [PATCH v3 6/7] KVM: x86: SVM: allow to force AVIC to be enabled Maxim Levitsky
@ 2022-03-01 14:36 ` Maxim Levitsky
  2022-03-09 13:50   ` Paolo Bonzini
  6 siblings, 1 reply; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-01 14:36 UTC (permalink / raw)
  To: kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Paolo Bonzini,
	Wanpeng Li, Maxim Levitsky

Inhibit the AVIC of the vCPU that is running nested for the duration of the
nested run, so that all interrupts arriving from both its vCPU siblings
and from KVM are delivered using normal IPIs and cause that vCPU to vmexit.

Note that unlike normal AVIC inhibition, there is no need to
update the AVIC mmio memslot, because the nested guest uses its
own set of paging tables.
That also means that AVIC doesn't need to be inhibited VM wide.

Note that, in theory, when a nested guest doesn't intercept
physical interrupts, we could continue using AVIC to deliver them
to it, but we don't bother doing so for now. Moreover, once nested AVIC
is implemented, the nested guest will likely use it, which would
prevent this optimization from being used anyway
(the real AVIC can't support both L1 and L2 at the same time).

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  7 ++++++-
 arch/x86/kvm/svm/avic.c            |  6 +++++-
 arch/x86/kvm/svm/nested.c          | 15 ++++++++++-----
 arch/x86/kvm/svm/svm.c             | 31 +++++++++++++++++++-----------
 arch/x86/kvm/svm/svm.h             |  1 +
 arch/x86/kvm/x86.c                 | 15 +++++++++++++--
 7 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 29affccb353cd..eb16e32117610 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -126,6 +126,7 @@ KVM_X86_OP_OPTIONAL(migrate_timers)
 KVM_X86_OP(msr_filter_changed)
 KVM_X86_OP(complete_emulated_msr)
 KVM_X86_OP(vcpu_deliver_sipi_vector)
+KVM_X86_OP_OPTIONAL_RET0(vcpu_has_apicv_inhibit_condition);
 
 #undef KVM_X86_OP
 #undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ccec837e520d8..efe7414361de8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1039,7 +1039,6 @@ struct kvm_x86_msr_filter {
 
 #define APICV_INHIBIT_REASON_DISABLE    0
 #define APICV_INHIBIT_REASON_HYPERV     1
-#define APICV_INHIBIT_REASON_NESTED     2
 #define APICV_INHIBIT_REASON_IRQWIN     3
 #define APICV_INHIBIT_REASON_PIT_REINJ  4
 #define APICV_INHIBIT_REASON_X2APIC	5
@@ -1490,6 +1489,12 @@ struct kvm_x86_ops {
 	int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
 
 	void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+
+	/*
+	 * Returns true if for some reason APICv (e.g guest mode)
+	 * must be inhibited on this vCPU
+	 */
+	bool (*vcpu_has_apicv_inhibit_condition)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index aea0b13773fd3..d5ce0868c5a74 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -357,6 +357,11 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+bool avic_has_vcpu_inhibit_condition(struct kvm_vcpu *vcpu)
+{
+	return is_guest_mode(vcpu);
+}
+
 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
 {
 	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
@@ -859,7 +864,6 @@ bool avic_check_apicv_inhibit_reasons(ulong bit)
 	ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
 			  BIT(APICV_INHIBIT_REASON_ABSENT) |
 			  BIT(APICV_INHIBIT_REASON_HYPERV) |
-			  BIT(APICV_INHIBIT_REASON_NESTED) |
 			  BIT(APICV_INHIBIT_REASON_IRQWIN) |
 			  BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
 			  BIT(APICV_INHIBIT_REASON_X2APIC) |
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4f8b5a330096e..573e0be84e874 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -616,11 +616,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 	 * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
 	 */
 
-	/*
-	 * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
-	 * avic_physical_id.
-	 */
-	WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
 
 
 
@@ -746,6 +741,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
 
 	svm_set_gif(svm, true);
 
+	if (kvm_vcpu_apicv_active(vcpu))
+		kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+
 	return 0;
 }
 
@@ -1018,6 +1016,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
 		kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
 
+	/*
+	 * Un-inhibit the AVIC right away, so that other vCPUs can start
+	 * to benefit from VM-exit less IPI right away
+	 */
+	if (kvm_apicv_activated(vcpu->kvm))
+		kvm_vcpu_update_apicv(vcpu);
+
 	return 0;
 }
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a26b4c899899e..ed620d6f68b91 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1474,7 +1474,8 @@ static void svm_set_vintr(struct vcpu_svm *svm)
 	/*
 	 * The following fields are ignored when AVIC is enabled
 	 */
-	WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
+	if (!is_guest_mode(&svm->vcpu))
+		WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
 
 	svm_set_intercept(svm, INTERCEPT_VINTR);
 
@@ -2968,10 +2969,16 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
 	svm_clear_vintr(to_svm(vcpu));
 
 	/*
-	 * For AVIC, the only reason to end up here is ExtINTs.
+	 * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
 	 * In this case AVIC was temporarily disabled for
 	 * requesting the IRQ window and we have to re-enable it.
+	 *
+	 * If running nested, still uninhibit the AVIC in case irq window
+	 * was requested when it was not running nested.
+	 * All vCPUs which run nested will have their AVIC still
+	 * inhibited due to AVIC inhibition override for that.
 	 */
+
 	kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN);
 
 	++vcpu->stat.irq_window_exits;
@@ -3569,8 +3576,16 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
 		 * unless we have pending ExtINT since it cannot be injected
 		 * via AVIC. In such case, we need to temporarily disable AVIC,
 		 * and fallback to injecting IRQ via V_IRQ.
+		 *
+		 * If running nested, this vCPU will use separate page tables
+		 * which don't have L1's AVIC mapped, and the AVIC is
+		 * already inhibited thus there is no need for global
+		 * AVIC inhibition.
 		 */
-		kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
+
+		if (!is_guest_mode(vcpu))
+			kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
+
 		svm_set_vintr(svm);
 	}
 }
@@ -4041,14 +4056,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 		if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
 			kvm_request_apicv_update(vcpu->kvm, false,
 						 APICV_INHIBIT_REASON_X2APIC);
-
-		/*
-		 * Currently, AVIC does not work with nested virtualization.
-		 * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
-		 */
-		if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
-			kvm_request_apicv_update(vcpu->kvm, false,
-						 APICV_INHIBIT_REASON_NESTED);
 	}
 	init_vmcb_after_set_cpuid(vcpu);
 }
@@ -4710,6 +4717,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.complete_emulated_msr = svm_complete_emulated_msr,
 
 	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+	.vcpu_has_apicv_inhibit_condition = avic_has_vcpu_inhibit_condition,
 };
 
 /*
@@ -4912,6 +4920,7 @@ static __init int svm_hardware_setup(void)
 	} else {
 		svm_x86_ops.vcpu_blocking = NULL;
 		svm_x86_ops.vcpu_unblocking = NULL;
+		svm_x86_ops.vcpu_has_apicv_inhibit_condition = NULL;
 	}
 
 	if (vls) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 7e2f9bddf47dd..641f3bb23217d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -618,6 +618,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void avic_ring_doorbell(struct kvm_vcpu *vcpu);
+bool avic_has_vcpu_inhibit_condition(struct kvm_vcpu *vcpu);
 
 /* sev.c */
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c712c33c1521f..14b964eb079e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9681,6 +9681,11 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
 	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
 }
 
+static bool vcpu_has_apicv_inhibit_condition(struct kvm_vcpu *vcpu)
+{
+	return static_call(kvm_x86_vcpu_has_apicv_inhibit_condition)(vcpu);
+}
+
 void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 {
 	bool activate;
@@ -9690,7 +9695,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 
 	down_read(&vcpu->kvm->arch.apicv_update_lock);
 
-	activate = kvm_apicv_activated(vcpu->kvm);
+	activate = kvm_apicv_activated(vcpu->kvm) &&
+		   !vcpu_has_apicv_inhibit_condition(vcpu);
+
 	if (vcpu->arch.apicv_active == activate)
 		goto out;
 
@@ -10091,7 +10098,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		 * per-VM state, and responsing vCPUs must wait for the update
 		 * to complete before servicing KVM_REQ_APICV_UPDATE.
 		 */
-		WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+		if (vcpu_has_apicv_inhibit_condition(vcpu))
+			WARN_ON_ONCE(kvm_vcpu_apicv_active(vcpu));
+		else
+			WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+
 
 		exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
 		if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
-- 
2.26.3



* Re: [PATCH v3 1/7] KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running
  2022-03-01 14:36 ` [PATCH v3 1/7] KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running Maxim Levitsky
@ 2022-03-09 13:00   ` Paolo Bonzini
  2022-03-14 11:25     ` Maxim Levitsky
  0 siblings, 1 reply; 28+ messages in thread
From: Paolo Bonzini @ 2022-03-09 13:00 UTC (permalink / raw)
  To: Maxim Levitsky, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On 3/1/22 15:36, Maxim Levitsky wrote:
> +void svm_copy_lbrs(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
> +{
> +	to_vmcb->save.dbgctl		= from_vmcb->save.dbgctl;
> +	to_vmcb->save.br_from		= from_vmcb->save.br_from;
> +	to_vmcb->save.br_to		= from_vmcb->save.br_to;
> +	to_vmcb->save.last_excp_from	= from_vmcb->save.last_excp_from;
> +	to_vmcb->save.last_excp_to	= from_vmcb->save.last_excp_to;
> +
> +	vmcb_mark_dirty(to_vmcb, VMCB_LBR);
> +}
> +

I think "struct vmcb *to_vmcb, struct vmcb *from_vmcb" is more common 
(e.g. svm_copy_vmrun_state, svm_copy_vmloadsave_state).
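I.e., just swapping the declaration order (and the callers' arguments
to match), roughly:

	void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);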

Paolo


* Re: [PATCH v3 2/7] KVM: x86: nSVM: implement nested LBR virtualization
  2022-03-01 14:36 ` [PATCH v3 2/7] KVM: x86: nSVM: implement nested LBR virtualization Maxim Levitsky
@ 2022-03-09 13:00   ` Paolo Bonzini
  2022-03-22 16:53     ` Maxim Levitsky
  0 siblings, 1 reply; 28+ messages in thread
From: Paolo Bonzini @ 2022-03-09 13:00 UTC (permalink / raw)
  To: Maxim Levitsky, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On 3/1/22 15:36, Maxim Levitsky wrote:
> @@ -565,8 +565,19 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
>   		vmcb_mark_dirty(svm->vmcb, VMCB_DR);
>   	}
>   
> -	if (unlikely(svm->vmcb01.ptr->control.virt_ext & LBR_CTL_ENABLE_MASK))
> +	if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
> +
> +		/* Copy LBR related registers from vmcb12,
> +		 * but make sure that we only pick LBR enable bit from the guest.
> +		 */
> +
> +		svm_copy_lbrs(vmcb12, svm->vmcb);
> +		svm->vmcb->save.dbgctl &= LBR_CTL_ENABLE_MASK;

This should be checked against DEBUGCTL_RESERVED_BITS instead in 
__nested_vmcb_check_save; remember to add dbgctl to struct 
vmcb_save_area_cached too.
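Roughly something like this in __nested_vmcb_check_save() (illustrative
only; it assumes DEBUGCTL_RESERVED_BITS is made visible to nested.c and
that dbgctl is added to the cached save area as suggested above):

	if (CC(save->dbgctl & DEBUGCTL_RESERVED_BITS))
		return false;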

Paolo

> +		svm_update_lbrv(&svm->vcpu);
> +
> +	} else if (unlikely(svm->vmcb01.ptr->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
>   		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
> +	}
>   }



* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-01 14:36 ` [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on Maxim Levitsky
@ 2022-03-09 13:12   ` Paolo Bonzini
  2022-03-22 16:52     ` Maxim Levitsky
  2022-03-09 18:35   ` Jim Mattson
  1 sibling, 1 reply; 28+ messages in thread
From: Paolo Bonzini @ 2022-03-09 13:12 UTC (permalink / raw)
  To: Maxim Levitsky, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On 3/1/22 15:36, Maxim Levitsky wrote:
> Allow L1 to use these settings if L0 disables PAUSE interception
> (AKA cpu_pm=on)
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>   arch/x86/kvm/svm/nested.c |  6 ++++++
>   arch/x86/kvm/svm/svm.c    | 17 +++++++++++++++++
>   arch/x86/kvm/svm/svm.h    |  2 ++
>   3 files changed, 25 insertions(+)
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 37510cb206190..4cb0bc49986d5 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -664,6 +664,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
>   	if (!nested_vmcb_needs_vls_intercept(svm))
>   		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
>   
> +	if (svm->pause_filter_enabled)
> +		svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count;
> +
> +	if (svm->pause_threshold_enabled)
> +		svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;

I think this should be

	if (kvm_pause_in_guest(vcpu->kvm)) {
		/* copy from VMCB12 if guest has CPUID, else set to 0 */
	} else {
		/* copy from VMCB01, unconditionally */
	}

and likewise it should be copied back to VMCB01 unconditionally on 
vmexit if !kvm_pause_in_guest(vcpu->kvm).
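As a rough sketch of the vmentry side (reusing this patch's cached flags
to stand in for the CPUID checks; not meant as the final code):

	if (kvm_pause_in_guest(vcpu->kvm)) {
		/* Take vmcb12's values if the feature is exposed to L1, else 0. */
		svm->vmcb->control.pause_filter_count =
			svm->pause_filter_enabled ?
				svm->nested.ctl.pause_filter_count : 0;
		svm->vmcb->control.pause_filter_thresh =
			svm->pause_threshold_enabled ?
				svm->nested.ctl.pause_filter_thresh : 0;
	} else {
		/* PAUSE filtering belongs to L0: keep vmcb01's values. */
		svm->vmcb->control.pause_filter_count =
			svm->vmcb01.ptr->control.pause_filter_count;
		svm->vmcb->control.pause_filter_thresh =
			svm->vmcb01.ptr->control.pause_filter_thresh;
	}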

>   	nested_svm_transition_tlb_flush(vcpu);
>   
>   	/* Enter Guest-Mode */
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 6a571eed32ef4..52198e63c5fc4 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4008,6 +4008,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
>   
>   	svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
>   
> +	if (kvm_pause_in_guest(vcpu->kvm)) {
> +		svm->pause_filter_enabled = pause_filter_count > 0 &&
> +					    guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
> +
> +		svm->pause_threshold_enabled = pause_filter_thresh > 0 &&
> +					    guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);

Why only if the module parameters are >0?  The module parameter is 
unused if pause-in-guest is active.

> +	} else {
> +		svm->pause_filter_enabled = false;
> +		svm->pause_threshold_enabled = false;
> +	}
> +
>   	svm_recalc_instruction_intercepts(vcpu, svm);
>   
>   	/* For sev guests, the memory encryption bit is not reserved in CR3.  */
> @@ -4763,6 +4774,12 @@ static __init void svm_set_cpu_caps(void)
>   		if (vls)
>   			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
>   
> +		if (pause_filter_count)
> +			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
> +
> +		if (pause_filter_thresh)
> +			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);

Likewise, this should be set using just boot_cpu_has, not the module 
parameters.

Paolo

>   		/* Nested VM can receive #VMEXIT instead of triggering #GP */
>   		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
>   	}
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index a3c93f9c02847..6fa81eb3ffb78 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -234,6 +234,8 @@ struct vcpu_svm {
>   	bool tsc_scaling_enabled          : 1;
>   	bool lbrv_enabled                 : 1;
>   	bool v_vmload_vmsave_enabled      : 1;
> +	bool pause_filter_enabled         : 1;
> +	bool pause_threshold_enabled      : 1;
>   
>   	u32 ldr_reg;
>   	u32 dfr_reg;



* Re: [PATCH v3 5/7] KVM: x86: nSVM: implement nested vGIF
  2022-03-01 14:36 ` [PATCH v3 5/7] KVM: x86: nSVM: implement nested vGIF Maxim Levitsky
@ 2022-03-09 13:40   ` Paolo Bonzini
  2022-03-14 15:21     ` Maxim Levitsky
  0 siblings, 1 reply; 28+ messages in thread
From: Paolo Bonzini @ 2022-03-09 13:40 UTC (permalink / raw)
  To: Maxim Levitsky, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

The patch is good but I think it's possible to rewrite some parts in an 
easier way.

On 3/1/22 15:36, Maxim Levitsky wrote:
> 
> +	if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
> +		int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
> +	else
> +		int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);

To remember for later: svm->vmcb's V_GIF_ENABLE_MASK is always the same 
as vgif:

- if it comes from vmcb12, it must be 1 (and then vgif is also 1)

- if it comes from vmcb01, it must be equal to vgif (because 
V_GIF_ENABLE_MASK is set in init_vmcb and never touched again).

>   
> +static bool nested_vgif_enabled(struct vcpu_svm *svm)
> +{
> +	if (!is_guest_mode(&svm->vcpu) || !svm->vgif_enabled)
> +		return false;
> +	return svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK;
> +}
> +
>   static inline bool vgif_enabled(struct vcpu_svm *svm)
>   {
> -	return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
> +	struct vmcb *vmcb = nested_vgif_enabled(svm) ? svm->vmcb01.ptr : svm->vmcb;
> +
> +	return !!(vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
>   }
>   

Slight simplification:

- before the patch, vgif_enabled() is just "vgif", because 
V_GIF_ENABLE_MASK is set in init_vmcb and copied to vmcb02

- after the patch, vgif_enabled() is also just "vgif".  Outside guest 
mode the same reasoning applies.  If L2 has enabled vGIF,  vmcb01's 
V_GIF_ENABLE is equal to vgif per the previous bullet.  If L2 has not 
enabled vGIF, vmcb02's V_GIF_ENABLE uses svm->vmcb's int_ctl field which 
is always the same as vgif (see remark above).

You can make this simplification a separate patch.
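Roughly (sketch only, assuming the vgif module parameter is made visible
outside svm.c):

	static inline bool vgif_enabled(struct vcpu_svm *svm)
	{
		return vgif;
	}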

>  static inline void enable_gif(struct vcpu_svm *svm)
>  {
> +	struct vmcb *vmcb = nested_vgif_enabled(svm) ? svm->vmcb01.ptr : svm->vmcb;
> +
>  	if (vgif_enabled(svm))
> -		svm->vmcb->control.int_ctl |= V_GIF_MASK;
> +		vmcb->control.int_ctl |= V_GIF_MASK;
>  	else
>  		svm->vcpu.arch.hflags |= HF_GIF_MASK;
>  }
>  
>  static inline void disable_gif(struct vcpu_svm *svm)
>  {
> +	struct vmcb *vmcb = nested_vgif_enabled(svm) ? svm->vmcb01.ptr : svm->vmcb;
> +
>  	if (vgif_enabled(svm))
> -		svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
> +		vmcb->control.int_ctl &= ~V_GIF_MASK;
>  	else
>  		svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
> +
>  }

Looks good.  For a little optimization however you could write

static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
{
	if (!vgif)
		return NULL;
	if (!is_guest_mode(&svm->vcpu))
		return svm->vmcb01.ptr;
	if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
		return svm->vmcb01.ptr;
	else
		return svm->nested.vmcb02.ptr;
}

and then

	struct vmcb *vmcb = get_vgif_vmcb(svm);
	if (vmcb)
		/* use vmcb->control.int_ctl */
	else
		/* use hflags */

Paolo

>  
> +static bool nested_vgif_enabled(struct vcpu_svm *svm)
> +{
> +	if (!is_guest_mode(&svm->vcpu) || !svm->vgif_enabled)
> +		return false;
> +	return svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK;
> +}
> +



* Re: [PATCH v3 6/7] KVM: x86: SVM: allow to force AVIC to be enabled
  2022-03-01 14:36 ` [PATCH v3 6/7] KVM: x86: SVM: allow to force AVIC to be enabled Maxim Levitsky
@ 2022-03-09 13:41   ` Paolo Bonzini
  0 siblings, 0 replies; 28+ messages in thread
From: Paolo Bonzini @ 2022-03-09 13:41 UTC (permalink / raw)
  To: Maxim Levitsky, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On 3/1/22 15:36, Maxim Levitsky wrote:
> Apparently on some systems AVIC is disabled in CPUID but still usable.
> 
> Allow the user to override the CPUID if the user is willing to
> take the risk.
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>   arch/x86/kvm/svm/svm.c | 11 +++++++++--
>   1 file changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 776585dd77769..a26b4c899899e 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -202,6 +202,9 @@ module_param(tsc_scaling, int, 0444);
>   static bool avic;
>   module_param(avic, bool, 0444);
>   
> +static bool force_avic;
> +module_param_unsafe(force_avic, bool, 0444);
> +
>   bool __read_mostly dump_invalid_vmcb;
>   module_param(dump_invalid_vmcb, bool, 0644);
>   
> @@ -4896,10 +4899,14 @@ static __init int svm_hardware_setup(void)
>   			nrips = false;
>   	}
>   
> -	enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
> +	enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
>   
>   	if (enable_apicv) {
> -		pr_info("AVIC enabled\n");
> +		if (!boot_cpu_has(X86_FEATURE_AVIC)) {
> +			pr_warn("AVIC is not supported in CPUID but force enabled");
> +			pr_warn("Your system might crash and burn");
> +		} else
> +			pr_info("AVIC enabled\n");
>   
>   		amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
>   	} else {

Queued, thanks.

Paolo


* Re: [PATCH v3 7/7] KVM: x86: SVM: allow AVIC to co-exist with a nested guest running
  2022-03-01 14:36 ` [PATCH v3 7/7] KVM: x86: SVM: allow AVIC to co-exist with a nested guest running Maxim Levitsky
@ 2022-03-09 13:50   ` Paolo Bonzini
  2022-03-09 18:14     ` Maxim Levitsky
  0 siblings, 1 reply; 28+ messages in thread
From: Paolo Bonzini @ 2022-03-09 13:50 UTC (permalink / raw)
  To: Maxim Levitsky, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On 3/1/22 15:36, Maxim Levitsky wrote:
>   	bool activate;
> @@ -9690,7 +9695,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
>   
>   	down_read(&vcpu->kvm->arch.apicv_update_lock);
>   
> -	activate = kvm_apicv_activated(vcpu->kvm);
> +	activate = kvm_apicv_activated(vcpu->kvm) &&
> +		   !vcpu_has_apicv_inhibit_condition(vcpu);
> +
>   	if (vcpu->arch.apicv_active == activate)
>   		goto out;
>   

Perhaps the callback could be named vcpu_apicv_inhibit_reasons, and it would
return APICV_INHIBIT_REASON_NESTED?  Then instead of the new function
vcpu_has_apicv_inhibit_condition(), you would have

bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
{
	ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
	ulong vcpu_reasons = static_call(kvm_x86_vcpu_apicv_inhibit_reasons)(vcpu);

	return (vm_reasons | vcpu_reasons) == 0;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
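
and the check in kvm_vcpu_update_apicv() would then presumably reduce to:

	activate = kvm_vcpu_apicv_activated(vcpu);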

It's mostly aesthetics, but it would also be a bit more self explanatory I think.

Paolo


* Re: [PATCH v3 7/7] KVM: x86: SVM: allow AVIC to co-exist with a nested guest running
  2022-03-09 13:50   ` Paolo Bonzini
@ 2022-03-09 18:14     ` Maxim Levitsky
  0 siblings, 0 replies; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-09 18:14 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On Wed, 2022-03-09 at 14:50 +0100, Paolo Bonzini wrote:
> On 3/1/22 15:36, Maxim Levitsky wrote:
> >   	bool activate;
> > @@ -9690,7 +9695,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
> >   
> >   	down_read(&vcpu->kvm->arch.apicv_update_lock);
> >   
> > -	activate = kvm_apicv_activated(vcpu->kvm);
> > +	activate = kvm_apicv_activated(vcpu->kvm) &&
> > +		   !vcpu_has_apicv_inhibit_condition(vcpu);
> > +
> >   	if (vcpu->arch.apicv_active == activate)
> >   		goto out;
> >   
> 
> Perhaps the callback could be named vcpu_apicv_inhibit_reasons, and it would
> return APICV_INHIBIT_REASON_NESTED?  Then instead of the new function
> vcpu_has_apicv_inhibit_condition(), you would have
> 
> bool kvm_vcpu_apicv_activated(struct vcpu_kvm *kvm)
> {
> 	ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
> 	ulong vcpu_reasons = static_call(kvm_x86_vcpu_apicv_inhibit_reasons)(vcpu);
>          return (vm_reasons | vcpu_reasons) == 0;
> }
> EXPORT_SYMBOL_GPL(kvm_cpu_apicv_activated);
> 
> It's mostly aesthetics, but it would also be a bit more self explanatory I think.
> 
> Paolo
> 

This is a great idea, I will do it in the next version.
Thanks!

Best regards,
	Maxim Levitsky


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-01 14:36 ` [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on Maxim Levitsky
  2022-03-09 13:12   ` Paolo Bonzini
@ 2022-03-09 18:35   ` Jim Mattson
  2022-03-09 18:47     ` Paolo Bonzini
  1 sibling, 1 reply; 28+ messages in thread
From: Jim Mattson @ 2022-03-09 18:35 UTC (permalink / raw)
  To: Maxim Levitsky
  Cc: kvm, Ingo Molnar, Dave Hansen, Sean Christopherson,
	Borislav Petkov, H. Peter Anvin, Thomas Gleixner, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Paolo Bonzini,
	Wanpeng Li

On Tue, Mar 1, 2022 at 6:37 AM Maxim Levitsky <mlevitsk@redhat.com> wrote:
>
> Allow L1 to use these settings if L0 disables PAUSE interception
> (AKA cpu_pm=on)
>
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>

I didn't think pause filtering was virtualizable, since the value of
the internal counter isn't exposed on VM-exit.

On bare metal, for instance, assuming the hypervisor doesn't intercept
CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
the filter count set to 2.

1:
pause
cpuid
jmp 1

Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
iteration, and when L0 resumes L2, the internal counter will be set to
2 again. L1 will never see a PAUSE #VMEXIT.

How do you handle this?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-09 18:35   ` Jim Mattson
@ 2022-03-09 18:47     ` Paolo Bonzini
  2022-03-09 19:07       ` Jim Mattson
  0 siblings, 1 reply; 28+ messages in thread
From: Paolo Bonzini @ 2022-03-09 18:47 UTC (permalink / raw)
  To: Jim Mattson, Maxim Levitsky
  Cc: kvm, Ingo Molnar, Dave Hansen, Sean Christopherson,
	Borislav Petkov, H. Peter Anvin, Thomas Gleixner, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On 3/9/22 19:35, Jim Mattson wrote:
> I didn't think pause filtering was virtualizable, since the value of
> the internal counter isn't exposed on VM-exit.
> 
> On bare metal, for instance, assuming the hypervisor doesn't intercept
> CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> the filter count set to 2.
> 
> 1:
> pause
> cpuid
> jmp 1
> 
> Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> iteration, and when L0 resumes L2, the internal counter will be set to
> 2 again. L1 will never see a PAUSE #VMEXIT.
> 
> How do you handle this?
> 

I would expect that the same would happen on an SMI or a host interrupt.

	1:
	pause
	outl al, 0xb2
	jmp 1

In general a PAUSE vmexit will mostly benefit the VM that is pausing, so 
having a partial implementation would be better than disabling it 
altogether.

Paolo


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-09 18:47     ` Paolo Bonzini
@ 2022-03-09 19:07       ` Jim Mattson
  2022-03-21 21:36         ` Maxim Levitsky
  0 siblings, 1 reply; 28+ messages in thread
From: Jim Mattson @ 2022-03-09 19:07 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Maxim Levitsky, kvm, Ingo Molnar, Dave Hansen,
	Sean Christopherson, Borislav Petkov, H. Peter Anvin,
	Thomas Gleixner, x86, Vitaly Kuznetsov, Joerg Roedel,
	linux-kernel, Wanpeng Li

On Wed, Mar 9, 2022 at 10:47 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 3/9/22 19:35, Jim Mattson wrote:
> > I didn't think pause filtering was virtualizable, since the value of
> > the internal counter isn't exposed on VM-exit.
> >
> > On bare metal, for instance, assuming the hypervisor doesn't intercept
> > CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> > the filter count set to 2.
> >
> > 1:
> > pause
> > cpuid
> > jmp 1
> >
> > Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> > iteration, and when L0 resumes L2, the internal counter will be set to
> > 2 again. L1 will never see a PAUSE #VMEXIT.
> >
> > How do you handle this?
> >
>
> I would expect that the same would happen on an SMI or a host interrupt.
>
>         1:
>         pause
>         outl al, 0xb2
>         jmp 1
>
> In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
> having a partial implementation would be better than disabling it
> altogether.

Indeed, the APM does say, "Certain events, including SMI, can cause
the internal count to be reloaded from the VMCB." However, expanding
that set of events so much that some pause loops will *never* trigger
a #VMEXIT seems problematic. If the hypervisor knew that the PAUSE
filter may not be triggered, it could always choose to exit on every
PAUSE.

Having a partial implementation is only better than disabling it
altogether if the L2 pause loop doesn't contain a hidden #VMEXIT to
L0.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/7] KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running
  2022-03-09 13:00   ` Paolo Bonzini
@ 2022-03-14 11:25     ` Maxim Levitsky
  0 siblings, 0 replies; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-14 11:25 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On Wed, 2022-03-09 at 14:00 +0100, Paolo Bonzini wrote:
> On 3/1/22 15:36, Maxim Levitsky wrote:
> > +void svm_copy_lbrs(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
> > +{
> > +	to_vmcb->save.dbgctl		= from_vmcb->save.dbgctl;
> > +	to_vmcb->save.br_from		= from_vmcb->save.br_from;
> > +	to_vmcb->save.br_to		= from_vmcb->save.br_to;
> > +	to_vmcb->save.last_excp_from	= from_vmcb->save.last_excp_from;
> > +	to_vmcb->save.last_excp_to	= from_vmcb->save.last_excp_to;
> > +
> > +	vmcb_mark_dirty(to_vmcb, VMCB_LBR);
> > +}
> > +
> 
> I think "struct vmcb *to_vmcb, struct vmcb *from_vmcb" is more common 
> (e.g. svm_copy_vmrun_state, svm_copy_vmloadsave_state).
> 
> Paolo
> 
Will do.
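
For reference, that would make the prototype (just restating the agreed
parameter order, destination first):

void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);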

Best regards,
	Maxim Levitsky


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 5/7] KVM: x86: nSVM: implement nested vGIF
  2022-03-09 13:40   ` Paolo Bonzini
@ 2022-03-14 15:21     ` Maxim Levitsky
  0 siblings, 0 replies; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-14 15:21 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On Wed, 2022-03-09 at 14:40 +0100, Paolo Bonzini wrote:
> The patch is good but I think it's possibly to rewrite some parts in an 
> easier way.
> 
> On 3/1/22 15:36, Maxim Levitsky wrote:
> > +	if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
> > +		int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
> > +	else
> > +		int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
> 
> To remember for later: svm->vmcb's V_GIF_ENABLE_MASK is always the same 
> as vgif:
> 
> - if it comes from vmcb12, it must be 1 (and then vgif is also 1)
> 
> - if it comes from vmcb01, it must be equal to vgif (because 
> V_GIF_ENABLE_MASK is set in init_vmcb and never touched again).
> 
> >   
> > +static bool nested_vgif_enabled(struct vcpu_svm *svm)
> > +{
> > +	if (!is_guest_mode(&svm->vcpu) || !svm->vgif_enabled)
> > +		return false;
> > +	return svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK;
> > +}
> > +
> >   static inline bool vgif_enabled(struct vcpu_svm *svm)
> >   {
> > -	return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
> > +	struct vmcb *vmcb = nested_vgif_enabled(svm) ? svm->vmcb01.ptr : svm->vmcb;
> > +
> > +	return !!(vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
> >   }
> >   
> 
> Slight simplification:
> 
> - before the patch, vgif_enabled() is just "vgif", because 
> V_GIF_ENABLE_MASK is set in init_vmcb and copied to vmcb02
> 
> - after the patch, vgif_enabled() is also just "vgif".  Outside guest 
> mode the same reasoning applies.  If L2 has enabled vGIF,  vmcb01's 
> V_GIF_ENABLE is equal to vgif per the previous bullet.  If L2 has not 
> enabled vGIF, vmcb02's V_GIF_ENABLE uses svm->vmcb's int_ctl field which 
> is always the same as vgif (see remark above).
> 
> You can make this simplification a separate patch.


This is a very good idea - I'll do this in a separate patch.

> 
> >  static inline void enable_gif(struct vcpu_svm *svm)
> >  {
> > +	struct vmcb *vmcb = nested_vgif_enabled(svm) ? svm->vmcb01.ptr : svm->vmcb;
> > +
> >  	if (vgif_enabled(svm))
> > -		svm->vmcb->control.int_ctl |= V_GIF_MASK;
> > +		vmcb->control.int_ctl |= V_GIF_MASK;
> >  	else
> >  		svm->vcpu.arch.hflags |= HF_GIF_MASK;
> >  }
> >  
> >  static inline void disable_gif(struct vcpu_svm *svm)
> >  {
> > +	struct vmcb *vmcb = nested_vgif_enabled(svm) ? svm->vmcb01.ptr : svm->vmcb;
> > +
> >  	if (vgif_enabled(svm))
> > -		svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
> > +		vmcb->control.int_ctl &= ~V_GIF_MASK;
> >  	else
> >  		svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
> > +
> >  }
> 
> Looks good.  For a little optimization however you could write
> 
> static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
> {
> 	if (!vgif)
> 		return NULL;
> 	if (!is_guest_mode(&svm->vcpu))
> 		return svm->vmcb01.ptr;
> 	if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
> 		return svm->vmcb01.ptr;
> 	else
> 		return svm->nested.vmcb02.ptr;
> }
> 
> and then
> 
> 	struct vmcb *vmcb = get_vgif_vmcb(svm);
> 	if (vmcb)
> 		/* use vmcb->control.int_ctl */
> 	else
> 		/* use hflags */

Good idea as well.
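
Concretely, enable_gif() would then look roughly like the sketch below (same
idea for disable_gif(), just clearing the bits), using the get_vgif_vmcb()
helper suggested above:

static inline void enable_gif(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = get_vgif_vmcb(svm);

	/* With vGIF, toggle the virtual GIF bit; otherwise fall back to hflags. */
	if (vmcb)
		vmcb->control.int_ctl |= V_GIF_MASK;
	else
		svm->vcpu.arch.hflags |= HF_GIF_MASK;
}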

Thanks for the review!
Best regards,
	Maxim Levitsky

> 
> Paolo
> 
> >  
> > +static bool nested_vgif_enabled(struct vcpu_svm *svm)
> > +{
> > +	if (!is_guest_mode(&svm->vcpu) || !svm->vgif_enabled)
> > +		return false;
> > +	return svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK;
> > +}
> > +



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-09 19:07       ` Jim Mattson
@ 2022-03-21 21:36         ` Maxim Levitsky
  2022-03-21 21:59           ` Jim Mattson
  0 siblings, 1 reply; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-21 21:36 UTC (permalink / raw)
  To: Jim Mattson, Paolo Bonzini
  Cc: kvm, Ingo Molnar, Dave Hansen, Sean Christopherson,
	Borislav Petkov, H. Peter Anvin, Thomas Gleixner, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On Wed, 2022-03-09 at 11:07 -0800, Jim Mattson wrote:
> On Wed, Mar 9, 2022 at 10:47 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
> > On 3/9/22 19:35, Jim Mattson wrote:
> > > I didn't think pause filtering was virtualizable, since the value of
> > > the internal counter isn't exposed on VM-exit.
> > > 
> > > On bare metal, for instance, assuming the hypervisor doesn't intercept
> > > CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> > > the filter count set to 2.
> > > 
> > > 1:
> > > pause
> > > cpuid
> > > jmp 1
> > > 
> > > Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> > > iteration, and when L0 resumes L2, the internal counter will be set to
> > > 2 again. L1 will never see a PAUSE #VMEXIT.
> > > 
> > > How do you handle this?
> > > 
> > 
> > I would expect that the same would happen on an SMI or a host interrupt.
> > 
> >         1:
> >         pause
> >         outl al, 0xb2
> >         jmp 1
> > 
> > In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
> > having a partial implementation would be better than disabling it
> > altogether.
> 
> Indeed, the APM does say, "Certain events, including SMI, can cause
> the internal count to be reloaded from the VMCB." However, expanding
> that set of events so much that some pause loops will *never* trigger
> a #VMEXIT seems problematic. If the hypervisor knew that the PAUSE
> filter may not be triggered, it could always choose to exit on every
> PAUSE.
> 
> Having a partial implementation is only better than disabling it
> altogether if the L2 pause loop doesn't contain a hidden #VMEXIT to
> L0.
> 

Hi!
 
You bring up a very valid point, which I didn't think about.
 
However, after thinking about this, I think that in practice
this isn't a showstopper problem for exposing this feature to the guest.
 

This is what I am thinking:
 
First, let's assume that L2 is malicious. In this case, no doubt
it can craft such a loop which will not VM-exit on PAUSE.
But that isn't a problem - instead of this, the guest could have just used NOP,
which is not possible to intercept anyway - no harm is done.

Now let's assume a non-malicious L2:


First of all, the problem can only happen when a VM exit is intercepted by L0
and not by L1. Both of the above cases usually don't meet this criterion, since
L1 is highly likely to intercept both CPUID and IO port accesses. It is also
highly unlikely to allow L2 direct access to L1's MMIO ranges.

Overall there are very few cases of a deterministic VM exit which is intercepted
by L0 but not by L1. If that happens, then L1 will not catch the PAUSE loop,
which is not much different from missing it because of unsuitable
thresholds.

Also note that this is only an optimization - due to the count and threshold,
it is not guaranteed to catch all pause loops; in fact the hypervisor has
to guess these values and update them in an attempt to catch as many such
loops as it can.

I think that overall it is OK to expose this feature to the guest,
and it should even improve performance in some cases - currently
nested KVM intercepts every PAUSE otherwise.
 
Best regards,
	Maxim Levitsky





^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-21 21:36         ` Maxim Levitsky
@ 2022-03-21 21:59           ` Jim Mattson
  2022-03-21 22:11             ` Maxim Levitsky
  0 siblings, 1 reply; 28+ messages in thread
From: Jim Mattson @ 2022-03-21 21:59 UTC (permalink / raw)
  To: Maxim Levitsky
  Cc: Paolo Bonzini, kvm, Ingo Molnar, Dave Hansen,
	Sean Christopherson, Borislav Petkov, H. Peter Anvin,
	Thomas Gleixner, x86, Vitaly Kuznetsov, Joerg Roedel,
	linux-kernel, Wanpeng Li

On Mon, Mar 21, 2022 at 2:36 PM Maxim Levitsky <mlevitsk@redhat.com> wrote:
>
> On Wed, 2022-03-09 at 11:07 -0800, Jim Mattson wrote:
> > On Wed, Mar 9, 2022 at 10:47 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
> > > On 3/9/22 19:35, Jim Mattson wrote:
> > > > I didn't think pause filtering was virtualizable, since the value of
> > > > the internal counter isn't exposed on VM-exit.
> > > >
> > > > On bare metal, for instance, assuming the hypervisor doesn't intercept
> > > > CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> > > > the filter count set to 2.
> > > >
> > > > 1:
> > > > pause
> > > > cpuid
> > > > jmp 1
> > > >
> > > > Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> > > > iteration, and when L0 resumes L2, the internal counter will be set to
> > > > 2 again. L1 will never see a PAUSE #VMEXIT.
> > > >
> > > > How do you handle this?
> > > >
> > >
> > > I would expect that the same would happen on an SMI or a host interrupt.
> > >
> > >         1:
> > >         pause
> > >         outl al, 0xb2
> > >         jmp 1
> > >
> > > In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
> > > having a partial implementation would be better than disabling it
> > > altogether.
> >
> > Indeed, the APM does say, "Certain events, including SMI, can cause
> > the internal count to be reloaded from the VMCB." However, expanding
> > that set of events so much that some pause loops will *never* trigger
> > a #VMEXIT seems problematic. If the hypervisor knew that the PAUSE
> > filter may not be triggered, it could always choose to exit on every
> > PAUSE.
> >
> > Having a partial implementation is only better than disabling it
> > altogether if the L2 pause loop doesn't contain a hidden #VMEXIT to
> > L0.
> >
>
> Hi!
>
> You bring up a very valid point, which I didn't think about.
>
> However after thinking about this, I think that in practice,
> this isn't a show stopper problem for exposing this feature to the guest.
>
>
> This is what I am thinking:
>
> First lets assume that the L2 is malicious. In this case no doubt
> it can craft such a loop which will not VMexit on PAUSE.
> But that isn't a problem - instead of this guest could have just used NOP
> which is not possible to intercept anyway - no harm is done.
>
> Now lets assume a non malicious L2:
>
>
> First of all the problem can only happen when a VM exit is intercepted by L0,
> and not by L1. Both above cases usually don't pass this criteria since L1 is highly
> likely to intercept both CPUID and IO port access. It is also highly unlikely
> to allow L2 direct access to L1's mmio ranges.
>
> Overall there are very few cases of deterministic vm exit which is intercepted
> by L0 but not L1. If that happens then L1 will not catch the PAUSE loop,
> which is not different much from not catching it because of not suitable
> thresholds.
>
> Also note that this is an optimization only - due to count and threshold,
> it is not guaranteed to catch all pause loops - in fact hypervisor has
> to guess these values, and update them in attempt to catch as many such
> loops as it can.
>
> I think overall it is OK to expose that feature to the guest
> and it should even improve performance in some cases - currently
> at least nested KVM intercepts every PAUSE otherwise.

Can I at least request that this behavior be documented as a KVM
virtual CPU erratum?

>
> Best regards,
>         Maxim Levitsky
>
>
>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-21 21:59           ` Jim Mattson
@ 2022-03-21 22:11             ` Maxim Levitsky
  2022-03-21 22:41               ` Jim Mattson
  0 siblings, 1 reply; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-21 22:11 UTC (permalink / raw)
  To: Jim Mattson
  Cc: Paolo Bonzini, kvm, Ingo Molnar, Dave Hansen,
	Sean Christopherson, Borislav Petkov, H. Peter Anvin,
	Thomas Gleixner, x86, Vitaly Kuznetsov, Joerg Roedel,
	linux-kernel, Wanpeng Li

On Mon, 2022-03-21 at 14:59 -0700, Jim Mattson wrote:
> On Mon, Mar 21, 2022 at 2:36 PM Maxim Levitsky <mlevitsk@redhat.com> wrote:
> > On Wed, 2022-03-09 at 11:07 -0800, Jim Mattson wrote:
> > > On Wed, Mar 9, 2022 at 10:47 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
> > > > On 3/9/22 19:35, Jim Mattson wrote:
> > > > > I didn't think pause filtering was virtualizable, since the value of
> > > > > the internal counter isn't exposed on VM-exit.
> > > > > 
> > > > > On bare metal, for instance, assuming the hypervisor doesn't intercept
> > > > > CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> > > > > the filter count set to 2.
> > > > > 
> > > > > 1:
> > > > > pause
> > > > > cpuid
> > > > > jmp 1
> > > > > 
> > > > > Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> > > > > iteration, and when L0 resumes L2, the internal counter will be set to
> > > > > 2 again. L1 will never see a PAUSE #VMEXIT.
> > > > > 
> > > > > How do you handle this?
> > > > > 
> > > > 
> > > > I would expect that the same would happen on an SMI or a host interrupt.
> > > > 
> > > >         1:
> > > >         pause
> > > >         outl al, 0xb2
> > > >         jmp 1
> > > > 
> > > > In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
> > > > having a partial implementation would be better than disabling it
> > > > altogether.
> > > 
> > > Indeed, the APM does say, "Certain events, including SMI, can cause
> > > the internal count to be reloaded from the VMCB." However, expanding
> > > that set of events so much that some pause loops will *never* trigger
> > > a #VMEXIT seems problematic. If the hypervisor knew that the PAUSE
> > > filter may not be triggered, it could always choose to exit on every
> > > PAUSE.
> > > 
> > > Having a partial implementation is only better than disabling it
> > > altogether if the L2 pause loop doesn't contain a hidden #VMEXIT to
> > > L0.
> > > 
> > 
> > Hi!
> > 
> > You bring up a very valid point, which I didn't think about.
> > 
> > However after thinking about this, I think that in practice,
> > this isn't a show stopper problem for exposing this feature to the guest.
> > 
> > 
> > This is what I am thinking:
> > 
> > First lets assume that the L2 is malicious. In this case no doubt
> > it can craft such a loop which will not VMexit on PAUSE.
> > But that isn't a problem - instead of this guest could have just used NOP
> > which is not possible to intercept anyway - no harm is done.
> > 
> > Now lets assume a non malicious L2:
> > 
> > 
> > First of all the problem can only happen when a VM exit is intercepted by L0,
> > and not by L1. Both above cases usually don't pass this criteria since L1 is highly
> > likely to intercept both CPUID and IO port access. It is also highly unlikely
> > to allow L2 direct access to L1's mmio ranges.
> > 
> > Overall there are very few cases of deterministic vm exit which is intercepted
> > by L0 but not L1. If that happens then L1 will not catch the PAUSE loop,
> > which is not different much from not catching it because of not suitable
> > thresholds.
> > 
> > Also note that this is an optimization only - due to count and threshold,
> > it is not guaranteed to catch all pause loops - in fact hypervisor has
> > to guess these values, and update them in attempt to catch as many such
> > loops as it can.
> > 
> > I think overall it is OK to expose that feature to the guest
> > and it should even improve performance in some cases - currently
> > at least nested KVM intercepts every PAUSE otherwise.
> 
> Can I at least request that this behavior be documented as a KVM
> virtual CPU erratum?

100%. Do you have a pointer where to document it?

Best regards,
	Maxim Levitsky

> 
> > Best regards,
> >         Maxim Levitsky
> > 
> > 
> > 
> > 



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-21 22:11             ` Maxim Levitsky
@ 2022-03-21 22:41               ` Jim Mattson
  2022-03-22 10:12                 ` Paolo Bonzini
  0 siblings, 1 reply; 28+ messages in thread
From: Jim Mattson @ 2022-03-21 22:41 UTC (permalink / raw)
  To: Maxim Levitsky
  Cc: Paolo Bonzini, kvm, Ingo Molnar, Dave Hansen,
	Sean Christopherson, Borislav Petkov, H. Peter Anvin,
	Thomas Gleixner, x86, Vitaly Kuznetsov, Joerg Roedel,
	linux-kernel, Wanpeng Li

On Mon, Mar 21, 2022 at 3:11 PM Maxim Levitsky <mlevitsk@redhat.com> wrote:
>
> On Mon, 2022-03-21 at 14:59 -0700, Jim Mattson wrote:
> > On Mon, Mar 21, 2022 at 2:36 PM Maxim Levitsky <mlevitsk@redhat.com> wrote:
> > > On Wed, 2022-03-09 at 11:07 -0800, Jim Mattson wrote:
> > > > On Wed, Mar 9, 2022 at 10:47 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
> > > > > On 3/9/22 19:35, Jim Mattson wrote:
> > > > > > I didn't think pause filtering was virtualizable, since the value of
> > > > > > the internal counter isn't exposed on VM-exit.
> > > > > >
> > > > > > On bare metal, for instance, assuming the hypervisor doesn't intercept
> > > > > > CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> > > > > > the filter count set to 2.
> > > > > >
> > > > > > 1:
> > > > > > pause
> > > > > > cpuid
> > > > > > jmp 1
> > > > > >
> > > > > > Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> > > > > > iteration, and when L0 resumes L2, the internal counter will be set to
> > > > > > 2 again. L1 will never see a PAUSE #VMEXIT.
> > > > > >
> > > > > > How do you handle this?
> > > > > >
> > > > >
> > > > > I would expect that the same would happen on an SMI or a host interrupt.
> > > > >
> > > > >         1:
> > > > >         pause
> > > > >         outl al, 0xb2
> > > > >         jmp 1
> > > > >
> > > > > In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
> > > > > having a partial implementation would be better than disabling it
> > > > > altogether.
> > > >
> > > > Indeed, the APM does say, "Certain events, including SMI, can cause
> > > > the internal count to be reloaded from the VMCB." However, expanding
> > > > that set of events so much that some pause loops will *never* trigger
> > > > a #VMEXIT seems problematic. If the hypervisor knew that the PAUSE
> > > > filter may not be triggered, it could always choose to exit on every
> > > > PAUSE.
> > > >
> > > > Having a partial implementation is only better than disabling it
> > > > altogether if the L2 pause loop doesn't contain a hidden #VMEXIT to
> > > > L0.
> > > >
> > >
> > > Hi!
> > >
> > > You bring up a very valid point, which I didn't think about.
> > >
> > > However after thinking about this, I think that in practice,
> > > this isn't a show stopper problem for exposing this feature to the guest.
> > >
> > >
> > > This is what I am thinking:
> > >
> > > First lets assume that the L2 is malicious. In this case no doubt
> > > it can craft such a loop which will not VMexit on PAUSE.
> > > But that isn't a problem - instead of this guest could have just used NOP
> > > which is not possible to intercept anyway - no harm is done.
> > >
> > > Now lets assume a non malicious L2:
> > >
> > >
> > > First of all the problem can only happen when a VM exit is intercepted by L0,
> > > and not by L1. Both above cases usually don't pass this criteria since L1 is highly
> > > likely to intercept both CPUID and IO port access. It is also highly unlikely
> > > to allow L2 direct access to L1's mmio ranges.
> > >
> > > Overall there are very few cases of deterministic vm exit which is intercepted
> > > by L0 but not L1. If that happens then L1 will not catch the PAUSE loop,
> > > which is not different much from not catching it because of not suitable
> > > thresholds.
> > >
> > > Also note that this is an optimization only - due to count and threshold,
> > > it is not guaranteed to catch all pause loops - in fact hypervisor has
> > > to guess these values, and update them in attempt to catch as many such
> > > loops as it can.
> > >
> > > I think overall it is OK to expose that feature to the guest
> > > and it should even improve performance in some cases - currently
> > > at least nested KVM intercepts every PAUSE otherwise.
> >
> > Can I at least request that this behavior be documented as a KVM
> > virtual CPU erratum?
>
> 100%. Do you have a pointer where to document it?

I think this will be the first KVM virtual CPU erratum documented,
though there are plenty of others that I'd like to see documented
(e.g. nVMX processes posted interrupts on emulated VM-entry, AMD's
merged PMU counters are only 48 bits wide, etc.).

Maybe Paolo has some ideas?

> Best regards,
>         Maxim Levitsky
>
> >
> > > Best regards,
> > >         Maxim Levitsky
> > >
> > >
> > >
> > >
>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-21 22:41               ` Jim Mattson
@ 2022-03-22 10:12                 ` Paolo Bonzini
  2022-03-22 11:17                   ` Maxim Levitsky
  0 siblings, 1 reply; 28+ messages in thread
From: Paolo Bonzini @ 2022-03-22 10:12 UTC (permalink / raw)
  To: Jim Mattson, Maxim Levitsky
  Cc: kvm, Ingo Molnar, Dave Hansen, Sean Christopherson,
	Borislav Petkov, H. Peter Anvin, Thomas Gleixner, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On 3/21/22 23:41, Jim Mattson wrote:
>> 100%. Do you have a pointer where to document it?
> I think this will be the first KVM virtual CPU erratum documented,
> though there are plenty of others that I'd like to see documented
> (e.g. nVMX processes posted interrupts on emulated VM-entry, AMD's
> merged PMU counters are only 48 bits wide, etc.).
> 
> Maybe Paolo has some ideas?

So let's document them - that's a great idea.  I can help write them
down if you have a pointer to prior email discussions.  I'll send a
skeleton.

Paolo


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-22 10:12                 ` Paolo Bonzini
@ 2022-03-22 11:17                   ` Maxim Levitsky
  0 siblings, 0 replies; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-22 11:17 UTC (permalink / raw)
  To: Paolo Bonzini, Jim Mattson
  Cc: kvm, Ingo Molnar, Dave Hansen, Sean Christopherson,
	Borislav Petkov, H. Peter Anvin, Thomas Gleixner, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On Tue, 2022-03-22 at 11:12 +0100, Paolo Bonzini wrote:
> On 3/21/22 23:41, Jim Mattson wrote:
> > > 100%. Do you have a pointer where to document it?
> > I think this will be the first KVM virtual CPU erratum documented,
> > though there are plenty of others that I'd like to see documented
> > (e.g. nVMX processes posted interrupts on emulated VM-entry, AMD's
> > merged PMU counters are only 48 bits wide, etc.).
> > 
> > Maybe Paolo has some ideas?
> 
> So let's document them, that's a great idea.  I can help writing them 
> down if you have a pointer to prior email discussions.  I'll send a 
> skeleton.
> 
> Paolo
> 

Things that I know don't work 100% correctly in KVM:

*  Relocating the APIC base. Changing the APIC ID is also likely broken, at
   least in some cases, and certainly is with AVIC enabled.

   Some other obscure bits of the in-kernel emulation of the APIC/IO-APIC/PIC/etc.
   likely don't work correctly either.

*  The emulator is not complete, so if you execute an unsupported instruction
   on MMIO, it should fail.
   Also, without unrestricted guest, the emulator sometimes has to be used
   for arbitrary code, so it will fail fast.

*  The shadow MMU doesn't fully reflect a real TLB, as a TLB is usually
   not shared between CPUs.
   Also, KVM's shadow MMU is more speculative than a real MMU - this breaks
   old guests like win9x.

   There is also no way to disable 1GB pages when NPT/EPT is enabled, since
   guest paging doesn't trap into KVM.

*  Various minor issues with debugging based on single stepping / DRs, etc.,
   most of which I don't know well. Most of these can be fixed, but it is low
   priority, and I have seen many fixes in this area recently.
   Also, proper support for nested monitor trap is likely broken.

*  Various MSRs are hardcoded/not supported - not much specific info on this.
   In particular there is no real support for MTRRs / PAT - in fact KVM likes
   the guest memory to always be WB to avoid various CPU errata.


Best regards,
	Maxim Levitsky


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on
  2022-03-09 13:12   ` Paolo Bonzini
@ 2022-03-22 16:52     ` Maxim Levitsky
  0 siblings, 0 replies; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-22 16:52 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On Wed, 2022-03-09 at 14:12 +0100, Paolo Bonzini wrote:
> On 3/1/22 15:36, Maxim Levitsky wrote:
> > Allow L1 to use these settings if L0 disables PAUSE interception
> > (AKA cpu_pm=on)
> > 
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >   arch/x86/kvm/svm/nested.c |  6 ++++++
> >   arch/x86/kvm/svm/svm.c    | 17 +++++++++++++++++
> >   arch/x86/kvm/svm/svm.h    |  2 ++
> >   3 files changed, 25 insertions(+)
> > 
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index 37510cb206190..4cb0bc49986d5 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -664,6 +664,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
> >   	if (!nested_vmcb_needs_vls_intercept(svm))
> >   		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
> >   
> > +	if (svm->pause_filter_enabled)
> > +		svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count;
> > +
> > +	if (svm->pause_threshold_enabled)
> > +		svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;
> 
> I think this should be
> 
> 	if (kvm_pause_in_guest(vcpu->kvm)) {
> 		/* copy from VMCB12 if guest has CPUID, else set to 0 */
> 	} else {
> 		/* copy from VMCB01, unconditionally */
> 	}

> and likewise it should be copied back to VMCB01 unconditionally on 
> vmexit if !kvm_pause_in_guest(vcpu->kvm).


I did something different in the next version of the patches.
Please take a look.
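
Spelled out with the fields this patch touches, the suggestion above would look
roughly like the sketch below (illustrative only - as said, the next version
ended up doing something different):

	if (kvm_pause_in_guest(vcpu->kvm)) {
		/* L1 filters PAUSE for L2: use vmcb12's values if the
		 * feature is exposed to L1, otherwise leave filtering off.
		 */
		svm->vmcb->control.pause_filter_count =
			svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
		svm->vmcb->control.pause_filter_thresh =
			svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
	} else {
		/* L0 keeps intercepting PAUSE: always use vmcb01's values. */
		svm->vmcb->control.pause_filter_count =
			svm->vmcb01.ptr->control.pause_filter_count;
		svm->vmcb->control.pause_filter_thresh =
			svm->vmcb01.ptr->control.pause_filter_thresh;
	}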


> 
> >   	nested_svm_transition_tlb_flush(vcpu);
> >   
> >   	/* Enter Guest-Mode */
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 6a571eed32ef4..52198e63c5fc4 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -4008,6 +4008,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> >   
> >   	svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
> >   
> > +	if (kvm_pause_in_guest(vcpu->kvm)) {
> > +		svm->pause_filter_enabled = pause_filter_count > 0 &&
> > +					    guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
> > +
> > +		svm->pause_threshold_enabled = pause_filter_thresh > 0 &&
> > +					    guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
> 
> Why only if the module parameters are >0?  The module parameter is 
> unused if pause-in-guest is active.

Agree, will do.
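
I.e. roughly (a sketch of the agreed change - drop the module-parameter check
and gate only on the guest's CPUID):

	if (kvm_pause_in_guest(vcpu->kvm)) {
		svm->pause_filter_enabled =
			guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
		svm->pause_threshold_enabled =
			guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
	}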

> 
> > +	} else {
> > +		svm->pause_filter_enabled = false;
> > +		svm->pause_threshold_enabled = false;
> > +	}
> > +
> >   	svm_recalc_instruction_intercepts(vcpu, svm);
> >   
> >   	/* For sev guests, the memory encryption bit is not reserved in CR3.  */
> > @@ -4763,6 +4774,12 @@ static __init void svm_set_cpu_caps(void)
> >   		if (vls)
> >   			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
> >   
> > +		if (pause_filter_count)
> > +			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
> > +
> > +		if (pause_filter_thresh)
> > +			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
> 
> Likewise, this should be set using just boot_cpu_has, not the module 
> parameters.

Agreed as well - also, the check above is wrong; it should have been inverted.
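
Roughly (again only a sketch of the agreed direction - advertise the features
based on hardware support alone):

		if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);

		if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);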

> 
> Paolo
> 
> >   		/* Nested VM can receive #VMEXIT instead of triggering #GP */
> >   		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
> >   	}
> > diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> > index a3c93f9c02847..6fa81eb3ffb78 100644
> > --- a/arch/x86/kvm/svm/svm.h
> > +++ b/arch/x86/kvm/svm/svm.h
> > @@ -234,6 +234,8 @@ struct vcpu_svm {
> >   	bool tsc_scaling_enabled          : 1;
> >   	bool lbrv_enabled                 : 1;
> >   	bool v_vmload_vmsave_enabled      : 1;
> > +	bool pause_filter_enabled         : 1;
> > +	bool pause_threshold_enabled      : 1;
> >   
> >   	u32 ldr_reg;
> >   	u32 dfr_reg;


Thanks for the review!

Best regards,
	Maxim Levitsky







^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 2/7] KVM: x86: nSVM: implement nested LBR virtualization
  2022-03-09 13:00   ` Paolo Bonzini
@ 2022-03-22 16:53     ` Maxim Levitsky
  0 siblings, 0 replies; 28+ messages in thread
From: Maxim Levitsky @ 2022-03-22 16:53 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Ingo Molnar, Dave Hansen, Sean Christopherson, Borislav Petkov,
	H. Peter Anvin, Thomas Gleixner, Jim Mattson, x86,
	Vitaly Kuznetsov, Joerg Roedel, linux-kernel, Wanpeng Li

On Wed, 2022-03-09 at 14:00 +0100, Paolo Bonzini wrote:
> On 3/1/22 15:36, Maxim Levitsky wrote:
> > @@ -565,8 +565,19 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
> >   		vmcb_mark_dirty(svm->vmcb, VMCB_DR);
> >   	}
> >   
> > -	if (unlikely(svm->vmcb01.ptr->control.virt_ext & LBR_CTL_ENABLE_MASK))
> > +	if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
> > +
> > +		/* Copy LBR related registers from vmcb12,
> > +		 * but make sure that we only pick LBR enable bit from the guest.
> > +		 */
> > +
> > +		svm_copy_lbrs(vmcb12, svm->vmcb);
> > +		svm->vmcb->save.dbgctl &= LBR_CTL_ENABLE_MASK;
> 
> This should be checked against DEBUGCTL_RESERVED_BITS instead in 
> __nested_vmcb_check_save; remember to add dbgctl to struct 
> vmcb_save_area_cached too.

Actually, the AMD manual doesn't specify what happens when this field contains reserved bits.

I did the following test, and I see that the CPU ignores the reserved bits:


diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index fa7366fab3bd6..6a9f41941d9ea 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3875,6 +3875,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
                x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
 
+       svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+       svm->vmcb->save.dbgctl = 0xFFFFFFFFFFFFFFFFUL;
+
        svm_vcpu_enter_exit(vcpu);
 
        /*


The VM booted fine and reading DEBUGCTL in the guest returned 0x3f.

It's true that DEBUGCTL has a few more bits that are defined on AMD which
I zero here, but for practical purposes the only bit that matters is BTF,
and I decided to leave it alone for now, as KVM currently doesn't
emulate it correctly anyway when LBRs are not enabled
(look at svm_set_msr, MSR_IA32_DEBUGCTLMSR).

In theory this bit should be passed through by setting the host's DEBUGCTL
just prior to entering the guest, for the case where only BTF is enabled and
LBR virtualization is not. I'll fix this later.

Best regards,
	Maxim Levitsky

> 
> Paolo
> 
> > +		svm_update_lbrv(&svm->vcpu);
> > +
> > +	} else if (unlikely(svm->vmcb01.ptr->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
> >   		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
> > +	}
> >   }





^ permalink raw reply related	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2022-03-22 16:54 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-01 14:36 [PATCH v3 0/7] nSVM/SVM features Maxim Levitsky
2022-03-01 14:36 ` [PATCH v3 1/7] KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running Maxim Levitsky
2022-03-09 13:00   ` Paolo Bonzini
2022-03-14 11:25     ` Maxim Levitsky
2022-03-01 14:36 ` [PATCH v3 2/7] KVM: x86: nSVM: implement nested LBR virtualization Maxim Levitsky
2022-03-09 13:00   ` Paolo Bonzini
2022-03-22 16:53     ` Maxim Levitsky
2022-03-01 14:36 ` [PATCH v3 3/7] KVM: x86: nSVM: implement nested VMLOAD/VMSAVE Maxim Levitsky
2022-03-01 14:36 ` [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on Maxim Levitsky
2022-03-09 13:12   ` Paolo Bonzini
2022-03-22 16:52     ` Maxim Levitsky
2022-03-09 18:35   ` Jim Mattson
2022-03-09 18:47     ` Paolo Bonzini
2022-03-09 19:07       ` Jim Mattson
2022-03-21 21:36         ` Maxim Levitsky
2022-03-21 21:59           ` Jim Mattson
2022-03-21 22:11             ` Maxim Levitsky
2022-03-21 22:41               ` Jim Mattson
2022-03-22 10:12                 ` Paolo Bonzini
2022-03-22 11:17                   ` Maxim Levitsky
2022-03-01 14:36 ` [PATCH v3 5/7] KVM: x86: nSVM: implement nested vGIF Maxim Levitsky
2022-03-09 13:40   ` Paolo Bonzini
2022-03-14 15:21     ` Maxim Levitsky
2022-03-01 14:36 ` [PATCH v3 6/7] KVM: x86: SVM: allow to force AVIC to be enabled Maxim Levitsky
2022-03-09 13:41   ` Paolo Bonzini
2022-03-01 14:36 ` [PATCH v3 7/7] KVM: x86: SVM: allow AVIC to co-exist with a nested guest running Maxim Levitsky
2022-03-09 13:50   ` Paolo Bonzini
2022-03-09 18:14     ` Maxim Levitsky

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).