kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4 0/3] CET fix patches for nested guest
@ 2021-03-15  7:18 Yang Weijiang
  2021-03-15  7:18 ` [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2 Yang Weijiang
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Yang Weijiang @ 2021-03-15  7:18 UTC (permalink / raw)
  To: pbonzini, seanjc, vkuznets, kvm, linux-kernel; +Cc: Yang Weijiang

This patch series is to fix a few issues found during patch review and
testing on Linux, also including a patch to explictly disable CET support
in nested guest over Hyper-V(s).

change in v4:
- Added guest CET cpuid check before sync vmcs02 CET state to vmcs12.
- Opportunistically added similar fix for MPX.

Yang Weijiang (3):
  KVM: nVMX: Sync L2 guest CET states between L1/L2
  KVM: nVMX: Set X86_CR4_CET in cr4_fixed1_bits if CET IBT is enabled
  KVM: nVMX: Add CET entry/exit load bits to evmcs unsupported list

 arch/x86/kvm/cpuid.c      |  1 -
 arch/x86/kvm/vmx/evmcs.c  |  4 ++--
 arch/x86/kvm/vmx/evmcs.h  |  6 ++++--
 arch/x86/kvm/vmx/nested.c | 35 +++++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/vmx.c    |  1 +
 arch/x86/kvm/vmx/vmx.h    |  3 +++
 6 files changed, 43 insertions(+), 7 deletions(-)

-- 
2.26.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2
  2021-03-15  7:18 [PATCH v4 0/3] CET fix patches for nested guest Yang Weijiang
@ 2021-03-15  7:18 ` Yang Weijiang
  2021-03-15 16:45   ` Sean Christopherson
  2021-03-15  7:18 ` [PATCH v4 2/3] KVM: nVMX: Set X86_CR4_CET in cr4_fixed1_bits if CET IBT is enabled Yang Weijiang
  2021-03-15  7:18 ` [PATCH v4 3/3] KVM: nVMX: Add CET entry/exit load bits to evmcs unsupported list Yang Weijiang
  2 siblings, 1 reply; 9+ messages in thread
From: Yang Weijiang @ 2021-03-15  7:18 UTC (permalink / raw)
  To: pbonzini, seanjc, vkuznets, kvm, linux-kernel; +Cc: Yang Weijiang

These fields are rarely updated by L1 QEMU/KVM, sync them when L1 is trying to
read/write them and after they're changed. If CET guest entry-load bit is not
set by L1 guest, migrate them to L2 manaully.

Opportunistically remove one blank line and add minor fix for MPX.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
---
 arch/x86/kvm/cpuid.c      |  1 -
 arch/x86/kvm/vmx/nested.c | 35 +++++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/vmx.h    |  3 +++
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index d191de769093..8692f53b8cd0 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -143,7 +143,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
 		}
 		vcpu->arch.guest_supported_xss =
 			(((u64)best->edx << 32) | best->ecx) & supported_xss;
-
 	} else {
 		vcpu->arch.guest_supported_xss = 0;
 	}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 9728efd529a1..57ecd8225568 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2516,6 +2516,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 
 	set_cr4_guest_host_mask(vmx);
+
+	if (kvm_cet_supported() && vmx->nested.nested_run_pending &&
+	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
+		vmcs_writel(GUEST_SSP, vmcs12->guest_ssp);
+		vmcs_writel(GUEST_S_CET, vmcs12->guest_s_cet);
+		vmcs_writel(GUEST_INTR_SSP_TABLE, vmcs12->guest_ssp_tbl);
+	}
 }
 
 /*
@@ -2556,6 +2563,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
 		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+
+	if (kvm_cet_supported() && (!vmx->nested.nested_run_pending ||
+	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))) {
+		vmcs_writel(GUEST_SSP, vmx->nested.vmcs01_guest_ssp);
+		vmcs_writel(GUEST_S_CET, vmx->nested.vmcs01_guest_s_cet);
+		vmcs_writel(GUEST_INTR_SSP_TABLE,
+			    vmx->nested.vmcs01_guest_ssp_tbl);
+	}
+
 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 
 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
@@ -3373,8 +3389,14 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 	if (kvm_mpx_supported() &&
-		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
 		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+	if (kvm_cet_supported() &&
+	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
+		vmx->nested.vmcs01_guest_ssp = vmcs_readl(GUEST_SSP);
+		vmx->nested.vmcs01_guest_s_cet = vmcs_readl(GUEST_S_CET);
+		vmx->nested.vmcs01_guest_ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
+	}
 
 	/*
 	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
@@ -4001,6 +4023,9 @@ static bool is_vmcs12_ext_field(unsigned long field)
 	case GUEST_IDTR_BASE:
 	case GUEST_PENDING_DBG_EXCEPTIONS:
 	case GUEST_BNDCFGS:
+	case GUEST_SSP:
+	case GUEST_INTR_SSP_TABLE:
+	case GUEST_S_CET:
 		return true;
 	default:
 		break;
@@ -4050,8 +4075,14 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
 	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
 	vmcs12->guest_pending_dbg_exceptions =
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
-	if (kvm_mpx_supported())
+	if (kvm_mpx_supported() && guest_cpuid_has(vcpu, X86_FEATURE_MPX))
 		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+	if (kvm_cet_supported() && (guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) ||
+	    guest_cpuid_has(vcpu, X86_FEATURE_IBT))) {
+		vmcs12->guest_ssp = vmcs_readl(GUEST_SSP);
+		vmcs12->guest_s_cet = vmcs_readl(GUEST_S_CET);
+		vmcs12->guest_ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
+	}
 
 	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
 }
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9d3a557949ac..36dc4fdb0909 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -155,6 +155,9 @@ struct nested_vmx {
 	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
 	u64 vmcs01_debugctl;
 	u64 vmcs01_guest_bndcfgs;
+	u64 vmcs01_guest_ssp;
+	u64 vmcs01_guest_s_cet;
+	u64 vmcs01_guest_ssp_tbl;
 
 	/* to migrate it to L1 if L2 writes to L1's CR8 directly */
 	int l1_tpr_threshold;
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v4 2/3] KVM: nVMX: Set X86_CR4_CET in cr4_fixed1_bits if CET IBT is enabled
  2021-03-15  7:18 [PATCH v4 0/3] CET fix patches for nested guest Yang Weijiang
  2021-03-15  7:18 ` [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2 Yang Weijiang
@ 2021-03-15  7:18 ` Yang Weijiang
  2021-03-15  7:18 ` [PATCH v4 3/3] KVM: nVMX: Add CET entry/exit load bits to evmcs unsupported list Yang Weijiang
  2 siblings, 0 replies; 9+ messages in thread
From: Yang Weijiang @ 2021-03-15  7:18 UTC (permalink / raw)
  To: pbonzini, seanjc, vkuznets, kvm, linux-kernel; +Cc: Yang Weijiang

CET SHSTK and IBT are independently controlled by kernel, set X86_CR4_CET
bit in cr4_fixed1_bits if either of them is enabled so that nested guest
can enjoy the feature.

Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
---
 arch/x86/kvm/vmx/vmx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e78650bf8ae8..bd89b5a24c38 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7267,6 +7267,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
 	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
 	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
 	cr4_fixed1_update(X86_CR4_CET,	      ecx, feature_bit(SHSTK));
+	cr4_fixed1_update(X86_CR4_CET,	      edx, feature_bit(IBT));
 
 #undef cr4_fixed1_update
 }
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v4 3/3] KVM: nVMX: Add CET entry/exit load bits to evmcs unsupported list
  2021-03-15  7:18 [PATCH v4 0/3] CET fix patches for nested guest Yang Weijiang
  2021-03-15  7:18 ` [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2 Yang Weijiang
  2021-03-15  7:18 ` [PATCH v4 2/3] KVM: nVMX: Set X86_CR4_CET in cr4_fixed1_bits if CET IBT is enabled Yang Weijiang
@ 2021-03-15  7:18 ` Yang Weijiang
  2 siblings, 0 replies; 9+ messages in thread
From: Yang Weijiang @ 2021-03-15  7:18 UTC (permalink / raw)
  To: pbonzini, seanjc, vkuznets, kvm, linux-kernel; +Cc: Yang Weijiang

Nested guest doesn't support CET when KVM is running as an intermediate
layer between two Hyper-Vs for now, so mask out related CET entry/exit
load bits. Relevant enabling patches will be posted as a separate patch
series.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Suggested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
---
 arch/x86/kvm/vmx/evmcs.c | 4 ++--
 arch/x86/kvm/vmx/evmcs.h | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 41f24661af04..9f81db51fd8b 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -351,11 +351,11 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
 	switch (msr_index) {
 	case MSR_IA32_VMX_EXIT_CTLS:
 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
-		ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+		ctl_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
 		break;
 	case MSR_IA32_VMX_ENTRY_CTLS:
 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
-		ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+		ctl_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
 		break;
 	case MSR_IA32_VMX_PROCBASED_CTLS2:
 		ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index bd41d9462355..25588694eb04 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -59,8 +59,10 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
 	 SECONDARY_EXEC_SHADOW_VMCS |					\
 	 SECONDARY_EXEC_TSC_SCALING |					\
 	 SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
-#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+					VM_EXIT_LOAD_CET_STATE)
+#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+					 VM_ENTRY_LOAD_CET_STATE)
 #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
 
 #if IS_ENABLED(CONFIG_HYPERV)
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2
  2021-03-15  7:18 ` [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2 Yang Weijiang
@ 2021-03-15 16:45   ` Sean Christopherson
  2021-03-16  9:03     ` Yang Weijiang
  0 siblings, 1 reply; 9+ messages in thread
From: Sean Christopherson @ 2021-03-15 16:45 UTC (permalink / raw)
  To: Yang Weijiang; +Cc: pbonzini, vkuznets, kvm, linux-kernel

On Mon, Mar 15, 2021, Yang Weijiang wrote:
> These fields are rarely updated by L1 QEMU/KVM, sync them when L1 is trying to
> read/write them and after they're changed. If CET guest entry-load bit is not
> set by L1 guest, migrate them to L2 manaully.
> 
> Opportunistically remove one blank line and add minor fix for MPX.
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> ---
>  arch/x86/kvm/cpuid.c      |  1 -
>  arch/x86/kvm/vmx/nested.c | 35 +++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/vmx/vmx.h    |  3 +++
>  3 files changed, 36 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index d191de769093..8692f53b8cd0 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -143,7 +143,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
>  		}
>  		vcpu->arch.guest_supported_xss =
>  			(((u64)best->edx << 32) | best->ecx) & supported_xss;
> -

Spurious whitespace deletion.

>  	} else {
>  		vcpu->arch.guest_supported_xss = 0;
>  	}
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 9728efd529a1..57ecd8225568 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -2516,6 +2516,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
>  	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
>  
>  	set_cr4_guest_host_mask(vmx);
> +
> +	if (kvm_cet_supported() && vmx->nested.nested_run_pending &&
> +	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> +		vmcs_writel(GUEST_SSP, vmcs12->guest_ssp);
> +		vmcs_writel(GUEST_S_CET, vmcs12->guest_s_cet);
> +		vmcs_writel(GUEST_INTR_SSP_TABLE, vmcs12->guest_ssp_tbl);
> +	}
>  }
>  
>  /*
> @@ -2556,6 +2563,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>  	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
>  	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>  		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
> +
> +	if (kvm_cet_supported() && (!vmx->nested.nested_run_pending ||
> +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))) {

Not your code per se, since this pattern comes from BNDCFGS and DEBUGCTL, but I
don't see how loading vmcs01 state in this combo is correct:

    a. kvm_xxx_supported()              == 1
    b. nested_run_pending               == false
    c. vm_entry_controls.load_xxx_state == true

nested_vmx_enter_non_root_mode() only snapshots vmcs01 if 
vm_entry_controls.load_xxx_state == false, which means the above combo is
loading stale values (or more likely, zeros).

I _think_ nested_vmx_enter_non_root_mode() just needs to snapshot vmcs01 if
nested_run_pending=false.  For migration, if userspace restores MSRs after
KVM_SET_NESTED_STATE, then what's done here is likely irrelevant.  If userspace
restores MSRs before nested state, then vmcs01 will hold the desired value since
setting MSRs would have written the value into vmcs01.

I suspect no one has reported this issue because guests simply don't use MPX,
and up until the recent LBR stuff, KVM effectively zeroed out DEBUGCTL for the
guest.

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 45622e9c4449..4184ff601120 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3298,10 +3298,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
        if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
                evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);

-       if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+       if (!vmx->nested.nested_run_pending ||
+           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
                vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-       if (kvm_mpx_supported() &&
-               !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+       if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
                vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);

        /*


Side topic, all of this code is broken for SMM emulation.  SMI+RSM don't do a
full VM-Exit -> VM-Entry; the CPU forcefully exits non-root, but most state that
is loaded from the VMCS is left untouched.  It's the SMI handler's responsibility
to not enable features, e.g. to not set CR4.CET.  For sane use cases, this
probably doesn't matter as vmcs12 will be configured to context switch state,
but if L1 is doing anything out of the ordinary, SMI+RSM will corrupt state.

E.g. if L1 enables MPX in the guest, does not intercept L2 writes to BNDCFGS,
and does not load BNDCFGS on VM-Entry, then SMI+RSM would corrupt BNDCFGS since
the SMI "exit" would clear BNDCFGS, and the RSM "entry" would load zero.  This
is 100% contrived, and probably doesn't impact real world use cases, but it
still bugs me :-)

> +		vmcs_writel(GUEST_SSP, vmx->nested.vmcs01_guest_ssp);
> +		vmcs_writel(GUEST_S_CET, vmx->nested.vmcs01_guest_s_cet);
> +		vmcs_writel(GUEST_INTR_SSP_TABLE,
> +			    vmx->nested.vmcs01_guest_ssp_tbl);
> +	}
> +
>  	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
>  
>  	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
> @@ -3373,8 +3389,14 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>  	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
>  		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
>  	if (kvm_mpx_supported() &&
> -		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
> +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
>  		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> +	if (kvm_cet_supported() &&
> +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> +		vmx->nested.vmcs01_guest_ssp = vmcs_readl(GUEST_SSP);
> +		vmx->nested.vmcs01_guest_s_cet = vmcs_readl(GUEST_S_CET);
> +		vmx->nested.vmcs01_guest_ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
> +	}
>  
>  	/*
>  	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
> @@ -4001,6 +4023,9 @@ static bool is_vmcs12_ext_field(unsigned long field)
>  	case GUEST_IDTR_BASE:
>  	case GUEST_PENDING_DBG_EXCEPTIONS:
>  	case GUEST_BNDCFGS:
> +	case GUEST_SSP:
> +	case GUEST_INTR_SSP_TABLE:
> +	case GUEST_S_CET:
>  		return true;
>  	default:
>  		break;
> @@ -4050,8 +4075,14 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
>  	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
>  	vmcs12->guest_pending_dbg_exceptions =
>  		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> -	if (kvm_mpx_supported())
> +	if (kvm_mpx_supported() && guest_cpuid_has(vcpu, X86_FEATURE_MPX))

Adding the CPUID check for MPX definitely needs to be a separate commit.

>  		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> +	if (kvm_cet_supported() && (guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) ||
> +	    guest_cpuid_has(vcpu, X86_FEATURE_IBT))) {
> +		vmcs12->guest_ssp = vmcs_readl(GUEST_SSP);
> +		vmcs12->guest_s_cet = vmcs_readl(GUEST_S_CET);
> +		vmcs12->guest_ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
> +	}
>  
>  	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
>  }
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 9d3a557949ac..36dc4fdb0909 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -155,6 +155,9 @@ struct nested_vmx {
>  	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
>  	u64 vmcs01_debugctl;
>  	u64 vmcs01_guest_bndcfgs;
> +	u64 vmcs01_guest_ssp;
> +	u64 vmcs01_guest_s_cet;
> +	u64 vmcs01_guest_ssp_tbl;
>  
>  	/* to migrate it to L1 if L2 writes to L1's CR8 directly */
>  	int l1_tpr_threshold;
> -- 
> 2.26.2
> 

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2
  2021-03-15 16:45   ` Sean Christopherson
@ 2021-03-16  9:03     ` Yang Weijiang
  2021-03-23  0:43       ` Yang Weijiang
  0 siblings, 1 reply; 9+ messages in thread
From: Yang Weijiang @ 2021-03-16  9:03 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: Yang Weijiang, pbonzini, vkuznets, kvm, linux-kernel

On Mon, Mar 15, 2021 at 09:45:11AM -0700, Sean Christopherson wrote:
> On Mon, Mar 15, 2021, Yang Weijiang wrote:
> > These fields are rarely updated by L1 QEMU/KVM, sync them when L1 is trying to
> > read/write them and after they're changed. If CET guest entry-load bit is not
> > set by L1 guest, migrate them to L2 manaully.
> > 
> > Opportunistically remove one blank line and add minor fix for MPX.
> > 
> > Suggested-by: Sean Christopherson <seanjc@google.com>
> > Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> > ---
> >  arch/x86/kvm/cpuid.c      |  1 -
> >  arch/x86/kvm/vmx/nested.c | 35 +++++++++++++++++++++++++++++++++--
> >  arch/x86/kvm/vmx/vmx.h    |  3 +++
> >  3 files changed, 36 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > index d191de769093..8692f53b8cd0 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -143,7 +143,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
> >  		}
> >  		vcpu->arch.guest_supported_xss =
> >  			(((u64)best->edx << 32) | best->ecx) & supported_xss;
> > -
> 
> Spurious whitespace deletion.

Yes, Opportunistically did it as said in commit log :-D

> 
> >  	} else {
> >  		vcpu->arch.guest_supported_xss = 0;
> >  	}
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > index 9728efd529a1..57ecd8225568 100644
> > --- a/arch/x86/kvm/vmx/nested.c
> > +++ b/arch/x86/kvm/vmx/nested.c
> > @@ -2516,6 +2516,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
> >  	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
> >  
> >  	set_cr4_guest_host_mask(vmx);
> > +
> > +	if (kvm_cet_supported() && vmx->nested.nested_run_pending &&
> > +	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> > +		vmcs_writel(GUEST_SSP, vmcs12->guest_ssp);
> > +		vmcs_writel(GUEST_S_CET, vmcs12->guest_s_cet);
> > +		vmcs_writel(GUEST_INTR_SSP_TABLE, vmcs12->guest_ssp_tbl);
> > +	}
> >  }
> >  
> >  /*
> > @@ -2556,6 +2563,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
> >  	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
> >  	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
> >  		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
> > +
> > +	if (kvm_cet_supported() && (!vmx->nested.nested_run_pending ||
> > +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))) {
> 
> Not your code per se, since this pattern comes from BNDCFGS and DEBUGCTL, but I
> don't see how loading vmcs01 state in this combo is correct:
> 
>     a. kvm_xxx_supported()              == 1
>     b. nested_run_pending               == false
>     c. vm_entry_controls.load_xxx_state == true
> 
> nested_vmx_enter_non_root_mode() only snapshots vmcs01 if 
> vm_entry_controls.load_xxx_state == false, which means the above combo is
> loading stale values (or more likely, zeros).
> 
> I _think_ nested_vmx_enter_non_root_mode() just needs to snapshot vmcs01 if
> nested_run_pending=false.  For migration, if userspace restores MSRs after
> KVM_SET_NESTED_STATE, then what's done here is likely irrelevant.  If userspace
> restores MSRs before nested state, then vmcs01 will hold the desired value since
> setting MSRs would have written the value into vmcs01.

Then the code nested_vmx_enter_non_root_mode() would look like:

if (kvm_cet_supported() && !vmx->nested.nested_run_pending &&
    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
	...
    }

I have another concern now, if vm_entry_controls.load_cet_state == false, and L1
updated vmcs fields, so the latest states are in vmcs12, but they cannot
be synced to vmcs02 because in prepare_vmcs02_rare():

if (kvm_cet_supported() && vmx->nested.nested_run_pending &&
    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
	...
    }

so L2 got stale status. IMO, L1 guest sets vm_entry_controls.load_cet_state == false
should be rare case. We event can igore this case :-)

> 
> I suspect no one has reported this issue because guests simply don't use MPX,
> and up until the recent LBR stuff, KVM effectively zeroed out DEBUGCTL for the
> guest.
> 
So for MPX and DEBUGCTL, is it worth some separate fix patch?

> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 45622e9c4449..4184ff601120 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -3298,10 +3298,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>         if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
>                 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
> 
> -       if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
> +       if (!vmx->nested.nested_run_pending ||
> +           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
>                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> -       if (kvm_mpx_supported() &&
> -               !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
> +       if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
> +           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>                 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> 
>         /*
> 
> 
> Side topic, all of this code is broken for SMM emulation.  SMI+RSM don't do a
> full VM-Exit -> VM-Entry; the CPU forcefully exits non-root, but most state that
> is loaded from the VMCS is left untouched.  It's the SMI handler's responsibility
> to not enable features, e.g. to not set CR4.CET.  For sane use cases, this
> probably doesn't matter as vmcs12 will be configured to context switch state,
> but if L1 is doing anything out of the ordinary, SMI+RSM will corrupt state.
> 
> E.g. if L1 enables MPX in the guest, does not intercept L2 writes to BNDCFGS,
> and does not load BNDCFGS on VM-Entry, then SMI+RSM would corrupt BNDCFGS since
> the SMI "exit" would clear BNDCFGS, and the RSM "entry" would load zero.  This
> is 100% contrived, and probably doesn't impact real world use cases, but it
> still bugs me :-)

Exactly, should it be fixed by separate patch or leave it as is?

> 
> > +		vmcs_writel(GUEST_SSP, vmx->nested.vmcs01_guest_ssp);
> > +		vmcs_writel(GUEST_S_CET, vmx->nested.vmcs01_guest_s_cet);
> > +		vmcs_writel(GUEST_INTR_SSP_TABLE,
> > +			    vmx->nested.vmcs01_guest_ssp_tbl);
> > +	}
> > +
> >  	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
> >  
> >  	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
> > @@ -3373,8 +3389,14 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> >  	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
> >  		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> >  	if (kvm_mpx_supported() &&
> > -		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
> > +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
> >  		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> > +	if (kvm_cet_supported() &&
> > +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> > +		vmx->nested.vmcs01_guest_ssp = vmcs_readl(GUEST_SSP);
> > +		vmx->nested.vmcs01_guest_s_cet = vmcs_readl(GUEST_S_CET);
> > +		vmx->nested.vmcs01_guest_ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
> > +	}
> >  
> >  	/*
> >  	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
> > @@ -4001,6 +4023,9 @@ static bool is_vmcs12_ext_field(unsigned long field)
> >  	case GUEST_IDTR_BASE:
> >  	case GUEST_PENDING_DBG_EXCEPTIONS:
> >  	case GUEST_BNDCFGS:
> > +	case GUEST_SSP:
> > +	case GUEST_INTR_SSP_TABLE:
> > +	case GUEST_S_CET:
> >  		return true;
> >  	default:
> >  		break;
> > @@ -4050,8 +4075,14 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
> >  	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> >  	vmcs12->guest_pending_dbg_exceptions =
> >  		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> > -	if (kvm_mpx_supported())
> > +	if (kvm_mpx_supported() && guest_cpuid_has(vcpu, X86_FEATURE_MPX))
> 
> Adding the CPUID check for MPX definitely needs to be a separate commit.

Sure, will fix them by separate patch. Thanks for review!

> 
> >  		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> > +	if (kvm_cet_supported() && (guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) ||
> > +	    guest_cpuid_has(vcpu, X86_FEATURE_IBT))) {
> > +		vmcs12->guest_ssp = vmcs_readl(GUEST_SSP);
> > +		vmcs12->guest_s_cet = vmcs_readl(GUEST_S_CET);
> > +		vmcs12->guest_ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
> > +	}
> >  
> >  	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
> >  }
> > diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> > index 9d3a557949ac..36dc4fdb0909 100644
> > --- a/arch/x86/kvm/vmx/vmx.h
> > +++ b/arch/x86/kvm/vmx/vmx.h
> > @@ -155,6 +155,9 @@ struct nested_vmx {
> >  	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
> >  	u64 vmcs01_debugctl;
> >  	u64 vmcs01_guest_bndcfgs;
> > +	u64 vmcs01_guest_ssp;
> > +	u64 vmcs01_guest_s_cet;
> > +	u64 vmcs01_guest_ssp_tbl;
> >  
> >  	/* to migrate it to L1 if L2 writes to L1's CR8 directly */
> >  	int l1_tpr_threshold;
> > -- 
> > 2.26.2
> > 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2
  2021-03-16  9:03     ` Yang Weijiang
@ 2021-03-23  0:43       ` Yang Weijiang
  2021-03-23 15:56         ` Sean Christopherson
  0 siblings, 1 reply; 9+ messages in thread
From: Yang Weijiang @ 2021-03-23  0:43 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: Yang Weijiang, pbonzini, vkuznets, kvm, linux-kernel

On Tue, Mar 16, 2021 at 05:03:47PM +0800, Yang Weijiang wrote:

Hi, Sean,
Could you respond my below rely? I'm not sure how to proceed, thanks!

> On Mon, Mar 15, 2021 at 09:45:11AM -0700, Sean Christopherson wrote:
> > On Mon, Mar 15, 2021, Yang Weijiang wrote:
> > > These fields are rarely updated by L1 QEMU/KVM, sync them when L1 is trying to
> > > read/write them and after they're changed. If CET guest entry-load bit is not
> > > set by L1 guest, migrate them to L2 manaully.
> > > 
> > > Opportunistically remove one blank line and add minor fix for MPX.
> > > 
> > > Suggested-by: Sean Christopherson <seanjc@google.com>
> > > Signed-off-by: Yang Weijiang <weijiang.yang@intel.com>
> > > ---
> > >  arch/x86/kvm/cpuid.c      |  1 -
> > >  arch/x86/kvm/vmx/nested.c | 35 +++++++++++++++++++++++++++++++++--
> > >  arch/x86/kvm/vmx/vmx.h    |  3 +++
> > >  3 files changed, 36 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > > index d191de769093..8692f53b8cd0 100644
> > > --- a/arch/x86/kvm/cpuid.c
> > > +++ b/arch/x86/kvm/cpuid.c
> > > @@ -143,7 +143,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
> > >  		}
> > >  		vcpu->arch.guest_supported_xss =
> > >  			(((u64)best->edx << 32) | best->ecx) & supported_xss;
> > > -
> > 
> > Spurious whitespace deletion.
> 
> Yes, Opportunistically did it as said in commit log :-D
> 
> > 
> > >  	} else {
> > >  		vcpu->arch.guest_supported_xss = 0;
> > >  	}
> > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > > index 9728efd529a1..57ecd8225568 100644
> > > --- a/arch/x86/kvm/vmx/nested.c
> > > +++ b/arch/x86/kvm/vmx/nested.c
> > > @@ -2516,6 +2516,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
> > >  	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
> > >  
> > >  	set_cr4_guest_host_mask(vmx);
> > > +
> > > +	if (kvm_cet_supported() && vmx->nested.nested_run_pending &&
> > > +	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> > > +		vmcs_writel(GUEST_SSP, vmcs12->guest_ssp);
> > > +		vmcs_writel(GUEST_S_CET, vmcs12->guest_s_cet);
> > > +		vmcs_writel(GUEST_INTR_SSP_TABLE, vmcs12->guest_ssp_tbl);
> > > +	}
> > >  }
> > >  
> > >  /*
> > > @@ -2556,6 +2563,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
> > >  	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
> > >  	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
> > >  		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
> > > +
> > > +	if (kvm_cet_supported() && (!vmx->nested.nested_run_pending ||
> > > +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))) {
> > 
> > Not your code per se, since this pattern comes from BNDCFGS and DEBUGCTL, but I
> > don't see how loading vmcs01 state in this combo is correct:
> > 
> >     a. kvm_xxx_supported()              == 1
> >     b. nested_run_pending               == false
> >     c. vm_entry_controls.load_xxx_state == true
> > 
> > nested_vmx_enter_non_root_mode() only snapshots vmcs01 if 
> > vm_entry_controls.load_xxx_state == false, which means the above combo is
> > loading stale values (or more likely, zeros).
> > 
> > I _think_ nested_vmx_enter_non_root_mode() just needs to snapshot vmcs01 if
> > nested_run_pending=false.  For migration, if userspace restores MSRs after
> > KVM_SET_NESTED_STATE, then what's done here is likely irrelevant.  If userspace
> > restores MSRs before nested state, then vmcs01 will hold the desired value since
> > setting MSRs would have written the value into vmcs01.
> 
> Then the code nested_vmx_enter_non_root_mode() would look like:
> 
> if (kvm_cet_supported() && !vmx->nested.nested_run_pending &&
>     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> 	...
>     }
> 
> I have another concern now, if vm_entry_controls.load_cet_state == false, and L1
> updated vmcs fields, so the latest states are in vmcs12, but they cannot
> be synced to vmcs02 because in prepare_vmcs02_rare():
> 
> if (kvm_cet_supported() && vmx->nested.nested_run_pending &&
>     (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> 	...
>     }
> 
> so L2 got stale status. IMO, L1 guest sets vm_entry_controls.load_cet_state == false
> should be rare case. We can even igore this case :-)
> 
> > 
> > I suspect no one has reported this issue because guests simply don't use MPX,
> > and up until the recent LBR stuff, KVM effectively zeroed out DEBUGCTL for the
> > guest.
> > 
> So for MPX and DEBUGCTL, is it worth some separate fix patch?
> 
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > index 45622e9c4449..4184ff601120 100644
> > --- a/arch/x86/kvm/vmx/nested.c
> > +++ b/arch/x86/kvm/vmx/nested.c
> > @@ -3298,10 +3298,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> >         if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
> >                 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
> > 
> > -       if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
> > +       if (!vmx->nested.nested_run_pending ||
> > +           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
> >                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> > -       if (kvm_mpx_supported() &&
> > -               !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
> > +       if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
> > +           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
> >                 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> > 
> >         /*
> > 
> > 
> > Side topic, all of this code is broken for SMM emulation.  SMI+RSM don't do a
> > full VM-Exit -> VM-Entry; the CPU forcefully exits non-root, but most state that
> > is loaded from the VMCS is left untouched.  It's the SMI handler's responsibility
> > to not enable features, e.g. to not set CR4.CET.  For sane use cases, this
> > probably doesn't matter as vmcs12 will be configured to context switch state,
> > but if L1 is doing anything out of the ordinary, SMI+RSM will corrupt state.
> > 
> > E.g. if L1 enables MPX in the guest, does not intercept L2 writes to BNDCFGS,
> > and does not load BNDCFGS on VM-Entry, then SMI+RSM would corrupt BNDCFGS since
> > the SMI "exit" would clear BNDCFGS, and the RSM "entry" would load zero.  This
> > is 100% contrived, and probably doesn't impact real world use cases, but it
> > still bugs me :-)
> 
> Exactly, should it be fixed by separate patch or leave it as is?
> 
> > 
> > > +		vmcs_writel(GUEST_SSP, vmx->nested.vmcs01_guest_ssp);
> > > +		vmcs_writel(GUEST_S_CET, vmx->nested.vmcs01_guest_s_cet);
> > > +		vmcs_writel(GUEST_INTR_SSP_TABLE,
> > > +			    vmx->nested.vmcs01_guest_ssp_tbl);
> > > +	}
> > > +
> > >  	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
> > >  
> > >  	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
> > > @@ -3373,8 +3389,14 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> > >  	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
> > >  		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> > >  	if (kvm_mpx_supported() &&
> > > -		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
> > > +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
> > >  		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> > > +	if (kvm_cet_supported() &&
> > > +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> > > +		vmx->nested.vmcs01_guest_ssp = vmcs_readl(GUEST_SSP);
> > > +		vmx->nested.vmcs01_guest_s_cet = vmcs_readl(GUEST_S_CET);
> > > +		vmx->nested.vmcs01_guest_ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
> > > +	}
> > >  
> > >  	/*
> > >  	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
> > > @@ -4001,6 +4023,9 @@ static bool is_vmcs12_ext_field(unsigned long field)
> > >  	case GUEST_IDTR_BASE:
> > >  	case GUEST_PENDING_DBG_EXCEPTIONS:
> > >  	case GUEST_BNDCFGS:
> > > +	case GUEST_SSP:
> > > +	case GUEST_INTR_SSP_TABLE:
> > > +	case GUEST_S_CET:
> > >  		return true;
> > >  	default:
> > >  		break;
> > > @@ -4050,8 +4075,14 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
> > >  	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> > >  	vmcs12->guest_pending_dbg_exceptions =
> > >  		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> > > -	if (kvm_mpx_supported())
> > > +	if (kvm_mpx_supported() && guest_cpuid_has(vcpu, X86_FEATURE_MPX))
> > 
> > Adding the CPUID check for MPX definitely needs to be a separate commit.
> 
> Sure, will fix them by separate patch. Thanks for review!
> 
> > 
> > >  		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> > > +	if (kvm_cet_supported() && (guest_cpuid_has(vcpu, X86_FEATURE_SHSTK) ||
> > > +	    guest_cpuid_has(vcpu, X86_FEATURE_IBT))) {
> > > +		vmcs12->guest_ssp = vmcs_readl(GUEST_SSP);
> > > +		vmcs12->guest_s_cet = vmcs_readl(GUEST_S_CET);
> > > +		vmcs12->guest_ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
> > > +	}
> > >  
> > >  	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
> > >  }
> > > diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> > > index 9d3a557949ac..36dc4fdb0909 100644
> > > --- a/arch/x86/kvm/vmx/vmx.h
> > > +++ b/arch/x86/kvm/vmx/vmx.h
> > > @@ -155,6 +155,9 @@ struct nested_vmx {
> > >  	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
> > >  	u64 vmcs01_debugctl;
> > >  	u64 vmcs01_guest_bndcfgs;
> > > +	u64 vmcs01_guest_ssp;
> > > +	u64 vmcs01_guest_s_cet;
> > > +	u64 vmcs01_guest_ssp_tbl;
> > >  
> > >  	/* to migrate it to L1 if L2 writes to L1's CR8 directly */
> > >  	int l1_tpr_threshold;
> > > -- 
> > > 2.26.2
> > > 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2
  2021-03-23  0:43       ` Yang Weijiang
@ 2021-03-23 15:56         ` Sean Christopherson
  2021-03-24 13:51           ` Yang Weijiang
  0 siblings, 1 reply; 9+ messages in thread
From: Sean Christopherson @ 2021-03-23 15:56 UTC (permalink / raw)
  To: Yang Weijiang; +Cc: pbonzini, vkuznets, kvm, linux-kernel

On Tue, Mar 23, 2021, Yang Weijiang wrote:
> On Tue, Mar 16, 2021 at 05:03:47PM +0800, Yang Weijiang wrote:
> 
> Hi, Sean,
> Could you respond my below rely? I'm not sure how to proceed, thanks!
> 
> > On Mon, Mar 15, 2021 at 09:45:11AM -0700, Sean Christopherson wrote:
> > > On Mon, Mar 15, 2021, Yang Weijiang wrote:

...

> > > > @@ -2556,6 +2563,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
> > > >  	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
> > > >  	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
> > > >  		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
> > > > +
> > > > +	if (kvm_cet_supported() && (!vmx->nested.nested_run_pending ||
> > > > +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))) {
> > > 
> > > Not your code per se, since this pattern comes from BNDCFGS and DEBUGCTL, but I
> > > don't see how loading vmcs01 state in this combo is correct:
> > > 
> > >     a. kvm_xxx_supported()              == 1
> > >     b. nested_run_pending               == false
> > >     c. vm_entry_controls.load_xxx_state == true
> > > 
> > > nested_vmx_enter_non_root_mode() only snapshots vmcs01 if 
> > > vm_entry_controls.load_xxx_state == false, which means the above combo is
> > > loading stale values (or more likely, zeros).
> > > 
> > > I _think_ nested_vmx_enter_non_root_mode() just needs to snapshot vmcs01 if
> > > nested_run_pending=false.  For migration, if userspace restores MSRs after
> > > KVM_SET_NESTED_STATE, then what's done here is likely irrelevant.  If userspace
> > > restores MSRs before nested state, then vmcs01 will hold the desired value since
> > > setting MSRs would have written the value into vmcs01.
> > 
> > Then the code nested_vmx_enter_non_root_mode() would look like:
> > 
> > if (kvm_cet_supported() && !vmx->nested.nested_run_pending &&
> >     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> > 	...
> >     }
> > 
> > I have another concern now, if vm_entry_controls.load_cet_state == false, and L1
> > updated vmcs fields, so the latest states are in vmcs12, but they cannot
> > be synced to vmcs02 because in prepare_vmcs02_rare():
> > 
> > if (kvm_cet_supported() && vmx->nested.nested_run_pending &&
> >     (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> > 	...
> >     }
> > 
> > so L2 got stale status. IMO, L1 guest sets vm_entry_controls.load_cet_state == false
> > should be rare case. We can even igore this case :-)

Yes, that's an L1 bug if it expects L2 state to come from vmcs12 in that case.
Architecturally, the vcms12 value won't be visible to L2 until L1 enables the
VM-Entry control, at which point KVM would detect the refreshed vmcs12 and sync
the "rare" fields.

> > > I suspect no one has reported this issue because guests simply don't use MPX,
> > > and up until the recent LBR stuff, KVM effectively zeroed out DEBUGCTL for the
> > > guest.
> > > 
> > So for MPX and DEBUGCTL, is it worth some separate fix patch?

Yes, assuming my analysis is correct.  That doesn't necessarily need to be your
responsibility, though patches are of course welcome :-)

Jim, Paolo, any thoughts?

> > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > > index 45622e9c4449..4184ff601120 100644
> > > --- a/arch/x86/kvm/vmx/nested.c
> > > +++ b/arch/x86/kvm/vmx/nested.c
> > > @@ -3298,10 +3298,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> > >         if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
> > >                 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
> > > 
> > > -       if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
> > > +       if (!vmx->nested.nested_run_pending ||
> > > +           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
> > >                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> > > -       if (kvm_mpx_supported() &&
> > > -               !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
> > > +       if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
> > > +           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
> > >                 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> > > 
> > >         /*
> > > 
> > > 
> > > Side topic, all of this code is broken for SMM emulation.  SMI+RSM don't do a
> > > full VM-Exit -> VM-Entry; the CPU forcefully exits non-root, but most state that
> > > is loaded from the VMCS is left untouched.  It's the SMI handler's responsibility
> > > to not enable features, e.g. to not set CR4.CET.  For sane use cases, this
> > > probably doesn't matter as vmcs12 will be configured to context switch state,
> > > but if L1 is doing anything out of the ordinary, SMI+RSM will corrupt state.
> > > 
> > > E.g. if L1 enables MPX in the guest, does not intercept L2 writes to BNDCFGS,
> > > and does not load BNDCFGS on VM-Entry, then SMI+RSM would corrupt BNDCFGS since
> > > the SMI "exit" would clear BNDCFGS, and the RSM "entry" would load zero.  This
> > > is 100% contrived, and probably doesn't impact real world use cases, but it
> > > still bugs me :-)
> > 
> > Exactly, should it be fixed by separate patch or leave it as is?

Definitely leave it for now, properly fixing the SMI+RSM code goes far beyond
basic CET support.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2
  2021-03-23 15:56         ` Sean Christopherson
@ 2021-03-24 13:51           ` Yang Weijiang
  0 siblings, 0 replies; 9+ messages in thread
From: Yang Weijiang @ 2021-03-24 13:51 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: Yang Weijiang, pbonzini, vkuznets, kvm, linux-kernel

On Tue, Mar 23, 2021 at 03:56:30PM +0000, Sean Christopherson wrote:
> On Tue, Mar 23, 2021, Yang Weijiang wrote:
> > On Tue, Mar 16, 2021 at 05:03:47PM +0800, Yang Weijiang wrote:
> > 
> > Hi, Sean,
> > Could you respond my below rely? I'm not sure how to proceed, thanks!
> > 
> > > On Mon, Mar 15, 2021 at 09:45:11AM -0700, Sean Christopherson wrote:
> > > > On Mon, Mar 15, 2021, Yang Weijiang wrote:
> 
> ...
> 
> > > > > @@ -2556,6 +2563,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
> > > > >  	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
> > > > >  	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
> > > > >  		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
> > > > > +
> > > > > +	if (kvm_cet_supported() && (!vmx->nested.nested_run_pending ||
> > > > > +	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))) {
> > > > 
> > > > Not your code per se, since this pattern comes from BNDCFGS and DEBUGCTL, but I
> > > > don't see how loading vmcs01 state in this combo is correct:
> > > > 
> > > >     a. kvm_xxx_supported()              == 1
> > > >     b. nested_run_pending               == false
> > > >     c. vm_entry_controls.load_xxx_state == true
> > > > 
> > > > nested_vmx_enter_non_root_mode() only snapshots vmcs01 if 
> > > > vm_entry_controls.load_xxx_state == false, which means the above combo is
> > > > loading stale values (or more likely, zeros).
> > > > 
> > > > I _think_ nested_vmx_enter_non_root_mode() just needs to snapshot vmcs01 if
> > > > nested_run_pending=false.  For migration, if userspace restores MSRs after
> > > > KVM_SET_NESTED_STATE, then what's done here is likely irrelevant.  If userspace
> > > > restores MSRs before nested state, then vmcs01 will hold the desired value since
> > > > setting MSRs would have written the value into vmcs01.
> > > 
> > > Then the code nested_vmx_enter_non_root_mode() would look like:
> > > 
> > > if (kvm_cet_supported() && !vmx->nested.nested_run_pending &&
> > >     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> > > 	...
> > >     }
> > > 
> > > I have another concern now, if vm_entry_controls.load_cet_state == false, and L1
> > > updated vmcs fields, so the latest states are in vmcs12, but they cannot
> > > be synced to vmcs02 because in prepare_vmcs02_rare():
> > > 
> > > if (kvm_cet_supported() && vmx->nested.nested_run_pending &&
> > >     (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
> > > 	...
> > >     }
> > > 
> > > so L2 got stale status. IMO, L1 guest sets vm_entry_controls.load_cet_state == false
> > > should be rare case. We can even igore this case :-)
> 
> Yes, that's an L1 bug if it expects L2 state to come from vmcs12 in that case.
> Architecturally, the vcms12 value won't be visible to L2 until L1 enables the
> VM-Entry control, at which point KVM would detect the refreshed vmcs12 and sync
> the "rare" fields.

Thanks, Sean!
So I'll change code as below:

if (kvm_cet_supported() && !vmx->nested.nested_run_pending &&
    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) {
      ...
    }
>
> > > > I suspect no one has reported this issue because guests simply don't use MPX,
> > > > and up until the recent LBR stuff, KVM effectively zeroed out DEBUGCTL for the
> > > > guest.
> > > > 
> > > So for MPX and DEBUGCTL, is it worth some separate fix patch?
> 
> Yes, assuming my analysis is correct.  That doesn't necessarily need to be your
> responsibility, though patches are of course welcome :-)
> 
> Jim, Paolo, any thoughts?
> 
OK, let me wait for Jim and Paolo's comments on this...

> > > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > > > index 45622e9c4449..4184ff601120 100644
> > > > --- a/arch/x86/kvm/vmx/nested.c
> > > > +++ b/arch/x86/kvm/vmx/nested.c
> > > > @@ -3298,10 +3298,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> > > >         if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
> > > >                 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
> > > > 
> > > > -       if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
> > > > +       if (!vmx->nested.nested_run_pending ||
> > > > +           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
> > > >                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> > > > -       if (kvm_mpx_supported() &&
> > > > -               !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
> > > > +       if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
> > > > +           !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
> > > >                 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> > > > 
> > > >         /*
> > > > 
> > > > 
> > > > Side topic, all of this code is broken for SMM emulation.  SMI+RSM don't do a
> > > > full VM-Exit -> VM-Entry; the CPU forcefully exits non-root, but most state that
> > > > is loaded from the VMCS is left untouched.  It's the SMI handler's responsibility
> > > > to not enable features, e.g. to not set CR4.CET.  For sane use cases, this
> > > > probably doesn't matter as vmcs12 will be configured to context switch state,
> > > > but if L1 is doing anything out of the ordinary, SMI+RSM will corrupt state.
> > > > 
> > > > E.g. if L1 enables MPX in the guest, does not intercept L2 writes to BNDCFGS,
> > > > and does not load BNDCFGS on VM-Entry, then SMI+RSM would corrupt BNDCFGS since
> > > > the SMI "exit" would clear BNDCFGS, and the RSM "entry" would load zero.  This
> > > > is 100% contrived, and probably doesn't impact real world use cases, but it
> > > > still bugs me :-)
> > > 
> > > Exactly, should it be fixed by separate patch or leave it as is?
> 
> Definitely leave it for now, properly fixing the SMI+RSM code goes far beyond
> basic CET support.

Sure.


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-03-24 13:39 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-15  7:18 [PATCH v4 0/3] CET fix patches for nested guest Yang Weijiang
2021-03-15  7:18 ` [PATCH v4 1/3] KVM: nVMX: Sync L2 guest CET states between L1/L2 Yang Weijiang
2021-03-15 16:45   ` Sean Christopherson
2021-03-16  9:03     ` Yang Weijiang
2021-03-23  0:43       ` Yang Weijiang
2021-03-23 15:56         ` Sean Christopherson
2021-03-24 13:51           ` Yang Weijiang
2021-03-15  7:18 ` [PATCH v4 2/3] KVM: nVMX: Set X86_CR4_CET in cr4_fixed1_bits if CET IBT is enabled Yang Weijiang
2021-03-15  7:18 ` [PATCH v4 3/3] KVM: nVMX: Add CET entry/exit load bits to evmcs unsupported list Yang Weijiang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).