linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] KVM: nVMX: msr bitmaps fixes
@ 2016-08-08 18:16 Radim Krčmář
  2016-08-08 18:16 ` [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC Radim Krčmář
  2016-08-08 18:16 ` [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write Radim Krčmář
  0 siblings, 2 replies; 16+ messages in thread
From: Radim Krčmář @ 2016-08-08 18:16 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

Jim found several bugs that allowed L2 to read L0's x2APIC MSRs and write to
TPR, EOI, and SELF_IPI, in the worst case.

The fix is split into two patches as possible causes were introduced by
two different commits.

I have not Cc'd stable as the patches are quite long and nVMX is not
considered to be ready for production, yet.


Radim Krčmář (2):
  KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC
  KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write

 arch/x86/kvm/vmx.c | 120 +++++++++++++++++++++++++----------------------------
 1 file changed, 57 insertions(+), 63 deletions(-)

-- 
2.9.2

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC
  2016-08-08 18:16 [PATCH 0/2] KVM: nVMX: msr bitmaps fixes Radim Krčmář
@ 2016-08-08 18:16 ` Radim Krčmář
  2016-08-09  9:32   ` Yang Zhang
  2016-08-16  2:53   ` Wanpeng Li
  2016-08-08 18:16 ` [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write Radim Krčmář
  1 sibling, 2 replies; 16+ messages in thread
From: Radim Krčmář @ 2016-08-08 18:16 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

msr bitmap can be used to avoid a VM exit (interception) on guest MSR
accesses.  In some configurations of VMX controls, the guest can even
directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
APIC ACCESSES.

L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
To do so, L1 would first trick KVM to disable all possible interceptions
by enabling APICv features and then would turn those features off;
nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
not intercept previously enabled MSRs even though they were not safe
with the new configuration.

Correctly re-enabling interceptions is not enough as a second bug would
still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
VMCSs, so L1 could trigger a race to get the desired combination of msr
bitmap and VMX controls.

This fix allocates a msr bitmap for every L1 VCPU, allows only safe
x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
have to intercept everything anyway.

Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
Reported-by: Jim Mattson <jmattson@google.com>
Suggested-by: Wincy Van <fanwenyi0529@gmail.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++-------------------------------
 1 file changed, 44 insertions(+), 63 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a45d8580f91e..c66ac2c70d22 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -435,6 +435,8 @@ struct nested_vmx {
 	bool pi_pending;
 	u16 posted_intr_nv;
 
+	unsigned long *msr_bitmap;
+
 	struct hrtimer preemption_timer;
 	bool preemption_timer_expired;
 
@@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
-static unsigned long *vmx_msr_bitmap_nested;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;
 
@@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 	unsigned long *msr_bitmap;
 
 	if (is_guest_mode(vcpu))
-		msr_bitmap = vmx_msr_bitmap_nested;
+		msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
 	else if (cpu_has_secondary_exec_ctrls() &&
 		 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
 		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
@@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
 	if (!vmx_msr_bitmap_longmode_x2apic)
 		goto out4;
 
-	if (nested) {
-		vmx_msr_bitmap_nested =
-			(unsigned long *)__get_free_page(GFP_KERNEL);
-		if (!vmx_msr_bitmap_nested)
-			goto out5;
-	}
-
 	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_vmread_bitmap)
 		goto out6;
@@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
 
 	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
 	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-	if (nested)
-		memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
 
 	if (setup_vmcs_config(&vmcs_config) < 0) {
 		r = -EIO;
@@ -6529,9 +6521,6 @@ out8:
 out7:
 	free_page((unsigned long)vmx_vmread_bitmap);
 out6:
-	if (nested)
-		free_page((unsigned long)vmx_msr_bitmap_nested);
-out5:
 	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
 	free_page((unsigned long)vmx_msr_bitmap_longmode);
@@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
 	free_page((unsigned long)vmx_io_bitmap_a);
 	free_page((unsigned long)vmx_vmwrite_bitmap);
 	free_page((unsigned long)vmx_vmread_bitmap);
-	if (nested)
-		free_page((unsigned long)vmx_msr_bitmap_nested);
 
 	free_kvm_area();
 }
@@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
+	if (cpu_has_vmx_msr_bitmap()) {
+		vmx->nested.msr_bitmap =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+		if (!vmx->nested.msr_bitmap)
+			goto out_msr_bitmap;
+	}
+
 	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
 	if (!vmx->nested.cached_vmcs12)
-		return -ENOMEM;
+		goto out_cached_vmcs12;
 
 	if (enable_shadow_vmcs) {
 		shadow_vmcs = alloc_vmcs();
-		if (!shadow_vmcs) {
-			kfree(vmx->nested.cached_vmcs12);
-			return -ENOMEM;
-		}
+		if (!shadow_vmcs)
+			goto out_shadow_vmcs;
 		/* mark vmcs as shadow */
 		shadow_vmcs->revision_id |= (1u << 31);
 		/* init shadow vmcs */
@@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	skip_emulated_instruction(vcpu);
 	nested_vmx_succeed(vcpu);
 	return 1;
+
+out_shadow_vmcs:
+	kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+	free_page((unsigned long)vmx->nested.msr_bitmap);
+
+out_msr_bitmap:
+	return -ENOMEM;
 }
 
 /*
@@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx)
 	vmx->nested.vmxon = false;
 	free_vpid(vmx->nested.vpid02);
 	nested_release_vmcs12(vmx);
+	if (vmx->nested.msr_bitmap) {
+		free_page((unsigned long)vmx->nested.msr_bitmap);
+		vmx->nested.msr_bitmap = NULL;
+	}
 	if (enable_shadow_vmcs)
 		free_vmcs(vmx->nested.current_shadow_vmcs);
 	kfree(vmx->nested.cached_vmcs12);
@@ -9472,8 +9477,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 {
 	int msr;
 	struct page *page;
-	unsigned long *msr_bitmap;
+	unsigned long *msr_bitmap_l1;
+	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
 
+	/* This shortcut is ok because we support only x2APIC MSRs so far. */
 	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
 		return false;
 
@@ -9482,63 +9489,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 		WARN_ON(1);
 		return false;
 	}
-	msr_bitmap = (unsigned long *)kmap(page);
-	if (!msr_bitmap) {
+	msr_bitmap_l1 = (unsigned long *)kmap(page);
+	if (!msr_bitmap_l1) {
 		nested_release_page_clean(page);
 		WARN_ON(1);
 		return false;
 	}
 
+	memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
+
 	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
 		if (nested_cpu_has_apic_reg_virt(vmcs12))
 			for (msr = 0x800; msr <= 0x8ff; msr++)
 				nested_vmx_disable_intercept_for_msr(
-					msr_bitmap,
-					vmx_msr_bitmap_nested,
+					msr_bitmap_l1, msr_bitmap_l0,
 					msr, MSR_TYPE_R);
-		/* TPR is allowed */
-		nested_vmx_disable_intercept_for_msr(msr_bitmap,
-				vmx_msr_bitmap_nested,
+
+		nested_vmx_disable_intercept_for_msr(
+				msr_bitmap_l1, msr_bitmap_l0,
 				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
 				MSR_TYPE_R | MSR_TYPE_W);
+
 		if (nested_cpu_has_vid(vmcs12)) {
-			/* EOI and self-IPI are allowed */
 			nested_vmx_disable_intercept_for_msr(
-				msr_bitmap,
-				vmx_msr_bitmap_nested,
+				msr_bitmap_l1, msr_bitmap_l0,
 				APIC_BASE_MSR + (APIC_EOI >> 4),
 				MSR_TYPE_W);
 			nested_vmx_disable_intercept_for_msr(
-				msr_bitmap,
-				vmx_msr_bitmap_nested,
+				msr_bitmap_l1, msr_bitmap_l0,
 				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
 				MSR_TYPE_W);
 		}
-	} else {
-		/*
-		 * Enable reading intercept of all the x2apic
-		 * MSRs. We should not rely on vmcs12 to do any
-		 * optimizations here, it may have been modified
-		 * by L1.
-		 */
-		for (msr = 0x800; msr <= 0x8ff; msr++)
-			__vmx_enable_intercept_for_msr(
-				vmx_msr_bitmap_nested,
-				msr,
-				MSR_TYPE_R);
-
-		__vmx_enable_intercept_for_msr(
-				vmx_msr_bitmap_nested,
-				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
-				MSR_TYPE_W);
-		__vmx_enable_intercept_for_msr(
-				vmx_msr_bitmap_nested,
-				APIC_BASE_MSR + (APIC_EOI >> 4),
-				MSR_TYPE_W);
-		__vmx_enable_intercept_for_msr(
-				vmx_msr_bitmap_nested,
-				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
-				MSR_TYPE_W);
 	}
 	kunmap(page);
 	nested_release_page_clean(page);
@@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	}
 
 	if (cpu_has_vmx_msr_bitmap() &&
-	    exec_control & CPU_BASED_USE_MSR_BITMAPS) {
-		nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
-		/* MSR_BITMAP will be set by following vmx_set_efer. */
-	} else
+	    exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
+		; /* MSR_BITMAP will be set by following vmx_set_efer. */
+	else
 		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
 
 	/*
-- 
2.9.2

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write
  2016-08-08 18:16 [PATCH 0/2] KVM: nVMX: msr bitmaps fixes Radim Krčmář
  2016-08-08 18:16 ` [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC Radim Krčmář
@ 2016-08-08 18:16 ` Radim Krčmář
  2016-08-12  6:07   ` Wanpeng Li
  2016-08-16  2:43   ` Wanpeng Li
  1 sibling, 2 replies; 16+ messages in thread
From: Radim Krčmář @ 2016-08-08 18:16 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

If vmcs12 does not intercept APIC_BASE writes, then KVM will handle the
write with vmcs02 as the current VMCS.
This will incorrectly apply modifications intended for vmcs01 to vmcs02
and L2 can use it to gain access to L0's x2APIC registers by disabling
virtualized x2APIC while using msr bitmap that assumes enabled.

Postpone execution of vmx_set_virtual_x2apic_mode until vmcs01 is the
current VMCS.  An alternative solution would temporarily make vmcs01 the
current VMCS, but it requires more care.

Fixes: 8d14695f9542 ("x86, apicv: add virtual x2apic support")
Reported-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c66ac2c70d22..ae111a07acc4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -422,6 +422,7 @@ struct nested_vmx {
 	struct list_head vmcs02_pool;
 	int vmcs02_num;
 	u64 vmcs01_tsc_offset;
+	bool change_vmcs01_virtual_x2apic_mode;
 	/* L2 must run next, and mustn't decide to exit to L1. */
 	bool nested_run_pending;
 	/*
@@ -8424,6 +8425,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 {
 	u32 sec_exec_control;
 
+	/* Postpone execution until vmcs01 is the current VMCS. */
+	if (is_guest_mode(vcpu)) {
+		to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+		return;
+	}
+
 	/*
 	 * There is not point to enable virtualize x2apic without enable
 	 * apicv
@@ -10749,6 +10756,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 		vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
 			      PIN_BASED_VMX_PREEMPTION_TIMER);
 
+	if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
+		vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
+		vmx_set_virtual_x2apic_mode(vcpu,
+				vcpu->arch.apic_base & X2APIC_ENABLE);
+	}
+
 	/* This is needed for same reason as it was needed in prepare_vmcs02 */
 	vmx->host_rsp = 0;
 
-- 
2.9.2

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC
  2016-08-08 18:16 ` [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC Radim Krčmář
@ 2016-08-09  9:32   ` Yang Zhang
  2016-08-09 10:19     ` Wincy Van
  2016-08-09 12:23     ` Radim Krčmář
  2016-08-16  2:53   ` Wanpeng Li
  1 sibling, 2 replies; 16+ messages in thread
From: Yang Zhang @ 2016-08-09  9:32 UTC (permalink / raw)
  To: Radim Krčmář, linux-kernel, kvm
  Cc: Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

On 2016/8/9 2:16, Radim Krčmář wrote:
> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
> accesses.  In some configurations of VMX controls, the guest can even
> directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
> APIC ACCESSES.
>
> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
> To do so, L1 would first trick KVM to disable all possible interceptions
> by enabling APICv features and then would turn those features off;
> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
> not intercept previously enabled MSRs even though they were not safe
> with the new configuration.
>
> Correctly re-enabling interceptions is not enough as a second bug would
> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
> VMCSs, so L1 could trigger a race to get the desired combination of msr
> bitmap and VMX controls.
>
> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
> have to intercept everything anyway.
>
> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
> Reported-by: Jim Mattson <jmattson@google.com>
> Suggested-by: Wincy Van <fanwenyi0529@gmail.com>
> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
> ---
>  arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++-------------------------------
>  1 file changed, 44 insertions(+), 63 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index a45d8580f91e..c66ac2c70d22 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -435,6 +435,8 @@ struct nested_vmx {
>  	bool pi_pending;
>  	u16 posted_intr_nv;
>
> +	unsigned long *msr_bitmap;
> +
>  	struct hrtimer preemption_timer;
>  	bool preemption_timer_expired;
>
> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
>  static unsigned long *vmx_msr_bitmap_longmode;
>  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
> -static unsigned long *vmx_msr_bitmap_nested;
>  static unsigned long *vmx_vmread_bitmap;
>  static unsigned long *vmx_vmwrite_bitmap;
>
> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>  	unsigned long *msr_bitmap;
>
>  	if (is_guest_mode(vcpu))
> -		msr_bitmap = vmx_msr_bitmap_nested;
> +		msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
>  	else if (cpu_has_secondary_exec_ctrls() &&
>  		 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
>  		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
>  	if (!vmx_msr_bitmap_longmode_x2apic)
>  		goto out4;
>
> -	if (nested) {
> -		vmx_msr_bitmap_nested =
> -			(unsigned long *)__get_free_page(GFP_KERNEL);
> -		if (!vmx_msr_bitmap_nested)
> -			goto out5;
> -	}
> -
>  	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
>  	if (!vmx_vmread_bitmap)
>  		goto out6;
> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
>
>  	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
>  	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
> -	if (nested)
> -		memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>
>  	if (setup_vmcs_config(&vmcs_config) < 0) {
>  		r = -EIO;
> @@ -6529,9 +6521,6 @@ out8:
>  out7:
>  	free_page((unsigned long)vmx_vmread_bitmap);
>  out6:
> -	if (nested)
> -		free_page((unsigned long)vmx_msr_bitmap_nested);
> -out5:
>  	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>  out4:
>  	free_page((unsigned long)vmx_msr_bitmap_longmode);
> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
>  	free_page((unsigned long)vmx_io_bitmap_a);
>  	free_page((unsigned long)vmx_vmwrite_bitmap);
>  	free_page((unsigned long)vmx_vmread_bitmap);
> -	if (nested)
> -		free_page((unsigned long)vmx_msr_bitmap_nested);
>
>  	free_kvm_area();
>  }
> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>  		return 1;
>  	}
>
> +	if (cpu_has_vmx_msr_bitmap()) {
> +		vmx->nested.msr_bitmap =
> +				(unsigned long *)__get_free_page(GFP_KERNEL);
> +		if (!vmx->nested.msr_bitmap)
> +			goto out_msr_bitmap;
> +	}
> +

We export msr_bitmap to the guest even if it is not supported by hardware. So 
we need to allocate msr_bitmap for L1 unconditionally.

>  	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
>  	if (!vmx->nested.cached_vmcs12)
> -		return -ENOMEM;
> +		goto out_cached_vmcs12;
>
>  	if (enable_shadow_vmcs) {
>  		shadow_vmcs = alloc_vmcs();
> -		if (!shadow_vmcs) {
> -			kfree(vmx->nested.cached_vmcs12);
> -			return -ENOMEM;
> -		}
> +		if (!shadow_vmcs)
> +			goto out_shadow_vmcs;
>  		/* mark vmcs as shadow */
>  		shadow_vmcs->revision_id |= (1u << 31);
>  		/* init shadow vmcs */
> @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>  	skip_emulated_instruction(vcpu);
>  	nested_vmx_succeed(vcpu);
>  	return 1;
> +
> +out_shadow_vmcs:
> +	kfree(vmx->nested.cached_vmcs12);
> +
> +out_cached_vmcs12:
> +	free_page((unsigned long)vmx->nested.msr_bitmap);
> +
> +out_msr_bitmap:
> +	return -ENOMEM;
>  }
>
>  /*
> @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx)
>  	vmx->nested.vmxon = false;
>  	free_vpid(vmx->nested.vpid02);
>  	nested_release_vmcs12(vmx);
> +	if (vmx->nested.msr_bitmap) {
> +		free_page((unsigned long)vmx->nested.msr_bitmap);
> +		vmx->nested.msr_bitmap = NULL;
> +	}
>  	if (enable_shadow_vmcs)
>  		free_vmcs(vmx->nested.current_shadow_vmcs);
>  	kfree(vmx->nested.cached_vmcs12);
> @@ -9472,8 +9477,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>  {
>  	int msr;
>  	struct page *page;
> -	unsigned long *msr_bitmap;
> +	unsigned long *msr_bitmap_l1;
> +	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
>
> +	/* This shortcut is ok because we support only x2APIC MSRs so far. */
>  	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
>  		return false;
>
> @@ -9482,63 +9489,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>  		WARN_ON(1);
>  		return false;
>  	}
> -	msr_bitmap = (unsigned long *)kmap(page);
> -	if (!msr_bitmap) {
> +	msr_bitmap_l1 = (unsigned long *)kmap(page);
> +	if (!msr_bitmap_l1) {
>  		nested_release_page_clean(page);
>  		WARN_ON(1);
>  		return false;
>  	}
>
> +	memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
> +
>  	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
>  		if (nested_cpu_has_apic_reg_virt(vmcs12))
>  			for (msr = 0x800; msr <= 0x8ff; msr++)
>  				nested_vmx_disable_intercept_for_msr(
> -					msr_bitmap,
> -					vmx_msr_bitmap_nested,
> +					msr_bitmap_l1, msr_bitmap_l0,
>  					msr, MSR_TYPE_R);
> -		/* TPR is allowed */
> -		nested_vmx_disable_intercept_for_msr(msr_bitmap,
> -				vmx_msr_bitmap_nested,
> +
> +		nested_vmx_disable_intercept_for_msr(
> +				msr_bitmap_l1, msr_bitmap_l0,
>  				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>  				MSR_TYPE_R | MSR_TYPE_W);
> +
>  		if (nested_cpu_has_vid(vmcs12)) {
> -			/* EOI and self-IPI are allowed */
>  			nested_vmx_disable_intercept_for_msr(
> -				msr_bitmap,
> -				vmx_msr_bitmap_nested,
> +				msr_bitmap_l1, msr_bitmap_l0,
>  				APIC_BASE_MSR + (APIC_EOI >> 4),
>  				MSR_TYPE_W);
>  			nested_vmx_disable_intercept_for_msr(
> -				msr_bitmap,
> -				vmx_msr_bitmap_nested,
> +				msr_bitmap_l1, msr_bitmap_l0,
>  				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
>  				MSR_TYPE_W);
>  		}
> -	} else {
> -		/*
> -		 * Enable reading intercept of all the x2apic
> -		 * MSRs. We should not rely on vmcs12 to do any
> -		 * optimizations here, it may have been modified
> -		 * by L1.
> -		 */
> -		for (msr = 0x800; msr <= 0x8ff; msr++)
> -			__vmx_enable_intercept_for_msr(
> -				vmx_msr_bitmap_nested,
> -				msr,
> -				MSR_TYPE_R);
> -
> -		__vmx_enable_intercept_for_msr(
> -				vmx_msr_bitmap_nested,
> -				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
> -				MSR_TYPE_W);
> -		__vmx_enable_intercept_for_msr(
> -				vmx_msr_bitmap_nested,
> -				APIC_BASE_MSR + (APIC_EOI >> 4),
> -				MSR_TYPE_W);
> -		__vmx_enable_intercept_for_msr(
> -				vmx_msr_bitmap_nested,
> -				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
> -				MSR_TYPE_W);
>  	}
>  	kunmap(page);
>  	nested_release_page_clean(page);
> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	}
>
>  	if (cpu_has_vmx_msr_bitmap() &&
> -	    exec_control & CPU_BASED_USE_MSR_BITMAPS) {
> -		nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
> -		/* MSR_BITMAP will be set by following vmx_set_efer. */
> -	} else
> +	    exec_control & CPU_BASED_USE_MSR_BITMAPS &&
> +	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
> +		; /* MSR_BITMAP will be set by following vmx_set_efer. */
> +	else
>  		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
>
>  	/*
>


-- 
Yang
Alibaba Cloud Computing

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC
  2016-08-09  9:32   ` Yang Zhang
@ 2016-08-09 10:19     ` Wincy Van
  2016-08-10  5:52       ` Yang Zhang
  2016-08-09 12:23     ` Radim Krčmář
  1 sibling, 1 reply; 16+ messages in thread
From: Wincy Van @ 2016-08-09 10:19 UTC (permalink / raw)
  To: Yang Zhang
  Cc: Radim Krčmář,
	linux-kernel, kvm, Jim Mattson, Paolo Bonzini, Bandan Das

On Tue, Aug 9, 2016 at 5:32 PM, Yang Zhang <yang.zhang.wz@gmail.com> wrote:
> On 2016/8/9 2:16, Radim Krčmář wrote:
>>
>> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
>> accesses.  In some configurations of VMX controls, the guest can even
>> directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
>> APIC ACCESSES.
>>
>> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
>> To do so, L1 would first trick KVM to disable all possible interceptions
>> by enabling APICv features and then would turn those features off;
>> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
>> not intercept previously enabled MSRs even though they were not safe
>> with the new configuration.
>>
>> Correctly re-enabling interceptions is not enough as a second bug would
>> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
>> VMCSs, so L1 could trigger a race to get the desired combination of msr
>> bitmap and VMX controls.
>>
>> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
>> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
>> have to intercept everything anyway.
>>
>> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
>> Reported-by: Jim Mattson <jmattson@google.com>
>> Suggested-by: Wincy Van <fanwenyi0529@gmail.com>
>> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
>> ---
>>  arch/x86/kvm/vmx.c | 107
>> ++++++++++++++++++++++-------------------------------
>>  1 file changed, 44 insertions(+), 63 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index a45d8580f91e..c66ac2c70d22 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -435,6 +435,8 @@ struct nested_vmx {
>>         bool pi_pending;
>>         u16 posted_intr_nv;
>>
>> +       unsigned long *msr_bitmap;
>> +
>>         struct hrtimer preemption_timer;
>>         bool preemption_timer_expired;
>>
>> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
>>  static unsigned long *vmx_msr_bitmap_longmode;
>>  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>>  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
>> -static unsigned long *vmx_msr_bitmap_nested;
>>  static unsigned long *vmx_vmread_bitmap;
>>  static unsigned long *vmx_vmwrite_bitmap;
>>
>> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu
>> *vcpu)
>>         unsigned long *msr_bitmap;
>>
>>         if (is_guest_mode(vcpu))
>> -               msr_bitmap = vmx_msr_bitmap_nested;
>> +               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
>>         else if (cpu_has_secondary_exec_ctrls() &&
>>                  (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
>>                   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
>> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
>>         if (!vmx_msr_bitmap_longmode_x2apic)
>>                 goto out4;
>>
>> -       if (nested) {
>> -               vmx_msr_bitmap_nested =
>> -                       (unsigned long *)__get_free_page(GFP_KERNEL);
>> -               if (!vmx_msr_bitmap_nested)
>> -                       goto out5;
>> -       }
>> -
>>         vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
>>         if (!vmx_vmread_bitmap)
>>                 goto out6;
>> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
>>
>>         memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
>>         memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
>> -       if (nested)
>> -               memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>>
>>         if (setup_vmcs_config(&vmcs_config) < 0) {
>>                 r = -EIO;
>> @@ -6529,9 +6521,6 @@ out8:
>>  out7:
>>         free_page((unsigned long)vmx_vmread_bitmap);
>>  out6:
>> -       if (nested)
>> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>> -out5:
>>         free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>>  out4:
>>         free_page((unsigned long)vmx_msr_bitmap_longmode);
>> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
>>         free_page((unsigned long)vmx_io_bitmap_a);
>>         free_page((unsigned long)vmx_vmwrite_bitmap);
>>         free_page((unsigned long)vmx_vmread_bitmap);
>> -       if (nested)
>> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>>
>>         free_kvm_area();
>>  }
>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>                 return 1;
>>         }
>>
>> +       if (cpu_has_vmx_msr_bitmap()) {
>> +               vmx->nested.msr_bitmap =
>> +                               (unsigned long
>> *)__get_free_page(GFP_KERNEL);
>> +               if (!vmx->nested.msr_bitmap)
>> +                       goto out_msr_bitmap;
>> +       }
>> +
>
>
> We export msr_bitmap to guest even it is not supported by hardware. So we
> need to allocate msr_bitmap for L1 unconditionally.
>
>

Nice to see you, Yang :-)

I think vmx->nested.msr_bitmap is filled from L0's and L1's settings, and
used by hardware.
L1's msr_bitmap is managed by L1 itself, and L1 will write the MSR_BITMAP field
to vmcs12->msr_bitmap, so L0 can get L1's settings to emulate MSR_BITMAP.

Have I missed something?

Thanks,
Wincy

>>         vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
>>         if (!vmx->nested.cached_vmcs12)
>> -               return -ENOMEM;
>> +               goto out_cached_vmcs12;
>>
>>         if (enable_shadow_vmcs) {
>>                 shadow_vmcs = alloc_vmcs();
>> -               if (!shadow_vmcs) {
>> -                       kfree(vmx->nested.cached_vmcs12);
>> -                       return -ENOMEM;
>> -               }
>> +               if (!shadow_vmcs)
>> +                       goto out_shadow_vmcs;
>>                 /* mark vmcs as shadow */
>>                 shadow_vmcs->revision_id |= (1u << 31);
>>                 /* init shadow vmcs */
>> @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>         skip_emulated_instruction(vcpu);
>>         nested_vmx_succeed(vcpu);
>>         return 1;
>> +
>> +out_shadow_vmcs:
>> +       kfree(vmx->nested.cached_vmcs12);
>> +
>> +out_cached_vmcs12:
>> +       free_page((unsigned long)vmx->nested.msr_bitmap);
>> +
>> +out_msr_bitmap:
>> +       return -ENOMEM;
>>  }
>>
>>  /*
>> @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx)
>>         vmx->nested.vmxon = false;
>>         free_vpid(vmx->nested.vpid02);
>>         nested_release_vmcs12(vmx);
>> +       if (vmx->nested.msr_bitmap) {
>> +               free_page((unsigned long)vmx->nested.msr_bitmap);
>> +               vmx->nested.msr_bitmap = NULL;
>> +       }
>>         if (enable_shadow_vmcs)
>>                 free_vmcs(vmx->nested.current_shadow_vmcs);
>>         kfree(vmx->nested.cached_vmcs12);
>> @@ -9472,8 +9477,10 @@ static inline bool
>> nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>  {
>>         int msr;
>>         struct page *page;
>> -       unsigned long *msr_bitmap;
>> +       unsigned long *msr_bitmap_l1;
>> +       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
>>
>> +       /* This shortcut is ok because we support only x2APIC MSRs so far.
>> */
>>         if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
>>                 return false;
>>
>> @@ -9482,63 +9489,37 @@ static inline bool
>> nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>                 WARN_ON(1);
>>                 return false;
>>         }
>> -       msr_bitmap = (unsigned long *)kmap(page);
>> -       if (!msr_bitmap) {
>> +       msr_bitmap_l1 = (unsigned long *)kmap(page);
>> +       if (!msr_bitmap_l1) {
>>                 nested_release_page_clean(page);
>>                 WARN_ON(1);
>>                 return false;
>>         }
>>
>> +       memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
>> +
>>         if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
>>                 if (nested_cpu_has_apic_reg_virt(vmcs12))
>>                         for (msr = 0x800; msr <= 0x8ff; msr++)
>>                                 nested_vmx_disable_intercept_for_msr(
>> -                                       msr_bitmap,
>> -                                       vmx_msr_bitmap_nested,
>> +                                       msr_bitmap_l1, msr_bitmap_l0,
>>                                         msr, MSR_TYPE_R);
>> -               /* TPR is allowed */
>> -               nested_vmx_disable_intercept_for_msr(msr_bitmap,
>> -                               vmx_msr_bitmap_nested,
>> +
>> +               nested_vmx_disable_intercept_for_msr(
>> +                               msr_bitmap_l1, msr_bitmap_l0,
>>                                 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>>                                 MSR_TYPE_R | MSR_TYPE_W);
>> +
>>                 if (nested_cpu_has_vid(vmcs12)) {
>> -                       /* EOI and self-IPI are allowed */
>>                         nested_vmx_disable_intercept_for_msr(
>> -                               msr_bitmap,
>> -                               vmx_msr_bitmap_nested,
>> +                               msr_bitmap_l1, msr_bitmap_l0,
>>                                 APIC_BASE_MSR + (APIC_EOI >> 4),
>>                                 MSR_TYPE_W);
>>                         nested_vmx_disable_intercept_for_msr(
>> -                               msr_bitmap,
>> -                               vmx_msr_bitmap_nested,
>> +                               msr_bitmap_l1, msr_bitmap_l0,
>>                                 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
>>                                 MSR_TYPE_W);
>>                 }
>> -       } else {
>> -               /*
>> -                * Enable reading intercept of all the x2apic
>> -                * MSRs. We should not rely on vmcs12 to do any
>> -                * optimizations here, it may have been modified
>> -                * by L1.
>> -                */
>> -               for (msr = 0x800; msr <= 0x8ff; msr++)
>> -                       __vmx_enable_intercept_for_msr(
>> -                               vmx_msr_bitmap_nested,
>> -                               msr,
>> -                               MSR_TYPE_R);
>> -
>> -               __vmx_enable_intercept_for_msr(
>> -                               vmx_msr_bitmap_nested,
>> -                               APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>> -                               MSR_TYPE_W);
>> -               __vmx_enable_intercept_for_msr(
>> -                               vmx_msr_bitmap_nested,
>> -                               APIC_BASE_MSR + (APIC_EOI >> 4),
>> -                               MSR_TYPE_W);
>> -               __vmx_enable_intercept_for_msr(
>> -                               vmx_msr_bitmap_nested,
>> -                               APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
>> -                               MSR_TYPE_W);
>>         }
>>         kunmap(page);
>>         nested_release_page_clean(page);
>> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
>> struct vmcs12 *vmcs12)
>>         }
>>
>>         if (cpu_has_vmx_msr_bitmap() &&
>> -           exec_control & CPU_BASED_USE_MSR_BITMAPS) {
>> -               nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
>> -               /* MSR_BITMAP will be set by following vmx_set_efer. */
>> -       } else
>> +           exec_control & CPU_BASED_USE_MSR_BITMAPS &&
>> +           nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
>> +               ; /* MSR_BITMAP will be set by following vmx_set_efer. */
>> +       else
>>                 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
>>
>>         /*
>>
>
>
> --
> Yang
> Alibaba Cloud Computing

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC
  2016-08-09  9:32   ` Yang Zhang
  2016-08-09 10:19     ` Wincy Van
@ 2016-08-09 12:23     ` Radim Krčmář
  2016-08-10  5:55       ` Yang Zhang
  1 sibling, 1 reply; 16+ messages in thread
From: Radim Krčmář @ 2016-08-09 12:23 UTC (permalink / raw)
  To: Yang Zhang
  Cc: linux-kernel, kvm, Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

2016-08-09 17:32+0800, Yang Zhang:
> On 2016/8/9 2:16, Radim Krčmář wrote:
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>  		return 1;
>>  	}
>> 
>> +	if (cpu_has_vmx_msr_bitmap()) {
>> +		vmx->nested.msr_bitmap =
>> +				(unsigned long *)__get_free_page(GFP_KERNEL);
>> +		if (!vmx->nested.msr_bitmap)
>> +			goto out_msr_bitmap;
>> +	}
>> +
> 
> We export msr_bitmap to guest even it is not supported by hardware. So we
> need to allocate msr_bitmap for L1 unconditionally.

We do emulate the feature, but the vmx->nested.msr_bitmap is used only
if VMX supports it to avoid some VM exits:

>> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>  	}
>> 
>>  	if (cpu_has_vmx_msr_bitmap() &&
>> -	    exec_control & CPU_BASED_USE_MSR_BITMAPS) {
>> -		nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
>> -		/* MSR_BITMAP will be set by following vmx_set_efer. */
>> -	} else
>> +	    exec_control & CPU_BASED_USE_MSR_BITMAPS &&
>> +	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
>> +		; /* MSR_BITMAP will be set by following vmx_set_efer. */
>> +	else
>>  		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;

The else branch is taken if !cpu_has_vmx_msr_bitmap() and disables msr
bitmaps.  Similar check for vmx_set_msr_bitmap(), so the NULL doesn't
even get written to VMCS.

KVM always uses L1's msr bitmaps when emulating the feature.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC
  2016-08-09 10:19     ` Wincy Van
@ 2016-08-10  5:52       ` Yang Zhang
  0 siblings, 0 replies; 16+ messages in thread
From: Yang Zhang @ 2016-08-10  5:52 UTC (permalink / raw)
  To: Wincy Van
  Cc: Radim Krčmář,
	linux-kernel, kvm, Jim Mattson, Paolo Bonzini, Bandan Das

On 2016/8/9 18:19, Wincy Van wrote:
> On Tue, Aug 9, 2016 at 5:32 PM, Yang Zhang <yang.zhang.wz@gmail.com> wrote:
>> On 2016/8/9 2:16, Radim Krčmář wrote:
>>>
>>> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
>>> accesses.  In some configurations of VMX controls, the guest can even
>>> directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
>>> APIC ACCESSES.
>>>
>>> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
>>> To do so, L1 would first trick KVM to disable all possible interceptions
>>> by enabling APICv features and then would turn those features off;
>>> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
>>> not intercept previously enabled MSRs even though they were not safe
>>> with the new configuration.
>>>
>>> Correctly re-enabling interceptions is not enough as a second bug would
>>> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
>>> VMCSs, so L1 could trigger a race to get the desired combination of msr
>>> bitmap and VMX controls.
>>>
>>> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
>>> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
>>> have to intercept everything anyway.
>>>
>>> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
>>> Reported-by: Jim Mattson <jmattson@google.com>
>>> Suggested-by: Wincy Van <fanwenyi0529@gmail.com>
>>> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
>>> ---
>>>  arch/x86/kvm/vmx.c | 107
>>> ++++++++++++++++++++++-------------------------------
>>>  1 file changed, 44 insertions(+), 63 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index a45d8580f91e..c66ac2c70d22 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -435,6 +435,8 @@ struct nested_vmx {
>>>         bool pi_pending;
>>>         u16 posted_intr_nv;
>>>
>>> +       unsigned long *msr_bitmap;
>>> +
>>>         struct hrtimer preemption_timer;
>>>         bool preemption_timer_expired;
>>>
>>> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
>>>  static unsigned long *vmx_msr_bitmap_longmode;
>>>  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>>>  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
>>> -static unsigned long *vmx_msr_bitmap_nested;
>>>  static unsigned long *vmx_vmread_bitmap;
>>>  static unsigned long *vmx_vmwrite_bitmap;
>>>
>>> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu
>>> *vcpu)
>>>         unsigned long *msr_bitmap;
>>>
>>>         if (is_guest_mode(vcpu))
>>> -               msr_bitmap = vmx_msr_bitmap_nested;
>>> +               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
>>>         else if (cpu_has_secondary_exec_ctrls() &&
>>>                  (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
>>>                   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
>>> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
>>>         if (!vmx_msr_bitmap_longmode_x2apic)
>>>                 goto out4;
>>>
>>> -       if (nested) {
>>> -               vmx_msr_bitmap_nested =
>>> -                       (unsigned long *)__get_free_page(GFP_KERNEL);
>>> -               if (!vmx_msr_bitmap_nested)
>>> -                       goto out5;
>>> -       }
>>> -
>>>         vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
>>>         if (!vmx_vmread_bitmap)
>>>                 goto out6;
>>> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
>>>
>>>         memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
>>>         memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
>>> -       if (nested)
>>> -               memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>>>
>>>         if (setup_vmcs_config(&vmcs_config) < 0) {
>>>                 r = -EIO;
>>> @@ -6529,9 +6521,6 @@ out8:
>>>  out7:
>>>         free_page((unsigned long)vmx_vmread_bitmap);
>>>  out6:
>>> -       if (nested)
>>> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>>> -out5:
>>>         free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>>>  out4:
>>>         free_page((unsigned long)vmx_msr_bitmap_longmode);
>>> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
>>>         free_page((unsigned long)vmx_io_bitmap_a);
>>>         free_page((unsigned long)vmx_vmwrite_bitmap);
>>>         free_page((unsigned long)vmx_vmread_bitmap);
>>> -       if (nested)
>>> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>>>
>>>         free_kvm_area();
>>>  }
>>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>>                 return 1;
>>>         }
>>>
>>> +       if (cpu_has_vmx_msr_bitmap()) {
>>> +               vmx->nested.msr_bitmap =
>>> +                               (unsigned long
>>> *)__get_free_page(GFP_KERNEL);
>>> +               if (!vmx->nested.msr_bitmap)
>>> +                       goto out_msr_bitmap;
>>> +       }
>>> +
>>
>>
>> We export msr_bitmap to guest even it is not supported by hardware. So we
>> need to allocate msr_bitmap for L1 unconditionally.
>>
>>
>
> Nice to see you, Yang :-)

Nice to see you too.

>
> I think vmx->nested.msr_bitmap is filled by L0 and L1's settings, and
> used by hardware.
> L1's msr_bitmap is managed by itself, and L1 will write MSR_BITMAP field to
> vmcs12->msr_bitmap, then L0 can get L1's settings to emulate MSR_BITMAP.
>
> Am I missed something?

You are correct!

>
> Thanks,
> Wincy
>
>>>         vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
>>>         if (!vmx->nested.cached_vmcs12)
>>> -               return -ENOMEM;
>>> +               goto out_cached_vmcs12;
>>>
>>>         if (enable_shadow_vmcs) {
>>>                 shadow_vmcs = alloc_vmcs();
>>> -               if (!shadow_vmcs) {
>>> -                       kfree(vmx->nested.cached_vmcs12);
>>> -                       return -ENOMEM;
>>> -               }
>>> +               if (!shadow_vmcs)
>>> +                       goto out_shadow_vmcs;
>>>                 /* mark vmcs as shadow */
>>>                 shadow_vmcs->revision_id |= (1u << 31);
>>>                 /* init shadow vmcs */
>>> @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>>         skip_emulated_instruction(vcpu);
>>>         nested_vmx_succeed(vcpu);
>>>         return 1;
>>> +
>>> +out_shadow_vmcs:
>>> +       kfree(vmx->nested.cached_vmcs12);
>>> +
>>> +out_cached_vmcs12:
>>> +       free_page((unsigned long)vmx->nested.msr_bitmap);
>>> +
>>> +out_msr_bitmap:
>>> +       return -ENOMEM;
>>>  }
>>>
>>>  /*
>>> @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx)
>>>         vmx->nested.vmxon = false;
>>>         free_vpid(vmx->nested.vpid02);
>>>         nested_release_vmcs12(vmx);
>>> +       if (vmx->nested.msr_bitmap) {
>>> +               free_page((unsigned long)vmx->nested.msr_bitmap);
>>> +               vmx->nested.msr_bitmap = NULL;
>>> +       }
>>>         if (enable_shadow_vmcs)
>>>                 free_vmcs(vmx->nested.current_shadow_vmcs);
>>>         kfree(vmx->nested.cached_vmcs12);
>>> @@ -9472,8 +9477,10 @@ static inline bool
>>> nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>>  {
>>>         int msr;
>>>         struct page *page;
>>> -       unsigned long *msr_bitmap;
>>> +       unsigned long *msr_bitmap_l1;
>>> +       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
>>>
>>> +       /* This shortcut is ok because we support only x2APIC MSRs so far.
>>> */
>>>         if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
>>>                 return false;
>>>
>>> @@ -9482,63 +9489,37 @@ static inline bool
>>> nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>>                 WARN_ON(1);
>>>                 return false;
>>>         }
>>> -       msr_bitmap = (unsigned long *)kmap(page);
>>> -       if (!msr_bitmap) {
>>> +       msr_bitmap_l1 = (unsigned long *)kmap(page);
>>> +       if (!msr_bitmap_l1) {
>>>                 nested_release_page_clean(page);
>>>                 WARN_ON(1);
>>>                 return false;
>>>         }
>>>
>>> +       memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
>>> +
>>>         if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
>>>                 if (nested_cpu_has_apic_reg_virt(vmcs12))
>>>                         for (msr = 0x800; msr <= 0x8ff; msr++)
>>>                                 nested_vmx_disable_intercept_for_msr(
>>> -                                       msr_bitmap,
>>> -                                       vmx_msr_bitmap_nested,
>>> +                                       msr_bitmap_l1, msr_bitmap_l0,
>>>                                         msr, MSR_TYPE_R);
>>> -               /* TPR is allowed */
>>> -               nested_vmx_disable_intercept_for_msr(msr_bitmap,
>>> -                               vmx_msr_bitmap_nested,
>>> +
>>> +               nested_vmx_disable_intercept_for_msr(
>>> +                               msr_bitmap_l1, msr_bitmap_l0,
>>>                                 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>>>                                 MSR_TYPE_R | MSR_TYPE_W);
>>> +
>>>                 if (nested_cpu_has_vid(vmcs12)) {
>>> -                       /* EOI and self-IPI are allowed */
>>>                         nested_vmx_disable_intercept_for_msr(
>>> -                               msr_bitmap,
>>> -                               vmx_msr_bitmap_nested,
>>> +                               msr_bitmap_l1, msr_bitmap_l0,
>>>                                 APIC_BASE_MSR + (APIC_EOI >> 4),
>>>                                 MSR_TYPE_W);
>>>                         nested_vmx_disable_intercept_for_msr(
>>> -                               msr_bitmap,
>>> -                               vmx_msr_bitmap_nested,
>>> +                               msr_bitmap_l1, msr_bitmap_l0,
>>>                                 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
>>>                                 MSR_TYPE_W);
>>>                 }
>>> -       } else {
>>> -               /*
>>> -                * Enable reading intercept of all the x2apic
>>> -                * MSRs. We should not rely on vmcs12 to do any
>>> -                * optimizations here, it may have been modified
>>> -                * by L1.
>>> -                */
>>> -               for (msr = 0x800; msr <= 0x8ff; msr++)
>>> -                       __vmx_enable_intercept_for_msr(
>>> -                               vmx_msr_bitmap_nested,
>>> -                               msr,
>>> -                               MSR_TYPE_R);
>>> -
>>> -               __vmx_enable_intercept_for_msr(
>>> -                               vmx_msr_bitmap_nested,
>>> -                               APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>>> -                               MSR_TYPE_W);
>>> -               __vmx_enable_intercept_for_msr(
>>> -                               vmx_msr_bitmap_nested,
>>> -                               APIC_BASE_MSR + (APIC_EOI >> 4),
>>> -                               MSR_TYPE_W);
>>> -               __vmx_enable_intercept_for_msr(
>>> -                               vmx_msr_bitmap_nested,
>>> -                               APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
>>> -                               MSR_TYPE_W);
>>>         }
>>>         kunmap(page);
>>>         nested_release_page_clean(page);
>>> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
>>> struct vmcs12 *vmcs12)
>>>         }
>>>
>>>         if (cpu_has_vmx_msr_bitmap() &&
>>> -           exec_control & CPU_BASED_USE_MSR_BITMAPS) {
>>> -               nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
>>> -               /* MSR_BITMAP will be set by following vmx_set_efer. */
>>> -       } else
>>> +           exec_control & CPU_BASED_USE_MSR_BITMAPS &&
>>> +           nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
>>> +               ; /* MSR_BITMAP will be set by following vmx_set_efer. */
>>> +       else
>>>                 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
>>>
>>>         /*
>>>
>>
>>
>> --
>> Yang
>> Alibaba Cloud Computing


-- 
Yang
Alibaba Cloud Computing

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC
  2016-08-09 12:23     ` Radim Krčmář
@ 2016-08-10  5:55       ` Yang Zhang
  0 siblings, 0 replies; 16+ messages in thread
From: Yang Zhang @ 2016-08-10  5:55 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: linux-kernel, kvm, Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

On 2016/8/9 20:23, Radim Krčmář wrote:
> 2016-08-09 17:32+0800, Yang Zhang:
>> On 2016/8/9 2:16, Radim Krčmář wrote:
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>>  		return 1;
>>>  	}
>>>
>>> +	if (cpu_has_vmx_msr_bitmap()) {
>>> +		vmx->nested.msr_bitmap =
>>> +				(unsigned long *)__get_free_page(GFP_KERNEL);
>>> +		if (!vmx->nested.msr_bitmap)
>>> +			goto out_msr_bitmap;
>>> +	}
>>> +
>>
>> We export msr_bitmap to guest even it is not supported by hardware. So we
>> need to allocate msr_bitmap for L1 unconditionally.
>
> We do emulate the feature, but the vmx->nested.msr_bitmap is used only
> if VMX supports it to avoid some VM exits:
>
>>> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>>  	}
>>>
>>>  	if (cpu_has_vmx_msr_bitmap() &&
>>> -	    exec_control & CPU_BASED_USE_MSR_BITMAPS) {
>>> -		nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
>>> -		/* MSR_BITMAP will be set by following vmx_set_efer. */
>>> -	} else
>>> +	    exec_control & CPU_BASED_USE_MSR_BITMAPS &&
>>> +	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
>>> +		; /* MSR_BITMAP will be set by following vmx_set_efer. */
>>> +	else
>>>  		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
>
> The else branch is taken if !cpu_has_vmx_msr_bitmap() and disables msr
> bitmaps.  Similar check for vmx_set_msr_bitmap(), so the NULL doesn't
> even get written to VMCS.
>
> KVM always uses L1's msr bitmaps when emulating the feature.
>

Yes, you are right. I forgot that the bitmap is only used by hardware. :(

-- 
Yang
Alibaba Cloud Computing

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write
  2016-08-08 18:16 ` [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write Radim Krčmář
@ 2016-08-12  6:07   ` Wanpeng Li
  2016-08-12  9:44     ` Radim Krčmář
  2016-08-16  2:43   ` Wanpeng Li
  1 sibling, 1 reply; 16+ messages in thread
From: Wanpeng Li @ 2016-08-12  6:07 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: linux-kernel, kvm, Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

2016-08-09 2:16 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
> If vmcs12 does not intercept APIC_BASE writes, then KVM will handle the
> write with vmcs02 as the current VMCS.
> This will incorrectly apply modifications intended for vmcs01 to vmcs02
> and L2 can use it to gain access to L0's x2APIC registers by disabling
> virtualized x2APIC while using msr bitmap that assumes enabled.
>
> Postpone execution of vmx_set_virtual_x2apic_mode until vmcs01 is the
> current VMCS.  An alternative solution would temporarily make vmcs01 the
> current VMCS, but it requires more care.

There is a scenario both L1 and L2 are running on x2apic mode, L1
don't own the APIC_BASE writes, then L2 is intended to disable x2apic
mode, however, your logic will also disable x2apic mode for L1.

Regards,
Wanpeng Li

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write
  2016-08-12  6:07   ` Wanpeng Li
@ 2016-08-12  9:44     ` Radim Krčmář
  2016-08-12 10:14       ` Wanpeng Li
  0 siblings, 1 reply; 16+ messages in thread
From: Radim Krčmář @ 2016-08-12  9:44 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: linux-kernel, kvm, Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

2016-08-12 14:07+0800, Wanpeng Li:
> 2016-08-09 2:16 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
>> If vmcs12 does not intercept APIC_BASE writes, then KVM will handle the
>> write with vmcs02 as the current VMCS.
>> This will incorrectly apply modifications intended for vmcs01 to vmcs02
>> and L2 can use it to gain access to L0's x2APIC registers by disabling
>> virtualized x2APIC while using msr bitmap that assumes enabled.
>>
>> Postpone execution of vmx_set_virtual_x2apic_mode until vmcs01 is the
>> current VMCS.  An alternative solution would temporarily make vmcs01 the
>> current VMCS, but it requires more care.
> 
> There is a scenario both L1 and L2 are running on x2apic mode, L1
> don't own the APIC_BASE writes, then L2 is intended to disable x2apic
> mode, however, your logic will also disable x2apic mode for L1.

You mean a case where L1 does intercept APIC_BASE?

That case is not affected, because it should cause a nested VM exit, so
vmx_set_virtual_x2apic_mode() won't be called in the first place.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write
  2016-08-12  9:44     ` Radim Krčmář
@ 2016-08-12 10:14       ` Wanpeng Li
  2016-08-12 11:39         ` Radim Krčmář
  0 siblings, 1 reply; 16+ messages in thread
From: Wanpeng Li @ 2016-08-12 10:14 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: linux-kernel, kvm, Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

2016-08-12 17:44 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
> 2016-08-12 14:07+0800, Wanpeng Li:
>> 2016-08-09 2:16 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
>>> If vmcs12 does not intercept APIC_BASE writes, then KVM will handle the
>>> write with vmcs02 as the current VMCS.
>>> This will incorrectly apply modifications intended for vmcs01 to vmcs02
>>> and L2 can use it to gain access to L0's x2APIC registers by disabling
>>> virtualized x2APIC while using msr bitmap that assumes enabled.
>>>
>>> Postpone execution of vmx_set_virtual_x2apic_mode until vmcs01 is the
>>> current VMCS.  An alternative solution would temporarily make vmcs01 the
>>> current VMCS, but it requires more care.
>>
>> There is a scenario both L1 and L2 are running on x2apic mode, L1
>> don't own the APIC_BASE writes, then L2 is intended to disable x2apic
>> mode, however, your logic will also disable x2apic mode for L1.
>
> You mean a case where L1 does intercept APIC_BASE?
>
> That case is not affected, because it should cause a nested VM exit, so
> vmx_set_virtual_x2apic_mode() won't be called in the first place.

I mean L1 doesn't intercept APIC_BASE.

Regards,
Wanpeng Li

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write
  2016-08-12 10:14       ` Wanpeng Li
@ 2016-08-12 11:39         ` Radim Krčmář
  2016-08-15  5:19           ` Wanpeng Li
  0 siblings, 1 reply; 16+ messages in thread
From: Radim Krčmář @ 2016-08-12 11:39 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: linux-kernel, kvm, Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

2016-08-12 18:14+0800, Wanpeng Li:
> 2016-08-12 17:44 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
>> 2016-08-12 14:07+0800, Wanpeng Li:
>>> 2016-08-09 2:16 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
>>>> If vmcs12 does not intercept APIC_BASE writes, then KVM will handle the
>>>> write with vmcs02 as the current VMCS.
>>>> This will incorrectly apply modifications intended for vmcs01 to vmcs02
>>>> and L2 can use it to gain access to L0's x2APIC registers by disabling
>>>> virtualized x2APIC while using msr bitmap that assumes enabled.
>>>>
>>>> Postpone execution of vmx_set_virtual_x2apic_mode until vmcs01 is the
>>>> current VMCS.  An alternative solution would temporarily make vmcs01 the
>>>> current VMCS, but it requires more care.
>>>
>>> There is a scenario both L1 and L2 are running on x2apic mode, L1
>>> don't own the APIC_BASE writes, then L2 is intended to disable x2apic
>>> mode, however, your logic will also disable x2apic mode for L1.
>>
>> You mean a case where L1 does intercept APIC_BASE?
>>
>> That case is not affected, because it should cause a nested VM exit, so
>> vmx_set_virtual_x2apic_mode() won't be called in the first place.
> 
> I mean L1 doesn't intercept APIC_BASE.

Then L2's write to APIC_BASE should only affect L1.
L2 is buggy if it intended to disable its x2APIC with the write
or L1 set up intercepts incorrectly for the intended L2.

In the non-nested case, if we didn't intercept APIC_BASE in KVM, then
the guest wouldn't change either;  only the host would change, so I
think it is correct to disable x2APIC mode in L1 only.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write
  2016-08-12 11:39         ` Radim Krčmář
@ 2016-08-15  5:19           ` Wanpeng Li
  2016-08-15 14:31             ` Radim Krčmář
  0 siblings, 1 reply; 16+ messages in thread
From: Wanpeng Li @ 2016-08-15  5:19 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: linux-kernel, kvm, Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

2016-08-12 19:39 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
> 2016-08-12 18:14+0800, Wanpeng Li:
>> 2016-08-12 17:44 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
>>> 2016-08-12 14:07+0800, Wanpeng Li:
>>>> 2016-08-09 2:16 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
>>>>> If vmcs12 does not intercept APIC_BASE writes, then KVM will handle the
>>>>> write with vmcs02 as the current VMCS.
>>>>> This will incorrectly apply modifications intended for vmcs01 to vmcs02
>>>>> and L2 can use it to gain access to L0's x2APIC registers by disabling
>>>>> virtualized x2APIC while using msr bitmap that assumes enabled.
>>>>>
>>>>> Postpone execution of vmx_set_virtual_x2apic_mode until vmcs01 is the
>>>>> current VMCS.  An alternative solution would temporarily make vmcs01 the
>>>>> current VMCS, but it requires more care.
>>>>
>>>> There is a scenario both L1 and L2 are running on x2apic mode, L1
>>>> don't own the APIC_BASE writes, then L2 is intended to disable x2apic
>>>> mode, however, your logic will also disable x2apic mode for L1.
>>>
>>> You mean a case where L1 does intercept APIC_BASE?
>>>
>>> That case is not affected, because it should cause a nested VM exit, so
>>> vmx_set_virtual_x2apic_mode() won't be called in the first place.
>>
>> I mean L1 doesn't intercept APIC_BASE.
>
> Then L2's write to APIC_BASE should only affect L1.
> L2 is buggy if it intended to disable its x2APIC with the write
> or L1 set up intercepts incorrectly for the indented L2.

Do you mean OS disable x2APIC during its running is buggy?

> In the non-nested case, if we didn't intercept APIC_BASE in KVM, then
> the guest wouldn't change either;  only the host would change, so I
> think it is correct to disable x2APIC mode in L1 only.

Agreed. :)

Regards,
Wanpeng Li

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write
  2016-08-15  5:19           ` Wanpeng Li
@ 2016-08-15 14:31             ` Radim Krčmář
  0 siblings, 0 replies; 16+ messages in thread
From: Radim Krčmář @ 2016-08-15 14:31 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: linux-kernel, kvm, Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

2016-08-15 13:19+0800, Wanpeng Li:
> 2016-08-12 19:39 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
>> 2016-08-12 18:14+0800, Wanpeng Li:
>>> 2016-08-12 17:44 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
>>>> 2016-08-12 14:07+0800, Wanpeng Li:
>>>>> 2016-08-09 2:16 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
>>>>>> If vmcs12 does not intercept APIC_BASE writes, then KVM will handle the
>>>>>> write with vmcs02 as the current VMCS.
>>>>>> This will incorrectly apply modifications intended for vmcs01 to vmcs02
>>>>>> and L2 can use it to gain access to L0's x2APIC registers by disabling
>>>>>> virtualized x2APIC while using msr bitmap that assumes enabled.
>>>>>>
>>>>>> Postpone execution of vmx_set_virtual_x2apic_mode until vmcs01 is the
>>>>>> current VMCS.  An alternative solution would temporarily make vmcs01 the
>>>>>> current VMCS, but it requires more care.
>>>>>
>>>>> There is a scenario both L1 and L2 are running on x2apic mode, L1
>>>>> don't own the APIC_BASE writes, then L2 is intended to disable x2apic
>>>>> mode, however, your logic will also disable x2apic mode for L1.
>>>>
>>>> You mean a case where L1 does intercept APIC_BASE?
>>>>
>>>> That case is not affected, because it should cause a nested VM exit, so
>>>> vmx_set_virtual_x2apic_mode() won't be called in the first place.
>>>
>>> I mean L1 doesn't intercept APIC_BASE.
>>
>> Then L2's write to APIC_BASE should only affect L1.
>> L2 is buggy if it intended to disable its x2APIC with the write
>> or L1 set up intercepts incorrectly for the indented L2.
> 
> Do you mean OS disable x2APIC during its running is buggy?

Not in general, but if L1 doesn't intercept APIC_BASE and L2 writes to
it in order to disable its (L2's) x2APIC, then there is a bug in L2 or
L1.

If L1 intended to intercept, then it's a clear L1 bug, otherwise L2
should have known that L1 is a special hypervisor that doesn't intercept
APIC_BASE and the bug is on L2 side or on the user that ran unsuspecting
L2 on that L1.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write
  2016-08-08 18:16 ` [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write Radim Krčmář
  2016-08-12  6:07   ` Wanpeng Li
@ 2016-08-16  2:43   ` Wanpeng Li
  1 sibling, 0 replies; 16+ messages in thread
From: Wanpeng Li @ 2016-08-16  2:43 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: linux-kernel, kvm, Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

2016-08-09 2:16 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
> If vmcs12 does not intercept APIC_BASE writes, then KVM will handle the
> write with vmcs02 as the current VMCS.
> This will incorrectly apply modifications intended for vmcs01 to vmcs02
> and L2 can use it to gain access to L0's x2APIC registers by disabling
> virtualized x2APIC while using msr bitmap that assumes enabled.
>
> Postpone execution of vmx_set_virtual_x2apic_mode until vmcs01 is the
> current VMCS.  An alternative solution would temporarily make vmcs01 the
> current VMCS, but it requires more care.
>
> Fixes: 8d14695f9542 ("x86, apicv: add virtual x2apic support")
> Reported-by: Jim Mattson <jmattson@google.com>
> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>

Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>

> ---
>  arch/x86/kvm/vmx.c | 13 +++++++++++++
>  1 file changed, 13 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index c66ac2c70d22..ae111a07acc4 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -422,6 +422,7 @@ struct nested_vmx {
>         struct list_head vmcs02_pool;
>         int vmcs02_num;
>         u64 vmcs01_tsc_offset;
> +       bool change_vmcs01_virtual_x2apic_mode;
>         /* L2 must run next, and mustn't decide to exit to L1. */
>         bool nested_run_pending;
>         /*
> @@ -8424,6 +8425,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
>  {
>         u32 sec_exec_control;
>
> +       /* Postpone execution until vmcs01 is the current VMCS. */
> +       if (is_guest_mode(vcpu)) {
> +               to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
> +               return;
> +       }
> +
>         /*
>          * There is not point to enable virtualize x2apic without enable
>          * apicv
> @@ -10749,6 +10756,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
>                 vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
>                               PIN_BASED_VMX_PREEMPTION_TIMER);
>
> +       if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
> +               vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
> +               vmx_set_virtual_x2apic_mode(vcpu,
> +                               vcpu->arch.apic_base & X2APIC_ENABLE);
> +       }
> +
>         /* This is needed for same reason as it was needed in prepare_vmcs02 */
>         vmx->host_rsp = 0;

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC
  2016-08-08 18:16 ` [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC Radim Krčmář
  2016-08-09  9:32   ` Yang Zhang
@ 2016-08-16  2:53   ` Wanpeng Li
  1 sibling, 0 replies; 16+ messages in thread
From: Wanpeng Li @ 2016-08-16  2:53 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: linux-kernel, kvm, Jim Mattson, Wincy Van, Paolo Bonzini, Bandan Das

2016-08-09 2:16 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
> accesses.  In some configurations of VMX controls, the guest can even
> directly access host's x2APIC MSRs.  See SDM 29.5 VIRTUALIZING MSR-BASED
> APIC ACCESSES.
>
> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
> To do so, L1 would first trick KVM to disable all possible interceptions
> by enabling APICv features and then would turn those features off;
> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
> not intercept previously enabled MSRs even though they were not safe
> with the new configuration.
>
> Correctly re-enabling interceptions is not enough as a second bug would
> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
> VMCSs, so L1 could trigger a race to get the desired combination of msr
> bitmap and VMX controls.
>
> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
> have to intercept everything anyway.
>
> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
> Reported-by: Jim Mattson <jmattson@google.com>
> Suggested-by: Wincy Van <fanwenyi0529@gmail.com>
> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>

Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>

> ---
>  arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++-------------------------------
>  1 file changed, 44 insertions(+), 63 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index a45d8580f91e..c66ac2c70d22 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -435,6 +435,8 @@ struct nested_vmx {
>         bool pi_pending;
>         u16 posted_intr_nv;
>
> +       unsigned long *msr_bitmap;
> +
>         struct hrtimer preemption_timer;
>         bool preemption_timer_expired;
>
> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
>  static unsigned long *vmx_msr_bitmap_longmode;
>  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
> -static unsigned long *vmx_msr_bitmap_nested;
>  static unsigned long *vmx_vmread_bitmap;
>  static unsigned long *vmx_vmwrite_bitmap;
>
> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>         unsigned long *msr_bitmap;
>
>         if (is_guest_mode(vcpu))
> -               msr_bitmap = vmx_msr_bitmap_nested;
> +               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
>         else if (cpu_has_secondary_exec_ctrls() &&
>                  (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
>                   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
>         if (!vmx_msr_bitmap_longmode_x2apic)
>                 goto out4;
>
> -       if (nested) {
> -               vmx_msr_bitmap_nested =
> -                       (unsigned long *)__get_free_page(GFP_KERNEL);
> -               if (!vmx_msr_bitmap_nested)
> -                       goto out5;
> -       }
> -
>         vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
>         if (!vmx_vmread_bitmap)
>                 goto out6;
> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
>
>         memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
>         memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
> -       if (nested)
> -               memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>
>         if (setup_vmcs_config(&vmcs_config) < 0) {
>                 r = -EIO;
> @@ -6529,9 +6521,6 @@ out8:
>  out7:
>         free_page((unsigned long)vmx_vmread_bitmap);
>  out6:
> -       if (nested)
> -               free_page((unsigned long)vmx_msr_bitmap_nested);
> -out5:
>         free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>  out4:
>         free_page((unsigned long)vmx_msr_bitmap_longmode);
> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
>         free_page((unsigned long)vmx_io_bitmap_a);
>         free_page((unsigned long)vmx_vmwrite_bitmap);
>         free_page((unsigned long)vmx_vmread_bitmap);
> -       if (nested)
> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>
>         free_kvm_area();
>  }
> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>                 return 1;
>         }
>
> +       if (cpu_has_vmx_msr_bitmap()) {
> +               vmx->nested.msr_bitmap =
> +                               (unsigned long *)__get_free_page(GFP_KERNEL);
> +               if (!vmx->nested.msr_bitmap)
> +                       goto out_msr_bitmap;
> +       }
> +
>         vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
>         if (!vmx->nested.cached_vmcs12)
> -               return -ENOMEM;
> +               goto out_cached_vmcs12;
>
>         if (enable_shadow_vmcs) {
>                 shadow_vmcs = alloc_vmcs();
> -               if (!shadow_vmcs) {
> -                       kfree(vmx->nested.cached_vmcs12);
> -                       return -ENOMEM;
> -               }
> +               if (!shadow_vmcs)
> +                       goto out_shadow_vmcs;
>                 /* mark vmcs as shadow */
>                 shadow_vmcs->revision_id |= (1u << 31);
>                 /* init shadow vmcs */
> @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>         skip_emulated_instruction(vcpu);
>         nested_vmx_succeed(vcpu);
>         return 1;
> +
> +out_shadow_vmcs:
> +       kfree(vmx->nested.cached_vmcs12);
> +
> +out_cached_vmcs12:
> +       free_page((unsigned long)vmx->nested.msr_bitmap);
> +
> +out_msr_bitmap:
> +       return -ENOMEM;
>  }
>
>  /*
> @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx)
>         vmx->nested.vmxon = false;
>         free_vpid(vmx->nested.vpid02);
>         nested_release_vmcs12(vmx);
> +       if (vmx->nested.msr_bitmap) {
> +               free_page((unsigned long)vmx->nested.msr_bitmap);
> +               vmx->nested.msr_bitmap = NULL;
> +       }
>         if (enable_shadow_vmcs)
>                 free_vmcs(vmx->nested.current_shadow_vmcs);
>         kfree(vmx->nested.cached_vmcs12);
> @@ -9472,8 +9477,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>  {
>         int msr;
>         struct page *page;
> -       unsigned long *msr_bitmap;
> +       unsigned long *msr_bitmap_l1;
> +       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
>
> +       /* This shortcut is ok because we support only x2APIC MSRs so far. */
>         if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
>                 return false;
>
> @@ -9482,63 +9489,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>                 WARN_ON(1);
>                 return false;
>         }
> -       msr_bitmap = (unsigned long *)kmap(page);
> -       if (!msr_bitmap) {
> +       msr_bitmap_l1 = (unsigned long *)kmap(page);
> +       if (!msr_bitmap_l1) {
>                 nested_release_page_clean(page);
>                 WARN_ON(1);
>                 return false;
>         }
>
> +       memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
> +
>         if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
>                 if (nested_cpu_has_apic_reg_virt(vmcs12))
>                         for (msr = 0x800; msr <= 0x8ff; msr++)
>                                 nested_vmx_disable_intercept_for_msr(
> -                                       msr_bitmap,
> -                                       vmx_msr_bitmap_nested,
> +                                       msr_bitmap_l1, msr_bitmap_l0,
>                                         msr, MSR_TYPE_R);
> -               /* TPR is allowed */
> -               nested_vmx_disable_intercept_for_msr(msr_bitmap,
> -                               vmx_msr_bitmap_nested,
> +
> +               nested_vmx_disable_intercept_for_msr(
> +                               msr_bitmap_l1, msr_bitmap_l0,
>                                 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>                                 MSR_TYPE_R | MSR_TYPE_W);
> +
>                 if (nested_cpu_has_vid(vmcs12)) {
> -                       /* EOI and self-IPI are allowed */
>                         nested_vmx_disable_intercept_for_msr(
> -                               msr_bitmap,
> -                               vmx_msr_bitmap_nested,
> +                               msr_bitmap_l1, msr_bitmap_l0,
>                                 APIC_BASE_MSR + (APIC_EOI >> 4),
>                                 MSR_TYPE_W);
>                         nested_vmx_disable_intercept_for_msr(
> -                               msr_bitmap,
> -                               vmx_msr_bitmap_nested,
> +                               msr_bitmap_l1, msr_bitmap_l0,
>                                 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
>                                 MSR_TYPE_W);
>                 }
> -       } else {
> -               /*
> -                * Enable reading intercept of all the x2apic
> -                * MSRs. We should not rely on vmcs12 to do any
> -                * optimizations here, it may have been modified
> -                * by L1.
> -                */
> -               for (msr = 0x800; msr <= 0x8ff; msr++)
> -                       __vmx_enable_intercept_for_msr(
> -                               vmx_msr_bitmap_nested,
> -                               msr,
> -                               MSR_TYPE_R);
> -
> -               __vmx_enable_intercept_for_msr(
> -                               vmx_msr_bitmap_nested,
> -                               APIC_BASE_MSR + (APIC_TASKPRI >> 4),
> -                               MSR_TYPE_W);
> -               __vmx_enable_intercept_for_msr(
> -                               vmx_msr_bitmap_nested,
> -                               APIC_BASE_MSR + (APIC_EOI >> 4),
> -                               MSR_TYPE_W);
> -               __vmx_enable_intercept_for_msr(
> -                               vmx_msr_bitmap_nested,
> -                               APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
> -                               MSR_TYPE_W);
>         }
>         kunmap(page);
>         nested_release_page_clean(page);
> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>         }
>
>         if (cpu_has_vmx_msr_bitmap() &&
> -           exec_control & CPU_BASED_USE_MSR_BITMAPS) {
> -               nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
> -               /* MSR_BITMAP will be set by following vmx_set_efer. */
> -       } else
> +           exec_control & CPU_BASED_USE_MSR_BITMAPS &&
> +           nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
> +               ; /* MSR_BITMAP will be set by following vmx_set_efer. */
> +       else
>                 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
>
>         /*
> --
> 2.9.2
>



-- 
Regards,
Wanpeng Li

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2016-08-16  2:53 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-08 18:16 [PATCH 0/2] KVM: nVMX: msr bitmaps fixes Radim Krčmář
2016-08-08 18:16 ` [PATCH 1/2] KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC Radim Krčmář
2016-08-09  9:32   ` Yang Zhang
2016-08-09 10:19     ` Wincy Van
2016-08-10  5:52       ` Yang Zhang
2016-08-09 12:23     ` Radim Krčmář
2016-08-10  5:55       ` Yang Zhang
2016-08-16  2:53   ` Wanpeng Li
2016-08-08 18:16 ` [PATCH 2/2] KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write Radim Krčmář
2016-08-12  6:07   ` Wanpeng Li
2016-08-12  9:44     ` Radim Krčmář
2016-08-12 10:14       ` Wanpeng Li
2016-08-12 11:39         ` Radim Krčmář
2016-08-15  5:19           ` Wanpeng Li
2016-08-15 14:31             ` Radim Krčmář
2016-08-16  2:43   ` Wanpeng Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).