linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag
@ 2019-03-26 13:07 Vitaly Kuznetsov
  2019-03-26 13:11 ` Liran Alon
  0 siblings, 1 reply; 5+ messages in thread
From: Vitaly Kuznetsov @ 2019-03-26 13:07 UTC (permalink / raw)
  To: kvm
  Cc: Paolo Bonzini, Radim Krčmář,
	Jon Doron, Sean Christopherson, Liran Alon, linux-kernel

Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against
SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM.
The check is correct, however, there is a special case when RSM is called
to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still
set and in case VMXE was set before entering SMM we're failing to return.

Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls
when ops->set_cr() is called from RSM.

Reported-by: Jon Doron <arilou@gmail.com>
Suggested-by: Liran Alon <liran.alon@oracle.com>
Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM")
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
- Instread of putting the temporary HF_SMM_MASK drop to
  rsm_enter_protected_mode() (as was suggested by Liran), move it to
  emulator_set_cr() modifying its interface. emulate.c seems to be
  vcpu-specifics-free at this moment, we may want to keep it this way.
- It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic
  hangs even with this patch. These hangs, however, seem to be unrelated to
  rsm.
---
 arch/x86/include/asm/kvm_emulate.h |  3 ++-
 arch/x86/kvm/emulate.c             | 27 ++++++++++++++-------------
 arch/x86/kvm/x86.c                 | 12 +++++++++++-
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 93c4bf598fb0..6c33caa82fa5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -203,7 +203,8 @@ struct x86_emulate_ops {
 	void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
 	void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
 	ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
-	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val,
+		      bool from_rsm);
 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c338984c850d..a6204105d4d7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2413,7 +2413,7 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
 		cr3 &= ~0xfff;
 	}
 
-	bad = ctxt->ops->set_cr(ctxt, 3, cr3);
+	bad = ctxt->ops->set_cr(ctxt, 3, cr3, true);
 	if (bad)
 		return X86EMUL_UNHANDLEABLE;
 
@@ -2422,20 +2422,20 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
 	 * Then enable protected mode.	However, PCID cannot be enabled
 	 * if EFER.LMA=0, so set it separately.
 	 */
-	bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+	bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true);
 	if (bad)
 		return X86EMUL_UNHANDLEABLE;
 
-	bad = ctxt->ops->set_cr(ctxt, 0, cr0);
+	bad = ctxt->ops->set_cr(ctxt, 0, cr0, true);
 	if (bad)
 		return X86EMUL_UNHANDLEABLE;
 
 	if (cr4 & X86_CR4_PCIDE) {
-		bad = ctxt->ops->set_cr(ctxt, 4, cr4);
+		bad = ctxt->ops->set_cr(ctxt, 4, cr4, true);
 		if (bad)
 			return X86EMUL_UNHANDLEABLE;
 		if (pcid) {
-			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
+			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid, true);
 			if (bad)
 				return X86EMUL_UNHANDLEABLE;
 		}
@@ -2581,7 +2581,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
 
 		/* Zero CR4.PCIDE before CR0.PG.  */
 		if (cr4 & X86_CR4_PCIDE) {
-			ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+			ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true);
 			cr4 &= ~X86_CR4_PCIDE;
 		}
 
@@ -2595,11 +2595,12 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
 	/* For the 64-bit case, this will clear EFER.LMA.  */
 	cr0 = ctxt->ops->get_cr(ctxt, 0);
 	if (cr0 & X86_CR0_PE)
-		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE),
+				  true);
 
 	/* Now clear CR4.PAE (which must be done before clearing EFER.LME).  */
 	if (cr4 & X86_CR4_PAE)
-		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE, true);
 
 	/* And finally go back to 32-bit mode.  */
 	efer = 0;
@@ -3131,7 +3132,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	int ret;
 	u8 cpl;
 
-	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
+	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3, false))
 		return emulate_gp(ctxt, 0);
 	ctxt->_eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
@@ -3331,7 +3332,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 		write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
 	}
 
-	ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS);
+	ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS, false);
 	ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
 
 	if (has_error_code) {
@@ -3633,7 +3634,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
 
 static int em_cr_write(struct x86_emulate_ctxt *ctxt)
 {
-	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
+	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val, false))
 		return emulate_gp(ctxt, 0);
 
 	/* Disable writeback. */
@@ -3766,7 +3767,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
 
 	cr0 = ctxt->ops->get_cr(ctxt, 0);
 	cr0 &= ~X86_CR0_TS;
-	ctxt->ops->set_cr(ctxt, 0, cr0);
+	ctxt->ops->set_cr(ctxt, 0, cr0, false);
 	return X86EMUL_CONTINUE;
 }
 
@@ -3866,7 +3867,7 @@ static int em_smsw(struct x86_emulate_ctxt *ctxt)
 static int em_lmsw(struct x86_emulate_ctxt *ctxt)
 {
 	ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
-			  | (ctxt->src.val & 0x0f));
+			  | (ctxt->src.val & 0x0f), false);
 	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a419656521b6..f2745e3170b6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5739,7 +5739,8 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
 	return value;
 }
 
-static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
+static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val,
+			   bool from_rsm)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 	int res = 0;
@@ -5755,7 +5756,16 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
 		res = kvm_set_cr3(vcpu, val);
 		break;
 	case 4:
+		/*
+		 * set_cr4() may forbid to set certain flags (e.g. VMXE) from
+		 * SMM but we're actually leaving it; temporary drop HF_SMM_MASK
+		 * when setting CR4.
+		 */
+		if (from_rsm)
+			vcpu->arch.hflags &= ~HF_SMM_MASK;
 		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+		if (from_rsm)
+			vcpu->arch.hflags |= HF_SMM_MASK;
 		break;
 	case 8:
 		res = kvm_set_cr8(vcpu, val);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag
  2019-03-26 13:07 [PATCH] KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag Vitaly Kuznetsov
@ 2019-03-26 13:11 ` Liran Alon
  2019-03-26 13:48   ` Vitaly Kuznetsov
  0 siblings, 1 reply; 5+ messages in thread
From: Liran Alon @ 2019-03-26 13:11 UTC (permalink / raw)
  To: Vitaly Kuznetsov
  Cc: kvm, Paolo Bonzini, Radim Krčmář,
	Jon Doron, Sean Christopherson, linux-kernel



> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
> 
> Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against
> SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM.
> The check is correct, however, there is a special case when RSM is called
> to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still
> set and in case VMXE was set before entering SMM we're failing to return.
> 
> Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls
> when ops->set_cr() is called from RSM.
> 
> Reported-by: Jon Doron <arilou@gmail.com>
> Suggested-by: Liran Alon <liran.alon@oracle.com>
> Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM")
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>

Patch looks good to me.
Reviewed-by: Liran Alon <liran.alon@oracle.com>

> ---
> - Instread of putting the temporary HF_SMM_MASK drop to
>  rsm_enter_protected_mode() (as was suggested by Liran), move it to
>  emulator_set_cr() modifying its interface. emulate.c seems to be
>  vcpu-specifics-free at this moment, we may want to keep it this way.
> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic
>  hangs even with this patch. These hangs, however, seem to be unrelated to
>  rsm.

Feel free to share details on these hangs ;)

Great work,
-Liran

> ---
> arch/x86/include/asm/kvm_emulate.h |  3 ++-
> arch/x86/kvm/emulate.c             | 27 ++++++++++++++-------------
> arch/x86/kvm/x86.c                 | 12 +++++++++++-
> 3 files changed, 27 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
> index 93c4bf598fb0..6c33caa82fa5 100644
> --- a/arch/x86/include/asm/kvm_emulate.h
> +++ b/arch/x86/include/asm/kvm_emulate.h
> @@ -203,7 +203,8 @@ struct x86_emulate_ops {
> 	void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
> 	void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
> 	ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
> -	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
> +	int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val,
> +		      bool from_rsm);
> 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
> 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
> 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index c338984c850d..a6204105d4d7 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -2413,7 +2413,7 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
> 		cr3 &= ~0xfff;
> 	}
> 
> -	bad = ctxt->ops->set_cr(ctxt, 3, cr3);
> +	bad = ctxt->ops->set_cr(ctxt, 3, cr3, true);
> 	if (bad)
> 		return X86EMUL_UNHANDLEABLE;
> 
> @@ -2422,20 +2422,20 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
> 	 * Then enable protected mode.	However, PCID cannot be enabled
> 	 * if EFER.LMA=0, so set it separately.
> 	 */
> -	bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
> +	bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true);
> 	if (bad)
> 		return X86EMUL_UNHANDLEABLE;
> 
> -	bad = ctxt->ops->set_cr(ctxt, 0, cr0);
> +	bad = ctxt->ops->set_cr(ctxt, 0, cr0, true);
> 	if (bad)
> 		return X86EMUL_UNHANDLEABLE;
> 
> 	if (cr4 & X86_CR4_PCIDE) {
> -		bad = ctxt->ops->set_cr(ctxt, 4, cr4);
> +		bad = ctxt->ops->set_cr(ctxt, 4, cr4, true);
> 		if (bad)
> 			return X86EMUL_UNHANDLEABLE;
> 		if (pcid) {
> -			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
> +			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid, true);
> 			if (bad)
> 				return X86EMUL_UNHANDLEABLE;
> 		}
> @@ -2581,7 +2581,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
> 
> 		/* Zero CR4.PCIDE before CR0.PG.  */
> 		if (cr4 & X86_CR4_PCIDE) {
> -			ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
> +			ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE, true);
> 			cr4 &= ~X86_CR4_PCIDE;
> 		}
> 
> @@ -2595,11 +2595,12 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
> 	/* For the 64-bit case, this will clear EFER.LMA.  */
> 	cr0 = ctxt->ops->get_cr(ctxt, 0);
> 	if (cr0 & X86_CR0_PE)
> -		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
> +		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE),
> +				  true);
> 
> 	/* Now clear CR4.PAE (which must be done before clearing EFER.LME).  */
> 	if (cr4 & X86_CR4_PAE)
> -		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
> +		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE, true);
> 
> 	/* And finally go back to 32-bit mode.  */
> 	efer = 0;
> @@ -3131,7 +3132,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
> 	int ret;
> 	u8 cpl;
> 
> -	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
> +	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3, false))
> 		return emulate_gp(ctxt, 0);
> 	ctxt->_eip = tss->eip;
> 	ctxt->eflags = tss->eflags | 2;
> @@ -3331,7 +3332,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
> 		write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
> 	}
> 
> -	ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS);
> +	ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS, false);
> 	ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
> 
> 	if (has_error_code) {
> @@ -3633,7 +3634,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
> 
> static int em_cr_write(struct x86_emulate_ctxt *ctxt)
> {
> -	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
> +	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val, false))
> 		return emulate_gp(ctxt, 0);
> 
> 	/* Disable writeback. */
> @@ -3766,7 +3767,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
> 
> 	cr0 = ctxt->ops->get_cr(ctxt, 0);
> 	cr0 &= ~X86_CR0_TS;
> -	ctxt->ops->set_cr(ctxt, 0, cr0);
> +	ctxt->ops->set_cr(ctxt, 0, cr0, false);
> 	return X86EMUL_CONTINUE;
> }
> 
> @@ -3866,7 +3867,7 @@ static int em_smsw(struct x86_emulate_ctxt *ctxt)
> static int em_lmsw(struct x86_emulate_ctxt *ctxt)
> {
> 	ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
> -			  | (ctxt->src.val & 0x0f));
> +			  | (ctxt->src.val & 0x0f), false);
> 	ctxt->dst.type = OP_NONE;
> 	return X86EMUL_CONTINUE;
> }
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index a419656521b6..f2745e3170b6 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5739,7 +5739,8 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
> 	return value;
> }
> 
> -static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
> +static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val,
> +			   bool from_rsm)
> {
> 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
> 	int res = 0;
> @@ -5755,7 +5756,16 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
> 		res = kvm_set_cr3(vcpu, val);
> 		break;
> 	case 4:
> +		/*
> +		 * set_cr4() may forbid to set certain flags (e.g. VMXE) from
> +		 * SMM but we're actually leaving it; temporary drop HF_SMM_MASK
> +		 * when setting CR4.
> +		 */
> +		if (from_rsm)
> +			vcpu->arch.hflags &= ~HF_SMM_MASK;
> 		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
> +		if (from_rsm)
> +			vcpu->arch.hflags |= HF_SMM_MASK;
> 		break;
> 	case 8:
> 		res = kvm_set_cr8(vcpu, val);
> -- 
> 2.20.1
> 


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag
  2019-03-26 13:11 ` Liran Alon
@ 2019-03-26 13:48   ` Vitaly Kuznetsov
  2019-03-26 15:02     ` Liran Alon
  0 siblings, 1 reply; 5+ messages in thread
From: Vitaly Kuznetsov @ 2019-03-26 13:48 UTC (permalink / raw)
  To: Liran Alon
  Cc: kvm, Paolo Bonzini, Radim Krčmář,
	Jon Doron, Sean Christopherson, linux-kernel

Liran Alon <liran.alon@oracle.com> writes:

>> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>> 
>> Commit 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against
>> SMM") introduced a check to vmx_set_cr4() forbidding to set VMXE from SMM.
>> The check is correct, however, there is a special case when RSM is called
>> to leave SMM: rsm_enter_protected_mode() is called with HF_SMM_MASK still
>> set and in case VMXE was set before entering SMM we're failing to return.
>> 
>> Resolve the issue by temporary dropping HF_SMM_MASK around set_cr4() calls
>> when ops->set_cr() is called from RSM.
>> 
>> Reported-by: Jon Doron <arilou@gmail.com>
>> Suggested-by: Liran Alon <liran.alon@oracle.com>
>> Fixes: 5bea5123cbf0 ("KVM: VMX: check nested state and CR4.VMXE against SMM")
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
>
> Patch looks good to me.
> Reviewed-by: Liran Alon <liran.alon@oracle.com>

Thanks!

>
>> ---
>> - Instread of putting the temporary HF_SMM_MASK drop to
>>  rsm_enter_protected_mode() (as was suggested by Liran), move it to
>>  emulator_set_cr() modifying its interface. emulate.c seems to be
>>  vcpu-specifics-free at this moment, we may want to keep it this way.
>> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic
>>  hangs even with this patch. These hangs, however, seem to be unrelated to
>>  rsm.
>
> Feel free to share details on these hangs ;)
>

You've asked for it)

The immediate issue I'm observing is some sort of a lockup which is easy
to trigger with e.g. "-usb -device usb-tablet" on Qemu command line; it
seems we get too many interrupts and combined with preemtion timer for
L2 we're not making any progress:

kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
kvm_set_irq:          gsi 18 level 1 source 0
kvm_msi_set_irq:      dst 0 vec 177 (Fixed|physical|level)
kvm_apic_accept_irq:  apicid 0 vec 177 (Fixed|edge)
kvm_fpu:              load
kvm_entry:            vcpu 0
kvm_exit:             reason VMRESUME rip 0xfffff80000848115 info 0 0
kvm_entry:            vcpu 0
kvm_exit:             reason PREEMPTION_TIMER rip 0xfffff800f4448e01 info 0 0
kvm_nested_vmexit:    rip fffff800f4448e01 reason PREEMPTION_TIMER info1 0 info2 0 int_info 0 int_info_err 0
kvm_nested_vmexit_inject: reason EXTERNAL_INTERRUPT info1 0 info2 0 int_info 800000b1 int_info_err 0
kvm_entry:            vcpu 0
kvm_exit:             reason APIC_ACCESS rip 0xfffff8000081fe11 info 10b0 0
kvm_apic:             apic_write APIC_EOI = 0x0
kvm_eoi:              apicid 0 vector 177
kvm_fpu:              unload
kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
...
(and the pattern repeats)

Maybe it is a usb-only/Qemu-only problem, maybe not.

-- 
Vitaly

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag
  2019-03-26 13:48   ` Vitaly Kuznetsov
@ 2019-03-26 15:02     ` Liran Alon
  2019-03-27 10:08       ` Vitaly Kuznetsov
  0 siblings, 1 reply; 5+ messages in thread
From: Liran Alon @ 2019-03-26 15:02 UTC (permalink / raw)
  To: Vitaly Kuznetsov
  Cc: kvm, Paolo Bonzini, Radim Krčmář,
	Jon Doron, Sean Christopherson, linux-kernel



> On 26 Mar 2019, at 15:48, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
> 
> Liran Alon <liran.alon@oracle.com> writes:
> 
>>> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>>> - Instread of putting the temporary HF_SMM_MASK drop to
>>> rsm_enter_protected_mode() (as was suggested by Liran), move it to
>>> emulator_set_cr() modifying its interface. emulate.c seems to be
>>> vcpu-specifics-free at this moment, we may want to keep it this way.
>>> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic
>>> hangs even with this patch. These hangs, however, seem to be unrelated to
>>> rsm.
>> 
>> Feel free to share details on these hangs ;)
>> 
> 
> You've asked for it)
> 
> The immediate issue I'm observing is some sort of a lockup which is easy
> to trigger with e.g. "-usb -device usb-tablet" on Qemu command line; it
> seems we get too many interrupts and combined with preemtion timer for
> L2 we're not making any progress:
> 
> kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
> kvm_set_irq:          gsi 18 level 1 source 0
> kvm_msi_set_irq:      dst 0 vec 177 (Fixed|physical|level)
> kvm_apic_accept_irq:  apicid 0 vec 177 (Fixed|edge)
> kvm_fpu:              load
> kvm_entry:            vcpu 0
> kvm_exit:             reason VMRESUME rip 0xfffff80000848115 info 0 0
> kvm_entry:            vcpu 0
> kvm_exit:             reason PREEMPTION_TIMER rip 0xfffff800f4448e01 info 0 0
> kvm_nested_vmexit:    rip fffff800f4448e01 reason PREEMPTION_TIMER info1 0 info2 0 int_info 0 int_info_err 0
> kvm_nested_vmexit_inject: reason EXTERNAL_INTERRUPT info1 0 info2 0 int_info 800000b1 int_info_err 0
> kvm_entry:            vcpu 0
> kvm_exit:             reason APIC_ACCESS rip 0xfffff8000081fe11 info 10b0 0
> kvm_apic:             apic_write APIC_EOI = 0x0
> kvm_eoi:              apicid 0 vector 177
> kvm_fpu:              unload
> kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
> ...
> (and the pattern repeats)
> 
> Maybe it is a usb-only/Qemu-only problem, maybe not.
> 
> -- 
> Vitaly

The trace of kvm_apic_accept_irq should indicate that __apic_accept_irq() was called to inject an interrupt to L1 guest.
(I know that now we are running in L1 because next exit is a VMRESUME).

However, it is surprising to see that on next entry to guest, no interrupt was injected by vmx_inject_irq().
It may be because L1 guest is currently running with interrupt disabled and therefore only an IRQ-window was requested.
(Too bad we don’t have a trace for this…)

Next, we got an exit from L1 guest on VMRESUME. As part of it’s handling, active VMCS was changed from vmcs01 to vmcs02.
I believe the immediate exit later on preemption-timer was because the immediate-exit-request mechanism was invoked
which is now implemented by setting a VMX preemption-timer with value of 0 (Thanks to Sean).
(See vmx_vcpu_run() -> vmx_update_hv_timer() -> vmx_arm_hv_timer(vmx, 0)).
(Note that the pending interrupt was evaluated because of a recent patch of mine to nested_vmx_enter_non_root_mode()
to request KVM_REQ_EVENT when vmcs01 have requested an IRQ-window)

Therefore when entering L2, you immediately get an exit on PREEMPTION_TIMER which will cause eventually L0 to call
vmx_check_nested_events() which notices now the pending interrupt that should have been injected before to L1
and now exit from L2 to L1 on EXTERNAL_INTERRUPT on vector 0xb1.

Then L1 handles the interrupt by performing an EOI to LAPIC which propagate an EOI to IOAPIC which immediately re-inject
the interrupt (after clearing the remote_irr) as the irq-line is still set. i.e. QEMU’s ioapic_eoi_broadcast() calls ioapic_service() immediate after it clears remote-irr for this pin.

Also note that in trace we see only a single kvm_set_irq to level 1 but we don’t see immediately another kvm_set_irq to level 0.
This should indicate that in QEMU’s IOAPIC redirection-table, this pin is configured as level-triggered interrupt.
However, the trace of kvm_apic_accept_irq indicates that this interrupt is raised as an edge-triggered interrupt.

To sum up:
1) I would create a patch to add a trace to vcpu_enter_guest() when calling enable_smi_window() / enable_nmi_window() / enable_irq_window().
2) It is worth investigating why MSI trigger-mode is edge-triggered instead of level-triggered.
3) If this is indeed a level-triggered interrupt, it is worth investigating how the interrupt source behaves. i.e. What cause this device to lower the irq-line?
(As we don’t see any I/O Port or MMIO access by L1 guest interrupt-handler before performing the EOI)
4) Does this issue reproduce also when running with kernel-irqchip? (Instead of split-irqchip)

-Liran





^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag
  2019-03-26 15:02     ` Liran Alon
@ 2019-03-27 10:08       ` Vitaly Kuznetsov
  0 siblings, 0 replies; 5+ messages in thread
From: Vitaly Kuznetsov @ 2019-03-27 10:08 UTC (permalink / raw)
  To: Liran Alon
  Cc: kvm, Paolo Bonzini, Radim Krčmář,
	Jon Doron, Sean Christopherson, linux-kernel

Liran Alon <liran.alon@oracle.com> writes:

>> On 26 Mar 2019, at 15:48, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>> 
>> Liran Alon <liran.alon@oracle.com> writes:
>> 
>>>> On 26 Mar 2019, at 15:07, Vitaly Kuznetsov <vkuznets@redhat.com> wrote:
>>>> - Instread of putting the temporary HF_SMM_MASK drop to
>>>> rsm_enter_protected_mode() (as was suggested by Liran), move it to
>>>> emulator_set_cr() modifying its interface. emulate.c seems to be
>>>> vcpu-specifics-free at this moment, we may want to keep it this way.
>>>> - It seems that Hyper-V+UEFI on KVM is still broken, I'm observing sporadic
>>>> hangs even with this patch. These hangs, however, seem to be unrelated to
>>>> rsm.
>>> 
>>> Feel free to share details on these hangs ;)
>>> 
>> 
>> You've asked for it)
>> 
>> The immediate issue I'm observing is some sort of a lockup which is easy
>> to trigger with e.g. "-usb -device usb-tablet" on Qemu command line; it
>> seems we get too many interrupts and combined with preemtion timer for
>> L2 we're not making any progress:
>> 
>> kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
>> kvm_set_irq:          gsi 18 level 1 source 0
>> kvm_msi_set_irq:      dst 0 vec 177 (Fixed|physical|level)
>> kvm_apic_accept_irq:  apicid 0 vec 177 (Fixed|edge)
>> kvm_fpu:              load
>> kvm_entry:            vcpu 0
>> kvm_exit:             reason VMRESUME rip 0xfffff80000848115 info 0 0
>> kvm_entry:            vcpu 0
>> kvm_exit:             reason PREEMPTION_TIMER rip 0xfffff800f4448e01 info 0 0
>> kvm_nested_vmexit:    rip fffff800f4448e01 reason PREEMPTION_TIMER info1 0 info2 0 int_info 0 int_info_err 0
>> kvm_nested_vmexit_inject: reason EXTERNAL_INTERRUPT info1 0 info2 0 int_info 800000b1 int_info_err 0
>> kvm_entry:            vcpu 0
>> kvm_exit:             reason APIC_ACCESS rip 0xfffff8000081fe11 info 10b0 0
>> kvm_apic:             apic_write APIC_EOI = 0x0
>> kvm_eoi:              apicid 0 vector 177
>> kvm_fpu:              unload
>> kvm_userspace_exit:   reason KVM_EXIT_IOAPIC_EOI (26)
>> ...
>> (and the pattern repeats)
>> 
>> Maybe it is a usb-only/Qemu-only problem, maybe not.
>> 
>> -- 
>> Vitaly
>
> The trace of kvm_apic_accept_irq should indicate that __apic_accept_irq() was called to inject an interrupt to L1 guest.
> (I know that now we are running in L1 because next exit is a VMRESUME).
>
> However, it is surprising to see that on next entry to guest, no interrupt was injected by vmx_inject_irq().
> It may be because L1 guest is currently running with interrupt disabled and therefore only an IRQ-window was requested.
> (Too bad we don’t have a trace for this…)
>
> Next, we got an exit from L1 guest on VMRESUME. As part of it’s handling, active VMCS was changed from vmcs01 to vmcs02.
> I believe the immediate exit later on preemption-timer was because the immediate-exit-request mechanism was invoked
> which is now implemented by setting a VMX preemption-timer with value of 0 (Thanks to Sean).
> (See vmx_vcpu_run() -> vmx_update_hv_timer() -> vmx_arm_hv_timer(vmx, 0)).
> (Note that the pending interrupt was evaluated because of a recent patch of mine to nested_vmx_enter_non_root_mode()
> to request KVM_REQ_EVENT when vmcs01 have requested an IRQ-window)
>
> Therefore when entering L2, you immediately get an exit on PREEMPTION_TIMER which will cause eventually L0 to call
> vmx_check_nested_events() which notices now the pending interrupt that should have been injected before to L1
> and now exit from L2 to L1 on EXTERNAL_INTERRUPT on vector 0xb1.
>
> Then L1 handles the interrupt by performing an EOI to LAPIC which propagate an EOI to IOAPIC which immediately re-inject
> the interrupt (after clearing the remote_irr) as the irq-line is still set. i.e. QEMU’s ioapic_eoi_broadcast() calls ioapic_service() immediate after it clears remote-irr for this pin.
>
> Also note that in trace we see only a single kvm_set_irq to level 1 but we don’t see immediately another kvm_set_irq to level 0.
> This should indicate that in QEMU’s IOAPIC redirection-table, this pin is configured as level-triggered interrupt.
> However, the trace of kvm_apic_accept_irq indicates that this interrupt is raised as an edge-triggered interrupt.
>
> To sum up:
> 1) I would create a patch to add a trace to vcpu_enter_guest() when calling enable_smi_window() / enable_nmi_window() / enable_irq_window().
> 2) It is worth investigating why MSI trigger-mode is edge-triggered instead of level-triggered.
> 3) If this is indeed a level-triggered interrupt, it is worth investigating how the interrupt source behaves. i.e. What cause this device to lower the irq-line?
> (As we don’t see any I/O Port or MMIO access by L1 guest interrupt-handler before performing the EOI)
> 4) Does this issue reproduce also when running with kernel-irqchip? (Instead of split-irqchip)
>

Thank you Liran,

all are valuable suggestions. It seems the isssue doesn't reproduce with
'kernel-irqchip=on' but reproduces with "kernel-irqchip=split". My first
guess would then be that we're less picky with in-kernel implementation
about the observed edge/level discrepancy. I'll be investigating and
share my findings.

-- 
Vitaly

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-03-27 10:08 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-26 13:07 [PATCH] KVM: x86: nVMX: allow RSM to restore VMXE CR4 flag Vitaly Kuznetsov
2019-03-26 13:11 ` Liran Alon
2019-03-26 13:48   ` Vitaly Kuznetsov
2019-03-26 15:02     ` Liran Alon
2019-03-27 10:08       ` Vitaly Kuznetsov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).