Re: [PATCH v1 12/23] KVM: VMX: Handle FRED event data

From: Chao Gao <chao.gao@intel.com>
To: Xin Li <xin3.li@intel.com>
Cc: <kvm@vger.kernel.org>, <linux-doc@vger.kernel.org>,
	<linux-kernel@vger.kernel.org>, <linux-hyperv@vger.kernel.org>,
	<linux-kselftest@vger.kernel.org>, <seanjc@google.com>,
	<pbonzini@redhat.com>, <corbet@lwn.net>, <kys@microsoft.com>,
	<haiyangz@microsoft.com>, <wei.liu@kernel.org>,
	<decui@microsoft.com>, <tglx@linutronix.de>, <mingo@redhat.com>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <x86@kernel.org>,
	<hpa@zytor.com>, <vkuznets@redhat.com>, <peterz@infradead.org>,
	<ravi.v.shankar@intel.com>
Subject: Re: [PATCH v1 12/23] KVM: VMX: Handle FRED event data
Date: Mon, 13 Nov 2023 18:14:57 +0800	[thread overview]
Message-ID: <ZVH3IUsfvzuPaj6L@chao-email> (raw)
In-Reply-To: <20231108183003.5981-13-xin3.li@intel.com>

On Wed, Nov 08, 2023 at 10:29:52AM -0800, Xin Li wrote:
>Set injected-event data when injecting a #PF, #DB, or #NM caused
>by extended feature disable using FRED event delivery, and save
>original-event data for being used as injected-event data.
>
>Unlike IDT using some extra CPU register as part of an event
>context, e.g., %cr2 for #PF, FRED saves a complete event context
>in its stack frame, e.g., FRED saves the faulting linear address
>of a #PF into the event data field defined in its stack frame.
>
>Thus a new VMX control field called injected-event data is added
>to provide the event data that will be pushed into a FRED stack
>frame for VM entries that inject an event using FRED event delivery.
>In addition, a new VM exit information field called original-event
>data is added to store the event data that would have saved into a
>FRED stack frame for VM exits that occur during FRED event delivery.
>After such a VM exit is handled to allow the original-event to be
>delivered, the data in the original-event data VMCS field needs to
>be set into the injected-event data VMCS field for the injection of
>the original event.
>
>Tested-by: Shan Kang <shan.kang@intel.com>
>Signed-off-by: Xin Li <xin3.li@intel.com>
>---
> arch/x86/include/asm/vmx.h |  4 ++
> arch/x86/kvm/vmx/vmx.c     | 84 +++++++++++++++++++++++++++++++++++---
> arch/x86/kvm/x86.c         | 10 ++++-
> 3 files changed, 91 insertions(+), 7 deletions(-)
>
>diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>index d54a1a1057b0..97729248e844 100644
>--- a/arch/x86/include/asm/vmx.h
>+++ b/arch/x86/include/asm/vmx.h
>@@ -253,8 +253,12 @@ enum vmcs_field {
> 	PID_POINTER_TABLE_HIGH		= 0x00002043,
> 	SECONDARY_VM_EXIT_CONTROLS	= 0x00002044,
> 	SECONDARY_VM_EXIT_CONTROLS_HIGH	= 0x00002045,
>+	INJECTED_EVENT_DATA		= 0x00002052,
>+	INJECTED_EVENT_DATA_HIGH	= 0x00002053,
> 	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
> 	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
>+	ORIGINAL_EVENT_DATA		= 0x00002404,
>+	ORIGINAL_EVENT_DATA_HIGH	= 0x00002405,
> 	VMCS_LINK_POINTER               = 0x00002800,
> 	VMCS_LINK_POINTER_HIGH          = 0x00002801,
> 	GUEST_IA32_DEBUGCTL             = 0x00002802,
>diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>index 58d01e845804..67fd4a56d031 100644
>--- a/arch/x86/kvm/vmx/vmx.c
>+++ b/arch/x86/kvm/vmx/vmx.c
>@@ -1880,9 +1880,30 @@ static void vmx_inject_exception(struct kvm_vcpu *vcpu)
> 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
> 			     vmx->vcpu.arch.event_exit_inst_len);
> 		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
>-	} else
>+	} else {
> 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
> 
>+		if (kvm_is_fred_enabled(vcpu)) {
>+			u64 event_data = 0;
>+
>+			if (is_debug(intr_info))
>+				/*
>+				 * Compared to DR6, FRED #DB event data saved on
>+				 * the stack frame have bits 4 ~ 11 and 16 ~ 31
>+				 * inverted, i.e.,
>+				 *   fred_db_event_data = dr6 ^ 0xFFFF0FF0UL
>+				 */
>+				event_data = vcpu->arch.dr6 ^ DR6_RESERVED;
>+			else if (is_page_fault(intr_info))
>+				event_data = vcpu->arch.cr2;
>+			else if (is_nm_fault(intr_info) &&
>+				 vcpu->arch.guest_fpu.fpstate->xfd)

does this necessarily mean the #NM is caused by XFD?

>+				event_data = vcpu->arch.guest_fpu.xfd_err;
>+
>+			vmcs_write64(INJECTED_EVENT_DATA, event_data);
>+		}
>+	}
>+
> 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
> 
> 	vmx_clear_hlt(vcpu);
>@@ -7226,7 +7247,8 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
> static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
> 				      u32 idt_vectoring_info,
> 				      int instr_len_field,
>-				      int error_code_field)
>+				      int error_code_field,
>+				      int event_data_field)

event_data_field is used to indicate whether this is a "cancel". I may think it is
better to simply use a boolean e.g., bool cancel.

> {
> 	u8 vector;
> 	int type;
>@@ -7260,6 +7282,37 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
> 		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
> 		fallthrough;
> 	case INTR_TYPE_HARD_EXCEPTION:
>+		if (kvm_is_fred_enabled(vcpu) && event_data_field) {
>+			/*
>+			 * Save original-event data for being used as injected-event data.
>+			 */

Looks we also expect CPU will update CR2/DR6/XFD_ERR. this hunk looks to me just a paranoid
check to ensure the cpu works as expected. if that's the case, I suggest documenting it
a bit in the comment.

>+			u64 event_data = vmcs_read64(event_data_field);
>+
>+			switch (vector) {
>+			case DB_VECTOR:
>+				get_debugreg(vcpu->arch.dr6, 6);
>+				WARN_ON(vcpu->arch.dr6 != (event_data ^ DR6_RESERVED));
>+				vcpu->arch.dr6 = event_data ^ DR6_RESERVED;
>+				break;
>+			case NM_VECTOR:
>+				if (vcpu->arch.guest_fpu.fpstate->xfd) {
>+					rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
>+					WARN_ON(vcpu->arch.guest_fpu.xfd_err != event_data);
>+					vcpu->arch.guest_fpu.xfd_err = event_data;
>+				} else {
>+					WARN_ON(event_data != 0);
>+				}
>+				break;
>+			case PF_VECTOR:
>+				WARN_ON(vcpu->arch.cr2 != event_data);
>+				vcpu->arch.cr2 = event_data;
>+				break;
>+			default:
>+				WARN_ON(event_data != 0);

I am not sure if this WARN_ON() can be triggeded by nested VMX. It is
legitimate for L1 VMM to inject any event w/ an event_data.

FRED spec says:

Section 5.2.1 specifies the event data that FRED event delivery of certain events saves
on the stack. When FRED event delivery is used for an event injected by VM entry, the
event data saved is the value of the injected-event-data field in the VMCS. This value is
used instead of what is specified in Section 5.2.1 and is done for __ALL__ injected events
using FRED event delivery

>+				break;
>+			}
>+		}
>+
> 		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
> 			u32 err = vmcs_read32(error_code_field);
> 			kvm_requeue_exception_e(vcpu, vector, err);
>@@ -7279,9 +7332,11 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
> 
> static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
> {
>-	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
>+	__vmx_complete_interrupts(&vmx->vcpu,
>+				  vmx->idt_vectoring_info,
> 				  VM_EXIT_INSTRUCTION_LEN,
>-				  IDT_VECTORING_ERROR_CODE);
>+				  IDT_VECTORING_ERROR_CODE,
>+				  ORIGINAL_EVENT_DATA);
> }
> 
> static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
>@@ -7289,7 +7344,8 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
> 	__vmx_complete_interrupts(vcpu,
> 				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
> 				  VM_ENTRY_INSTRUCTION_LEN,
>-				  VM_ENTRY_EXCEPTION_ERROR_CODE);
>+				  VM_ENTRY_EXCEPTION_ERROR_CODE,
>+				  0);
> 
> 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
> }