All of lore.kernel.org
 help / color / mirror / Atom feed
From: Sean Christopherson <seanjc@google.com>
To: Paolo Bonzini <pbonzini@redhat.com>
Cc: linux-kernel@vger.kernel.org, kvm@vger.kernel.org,
	michael.roth@amd.com,  isaku.yamahata@intel.com,
	thomas.lendacky@amd.com
Subject: Re: [PATCH 07/21] KVM: VMX: Introduce test mode related to EPT violation VE
Date: Tue, 27 Feb 2024 17:56:27 -0800	[thread overview]
Message-ID: <Zd6Sy_PujXJVji0n@google.com> (raw)
In-Reply-To: <20240227232100.478238-8-pbonzini@redhat.com>

On Tue, Feb 27, 2024, Paolo Bonzini wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> To support TDX, KVM is enhanced to operate with #VE.  For TDX, KVM uses the
> suppress #VE bit in EPT entries selectively, in order to be able to trap
> non-present conditions.  However, #VE isn't used for VMX and it's a bug
> if it happens.  To be defensive and test that VMX case isn't broken
> introduce an option ept_violation_ve_test and when it's set, BUG the vm.

This needs to be two patches:

 1. Add the architecture #defines, enums, structures, and is_ve_fault().
 2. Add the forced #VE enabling test code

> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> Message-Id: <d6db6ba836605c0412e166359ba5c46a63c22f86.1705965635.git.isaku.yamahata@intel.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/vmx.h | 12 +++++++
>  arch/x86/kvm/vmx/vmcs.h    |  5 +++
>  arch/x86/kvm/vmx/vmx.c     | 69 +++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/vmx/vmx.h     |  6 +++-
>  4 files changed, 90 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 76ed39541a52..f703bae0c4ac 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -70,6 +70,7 @@
>  #define SECONDARY_EXEC_ENCLS_EXITING		VMCS_CONTROL_BIT(ENCLS_EXITING)
>  #define SECONDARY_EXEC_RDSEED_EXITING		VMCS_CONTROL_BIT(RDSEED_EXITING)
>  #define SECONDARY_EXEC_ENABLE_PML               VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
> +#define SECONDARY_EXEC_EPT_VIOLATION_VE		VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
>  #define SECONDARY_EXEC_PT_CONCEAL_VMX		VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
>  #define SECONDARY_EXEC_ENABLE_XSAVES		VMCS_CONTROL_BIT(XSAVES)
>  #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC	VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
> @@ -225,6 +226,8 @@ enum vmcs_field {
>  	VMREAD_BITMAP_HIGH              = 0x00002027,
>  	VMWRITE_BITMAP                  = 0x00002028,
>  	VMWRITE_BITMAP_HIGH             = 0x00002029,
> +	VE_INFORMATION_ADDRESS		= 0x0000202A,
> +	VE_INFORMATION_ADDRESS_HIGH	= 0x0000202B,
>  	XSS_EXIT_BITMAP                 = 0x0000202C,
>  	XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
>  	ENCLS_EXITING_BITMAP		= 0x0000202E,
> @@ -630,4 +633,13 @@ enum vmx_l1d_flush_state {
>  
>  extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
>  
> +struct vmx_ve_information {
> +	u32 exit_reason;
> +	u32 delivery;
> +	u64 exit_qualification;
> +	u64 guest_linear_address;
> +	u64 guest_physical_address;
> +	u16 eptp_index;
> +};

Should this be __packed since it's hardware-defined, or are we ok relying on the
compiler to not be stupid?

>  #endif
> diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
> index 7c1996b433e2..b25625314658 100644
> --- a/arch/x86/kvm/vmx/vmcs.h
> +++ b/arch/x86/kvm/vmx/vmcs.h
> @@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info)
>  	return is_exception_n(intr_info, NM_VECTOR);
>  }
>  
> +static inline bool is_ve_fault(u32 intr_info)
> +{
> +	return is_exception_n(intr_info, VE_VECTOR);
> +}
> +
>  /* Undocumented: icebp/int1 */
>  static inline bool is_icebp(u32 intr_info)
>  {
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 9239a89dea22..6468f421ba9e 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -126,6 +126,9 @@ module_param(error_on_inconsistent_vmcs_config, bool, 0444);
>  static bool __read_mostly dump_invalid_vmcs = 0;
>  module_param(dump_invalid_vmcs, bool, 0644);
>  
> +static bool __read_mostly ept_violation_ve_test;
> +module_param(ept_violation_ve_test, bool, 0444);

I would much prefer to enable #VE if CONFIG_KVM_PROVE_MMU=y.  We already have
too many module params to deal with for testing, and practically speaking the
only people who will ever turn this on are the same people that run with
CONFIG_KVM_PROVE_MMU=y.

>  #define MSR_BITMAP_MODE_X2APIC		1
>  #define MSR_BITMAP_MODE_X2APIC_APICV	2
>  
> @@ -868,6 +871,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
>  
>  	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
>  	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
> +	/*
> +	 * #VE isn't used for VMX.  To test against unexpected changes
> +	 * related to #VE for VMX, intercept unexpected #VE and warn on it.
> +	 */
> +	if (ept_violation_ve_test)
> +		eb |= 1u << VE_VECTOR;
>  	/*
>  	 * Guest access to VMware backdoor ports could legitimately
>  	 * trigger #GP because of TSS I/O permission bitmap.
> @@ -2603,6 +2613,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  					&_cpu_based_2nd_exec_control))
>  			return -EIO;
>  	}
> +	if (!ept_violation_ve_test)
> +		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;

_If_ we add a module param, the param needs to be disabled if #VE isn't supported.

>  #ifndef CONFIG_X86_64
>  	if (!(_cpu_based_2nd_exec_control &
>  				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
> @@ -2627,6 +2640,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  			return -EIO;
>  
>  		vmx_cap->ept = 0;
> +		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>  	}
>  	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
>  	    vmx_cap->vpid) {
> @@ -4592,6 +4606,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>  		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
>  	if (!enable_ept) {
>  		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
> +		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
>  		enable_unrestricted_guest = 0;
>  	}
>  	if (!enable_unrestricted_guest)
> @@ -4715,8 +4730,40 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>  
>  	exec_controls_set(vmx, vmx_exec_control(vmx));
>  
> -	if (cpu_has_secondary_exec_ctrls())
> +	if (cpu_has_secondary_exec_ctrls()) {
>  		secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
> +		if (secondary_exec_controls_get(vmx) &
> +		    SECONDARY_EXEC_EPT_VIOLATION_VE) {
> +			if (!vmx->ve_info) {
> +				/* ve_info must be page aligned. */
> +				struct page *page;
> +
> +				BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
> +				page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> +				if (page)
> +					vmx->ve_info = page_to_virt(page);
> +			}
> +			if (vmx->ve_info) {
> +				/*
> +				 * Allow #VE delivery. CPU sets this field to
> +				 * 0xFFFFFFFF on #VE delivery.  Another #VE can
> +				 * occur only if software clears the field.
> +				 */
> +				vmx->ve_info->delivery = 0;

This is completely unnecessary, the entire page is zero-allocated.

> +				vmcs_write64(VE_INFORMATION_ADDRESS,
> +					     __pa(vmx->ve_info));
> +			} else {
> +				/*
> +				 * Because SECONDARY_EXEC_EPT_VIOLATION_VE is
> +				 * used only when ept_violation_ve_test is true,
> +				 * it's okay to go with the bit disabled.

No, it's not.  This is silly on multiple fronts.  (a) KVM knows if it's going to
enable #VE when the vCPU is first created, the allocation can and should be done
at that time along with all the other allocations needed for the VM.  (b) Except
for error injection from syzkaller and friends, there is basically zero chance
the VM will live on if one 4KiB allocation fails.  (c) This will never be enabled
in production; it's totally fine if the vCPU creation fails during testing, because
as a above, that will practically never happen outside of deliberate error
injection.

> +				 */
> +				pr_err("Failed to allocate ve_info. disabling EPT_VIOLATION_VE.\n");
> +				secondary_exec_controls_clearbit(vmx,
> +								 SECONDARY_EXEC_EPT_VIOLATION_VE);
> +			}
> +		}
> +	}
>  
>  	if (cpu_has_tertiary_exec_ctrls())
>  		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
> @@ -5204,6 +5251,12 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
>  	if (is_invalid_opcode(intr_info))
>  		return handle_ud(vcpu);
>  
> +	/*
> +	 * #VE isn't supposed to happen.  Block the VM if it does.

This is not a useful comment.  Obviously #VE isn't supposed to happen, otherwise
KVM wouldn't be bugging the VM. 

> +	 */
> +	if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
> +		return -EIO;
> +
>  	error_code = 0;
>  	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
>  		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> @@ -6393,6 +6446,18 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
>  	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
>  		pr_err("Virtual processor ID = 0x%04x\n",
>  		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
> +	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
> +		struct vmx_ve_information *ve_info;
> +
> +		pr_err("VE info address = 0x%016llx\n",
> +		       vmcs_read64(VE_INFORMATION_ADDRESS));
> +		ve_info = __va(vmcs_read64(VE_INFORMATION_ADDRESS));

Why!?!?  You have the address in vcpu_vmx, just use that.  If KVM is dumping
the VMCS, then something has gone wrong, possible in hardware or ucode.
Derefencing an address from the VMCS, which could very well be corrupted, is a
terrible idea.  This could easily escalate from a dead VM into a dead host.

> +		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
> +		       ve_info->exit_reason, ve_info->delivery,
> +		       ve_info->exit_qualification,
> +		       ve_info->guest_linear_address,
> +		       ve_info->guest_physical_address, ve_info->eptp_index);
> +	}
>  }
>  
>  /*
> @@ -7433,6 +7498,8 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
>  	free_vpid(vmx->vpid);
>  	nested_vmx_free_vcpu(vcpu);
>  	free_loaded_vmcs(vmx->loaded_vmcs);
> +	if (vmx->ve_info)

Unnecessary, free_page() does this for you.

> +		free_page((unsigned long)vmx->ve_info);
>  }
>  
>  static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index e3b0985bb74a..1ea1e5c8930d 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -364,6 +364,9 @@ struct vcpu_vmx {
>  		DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
>  		DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
>  	} shadow_msr_intercept;
> +
> +	/* ve_info must be page aligned. */

This is also not a useful comment.  Even the one at the allocation site is of
dubious value.

> +	struct vmx_ve_information *ve_info;
>  };
>  
>  struct kvm_vmx {
> @@ -576,7 +579,8 @@ static inline u8 vmx_get_rvi(void)
>  	 SECONDARY_EXEC_ENABLE_VMFUNC |					\
>  	 SECONDARY_EXEC_BUS_LOCK_DETECTION |				\
>  	 SECONDARY_EXEC_NOTIFY_VM_EXITING |				\
> -	 SECONDARY_EXEC_ENCLS_EXITING)
> +	 SECONDARY_EXEC_ENCLS_EXITING |					\
> +	 SECONDARY_EXEC_EPT_VIOLATION_VE)
>  
>  #define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
>  #define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL			\
> -- 
> 2.39.0
> 
> 

  reply	other threads:[~2024-02-28  1:56 UTC|newest]

Thread overview: 76+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-27 23:20 [PATCH 00/21] TDX/SNP part 1 of n, for 6.9 Paolo Bonzini
2024-02-27 23:20 ` [PATCH 01/21] KVM: x86: Split core of hypercall emulation to helper function Paolo Bonzini
2024-02-28  2:09   ` Xiaoyao Li
2024-03-05  6:24   ` Binbin Wu
2024-02-27 23:20 ` [PATCH 02/21] KVM: Allow page-sized MMU caches to be initialized with custom 64-bit values Paolo Bonzini
2024-02-29 13:46   ` Xiaoyao Li
2024-03-05  6:55   ` Binbin Wu
2024-03-26 15:56     ` Binbin Wu
2024-05-13 20:38       ` Isaku Yamahata
2024-05-13 20:51         ` Isaku Yamahata
2024-05-13 20:56         ` Sean Christopherson
2024-02-27 23:20 ` [PATCH 03/21] KVM: x86/mmu: Replace hardcoded value 0 for the initial value for SPTE Paolo Bonzini
2024-02-29 13:50   ` Xiaoyao Li
2024-03-05  7:09   ` Binbin Wu
2024-02-27 23:20 ` [PATCH 04/21] KVM: x86/mmu: Allow non-zero value for non-present SPTE and removed SPTE Paolo Bonzini
2024-02-29  7:00   ` Xu Yilun
2024-02-29 13:55   ` Xiaoyao Li
2024-03-11 23:26   ` Huang, Kai
2024-02-27 23:20 ` [PATCH 05/21] KVM: x86/mmu: Add Suppress VE bit to EPT shadow_mmio_mask/shadow_present_mask Paolo Bonzini
2024-03-01  7:26   ` Xiaoyao Li
2024-03-05 13:17   ` Binbin Wu
2024-02-27 23:20 ` [PATCH 06/21] KVM: x86/mmu: Track shadow MMIO value on a per-VM basis Paolo Bonzini
2024-03-01  7:44   ` Xiaoyao Li
2024-03-05  8:35   ` Binbin Wu
2024-03-12  1:21   ` Huang, Kai
2024-02-27 23:20 ` [PATCH 07/21] KVM: VMX: Introduce test mode related to EPT violation VE Paolo Bonzini
2024-02-28  1:56   ` Sean Christopherson [this message]
2024-03-12  1:35   ` Huang, Kai
2024-03-12 16:54     ` Sean Christopherson
2024-03-12 21:03       ` Huang, Kai
2024-02-27 23:20 ` [PATCH 08/21] KVM: VMX: Move out vmx_x86_ops to 'main.c' to dispatch VMX and TDX Paolo Bonzini
2024-02-27 23:20 ` [PATCH 09/21] KVM: VMX: Modify NMI and INTR handlers to take intr_info as function argument Paolo Bonzini
2024-03-04  8:09   ` Xiaoyao Li
2024-03-05 13:42   ` Binbin Wu
2024-03-12  1:43   ` Huang, Kai
2024-02-27 23:20 ` [PATCH 10/21] KVM: SEV: Use a VMSA physical address variable for populating VMCB Paolo Bonzini
2024-02-28  2:00   ` Sean Christopherson
2024-02-28 17:32     ` Paolo Bonzini
2024-02-29 16:02       ` Sean Christopherson
2024-02-27 23:20 ` [PATCH 11/21] KVM: x86/tdp_mmu: Init role member of struct kvm_mmu_page at allocation Paolo Bonzini
2024-03-03  4:47   ` Xu Yilun
2024-03-25 23:32   ` Edgecombe, Rick P
2024-02-27 23:20 ` [PATCH 12/21] KVM: x86/tdp_mmu: Sprinkle __must_check Paolo Bonzini
2024-03-04  8:29   ` Xiaoyao Li
2024-02-27 23:20 ` [PATCH 13/21] KVM: x86/mmu: Pass around full 64-bit error code for KVM page faults Paolo Bonzini
2024-03-04  8:56   ` Xiaoyao Li
2024-03-04 15:39     ` Sean Christopherson
2024-04-05 17:57     ` Paolo Bonzini
2024-02-27 23:20 ` [PATCH 14/21] KVM: x86/mmu: pass error code back to MMU when async pf is ready Paolo Bonzini
2024-02-28  2:03   ` Sean Christopherson
2024-02-28 13:13     ` Paolo Bonzini
2024-02-27 23:20 ` [PATCH 15/21] KVM: x86/mmu: Use PFERR_GUEST_ENC_MASK to indicate fault is private Paolo Bonzini
2024-02-27 23:20 ` [PATCH 16/21] KVM: guest_memfd: pass error up from filemap_grab_folio Paolo Bonzini
2024-03-03 14:41   ` Xu Yilun
2024-02-27 23:20 ` [PATCH 17/21] filemap: add FGP_CREAT_ONLY Paolo Bonzini
2024-02-28  2:14   ` Sean Christopherson
2024-02-28  2:17     ` Yosry Ahmed
2024-02-28 13:15       ` Matthew Wilcox
2024-02-28 13:28         ` Paolo Bonzini
2024-02-28 19:24           ` Matthew Wilcox
2024-02-28 20:17             ` Paolo Bonzini
2024-03-04  2:55           ` Xu Yilun
2024-02-27 23:20 ` [PATCH 18/21] KVM: x86: Add gmem hook for initializing memory Paolo Bonzini
2024-02-28 20:29   ` Isaku Yamahata
2024-02-27 23:20 ` [PATCH 19/21] KVM: guest_memfd: add API to undo kvm_gmem_get_uninit_pfn Paolo Bonzini
2024-03-04  4:44   ` Xu Yilun
2024-02-27 23:20 ` [PATCH 20/21] KVM: x86: Add gmem hook for invalidating memory Paolo Bonzini
2024-02-27 23:21 ` [PATCH 21/21] KVM: x86: Add gmem hook for determining max NPT mapping level Paolo Bonzini
2024-03-12  0:39   ` Binbin Wu
2024-03-12  0:48   ` Binbin Wu
2024-02-28  1:24 ` [PATCH 00/21] TDX/SNP part 1 of n, for 6.9 Sean Christopherson
2024-02-28 13:29   ` Paolo Bonzini
2024-02-28 16:39     ` Sean Christopherson
2024-02-28 17:20       ` Paolo Bonzini
2024-02-28 18:04         ` Sean Christopherson
2024-02-28  2:11 ` Sean Christopherson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Zd6Sy_PujXJVji0n@google.com \
    --to=seanjc@google.com \
    --cc=isaku.yamahata@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=michael.roth@amd.com \
    --cc=pbonzini@redhat.com \
    --cc=thomas.lendacky@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.