Re: [PATCH] KVM: x86/mmu: Do not create SPTEs for GFNs that exceed host.MAXPHYADDR

From: Paolo Bonzini <pbonzini@redhat.com>
To: Sean Christopherson <seanjc@google.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>,
	Wanpeng Li <wanpengli@tencent.com>,
	Jim Mattson <jmattson@google.com>, Joerg Roedel <joro@8bytes.org>,
	kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	Maxim Levitsky <mlevitsk@redhat.com>,
	Ben Gardon <bgardon@google.com>,
	David Matlack <dmatlack@google.com>
Subject: Re: [PATCH] KVM: x86/mmu: Do not create SPTEs for GFNs that exceed host.MAXPHYADDR
Date: Fri, 29 Apr 2022 12:36:17 +0200	[thread overview]
Message-ID: <337332ca-835c-087c-c99b-92c35ea8dcd3@redhat.com> (raw)
In-Reply-To: <20220428233416.2446833-1-seanjc@google.com>

On 4/29/22 01:34, Sean Christopherson wrote:

> +static inline gfn_t kvm_mmu_max_gfn_host(void)
> +{
> +	/*
> +	 * Disallow SPTEs (via memslots or cached MMIO) whose gfn would exceed
> +	 * host.MAXPHYADDR.  Assuming KVM is running on bare metal, guest
> +	 * accesses beyond host.MAXPHYADDR will hit a #PF(RSVD) and never hit
> +	 * an EPT Violation/Misconfig / #NPF, and so KVM will never install a
> +	 * SPTE for such addresses.  That doesn't hold true if KVM is running
> +	 * as a VM itself, e.g. if the MAXPHYADDR KVM sees is less than
> +	 * hardware's real MAXPHYADDR, but since KVM can't honor such behavior
> +	 * on bare metal, disallow it entirely to simplify e.g. the TDP MMU.
> +	 */
> +	return (1ULL << (shadow_phys_bits - PAGE_SHIFT)) - 1;

However, host.MAXPHYADDR does not matter if EPT/NPT is not in use, because
the shadow paging fault path can accept any gfn.

> -static inline gfn_t tdp_mmu_max_gfn_host(void)
> +static inline gfn_t tdp_mmu_max_exclusive_gfn_host(void)
>   {
>   	/*
> -	 * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
> -	 * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
> -	 * and so KVM will never install a SPTE for such addresses.
> +	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
> +	 * a gpa range that would exceed the max gfn, and KVM does not create
> +	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
> +	 * the slow emulation path every time.
>   	 */
> -	return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
> +	return kvm_mmu_max_gfn_host() + 1;
>   }

A slightly nicer name would be tdp_mmu_max_gfn_exclusive().  It is necessarily
the host's limit because EPT/NPT is in use, so the "_host" suffix is not really
needed.

> +		 * whose gfn is greater than host.MAXPHYADDR, any guest that
> +		 * generates such gfns is either malicious or in the weeds.
> +		 * Note, it's possible to observe a gfn > host.MAXPHYADDR if
> +		 * and only if host.MAXPHYADDR is inaccurate with respect to
> +		 * hardware behavior, e.g. if KVM itself is running as a VM.

I don't think maliciousness is particularly likely, and "in the weeds" implies
L2 is the buggy one.  Slightly more accurate:

         * whose gfn is greater than host.MAXPHYADDR, any guest that
         * generates such gfns is running nested and is being tricked
         * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
         * and only if L1's MAXPHYADDR is inaccurate with respect to
         * the hardware's).

Putting everything together and rebasing on top of kvm/master:

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e6cae6f22683..dba275d323a7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e)
  	return ((2ULL << (e - s)) - 1) << s;
  }
  
+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+extern u8 __read_mostly shadow_phys_bits;
+
+static inline gfn_t kvm_mmu_max_gfn(void)
+{
+	/*
+	 * Note that this uses the host MAXPHYADDR, not the guest's.
+	 * EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
+	 * assuming KVM is running on bare metal, guest accesses beyond
+	 * host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
+	 * (either EPT Violation/Misconfig or #NPF), and so KVM will never
+	 * install a SPTE for such addresses.  If KVM is running as a VM
+	 * itself, on the other hand, it might see a MAXPHYADDR that is less
+	 * than hardware's real MAXPHYADDR.  Using the host MAXPHYADDR
+	 * disallows such SPTEs entirely and simplifies the TDP MMU.
+	 */
+	int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
+
+	return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
+}
+
  void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
  void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
  
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index af7910a46c12..7b632a4f81cb 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3033,9 +3033,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
  		/*
  		 * If MMIO caching is disabled, emulate immediately without
  		 * touching the shadow page tables as attempting to install an
-		 * MMIO SPTE will just be an expensive nop.
+		 * MMIO SPTE will just be an expensive nop.  Do not cache MMIO
+		 * whose gfn is greater than host.MAXPHYADDR, any guest that
+		 * generates such gfns is running nested and is being tricked
+		 * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
+		 * and only if L1's MAXPHYADDR is inaccurate with respect to
+		 * the hardware's).
  		 */
-		if (unlikely(!shadow_mmio_value)) {
+		if (unlikely(!shadow_mmio_value) ||
+		    unlikely(fault->gfn > kvm_mmu_max_gfn())) {
  			*ret_val = RET_PF_EMULATE;
  			return true;
  		}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 73f12615416f..e4abeb5df1b1 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte)
   */
  extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
  
-/*
- * The number of non-reserved physical address bits irrespective of features
- * that repurpose legal bits, e.g. MKTME.
- */
-extern u8 __read_mostly shadow_phys_bits;
-
  static inline bool is_mmio_spte(u64 spte)
  {
  	return (spte & shadow_mmio_mask) == shadow_mmio_value &&
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c472769e0300..edc68538819b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -815,14 +815,15 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
  	return iter->yielded;
  }
  
-static inline gfn_t tdp_mmu_max_gfn_host(void)
+static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
  {
  	/*
-	 * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
-	 * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
-	 * and so KVM will never install a SPTE for such addresses.
+	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
+	 * a gpa range that would exceed the max gfn, and KVM does not create
+	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
+	 * the slow emulation path every time.
  	 */
-	return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+	return kvm_mmu_max_gfn() + 1;
  }
  
  static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -830,7 +831,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
  {
  	struct tdp_iter iter;
  
-	gfn_t end = tdp_mmu_max_gfn_host();
+	gfn_t end = tdp_mmu_max_gfn_exclusive();
  	gfn_t start = 0;
  
  	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
@@ -923,7 +924,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
  {
  	struct tdp_iter iter;
  
-	end = min(end, tdp_mmu_max_gfn_host());
+	end = min(end, tdp_mmu_max_gfn_exclusive());
  
  	lockdep_assert_held_write(&kvm->mmu_lock);
  
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 278b2fdd3590..015ecc249c2e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11994,8 +11994,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
  				   struct kvm_memory_slot *new,
  				   enum kvm_mr_change change)
  {
-	if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
+	if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
+		if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
+			return -EINVAL;
+
  		return kvm_alloc_memslot_metadata(kvm, new);
+	}
  
  	if (change == KVM_MR_FLAGS_ONLY)
  		memcpy(&new->arch, &old->arch, sizeof(old->arch));