* [PATCH v1 0/3] SVM 5-level page table support
@ 2021-08-05 20:55 Wei Huang
  2021-08-05 20:55 ` [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code Wei Huang
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Wei Huang @ 2021-08-05 20:55 UTC (permalink / raw)
  To: kvm
  Cc: linux-kernel, pbonzini, vkuznets, seanjc, wanpengli, jmattson,
	joro, tglx, mingo, bp, x86, hpa, wei.huang2

This patch set adds 5-level page table support for AMD SVM. When the
5-level page table is enabled on the host OS, the nested page table for
guest VMs will use the same format as the host OS (i.e. 5-level NPT).
These patches were tested with various combinations of settings
(nested/regular VMs, AMD64/i686 kernels, etc.).

Thanks,
-Wei

Wei Huang (3):
  KVM: x86: Convert TDP level calculation to vendor's specific code
  KVM: x86: Handle the case of 5-level shadow page table
  KVM: SVM: Add 5-level page table support for SVM

 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  6 +--
 arch/x86/kvm/mmu/mmu.c             | 68 +++++++++++++++++-------------
 arch/x86/kvm/svm/svm.c             | 14 +++---
 arch/x86/kvm/vmx/vmx.c             |  7 +--
 5 files changed, 52 insertions(+), 44 deletions(-)

-- 
2.31.1



* [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code
  2021-08-05 20:55 [PATCH v1 0/3] SVM 5-level page table support Wei Huang
@ 2021-08-05 20:55 ` Wei Huang
  2021-08-05 21:51   ` Sean Christopherson
  2021-08-05 22:35   ` Jim Mattson
  2021-08-05 20:55 ` [PATCH v1 2/3] KVM: x86: Handle the case of 5-level shadow page table Wei Huang
  2021-08-05 20:55 ` [PATCH v1 3/3] KVM: SVM: Add 5-level page table support for SVM Wei Huang
  2 siblings, 2 replies; 11+ messages in thread
From: Wei Huang @ 2021-08-05 20:55 UTC (permalink / raw)
  To: kvm
  Cc: linux-kernel, pbonzini, vkuznets, seanjc, wanpengli, jmattson,
	joro, tglx, mingo, bp, x86, hpa, wei.huang2

Currently the TDP level for x86 vCPU is calculated by checking both
MAXPHYADDR and max_tdp_level. This design assumes that all x86 CPUs have
the flexibility of changing the nested page table level different from host
CPU. This assumption might not be true. To solve this problem, let us
create a kvm_x86_ops specific function for TDP level calculation.

Signed-off-by: Wei Huang <wei.huang2@amd.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  5 ++---
 arch/x86/kvm/mmu/mmu.c             | 22 +++++-----------------
 arch/x86/kvm/svm/svm.c             |  5 +++--
 arch/x86/kvm/vmx/vmx.c             |  7 ++++---
 5 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index a12a4987154e..9853a7c9e4b7 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -85,6 +85,7 @@ KVM_X86_OP_NULL(sync_pir_to_irr)
 KVM_X86_OP(set_tss_addr)
 KVM_X86_OP(set_identity_map_addr)
 KVM_X86_OP(get_mt_mask)
+KVM_X86_OP(get_tdp_level)
 KVM_X86_OP(load_mmu_pgd)
 KVM_X86_OP_NULL(has_wbinvd_exit)
 KVM_X86_OP(get_l2_tsc_offset)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 974cbfb1eefe..20ddfbac966e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -723,7 +723,6 @@ struct kvm_vcpu_arch {
 
 	u64 reserved_gpa_bits;
 	int maxphyaddr;
-	int max_tdp_level;
 
 	/* emulate context */
 
@@ -1365,6 +1364,7 @@ struct kvm_x86_ops {
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+	int (*get_tdp_level)(struct kvm_vcpu *vcpu);
 
 	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 			     int root_level);
@@ -1747,8 +1747,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
 
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
-		       int tdp_huge_page_level);
+void kvm_configure_mmu(bool enable_tdp, int tdp_huge_page_level);
 
 static inline u16 kvm_read_ldt(void)
 {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 66f7f5bc3482..44e4561e41f5 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -97,7 +97,6 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
 bool tdp_enabled = false;
 
 static int max_huge_page_level __read_mostly;
-static int max_tdp_level __read_mostly;
 
 enum {
 	AUDIT_PRE_PAGE_FAULT,
@@ -4560,15 +4559,6 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
 	return role;
 }
 
-static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
-{
-	/* Use 5-level TDP if and only if it's useful/necessary. */
-	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
-		return 4;
-
-	return max_tdp_level;
-}
-
 static union kvm_mmu_role
 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 				struct kvm_mmu_role_regs *regs, bool base_only)
@@ -4576,7 +4566,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
 
 	role.base.ad_disabled = (shadow_accessed_mask == 0);
-	role.base.level = kvm_mmu_get_tdp_level(vcpu);
+	role.base.level = static_call(kvm_x86_get_tdp_level)(vcpu);
 	role.base.direct = true;
 	role.base.gpte_is_8_bytes = true;
 
@@ -4597,7 +4587,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->page_fault = kvm_tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = NULL;
-	context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
+	context->shadow_root_level = static_call(kvm_x86_get_tdp_level)(vcpu);
 	context->direct_map = true;
 	context->get_guest_pgd = get_cr3;
 	context->get_pdptr = kvm_pdptr_read;
@@ -4688,7 +4678,7 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
 		kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
 
 	role.base.direct = false;
-	role.base.level = kvm_mmu_get_tdp_level(vcpu);
+	role.base.level = static_call(kvm_x86_get_tdp_level)(vcpu);
 
 	return role;
 }
@@ -5253,11 +5243,9 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 	 */
 }
 
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
-		       int tdp_huge_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_huge_page_level)
 {
 	tdp_enabled = enable_tdp;
-	max_tdp_level = tdp_max_root_level;
 
 	/*
 	 * max_huge_page_level reflects KVM's MMU capabilities irrespective
@@ -5356,7 +5344,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
 	 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
 	 * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
 	 */
-	if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
+	if (tdp_enabled && static_call(kvm_x86_get_tdp_level)(vcpu) > PT32E_ROOT_LEVEL)
 		return 0;
 
 	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e8ccab50ebf6..04710e10d04a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -258,7 +258,7 @@ u32 svm_msrpm_offset(u32 msr)
 
 #define MAX_INST_SIZE 15
 
-static int get_max_npt_level(void)
+static int svm_get_npt_level(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
 	return PT64_ROOT_4LEVEL;
@@ -1015,7 +1015,7 @@ static __init int svm_hardware_setup(void)
 	if (!boot_cpu_has(X86_FEATURE_NPT))
 		npt_enabled = false;
 
-	kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
+	kvm_configure_mmu(npt_enabled, PG_LEVEL_1G);
 	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
 
 	/* Note, SEV setup consumes npt_enabled. */
@@ -4619,6 +4619,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.set_tss_addr = svm_set_tss_addr,
 	.set_identity_map_addr = svm_set_identity_map_addr,
 	.get_mt_mask = svm_get_mt_mask,
+	.get_tdp_level = svm_get_npt_level,
 
 	.get_exit_info = svm_get_exit_info,
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 927a552393b9..419cea586646 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3062,9 +3062,9 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	vmx->emulation_required = emulation_required(vcpu);
 }
 
-static int vmx_get_max_tdp_level(void)
+static int vmx_get_tdp_level(struct kvm_vcpu *vcpu)
 {
-	if (cpu_has_vmx_ept_5levels())
+	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
 		return 5;
 	return 4;
 }
@@ -7613,6 +7613,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.set_tss_addr = vmx_set_tss_addr,
 	.set_identity_map_addr = vmx_set_identity_map_addr,
 	.get_mt_mask = vmx_get_mt_mask,
+	.get_tdp_level = vmx_get_tdp_level,
 
 	.get_exit_info = vmx_get_exit_info,
 
@@ -7803,7 +7804,7 @@ static __init int hardware_setup(void)
 		ept_lpage_level = PG_LEVEL_2M;
 	else
 		ept_lpage_level = PG_LEVEL_4K;
-	kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
+	kvm_configure_mmu(enable_ept, ept_lpage_level);
 
 	/*
 	 * Only enable PML when hardware supports PML feature, and both EPT
-- 
2.31.1



* [PATCH v1 2/3] KVM: x86: Handle the case of 5-level shadow page table
  2021-08-05 20:55 [PATCH v1 0/3] SVM 5-level page table support Wei Huang
  2021-08-05 20:55 ` [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code Wei Huang
@ 2021-08-05 20:55 ` Wei Huang
  2021-08-06 17:58   ` Sean Christopherson
  2021-08-05 20:55 ` [PATCH v1 3/3] KVM: SVM: Add 5-level page table support for SVM Wei Huang
  2 siblings, 1 reply; 11+ messages in thread
From: Wei Huang @ 2021-08-05 20:55 UTC (permalink / raw)
  To: kvm
  Cc: linux-kernel, pbonzini, vkuznets, seanjc, wanpengli, jmattson,
	joro, tglx, mingo, bp, x86, hpa, wei.huang2

When the 5-level page table CPU flag is exposed, KVM code needs to handle
this case by pointing mmu->root_hpa to a properly-constructed 5-level page
table.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Wei Huang <wei.huang2@amd.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu/mmu.c          | 46 +++++++++++++++++++++++----------
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 20ddfbac966e..8586ffdf4de8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -447,6 +447,7 @@ struct kvm_mmu {
 
 	u64 *pae_root;
 	u64 *pml4_root;
+	u64 *pml5_root;
 
 	/*
 	 * check zero bits on shadow page table entries, these
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 44e4561e41f5..b162c3e530aa 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3428,7 +3428,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	 * the shadow page table may be a PAE or a long mode page table.
 	 */
 	pm_mask = PT_PRESENT_MASK | shadow_me_mask;
-	if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
+	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
 		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
 		if (WARN_ON_ONCE(!mmu->pml4_root)) {
@@ -3454,11 +3454,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 				      PT32_ROOT_LEVEL, false);
 		mmu->pae_root[i] = root | pm_mask;
 	}
+	mmu->root_hpa = __pa(mmu->pae_root);
 
-	if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+		mmu->pml4_root[0] = mmu->root_hpa | pm_mask;
 		mmu->root_hpa = __pa(mmu->pml4_root);
-	else
-		mmu->root_hpa = __pa(mmu->pae_root);
+	}
+
+	if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
+		mmu->pml5_root[0] = mmu->root_hpa | pm_mask;
+		mmu->root_hpa = __pa(mmu->pml5_root);
+	}
 
 set_root_pgd:
 	mmu->root_pgd = root_pgd;
@@ -3471,7 +3477,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
-	u64 *pml4_root, *pae_root;
+	u64 *pml5_root, *pml4_root, *pae_root;
 
 	/*
 	 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@@ -3487,17 +3493,18 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 	 * This mess only works with 4-level paging and needs to be updated to
 	 * work with 5-level paging.
 	 */
-	if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
+	if (WARN_ON_ONCE(mmu->shadow_root_level < PT64_ROOT_4LEVEL)) {
 		return -EIO;
+	}
 
-	if (mmu->pae_root && mmu->pml4_root)
+	if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)
 		return 0;
 
 	/*
 	 * The special roots should always be allocated in concert.  Yell and
 	 * bail if KVM ends up in a state where only one of the roots is valid.
 	 */
-	if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
+	if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || mmu->pml5_root))
 		return -EIO;
 
 	/*
@@ -3506,18 +3513,30 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 	 */
 	pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
 	if (!pae_root)
-		return -ENOMEM;
+		goto err_out;
 
 	pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
-	if (!pml4_root) {
-		free_page((unsigned long)pae_root);
-		return -ENOMEM;
-	}
+	if (!pml4_root)
+		goto err_out;
+
+	pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+	if (!pml5_root)
+		goto err_out;
 
 	mmu->pae_root = pae_root;
 	mmu->pml4_root = pml4_root;
+	mmu->pml5_root = pml5_root;
 
 	return 0;
+err_out:
+	if (pae_root)
+		free_page((unsigned long)pae_root);
+	if (pml4_root)
+		free_page((unsigned long)pml4_root);
+	if (pml5_root)
+		free_page((unsigned long)pml5_root);
+
+	return -ENOMEM;
 }
 
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -5320,6 +5339,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
 		set_memory_encrypted((unsigned long)mmu->pae_root, 1);
 	free_page((unsigned long)mmu->pae_root);
 	free_page((unsigned long)mmu->pml4_root);
+	free_page((unsigned long)mmu->pml5_root);
 }
 
 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
-- 
2.31.1



* [PATCH v1 3/3] KVM: SVM: Add 5-level page table support for SVM
  2021-08-05 20:55 [PATCH v1 0/3] SVM 5-level page table support Wei Huang
  2021-08-05 20:55 ` [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code Wei Huang
  2021-08-05 20:55 ` [PATCH v1 2/3] KVM: x86: Handle the case of 5-level shadow page table Wei Huang
@ 2021-08-05 20:55 ` Wei Huang
  2 siblings, 0 replies; 11+ messages in thread
From: Wei Huang @ 2021-08-05 20:55 UTC (permalink / raw)
  To: kvm
  Cc: linux-kernel, pbonzini, vkuznets, seanjc, wanpengli, jmattson,
	joro, tglx, mingo, bp, x86, hpa, wei.huang2

Future AMD CPUs will support 5-level page tables, whose use is indicated
by the X86_CR4_LA57 flag. When the 5-level page table is enabled on the
host OS, the nested page table for guest VMs must use 5 levels as well.
Update the get_npt_level() function to reflect this requirement. In the
meantime, remove the code that prevents the kvm-amd driver from being
loaded when a 5-level page table is detected.

Signed-off-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 04710e10d04a..f91ff7d2d9f9 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -261,7 +261,9 @@ u32 svm_msrpm_offset(u32 msr)
 static int svm_get_npt_level(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
-	return PT64_ROOT_4LEVEL;
+	bool la57 = (cr4_read_shadow() & X86_CR4_LA57) != 0;
+
+	return la57 ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 #else
 	return PT32E_ROOT_LEVEL;
 #endif
@@ -462,11 +464,6 @@ static int has_svm(void)
 		return 0;
 	}
 
-	if (pgtable_l5_enabled()) {
-		pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
-		return 0;
-	}
-
 	return 1;
 }
 
-- 
2.31.1



* Re: [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code
  2021-08-05 20:55 ` [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code Wei Huang
@ 2021-08-05 21:51   ` Sean Christopherson
  2021-08-05 22:26     ` Wei Huang
  2021-08-08 19:30     ` Wei Huang
  2021-08-05 22:35   ` Jim Mattson
  1 sibling, 2 replies; 11+ messages in thread
From: Sean Christopherson @ 2021-08-05 21:51 UTC (permalink / raw)
  To: Wei Huang
  Cc: kvm, linux-kernel, pbonzini, vkuznets, wanpengli, jmattson, joro,
	tglx, mingo, bp, x86, hpa

On Thu, Aug 05, 2021, Wei Huang wrote:
> Currently the TDP level for x86 vCPU is calculated by checking both
> MAXPHYADDR and max_tdp_level. This design assumes that all x86 CPUs have
> the flexibility of changing the nested page table level different from host
> CPU. This assumption might not be true.

Heh, no need to be circumspect, just state that 5-level NPT inherits CR4.LA57
from the host.  I didn't fully understand this sentence until I looked at patch 3.

> To solve this problem, let us
> create a kvm_x86_ops specific function for TDP level calculation.
> 
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> ---

...

> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 974cbfb1eefe..20ddfbac966e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -723,7 +723,6 @@ struct kvm_vcpu_arch {
>  
>  	u64 reserved_gpa_bits;
>  	int maxphyaddr;
> -	int max_tdp_level;

Ha, this is leftover crud that can get zapped no matter what.

>  	/* emulate context */
>  

...

> -static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
> -{
> -	/* Use 5-level TDP if and only if it's useful/necessary. */
> -	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)

I'd strongly prefer to keep this logic in the MMU.  When this was in vendor code,
there were multiple bugs where the MMU and VMX didn't communicate correctly, I
really don't want to back down that road.

Actually, I'm very, very tempted to say we should simply drop the cpuid_maxphyaddr()
bit and just return the max level (and I suppose rename it), e.g.

	return mmu_tdp_level;

It's effectively a single 4kb page per VM, and Intel's numbers on 5-level paging
were that there was no measurable cost to the extra level.  I would hope that
holds true here, too.
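
That is, kvm_mmu_get_tdp_level() would collapse to something like the sketch
below ("mmu_tdp_level" being the renamed max_tdp_level):

	static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
	{
		return mmu_tdp_level;
	}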

If we want to keep the MAXPHYADDR behavior, I'd vote for something like:

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b4b65c21b2ca..7e35f2bf89b4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
 bool tdp_enabled = false;

 static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
 static int max_tdp_level __read_mostly;

 enum {
@@ -4645,6 +4646,9 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,

 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 {
+       if (tdp_root_level)
+               return tdp_root_level;
+
        /* Use 5-level TDP if and only if it's useful/necessary. */
        if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
                return 4;
@@ -5336,10 +5340,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
         */
 }

-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
-                      int tdp_huge_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+                      int tdp_max_root_level, int tdp_huge_page_level)
 {
        tdp_enabled = enable_tdp;
+       tdp_root_level = tdp_forced_root_level;
        max_tdp_level = tdp_max_root_level;

        /*


* Re: [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code
  2021-08-05 21:51   ` Sean Christopherson
@ 2021-08-05 22:26     ` Wei Huang
  2021-08-08 19:30     ` Wei Huang
  1 sibling, 0 replies; 11+ messages in thread
From: Wei Huang @ 2021-08-05 22:26 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: kvm, linux-kernel, pbonzini, vkuznets, wanpengli, jmattson, joro,
	tglx, mingo, bp, x86, hpa



On 8/5/21 4:51 PM, Sean Christopherson wrote:
> On Thu, Aug 05, 2021, Wei Huang wrote:
>> Currently the TDP level for x86 vCPU is calculated by checking both
>> MAXPHYADDR and max_tdp_level. This design assumes that all x86 CPUs have
>> the flexibility of changing the nested page table level different from host
>> CPU. This assumption might not be true.
> 
> Heh, no need to be circumspect, just state that 5-level NPT inherits CR4.LA57
> from the host.  I didn't fully understand this sentence until I looked at patch 3.

Sure, I will fix the comments

> 
>> To solve this problem, let us
>> create a kvm_x86_ops specific function for TDP level calculation.
>>
>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
>> ---
> 
> ...
> 
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 974cbfb1eefe..20ddfbac966e 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -723,7 +723,6 @@ struct kvm_vcpu_arch {
>>  
>>  	u64 reserved_gpa_bits;
>>  	int maxphyaddr;
>> -	int max_tdp_level;
> 
> Ha, this is leftover crud that can get zapped no matter what.
> 

Correct, this field is not being used at this moment and should be removed.

>>  	/* emulate context */
>>  
> 
> ...
> 
>> -static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
>> -{
>> -	/* Use 5-level TDP if and only if it's useful/necessary. */
>> -	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
> 
> I'd strongly prefer to keep this logic in the MMU.  When this was in vendor code,
> there were multiple bugs where the MMU and VMX didn't communicate correctly, I
> really don't want to back down that road.
> 
> Actually, I'm very, very tempted to say we should simply drop the cpuid_maxphyaddr()
> bit and just return the max level (and I suppose rename it), e.g.
> 
> 	return mmu_tdp_level;
> 
> It's effectively a single 4kb page per VM, and Intel's numbers on 5-level paging
> were that there was no measurable cost to the extra level.  I would hope that
> holds true here, too.

4KB waste per VM is possibly OK. My concern is the unnecessary perf cost
of one extra level. But if you think the hit is minimal, then returning
mmu_tdp_level without checking cpuid_maxphyaddr() is cleaner.

> 
> If we want to keep the MAXPHYADDR behavior, I'd vote for something like:
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index b4b65c21b2ca..7e35f2bf89b4 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
>  bool tdp_enabled = false;
> 
>  static int max_huge_page_level __read_mostly;
> +static int tdp_root_level __read_mostly;
>  static int max_tdp_level __read_mostly;
> 
>  enum {
> @@ -4645,6 +4646,9 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
> 
>  static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
>  {
> +       if (tdp_root_level)
> +               return tdp_root_level;
> +
>         /* Use 5-level TDP if and only if it's useful/necessary. */
>         if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
>                 return 4;
> @@ -5336,10 +5340,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
>          */
>  }
> 
> -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
> -                      int tdp_huge_page_level)
> +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
> +                      int tdp_max_root_level, int tdp_huge_page_level)
>  {
>         tdp_enabled = enable_tdp;
> +       tdp_root_level = tdp_forced_root_level;
>         max_tdp_level = tdp_max_root_level;
> 
>         /*
> 


* Re: [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code
  2021-08-05 20:55 ` [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code Wei Huang
  2021-08-05 21:51   ` Sean Christopherson
@ 2021-08-05 22:35   ` Jim Mattson
  2021-08-05 22:44     ` Sean Christopherson
  1 sibling, 1 reply; 11+ messages in thread
From: Jim Mattson @ 2021-08-05 22:35 UTC (permalink / raw)
  To: Wei Huang
  Cc: kvm, linux-kernel, pbonzini, vkuznets, seanjc, wanpengli, joro,
	tglx, mingo, bp, x86, hpa

On Thu, Aug 5, 2021 at 1:55 PM Wei Huang <wei.huang2@amd.com> wrote:
>
> This design assumes that all x86 CPUs have the flexibility of changing the nested page table level different from host CPU.

I can't even parse this sentence. What are you trying to say here?


* Re: [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code
  2021-08-05 22:35   ` Jim Mattson
@ 2021-08-05 22:44     ` Sean Christopherson
  0 siblings, 0 replies; 11+ messages in thread
From: Sean Christopherson @ 2021-08-05 22:44 UTC (permalink / raw)
  To: Jim Mattson
  Cc: Wei Huang, kvm, linux-kernel, pbonzini, vkuznets, wanpengli,
	joro, tglx, mingo, bp, x86, hpa

On Thu, Aug 05, 2021, Jim Mattson wrote:
> On Thu, Aug 5, 2021 at 1:55 PM Wei Huang <wei.huang2@amd.com> wrote:
> >
> > This design assumes that all x86 CPUs have the flexibility of changing the
> > nested page table level different from host CPU.
> 
> I can't even parse this sentence. What are you trying to say here?

NPT inherits the host's CR4, and thus CR4.LA57.  So KVM NPT is stuck using whatever
N-level paging the host kernel is using.
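
In code terms that is essentially what patch 3 does, e.g. a 64-bit-only
sketch (ignoring the !CONFIG_X86_64 case):

	static int svm_get_npt_level(struct kvm_vcpu *vcpu)
	{
		/* NPT walks use the host's paging mode, so follow host CR4.LA57. */
		return (cr4_read_shadow() & X86_CR4_LA57) ? PT64_ROOT_5LEVEL :
							    PT64_ROOT_4LEVEL;
	}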


* Re: [PATCH v1 2/3] KVM: x86: Handle the case of 5-level shadow page table
  2021-08-05 20:55 ` [PATCH v1 2/3] KVM: x86: Handle the case of 5-level shadow page table Wei Huang
@ 2021-08-06 17:58   ` Sean Christopherson
  2021-08-08 17:49     ` Wei Huang
  0 siblings, 1 reply; 11+ messages in thread
From: Sean Christopherson @ 2021-08-06 17:58 UTC (permalink / raw)
  To: Wei Huang
  Cc: kvm, linux-kernel, pbonzini, vkuznets, wanpengli, jmattson, joro,
	tglx, mingo, bp, x86, hpa

On Thu, Aug 05, 2021, Wei Huang wrote:
> When the 5-level page table CPU flag is exposed, KVM code needs to handle
> this case by pointing mmu->root_hpa to a properly-constructed 5-level page
> table.
> 
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/mmu/mmu.c          | 46 +++++++++++++++++++++++----------
>  2 files changed, 34 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 20ddfbac966e..8586ffdf4de8 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -447,6 +447,7 @@ struct kvm_mmu {
>  
>  	u64 *pae_root;
>  	u64 *pml4_root;
> +	u64 *pml5_root;
>  
>  	/*
>  	 * check zero bits on shadow page table entries, these
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 44e4561e41f5..b162c3e530aa 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3428,7 +3428,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>  	 * the shadow page table may be a PAE or a long mode page table.
>  	 */
>  	pm_mask = PT_PRESENT_MASK | shadow_me_mask;
> -	if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
> +	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
>  		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
>  
>  		if (WARN_ON_ONCE(!mmu->pml4_root)) {
> @@ -3454,11 +3454,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>  				      PT32_ROOT_LEVEL, false);
>  		mmu->pae_root[i] = root | pm_mask;
>  	}
> +	mmu->root_hpa = __pa(mmu->pae_root);
>  
> -	if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
> +	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
> +		mmu->pml4_root[0] = mmu->root_hpa | pm_mask;
>  		mmu->root_hpa = __pa(mmu->pml4_root);
> -	else
> -		mmu->root_hpa = __pa(mmu->pae_root);
> +	}
> +
> +	if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
> +		mmu->pml5_root[0] = mmu->root_hpa | pm_mask;
> +		mmu->root_hpa = __pa(mmu->pml5_root);
> +	}

Ouch, the root_hpa chaining is subtle.  That's my fault :-)  I think it would be
better to explicitly chain pae->pml4->pml5?  E.g.

	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
		mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;

		if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
			mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
			mmu->root_hpa = __pa(mmu->pml5_root);
		} else {
			mmu->root_hpa = __pa(mmu->pml4_root);
		}
	} else {
		mmu->root_hpa = __pa(mmu->pae_root);
	}

It'd require more churn if we get to 6-level paging, but that's a risk I'm willing
to take ;-)

>  
>  set_root_pgd:
>  	mmu->root_pgd = root_pgd;
> @@ -3471,7 +3477,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>  static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_mmu *mmu = vcpu->arch.mmu;
> -	u64 *pml4_root, *pae_root;
> +	u64 *pml5_root, *pml4_root, *pae_root;
>  
>  	/*
>  	 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
> @@ -3487,17 +3493,18 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
>  	 * This mess only works with 4-level paging and needs to be updated to
>  	 * work with 5-level paging.
>  	 */
> -	if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
> +	if (WARN_ON_ONCE(mmu->shadow_root_level < PT64_ROOT_4LEVEL)) {

This is amusingly wrong.  The check above this is:

	if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
	    mmu->shadow_root_level < PT64_ROOT_4LEVEL)  <--------
		return 0;

meaning this is dead code.  It should simply be deleted.  If we reaaaaaly wanted to
future proof the code, we could do:

	if (WARN_ON_ONCE(mmu->shadow_root_level > PT64_ROOT_5LEVEL))
		return -EIO;

but at that point we're looking at a completely different architecture, so I don't
think we need to be that paranoid :-)

>  		return -EIO;
> +	}
>  
> -	if (mmu->pae_root && mmu->pml4_root)
> +	if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)
>  		return 0;
>  
>  	/*
>  	 * The special roots should always be allocated in concert.  Yell and
>  	 * bail if KVM ends up in a state where only one of the roots is valid.
>  	 */
> -	if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
> +	if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || mmu->pml5_root))
>  		return -EIO;
>  
>  	/*
> @@ -3506,18 +3513,30 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
>  	 */
>  	pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
>  	if (!pae_root)
> -		return -ENOMEM;
> +		goto err_out;

Branching to the error handling here is silly, it's the first allocation.

>  
>  	pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
> -	if (!pml4_root) {
> -		free_page((unsigned long)pae_root);
> -		return -ENOMEM;
> -	}
> +	if (!pml4_root)
> +		goto err_out;
> +
> +	pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);

This should be guarded by "mmu->shadow_root_level > PT64_ROOT_4LEVEL", there's no
need to waste a page on PML5 if it can't exist.
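E.g. (sketch only; "err_pml5" refers to the error-path labels suggested
further down):

	pml5_root = NULL;
	if (mmu->shadow_root_level > PT64_ROOT_4LEVEL) {
		pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
		if (!pml5_root)
			goto err_pml5;
	}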

> +	if (!pml5_root)
> +		goto err_out;
>  
>  	mmu->pae_root = pae_root;
>  	mmu->pml4_root = pml4_root;
> +	mmu->pml5_root = pml5_root;
>  
>  	return 0;
> +err_out:
> +	if (pae_root)
> +		free_page((unsigned long)pae_root);
> +	if (pml4_root)
> +		free_page((unsigned long)pml4_root);
> +	if (pml5_root)
> +		free_page((unsigned long)pml5_root);

This is flawed as failure to allocate pml4_root will consume an uninitialized
pml5_root.  There's also no need to check for non-NULL values as free_page plays
nice with NULL pointers.

If you drop the unnecessary goto for pae_root allocation failure, then this can
become:

err_out:
	free_page((unsigned long)pml4_root);
	free_page((unsigned long)pae_root);

since pml4_root will be NULL if the pml4_root allocation fails.  IMO that's
unnecessarily clever though, and a more standard:

err_pml5:
	free_page((unsigned long)pml4_root);
err_pml4:
	free_page((unsigned long)pae_root);
	return -ENOMEM;

would be far easier to read/maintain.

> +
> +	return -ENOMEM;
>  }
>  
>  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
> @@ -5320,6 +5339,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
>  		set_memory_encrypted((unsigned long)mmu->pae_root, 1);
>  	free_page((unsigned long)mmu->pae_root);
>  	free_page((unsigned long)mmu->pml4_root);
> +	free_page((unsigned long)mmu->pml5_root);
>  }
>  
>  static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
> -- 
> 2.31.1
> 


* Re: [PATCH v1 2/3] KVM: x86: Handle the case of 5-level shadow page table
  2021-08-06 17:58   ` Sean Christopherson
@ 2021-08-08 17:49     ` Wei Huang
  0 siblings, 0 replies; 11+ messages in thread
From: Wei Huang @ 2021-08-08 17:49 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: kvm, linux-kernel, pbonzini, vkuznets, wanpengli, jmattson, joro,
	tglx, mingo, bp, x86, hpa



On 8/6/21 12:58 PM, Sean Christopherson wrote:
> On Thu, Aug 05, 2021, Wei Huang wrote:
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 44e4561e41f5..b162c3e530aa 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -3428,7 +3428,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>>   	 * the shadow page table may be a PAE or a long mode page table.
>>   	 */
>>   	pm_mask = PT_PRESENT_MASK | shadow_me_mask;
>> -	if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
>> +	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
>>   		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
>>   
>>   		if (WARN_ON_ONCE(!mmu->pml4_root)) {
>> @@ -3454,11 +3454,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>>   				      PT32_ROOT_LEVEL, false);
>>   		mmu->pae_root[i] = root | pm_mask;
>>   	}
>> +	mmu->root_hpa = __pa(mmu->pae_root);
>>   
>> -	if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
>> +	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
>> +		mmu->pml4_root[0] = mmu->root_hpa | pm_mask;
>>   		mmu->root_hpa = __pa(mmu->pml4_root);
>> -	else
>> -		mmu->root_hpa = __pa(mmu->pae_root);
>> +	}
>> +
>> +	if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
>> +		mmu->pml5_root[0] = mmu->root_hpa | pm_mask;
>> +		mmu->root_hpa = __pa(mmu->pml5_root);
>> +	}
> 
> Ouch, the root_hpa chaining is subtle.  That's my fault :-)  I think it would be
> better to explicitly chain pae->pml4->pml5?  E.g.
> 
> 	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
> 		mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
> 
> 		if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
> 			mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
> 			mmu->root_hpa = __pa(mmu->pml5_root);
> 		} else {
> 			mmu->root_hpa = __pa(mmu->pml4_root);
> 		}
> 	} else {
> 		mmu->root_hpa = __pa(mmu->pae_root);
> 	}
> 
> It'd require more churn if we get to 6-level paging, but that's a risk I'm willing
> to take ;-)
> 

Thanks for the review. This part of the code is indeed subtle. The chaining
trick will be easier to understand with a proper explanation. My proposal is
to keep the original approach, but add more comments to this code:

       /*
        * Depending on the shadow_root_level, build the root_hpa table by
        * chaining either pml5->pml4->pae or pml4->pae.
        */
       mmu->root_hpa = __pa(mmu->pae_root);
       if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
               mmu->pml4_root[0] = mmu->root_hpa | pm_mask;
               mmu->root_hpa = __pa(mmu->pml4_root);
       }
       if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
               mmu->pml5_root[0] = mmu->root_hpa | pm_mask;
               mmu->root_hpa = __pa(mmu->pml5_root);
       }

This code will be easy to extend to a 6-level page table (if needed) in
the future.

>>   
>>   set_root_pgd:
>>   	mmu->root_pgd = root_pgd;
>> @@ -3471,7 +3477,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>>   static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
>>   {
>>   	struct kvm_mmu *mmu = vcpu->arch.mmu;
>> -	u64 *pml4_root, *pae_root;
>> +	u64 *pml5_root, *pml4_root, *pae_root;
>>   
>>   	/*
>>   	 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
>> @@ -3487,17 +3493,18 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
>>   	 * This mess only works with 4-level paging and needs to be updated to
>>   	 * work with 5-level paging.
>>   	 */
>> -	if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
>> +	if (WARN_ON_ONCE(mmu->shadow_root_level < PT64_ROOT_4LEVEL)) {
> 
> This is amusingly wrong.  The check above this is:
> 
> 	if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
> 	    mmu->shadow_root_level < PT64_ROOT_4LEVEL)  <--------
> 		return 0;
> 
> meaning this is dead code.  It should simply be deleted.  If we reaaaaaly wanted to
> future proof the code, we could do:
> 
> 	if (WARN_ON_ONCE(mmu->shadow_root_level > PT64_ROOT_5LEVEL))
> 		return -EIO;
> 
> but at that point we're looking at a completely different architecture, so I don't
> think we need to be that paranoid :-)

You are right that this can be removed.

> 
>>   		return -EIO;
>> +	}
>>   
>> -	if (mmu->pae_root && mmu->pml4_root)
>> +	if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)
>>   		return 0;
>>   
>>   	/*
>>   	 * The special roots should always be allocated in concert.  Yell and
>>   	 * bail if KVM ends up in a state where only one of the roots is valid.
>>   	 */
>> -	if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
>> +	if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || mmu->pml5_root))
>>   		return -EIO;
>>   
>>   	/*
>> @@ -3506,18 +3513,30 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
>>   	 */
>>   	pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
>>   	if (!pae_root)
>> -		return -ENOMEM;
>> +		goto err_out;
> 
> Branching to the error handling here is silly, it's the first allocation.
> 
>>   
>>   	pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
>> -	if (!pml4_root) {
>> -		free_page((unsigned long)pae_root);
>> -		return -ENOMEM;
>> -	}
>> +	if (!pml4_root)
>> +		goto err_out;
>> +
>> +	pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
> 
> This should be guarded by "mmu->shadow_root_level > PT64_ROOT_4LEVEL", there's no
> need to waste a page on PML5 if it can't exist.

Will do

> 
>> +	if (!pml5_root)
>> +		goto err_out;
>>   
>>   	mmu->pae_root = pae_root;
>>   	mmu->pml4_root = pml4_root;
>> +	mmu->pml5_root = pml5_root;
>>   
>>   	return 0;
>> +err_out:
>> +	if (pae_root)
>> +		free_page((unsigned long)pae_root);
>> +	if (pml4_root)
>> +		free_page((unsigned long)pml4_root);
>> +	if (pml5_root)
>> +		free_page((unsigned long)pml5_root);
> 
> This is flawed as failure to allocate pml4_root will consume an uninitialized
> pml5_root.  There's also no need to check for non-NULL values as free_page plays
> nice with NULL pointers.
> 
> If you drop the unnecessary goto for pae_root allocation failure, then this can
> become:
> 
> err_out:
> 	free_page((unsigned long)pml4_root);
> 	free_page((unsigned long)pae_root);
> 
> since pml4_root will be NULL if the pml4_root allocation fails.  IMO that's
> unnecessarily clever though, and a more standard:
> 
> err_pml5:
> 	free_page((unsigned long)pml4_root);
> err_pml4:
> 	free_page((unsigned long)pae_root);
> 	return -ENOMEM;
> 
> would be far easier to read/maintain.
> 

I will take the advice for this part of code.

>> +
>> +	return -ENOMEM;
>>   }
>>   
>>   void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
>> @@ -5320,6 +5339,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
>>   		set_memory_encrypted((unsigned long)mmu->pae_root, 1);
>>   	free_page((unsigned long)mmu->pae_root);
>>   	free_page((unsigned long)mmu->pml4_root);
>> +	free_page((unsigned long)mmu->pml5_root);
>>   }
>>   
>>   static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
>> -- 
>> 2.31.1
>>


* Re: [PATCH v1 1/3] KVM: x86: Convert TDP level calculation to vendor's specific code
  2021-08-05 21:51   ` Sean Christopherson
  2021-08-05 22:26     ` Wei Huang
@ 2021-08-08 19:30     ` Wei Huang
  1 sibling, 0 replies; 11+ messages in thread
From: Wei Huang @ 2021-08-08 19:30 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: kvm, linux-kernel, pbonzini, vkuznets, wanpengli, jmattson, joro,
	tglx, mingo, bp, x86, hpa



On 8/5/21 4:51 PM, Sean Christopherson wrote:
> 
> If we want to keep the MAXPHYADDR behavior, I'd vote for something like:
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index b4b65c21b2ca..7e35f2bf89b4 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
>   bool tdp_enabled = false;
> 
>   static int max_huge_page_level __read_mostly;
> +static int tdp_root_level __read_mostly;
>   static int max_tdp_level __read_mostly;
> 
>   enum {
> @@ -4645,6 +4646,9 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
> 
>   static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
>   {
> +       if (tdp_root_level)
> +               return tdp_root_level;
> +
>          /* Use 5-level TDP if and only if it's useful/necessary. */
>          if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
>                  return 4;
> @@ -5336,10 +5340,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
>           */
>   }
> 
> -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
> -                      int tdp_huge_page_level)
> +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
> +                      int tdp_max_root_level, int tdp_huge_page_level)
>   {
>          tdp_enabled = enable_tdp;
> +       tdp_root_level = tdp_forced_root_level;
>          max_tdp_level = tdp_max_root_level;
> 
>          /*
> 

I decided to take this suggestion in v2: it avoids using a 5-level table
(memory cost and potential table-walk overhead) when the host has the
flexibility of using 4-level NPT under LA57.
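
For reference, with that signature the two call sites would end up roughly
as below (sketch only; the exact helper names may change in v2):

	/* SVM: the NPT level is dictated by the host paging mode, so force it. */
	kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G);

	/* VMX: no forced level; keep the per-vCPU MAXPHYADDR-based choice. */
	kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), ept_lpage_level);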

