* [PATCH v2 00/25] KVM MMU refactoring part 2: role changes
@ 2022-02-21 16:22 Paolo Bonzini
  2022-02-21 16:22 ` [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3 Paolo Bonzini
                   ` (24 more replies)
  0 siblings, 25 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

This is the MMU root role vs. CPU mode split.  While not strictly a
requirement for further work, it makes it much clearer to reason about
whether the previous root is still valid.
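
Concretely, after patch 8 struct kvm_mmu carries two separate unions (the
second field is later renamed to root_role), and a context is reinitialized
only when one of them changes.  A condensed sketch, not the exact code:

	/* in struct kvm_mmu: */
	union kvm_mmu_role cpu_mode;	/* how the guest translates: CR0/CR4/EFER snapshot */
	union kvm_mmu_role mmu_role;	/* how KVM's shadow/TDP page tables are laid out */

	/* in the init_kvm_*mmu functions, against freshly computed values: */
	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
	    mmu_role.as_u64 == context->mmu_role.as_u64)
		return;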

Paolo

Paolo Bonzini (25):
  KVM: x86/mmu: avoid indirect call for get_cr3
  KVM: x86/mmu: nested EPT cannot be used in SMM
  KVM: x86/mmu: constify uses of struct kvm_mmu_role_regs
  KVM: x86/mmu: pull computation of kvm_mmu_role_regs to kvm_init_mmu
  KVM: x86/mmu: rephrase unclear comment
  KVM: nVMX/nSVM: do not monkey-patch inject_page_fault callback
  KVM: x86/mmu: remove "bool base_only" arguments
  KVM: x86/mmu: split cpu_mode from mmu_role
  KVM: x86/mmu: do not recompute root level from kvm_mmu_role_regs
  KVM: x86/mmu: remove ept_ad field
  KVM: x86/mmu: remove kvm_calc_shadow_root_page_role_common
  KVM: x86/mmu: cleanup computation of MMU roles for two-dimensional
    paging
  KVM: x86/mmu: cleanup computation of MMU roles for shadow paging
  KVM: x86/mmu: store shadow EFER.NX in the MMU role
  KVM: x86/mmu: remove extended bits from mmu_role, rename field
  KVM: x86/mmu: rename kvm_mmu_role union
  KVM: x86/mmu: remove redundant bits from extended role
  KVM: x86/mmu: remove valid from extended role
  KVM: x86/mmu: simplify and/or inline computation of shadow MMU roles
  KVM: x86/mmu: pull CPU mode computation to kvm_init_mmu
  KVM: x86/mmu: replace shadow_root_level with root_role.level
  KVM: x86/mmu: replace root_level with cpu_mode.base.level
  KVM: x86/mmu: replace direct_map with root_role.direct
  KVM: x86/mmu: initialize constant-value fields just once
  KVM: x86/mmu: extract initialization of the page walking data

 arch/x86/include/asm/kvm_host.h |  25 +-
 arch/x86/kvm/mmu.h              |  12 +-
 arch/x86/kvm/mmu/mmu.c          | 479 +++++++++++++-------------------
 arch/x86/kvm/mmu/paging_tmpl.h  |  16 +-
 arch/x86/kvm/mmu/tdp_mmu.c      |   4 +-
 arch/x86/kvm/svm/nested.c       |  13 +-
 arch/x86/kvm/svm/svm.c          |   2 +-
 arch/x86/kvm/vmx/nested.c       |   9 +-
 arch/x86/kvm/vmx/vmx.c          |   2 +-
 arch/x86/kvm/x86.c              |  31 ++-
 10 files changed, 260 insertions(+), 333 deletions(-)

-- 
2.31.1


* [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 16:16   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 02/25] KVM: x86/mmu: nested EPT cannot be used in SMM Paolo Bonzini
                   ` (23 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

Most of the time, calls to get_guest_pgd result in calling kvm_read_cr3
(the only exception is nested TDP).  When retpolines are enabled, check
for that common case explicitly, thus avoiding an expensive indirect call.
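
To make that comparison possible, get_cr3 is renamed to kvm_get_guest_cr3
and loses its static qualifier, so that the inline helper in mmu.h can
compare the function pointer against it.  Condensed, the helper amounts to
(see the diff below for the real code):

	if (mmu->get_guest_pgd == kvm_get_guest_cr3)
		return kvm_read_cr3(vcpu);	/* direct call, no retpoline */
	return mmu->get_guest_pgd(vcpu);	/* nested TDP keeps the indirect call */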

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.h             | 10 ++++++++++
 arch/x86/kvm/mmu/mmu.c         | 15 ++++++++-------
 arch/x86/kvm/mmu/paging_tmpl.h |  2 +-
 arch/x86/kvm/x86.c             |  2 +-
 4 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1d0c1904d69a..6ee4436e46f1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -116,6 +116,16 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
 					  vcpu->arch.mmu->shadow_root_level);
 }
 
+extern unsigned long kvm_get_guest_cr3(struct kvm_vcpu *vcpu);
+static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+#ifdef CONFIG_RETPOLINE
+	if (mmu->get_guest_pgd == kvm_get_guest_cr3)
+		return kvm_read_cr3(vcpu);
+#endif
+	return mmu->get_guest_pgd(vcpu);
+}
+
 struct kvm_page_fault {
 	/* arguments to kvm_mmu_do_page_fault.  */
 	const gpa_t addr;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b2c1c4eb6007..7051040e15b3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3435,7 +3435,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	unsigned i;
 	int r;
 
-	root_pgd = mmu->get_guest_pgd(vcpu);
+	root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
 	root_gfn = root_pgd >> PAGE_SHIFT;
 
 	if (mmu_check_root(vcpu, root_gfn))
@@ -3854,12 +3854,13 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 				    gfn_t gfn)
 {
+	struct kvm_mmu *mmu = vcpu->arch.mmu;
 	struct kvm_arch_async_pf arch;
 
 	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
 	arch.gfn = gfn;
-	arch.direct_map = vcpu->arch.mmu->direct_map;
-	arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
+	arch.direct_map = mmu->direct_map;
+	arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, mmu);
 
 	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
 				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
@@ -4208,7 +4209,7 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
 
-static unsigned long get_cr3(struct kvm_vcpu *vcpu)
+unsigned long kvm_get_guest_cr3(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr3(vcpu);
 }
@@ -4767,7 +4768,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->invlpg = NULL;
 	context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
 	context->direct_map = true;
-	context->get_guest_pgd = get_cr3;
+	context->get_guest_pgd = kvm_get_guest_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
 	context->root_level = role_regs_to_root_level(&regs);
@@ -4942,7 +4943,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 
 	kvm_init_shadow_mmu(vcpu, &regs);
 
-	context->get_guest_pgd     = get_cr3;
+	context->get_guest_pgd	   = kvm_get_guest_cr3;
 	context->get_pdptr         = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
 }
@@ -4974,7 +4975,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 		return;
 
 	g_context->mmu_role.as_u64 = new_role.as_u64;
-	g_context->get_guest_pgd     = get_cr3;
+	g_context->get_guest_pgd     = kvm_get_guest_cr3;
 	g_context->get_pdptr         = kvm_pdptr_read;
 	g_context->inject_page_fault = kvm_inject_page_fault;
 	g_context->root_level        = new_role.base.level;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 252c77805eb9..80b4b291002a 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -362,7 +362,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
 	walker->level = mmu->root_level;
-	pte           = mmu->get_guest_pgd(vcpu);
+	pte           = kvm_mmu_get_guest_pgd(vcpu, mmu);
 	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
 
 #if PTTYPE == 64
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6552360d8888..da33d3a88a8d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12190,7 +12190,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 		return;
 
 	if (!vcpu->arch.mmu->direct_map &&
-	      work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
+	      work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
 		return;
 
 	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
-- 
2.31.1



* [PATCH v2 02/25] KVM: x86/mmu: nested EPT cannot be used in SMM
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
  2022-02-21 16:22 ` [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3 Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 16:18   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 03/25] KVM: x86/mmu: constify uses of struct kvm_mmu_role_regs Paolo Bonzini
                   ` (22 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

The role.base.smm flag is always zero when setting up shadow EPT, so do
not bother copying it over from vcpu->arch.root_mmu.

Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7051040e15b3..4f9bbd02fb8b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4886,9 +4886,11 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
 {
 	union kvm_mmu_role role = {0};
 
-	/* SMM flag is inherited from root_mmu */
-	role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
-
+	/*
+	 * KVM does not support SMM transfer monitors, and consequently does not
+	 * support the "entry to SMM" control either.  role.base.smm is always 0.
+	 */
+	WARN_ON_ONCE(is_smm(vcpu));
 	role.base.level = level;
 	role.base.has_4_byte_gpte = false;
 	role.base.direct = false;
-- 
2.31.1



* [PATCH v2 03/25] KVM: x86/mmu: constify uses of struct kvm_mmu_role_regs
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
  2022-02-21 16:22 ` [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3 Paolo Bonzini
  2022-02-21 16:22 ` [PATCH v2 02/25] KVM: x86/mmu: nested EPT cannot be used in SMM Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 16:22   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 04/25] KVM: x86/mmu: pull computation of kvm_mmu_role_regs to kvm_init_mmu Paolo Bonzini
                   ` (21 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

struct kvm_mmu_role_regs is computed just once and then only read.  Use
const to make this clearer, even though the const fields of struct
kvm_mmu_role_regs already prevent modifications to the contents of the
struct, or at least make them harder.

Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4f9bbd02fb8b..97566ac539e3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -197,7 +197,7 @@ struct kvm_mmu_role_regs {
  * the single source of truth for the MMU's state.
  */
 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
-static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
+static inline bool __maybe_unused ____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)\
 {									\
 	return !!(regs->reg & flag);					\
 }
@@ -244,7 +244,7 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
 	return regs;
 }
 
-static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs)
+static int role_regs_to_root_level(const struct kvm_mmu_role_regs *regs)
 {
 	if (!____is_cr0_pg(regs))
 		return 0;
@@ -4681,7 +4681,7 @@ static void paging32_init_context(struct kvm_mmu *context)
 }
 
 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
-							 struct kvm_mmu_role_regs *regs)
+							 const struct kvm_mmu_role_regs *regs)
 {
 	union kvm_mmu_extended_role ext = {0};
 
@@ -4704,7 +4704,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
 }
 
 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
-						   struct kvm_mmu_role_regs *regs,
+						   const struct kvm_mmu_role_regs *regs,
 						   bool base_only)
 {
 	union kvm_mmu_role role = {0};
@@ -4740,7 +4740,8 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 
 static union kvm_mmu_role
 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
-				struct kvm_mmu_role_regs *regs, bool base_only)
+				const struct kvm_mmu_role_regs *regs,
+				bool base_only)
 {
 	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
 
@@ -4786,7 +4787,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 
 static union kvm_mmu_role
 kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
-				      struct kvm_mmu_role_regs *regs, bool base_only)
+				      const struct kvm_mmu_role_regs *regs,
+				      bool base_only)
 {
 	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
 
@@ -4799,7 +4801,8 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
 
 static union kvm_mmu_role
 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
-				   struct kvm_mmu_role_regs *regs, bool base_only)
+				   const struct kvm_mmu_role_regs *regs,
+				   bool base_only)
 {
 	union kvm_mmu_role role =
 		kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only);
@@ -4817,7 +4820,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
 }
 
 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
-				    struct kvm_mmu_role_regs *regs,
+				    const struct kvm_mmu_role_regs *regs,
 				    union kvm_mmu_role new_role)
 {
 	if (new_role.as_u64 == context->mmu_role.as_u64)
@@ -4840,7 +4843,7 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 }
 
 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
-				struct kvm_mmu_role_regs *regs)
+				const struct kvm_mmu_role_regs *regs)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_role new_role =
@@ -4851,7 +4854,7 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
 
 static union kvm_mmu_role
 kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
-				   struct kvm_mmu_role_regs *regs)
+				   const struct kvm_mmu_role_regs *regs)
 {
 	union kvm_mmu_role role =
 		kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
@@ -4951,7 +4954,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 }
 
 static union kvm_mmu_role
-kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs)
+kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 {
 	union kvm_mmu_role role;
 
-- 
2.31.1



* [PATCH v2 04/25] KVM: x86/mmu: pull computation of kvm_mmu_role_regs to kvm_init_mmu
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (2 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 03/25] KVM: x86/mmu: constify uses of struct kvm_mmu_role_regs Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-02-21 16:22 ` [PATCH v2 05/25] KVM: x86/mmu: rephrase unclear comment Paolo Bonzini
                   ` (20 subsequent siblings)
  24 siblings, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

The init_kvm_*mmu functions, with the exception of shadow NPT,
do not need to know the full values of CR0/CR4/EFER; they only
need to know the bits that make up the "role".  This cleanup
however will take quite a few incremental steps.  As a start,
pull the common computation of the struct kvm_mmu_role_regs
into their caller: all of them extract the struct from the vcpu
as the very first step.

Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 97566ac539e3..0e393506f4df 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4753,12 +4753,12 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 	return role;
 }
 
-static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
+			     const struct kvm_mmu_role_regs *regs)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
 	union kvm_mmu_role new_role =
-		kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, false);
+		kvm_calc_tdp_mmu_root_page_role(vcpu, regs, false);
 
 	if (new_role.as_u64 == context->mmu_role.as_u64)
 		return;
@@ -4772,7 +4772,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->get_guest_pgd = kvm_get_guest_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
-	context->root_level = role_regs_to_root_level(&regs);
+	context->root_level = role_regs_to_root_level(regs);
 
 	if (!is_cr0_pg(context))
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -4941,12 +4941,12 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
-static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
+			     const struct kvm_mmu_role_regs *regs)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
 
-	kvm_init_shadow_mmu(vcpu, &regs);
+	kvm_init_shadow_mmu(vcpu, regs);
 
 	context->get_guest_pgd	   = kvm_get_guest_cr3;
 	context->get_pdptr         = kvm_pdptr_read;
@@ -4970,10 +4970,9 @@ kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *
 	return role;
 }
 
-static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 {
-	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
-	union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, &regs);
+	union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, regs);
 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
 	if (new_role.as_u64 == g_context->mmu_role.as_u64)
@@ -5013,12 +5012,14 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 
 void kvm_init_mmu(struct kvm_vcpu *vcpu)
 {
+	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
+
 	if (mmu_is_nested(vcpu))
-		init_kvm_nested_mmu(vcpu);
+		init_kvm_nested_mmu(vcpu, &regs);
 	else if (tdp_enabled)
-		init_kvm_tdp_mmu(vcpu);
+		init_kvm_tdp_mmu(vcpu, &regs);
 	else
-		init_kvm_softmmu(vcpu);
+		init_kvm_softmmu(vcpu, &regs);
 }
 EXPORT_SYMBOL_GPL(kvm_init_mmu);
 
-- 
2.31.1



* [PATCH v2 05/25] KVM: x86/mmu: rephrase unclear comment
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (3 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 04/25] KVM: x86/mmu: pull computation of kvm_mmu_role_regs to kvm_init_mmu Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 16:39   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 06/25] KVM: nVMX/nSVM: do not monkey-patch inject_page_fault callback Paolo Bonzini
                   ` (19 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

If accessed bits are not supported, there simply isn't any distinction
between accessed and non-accessed gPTEs, so the comment does not make
much sense.  Rephrase it in terms of what happens if accessed bits
*are* supported.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/paging_tmpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 80b4b291002a..d1d17d28e81b 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -193,7 +193,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 	if (!FNAME(is_present_gpte)(gpte))
 		goto no_present;
 
-	/* if accessed bit is not supported prefetch non accessed gpte */
+	/* if accessed bit is supported, prefetch only accessed gpte */
 	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
 	    !(gpte & PT_GUEST_ACCESSED_MASK))
 		goto no_present;
-- 
2.31.1



* [PATCH v2 06/25] KVM: nVMX/nSVM: do not monkey-patch inject_page_fault callback
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (4 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 05/25] KVM: x86/mmu: rephrase unclear comment Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 17:13   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 07/25] KVM: x86/mmu: remove "bool base_only" arguments Paolo Bonzini
                   ` (18 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

Currently, vendor code patches the inject_page_fault callback and later,
on vmexit, expects kvm_init_mmu to restore it.

This is brittle, as exposed by the fact that SVM's KVM_SET_NESTED_STATE
forgets to do it.  Instead, do the check at the time a page fault actually
has to be injected.  This does incur the cost of an extra retpoline
for nested vmexits when TDP is disabled, but is overall much cleaner.
While at it, add a comment that explains why the different behavior
is needed in this case.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 +++
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/svm/nested.c       |  4 +---
 arch/x86/kvm/vmx/nested.c       |  4 +---
 arch/x86/kvm/x86.c              | 17 +++++++++++++++++
 5 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 713e08f62385..92855d3984a7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1508,6 +1508,8 @@ struct kvm_x86_nested_ops {
 	int (*enable_evmcs)(struct kvm_vcpu *vcpu,
 			    uint16_t *vmcs_version);
 	uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
+	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+				  struct x86_exception *fault);
 };
 
 struct kvm_x86_init_ops {
@@ -1747,6 +1749,7 @@ void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long pay
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+void kvm_inject_page_fault_shadow(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 				    struct x86_exception *fault);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0e393506f4df..f3494dcc4e2f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4950,7 +4950,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
 
 	context->get_guest_pgd	   = kvm_get_guest_cr3;
 	context->get_pdptr         = kvm_pdptr_read;
-	context->inject_page_fault = kvm_inject_page_fault;
+	context->inject_page_fault = kvm_inject_page_fault_shadow;
 }
 
 static union kvm_mmu_role
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 96bab464967f..ff58c9ebc552 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -680,9 +680,6 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
 	if (ret)
 		return ret;
 
-	if (!npt_enabled)
-		vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
-
 	if (!from_vmrun)
 		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
@@ -1571,4 +1568,5 @@ struct kvm_x86_nested_ops svm_nested_ops = {
 	.get_nested_state_pages = svm_get_nested_state_pages,
 	.get_state = svm_get_nested_state,
 	.set_state = svm_set_nested_state,
+	.inject_page_fault = svm_inject_page_fault_nested,
 };
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1dfe23963a9e..564c60566da7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2615,9 +2615,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
 	}
 
-	if (!enable_ept)
-		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
-
 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
 	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
 				     vmcs12->guest_ia32_perf_global_ctrl))) {
@@ -6807,4 +6804,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
 	.write_log_dirty = nested_vmx_write_pml_buffer,
 	.enable_evmcs = nested_enable_evmcs,
 	.get_evmcs_version = nested_get_evmcs_version,
+	.inject_page_fault = vmx_inject_page_fault_nested,
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index da33d3a88a8d..1546a25a9307 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -746,6 +746,23 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
+void kvm_inject_page_fault_shadow(struct kvm_vcpu *vcpu,
+				  struct x86_exception *fault)
+{
+	/*
+	 * The core exception injection code is not able to combine
+	 * an exception with a vmexit; if a page fault happens while
+	 * a page fault exception is being delivered, the original
+	 * page fault would be changed incorrectly into a double
+	 * fault.  To work around this, #PF vmexits are injected
+	 * without going through kvm_queue_exception.
+	 */
+	if (unlikely(is_guest_mode(vcpu)))
+		kvm_x86_ops.nested_ops->inject_page_fault(vcpu, fault);
+	else
+		kvm_inject_page_fault(vcpu, fault);
+}
+
 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 				    struct x86_exception *fault)
 {
-- 
2.31.1



* [PATCH v2 07/25] KVM: x86/mmu: remove "bool base_only" arguments
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (5 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 06/25] KVM: nVMX/nSVM: do not monkey-patch inject_page_fault callback Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 17:15   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role Paolo Bonzini
                   ` (17 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

The argument is always false now that kvm_mmu_calc_root_page_role has
been removed.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 66 +++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 43 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f3494dcc4e2f..7c835253a330 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4680,47 +4680,30 @@ static void paging32_init_context(struct kvm_mmu *context)
 	context->direct_map = false;
 }
 
-static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
-							 const struct kvm_mmu_role_regs *regs)
-{
-	union kvm_mmu_extended_role ext = {0};
-
-	if (____is_cr0_pg(regs)) {
-		ext.cr0_pg = 1;
-		ext.cr4_pae = ____is_cr4_pae(regs);
-		ext.cr4_smep = ____is_cr4_smep(regs);
-		ext.cr4_smap = ____is_cr4_smap(regs);
-		ext.cr4_pse = ____is_cr4_pse(regs);
-
-		/* PKEY and LA57 are active iff long mode is active. */
-		ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
-		ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
-		ext.efer_lma = ____is_efer_lma(regs);
-	}
-
-	ext.valid = 1;
-
-	return ext;
-}
-
 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
-						   const struct kvm_mmu_role_regs *regs,
-						   bool base_only)
+						   const struct kvm_mmu_role_regs *regs)
 {
 	union kvm_mmu_role role = {0};
 
 	role.base.access = ACC_ALL;
 	if (____is_cr0_pg(regs)) {
+		role.ext.cr0_pg = 1;
 		role.base.efer_nx = ____is_efer_nx(regs);
 		role.base.cr0_wp = ____is_cr0_wp(regs);
+
+		role.ext.cr4_pae = ____is_cr4_pae(regs);
+		role.ext.cr4_smep = ____is_cr4_smep(regs);
+		role.ext.cr4_smap = ____is_cr4_smap(regs);
+		role.ext.cr4_pse = ____is_cr4_pse(regs);
+
+		/* PKEY and LA57 are active iff long mode is active. */
+		role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
+		role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+		role.ext.efer_lma = ____is_efer_lma(regs);
 	}
 	role.base.smm = is_smm(vcpu);
 	role.base.guest_mode = is_guest_mode(vcpu);
-
-	if (base_only)
-		return role;
-
-	role.ext = kvm_calc_mmu_role_ext(vcpu, regs);
+	role.ext.valid = 1;
 
 	return role;
 }
@@ -4740,10 +4723,9 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 
 static union kvm_mmu_role
 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
-				const struct kvm_mmu_role_regs *regs,
-				bool base_only)
+				const struct kvm_mmu_role_regs *regs)
 {
-	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
+	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs);
 
 	role.base.ad_disabled = (shadow_accessed_mask == 0);
 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
@@ -4758,7 +4740,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_role new_role =
-		kvm_calc_tdp_mmu_root_page_role(vcpu, regs, false);
+		kvm_calc_tdp_mmu_root_page_role(vcpu, regs);
 
 	if (new_role.as_u64 == context->mmu_role.as_u64)
 		return;
@@ -4787,10 +4769,9 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 
 static union kvm_mmu_role
 kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
-				      const struct kvm_mmu_role_regs *regs,
-				      bool base_only)
+				      const struct kvm_mmu_role_regs *regs)
 {
-	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
+	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs);
 
 	role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
 	role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
@@ -4801,11 +4782,10 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
 
 static union kvm_mmu_role
 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
-				   const struct kvm_mmu_role_regs *regs,
-				   bool base_only)
+				   const struct kvm_mmu_role_regs *regs)
 {
 	union kvm_mmu_role role =
-		kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only);
+		kvm_calc_shadow_root_page_role_common(vcpu, regs);
 
 	role.base.direct = !____is_cr0_pg(regs);
 
@@ -4847,7 +4827,7 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_role new_role =
-		kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false);
+		kvm_calc_shadow_mmu_root_page_role(vcpu, regs);
 
 	shadow_mmu_init_context(vcpu, context, regs, new_role);
 }
@@ -4857,7 +4837,7 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
 				   const struct kvm_mmu_role_regs *regs)
 {
 	union kvm_mmu_role role =
-		kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
+		kvm_calc_shadow_root_page_role_common(vcpu, regs);
 
 	role.base.direct = false;
 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
@@ -4958,7 +4938,7 @@ kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *
 {
 	union kvm_mmu_role role;
 
-	role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
+	role = kvm_calc_shadow_root_page_role_common(vcpu, regs);
 
 	/*
 	 * Nested MMUs are used only for walking L2's gva->gpa, they never have
-- 
2.31.1



* [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (6 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 07/25] KVM: x86/mmu: remove "bool base_only" arguments Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 17:36   ` Sean Christopherson
  2022-03-08 18:55   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 09/25] KVM: x86/mmu: do not recompute root level from kvm_mmu_role_regs Paolo Bonzini
                   ` (16 subsequent siblings)
  24 siblings, 2 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

Snapshot the state of the processor registers that govern the page walk
into a new field of struct kvm_mmu.  This is a more natural representation
than having it *mostly* in mmu_role but not exclusively; the delta
right now is represented in other fields, such as root_level.

The nested MMU now has only the CPU mode; in fact the new function
kvm_calc_cpu_mode is analogous to the previous kvm_calc_nested_mmu_role,
except that role.base.direct is now !CR0.PG instead of always true.  It is
not clear what the code meant by "setting role.base.direct to true to
detect bogus usage of the nested MMU".
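
The accessors used throughout the MMU code keep their names but now read
the snapshot; for instance, BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp) expands
to roughly:

	static inline bool is_cr0_wp(struct kvm_mmu *mmu)
	{
		return !!(mmu->cpu_mode.base.cr0_wp);
	}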

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/kvm/mmu/mmu.c          | 107 ++++++++++++++++++++------------
 arch/x86/kvm/mmu/paging_tmpl.h  |   2 +-
 3 files changed, 68 insertions(+), 42 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 92855d3984a7..cc268116eb3f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -433,6 +433,7 @@ struct kvm_mmu {
 			 struct kvm_mmu_page *sp);
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
 	struct kvm_mmu_root_info root;
+	union kvm_mmu_role cpu_mode;
 	union kvm_mmu_role mmu_role;
 	u8 root_level;
 	u8 shadow_root_level;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7c835253a330..1af898f0cf87 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -221,7 +221,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)		\
 static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
 {								\
-	return !!(mmu->mmu_role. base_or_ext . reg##_##name);	\
+	return !!(mmu->cpu_mode. base_or_ext . reg##_##name);	\
 }
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr0, pg);
 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
@@ -4680,6 +4680,39 @@ static void paging32_init_context(struct kvm_mmu *context)
 	context->direct_map = false;
 }
 
+static union kvm_mmu_role
+kvm_calc_cpu_mode(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
+{
+	union kvm_mmu_role role = {0};
+
+	role.base.access = ACC_ALL;
+	role.base.smm = is_smm(vcpu);
+	role.base.guest_mode = is_guest_mode(vcpu);
+	role.base.direct = !____is_cr0_pg(regs);
+	if (!role.base.direct) {
+		role.base.efer_nx = ____is_efer_nx(regs);
+		role.base.cr0_wp = ____is_cr0_wp(regs);
+		role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
+		role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
+		role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
+		role.base.level = role_regs_to_root_level(regs);
+
+		role.ext.cr0_pg = 1;
+		role.ext.cr4_pae = ____is_cr4_pae(regs);
+		role.ext.cr4_smep = ____is_cr4_smep(regs);
+		role.ext.cr4_smap = ____is_cr4_smap(regs);
+		role.ext.cr4_pse = ____is_cr4_pse(regs);
+
+		/* PKEY and LA57 are active iff long mode is active. */
+		role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
+		role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+		role.ext.efer_lma = ____is_efer_lma(regs);
+	}
+
+	role.ext.valid = 1;
+	return role;
+}
+
 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
 						   const struct kvm_mmu_role_regs *regs)
 {
@@ -4739,13 +4772,16 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 			     const struct kvm_mmu_role_regs *regs)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-	union kvm_mmu_role new_role =
+	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
+	union kvm_mmu_role mmu_role =
 		kvm_calc_tdp_mmu_root_page_role(vcpu, regs);
 
-	if (new_role.as_u64 == context->mmu_role.as_u64)
+	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
+	    mmu_role.as_u64 == context->mmu_role.as_u64)
 		return;
 
-	context->mmu_role.as_u64 = new_role.as_u64;
+	context->cpu_mode.as_u64 = cpu_mode.as_u64;
+	context->mmu_role.as_u64 = mmu_role.as_u64;
 	context->page_fault = kvm_tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = NULL;
@@ -4800,13 +4836,15 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
 }
 
 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
-				    const struct kvm_mmu_role_regs *regs,
-				    union kvm_mmu_role new_role)
+				    union kvm_mmu_role cpu_mode,
+				    union kvm_mmu_role mmu_role)
 {
-	if (new_role.as_u64 == context->mmu_role.as_u64)
+	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
+	    mmu_role.as_u64 == context->mmu_role.as_u64)
 		return;
 
-	context->mmu_role.as_u64 = new_role.as_u64;
+	context->cpu_mode.as_u64 = cpu_mode.as_u64;
+	context->mmu_role.as_u64 = mmu_role.as_u64;
 
 	if (!is_cr0_pg(context))
 		nonpaging_init_context(context);
@@ -4814,10 +4852,10 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 		paging64_init_context(context);
 	else
 		paging32_init_context(context);
-	context->root_level = role_regs_to_root_level(regs);
+	context->root_level = cpu_mode.base.level;
 
 	reset_guest_paging_metadata(vcpu, context);
-	context->shadow_root_level = new_role.base.level;
+	context->shadow_root_level = mmu_role.base.level;
 
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
@@ -4826,10 +4864,11 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
 				const struct kvm_mmu_role_regs *regs)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-	union kvm_mmu_role new_role =
+	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
+	union kvm_mmu_role mmu_role =
 		kvm_calc_shadow_mmu_root_page_role(vcpu, regs);
 
-	shadow_mmu_init_context(vcpu, context, regs, new_role);
+	shadow_mmu_init_context(vcpu, context, cpu_mode, mmu_role);
 }
 
 static union kvm_mmu_role
@@ -4854,11 +4893,10 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 		.cr4 = cr4 & ~X86_CR4_PKE,
 		.efer = efer,
 	};
-	union kvm_mmu_role new_role;
+	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
+	union kvm_mmu_role mmu_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
 
-	new_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
-
-	shadow_mmu_init_context(vcpu, context, &regs, new_role);
+	shadow_mmu_init_context(vcpu, context, cpu_mode, mmu_role);
 	kvm_mmu_new_pgd(vcpu, nested_cr3);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
@@ -4881,7 +4919,6 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
 	role.base.guest_mode = true;
 	role.base.access = ACC_ALL;
 
-	/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
 	role.ext.word = 0;
 	role.ext.execonly = execonly;
 	role.ext.valid = 1;
@@ -4895,12 +4932,14 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 {
 	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
 	u8 level = vmx_eptp_page_walk_level(new_eptp);
-	union kvm_mmu_role new_role =
+	union kvm_mmu_role new_mode =
 		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
 						   execonly, level);
 
-	if (new_role.as_u64 != context->mmu_role.as_u64) {
-		context->mmu_role.as_u64 = new_role.as_u64;
+	if (new_mode.as_u64 != context->cpu_mode.as_u64) {
+		/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
+		context->cpu_mode.as_u64 = new_mode.as_u64;
+		context->mmu_role.as_u64 = new_mode.as_u64;
 
 		context->shadow_root_level = level;
 
@@ -4933,36 +4972,19 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
 	context->inject_page_fault = kvm_inject_page_fault_shadow;
 }
 
-static union kvm_mmu_role
-kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
-{
-	union kvm_mmu_role role;
-
-	role = kvm_calc_shadow_root_page_role_common(vcpu, regs);
-
-	/*
-	 * Nested MMUs are used only for walking L2's gva->gpa, they never have
-	 * shadow pages of their own and so "direct" has no meaning.   Set it
-	 * to "true" to try to detect bogus usage of the nested MMU.
-	 */
-	role.base.direct = true;
-	role.base.level = role_regs_to_root_level(regs);
-	return role;
-}
-
 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 {
-	union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, regs);
+	union kvm_mmu_role new_mode = kvm_calc_cpu_mode(vcpu, regs);
 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
-	if (new_role.as_u64 == g_context->mmu_role.as_u64)
+	if (new_mode.as_u64 == g_context->cpu_mode.as_u64)
 		return;
 
-	g_context->mmu_role.as_u64 = new_role.as_u64;
+	g_context->cpu_mode.as_u64   = new_mode.as_u64;
 	g_context->get_guest_pgd     = kvm_get_guest_cr3;
 	g_context->get_pdptr         = kvm_pdptr_read;
 	g_context->inject_page_fault = kvm_inject_page_fault;
-	g_context->root_level        = new_role.base.level;
+	g_context->root_level        = new_mode.base.level;
 
 	/*
 	 * L2 page tables are never shadowed, so there is no need to sync
@@ -5020,6 +5042,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
 	vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
 	vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
+	vcpu->arch.root_mmu.cpu_mode.ext.valid = 0;
+	vcpu->arch.guest_mmu.cpu_mode.ext.valid = 0;
+	vcpu->arch.nested_mmu.cpu_mode.ext.valid = 0;
 	kvm_mmu_reset_context(vcpu);
 
 	/*
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index d1d17d28e81b..e1c2ecb4ddee 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -323,7 +323,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
 	 * is not reserved and does not indicate a large page at this level,
 	 * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
 	 */
-	gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse);
+	gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_mode.ext.cr4_pse);
 #endif
 	/*
 	 * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
-- 
2.31.1



* [PATCH v2 09/25] KVM: x86/mmu: do not recompute root level from kvm_mmu_role_regs
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (7 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 17:41   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 10/25] KVM: x86/mmu: remove ept_ad field Paolo Bonzini
                   ` (15 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

The root_level can be found in the cpu_mode (in fact the field
is superfluous and could be removed, but one thing at a time).
Since there is only one usage left of role_regs_to_root_level,
inline it into kvm_calc_cpu_mode.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1af898f0cf87..6e539fc2c9c7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -244,19 +244,6 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
 	return regs;
 }
 
-static int role_regs_to_root_level(const struct kvm_mmu_role_regs *regs)
-{
-	if (!____is_cr0_pg(regs))
-		return 0;
-	else if (____is_efer_lma(regs))
-		return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL :
-					       PT64_ROOT_4LEVEL;
-	else if (____is_cr4_pae(regs))
-		return PT32E_ROOT_LEVEL;
-	else
-		return PT32_ROOT_LEVEL;
-}
-
 static inline bool kvm_available_flush_tlb_with_range(void)
 {
 	return kvm_x86_ops.tlb_remote_flush_with_range;
@@ -4695,7 +4682,13 @@ kvm_calc_cpu_mode(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 		role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
 		role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
 		role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
-		role.base.level = role_regs_to_root_level(regs);
+
+		if (____is_efer_lma(regs))
+			role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+		else if (____is_cr4_pae(regs))
+			role.base.level = PT32E_ROOT_LEVEL;
+		else
+			role.base.level = PT32_ROOT_LEVEL;
 
 		role.ext.cr0_pg = 1;
 		role.ext.cr4_pae = ____is_cr4_pae(regs);
@@ -4790,7 +4783,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 	context->get_guest_pgd = kvm_get_guest_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
-	context->root_level = role_regs_to_root_level(regs);
+	context->root_level = cpu_mode.base.level;
 
 	if (!is_cr0_pg(context))
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
-- 
2.31.1



* [PATCH v2 10/25] KVM: x86/mmu: remove ept_ad field
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (8 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 09/25] KVM: x86/mmu: do not recompute root level from kvm_mmu_role_regs Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 17:42   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 11/25] KVM: x86/mmu: remove kvm_calc_shadow_root_page_role_common Paolo Bonzini
                   ` (14 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

The ept_ad field is used during page walk to determine if the guest PTEs
have accessed and dirty bits.  In the MMU role, the ad_disabled
bit represents whether the *shadow* PTEs have the bits, so it
would be incorrect to replace PT_HAVE_ACCESSED_DIRTY with just
!mmu->mmu_role.base.ad_disabled.

However, the similar field in the CPU mode, ad_disabled, is initialized
correctly: to the opposite value of ept_ad for shadow EPT, and zero
for non-EPT guest paging modes (which always have A/D bits).  It is
therefore possible to compute PT_HAVE_ACCESSED_DIRTY from the CPU mode,
like other page-format fields; it just has to be inverted to account
for the different polarity.
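
As a sketch (the first assignment below is in
kvm_calc_shadow_ept_root_page_role, not in this patch's diff):

	/* shadow EPT: the walker has A/D bits iff the EPTP enables them */
	role.base.ad_disabled = !accessed_dirty;

	/*
	 * Non-EPT guest paging modes always have A/D bits, so ad_disabled
	 * stays 0 there; the page walker only has to invert the polarity:
	 */
	#define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_mode.base.ad_disabled)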

Having a CPU mode that is distinct from the MMU roles would in fact even
allow removing the PT_HAVE_ACCESSED_DIRTY macro altogether, and always using
!mmu->cpu_mode.base.ad_disabled.  I am not doing this because the macro
has a small effect in terms of dead code elimination:

   text	   data	    bss	    dec	    hex
 103544	  16665	    112	 120321	  1d601    # as of this patch
 103746	  16665	    112	 120523	  1d6cb    # without PT_HAVE_ACCESSED_DIRTY

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 arch/x86/kvm/mmu/mmu.c          | 1 -
 arch/x86/kvm/mmu/paging_tmpl.h  | 2 +-
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cc268116eb3f..996cf9b14f5e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -437,7 +437,6 @@ struct kvm_mmu {
 	union kvm_mmu_role mmu_role;
 	u8 root_level;
 	u8 shadow_root_level;
-	u8 ept_ad;
 	bool direct_map;
 	struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6e539fc2c9c7..3ffa6f2bf991 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4936,7 +4936,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 
 		context->shadow_root_level = level;
 
-		context->ept_ad = accessed_dirty;
 		context->page_fault = ept_page_fault;
 		context->gva_to_gpa = ept_gva_to_gpa;
 		context->sync_page = ept_sync_page;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index e1c2ecb4ddee..64b6f76641f0 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -64,7 +64,7 @@
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
 	#define PT_GUEST_DIRTY_SHIFT 9
 	#define PT_GUEST_ACCESSED_SHIFT 8
-	#define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
+	#define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_mode.base.ad_disabled)
 	#define CMPXCHG cmpxchg64
 	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
 #else
-- 
2.31.1



* [PATCH v2 11/25] KVM: x86/mmu: remove kvm_calc_shadow_root_page_role_common
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (9 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 10/25] KVM: x86/mmu: remove ept_ad field Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 17:48   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 12/25] KVM: x86/mmu: cleanup computation of MMU roles for two-dimensional paging Paolo Bonzini
                   ` (13 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

kvm_calc_shadow_root_page_role_common is the same as
kvm_calc_cpu_mode except for the level, which is overwritten
afterwards in kvm_calc_shadow_mmu_root_page_role
and kvm_calc_shadow_npt_root_page_role.

role.base.direct is already set correctly by kvm_calc_cpu_mode, and
CR0.PG=1 is required for VMRUN, so it will also be correct (i.e. false)
for nested NPT.
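
For reference, the relevant line in kvm_calc_cpu_mode (from patch 8) is:

	role.base.direct = !____is_cr0_pg(regs);	/* false whenever CR0.PG=1 */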

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3ffa6f2bf991..31874fad12fb 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4796,27 +4796,11 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 	reset_tdp_shadow_zero_bits_mask(context);
 }
 
-static union kvm_mmu_role
-kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
-				      const struct kvm_mmu_role_regs *regs)
-{
-	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs);
-
-	role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
-	role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
-	role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs);
-
-	return role;
-}
-
 static union kvm_mmu_role
 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
 				   const struct kvm_mmu_role_regs *regs)
 {
-	union kvm_mmu_role role =
-		kvm_calc_shadow_root_page_role_common(vcpu, regs);
-
-	role.base.direct = !____is_cr0_pg(regs);
+	union kvm_mmu_role role = kvm_calc_cpu_mode(vcpu, regs);
 
 	if (!____is_efer_lma(regs))
 		role.base.level = PT32E_ROOT_LEVEL;
@@ -4869,9 +4853,8 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
 				   const struct kvm_mmu_role_regs *regs)
 {
 	union kvm_mmu_role role =
-		kvm_calc_shadow_root_page_role_common(vcpu, regs);
+               kvm_calc_cpu_mode(vcpu, regs);
 
-	role.base.direct = false;
 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
 
 	return role;
-- 
2.31.1



* [PATCH v2 12/25] KVM: x86/mmu: cleanup computation of MMU roles for two-dimensional paging
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (10 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 11/25] KVM: x86/mmu: remove kvm_calc_shadow_root_page_role_common Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 18:11   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 13/25] KVM: x86/mmu: cleanup computation of MMU roles for shadow paging Paolo Bonzini
                   ` (12 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

Inline kvm_calc_mmu_role_common into its sole caller, and simplify it
by removing the computation of unnecessary bits.

Extended bits are unnecessary because page walking uses the CPU mode,
and EFER.NX/CR0.WP can be set to one unconditionally---matching the
format of shadow pages rather than the format of guest pages.

The MMU role for two-dimensional paging does still depend on the CPU mode,
even if only barely so, due to SMM and guest mode; for consistency,
pass it down to kvm_calc_tdp_mmu_root_page_role instead of querying
the vcpu with is_smm or is_guest_mode.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 41 +++++++++--------------------------------
 1 file changed, 9 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 31874fad12fb..0a08ab8e2e4e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4706,34 +4706,6 @@ kvm_calc_cpu_mode(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 	return role;
 }
 
-static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
-						   const struct kvm_mmu_role_regs *regs)
-{
-	union kvm_mmu_role role = {0};
-
-	role.base.access = ACC_ALL;
-	if (____is_cr0_pg(regs)) {
-		role.ext.cr0_pg = 1;
-		role.base.efer_nx = ____is_efer_nx(regs);
-		role.base.cr0_wp = ____is_cr0_wp(regs);
-
-		role.ext.cr4_pae = ____is_cr4_pae(regs);
-		role.ext.cr4_smep = ____is_cr4_smep(regs);
-		role.ext.cr4_smap = ____is_cr4_smap(regs);
-		role.ext.cr4_pse = ____is_cr4_pse(regs);
-
-		/* PKEY and LA57 are active iff long mode is active. */
-		role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
-		role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
-		role.ext.efer_lma = ____is_efer_lma(regs);
-	}
-	role.base.smm = is_smm(vcpu);
-	role.base.guest_mode = is_guest_mode(vcpu);
-	role.ext.valid = 1;
-
-	return role;
-}
-
 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 {
 	/* tdp_root_level is architecture forced level, use it if nonzero */
@@ -4749,14 +4721,20 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 
 static union kvm_mmu_role
 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
-				const struct kvm_mmu_role_regs *regs)
+				union kvm_mmu_role cpu_mode)
 {
-	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs);
+	union kvm_mmu_role role = {0};
 
+	role.base.access = ACC_ALL;
+	role.base.cr0_wp = true;
+	role.base.efer_nx = true;
+	role.base.smm = cpu_mode.base.smm;
+	role.base.guest_mode = cpu_mode.base.guest_mode;
 	role.base.ad_disabled = (shadow_accessed_mask == 0);
 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
 	role.base.direct = true;
 	role.base.has_4_byte_gpte = false;
+	role.ext.valid = true;
 
 	return role;
 }
@@ -4766,8 +4744,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
-	union kvm_mmu_role mmu_role =
-		kvm_calc_tdp_mmu_root_page_role(vcpu, regs);
+	union kvm_mmu_role mmu_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_mode);
 
 	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
 	    mmu_role.as_u64 == context->mmu_role.as_u64)
-- 
2.31.1



* [PATCH v2 13/25] KVM: x86/mmu: cleanup computation of MMU roles for shadow paging
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (11 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 12/25] KVM: x86/mmu: cleanup computation of MMU roles for two-dimensional paging Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-02-21 16:22 ` [PATCH v2 14/25] KVM: x86/mmu: store shadow EFER.NX in the MMU role Paolo Bonzini
                   ` (11 subsequent siblings)
  24 siblings, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

Pass the already-computed CPU mode, instead of redoing it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0a08ab8e2e4e..725aef55c08f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4775,13 +4775,11 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 
 static union kvm_mmu_role
 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
-				   const struct kvm_mmu_role_regs *regs)
+				   union kvm_mmu_role role)
 {
-	union kvm_mmu_role role = kvm_calc_cpu_mode(vcpu, regs);
-
-	if (!____is_efer_lma(regs))
+	if (!role.ext.efer_lma)
 		role.base.level = PT32E_ROOT_LEVEL;
-	else if (____is_cr4_la57(regs))
+	else if (role.ext.cr4_la57)
 		role.base.level = PT64_ROOT_5LEVEL;
 	else
 		role.base.level = PT64_ROOT_4LEVEL;
@@ -4820,18 +4818,15 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
 	union kvm_mmu_role mmu_role =
-		kvm_calc_shadow_mmu_root_page_role(vcpu, regs);
+		kvm_calc_shadow_mmu_root_page_role(vcpu, cpu_mode);
 
 	shadow_mmu_init_context(vcpu, context, cpu_mode, mmu_role);
 }
 
 static union kvm_mmu_role
 kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
-				   const struct kvm_mmu_role_regs *regs)
+				   union kvm_mmu_role role)
 {
-	union kvm_mmu_role role =
-               kvm_calc_cpu_mode(vcpu, regs);
-
 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
 
 	return role;
@@ -4847,7 +4842,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 		.efer = efer,
 	};
 	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
-	union kvm_mmu_role mmu_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);;
+	union kvm_mmu_role mmu_role = kvm_calc_shadow_npt_root_page_role(vcpu, cpu_mode);
 
 	shadow_mmu_init_context(vcpu, context, cpu_mode, mmu_role);
 	kvm_mmu_new_pgd(vcpu, nested_cr3);
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 14/25] KVM: x86/mmu: store shadow EFER.NX in the MMU role
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (12 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 13/25] KVM: x86/mmu: cleanup computation of MMU roles for shadow paging Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-02-21 16:22 ` [PATCH v2 15/25] KVM: x86/mmu: remove extended bits from mmu_role, rename field Paolo Bonzini
                   ` (10 subsequent siblings)
  24 siblings, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

Now that the MMU role is separate from the CPU mode, it can be a
truthful description of the format of the shadow pages.  This includes
whether the shadow pages use the NX bit; so force the efer_nx field
of the MMU role when TDP is disabled, and drop the hardcoded uses_nx
computation from reset_shadow_zero_bits_mask.

In fact, the initialization of reserved SPTE bits can now be made common
to shadow paging and shadow NPT; move it to shadow_mmu_init_context.
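
As a rough standalone sketch (struct root_role, calc_shadow_root_role
and reset_shadow_zero_bits are illustrative names, not the kernel
functions), the reserved-bit computation can now take the NX decision
from the role itself instead of recomputing a uses_nx local:

#include <stdbool.h>
#include <stdio.h>

struct root_role {
        bool efer_nx;
        int  level;
};

/* Shadow paging always uses NX, so record that in the role once. */
static struct root_role calc_shadow_root_role(int level)
{
        struct root_role role = { .efer_nx = true, .level = level };
        return role;
}

/* Consumer reads role.efer_nx instead of a hardcoded uses_nx local. */
static void reset_shadow_zero_bits(const struct root_role *role)
{
        printf("building reserved-bit masks: level=%d nx=%d\n",
               role->level, role->efer_nx);
}

int main(void)
{
        struct root_role role = calc_shadow_root_role(4);

        reset_shadow_zero_bits(&role);
        return 0;
}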

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 725aef55c08f..bbb5f50d3dcc 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4405,16 +4405,6 @@ static inline u64 reserved_hpa_bits(void)
 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
 					struct kvm_mmu *context)
 {
-	/*
-	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
-	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
-	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
-	 * The iTLB multi-hit workaround can be toggled at any time, so assume
-	 * NX can be used by any non-nested shadow MMU to avoid having to reset
-	 * MMU contexts.  Note, KVM forces EFER.NX=1 when TDP is disabled.
-	 */
-	bool uses_nx = is_efer_nx(context) || !tdp_enabled;
-
 	/* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
 	bool is_amd = true;
 	/* KVM doesn't use 2-level page tables for the shadow MMU. */
@@ -4426,7 +4416,8 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
 
 	shadow_zero_check = &context->shadow_zero_check;
 	__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
-				context->shadow_root_level, uses_nx,
+				context->shadow_root_level,
+				context->mmu_role.base.efer_nx,
 				guest_can_use_gbpages(vcpu), is_pse, is_amd);
 
 	if (!shadow_me_mask)
@@ -4784,6 +4775,16 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
 	else
 		role.base.level = PT64_ROOT_4LEVEL;
 
+	/*
+	 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
+	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+	 * The iTLB multi-hit workaround can be toggled at any time, so assume
+	 * NX can be used by any non-nested shadow MMU to avoid having to reset
+	 * MMU contexts.
+	 */
+	role.base.efer_nx = true;
 	return role;
 }
 
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 15/25] KVM: x86/mmu: remove extended bits from mmu_role, rename field
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (13 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 14/25] KVM: x86/mmu: store shadow EFER.NX in the MMU role Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 19:02   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 16/25] KVM: x86/mmu: rename kvm_mmu_role union Paolo Bonzini
                   ` (9 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

mmu_role represents the role of the root of the page tables.
It does not need any extended bits, as those govern only KVM's
page table walking; the is_* functions used for page table
walking always use the CPU mode.

ext.valid is no longer present in the MMU role, but an
all-zero MMU role is impossible anyway, because its level
field is never zero.  So just zap the whole mmu_role in
order to force invalidation after CPUID is updated.

While making this change, which requires touching almost every
occurrence of "mmu_role", rename it to "root_role".
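
A minimal sketch of the invalidation trick, using an illustrative
packing helper rather than the real bitfield layout: any computed role
has a nonzero level, so an all-zero cached value never matches and
forces a reconfiguration:

#include <stdio.h>

struct root_role {
        unsigned int level;     /* never zero once the role is computed */
        unsigned int direct;
};

/* Illustrative packing; the real role is a bitfield union read as a word. */
static unsigned int role_word(struct root_role r)
{
        return (r.level << 1) | r.direct;
}

int main(void)
{
        struct root_role cached = { 0 };        /* zapped after CPUID update */
        struct root_role fresh = { .level = 4, .direct = 1 };

        /* a zero cached word can never match a real role */
        if (role_word(cached) != role_word(fresh))
                printf("role changed, reconfigure the MMU\n");
        return 0;
}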

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu/mmu.c          | 72 ++++++++++++++++-----------------
 arch/x86/kvm/mmu/paging_tmpl.h  |  4 +-
 arch/x86/kvm/mmu/tdp_mmu.c      |  2 +-
 4 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 996cf9b14f5e..b7d7c4f31730 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -434,7 +434,7 @@ struct kvm_mmu {
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
 	struct kvm_mmu_root_info root;
 	union kvm_mmu_role cpu_mode;
-	union kvm_mmu_role mmu_role;
+	union kvm_mmu_page_role root_role;
 	u8 root_level;
 	u8 shadow_root_level;
 	bool direct_map;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index bbb5f50d3dcc..35907badb6ce 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -193,7 +193,7 @@ struct kvm_mmu_role_regs {
 
 /*
  * Yes, lot's of underscores.  They're a hint that you probably shouldn't be
- * reading from the role_regs.  Once the mmu_role is constructed, it becomes
+ * reading from the role_regs.  Once the root_role is constructed, it becomes
  * the single source of truth for the MMU's state.
  */
 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
@@ -2029,7 +2029,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	int collisions = 0;
 	LIST_HEAD(invalid_list);
 
-	role = vcpu->arch.mmu->mmu_role.base;
+	role = vcpu->arch.mmu->root_role;
 	role.level = level;
 	role.direct = direct;
 	role.access = access;
@@ -3265,7 +3265,7 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
 	 * This should not be called while L2 is active, L2 can't invalidate
 	 * _only_ its own roots, e.g. INVVPID unconditionally exits.
 	 */
-	WARN_ON_ONCE(mmu->mmu_role.base.guest_mode);
+	WARN_ON_ONCE(mmu->root_role.guest_mode);
 
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
 		root_hpa = mmu->prev_roots[i].hpa;
@@ -4158,7 +4158,7 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
-	union kvm_mmu_page_role new_role = mmu->mmu_role.base;
+	union kvm_mmu_page_role new_role = mmu->root_role;
 
 	if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
 		/* kvm_mmu_ensure_valid_pgd will set up a new root.  */
@@ -4417,7 +4417,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
 	shadow_zero_check = &context->shadow_zero_check;
 	__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
 				context->shadow_root_level,
-				context->mmu_role.base.efer_nx,
+				context->root_role.efer_nx,
 				guest_can_use_gbpages(vcpu), is_pse, is_amd);
 
 	if (!shadow_me_mask)
@@ -4710,22 +4710,21 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 	return max_tdp_level;
 }
 
-static union kvm_mmu_role
+static union kvm_mmu_page_role
 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 				union kvm_mmu_role cpu_mode)
 {
-	union kvm_mmu_role role = {0};
+	union kvm_mmu_page_role role = {0};
 
-	role.base.access = ACC_ALL;
-	role.base.cr0_wp = true;
-	role.base.efer_nx = true;
-	role.base.smm = cpu_mode.base.smm;
-	role.base.guest_mode = cpu_mode.base.guest_mode;
-	role.base.ad_disabled = (shadow_accessed_mask == 0);
-	role.base.level = kvm_mmu_get_tdp_level(vcpu);
-	role.base.direct = true;
-	role.base.has_4_byte_gpte = false;
-	role.ext.valid = true;
+	role.access = ACC_ALL;
+	role.cr0_wp = true;
+	role.efer_nx = true;
+	role.smm = cpu_mode.base.smm;
+	role.guest_mode = cpu_mode.base.guest_mode;
+	role.ad_disabled = (shadow_accessed_mask == 0);
+	role.level = kvm_mmu_get_tdp_level(vcpu);
+	role.direct = true;
+	role.has_4_byte_gpte = false;
 
 	return role;
 }
@@ -4735,14 +4734,14 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
-	union kvm_mmu_role mmu_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_mode);
+	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_mode);
 
 	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
-	    mmu_role.as_u64 == context->mmu_role.as_u64)
+	    root_role.word == context->root_role.word)
 		return;
 
 	context->cpu_mode.as_u64 = cpu_mode.as_u64;
-	context->mmu_role.as_u64 = mmu_role.as_u64;
+	context->root_role.word = root_role.word;
 	context->page_fault = kvm_tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = NULL;
@@ -4764,7 +4763,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 	reset_tdp_shadow_zero_bits_mask(context);
 }
 
-static union kvm_mmu_role
+static union kvm_mmu_page_role
 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
 				   union kvm_mmu_role role)
 {
@@ -4785,19 +4784,19 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
 	 * MMU contexts.
 	 */
 	role.base.efer_nx = true;
-	return role;
+	return role.base;
 }
 
 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
 				    union kvm_mmu_role cpu_mode,
-				    union kvm_mmu_role mmu_role)
+				    union kvm_mmu_page_role root_role)
 {
 	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
-	    mmu_role.as_u64 == context->mmu_role.as_u64)
+	    root_role.word == context->root_role.word)
 		return;
 
 	context->cpu_mode.as_u64 = cpu_mode.as_u64;
-	context->mmu_role.as_u64 = mmu_role.as_u64;
+	context->root_role.word = root_role.word;
 
 	if (!is_cr0_pg(context))
 		nonpaging_init_context(context);
@@ -4808,7 +4807,7 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 	context->root_level = cpu_mode.base.level;
 
 	reset_guest_paging_metadata(vcpu, context);
-	context->shadow_root_level = mmu_role.base.level;
+	context->shadow_root_level = root_role.level;
 
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
@@ -4818,19 +4817,18 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
-	union kvm_mmu_role mmu_role =
+	union kvm_mmu_page_role root_role =
 		kvm_calc_shadow_mmu_root_page_role(vcpu, cpu_mode);
 
-	shadow_mmu_init_context(vcpu, context, cpu_mode, mmu_role);
+	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
 }
 
-static union kvm_mmu_role
+static union kvm_mmu_page_role
 kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
 				   union kvm_mmu_role role)
 {
 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
-
-	return role;
+	return role.base;
 }
 
 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
@@ -4843,9 +4841,9 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 		.efer = efer,
 	};
 	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
-	union kvm_mmu_role mmu_role = kvm_calc_shadow_npt_root_page_role(vcpu, cpu_mode);
+	union kvm_mmu_page_role root_role = kvm_calc_shadow_npt_root_page_role(vcpu, cpu_mode);
 
-	shadow_mmu_init_context(vcpu, context, cpu_mode, mmu_role);
+	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
 	kvm_mmu_new_pgd(vcpu, nested_cr3);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
@@ -4888,7 +4886,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 	if (new_mode.as_u64 != context->cpu_mode.as_u64) {
 		/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
 		context->cpu_mode.as_u64 = new_mode.as_u64;
-		context->mmu_role.as_u64 = new_mode.as_u64;
+		context->root_role.word = new_mode.base.word;
 
 		context->shadow_root_level = level;
 
@@ -4987,9 +4985,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	 * problem is swept under the rug; KVM's CPUID API is horrific and
 	 * it's all but impossible to solve it without introducing a new API.
 	 */
-	vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
-	vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
-	vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
+	vcpu->arch.root_mmu.root_role.word = 0;
+	vcpu->arch.guest_mmu.root_role.word = 0;
+	vcpu->arch.nested_mmu.root_role.word = 0;
 	vcpu->arch.root_mmu.cpu_mode.ext.valid = 0;
 	vcpu->arch.guest_mmu.cpu_mode.ext.valid = 0;
 	vcpu->arch.nested_mmu.cpu_mode.ext.valid = 0;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 64b6f76641f0..7c0fa115bd56 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -1023,7 +1023,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
  */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base;
+	union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
 	int i;
 	bool host_writable;
 	gpa_t first_pte_gpa;
@@ -1051,7 +1051,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	 * reserved bits checks will be wrong, etc...
 	 */
 	if (WARN_ON_ONCE(sp->role.direct ||
-			 (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word))
+			 (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
 		return -1;
 
 	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index debf08212f12..c18ad86c9a82 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -209,7 +209,7 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
 
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 {
-	union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
+	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_page *root;
 
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 16/25] KVM: x86/mmu: rename kvm_mmu_role union
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (14 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 15/25] KVM: x86/mmu: remove extended bits from mmu_role, rename field Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 19:15   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 17/25] KVM: x86/mmu: remove redundant bits from extended role Paolo Bonzini
                   ` (8 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

It is quite confusing that the "full" union is called kvm_mmu_role
but is used for the "cpu_mode" field of struct kvm_mmu.  Rename it
to kvm_mmu_paging_mode.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 +++---
 arch/x86/kvm/mmu/mmu.c          | 28 ++++++++++++++--------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b7d7c4f31730..3dbe0be075f5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -276,7 +276,7 @@ struct kvm_kernel_irq_routing_entry;
 /*
  * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
  * also includes TDP pages) to determine whether or not a page can be used in
- * the given MMU context.  This is a subset of the overall kvm_mmu_role to
+ * the given MMU context.  This is a subset of the overall kvm_mmu_paging_mode to
  * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
  * 2 bytes per gfn instead of 4 bytes per gfn.
  *
@@ -373,7 +373,7 @@ union kvm_mmu_extended_role {
 	};
 };
 
-union kvm_mmu_role {
+union kvm_mmu_paging_mode {
 	u64 as_u64;
 	struct {
 		union kvm_mmu_page_role base;
@@ -433,7 +433,7 @@ struct kvm_mmu {
 			 struct kvm_mmu_page *sp);
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
 	struct kvm_mmu_root_info root;
-	union kvm_mmu_role cpu_mode;
+	union kvm_mmu_paging_mode cpu_mode;
 	union kvm_mmu_page_role root_role;
 	u8 root_level;
 	u8 shadow_root_level;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 35907badb6ce..61499fd7d017 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4658,10 +4658,10 @@ static void paging32_init_context(struct kvm_mmu *context)
 	context->direct_map = false;
 }
 
-static union kvm_mmu_role
+static union kvm_mmu_paging_mode
 kvm_calc_cpu_mode(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 {
-	union kvm_mmu_role role = {0};
+	union kvm_mmu_paging_mode role = {0};
 
 	role.base.access = ACC_ALL;
 	role.base.smm = is_smm(vcpu);
@@ -4712,7 +4712,7 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 
 static union kvm_mmu_page_role
 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
-				union kvm_mmu_role cpu_mode)
+				union kvm_mmu_paging_mode cpu_mode)
 {
 	union kvm_mmu_page_role role = {0};
 
@@ -4733,7 +4733,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 			     const struct kvm_mmu_role_regs *regs)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
+	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
 	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_mode);
 
 	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
@@ -4765,7 +4765,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 
 static union kvm_mmu_page_role
 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
-				   union kvm_mmu_role role)
+				   union kvm_mmu_paging_mode role)
 {
 	if (!role.ext.efer_lma)
 		role.base.level = PT32E_ROOT_LEVEL;
@@ -4788,7 +4788,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
 }
 
 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
-				    union kvm_mmu_role cpu_mode,
+				    union kvm_mmu_paging_mode cpu_mode,
 				    union kvm_mmu_page_role root_role)
 {
 	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
@@ -4816,7 +4816,7 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
 				const struct kvm_mmu_role_regs *regs)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
+	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
 	union kvm_mmu_page_role root_role =
 		kvm_calc_shadow_mmu_root_page_role(vcpu, cpu_mode);
 
@@ -4825,7 +4825,7 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
 
 static union kvm_mmu_page_role
 kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
-				   union kvm_mmu_role role)
+				   union kvm_mmu_paging_mode role)
 {
 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
 	return role.base;
@@ -4840,7 +4840,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 		.cr4 = cr4 & ~X86_CR4_PKE,
 		.efer = efer,
 	};
-	union kvm_mmu_role cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
+	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
 	union kvm_mmu_page_role root_role = kvm_calc_shadow_npt_root_page_role(vcpu, cpu_mode);
 
 	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
@@ -4848,11 +4848,11 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
 
-static union kvm_mmu_role
+static union kvm_mmu_paging_mode
 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
 				   bool execonly, u8 level)
 {
-	union kvm_mmu_role role = {0};
+	union kvm_mmu_paging_mode role = {0};
 
 	/*
 	 * KVM does not support SMM transfer monitors, and consequently does not
@@ -4879,7 +4879,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 {
 	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
 	u8 level = vmx_eptp_page_walk_level(new_eptp);
-	union kvm_mmu_role new_mode =
+	union kvm_mmu_paging_mode new_mode =
 		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
 						   execonly, level);
 
@@ -4920,7 +4920,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
 
 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 {
-	union kvm_mmu_role new_mode = kvm_calc_cpu_mode(vcpu, regs);
+	union kvm_mmu_paging_mode new_mode = kvm_calc_cpu_mode(vcpu, regs);
 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
 	if (new_mode.as_u64 == g_context->cpu_mode.as_u64)
@@ -6116,7 +6116,7 @@ int kvm_mmu_module_init(void)
 	 */
 	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
 	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
-	BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
+	BUILD_BUG_ON(sizeof(union kvm_mmu_paging_mode) != sizeof(u64));
 
 	kvm_mmu_reset_all_pte_masks();
 
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 17/25] KVM: x86/mmu: remove redundant bits from extended role
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (15 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 16/25] KVM: x86/mmu: rename kvm_mmu_role union Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-02-21 16:22 ` [PATCH v2 18/25] KVM: x86/mmu: remove valid " Paolo Bonzini
                   ` (7 subsequent siblings)
  24 siblings, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

Before the separation of the CPU and the MMU role, CR0.PG was not
available in the base MMU role, because two-dimensional paging always
used direct=1 in the MMU role.  However, now that the raw role is
snapshotted in mmu->cpu_mode, CR0.PG *can* be found (though inverted)
as !cpu_mode.base.direct.  There is no need to store it again in union
kvm_mmu_extended_role; instead, write an is_cr0_pg accessor by hand that
takes care of the inversion.

Likewise, CR4.PAE is now always present in the CPU mode as
!cpu_mode.base.has_4_byte_gpte.  The inversion makes certain tests on
the MMU role easier, and is easily hidden by the is_cr4_pae accessor
when operating on the CPU mode.
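
For illustration, a tiny standalone sketch of the inversion (the struct
and accessors below are simplified stand-ins, not the kernel's): the
accessors hide the fact that the information is stored negated:

#include <assert.h>
#include <stdbool.h>

struct cpu_mode {
        bool direct;            /* set when CR0.PG=0 */
        bool has_4_byte_gpte;   /* set for 32-bit non-PAE guest page tables */
};

static bool is_cr0_pg(const struct cpu_mode *m)  { return !m->direct; }
static bool is_cr4_pae(const struct cpu_mode *m) { return !m->has_4_byte_gpte; }

int main(void)
{
        /* 64-bit or PAE paging: paging on, 8-byte guest PTEs */
        struct cpu_mode pae = { .direct = false, .has_4_byte_gpte = false };
        /* legacy 32-bit paging: paging on, 4-byte guest PTEs */
        struct cpu_mode legacy = { .direct = false, .has_4_byte_gpte = true };

        assert(is_cr0_pg(&pae) && is_cr4_pae(&pae));
        assert(is_cr0_pg(&legacy) && !is_cr4_pae(&legacy));
        return 0;
}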

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 --
 arch/x86/kvm/mmu/mmu.c          | 14 ++++++++++----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3dbe0be075f5..14f391582738 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -362,8 +362,6 @@ union kvm_mmu_extended_role {
 	struct {
 		unsigned int valid:1;
 		unsigned int execonly:1;
-		unsigned int cr0_pg:1;
-		unsigned int cr4_pae:1;
 		unsigned int cr4_pse:1;
 		unsigned int cr4_pke:1;
 		unsigned int cr4_smap:1;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 61499fd7d017..eb7c62c11700 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -223,16 +223,24 @@ static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
 {								\
 	return !!(mmu->cpu_mode. base_or_ext . reg##_##name);	\
 }
-BUILD_MMU_ROLE_ACCESSOR(ext,  cr0, pg);
 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
-BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pae);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
 
+static inline bool is_cr0_pg(struct kvm_mmu *mmu)
+{
+        return !mmu->cpu_mode.base.direct;
+}
+
+static inline bool is_cr4_pae(struct kvm_mmu *mmu)
+{
+        return !mmu->cpu_mode.base.has_4_byte_gpte;
+}
+
 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_role_regs regs = {
@@ -4681,8 +4689,6 @@ kvm_calc_cpu_mode(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 		else
 			role.base.level = PT32_ROOT_LEVEL;
 
-		role.ext.cr0_pg = 1;
-		role.ext.cr4_pae = ____is_cr4_pae(regs);
 		role.ext.cr4_smep = ____is_cr4_smep(regs);
 		role.ext.cr4_smap = ____is_cr4_smap(regs);
 		role.ext.cr4_pse = ____is_cr4_pse(regs);
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 18/25] KVM: x86/mmu: remove valid from extended role
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (16 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 17/25] KVM: x86/mmu: remove redundant bits from extended role Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-02-21 16:22 ` [PATCH v2 19/25] KVM: x86/mmu: simplify and/or inline computation of shadow MMU roles Paolo Bonzini
                   ` (6 subsequent siblings)
  24 siblings, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

The level and direct fields of the CPU mode can act as a marker for validity
instead: exactly one of them is guaranteed to be nonzero, so a zero value
for both means that the role is invalid and the MMU properties will be
computed again.
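
A small sketch of the invariant, with simplified fields rather than the
real union: a computed CPU mode has either a nonzero level (paging
enabled) or direct set (paging disabled), so both being zero can stand
for "not yet computed":

#include <stdbool.h>
#include <stdio.h>

struct cpu_mode {
        unsigned int level;     /* nonzero iff CR0.PG=1 */
        unsigned int direct;    /* 1 iff CR0.PG=0 */
};

static bool cpu_mode_is_valid(const struct cpu_mode *m)
{
        /* exactly one of the two is nonzero in any computed mode */
        return m->level || m->direct;
}

int main(void)
{
        struct cpu_mode zapped = { 0 };
        struct cpu_mode paging = { .level = 4 };

        printf("zapped valid? %d, paging valid? %d\n",
               cpu_mode_is_valid(&zapped), cpu_mode_is_valid(&paging));
        return 0;
}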

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 4 +---
 arch/x86/kvm/mmu/mmu.c          | 8 +++-----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 14f391582738..b8b115b2038f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -342,8 +342,7 @@ union kvm_mmu_page_role {
  * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties
  * relevant to the current MMU configuration.   When loading CR0, CR4, or EFER,
  * including on nested transitions, if nothing in the full role changes then
- * MMU re-configuration can be skipped. @valid bit is set on first usage so we
- * don't treat all-zero structure as valid data.
+ * MMU re-configuration can be skipped.
  *
  * The properties that are tracked in the extended role but not the page role
  * are for things that either (a) do not affect the validity of the shadow page
@@ -360,7 +359,6 @@ union kvm_mmu_page_role {
 union kvm_mmu_extended_role {
 	u32 word;
 	struct {
-		unsigned int valid:1;
 		unsigned int execonly:1;
 		unsigned int cr4_pse:1;
 		unsigned int cr4_pke:1;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index eb7c62c11700..d657e2e2ceec 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4699,7 +4699,6 @@ kvm_calc_cpu_mode(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 		role.ext.efer_lma = ____is_efer_lma(regs);
 	}
 
-	role.ext.valid = 1;
 	return role;
 }
 
@@ -4874,7 +4873,6 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
 
 	role.ext.word = 0;
 	role.ext.execonly = execonly;
-	role.ext.valid = 1;
 
 	return role;
 }
@@ -4994,9 +4992,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	vcpu->arch.root_mmu.root_role.word = 0;
 	vcpu->arch.guest_mmu.root_role.word = 0;
 	vcpu->arch.nested_mmu.root_role.word = 0;
-	vcpu->arch.root_mmu.cpu_mode.ext.valid = 0;
-	vcpu->arch.guest_mmu.cpu_mode.ext.valid = 0;
-	vcpu->arch.nested_mmu.cpu_mode.ext.valid = 0;
+	vcpu->arch.root_mmu.cpu_mode.as_u64 = 0;
+	vcpu->arch.guest_mmu.cpu_mode.as_u64 = 0;
+	vcpu->arch.nested_mmu.cpu_mode.as_u64 = 0;
 	kvm_mmu_reset_context(vcpu);
 
 	/*
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 19/25] KVM: x86/mmu: simplify and/or inline computation of shadow MMU roles
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (17 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 18/25] KVM: x86/mmu: remove valid " Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 19:35   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 20/25] KVM: x86/mmu: pull CPU mode computation to kvm_init_mmu Paolo Bonzini
                   ` (5 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

Shadow MMUs compute their role from cpu_mode.base, simply by adjusting
the root level.  It's one line of code, so do not place it in a separate
function.
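
As a sketch of the inlined computation (simplified types; the level
constants use their usual kernel values), the shadow root role is just
the CPU mode's base role with the level clamped and NX forced:

#include <stdio.h>

#define PT32_ROOT_LEVEL         2
#define PT32E_ROOT_LEVEL        3
#define PT64_ROOT_5LEVEL        5

struct page_role {              /* simplified stand-in */
        int level;
        int efer_nx;
};

/* shadow MMU: copy the CPU mode's base role, clamp the root level */
static struct page_role shadow_root_role(struct page_role cpu_base)
{
        struct page_role root = cpu_base;

        if (root.level < PT32E_ROOT_LEVEL)
                root.level = PT32E_ROOT_LEVEL;  /* the max_t() in the patch */
        root.efer_nx = 1;                       /* NX forced without TDP */
        return root;
}

int main(void)
{
        struct page_role cpu32 = { .level = PT32_ROOT_LEVEL };
        struct page_role cpu57 = { .level = PT64_ROOT_5LEVEL };

        printf("32-bit guest -> root level %d\n", shadow_root_role(cpu32).level);
        printf("LA57 guest   -> root level %d\n", shadow_root_role(cpu57).level);
        return 0;
}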

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 54 +++++++++++++++---------------------------
 1 file changed, 19 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d657e2e2ceec..47288643ab70 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4768,30 +4768,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 	reset_tdp_shadow_zero_bits_mask(context);
 }
 
-static union kvm_mmu_page_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
-				   union kvm_mmu_paging_mode role)
-{
-	if (!role.ext.efer_lma)
-		role.base.level = PT32E_ROOT_LEVEL;
-	else if (role.ext.cr4_la57)
-		role.base.level = PT64_ROOT_5LEVEL;
-	else
-		role.base.level = PT64_ROOT_4LEVEL;
-
-	/*
-	 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
-	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
-	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
-	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
-	 * The iTLB multi-hit workaround can be toggled at any time, so assume
-	 * NX can be used by any non-nested shadow MMU to avoid having to reset
-	 * MMU contexts.
-	 */
-	role.base.efer_nx = true;
-	return role.base;
-}
-
 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
 				    union kvm_mmu_paging_mode cpu_mode,
 				    union kvm_mmu_page_role root_role)
@@ -4822,18 +4798,23 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
-	union kvm_mmu_page_role root_role =
-		kvm_calc_shadow_mmu_root_page_role(vcpu, cpu_mode);
+	union kvm_mmu_page_role root_role;
 
-	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
-}
+	root_role = cpu_mode.base;
+	root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
 
-static union kvm_mmu_page_role
-kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
-				   union kvm_mmu_paging_mode role)
-{
-	role.base.level = kvm_mmu_get_tdp_level(vcpu);
-	return role.base;
+	/*
+	 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
+	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+	 * The iTLB multi-hit workaround can be toggled at any time, so assume
+	 * NX can be used by any non-nested shadow MMU to avoid having to reset
+	 * MMU contexts.
+	 */
+	root_role.efer_nx = true;
+
+	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
 }
 
 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
@@ -4846,7 +4827,10 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 		.efer = efer,
 	};
 	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
-	union kvm_mmu_page_role root_role = kvm_calc_shadow_npt_root_page_role(vcpu, cpu_mode);
+	union kvm_mmu_page_role root_role;
+
+	root_role = cpu_mode.base;
+	root_role.level = kvm_mmu_get_tdp_level(vcpu);
 
 	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
 	kvm_mmu_new_pgd(vcpu, nested_cr3);
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 20/25] KVM: x86/mmu: pull CPU mode computation to kvm_init_mmu
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (18 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 19/25] KVM: x86/mmu: simplify and/or inline computation of shadow MMU roles Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 19:45   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 21/25] KVM: x86/mmu: replace shadow_root_level with root_role.level Paolo Bonzini
                   ` (4 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

Do not lead init_kvm_*mmu into the temptation of poking into
struct kvm_mmu_role_regs, by passing the CPU mode to it directly.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 47288643ab70..a7028c2ae5c7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4734,11 +4734,9 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 	return role;
 }
 
-static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
-			     const struct kvm_mmu_role_regs *regs)
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode cpu_mode)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
 	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_mode);
 
 	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
@@ -4794,10 +4792,9 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 }
 
 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
-				const struct kvm_mmu_role_regs *regs)
+				union kvm_mmu_paging_mode cpu_mode)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
 	union kvm_mmu_page_role root_role;
 
 	root_role = cpu_mode.base;
@@ -4895,20 +4892,19 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
 static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
-			     const struct kvm_mmu_role_regs *regs)
+			     union kvm_mmu_paging_mode cpu_mode)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 
-	kvm_init_shadow_mmu(vcpu, regs);
+	kvm_init_shadow_mmu(vcpu, cpu_mode);
 
 	context->get_guest_pgd	   = kvm_get_guest_cr3;
 	context->get_pdptr         = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault_shadow;
 }
 
-static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode new_mode)
 {
-	union kvm_mmu_paging_mode new_mode = kvm_calc_cpu_mode(vcpu, regs);
 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
 	if (new_mode.as_u64 == g_context->cpu_mode.as_u64)
@@ -4949,13 +4945,14 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, const struct kvm_mmu_role
 void kvm_init_mmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
+	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
 
 	if (mmu_is_nested(vcpu))
-		init_kvm_nested_mmu(vcpu, &regs);
+		init_kvm_nested_mmu(vcpu, cpu_mode);
 	else if (tdp_enabled)
-		init_kvm_tdp_mmu(vcpu, &regs);
+		init_kvm_tdp_mmu(vcpu, cpu_mode);
 	else
-		init_kvm_softmmu(vcpu, &regs);
+		init_kvm_softmmu(vcpu, cpu_mode);
 }
 EXPORT_SYMBOL_GPL(kvm_init_mmu);
 
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 21/25] KVM: x86/mmu: replace shadow_root_level with root_role.level
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (19 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 20/25] KVM: x86/mmu: pull CPU mode computation to kvm_init_mmu Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 19:48   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 22/25] KVM: x86/mmu: replace root_level with cpu_mode.base.level Paolo Bonzini
                   ` (3 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

root_role.level is always the same value as shadow_root_level:

- it's kvm_mmu_get_tdp_level(vcpu) when going through init_kvm_tdp_mmu

- it's the level argument when going through kvm_init_shadow_ept_mmu

- it's assigned directly from root_role.level when going
  through shadow_mmu_init_context

Remove the duplication and get the level directly from the role.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/mmu.h              |  2 +-
 arch/x86/kvm/mmu/mmu.c          | 33 ++++++++++++++-------------------
 arch/x86/kvm/mmu/tdp_mmu.c      |  2 +-
 arch/x86/kvm/svm/svm.c          |  2 +-
 arch/x86/kvm/vmx/vmx.c          |  2 +-
 6 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b8b115b2038f..81897aa4e669 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -432,7 +432,6 @@ struct kvm_mmu {
 	union kvm_mmu_paging_mode cpu_mode;
 	union kvm_mmu_page_role root_role;
 	u8 root_level;
-	u8 shadow_root_level;
 	bool direct_map;
 	struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 6ee4436e46f1..596ac2d424fc 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -113,7 +113,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
 		return;
 
 	static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
-					  vcpu->arch.mmu->shadow_root_level);
+					  vcpu->arch.mmu->root_role.level);
 }
 
 extern unsigned long kvm_get_guest_cr3(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a7028c2ae5c7..c33879f23e94 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2127,7 +2127,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 {
 	iterator->addr = addr;
 	iterator->shadow_addr = root;
-	iterator->level = vcpu->arch.mmu->shadow_root_level;
+	iterator->level = vcpu->arch.mmu->root_role.level;
 
 	if (iterator->level >= PT64_ROOT_4LEVEL &&
 	    vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
@@ -3316,7 +3316,7 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
-	u8 shadow_root_level = mmu->shadow_root_level;
+	u8 shadow_root_level = mmu->root_role.level;
 	hpa_t root;
 	unsigned i;
 	int r;
@@ -3466,7 +3466,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	 */
 	if (mmu->root_level >= PT64_ROOT_4LEVEL) {
 		root = mmu_alloc_root(vcpu, root_gfn, 0,
-				      mmu->shadow_root_level, false);
+				      mmu->root_role.level, false);
 		mmu->root.hpa = root;
 		goto set_root_pgd;
 	}
@@ -3482,7 +3482,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	 * the shadow page table may be a PAE or a long mode page table.
 	 */
 	pm_mask = PT_PRESENT_MASK | shadow_me_mask;
-	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+	if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
 		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
 		if (WARN_ON_ONCE(!mmu->pml4_root)) {
@@ -3491,7 +3491,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		}
 		mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
 
-		if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
+		if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
 			if (WARN_ON_ONCE(!mmu->pml5_root)) {
 				r = -EIO;
 				goto out_unlock;
@@ -3516,9 +3516,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		mmu->pae_root[i] = root | pm_mask;
 	}
 
-	if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
+	if (mmu->root_role.level == PT64_ROOT_5LEVEL)
 		mmu->root.hpa = __pa(mmu->pml5_root);
-	else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+	else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
 		mmu->root.hpa = __pa(mmu->pml4_root);
 	else
 		mmu->root.hpa = __pa(mmu->pae_root);
@@ -3534,7 +3534,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
-	bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
+	bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
 	u64 *pml5_root = NULL;
 	u64 *pml4_root = NULL;
 	u64 *pae_root;
@@ -3546,7 +3546,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 	 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
 	 */
 	if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
-	    mmu->shadow_root_level < PT64_ROOT_4LEVEL)
+	    mmu->root_role.level < PT64_ROOT_4LEVEL)
 		return 0;
 
 	/*
@@ -4420,18 +4420,18 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
 	struct rsvd_bits_validate *shadow_zero_check;
 	int i;
 
-	WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL);
+	WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
 
 	shadow_zero_check = &context->shadow_zero_check;
 	__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
-				context->shadow_root_level,
+				context->root_role.level,
 				context->root_role.efer_nx,
 				guest_can_use_gbpages(vcpu), is_pse, is_amd);
 
 	if (!shadow_me_mask)
 		return;
 
-	for (i = context->shadow_root_level; --i >= 0;) {
+	for (i = context->root_role.level; --i >= 0;) {
 		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
 		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
 	}
@@ -4458,7 +4458,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
 
 	if (boot_cpu_is_amd())
 		__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
-					context->shadow_root_level, false,
+					context->root_role.level, false,
 					boot_cpu_has(X86_FEATURE_GBPAGES),
 					false, true);
 	else
@@ -4469,7 +4469,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
 	if (!shadow_me_mask)
 		return;
 
-	for (i = context->shadow_root_level; --i >= 0;) {
+	for (i = context->root_role.level; --i >= 0;) {
 		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
 		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
 	}
@@ -4748,7 +4748,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode cp
 	context->page_fault = kvm_tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = NULL;
-	context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
 	context->direct_map = true;
 	context->get_guest_pgd = kvm_get_guest_cr3;
 	context->get_pdptr = kvm_pdptr_read;
@@ -4786,8 +4785,6 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 	context->root_level = cpu_mode.base.level;
 
 	reset_guest_paging_metadata(vcpu, context);
-	context->shadow_root_level = root_role.level;
-
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
 
@@ -4873,8 +4870,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		context->cpu_mode.as_u64 = new_mode.as_u64;
 		context->root_role.word = new_mode.base.word;
 
-		context->shadow_root_level = level;
-
 		context->page_fault = ept_page_fault;
 		context->gva_to_gpa = ept_gva_to_gpa;
 		context->sync_page = ept_sync_page;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c18ad86c9a82..ab8c09db4c19 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1697,7 +1697,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
 	gfn_t gfn = addr >> PAGE_SHIFT;
 	int leaf = -1;
 
-	*root_level = vcpu->arch.mmu->shadow_root_level;
+	*root_level = vcpu->arch.mmu->root_role.level;
 
 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
 		leaf = iter.level;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7038c76fa841..d169221d9305 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3863,7 +3863,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 		hv_track_root_tdp(vcpu, root_hpa);
 
 		cr3 = vcpu->arch.cr3;
-	} else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+	} else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) {
 		cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
 	} else {
 		/* PCID in the guest should be impossible with a 32-bit MMU. */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b183dfc41d74..00a79b82fa46 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2960,7 +2960,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 
 	if (enable_ept)
 		ept_sync_context(construct_eptp(vcpu, root_hpa,
-						mmu->shadow_root_level));
+						mmu->root_role.level));
 	else
 		vpid_sync_context(vmx_get_current_vpid(vcpu));
 }
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 22/25] KVM: x86/mmu: replace root_level with cpu_mode.base.level
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (20 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 21/25] KVM: x86/mmu: replace shadow_root_level with root_role.level Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 19:49   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 23/25] KVM: x86/mmu: replace direct_map with root_role.direct Paolo Bonzini
                   ` (2 subsequent siblings)
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

Remove another duplicate field of struct kvm_mmu.  This time it's
the root level for page table walking; the separate field is
always initialized as cpu_mode.base.level, so its users can look
up the CPU mode directly instead.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/mmu/mmu.c          | 18 +++++++-----------
 arch/x86/kvm/mmu/paging_tmpl.h  |  4 ++--
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 81897aa4e669..ec89b1a488c5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -431,7 +431,6 @@ struct kvm_mmu {
 	struct kvm_mmu_root_info root;
 	union kvm_mmu_paging_mode cpu_mode;
 	union kvm_mmu_page_role root_role;
-	u8 root_level;
 	bool direct_map;
 	struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c33879f23e94..0c88d4206715 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2130,7 +2130,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 	iterator->level = vcpu->arch.mmu->root_role.level;
 
 	if (iterator->level >= PT64_ROOT_4LEVEL &&
-	    vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
+	    vcpu->arch.mmu->cpu_mode.base.level < PT64_ROOT_4LEVEL &&
 	    !vcpu->arch.mmu->direct_map)
 		iterator->level = PT32E_ROOT_LEVEL;
 
@@ -3440,7 +3440,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	 * On SVM, reading PDPTRs might access guest memory, which might fault
 	 * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
 	 */
-	if (mmu->root_level == PT32E_ROOT_LEVEL) {
+	if (mmu->cpu_mode.base.level == PT32E_ROOT_LEVEL) {
 		for (i = 0; i < 4; ++i) {
 			pdptrs[i] = mmu->get_pdptr(vcpu, i);
 			if (!(pdptrs[i] & PT_PRESENT_MASK))
@@ -3464,7 +3464,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	 * Do we shadow a long mode page table? If so we need to
 	 * write-protect the guests page table root.
 	 */
-	if (mmu->root_level >= PT64_ROOT_4LEVEL) {
+	if (mmu->cpu_mode.base.level >= PT64_ROOT_4LEVEL) {
 		root = mmu_alloc_root(vcpu, root_gfn, 0,
 				      mmu->root_role.level, false);
 		mmu->root.hpa = root;
@@ -3503,7 +3503,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	for (i = 0; i < 4; ++i) {
 		WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
 
-		if (mmu->root_level == PT32E_ROOT_LEVEL) {
+		if (mmu->cpu_mode.base.level == PT32E_ROOT_LEVEL) {
 			if (!(pdptrs[i] & PT_PRESENT_MASK)) {
 				mmu->pae_root[i] = INVALID_PAE_ROOT;
 				continue;
@@ -3545,7 +3545,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 	 * equivalent level in the guest's NPT to shadow.  Allocate the tables
 	 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
 	 */
-	if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
+	if (mmu->direct_map || mmu->cpu_mode.base.level >= PT64_ROOT_4LEVEL ||
 	    mmu->root_role.level < PT64_ROOT_4LEVEL)
 		return 0;
 
@@ -3642,7 +3642,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 
 	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
 
-	if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+	if (vcpu->arch.mmu->cpu_mode.base.level >= PT64_ROOT_4LEVEL) {
 		hpa_t root = vcpu->arch.mmu->root.hpa;
 		sp = to_shadow_page(root);
 
@@ -4348,7 +4348,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 {
 	__reset_rsvds_bits_mask(&context->guest_rsvd_check,
 				vcpu->arch.reserved_gpa_bits,
-				context->root_level, is_efer_nx(context),
+				context->cpu_mode.base.level, is_efer_nx(context),
 				guest_can_use_gbpages(vcpu),
 				is_cr4_pse(context),
 				guest_cpuid_is_amd_or_hygon(vcpu));
@@ -4752,7 +4752,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode cp
 	context->get_guest_pgd = kvm_get_guest_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
-	context->root_level = cpu_mode.base.level;
 
 	if (!is_cr0_pg(context))
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -4782,7 +4781,6 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 		paging64_init_context(context);
 	else
 		paging32_init_context(context);
-	context->root_level = cpu_mode.base.level;
 
 	reset_guest_paging_metadata(vcpu, context);
 	reset_shadow_zero_bits_mask(vcpu, context);
@@ -4874,7 +4872,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		context->gva_to_gpa = ept_gva_to_gpa;
 		context->sync_page = ept_sync_page;
 		context->invlpg = ept_invlpg;
-		context->root_level = level;
 		context->direct_map = false;
 		update_permission_bitmask(context, true);
 		context->pkru_mask = 0;
@@ -4909,7 +4906,6 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode
 	g_context->get_guest_pgd     = kvm_get_guest_cr3;
 	g_context->get_pdptr         = kvm_pdptr_read;
 	g_context->inject_page_fault = kvm_inject_page_fault;
-	g_context->root_level        = new_mode.base.level;
 
 	/*
 	 * L2 page tables are never shadowed, so there is no need to sync
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 7c0fa115bd56..bdfd38b0f8e6 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -361,7 +361,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 
 	trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
-	walker->level = mmu->root_level;
+	walker->level = mmu->cpu_mode.base.level;
 	pte           = kvm_mmu_get_guest_pgd(vcpu, mmu);
 	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
 
@@ -656,7 +656,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	WARN_ON_ONCE(gw->gfn != base_gfn);
 	direct_access = gw->pte_access;
 
-	top_level = vcpu->arch.mmu->root_level;
+	top_level = vcpu->arch.mmu->cpu_mode.base.level;
 	if (top_level == PT32E_ROOT_LEVEL)
 		top_level = PT32_ROOT_LEVEL;
 	/*
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 23/25] KVM: x86/mmu: replace direct_map with root_role.direct
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (21 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 22/25] KVM: x86/mmu: replace root_level with cpu_mode.base.level Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 19:52   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 24/25] KVM: x86/mmu: initialize constant-value fields just once Paolo Bonzini
  2022-02-21 16:22 ` [PATCH v2 25/25] KVM: x86/mmu: extract initialization of the page walking data Paolo Bonzini
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

direct_map is always equal to the direct field of the root page's role:

- for shadow paging, direct_map is true if CR0.PG=0, and root_role.direct is
copied from cpu_mode.base.direct, which is likewise set exactly when
CR0.PG=0

- for TDP, it is always true and root_role.direct is also always true

- for shadow EPT, it is always false and root_role.direct is also always
false

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/mmu/mmu.c          | 27 ++++++++++++---------------
 arch/x86/kvm/x86.c              | 12 ++++++------
 3 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ec89b1a488c5..af90d0653139 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -431,7 +431,6 @@ struct kvm_mmu {
 	struct kvm_mmu_root_info root;
 	union kvm_mmu_paging_mode cpu_mode;
 	union kvm_mmu_page_role root_role;
-	bool direct_map;
 	struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
 
 	/*
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0c88d4206715..8eb2c0373309 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2029,7 +2029,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     int direct,
 					     unsigned int access)
 {
-	bool direct_mmu = vcpu->arch.mmu->direct_map;
+	bool direct_mmu = vcpu->arch.mmu->root_role.direct;
 	union kvm_mmu_page_role role;
 	struct hlist_head *sp_list;
 	unsigned quadrant;
@@ -2131,7 +2131,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 
 	if (iterator->level >= PT64_ROOT_4LEVEL &&
 	    vcpu->arch.mmu->cpu_mode.base.level < PT64_ROOT_4LEVEL &&
-	    !vcpu->arch.mmu->direct_map)
+	    !vcpu->arch.mmu->root_role.direct)
 		iterator->level = PT32E_ROOT_LEVEL;
 
 	if (iterator->level == PT32E_ROOT_LEVEL) {
@@ -2507,7 +2507,7 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 	gpa_t gpa;
 	int r;
 
-	if (vcpu->arch.mmu->direct_map)
+	if (vcpu->arch.mmu->root_role.direct)
 		return 0;
 
 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -3545,7 +3545,8 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 	 * equivalent level in the guest's NPT to shadow.  Allocate the tables
 	 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
 	 */
-	if (mmu->direct_map || mmu->cpu_mode.base.level >= PT64_ROOT_4LEVEL ||
+	if (mmu->root_role.direct ||
+	    mmu->cpu_mode.base.level >= PT64_ROOT_4LEVEL ||
 	    mmu->root_role.level < PT64_ROOT_4LEVEL)
 		return 0;
 
@@ -3634,7 +3635,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	int i;
 	struct kvm_mmu_page *sp;
 
-	if (vcpu->arch.mmu->direct_map)
+	if (vcpu->arch.mmu->root_role.direct)
 		return;
 
 	if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
@@ -3854,7 +3855,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 
 	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
 	arch.gfn = gfn;
-	arch.direct_map = mmu->direct_map;
+	arch.direct_map = mmu->root_role.direct;
 	arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, mmu);
 
 	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
@@ -4072,7 +4073,6 @@ static void nonpaging_init_context(struct kvm_mmu *context)
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = NULL;
-	context->direct_map = true;
 }
 
 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
@@ -4654,7 +4654,6 @@ static void paging64_init_context(struct kvm_mmu *context)
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->sync_page = paging64_sync_page;
 	context->invlpg = paging64_invlpg;
-	context->direct_map = false;
 }
 
 static void paging32_init_context(struct kvm_mmu *context)
@@ -4663,7 +4662,6 @@ static void paging32_init_context(struct kvm_mmu *context)
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->sync_page = paging32_sync_page;
 	context->invlpg = paging32_invlpg;
-	context->direct_map = false;
 }
 
 static union kvm_mmu_paging_mode
@@ -4748,7 +4746,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode cp
 	context->page_fault = kvm_tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = NULL;
-	context->direct_map = true;
 	context->get_guest_pgd = kvm_get_guest_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
@@ -4872,7 +4869,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		context->gva_to_gpa = ept_gva_to_gpa;
 		context->sync_page = ept_sync_page;
 		context->invlpg = ept_invlpg;
-		context->direct_map = false;
+
 		update_permission_bitmask(context, true);
 		context->pkru_mask = 0;
 		reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
@@ -4987,13 +4984,13 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
 	int r;
 
-	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
+	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
 	if (r)
 		goto out;
 	r = mmu_alloc_special_roots(vcpu);
 	if (r)
 		goto out;
-	if (vcpu->arch.mmu->direct_map)
+	if (vcpu->arch.mmu->root_role.direct)
 		r = mmu_alloc_direct_roots(vcpu);
 	else
 		r = mmu_alloc_shadow_roots(vcpu);
@@ -5197,7 +5194,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
 		       void *insn, int insn_len)
 {
 	int r, emulation_type = EMULTYPE_PF;
-	bool direct = vcpu->arch.mmu->direct_map;
+	bool direct = vcpu->arch.mmu->root_role.direct;
 
 	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
 		return RET_PF_RETRY;
@@ -5228,7 +5225,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
 	 * paging in both guests. If true, we simply unprotect the page
 	 * and resume the guest.
 	 */
-	if (vcpu->arch.mmu->direct_map &&
+	if (vcpu->arch.mmu->root_role.direct &&
 	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
 		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
 		return 1;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1546a25a9307..53730e81ceb5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8016,7 +8016,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	    WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
 		return false;
 
-	if (!vcpu->arch.mmu->direct_map) {
+	if (!vcpu->arch.mmu->root_role.direct) {
 		/*
 		 * Write permission should be allowed since only
 		 * write access need to be emulated.
@@ -8049,7 +8049,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	kvm_release_pfn_clean(pfn);
 
 	/* The instructions are well-emulated on direct mmu. */
-	if (vcpu->arch.mmu->direct_map) {
+	if (vcpu->arch.mmu->root_role.direct) {
 		unsigned int indirect_shadow_pages;
 
 		write_lock(&vcpu->kvm->mmu_lock);
@@ -8117,7 +8117,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 	vcpu->arch.last_retry_eip = ctxt->eip;
 	vcpu->arch.last_retry_addr = cr2_or_gpa;
 
-	if (!vcpu->arch.mmu->direct_map)
+	if (!vcpu->arch.mmu->root_role.direct)
 		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
 
 	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -8397,7 +8397,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		ctxt->exception.address = cr2_or_gpa;
 
 		/* With shadow page tables, cr2 contains a GVA or nGPA. */
-		if (vcpu->arch.mmu->direct_map) {
+		if (vcpu->arch.mmu->root_role.direct) {
 			ctxt->gpa_available = true;
 			ctxt->gpa_val = cr2_or_gpa;
 		}
@@ -12198,7 +12198,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 {
 	int r;
 
-	if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
+	if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
 	      work->wakeup_all)
 		return;
 
@@ -12206,7 +12206,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 	if (unlikely(r))
 		return;
 
-	if (!vcpu->arch.mmu->direct_map &&
+	if (!vcpu->arch.mmu->root_role.direct &&
 	      work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
 		return;
 
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 24/25] KVM: x86/mmu: initialize constant-value fields just once
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (22 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 23/25] KVM: x86/mmu: replace direct_map with root_role.direct Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 20:58   ` Sean Christopherson
  2022-02-21 16:22 ` [PATCH v2 25/25] KVM: x86/mmu: extract initialization of the page walking data Paolo Bonzini
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

The get_guest_pgd, get_pdptr and inject_page_fault pointers are constant
for all three of root_mmu, guest_mmu and nested_mmu, so they can be
initialized just once at vCPU creation.  The guest_mmu function pointers
depend on the processor vendor and need to be retrieved from three new
nested_ops; the others are exactly the same.

Opportunistically stop initializing get_pdptr for nested EPT, since it does
not have PDPTRs.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 +++
 arch/x86/kvm/mmu/mmu.c          | 65 +++++++++++++++++----------------
 arch/x86/kvm/svm/nested.c       |  9 +++--
 arch/x86/kvm/vmx/nested.c       |  5 +--
 4 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af90d0653139..b70965235c31 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1503,6 +1503,11 @@ struct kvm_x86_nested_ops {
 	uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
 	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
 				  struct x86_exception *fault);
+	void (*inject_nested_tdp_vmexit)(struct kvm_vcpu *vcpu,
+					 struct x86_exception *fault);
+
+	unsigned long (*get_nested_pgd)(struct kvm_vcpu *vcpu);
+	u64 (*get_nested_pdptr)(struct kvm_vcpu *vcpu, int index);
 };
 
 struct kvm_x86_init_ops {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8eb2c0373309..27cb6ba5a3b0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4743,12 +4743,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode cp
 
 	context->cpu_mode.as_u64 = cpu_mode.as_u64;
 	context->root_role.word = root_role.word;
-	context->page_fault = kvm_tdp_page_fault;
-	context->sync_page = nonpaging_sync_page;
-	context->invlpg = NULL;
-	context->get_guest_pgd = kvm_get_guest_cr3;
-	context->get_pdptr = kvm_pdptr_read;
-	context->inject_page_fault = kvm_inject_page_fault;
 
 	if (!is_cr0_pg(context))
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -4758,7 +4752,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode cp
 		context->gva_to_gpa = paging32_gva_to_gpa;
 
 	reset_guest_paging_metadata(vcpu, context);
-	reset_tdp_shadow_zero_bits_mask(context);
 }
 
 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
@@ -4783,8 +4776,8 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
 
-static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
-				union kvm_mmu_paging_mode cpu_mode)
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
+			     union kvm_mmu_paging_mode cpu_mode)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_page_role root_role;
@@ -4880,18 +4873,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
-static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
-			     union kvm_mmu_paging_mode cpu_mode)
-{
-	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-
-	kvm_init_shadow_mmu(vcpu, cpu_mode);
-
-	context->get_guest_pgd	   = kvm_get_guest_cr3;
-	context->get_pdptr         = kvm_pdptr_read;
-	context->inject_page_fault = kvm_inject_page_fault_shadow;
-}
-
 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode new_mode)
 {
 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
@@ -4899,16 +4880,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode
 	if (new_mode.as_u64 == g_context->cpu_mode.as_u64)
 		return;
 
-	g_context->cpu_mode.as_u64   = new_mode.as_u64;
-	g_context->get_guest_pgd     = kvm_get_guest_cr3;
-	g_context->get_pdptr         = kvm_pdptr_read;
-	g_context->inject_page_fault = kvm_inject_page_fault;
-
-	/*
-	 * L2 page tables are never shadowed, so there is no need to sync
-	 * SPTEs.
-	 */
-	g_context->invlpg            = NULL;
+	g_context->cpu_mode.as_u64 = new_mode.as_u64;
 
 	/*
 	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
@@ -5477,6 +5449,37 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
 
+	vcpu->arch.root_mmu.get_guest_pgd = kvm_get_guest_cr3;
+	vcpu->arch.root_mmu.get_pdptr = kvm_pdptr_read;
+
+	if (tdp_enabled) {
+		vcpu->arch.root_mmu.inject_page_fault = kvm_inject_page_fault;
+		vcpu->arch.root_mmu.page_fault = kvm_tdp_page_fault;
+		vcpu->arch.root_mmu.sync_page = nonpaging_sync_page;
+		vcpu->arch.root_mmu.invlpg = NULL;
+		reset_tdp_shadow_zero_bits_mask(&vcpu->arch.root_mmu);
+
+		vcpu->arch.guest_mmu.get_guest_pgd = kvm_x86_ops.nested_ops->get_nested_pgd;
+		vcpu->arch.guest_mmu.get_pdptr = kvm_x86_ops.nested_ops->get_nested_pdptr;
+		vcpu->arch.guest_mmu.inject_page_fault = kvm_x86_ops.nested_ops->inject_nested_tdp_vmexit;
+	} else {
+		vcpu->arch.root_mmu.inject_page_fault = kvm_inject_page_fault_shadow;
+		/*
+		 * page_fault, sync_page, invlpg are set at runtime depending
+		 * on the guest paging mode.
+		 */
+	}
+
+	vcpu->arch.nested_mmu.get_guest_pgd     = kvm_get_guest_cr3;
+	vcpu->arch.nested_mmu.get_pdptr         = kvm_pdptr_read;
+	vcpu->arch.nested_mmu.inject_page_fault = kvm_inject_page_fault;
+
+	/*
+	 * L2 page tables are never shadowed, so there is no need to sync
+	 * SPTEs.
+	 */
+	vcpu->arch.nested_mmu.invlpg = NULL;
+
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index ff58c9ebc552..713c7531de99 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -109,10 +109,8 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 	kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
 				svm->vmcb01.ptr->save.efer,
 				svm->nested.ctl.nested_cr3);
-	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
-	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
-	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
-	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+
+	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
 }
 
 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
@@ -1569,4 +1567,7 @@ struct kvm_x86_nested_ops svm_nested_ops = {
 	.get_state = svm_get_nested_state,
 	.set_state = svm_set_nested_state,
 	.inject_page_fault = svm_inject_page_fault_nested,
+	.inject_nested_tdp_vmexit = nested_svm_inject_npf_exit,
+	.get_nested_pgd = nested_svm_get_tdp_cr3,
+	.get_nested_pdptr = nested_svm_get_tdp_pdptr,
 };
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 564c60566da7..02df0f4fccef 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -414,9 +414,6 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
 	nested_ept_new_eptp(vcpu);
-	vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
-	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
-	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
 
 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 }
@@ -6805,4 +6802,6 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
 	.enable_evmcs = nested_enable_evmcs,
 	.get_evmcs_version = nested_get_evmcs_version,
 	.inject_page_fault = vmx_inject_page_fault_nested,
+	.inject_nested_tdp_vmexit = nested_ept_inject_page_fault,
+	.get_nested_pgd = nested_ept_get_eptp,
 };
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH v2 25/25] KVM: x86/mmu: extract initialization of the page walking data
  2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
                   ` (23 preceding siblings ...)
  2022-02-21 16:22 ` [PATCH v2 24/25] KVM: x86/mmu: initialize constant-value fields just once Paolo Bonzini
@ 2022-02-21 16:22 ` Paolo Bonzini
  2022-03-08 20:02   ` Sean Christopherson
  24 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-02-21 16:22 UTC (permalink / raw)
  To: linux-kernel, kvm; +Cc: dmatlack, seanjc

struct kvm_mmu consists logically of two parts: a page walker that
operates based on the CPU mode, and the shadow page table builder
that operates based on the MMU role; the latter does not exist
on vcpu->arch.nested_mmu.

The callbacks are also logically separated; of those that are not
constant, gva_to_gpa belongs to the page walker and everything else
belongs to the shadow page table builder.  This is visible in the
duplicated code to initialize gva_to_gpa in *_init_context (for
shadow paging and nested NPT), in init_kvm_tdp_mmu (for non-nested
TDP), and in init_kvm_nested_mmu.  The guest paging metadata also
belongs to the page walker and is duplicated in the same way.

Extract this duplicated code to a new function.  The new function
is basically the same as init_kvm_nested_mmu, since the nested MMU
has only the page walker part.  The only difference is that it
uses the CPU mode rather than the VCPU directly, which is more
in line with the rest of the MMU code.

Shadow EPT does not use the new function, since it has its own
gva_to_gpa callback and a different set of reserved bits.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 87 ++++++++++++++----------------------------
 1 file changed, 28 insertions(+), 59 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 27cb6ba5a3b0..659f014190d2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4070,7 +4070,6 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 static void nonpaging_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = nonpaging_page_fault;
-	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = NULL;
 }
@@ -4651,7 +4650,6 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
 static void paging64_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = paging64_page_fault;
-	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->sync_page = paging64_sync_page;
 	context->invlpg = paging64_invlpg;
 }
@@ -4659,7 +4657,6 @@ static void paging64_init_context(struct kvm_mmu *context)
 static void paging32_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = paging32_page_fault;
-	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->sync_page = paging32_sync_page;
 	context->invlpg = paging32_invlpg;
 }
@@ -4700,6 +4697,24 @@ kvm_calc_cpu_mode(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
 	return role;
 }
 
+static void kvm_vcpu_init_walker(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu *mmu,
+				 union kvm_mmu_paging_mode new_mode)
+{
+	if (new_mode.as_u64 == mmu->cpu_mode.as_u64)
+		return;
+
+	mmu->cpu_mode.as_u64 = new_mode.as_u64;
+	if (!is_cr0_pg(mmu))
+		mmu->gva_to_gpa = nonpaging_gva_to_gpa;
+	else if (is_cr4_pae(mmu))
+		mmu->gva_to_gpa = paging64_gva_to_gpa;
+	else
+		mmu->gva_to_gpa = paging32_gva_to_gpa;
+
+	reset_guest_paging_metadata(vcpu, mmu);
+}
+
 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 {
 	/* tdp_root_level is architecture forced level, use it if nonzero */
@@ -4735,36 +4750,17 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode cpu_mode)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_mode);
 
-	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
-	    root_role.word == context->root_role.word)
-		return;
-
-	context->cpu_mode.as_u64 = cpu_mode.as_u64;
-	context->root_role.word = root_role.word;
-
-	if (!is_cr0_pg(context))
-		context->gva_to_gpa = nonpaging_gva_to_gpa;
-	else if (is_cr4_pae(context))
-		context->gva_to_gpa = paging64_gva_to_gpa;
-	else
-		context->gva_to_gpa = paging32_gva_to_gpa;
-
-	reset_guest_paging_metadata(vcpu, context);
+	context->root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_mode);
 }
 
 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
-				    union kvm_mmu_paging_mode cpu_mode,
 				    union kvm_mmu_page_role root_role)
 {
-	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
-	    root_role.word == context->root_role.word)
+	if (root_role.word == context->root_role.word)
 		return;
 
-	context->cpu_mode.as_u64 = cpu_mode.as_u64;
 	context->root_role.word = root_role.word;
-
 	if (!is_cr0_pg(context))
 		nonpaging_init_context(context);
 	else if (is_cr4_pae(context))
@@ -4772,7 +4768,6 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 	else
 		paging32_init_context(context);
 
-	reset_guest_paging_metadata(vcpu, context);
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
 
@@ -4795,8 +4790,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
 	 * MMU contexts.
 	 */
 	root_role.efer_nx = true;
-
-	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
+	shadow_mmu_init_context(vcpu, context, root_role);
 }
 
 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
@@ -4811,10 +4805,12 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
 	union kvm_mmu_page_role root_role;
 
+	kvm_vcpu_init_walker(vcpu, context, cpu_mode);
+
 	root_role = cpu_mode.base;
 	root_role.level = kvm_mmu_get_tdp_level(vcpu);
 
-	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
+	shadow_mmu_init_context(vcpu, context, root_role);
 	kvm_mmu_new_pgd(vcpu, nested_cr3);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
@@ -4873,43 +4869,16 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
-static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode new_mode)
-{
-	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
-
-	if (new_mode.as_u64 == g_context->cpu_mode.as_u64)
-		return;
-
-	g_context->cpu_mode.as_u64 = new_mode.as_u64;
-
-	/*
-	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
-	 * L1's nested page tables (e.g. EPT12). The nested translation
-	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
-	 * L2's page tables as the first level of translation and L1's
-	 * nested page tables as the second level of translation. Basically
-	 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
-	 */
-	if (!is_paging(vcpu))
-		g_context->gva_to_gpa = nonpaging_gva_to_gpa;
-	else if (is_long_mode(vcpu))
-		g_context->gva_to_gpa = paging64_gva_to_gpa;
-	else if (is_pae(vcpu))
-		g_context->gva_to_gpa = paging64_gva_to_gpa;
-	else
-		g_context->gva_to_gpa = paging32_gva_to_gpa;
-
-	reset_guest_paging_metadata(vcpu, g_context);
-}
-
 void kvm_init_mmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
 	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
 
+	kvm_vcpu_init_walker(vcpu, vcpu->arch.walk_mmu, cpu_mode);
 	if (mmu_is_nested(vcpu))
-		init_kvm_nested_mmu(vcpu, cpu_mode);
-	else if (tdp_enabled)
+		return;
+
+	if (tdp_enabled)
 		init_kvm_tdp_mmu(vcpu, cpu_mode);
 	else
 		init_kvm_softmmu(vcpu, cpu_mode);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3
  2022-02-21 16:22 ` [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3 Paolo Bonzini
@ 2022-03-08 16:16   ` Sean Christopherson
  2022-03-08 16:21     ` Paolo Bonzini
  0 siblings, 1 reply; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 16:16 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> Most of the time, calls to get_guest_pgd result in calling kvm_read_cr3
> (the exception is only nested TDP).  Check if that is the case if
> retpolines are enabled, thus avoiding an expensive indirect call.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu.h             | 10 ++++++++++
>  arch/x86/kvm/mmu/mmu.c         | 15 ++++++++-------
>  arch/x86/kvm/mmu/paging_tmpl.h |  2 +-
>  arch/x86/kvm/x86.c             |  2 +-
>  4 files changed, 20 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 1d0c1904d69a..6ee4436e46f1 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -116,6 +116,16 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
>  					  vcpu->arch.mmu->shadow_root_level);
>  }
>  
> +extern unsigned long kvm_get_guest_cr3(struct kvm_vcpu *vcpu);

No extern please, it's superfluous and against KVM style.  Moot point though, see
below.

> +static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)

Wrap the params, no reason to make this line so long.

> +{
> +#ifdef CONFIG_RETPOLINE
> +	if (mmu->get_guest_pgd == kvm_get_guest_cr3)
> +		return kvm_read_cr3(vcpu);

This is unnecessarily fragile and confusing at first glance.  Compilers are smart
enough to generate a non-inline version of functions if they're used for function
pointers, while still inlining where appropriate.  In other words, just drop
kvm_get_guest_cr3() entirely, a la get_pdptr => kvm_pdptr_read().

---
 arch/x86/kvm/mmu.h     |  6 +++---
 arch/x86/kvm/mmu/mmu.c | 11 +++--------
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3af66b9df640..50528d39de8d 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -117,11 +117,11 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
 					  vcpu->arch.mmu->shadow_root_level);
 }

-extern unsigned long kvm_get_guest_cr3(struct kvm_vcpu *vcpu);
-static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
+						  struct kvm_mmu *mmu)
 {
 #ifdef CONFIG_RETPOLINE
-	if (mmu->get_guest_pgd == kvm_get_guest_cr3)
+	if (mmu->get_guest_pgd == kvm_read_cr3)
 		return kvm_read_cr3(vcpu);
 #endif
 	return mmu->get_guest_pgd(vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 995c3450c20f..cc2414397e4b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4234,11 +4234,6 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);

-unsigned long kvm_get_guest_cr3(struct kvm_vcpu *vcpu)
-{
-	return kvm_read_cr3(vcpu);
-}
-
 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 			   unsigned int access)
 {
@@ -4793,7 +4788,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->invlpg = NULL;
 	context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
 	context->direct_map = true;
-	context->get_guest_pgd = kvm_get_guest_cr3;
+	context->get_guest_pgd = kvm_read_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
 	context->root_level = role_regs_to_root_level(&regs);
@@ -4968,7 +4963,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)

 	kvm_init_shadow_mmu(vcpu, &regs);

-	context->get_guest_pgd	   = kvm_get_guest_cr3;
+	context->get_guest_pgd	   = kvm_read_cr3;
 	context->get_pdptr         = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
 }
@@ -5000,7 +4995,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 		return;

 	g_context->mmu_role.as_u64 = new_role.as_u64;
-	g_context->get_guest_pgd     = kvm_get_guest_cr3;
+	g_context->get_guest_pgd     = kvm_read_cr3;
 	g_context->get_pdptr         = kvm_pdptr_read;
 	g_context->inject_page_fault = kvm_inject_page_fault;
 	g_context->root_level        = new_role.base.level;

base-commit: c31df3e63672c14d8b52e34606c823e2166024b8
--

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 02/25] KVM: x86/mmu: nested EPT cannot be used in SMM
  2022-02-21 16:22 ` [PATCH v2 02/25] KVM: x86/mmu: nested EPT cannot be used in SMM Paolo Bonzini
@ 2022-03-08 16:18   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 16:18 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> The role.base.smm flag is always zero when setting up shadow EPT,
> do not bother copying it over from vcpu->arch.root_mmu.
> 
> Reviewed-by: David Matlack <dmatlack@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---

Reviewed-by: Sean Christopherson <seanjc@google.com>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3
  2022-03-08 16:16   ` Sean Christopherson
@ 2022-03-08 16:21     ` Paolo Bonzini
  2022-03-08 16:32       ` Sean Christopherson
  0 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-08 16:21 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/8/22 17:16, Sean Christopherson wrote:
> 
>> +static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
> Wrap the params, no reason to make this line so long.
> 
>> +{
>> +#ifdef CONFIG_RETPOLINE
>> +	if (mmu->get_guest_pgd == kvm_get_guest_cr3)
>> +		return kvm_read_cr3(vcpu);
> This is unnecessarily fragile and confusing at first glance.  Compilers are smart
> enough to generate a non-inline version of functions if they're used for function
> pointers, while still inlining where appropriate.  In other words, just drop
> kvm_get_guest_cr3() entirely, a al get_pdptr => kvm_pdptr_read().

Unfortunately this isn't entirely true.  The function pointer will not 
match between compilation units, in this case between the one that calls 
kvm_mmu_get_guest_pgd and the one that assigned kvm_read_cr3 to the 
function pointer.
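
kvm_read_cr3() is a static inline defined in a header, so every translation
unit that takes its address gets its own internal copy.  A minimal
standalone sketch of the effect, using hypothetical files rather than
actual KVM code:

/* header.h */
static inline unsigned long read_val(void) { return 42; }

/* a.c: stores this translation unit's copy of read_val */
#include "header.h"
unsigned long (*stored)(void) = read_val;

/* b.c: compares against *its own* copy of read_val */
#include "header.h"
extern unsigned long (*stored)(void);
int matches(void) { return stored == read_val; }  /* can be 0 */

The same mismatch is why a mmu->get_guest_pgd == kvm_read_cr3 check
compiled into one file would not reliably see the value assigned in
another.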

Paolo


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 03/25] KVM: x86/mmu: constify uses of struct kvm_mmu_role_regs
  2022-02-21 16:22 ` [PATCH v2 03/25] KVM: x86/mmu: constify uses of struct kvm_mmu_role_regs Paolo Bonzini
@ 2022-03-08 16:22   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 16:22 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> struct kvm_mmu_role_regs is computed just once and then accessed.  Use
> const to make this clearer, even though the const fields of struct
> kvm_mmu_role_regs already prevent modifications to the contents of the
> struct, or rather make them harder.
> 
> Reviewed-by: David Matlack <dmatlack@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu/mmu.c | 25 ++++++++++++++-----------
>  1 file changed, 14 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 4f9bbd02fb8b..97566ac539e3 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -197,7 +197,7 @@ struct kvm_mmu_role_regs {
>   * the single source of truth for the MMU's state.
>   */
>  #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
> -static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
> +static inline bool __maybe_unused ____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)\

This is one of the very rare times I think it's worth splitting the prototype;
it's not like the function name is grep-friendly anyways.  Either way,

Reviewed-by: Sean Christopherson <seanjc@google.com>

static inline bool __maybe_unused					\
____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)		\
{									\
	return !!(regs->reg & flag);					\
}

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3
  2022-03-08 16:21     ` Paolo Bonzini
@ 2022-03-08 16:32       ` Sean Christopherson
  2022-03-08 16:43         ` Paolo Bonzini
  0 siblings, 1 reply; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 16:32 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Tue, Mar 08, 2022, Paolo Bonzini wrote:
> On 3/8/22 17:16, Sean Christopherson wrote:
> > 
> > > +static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
> > Wrap the params, no reason to make this line so long.
> > 
> > > +{
> > > +#ifdef CONFIG_RETPOLINE
> > > +	if (mmu->get_guest_pgd == kvm_get_guest_cr3)
> > > +		return kvm_read_cr3(vcpu);
> > This is unnecessarily fragile and confusing at first glance.  Compilers are smart
> > enough to generate a non-inline version of functions if they're used for function
> > pointers, while still inlining where appropriate.  In other words, just drop
> > kvm_get_guest_cr3() entirely, a al get_pdptr => kvm_pdptr_read().
> 
> Unfortunately this isn't entirely true.  The function pointer will not match
> between compilation units, in this case between the one that calls
> kvm_mmu_get_guest_pgd and the one that assigned kvm_read_cr3 to the function
> pointer.

Ooh, that's a nasty gotcha.  And that's why your v1 used a NULL entry as a sentinel
for rerouting to kvm_read_cr3().  Hrm, I'm torn between disliking the NULL behavior
and disliking the subtle redirect :-)

Aha!  An idea that would provide line of sight to avoiding retpoline in all cases
once we use static_call() for nested_ops, which I really want to do...  Drop the
mmu hook entirely and replace it with:

static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu)
{
	if (!mmu_is_nested(vcpu))
		return kvm_read_cr3(vcpu);
	else
		return kvm_x86_ops.nested_ops->get_guest_pgd(vcpu);
}

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 05/25] KVM: x86/mmu: rephrase unclear comment
  2022-02-21 16:22 ` [PATCH v2 05/25] KVM: x86/mmu: rephrase unclear comment Paolo Bonzini
@ 2022-03-08 16:39   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 16:39 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> If accessed bits are not supported, there simply isn't any distinction
> between accessed and non-accessed gPTEs, so the comment does not make
> much sense.  Rephrase it in terms of what happens if accessed bits
> *are* supported.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu/paging_tmpl.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 80b4b291002a..d1d17d28e81b 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -193,7 +193,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
>  	if (!FNAME(is_present_gpte)(gpte))
>  		goto no_present;
>  
> -	/* if accessed bit is not supported prefetch non accessed gpte */
> +	/* if accessed bit is supported, prefetch only accessed gpte */

Can we just reword the whole thing?  A/D bits being disabled is the anomaly;
leading with the "if" makes the logic we really care about seem like a secondary
concern.  E.g.

	/* Prefetch only accessed entries (unless A/D bits are disabled). */

>  	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
>  	    !(gpte & PT_GUEST_ACCESSED_MASK))
>  		goto no_present;
> -- 
> 2.31.1
> 
> 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3
  2022-03-08 16:32       ` Sean Christopherson
@ 2022-03-08 16:43         ` Paolo Bonzini
  2022-03-08 16:53           ` Sean Christopherson
  0 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-08 16:43 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/8/22 17:32, Sean Christopherson wrote:
> 
> Aha!  An idea that would provide line of sight to avoiding retpoline in all cases
> once we use static_call() for nested_ops, which I really want to do...  Drop the
> mmu hook entirely and replace it with:
> 
> static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu)
> {
> 	if (!mmu_is_nested(vcpu))
> 		return kvm_read_cr3(vcpu);
> 	else
> 		return kvm_x86_ops.nested_ops->get_guest_pgd(vcpu);
> }

Makes sense, but I think you mean

static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
						  struct kvm_mmu *mmu)
{
	if (unlikely(vcpu == &vcpu->arch.guest_mmu))
		return kvm_x86_ops.nested_ops->get_guest_pgd(vcpu);
	else
		return kvm_read_cr3(vcpu);
}

?

Paolo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3
  2022-03-08 16:43         ` Paolo Bonzini
@ 2022-03-08 16:53           ` Sean Christopherson
  2022-03-08 17:14             ` Paolo Bonzini
  0 siblings, 1 reply; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 16:53 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Tue, Mar 08, 2022, Paolo Bonzini wrote:
> On 3/8/22 17:32, Sean Christopherson wrote:
> > 
> > Aha!  An idea that would provide line of sight to avoiding retpoline in all cases
> > once we use static_call() for nested_ops, which I really want to do...  Drop the
> > mmu hook entirely and replace it with:
> > 
> > static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu)
> > {
> > 	if (!mmu_is_nested(vcpu))
> > 		return kvm_read_cr3(vcpu);
> > 	else
> > 		return kvm_x86_ops.nested_ops->get_guest_pgd(vcpu);
> > }
> 
> Makes sense, but I think you mean
> 
> static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
> 						  struct kvm_mmu *mmu)
> {
> 	if (unlikely(vcpu == &vcpu->arch.guest_mmu))

Well, not that certainly :-)

	if (mmu == &vcpu->arch.guest_mmu)

But you're right, we need to be able to do kvm_read_cr3() for the actual nested_mmu.

> 		return kvm_x86_ops.nested_ops->get_guest_pgd(vcpu);
> 	else
> 		return kvm_read_cr3(vcpu);
> }
> 
> ?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 06/25] KVM: nVMX/nSVM: do not monkey-patch inject_page_fault callback
  2022-02-21 16:22 ` [PATCH v2 06/25] KVM: nVMX/nSVM: do not monkey-patch inject_page_fault callback Paolo Bonzini
@ 2022-03-08 17:13   ` Sean Christopherson
  2022-03-08 20:34     ` Sean Christopherson
  0 siblings, 1 reply; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 17:13 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> Currently, vendor code is patching the inject_page_fault and later, on
> vmexit, expecting kvm_init_mmu to restore the inject_page_fault callback.
> 
> This is brittle, as exposed by the fact that SVM KVM_SET_NESTED_STATE
> forgets to do it.  Instead, do the check at the time a page fault actually
> has to be injected.  This does incur the cost of an extra retpoline
> for nested vmexits when TDP is disabled, but is overall much cleaner.
> While at it, add a comment that explains why the different behavior
> is needed in this case.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---

If I have NAK powers, NAK NAK NAK NAK NAK :-)

Forcing a VM-Exit is a hack, e.g. it's the entire reason inject_emulated_exception()
returns a bool.  Even worse, it's confusing and misleading due to being incomplete.

The need for the hack is not unique to !tdp_enabled; the #DF can be triggered
any time L0 is intercepting #PF.  Hello, allow_smaller_maxphyaddr.

And while I think allow_smaller_maxphyaddr should be burned with fire, architecturally
it's still incomplete.  Any exception that is injected by KVM needs to be subjected
to nested interception checks, not just #PF.  E.g. a #GP while vectoring a different
fault should also be routed to L1.  KVM (mostly) gets away with special casing #PF
because that's the only common scenario where L1 wants to intercept _and fix_ a fault
that can occur while vectoring an exception.  E.g. in the #GP => #DF case, odds are
very good that L1 will inject a #DF too, but that doesn't make KVM's behavior correct.

I have a series to handle this by performing the interception checks when an exception
is queued, instead of when KVM injects the exception, and using a second kvm_queued_exception
field to track exceptions that are queued for VM-Exit (so as not to lose the injected
exception, which needs to be saved into vmc*12).  It's functional, though I haven't
tested migration (requires minor shenanigans to perform interception checks for pending
exceptions coming in from userspace).

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3
  2022-03-08 16:53           ` Sean Christopherson
@ 2022-03-08 17:14             ` Paolo Bonzini
  0 siblings, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-08 17:14 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/8/22 17:53, Sean Christopherson wrote:
>> static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
>> 						  struct kvm_mmu *mmu)
>> {
>> 	if (unlikely(vcpu == &vcpu->arch.guest_mmu))
> Well, not that certainly :-)
> 
> 	if (mmu == &vcpu->arch.guest_mmu)
> 
> But you're right, we need to be able to do kvm_read_cr3() for the actual nested_mmu.
> 

Ok, I'll drop this patch for now since the end of the series actually 
introduces the nested_ops already.  And then we can do it for both 
get_guest_pgd and get_pdptr.
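
Roughly, and only as a sketch of the direction rather than the final code,
something like:

static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
						  struct kvm_mmu *mmu)
{
	/* Only guest_mmu (nested NPT/EPT) uses a vendor-specific PGD. */
	if (mmu == &vcpu->arch.guest_mmu)
		return kvm_x86_ops.nested_ops->get_nested_pgd(vcpu);

	return kvm_read_cr3(vcpu);
}

with get_pdptr handled the same way via get_nested_pdptr.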

Paolo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 07/25] KVM: x86/mmu: remove "bool base_only" arguments
  2022-02-21 16:22 ` [PATCH v2 07/25] KVM: x86/mmu: remove "bool base_only" arguments Paolo Bonzini
@ 2022-03-08 17:15   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 17:15 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> The argument is always false now that kvm_mmu_calc_root_page_role has
> been removed.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---

Reviewed-by: Sean Christopherson <seanjc@google.com>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role
  2022-02-21 16:22 ` [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role Paolo Bonzini
@ 2022-03-08 17:36   ` Sean Christopherson
  2022-03-08 17:49     ` Paolo Bonzini
  2022-03-08 18:55   ` Sean Christopherson
  1 sibling, 1 reply; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 17:36 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> Snapshot the state of the processor registers that govern page walk into
> a new field of struct kvm_mmu.  This is a more natural representation
> than having it *mostly* in mmu_role but not exclusively; the delta
> right now is represented in other fields, such as root_level.
> 
> The nested MMU now has only the CPU mode; and in fact the new function
> kvm_calc_cpu_mode is analogous to the previous kvm_calc_nested_mmu_role,
> except that it has role.base.direct equal to CR0.PG.  It is not clear
> what the code meant by "setting role.base.direct to true to detect bogus
> usage of the nested MMU".

The idea was to trigger fireworks due to an incoherent state (e.g. direct mmu_role with
non-direct hooks) if the nested_mmu was ever used as a "real" MMU (handling faults,
installing SPs/SPTEs, etc...).  For a walk-only MMU, "direct" has no meaning and so
rather than arbitrarily leave it '0', I arbitrarily set it to '1'.

Maybe this?

  The nested MMU now has only the CPU mode; and in fact the new function
  kvm_calc_cpu_mode is analogous to the previous kvm_calc_nested_mmu_role,
  except that it has role.base.direct equal to CR0.PG.  Having "direct"
  track CR0.PG has the serendipitous side effect of being an even better
  sentinel than arbitrarily setting direct to true for the nested MMU, as
  KVM will run afoul of sanity checks for both direct and indirect MMUs if
  KVM attempts to use the nested MMU as a "real" MMU, e.g. for page faults.
 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   1 +
>  arch/x86/kvm/mmu/mmu.c          | 107 ++++++++++++++++++++------------
>  arch/x86/kvm/mmu/paging_tmpl.h  |   2 +-
>  3 files changed, 68 insertions(+), 42 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 92855d3984a7..cc268116eb3f 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -433,6 +433,7 @@ struct kvm_mmu {
>  			 struct kvm_mmu_page *sp);
>  	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
>  	struct kvm_mmu_root_info root;
> +	union kvm_mmu_role cpu_mode;
>  	union kvm_mmu_role mmu_role;
>  	u8 root_level;
>  	u8 shadow_root_level;
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 7c835253a330..1af898f0cf87 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -221,7 +221,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
>  #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)		\
>  static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
>  {								\
> -	return !!(mmu->mmu_role. base_or_ext . reg##_##name);	\
> +	return !!(mmu->cpu_mode. base_or_ext . reg##_##name);	\
>  }
>  BUILD_MMU_ROLE_ACCESSOR(ext,  cr0, pg);
>  BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
> @@ -4680,6 +4680,39 @@ static void paging32_init_context(struct kvm_mmu *context)
>  	context->direct_map = false;
>  }
>  
> +static union kvm_mmu_role
> +kvm_calc_cpu_mode(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)

I strongly prefer we avoid putting the return type on a different line unless
absolutely "necessary".

static union kvm_mmu_role kvm_calc_cpu_mode(struct kvm_vcpu *vcpu,
					    const struct kvm_mmu_role_regs *regs)

> +{
> +	union kvm_mmu_role role = {0};
> +
> +	role.base.access = ACC_ALL;
> +	role.base.smm = is_smm(vcpu);
> +	role.base.guest_mode = is_guest_mode(vcpu);
> +	role.base.direct = !____is_cr0_pg(regs);
> +	if (!role.base.direct) {

Can we check ____is_cr0_pg() instead of "direct"?  IMO that's more intuitive for
understanding why the bits below are left zero.  I was scratching my head trying
to figure out whether or not this was safe/correct for direct MMUs...

And this indentation is quite nasty, and will only get worse.  An early return or
a goto would solve that nicely.  I think I have a slight preference for an early
return?

	role.ext.valid = 1;
	
	if (!____is_cr0_pg(regs))
		return role;

	role.base.efer_nx = ____is_efer_nx(regs);
	role.base.cr0_wp = ____is_cr0_wp(regs);
	role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
	role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
	role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
	role.base.level = role_regs_to_root_level(regs);

	role.ext.cr0_pg = 1;
	role.ext.cr4_pae = ____is_cr4_pae(regs);
	role.ext.cr4_smep = ____is_cr4_smep(regs);
	role.ext.cr4_smap = ____is_cr4_smap(regs);
	role.ext.cr4_pse = ____is_cr4_pse(regs);

	/* PKEY and LA57 are active iff long mode is active. */
	role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
	role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
	role.ext.efer_lma = ____is_efer_lma(regs);

	return role;

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 09/25] KVM: x86/mmu: do not recompute root level from kvm_mmu_role_regs
  2022-02-21 16:22 ` [PATCH v2 09/25] KVM: x86/mmu: do not recompute root level from kvm_mmu_role_regs Paolo Bonzini
@ 2022-03-08 17:41   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 17:41 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> The root_level can be found in the cpu_mode (in fact the field
> is superfluous and could be removed, but one thing at a time).
> Since there is only one usage left of role_regs_to_root_level,
> inline it into kvm_calc_cpu_mode.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu/mmu.c | 23 ++++++++---------------
>  1 file changed, 8 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 1af898f0cf87..6e539fc2c9c7 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -244,19 +244,6 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
>  	return regs;
>  }
>  
> -static int role_regs_to_root_level(const struct kvm_mmu_role_regs *regs)
> -{
> -	if (!____is_cr0_pg(regs))
> -		return 0;
> -	else if (____is_efer_lma(regs))
> -		return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL :
> -					       PT64_ROOT_4LEVEL;
> -	else if (____is_cr4_pae(regs))
> -		return PT32E_ROOT_LEVEL;
> -	else
> -		return PT32_ROOT_LEVEL;
> -}
> -
>  static inline bool kvm_available_flush_tlb_with_range(void)
>  {
>  	return kvm_x86_ops.tlb_remote_flush_with_range;
> @@ -4695,7 +4682,13 @@ kvm_calc_cpu_mode(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
>  		role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
>  		role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
>  		role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
> -		role.base.level = role_regs_to_root_level(regs);
> +
> +		if (____is_efer_lma(regs))
> +			role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;

Can we wrap this, even if indentation is reduced?  I find it much easier to quickly
understand the if-else paths if they're stacked and not run out to almost 100 chars.

	if (____is_efer_lma(regs))
		role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL :
							  PT64_ROOT_4LEVEL;
	else if (____is_cr4_pae(regs))
		role.base.level = PT32E_ROOT_LEVEL;
	else
		role.base.level = PT32_ROOT_LEVEL;

> +		else if (____is_cr4_pae(regs))
> +			role.base.level = PT32E_ROOT_LEVEL;
> +		else
> +			role.base.level = PT32_ROOT_LEVEL;
>  
>  		role.ext.cr0_pg = 1;
>  		role.ext.cr4_pae = ____is_cr4_pae(regs);
> @@ -4790,7 +4783,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
>  	context->get_guest_pgd = kvm_get_guest_cr3;
>  	context->get_pdptr = kvm_pdptr_read;
>  	context->inject_page_fault = kvm_inject_page_fault;
> -	context->root_level = role_regs_to_root_level(regs);
> +	context->root_level = cpu_mode.base.level;
>  
>  	if (!is_cr0_pg(context))
>  		context->gva_to_gpa = nonpaging_gva_to_gpa;
> -- 
> 2.31.1
> 
> 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 10/25] KVM: x86/mmu: remove ept_ad field
  2022-02-21 16:22 ` [PATCH v2 10/25] KVM: x86/mmu: remove ept_ad field Paolo Bonzini
@ 2022-03-08 17:42   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 17:42 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> The ept_ad field is used during page walk to determine if the guest PTEs
> have accessed and dirty bits.  In the MMU role, the ad_disabled
> bit represents whether the *shadow* PTEs have the bits, so it
> would be incorrect to replace PT_HAVE_ACCESSED_DIRTY with just
> !mmu->mmu_role.base.ad_disabled.
> 
> However, the similar field in the CPU mode, ad_disabled, is initialized
> correctly: to the opposite value of ept_ad for shadow EPT, and zero
> for non-EPT guest paging modes (which always have A/D bits).  It is
> therefore possible to compute PT_HAVE_ACCESSED_DIRTY from the CPU mode,
> like other page-format fields; it just has to be inverted to account
> for the different polarity.
> 
> Having a CPU mode that is distinct from the MMU roles in fact would even
> allow removing the PT_HAVE_ACCESSED_DIRTY macro altogether, and always use
> !mmu->cpu_mode.base.ad_disabled.  I am not doing this because the macro
> has a small effect in terms of dead code elimination:
> 
>    text	   data	    bss	    dec	    hex
>  103544	  16665	    112	 120321	  1d601    # as of this patch
>  103746	  16665	    112	 120523	  1d6cb    # without PT_HAVE_ACCESSED_DIRTY
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---

Reviewed-by: Sean Christopherson <seanjc@google.com>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 11/25] KVM: x86/mmu: remove kvm_calc_shadow_root_page_role_common
  2022-02-21 16:22 ` [PATCH v2 11/25] KVM: x86/mmu: remove kvm_calc_shadow_root_page_role_common Paolo Bonzini
@ 2022-03-08 17:48   ` Sean Christopherson
  2022-03-08 17:50     ` Paolo Bonzini
  0 siblings, 1 reply; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 17:48 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> kvm_calc_shadow_root_page_role_common is the same as
> kvm_calc_cpu_mode except for the level, which is overwritten
> afterwards in kvm_calc_shadow_mmu_root_page_role
> and kvm_calc_shadow_npt_root_page_role.
> 
> role.base.direct is already set correctly for the CPU mode,
> and CR0.PG=1 is required for VMRUN so it will also be
> correct for nested NPT.

Bzzzt, this is wrong, the nested NPT MMU is indirect but will be computed as direct.

> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu/mmu.c | 21 ++-------------------
>  1 file changed, 2 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 3ffa6f2bf991..31874fad12fb 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4796,27 +4796,11 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
>  	reset_tdp_shadow_zero_bits_mask(context);
>  }
>  
> -static union kvm_mmu_role
> -kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
> -				      const struct kvm_mmu_role_regs *regs)
> -{
> -	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs);
> -
> -	role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
> -	role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
> -	role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs);
> -
> -	return role;
> -}
> -
>  static union kvm_mmu_role
>  kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
>  				   const struct kvm_mmu_role_regs *regs)
>  {
> -	union kvm_mmu_role role =
> -		kvm_calc_shadow_root_page_role_common(vcpu, regs);
> -
> -	role.base.direct = !____is_cr0_pg(regs);
> +	union kvm_mmu_role role = kvm_calc_cpu_mode(vcpu, regs);
>  
>  	if (!____is_efer_lma(regs))
>  		role.base.level = PT32E_ROOT_LEVEL;
> @@ -4869,9 +4853,8 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
>  				   const struct kvm_mmu_role_regs *regs)
>  {
>  	union kvm_mmu_role role =
> -		kvm_calc_shadow_root_page_role_common(vcpu, regs);
> +               kvm_calc_cpu_mode(vcpu, regs);

No need to split this line with the less verbose name.

>  
> -	role.base.direct = false;

As above, this line needs to stay.

>  	role.base.level = kvm_mmu_get_tdp_level(vcpu);
>  
>  	return role;
> -- 
> 2.31.1
> 
> 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role
  2022-03-08 17:36   ` Sean Christopherson
@ 2022-03-08 17:49     ` Paolo Bonzini
  0 siblings, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-08 17:49 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/8/22 18:36, Sean Christopherson wrote:
> The idea was to trigger fireworks due to a incoherent state (e.g. direct mmu_role with
> non-direct hooks) if the nested_mmu was ever used as a "real" MMU (handling faults,
> installing SPs/SPTEs, etc...).  For a walk-only MMU, "direct" has no meaning and so
> rather than arbitrarily leave it '0', I arbitrarily set it '1'.
> 
> Maybe this?
> 
>    The nested MMU now has only the CPU mode; and in fact the new function
>    kvm_calc_cpu_mode is analogous to the previous kvm_calc_nested_mmu_role,
>    except that it has role.base.direct equal to CR0.PG.  Having "direct"
>    track CR0.PG has the serendipitious side effect of being an even better
>    sentinel than arbitrarily setting direct to true for the nested MMU, as
>    KVM will run afoul of sanity checks for both direct and indirect MMUs if
>    KVM attempts to use the nested MMU as a "real" MMU, e.g. for page faults.

Hmm, actually it is set to CR0.PG *negated*, so that future patches can 
get rid of role.ext.cr0_pg.  But really anybody trying to use nested_mmu 
for real would get NULL pointer dereferences left and right in all 
likelihood.  This will be even clearer by the end of the series, when 
the function pointers are initialized at vCPU creation time.

>>
>> +	role.base.direct = !____is_cr0_pg(regs);
>> +	if (!role.base.direct) {
> 
> Can we check ____is_cr0_pg() instead of "direct"?  IMO that's more intuitive for
> understanding why the bits below are left zero.  I was scratching my head trying
> to figure out whether or not this was safe/correct for direct MMUs...

Yes, that's good.
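
Concretely, something like this (just a sketch, keying the early-out on
CR0.PG rather than on the derived bit):

	role.base.direct = !____is_cr0_pg(regs);

	if (!____is_cr0_pg(regs))
		return role;

	/* CR0.PG=1: fill in the CR0/CR4/EFER-derived bits below. */
	...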

Paolo


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 11/25] KVM: x86/mmu: remove kvm_calc_shadow_root_page_role_common
  2022-03-08 17:48   ` Sean Christopherson
@ 2022-03-08 17:50     ` Paolo Bonzini
  2022-03-08 18:17       ` Sean Christopherson
  0 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-08 17:50 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/8/22 18:48, Sean Christopherson wrote:
> On Mon, Feb 21, 2022, Paolo Bonzini wrote:
>> kvm_calc_shadow_root_page_role_common is the same as
>> kvm_calc_cpu_mode except for the level, which is overwritten
>> afterwards in kvm_calc_shadow_mmu_root_page_role
>> and kvm_calc_shadow_npt_root_page_role.
>>
>> role.base.direct is already set correctly for the CPU mode,
>> and CR0.PG=1 is required for VMRUN so it will also be
>> correct for nested NPT.
> 
> Bzzzt, this is wrong, the nested NPT MMU is indirect but will be computed as direct.

CR0.PG=1 means it's *not* direct:

> +	role.base.direct = !____is_cr0_pg(regs);

Paolo


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 12/25] KVM: x86/mmu: cleanup computation of MMU roles for two-dimensional paging
  2022-02-21 16:22 ` [PATCH v2 12/25] KVM: x86/mmu: cleanup computation of MMU roles for two-dimensional paging Paolo Bonzini
@ 2022-03-08 18:11   ` Sean Christopherson
  2022-03-08 18:24     ` Paolo Bonzini
  2022-03-08 18:38     ` Sean Christopherson
  0 siblings, 2 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 18:11 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> Inline kvm_calc_mmu_role_common into its sole caller, and simplify it
> by removing the computation of unnecessary bits.
> 
> Extended bits are unnecessary because page walking uses the CPU mode,
> and EFER.NX/CR0.WP can be set to one unconditionally---matching the
> format of shadow pages rather than the format of guest pages.

But they don't match the format of shadow pages.  EPT has an equivalent to NX in
that KVM can always clear X, but KVM explicitly supports running with EPT and
EFER.NX=0 in the host (32-bit non-PAE kernels).

CR0.WP is equally confusing.  Yes, both EPT and NPT enforce write protection at all
times, but EPT has no concept of user vs. supervisor in the EPT tables themselves,
at least with respect to writes (thanks mode-based execution for the qualifier...).
NPT is even worse as the APM explicitly states:

  The host hCR0.WP bit is ignored under nested paging.

Unless there's some hidden dependency I'm missing, I'd prefer we arbitrarily leave
them zero.

> The MMU role for two dimensional paging does still depend on the CPU mode,

Heh, don't think it's necessary to spell out TDP, and I think it would be helpful
to write it as "non-nested TDP" since the surrounding patches deal with both.

> even if only barely so, due to SMM and guest mode; for consistency,
> pass it down to kvm_calc_tdp_mmu_root_page_role instead of querying
> the vcpu with is_smm or is_guest_mode.

The changelog should call out this is a _significant_ change in behavior for KVM,
as it allows reusing shadow pages with different guest MMU "role bits".  E.g. if
this lands after the changes to not unload MMUs on cr0/cr4 emulation, it will be
quite the functional change.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 11/25] KVM: x86/mmu: remove kvm_calc_shadow_root_page_role_common
  2022-03-08 17:50     ` Paolo Bonzini
@ 2022-03-08 18:17       ` Sean Christopherson
  2022-03-08 18:18         ` Paolo Bonzini
  0 siblings, 1 reply; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 18:17 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Tue, Mar 08, 2022, Paolo Bonzini wrote:
> On 3/8/22 18:48, Sean Christopherson wrote:
> > On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> > > kvm_calc_shadow_root_page_role_common is the same as
> > > kvm_calc_cpu_mode except for the level, which is overwritten
> > > afterwards in kvm_calc_shadow_mmu_root_page_role
> > > and kvm_calc_shadow_npt_root_page_role.
> > > 
> > > role.base.direct is already set correctly for the CPU mode,
> > > and CR0.PG=1 is required for VMRUN so it will also be
> > > correct for nested NPT.
> > 
> > Bzzzt, this is wrong, the nested NPT MMU is indirect but will be computed as direct.
> 
> CR0.PG=1 means it's *not* direct:
> 
> > +	role.base.direct = !____is_cr0_pg(regs);

Ha!  I was just cleverly making the case for checking ____is_cr0_pg() instead of
"direct" for computing the dependent flags, I swear...

On a serious note, can we add a WARN_ON_ONCE(role.base.direct)?  Not so much that
the WARN will be helpful, but to document the subtle dependency?  If the relevant
code goes away in the end, ignore this request.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 11/25] KVM: x86/mmu: remove kvm_calc_shadow_root_page_role_common
  2022-03-08 18:17       ` Sean Christopherson
@ 2022-03-08 18:18         ` Paolo Bonzini
  0 siblings, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-08 18:18 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/8/22 19:17, Sean Christopherson wrote:
>>> +	role.base.direct = !____is_cr0_pg(regs);
> 
> On a serious note, can we add a WARN_ON_ONCE(role.base.direct)?  Not so much that
> the WARN will be helpful, but to document the subtle dependency?  If the relevant
> code goes away in the end, ignore this request.

Ok, that can be done.  Either that or !is_cr0_pg().
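
I.e., in kvm_calc_shadow_npt_root_page_role(), one of these (sketch only;
either check would do, since regs is in scope there):

	/* NPT requires CR0.PG=1, so the nested NPT MMU is never direct. */
	WARN_ON_ONCE(role.base.direct);

	/* ... or equivalently ... */
	WARN_ON_ONCE(!____is_cr0_pg(regs));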

Paolo


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 12/25] KVM: x86/mmu: cleanup computation of MMU roles for two-dimensional paging
  2022-03-08 18:11   ` Sean Christopherson
@ 2022-03-08 18:24     ` Paolo Bonzini
  2022-03-08 18:44       ` Sean Christopherson
  2022-03-08 18:38     ` Sean Christopherson
  1 sibling, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-08 18:24 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/8/22 19:11, Sean Christopherson wrote:
> On Mon, Feb 21, 2022, Paolo Bonzini wrote:
>> Extended bits are unnecessary because page walking uses the CPU mode,
>> and EFER.NX/CR0.WP can be set to one unconditionally---matching the
>> format of shadow pages rather than the format of guest pages.
> 
> But they don't match the format of shadow pages.  EPT has an equivalent to NX in
> that KVM can always clear X, but KVM explicitly supports running with EPT and
> EFER.NX=0 in the host (32-bit non-PAE kernels).

In which case bit 2 of EPTs doesn't change meaning, does it?

> CR0.WP equally confusing.  Yes, both EPT and NPT enforce write protection at all
> times, but EPT has no concept of user vs. supervisor in the EPT tables themselves,
> at least with respect to writes (thanks mode-based execution for the qualifier...).
> NPT is even worse as the APM explicitly states:
> 
>    The host hCR0.WP bit is ignored under nested paging.
> 
> Unless there's some hidden dependency I'm missing, I'd prefer we arbitrarily leave
> them zero.

Setting EFER.NX=0 might be okay for EPT/NPT, but I'd prefer to set it 
respectively to 1 (X bit always present) and host EFER.NX (NX bit 
present depending on host EFER).

For CR0.WP it should really be 1 in my opinion, because CR0.WP=0 implies 
having a concept of user vs. supervisor access: CR0.WP=1 is the 
"default", while CR0.WP=0 is "always allow *supervisor* writes".

>> even if only barely so, due to SMM and guest mode; for consistency,
>> pass it down to kvm_calc_tdp_mmu_root_page_role instead of querying
>> the vcpu with is_smm or is_guest_mode.
> 
> The changelog should call out this is a _significant_ change in behavior for KVM,
> as it allows reusing shadow pages with different guest MMU "role bits".

Good point!  It's safe and arguably clea{n,r}er, but it's still a pretty 
large change.

> E.g. if this lands after the changes to not unload MMUs on cr0/cr4
> emulation, it will be quite the functional change.

I expect this to land first, so that the part where we don't really
agree on the implementation comes last and benefits from a more 
understandable core.

Paolo


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 12/25] KVM: x86/mmu: cleanup computation of MMU roles for two-dimensional paging
  2022-03-08 18:11   ` Sean Christopherson
  2022-03-08 18:24     ` Paolo Bonzini
@ 2022-03-08 18:38     ` Sean Christopherson
  1 sibling, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 18:38 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Tue, Mar 08, 2022, Sean Christopherson wrote:
> On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> > Inline kvm_calc_mmu_role_common into its sole caller, and simplify it
> > by removing the computation of unnecessary bits.
> > 
> > Extended bits are unnecessary because page walking uses the CPU mode,
> > and EFER.NX/CR0.WP can be set to one unconditionally---matching the
> > format of shadow pages rather than the format of guest pages.
> 
> But they don't match the format of shadow pages.  EPT has an equivalent to NX in
> that KVM can always clear X, but KVM explicitly supports running with EPT and
> EFER.NX=0 in the host (32-bit non-PAE kernels).

Oof, digging into this made me realize we probably have yet another bug in the
handling of the NX hugepages workaround.  Unless I'm overlooking something, NX is
treated as reserved for NPT shadow pages and would cause WARN spam if someone
enabled the mitigation on AMD CPUs.

Untested...  If this is needed, it also means I didn't properly test commit
b26a71a1a5b9 ("KVM: SVM: Refuse to load kvm_amd if NX support is not available").

:-(

From: Sean Christopherson <seanjc@google.com>
Date: Tue, 8 Mar 2022 10:31:27 -0800
Subject: [PATCH] KVM: x86/mmu: Don't treat NX as reserved for NPT MMUs

Mark the NX bit as allowed for NPT shadow pages.  KVM doesn't use NX in
"normal" operation as KVM doesn't honor userspace execute-protection
settings, but KVM allows userspace to enable the NX hugepages mitigation
on AMD CPUs, despite no known AMD CPUs being affected by the iTLB
multi-hit erratum.

Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 47a2c5f3044d..9c79a0927a48 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4461,13 +4461,17 @@ static inline bool boot_cpu_is_amd(void)
 	return shadow_x_mask == 0;
 }

-/*
- * the direct page table on host, use as much mmu features as
- * possible, however, kvm currently does not do execution-protection.
- */
 static void
 reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
 {
+	/*
+	 * KVM doesn't honor execute-protection from the host page tables, but
+	 * NX is required and potentially used at any time by KVM for NPT, as
+	 * the NX hugepages iTLB multi-hit mitigation is supported for any CPU
+	 * despite no known AMD (and derivative) CPUs being affected by erratum.
+	 */
+	bool efer_nx = true;
+
 	struct rsvd_bits_validate *shadow_zero_check;
 	int i;

@@ -4475,7 +4479,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)

 	if (boot_cpu_is_amd())
 		__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
-					context->shadow_root_level, false,
+					context->shadow_root_level, efer_nx,
 					boot_cpu_has(X86_FEATURE_GBPAGES),
 					false, true);
 	else

base-commit: 2a809a65d88e7e071dc6ef4a6be8416d25885efe
--


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 12/25] KVM: x86/mmu: cleanup computation of MMU roles for two-dimensional paging
  2022-03-08 18:24     ` Paolo Bonzini
@ 2022-03-08 18:44       ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 18:44 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Tue, Mar 08, 2022, Paolo Bonzini wrote:
> On 3/8/22 19:11, Sean Christopherson wrote:
> > On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> > > Extended bits are unnecessary because page walking uses the CPU mode,
> > > and EFER.NX/CR0.WP can be set to one unconditionally---matching the
> > > format of shadow pages rather than the format of guest pages.
> > 
> > But they don't match the format of shadow pages.  EPT has an equivalent to NX in
> > that KVM can always clear X, but KVM explicitly supports running with EPT and
> > EFER.NX=0 in the host (32-bit non-PAE kernels).
> 
> In which case bit 2 of EPTs doesn't change meaning, does it?
> 
> > CR0.WP is equally confusing.  Yes, both EPT and NPT enforce write protection at all
> > times, but EPT has no concept of user vs. supervisor in the EPT tables themselves,
> > at least with respect to writes (thanks mode-based execution for the qualifier...).
> > NPT is even worse as the APM explicitly states:
> > 
> >    The host hCR0.WP bit is ignored under nested paging.
> > 
> > Unless there's some hidden dependency I'm missing, I'd prefer we arbitrarily leave
> > them zero.
> 
> Setting EFER.NX=0 might be okay for EPT/NPT, but I'd prefer to set it
> respectively to 1 (X bit always present) and host EFER.NX (NX bit present
> depending on host EFER).
> 
> For CR0.WP it should really be 1 in my opinion, because CR0.WP=0 implies
> having a concept of user vs. supervisor access: CR0.WP=1 is the "default",
> while CR0.WP=0 is "always allow *supervisor* writes".

Yeah, I think we generally agree, just came to different conclusions :-)  I'm
totally fine setting them to '1', especially given the patch I just "posted",
but please add comments (suggested NX comment below).  The explicit "WP is ignored"
blurb for hCR0 on NPT will be especially confusing at some point.

With efer_nx forced to '1', we can do this somewhere in this series.  I really,
really despise "context" :-).

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 9c79a0927a48..657df7fd74bf 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4461,25 +4461,15 @@ static inline bool boot_cpu_is_amd(void)
        return shadow_x_mask == 0;
 }
 
-static void
-reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
+static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *mmu)
 {
-       /*
-        * KVM doesn't honor execute-protection from the host page tables, but
-        * NX is required and potentially used at any time by KVM for NPT, as
-        * the NX hugepages iTLB multi-hit mitigation is supported for any CPU
-        * despite no known AMD (and derivative) CPUs being affected by erratum.
-        */
-       bool efer_nx = true;
-
-       struct rsvd_bits_validate *shadow_zero_check;
        int i;
 
-       shadow_zero_check = &context->shadow_zero_check;
+       shadow_zero_check = &mmu->shadow_zero_check;
 
        if (boot_cpu_is_amd())
                __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
-                                       context->shadow_root_level, efer_nx,
+                                       mmu->shadow_root_level, is_efer_nx(mmu),
                                        boot_cpu_has(X86_FEATURE_GBPAGES),
                                        false, true);
        else
@@ -4490,7 +4480,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
        if (!shadow_me_mask)
                return;
 
-       for (i = context->shadow_root_level; --i >= 0;) {
+       for (i = mmu->shadow_root_level; --i >= 0;) {
                shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
                shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
        }
@@ -4751,6 +4741,16 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 
        role.base.access = ACC_ALL;
        role.base.cr0_wp = true;
+
+       /*
+        * KVM doesn't honor execute-protection from the host page tables, but
+        * NX is required and potentially used at any time by KVM for NPT, as
+        * the NX hugepages iTLB multi-hit mitigation is supported for any CPU
+        * despite no known AMD (and derivative) CPUs being affected by erratum.
+        *
+        * This is functionally accurate for EPT, if technically wrong, as KVM
+	 * can always clear the X bit on EPT.
+        */
        role.base.efer_nx = true;
        role.base.smm = cpu_mode.base.smm;
        role.base.guest_mode = cpu_mode.base.guest_mode;

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role
  2022-02-21 16:22 ` [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role Paolo Bonzini
  2022-03-08 17:36   ` Sean Christopherson
@ 2022-03-08 18:55   ` Sean Christopherson
  2022-03-09  9:58     ` Paolo Bonzini
  1 sibling, 1 reply; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 18:55 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> @@ -4800,13 +4836,15 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
>  }
>  
>  static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
> -				    const struct kvm_mmu_role_regs *regs,
> -				    union kvm_mmu_role new_role)
> +				    union kvm_mmu_role cpu_mode,

Can you give all helpers this treatment (rename "role" => "cpu_mode")?  I got
tripped up a few times reading patches because the ones where it wasn't necessary,
i.e. where there's only a single kvm_mmu_role parameter, were left as-is.

I think kvm_calc_shadow_npt_root_page_role() and kvm_calc_shadow_mmu_root_page_role()
are the only offenders.

> +				    union kvm_mmu_role mmu_role)
>  {
> -	if (new_role.as_u64 == context->mmu_role.as_u64)
> +	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&
> +	    mmu_role.as_u64 == context->mmu_role.as_u64)
>  		return;
>  
> -	context->mmu_role.as_u64 = new_role.as_u64;
> +	context->cpu_mode.as_u64 = cpu_mode.as_u64;
> +	context->mmu_role.as_u64 = mmu_role.as_u64;
>  
>  	if (!is_cr0_pg(context))
>  		nonpaging_init_context(context);

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 15/25] KVM: x86/mmu: remove extended bits from mmu_role, rename field
  2022-02-21 16:22 ` [PATCH v2 15/25] KVM: x86/mmu: remove extended bits from mmu_role, rename field Paolo Bonzini
@ 2022-03-08 19:02   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 19:02 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> mmu_role represents the role of the root of the page tables.
> It does not need any extended bits, as those govern only KVM's
> page table walking; the is_* functions used for page table
> walking always use the CPU mode.
> 
> ext.valid is not present anymore in the MMU role, but an
> all-zero MMU role is impossible because the level field is
> never zero in the MMU role.

Probably worth adding a blurb to clarify that the level is zero in the extended
(guest) role if paging is disabled in the guest?

> So just zap the whole mmu_role
> in order to force invalidation after CPUID is updated.
> 
> While making this change, which requires touching almost every
> occurrence of "mmu_role", rename it to "root_role".

Heh, can you please wrap closer to 75 chars?  Your d20 rolls came up low again.

> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> @@ -4764,7 +4763,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
>  	reset_tdp_shadow_zero_bits_mask(context);
>  }
>  
> -static union kvm_mmu_role
> +static union kvm_mmu_page_role
>  kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
>  				   union kvm_mmu_role role)
>  {
> @@ -4785,19 +4784,19 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
>  	 * MMU contexts.
>  	 */
>  	role.base.efer_nx = true;
> -	return role;
> +	return role.base;

For both cases where the root role is derived from the cpu_role, can we make
an explicit copy of the "base" as root_role, and then do mods on that?  Modifying
the incoming role in place is unnecessarily hard to read IMO; I doubt the
compiler even generates different code.

For me, this makes it more obvious that the root_role is derived from the cpu_role.

static union kvm_mmu_page_role
kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
				   const union kvm_mmu_role cpu_role)
{
	union kvm_mmu_page_role root_role = cpu_role.base;

	if (!cpu_role.ext.efer_lma)
		root_role.level = PT32E_ROOT_LEVEL;
	else if (cpu_role.ext.cr4_la57)
		root_role.level = PT64_ROOT_5LEVEL;
	else
		root_role.level = PT64_ROOT_4LEVEL;

	/*
	 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
	 * The iTLB multi-hit workaround can be toggled at any time, so assume
	 * NX can be used by any non-nested shadow MMU to avoid having to reset
	 * MMU contexts.
	 */
	root_role.efer_nx = true;
	return root_role;
}

static union kvm_mmu_page_role
kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
				   const union kvm_mmu_role cpu_role)
{
	union kvm_mmu_page_role root_role = cpu_role.base;

	WARN_ON_ONCE(root_role.direct);
	root_role.level = kvm_mmu_get_tdp_level(vcpu);
	return root_role;
}


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 16/25] KVM: x86/mmu: rename kvm_mmu_role union
  2022-02-21 16:22 ` [PATCH v2 16/25] KVM: x86/mmu: rename kvm_mmu_role union Paolo Bonzini
@ 2022-03-08 19:15   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 19:15 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> It is quite confusing that the "full" union is called kvm_mmu_role
> but is used for the "cpu_mode" field of struct kvm_mmu.  Rename it
> to kvm_mmu_paging_mode.

My official vote is to use

	union kvm_mmu_cpu_role cpu_role

instead of

	union kvm_mmu_paging_mode cpu_mode

The latter has too many inconsistencies, e.g. helpers using "role" instead of "mode",
the union using "paging" but the field/params using "cpu".  The cpu instead of paging
thing isn't a coincidence as having the param be "paging_mode" would be really, really
confusing (ditto for kvm_calc_paging_mode()).

IMO this is consistent, if imperfect.

static union kvm_mmu_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
						const struct kvm_mmu_role_regs *regs)
{
	union kvm_mmu_cpu_role role = {0};

	role.base.access = ACC_ALL;
	role.base.smm = is_smm(vcpu);
	role.base.guest_mode = is_guest_mode(vcpu);
	role.base.direct = !____is_cr0_pg(regs);

	role.ext.valid = 1;

	if (!____is_cr0_pg(regs))
		return role;

	...

	return role;
}

Whereas this is inconsistent, and also imperfect.

static union kvm_mmu_paging_mode kvm_calc_cpu_mode(struct kvm_vcpu *vcpu,
						   const struct kvm_mmu_role_regs *regs)
{
	union kvm_mmu_paging_mode role = {0};

	role.base.access = ACC_ALL;
	role.base.smm = is_smm(vcpu);
	role.base.guest_mode = is_guest_mode(vcpu);
	role.base.direct = !____is_cr0_pg(regs);

	role.ext.valid = 1;

	if (!____is_cr0_pg(regs))
		return role;

	...

	return role;
}

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 19/25] KVM: x86/mmu: simplify and/or inline computation of shadow MMU roles
  2022-02-21 16:22 ` [PATCH v2 19/25] KVM: x86/mmu: simplify and/or inline computation of shadow MMU roles Paolo Bonzini
@ 2022-03-08 19:35   ` Sean Christopherson
  2022-03-08 19:41     ` Sean Christopherson
  2022-03-09 10:33     ` Paolo Bonzini
  0 siblings, 2 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 19:35 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> @@ -4822,18 +4798,23 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
>  {
>  	struct kvm_mmu *context = &vcpu->arch.root_mmu;
>  	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
> -	union kvm_mmu_page_role root_role =
> -		kvm_calc_shadow_mmu_root_page_role(vcpu, cpu_mode);
> +	union kvm_mmu_page_role root_role;
>  
> -	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
> -}
> +	root_role = cpu_mode.base;
> +	root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);

Heh, we have different definitions of "simpler".   Can we split the difference
and do?

	/* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
	if (!____is_efer_lma(regs))
		root_role.level = PT32E_ROOT_LEVEL;

> -static union kvm_mmu_page_role
> -kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
> -				   union kvm_mmu_paging_mode role)
> -{
> -	role.base.level = kvm_mmu_get_tdp_level(vcpu);
> -	return role.base;
> +	/*
> +	 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
> +	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
> +	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
> +	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
> +	 * The iTLB multi-hit workaround can be toggled at any time, so assume
> +	 * NX can be used by any non-nested shadow MMU to avoid having to reset
> +	 * MMU contexts.
> +	 */
> +	root_role.efer_nx = true;
> +
> +	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
>  }
>  
>  void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
> @@ -4846,7 +4827,10 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
>  		.efer = efer,
>  	};
>  	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
> -	union kvm_mmu_page_role root_role = kvm_calc_shadow_npt_root_page_role(vcpu, cpu_mode);
> +	union kvm_mmu_page_role root_role;
> +
> +	root_role = cpu_mode.base;
> +	root_role.level = kvm_mmu_get_tdp_level(vcpu);

Regarding the WARN_ON_ONCE(root_role.direct) discussed for a different patch, how
about this for a WARN + comment?

	/* NPT requires CR0.PG=1, thus the MMU is guaranteed to be indirect. */
	WARN_ON_ONCE(root_role.direct);

>  	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
>  	kvm_mmu_new_pgd(vcpu, nested_cr3);
> -- 
> 2.31.1
> 
> 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 19/25] KVM: x86/mmu: simplify and/or inline computation of shadow MMU roles
  2022-03-08 19:35   ` Sean Christopherson
@ 2022-03-08 19:41     ` Sean Christopherson
  2022-03-09 10:33     ` Paolo Bonzini
  1 sibling, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 19:41 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Tue, Mar 08, 2022, Sean Christopherson wrote:
> On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> > @@ -4822,18 +4798,23 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
> >  {
> >  	struct kvm_mmu *context = &vcpu->arch.root_mmu;
> >  	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
> > -	union kvm_mmu_page_role root_role =
> > -		kvm_calc_shadow_mmu_root_page_role(vcpu, cpu_mode);
> > +	union kvm_mmu_page_role root_role;
> >  
> > -	shadow_mmu_init_context(vcpu, context, cpu_mode, root_role);
> > -}
> > +	root_role = cpu_mode.base;
> > +	root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
> 
> Heh, we have different definitions of "simpler".   Can we split the difference
> and do?
> 
> 	/* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
> 	if (!____is_efer_lma(regs))
> 		root_role.level = PT32E_ROOT_LEVEL;

Ha, and then the very next patch stomps all over this.  I think this just needs
to add

	BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);

and do

	if (!is_efer_lma(context))
		root_role.level = PT32E_ROOT_LEVEL;

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 20/25] KVM: x86/mmu: pull CPU mode computation to kvm_init_mmu
  2022-02-21 16:22 ` [PATCH v2 20/25] KVM: x86/mmu: pull CPU mode computation to kvm_init_mmu Paolo Bonzini
@ 2022-03-08 19:45   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 19:45 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> Do not lead init_kvm_*mmu into the temptation of poking
> into struct kvm_mmu_role_regs, by passing to it directly
> the CPU mode.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/mmu/mmu.c | 21 +++++++++------------
>  1 file changed, 9 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 47288643ab70..a7028c2ae5c7 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4734,11 +4734,9 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
>  	return role;
>  }
>  
> -static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
> -			     const struct kvm_mmu_role_regs *regs)
> +static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode cpu_mode)

Please keep the newline.  I like running over the 80 char soft limit when it
improves readability, but IMO stacking params is easier to read than a long line
unless it's over by like 3 chars or less.
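
I.e. keep it wrapped as:

	static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
				     union kvm_mmu_paging_mode cpu_mode)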

>  {
>  	struct kvm_mmu *context = &vcpu->arch.root_mmu;
> -	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, regs);
>  	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_mode);
>  
>  	if (cpu_mode.as_u64 == context->cpu_mode.as_u64 &&

...

> -static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
> +static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_paging_mode new_mode)

And add one here (or retroactively add it back when @regs was constified).

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 21/25] KVM: x86/mmu: replace shadow_root_level with root_role.level
  2022-02-21 16:22 ` [PATCH v2 21/25] KVM: x86/mmu: replace shadow_root_level with root_role.level Paolo Bonzini
@ 2022-03-08 19:48   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 19:48 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> root_role.level is always the same value as shadow_level:
> 
> - it's kvm_mmu_get_tdp_level(vcpu) when going through init_kvm_tdp_mmu
> 
> - it's the level argument when going through kvm_init_shadow_ept_mmu
> 
> - it's assigned directly from new_role.base.level when going
>   through shadow_mmu_init_context
> 
> Remove the duplication and get the level directly from the role.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 -
>  arch/x86/kvm/mmu.h              |  2 +-
>  arch/x86/kvm/mmu/mmu.c          | 33 ++++++++++++++-------------------
>  arch/x86/kvm/mmu/tdp_mmu.c      |  2 +-
>  arch/x86/kvm/svm/svm.c          |  2 +-
>  arch/x86/kvm/vmx/vmx.c          |  2 +-
>  6 files changed, 18 insertions(+), 24 deletions(-)

Yay!  That's a lot less churn than I expected.

Reviewed-by: Sean Christopherson <seanjc@google.com>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 22/25] KVM: x86/mmu: replace root_level with cpu_mode.base.level
  2022-02-21 16:22 ` [PATCH v2 22/25] KVM: x86/mmu: replace root_level with cpu_mode.base.level Paolo Bonzini
@ 2022-03-08 19:49   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 19:49 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> Remove another duplicate field of struct kvm_mmu.  This time it's
> the root level for page table walking; the separate field is
> always initialized as cpu_mode.base.level, so its users can look
> up the CPU mode directly instead.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 -
>  arch/x86/kvm/mmu/mmu.c          | 18 +++++++-----------
>  arch/x86/kvm/mmu/paging_tmpl.h  |  4 ++--
>  3 files changed, 9 insertions(+), 14 deletions(-)

Reviewed-by: Sean Christopherson <seanjc@google.com>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 23/25] KVM: x86/mmu: replace direct_map with root_role.direct
  2022-02-21 16:22 ` [PATCH v2 23/25] KVM: x86/mmu: replace direct_map with root_role.direct Paolo Bonzini
@ 2022-03-08 19:52   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 19:52 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> direct_map is always equal to the direct field of the root page's role:
> 
> - for shadow paging, direct_map is true if CR0.PG=0 and root_role.direct is
> copied from cpu_mode.base.direct
> 
> - for TDP, it is always true and root_role.direct is also always true
> 
> - for shadow EPT, it is always false and root_role.direct is also always

Uber nit, s/EPT/TDP to cover shadow NPT, which at this point we are 100% sure is
indirect ;-)

> false
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Sean Christopherson <seanjc@google.com>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 25/25] KVM: x86/mmu: extract initialization of the page walking data
  2022-02-21 16:22 ` [PATCH v2 25/25] KVM: x86/mmu: extract initialization of the page walking data Paolo Bonzini
@ 2022-03-08 20:02   ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 20:02 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> +static void kvm_vcpu_init_walker(struct kvm_vcpu *vcpu,
> +				 struct kvm_mmu *mmu,
> +				 union kvm_mmu_paging_mode new_mode)

kvm_vcpu_init_walker() is a rather odd and almost maliciously obtuse name.  We're not
short on space for this one, so how about?

static void kvm_mmu_init_guest_walker(struct kvm_vcpu *vcpu,
				      struct kvm_mmu *mmu,
				      union kvm_mmu_paging_mode new_mode)
>  void kvm_init_mmu(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
>  	union kvm_mmu_paging_mode cpu_mode = kvm_calc_cpu_mode(vcpu, &regs);
>  
> +	kvm_vcpu_init_walker(vcpu, vcpu->arch.walk_mmu, cpu_mode);
>  	if (mmu_is_nested(vcpu))
> -		init_kvm_nested_mmu(vcpu, cpu_mode);
> -	else if (tdp_enabled)
> +		return;

Nice!  I really like that this highlights that the nested_mmu crud is just for
the walker.  Can you also add a comment here explaining that part?
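
Something along these lines, perhaps (just a sketch of the comment):

	/*
	 * The nested MMU is walk-only: it is used to translate L2 GVAs to
	 * L1 GPAs and never handles faults or installs SPTEs, so only its
	 * guest walker needs to be (re)initialized here.
	 */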

> +
> +	if (tdp_enabled)
>  		init_kvm_tdp_mmu(vcpu, cpu_mode);
>  	else
>  		init_kvm_softmmu(vcpu, cpu_mode);
> -- 
> 2.31.1
> 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 06/25] KVM: nVMX/nSVM: do not monkey-patch inject_page_fault callback
  2022-03-08 17:13   ` Sean Christopherson
@ 2022-03-08 20:34     ` Sean Christopherson
  0 siblings, 0 replies; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 20:34 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Tue, Mar 08, 2022, Sean Christopherson wrote:
> On Mon, Feb 21, 2022, Paolo Bonzini wrote:
> > Currently, vendor code is patching the inject_page_fault and later, on
> > vmexit, expecting kvm_init_mmu to restore the inject_page_fault callback.
> > 
> > This is brittle, as exposed by the fact that SVM KVM_SET_NESTED_STATE
> > forgets to do it.  Instead, do the check at the time a page fault actually
> > has to be injected.  This does incur the cost of an extra retpoline
> > for nested vmexits when TDP is disabled, but is overall much cleaner.
> > While at it, add a comment that explains why the different behavior
> > is needed in this case.
> > 
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> 
> If I have NAK powers, NAK NAK NAK NAK NAK :-)
> 
> Forcing a VM-Exit is a hack, e.g. it's the entire reason inject_emulated_exception()
> returns a bool.  Even worse, it's confusing and misleading due to being incomplete.
> 
> The need for the hack is not unique to !tdp_enabled; the #DF can be triggered
> any time L0 is intercepting #PF.  Hello, allow_smaller_maxphyaddr.
> 
> And while I think allow_smaller_maxphyaddr should be burned with fire, architecturally
> it's still incomplete.  Any exception that is injected by KVM needs to be subjected
> to nested interception checks, not just #PF.  E.g. a #GP while vectoring a different
> fault should also be routed to L1.  KVM (mostly) gets away with special casing #PF
> because that's the only common scenario where L1 wants to intercept _and fix_ a fault
> that can occur while vectoring an exception.  E.g. in the #GP => #DF case, odds are
> very good that L1 will inject a #DF too, but that doesn't make KVM's behavior correct.
> 
> I have a series to handle this by performing the interception checks when an exception
> is queued, instead of when KVM injects the exception, and using a second kvm_queued_exception
> field to track exceptions that are queued for VM-Exit (so as not to lose the injected
> exception, which needs to be saved into vmc*12).  It's functional, though I haven't
> tested migration (requires minor shenanigans to perform interception checks for pending
> exceptions coming in from userspace).

Here's my preferred band-aid for this so we can make inject_page_fault() constant
without having to wait for a proper fix.  It's still putting lipstick on a pig,
but is a bit more complete and IMO better documents the mess.  This slots into
your series in place of your patch without much fuss.


From: Sean Christopherson <seanjc@google.com>
Date: Thu, 3 Mar 2022 20:20:17 -0800
Subject: [PATCH] KVM: x86: Clean up and document nested #PF workaround

Replace the per-vendor hack-a-fix for KVM's #PF => #PF => #DF workaround
with an explicit, common workaround in kvm_inject_emulated_page_fault().
Aside from being a hack, the current approach is brittle and incomplete,
e.g. nSVM's KVM_SET_NESTED_STATE fails to set ->inject_page_fault(),
and nVMX fails to apply the workaround when VMX is intercepting #PF due
to allow_smaller_maxphyaddr=1.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/svm/nested.c       | 15 ++++++++-------
 arch/x86/kvm/vmx/nested.c       | 15 ++++++---------
 arch/x86/kvm/x86.c              | 21 ++++++++++++++++++++-
 4 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index da2f3a21e37b..c372a74acd9c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1496,6 +1496,8 @@ struct kvm_x86_ops {
 struct kvm_x86_nested_ops {
 	void (*leave_nested)(struct kvm_vcpu *vcpu);
 	int (*check_events)(struct kvm_vcpu *vcpu);
+	int (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu,
+					    struct x86_exception *fault);
 	bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
 	void (*triple_fault)(struct kvm_vcpu *vcpu);
 	int (*get_state)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 96bab464967f..dd942c719cf6 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -54,22 +54,25 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
 	nested_svm_vmexit(svm);
 }

-static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+static int nested_svm_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
+						   struct x86_exception *fault)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+
        WARN_ON(!is_guest_mode(vcpu));

 	if (vmcb12_is_intercept(&svm->nested.ctl,
 				INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
-	    !svm->nested.nested_run_pending) {
+	    !WARN_ON_ONCE(svm->nested.nested_run_pending)) {
                svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
                svm->vmcb->control.exit_code_hi = 0;
                svm->vmcb->control.exit_info_1 = fault->error_code;
                svm->vmcb->control.exit_info_2 = fault->address;
                nested_svm_vmexit(svm);
-       } else {
-               kvm_inject_page_fault(vcpu, fault);
+	       return 0;
        }
+
+	return -EINVAL;
 }

 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
@@ -680,9 +683,6 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
 	if (ret)
 		return ret;

-	if (!npt_enabled)
-		vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
-
 	if (!from_vmrun)
 		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

@@ -1567,6 +1567,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 struct kvm_x86_nested_ops svm_nested_ops = {
 	.leave_nested = svm_leave_nested,
 	.check_events = svm_check_nested_events,
+	.handle_page_fault_workaround = nested_svm_handle_page_fault_workaround,
 	.triple_fault = nested_svm_triple_fault,
 	.get_nested_state_pages = svm_get_nested_state_pages,
 	.get_state = svm_get_nested_state,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f18744f7ff82..cc4c74339d35 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -476,24 +476,23 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
 	return 0;
 }

-
-static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
-		struct x86_exception *fault)
+static int nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
+						   struct x86_exception *fault)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

 	WARN_ON(!is_guest_mode(vcpu));

 	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
-		!to_vmx(vcpu)->nested.nested_run_pending) {
+	    !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
 		vmcs12->vm_exit_intr_error_code = fault->error_code;
 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
 				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
 				  fault->address);
-	} else {
-		kvm_inject_page_fault(vcpu, fault);
+		return 0;
 	}
+	return -EINVAL;
 }

 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
@@ -2614,9 +2613,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
 	}

-	if (!enable_ept)
-		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
-
 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
 	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
 				     vmcs12->guest_ia32_perf_global_ctrl))) {
@@ -6804,6 +6800,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 struct kvm_x86_nested_ops vmx_nested_ops = {
 	.leave_nested = vmx_leave_nested,
 	.check_events = vmx_check_nested_events,
+	.handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
 	.hv_timer_pending = nested_vmx_preemption_timer_pending,
 	.triple_fault = nested_vmx_triple_fault,
 	.get_state = vmx_get_nested_state,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fa1bdd9909e..010fb54a9a82 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -748,6 +748,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);

+/* Returns true if the page fault was immediately morphed into a VM-Exit. */
 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 				    struct x86_exception *fault)
 {
@@ -766,8 +767,26 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 		kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
 				       fault_mmu->root.hpa);

+	/*
+	 * A workaround for KVM's bad exception handling.  If KVM injected an
+	 * exception into L2, and L2 encountered a #PF while vectoring the
+	 * injected exception, manually check to see if L1 wants to intercept
+	 * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
+	 * In all other cases, defer the check to nested_ops->check_events(),
+	 * which will correctly handle priority (this does not).  Note, other
+	 * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
+	 * most problematic, e.g. when L0 and L1 are both intercepting #PF for
+	 * shadow paging.
+	 *
+	 * TODO: Rewrite exception handling to track injected and pending
+	 *       (VM-Exit) exceptions separately.
+	 */
+	if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
+	    !kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
+		return true;
+
 	fault_mmu->inject_page_fault(vcpu, fault);
-	return fault->nested_page_fault;
+	return false;
 }
 EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);


base-commit: bb92fb66dcf8735c5190f415fed587bff7dd6717
--


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 24/25] KVM: x86/mmu: initialize constant-value fields just once
  2022-02-21 16:22 ` [PATCH v2 24/25] KVM: x86/mmu: initialize constant-value fields just once Paolo Bonzini
@ 2022-03-08 20:58   ` Sean Christopherson
  2022-03-09 10:34     ` Paolo Bonzini
  0 siblings, 1 reply; 66+ messages in thread
From: Sean Christopherson @ 2022-03-08 20:58 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Mon, Feb 21, 2022, Paolo Bonzini wrote:
>  
> +	vcpu->arch.root_mmu.get_guest_pgd = kvm_get_guest_cr3;
> +	vcpu->arch.root_mmu.get_pdptr = kvm_pdptr_read;
> +
> +	if (tdp_enabled) {

Putting all this code in a separate helper reduces line-lengths via early
returns.  And it'll allow us to do the same for the nested specific MMUs if we
ever get smart and move "nested" to x86.c (preferably as enable_nested or
nested_enabled).

> +		vcpu->arch.root_mmu.inject_page_fault = kvm_inject_page_fault;
> +		vcpu->arch.root_mmu.page_fault = kvm_tdp_page_fault;
> +		vcpu->arch.root_mmu.sync_page = nonpaging_sync_page;
> +		vcpu->arch.root_mmu.invlpg = NULL;
> +		reset_tdp_shadow_zero_bits_mask(&vcpu->arch.root_mmu);
> +
> +		vcpu->arch.guest_mmu.get_guest_pgd = kvm_x86_ops.nested_ops->get_nested_pgd;
> +		vcpu->arch.guest_mmu.get_pdptr = kvm_x86_ops.nested_ops->get_nested_pdptr;
> +		vcpu->arch.guest_mmu.inject_page_fault = kvm_x86_ops.nested_ops->inject_nested_tdp_vmexit;

Using nested_ops is clever, but IMO unnecessary, especially since we can go even
further by adding a nEPT specific hook to initialize its constant shadow paging
stuff.

Here's what I had written spliced in with your code.  Compile tested only for
this version.


From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 21 Feb 2022 11:22:42 -0500
Subject: [PATCH] KVM: x86/mmu: initialize constant-value fields just once

The get_guest_pgd, get_pdptr and inject_page_fault pointers are constant
for all three of root_mmu, guest_mmu and nested_mmu.  The guest_mmu
function pointers depend on the processor vendor, but are otherwise
constant.

Opportunistically stop initializing get_pdptr for nested EPT, since it
does not have PDPTRs.

Opportunistically change kvm_mmu_create() to return '0' unconditionally
in its happy path to make it obvious that it's a happy path.

Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.h        |  1 +
 arch/x86/kvm/mmu/mmu.c    | 85 ++++++++++++++++++++++++---------------
 arch/x86/kvm/svm/nested.c | 15 +++++--
 arch/x86/kvm/svm/svm.c    |  3 ++
 arch/x86/kvm/svm/svm.h    |  1 +
 arch/x86/kvm/vmx/nested.c | 13 ++++--
 arch/x86/kvm/vmx/nested.h |  1 +
 arch/x86/kvm/vmx/vmx.c    |  3 ++
 8 files changed, 82 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 9517e56a0da1..bd2a6e20307c 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
 void kvm_init_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
 			     unsigned long cr4, u64 efer, gpa_t nested_cr3);
+void kvm_init_shadow_ept_mmu_constants(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 			     int huge_page_level, bool accessed_dirty,
 			     gpa_t new_eptp);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8c388add95cb..db2d88c59198 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4778,12 +4778,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,

 	context->cpu_mode.as_u64 = cpu_mode.as_u64;
 	context->root_role.word = root_role.word;
-	context->page_fault = kvm_tdp_page_fault;
-	context->sync_page = nonpaging_sync_page;
-	context->invlpg = NULL;
-	context->get_guest_pgd = kvm_get_guest_cr3;
-	context->get_pdptr = kvm_pdptr_read;
-	context->inject_page_fault = kvm_inject_page_fault;

 	if (!is_cr0_pg(context))
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -4793,7 +4787,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 		context->gva_to_gpa = paging32_gva_to_gpa;

 	reset_guest_paging_metadata(vcpu, context);
-	reset_tdp_shadow_zero_bits_mask(context);
 }

 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
@@ -4818,8 +4811,8 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 	reset_shadow_zero_bits_mask(vcpu, context);
 }

-static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
-				union kvm_mmu_paging_mode cpu_mode)
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
+			     union kvm_mmu_paging_mode cpu_mode)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_page_role root_role;
@@ -4891,6 +4884,17 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
 	return role;
 }

+void kvm_init_shadow_ept_mmu_constants(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *guest_mmu = &vcpu->arch.guest_mmu;
+
+	guest_mmu->page_fault = ept_page_fault;
+	guest_mmu->gva_to_gpa = ept_gva_to_gpa;
+	guest_mmu->sync_page  = ept_sync_page;
+	guest_mmu->invlpg     = ept_invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu_constants);
+
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 			     int huge_page_level, bool accessed_dirty,
 			     gpa_t new_eptp)
@@ -4912,7 +4916,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		context->invlpg = ept_invlpg;

 		update_permission_bitmask(context, true);
-		context->pkru_mask = 0;
 		reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
 		reset_ept_shadow_zero_bits_mask(context, execonly);
 	}
@@ -4921,18 +4924,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);

-static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
-			     union kvm_mmu_paging_mode cpu_mode)
-{
-	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-
-	kvm_init_shadow_mmu(vcpu, cpu_mode);
-
-	context->get_guest_pgd	   = kvm_get_guest_cr3;
-	context->get_pdptr         = kvm_pdptr_read;
-	context->inject_page_fault = kvm_inject_page_fault;
-}
-
 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
 				union kvm_mmu_paging_mode new_mode)
 {
@@ -4941,16 +4932,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
 	if (new_mode.as_u64 == g_context->cpu_mode.as_u64)
 		return;

-	g_context->cpu_mode.as_u64   = new_mode.as_u64;
-	g_context->get_guest_pgd     = kvm_get_guest_cr3;
-	g_context->get_pdptr         = kvm_pdptr_read;
-	g_context->inject_page_fault = kvm_inject_page_fault;
-
-	/*
-	 * L2 page tables are never shadowed, so there is no need to sync
-	 * SPTEs.
-	 */
-	g_context->invlpg            = NULL;
+	g_context->cpu_mode.as_u64 = new_mode.as_u64;

 	/*
 	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
@@ -5499,6 +5481,40 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
 	free_page((unsigned long)mmu->pml5_root);
 }

+static void kvm_init_mmu_constants(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *nested_mmu = &vcpu->arch.nested_mmu;
+	struct kvm_mmu *root_mmu = &vcpu->arch.root_mmu;
+
+	root_mmu->get_guest_pgd	    = kvm_get_guest_cr3;
+	root_mmu->get_pdptr	    = kvm_pdptr_read;
+	root_mmu->inject_page_fault = kvm_inject_page_fault;
+
+	/*
+	 * When shadowing IA32 page tables, all other callbacks vary based
+	 * on paging mode, and the guest+nested MMUs are unused.
+	 */
+	if (!tdp_enabled)
+		return;
+
+	root_mmu->page_fault = kvm_tdp_page_fault;
+	root_mmu->sync_page  = nonpaging_sync_page;
+	root_mmu->invlpg     = NULL;
+	reset_tdp_shadow_zero_bits_mask(&vcpu->arch.root_mmu);
+
+	/*
+	 * Nested TDP MMU callbacks that are constant are vendor specific due
+	 * to the vast differences between EPT and NPT.  NPT in particular is
+	 * nasty because L1 may use 32-bit and/or 64-bit paging.
+	 */
+	nested_mmu->get_guest_pgd     = kvm_get_guest_cr3;
+	nested_mmu->get_pdptr         = kvm_pdptr_read;
+	nested_mmu->inject_page_fault = kvm_inject_page_fault;
+
+	/* L2 page tables are never shadowed, there's no need to sync SPTEs. */
+	nested_mmu->invlpg            = NULL;
+}
+
 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
 {
 	struct page *page;
@@ -5575,7 +5591,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	if (ret)
 		goto fail_allocate_root;

-	return ret;
+	kvm_init_mmu_constants(vcpu);
+
+	return 0;
+
  fail_allocate_root:
 	free_mmu_pages(&vcpu->arch.guest_mmu);
 	return ret;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index dd942c719cf6..c58c9d876a6c 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -96,6 +96,15 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
 	return svm->nested.ctl.nested_cr3;
 }

+void nested_svm_init_mmu_constants(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *guest_mmu = &vcpu->arch.guest_mmu;
+
+	guest_mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
+	guest_mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
+	guest_mmu->inject_page_fault = nested_svm_inject_npf_exit;
+}
+
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -112,10 +121,8 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 	kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
 				svm->vmcb01.ptr->save.efer,
 				svm->nested.ctl.nested_cr3);
-	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
-	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
-	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
-	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+
+	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
 }

 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a8ee949b2403..db62b3e88317 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1228,6 +1228,9 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)

 	svm->guest_state_loaded = false;

+	if (npt_enabled && nested)
+		nested_svm_init_mmu_constants(vcpu);
+
 	return 0;

 error_free_vmsa_page:
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e45b5645d5e0..99c5a57ab5dd 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -564,6 +564,7 @@ void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
 void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
 void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
+void nested_svm_init_mmu_constants(struct kvm_vcpu *vcpu);

 extern struct kvm_x86_nested_ops svm_nested_ops;

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index cc4c74339d35..385f60305555 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -407,15 +407,22 @@ static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
 				nested_ept_get_eptp(vcpu));
 }

+void nested_ept_init_mmu_constants(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *mmu = &vcpu->arch.guest_mmu;
+
+	mmu->get_guest_pgd	= nested_ept_get_eptp;
+	mmu->inject_page_fault	= nested_ept_inject_page_fault;
+
+	kvm_init_shadow_ept_mmu_constants(vcpu);
+}
+
 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
 	WARN_ON(mmu_is_nested(vcpu));

 	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
 	nested_ept_new_eptp(vcpu);
-	vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
-	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
-	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;

 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 }
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index c92cea0b8ccc..78e6d9ba5839 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -37,6 +37,7 @@ void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
 				 int size);
+void nested_ept_init_mmu_constants(struct kvm_vcpu *vcpu);

 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 40e015e9b260..04edb8a761a8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7081,6 +7081,9 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
 			goto free_vmcs;
 	}

+	if (enable_ept && nested)
+		nested_ept_init_mmu_constants(vcpu);
+
 	return 0;

 free_vmcs:

base-commit: 94fd8078bd4f838cf9ced265e6ac4237cbcba7a1
--


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role
  2022-03-08 18:55   ` Sean Christopherson
@ 2022-03-09  9:58     ` Paolo Bonzini
  2022-03-09 15:38       ` Sean Christopherson
  0 siblings, 1 reply; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-09  9:58 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/8/22 19:55, Sean Christopherson wrote:
>>   static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
>> -				    const struct kvm_mmu_role_regs *regs,
>> -				    union kvm_mmu_role new_role)
>> +				    union kvm_mmu_role cpu_mode,
> Can you give all helpers this treatment (rename "role" => "cpu_mode")?  I got
> tripped up a few times reading patches because the ones where it wasn't necessary,
> i.e. where there's only a single kvm_mmu_role parameter, were left as-is.
> 
> I think kvm_calc_shadow_npt_root_page_role() and kvm_calc_shadow_mmu_root_page_role()
> are the only offenders.

These take struct kvm_mmu_role_regs; they *return* union kvm_mmu_role 
but that is changed later in the series to the base part only.
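
That is, at this point in the series those two helpers still have roughly
this shape -- sketched here for clarity, not the exact code:

	static union kvm_mmu_role
	kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
					   const struct kvm_mmu_role_regs *regs);

and likewise for kvm_calc_shadow_npt_root_page_role().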

Paolo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 19/25] KVM: x86/mmu: simplify and/or inline computation of shadow MMU roles
  2022-03-08 19:35   ` Sean Christopherson
  2022-03-08 19:41     ` Sean Christopherson
@ 2022-03-09 10:33     ` Paolo Bonzini
  1 sibling, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-09 10:33 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/8/22 20:35, Sean Christopherson wrote:
>> +	root_role = cpu_mode.base;
>> +	root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
> Heh, we have different definitions of "simpler".   Can we split the difference
> and do?
> 
> 	/* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
> 	if (!____is_efer_lma(regs))
> 		root_role.level = PT32E_ROOT_LEVEL;
> 

It's not that easy until the very end (when cpu_mode is set in 
kvm_mmu_init_walker), but I'll make sure to switch to is_efer_lma once 
it is possible.
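
For completeness, the two forms end up picking the same root level: a guest
outside long mode walks at most 3 levels, so clamping with max_t() and
branching on EFER.LMA are equivalent -- a minimal annotated comparison
(comments added here, not in the series):

	/* PT32E_ROOT_LEVEL == 3; a !LMA guest uses a 2- or 3-level walk. */
	root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);

	/* ...which selects the same shadow root level as: */
	if (!____is_efer_lma(regs))
		root_role.level = PT32E_ROOT_LEVEL;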

Paolo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 24/25] KVM: x86/mmu: initialize constant-value fields just once
  2022-03-08 20:58   ` Sean Christopherson
@ 2022-03-09 10:34     ` Paolo Bonzini
  0 siblings, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-09 10:34 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/8/22 21:58, Sean Christopherson wrote:
> Using nested_ops is clever, but IMO unnecessary, especially since we can go even
> further by adding a nEPT specific hook to initialize its constant shadow paging
> stuff.
> 
> Here's what I had written spliced in with your code.  Compile tested only for
> this version.

I'll do something in between, keeping the nested_ops but with three 
functions to initialize the various kvm_mmu structs.
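
Roughly along these lines -- the hook name and call site below are only a
sketch of what I have in mind, not final code:

	/* Illustrative vendor hook for the constant guest_mmu callbacks: */
	struct kvm_x86_nested_ops {
		...
		void (*init_mmu_constants)(struct kvm_vcpu *vcpu);
		...
	};

	/* At vCPU creation, next to the common root_mmu/nested_mmu init: */
	kvm_init_mmu_constants(vcpu);
	if (kvm_x86_ops.nested_ops->init_mmu_constants)
		kvm_x86_ops.nested_ops->init_mmu_constants(vcpu);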

Paolo

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role
  2022-03-09  9:58     ` Paolo Bonzini
@ 2022-03-09 15:38       ` Sean Christopherson
  2022-03-09 15:40         ` Paolo Bonzini
  0 siblings, 1 reply; 66+ messages in thread
From: Sean Christopherson @ 2022-03-09 15:38 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: linux-kernel, kvm, dmatlack

On Wed, Mar 09, 2022, Paolo Bonzini wrote:
> On 3/8/22 19:55, Sean Christopherson wrote:
> > >   static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
> > > -				    const struct kvm_mmu_role_regs *regs,
> > > -				    union kvm_mmu_role new_role)
> > > +				    union kvm_mmu_role cpu_mode,
> > Can you give all helpers this treatment (rename "role" => "cpu_mode")?  I got
> > tripped up a few times reading patches because the ones where it wasn't necessary,
> > i.e. where there's only a single kvm_mmu_role parameter, were left as-is.
> > 
> > I think kvm_calc_shadow_npt_root_page_role() and kvm_calc_shadow_mmu_root_page_role()
> > are the only offenders.
> 
> These take struct kvm_mmu_role_regs; they *return* union kvm_mmu_role but
> that is changed later in the series to the base part only.

Doh, sorry, a later patch is what confused me.  Lost track of things when moving
around in the series.

Patch 12, "KVM: x86/mmu: cleanup computation of MMU roles for shadow paging", is
where we end up with

	static union kvm_mmu_role
	kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
					   union kvm_mmu_role role)
	{
		if (!role.ext.efer_lma)
			role.base.level = PT32E_ROOT_LEVEL;
		else if (role.ext.cr4_la57)
			role.base.level = PT64_ROOT_5LEVEL;
		else
			role.base.level = PT64_ROOT_4LEVEL;

		return role;
	}

Can we instead tweak that patch to make it and kvm_calc_shadow_npt_root_page_role() be

	static union kvm_mmu_role
	kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
					union kvm_mmu_role cpu_role)
	{
		union kvm_mmu_role root_role = cpu_role;

		if (!cpu_role.ext.efer_lma)
			root_role.base.level = PT32E_ROOT_LEVEL;
		else if (cpu_role.ext.cr4_la57)
			root_role.base.level = PT64_ROOT_5LEVEL;
		else
			root_role.base.level = PT64_ROOT_4LEVEL;

		return root_role;
	}

This is effectively the same feedback I gave in Patch 15[*]; I messed up when
trying to track back where the "union kvm_mmu_role role" param was introduced.

https://lore.kernel.org/all/YieoXYBFyo9pZhhX@google.com

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role
  2022-03-09 15:38       ` Sean Christopherson
@ 2022-03-09 15:40         ` Paolo Bonzini
  0 siblings, 0 replies; 66+ messages in thread
From: Paolo Bonzini @ 2022-03-09 15:40 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: linux-kernel, kvm, dmatlack

On 3/9/22 16:38, Sean Christopherson wrote:
> Can we instead tweak that patch to make it and kvm_calc_shadow_npt_root_page_role() be
> 
> 	static union kvm_mmu_role
> 	kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
> 					union kvm_mmu_role cpu_role)
> 	{
> 		union kvm_mmu_role root_role = cpu_role;
> 
> 		if (!cpu_role.ext.efer_lma)
> 			root_role.base.level = PT32E_ROOT_LEVEL;
> 		else if (cpu_role.ext.cr4_la57)
> 			root_role.base.level = PT64_ROOT_5LEVEL;
> 		else
> 			root_role.base.level = PT64_ROOT_4LEVEL;
> 
> 		return root_role;
> 	}

Yep, figured the same in the meanwhile.

Paolo


^ permalink raw reply	[flat|nested] 66+ messages in thread

end of thread, newest message: 2022-03-09 15:40 UTC

Thread overview: 66+ messages
2022-02-21 16:22 [PATCH v2 00/25] KVM MMU refactoring part 2: role changes Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 01/25] KVM: x86/mmu: avoid indirect call for get_cr3 Paolo Bonzini
2022-03-08 16:16   ` Sean Christopherson
2022-03-08 16:21     ` Paolo Bonzini
2022-03-08 16:32       ` Sean Christopherson
2022-03-08 16:43         ` Paolo Bonzini
2022-03-08 16:53           ` Sean Christopherson
2022-03-08 17:14             ` Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 02/25] KVM: x86/mmu: nested EPT cannot be used in SMM Paolo Bonzini
2022-03-08 16:18   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 03/25] KVM: x86/mmu: constify uses of struct kvm_mmu_role_regs Paolo Bonzini
2022-03-08 16:22   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 04/25] KVM: x86/mmu: pull computation of kvm_mmu_role_regs to kvm_init_mmu Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 05/25] KVM: x86/mmu: rephrase unclear comment Paolo Bonzini
2022-03-08 16:39   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 06/25] KVM: nVMX/nSVM: do not monkey-patch inject_page_fault callback Paolo Bonzini
2022-03-08 17:13   ` Sean Christopherson
2022-03-08 20:34     ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 07/25] KVM: x86/mmu: remove "bool base_only" arguments Paolo Bonzini
2022-03-08 17:15   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 08/25] KVM: x86/mmu: split cpu_mode from mmu_role Paolo Bonzini
2022-03-08 17:36   ` Sean Christopherson
2022-03-08 17:49     ` Paolo Bonzini
2022-03-08 18:55   ` Sean Christopherson
2022-03-09  9:58     ` Paolo Bonzini
2022-03-09 15:38       ` Sean Christopherson
2022-03-09 15:40         ` Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 09/25] KVM: x86/mmu: do not recompute root level from kvm_mmu_role_regs Paolo Bonzini
2022-03-08 17:41   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 10/25] KVM: x86/mmu: remove ept_ad field Paolo Bonzini
2022-03-08 17:42   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 11/25] KVM: x86/mmu: remove kvm_calc_shadow_root_page_role_common Paolo Bonzini
2022-03-08 17:48   ` Sean Christopherson
2022-03-08 17:50     ` Paolo Bonzini
2022-03-08 18:17       ` Sean Christopherson
2022-03-08 18:18         ` Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 12/25] KVM: x86/mmu: cleanup computation of MMU roles for two-dimensional paging Paolo Bonzini
2022-03-08 18:11   ` Sean Christopherson
2022-03-08 18:24     ` Paolo Bonzini
2022-03-08 18:44       ` Sean Christopherson
2022-03-08 18:38     ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 13/25] KVM: x86/mmu: cleanup computation of MMU roles for shadow paging Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 14/25] KVM: x86/mmu: store shadow EFER.NX in the MMU role Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 15/25] KVM: x86/mmu: remove extended bits from mmu_role, rename field Paolo Bonzini
2022-03-08 19:02   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 16/25] KVM: x86/mmu: rename kvm_mmu_role union Paolo Bonzini
2022-03-08 19:15   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 17/25] KVM: x86/mmu: remove redundant bits from extended role Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 18/25] KVM: x86/mmu: remove valid " Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 19/25] KVM: x86/mmu: simplify and/or inline computation of shadow MMU roles Paolo Bonzini
2022-03-08 19:35   ` Sean Christopherson
2022-03-08 19:41     ` Sean Christopherson
2022-03-09 10:33     ` Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 20/25] KVM: x86/mmu: pull CPU mode computation to kvm_init_mmu Paolo Bonzini
2022-03-08 19:45   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 21/25] KVM: x86/mmu: replace shadow_root_level with root_role.level Paolo Bonzini
2022-03-08 19:48   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 22/25] KVM: x86/mmu: replace root_level with cpu_mode.base.level Paolo Bonzini
2022-03-08 19:49   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 23/25] KVM: x86/mmu: replace direct_map with root_role.direct Paolo Bonzini
2022-03-08 19:52   ` Sean Christopherson
2022-02-21 16:22 ` [PATCH v2 24/25] KVM: x86/mmu: initialize constant-value fields just once Paolo Bonzini
2022-03-08 20:58   ` Sean Christopherson
2022-03-09 10:34     ` Paolo Bonzini
2022-02-21 16:22 ` [PATCH v2 25/25] KVM: x86/mmu: extract initialization of the page walking data Paolo Bonzini
2022-03-08 20:02   ` Sean Christopherson
