From: Sean Christopherson <sean.j.christopherson@intel.com>
To: Paolo Bonzini <pbonzini@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>,
	Andy Lutomirski <luto@amacapital.net>,
	Vivek Goyal <vgoyal@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Andy Lutomirski <luto@kernel.org>,
	LKML <linux-kernel@vger.kernel.org>, X86 ML <x86@kernel.org>,
	kvm list <kvm@vger.kernel.org>, stable <stable@vger.kernel.org>
Subject: Re: [PATCH v2] x86/kvm: Disable KVM_ASYNC_PF_SEND_ALWAYS
Date: Wed, 8 Apr 2020 08:34:14 -0700
Message-ID: <20200408153413.GA11322@linux.intel.com>
In-Reply-To: <274f3d14-08ac-e5cc-0b23-e6e0274796c8@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 916 bytes --]

On Wed, Apr 08, 2020 at 10:23:58AM +0200, Paolo Bonzini wrote:
> Page-not-present async page faults are almost a perfect match for the
> hardware use of #VE (and it might even be possible to let the processor
> deliver the exceptions).

My "async" page fault knowledge is limited, but if the desired behavior is
to reflect a fault into the guest for select EPT Violations, then yes,
enabling EPT Violation #VEs in hardware is doable.  The big gotcha is that
KVM needs to set the suppress #VE bit for all EPTEs when allocating a new
MMU page, otherwise not-present faults on zero-initialized EPTEs will get
reflected.
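
For reference, the suppress #VE bit is bit 63 of an EPT entry, so the
VMX_EPT_SUPPRESS_VE_BIT used in the snippet below boils down to a
definition along these lines (a sketch using the kernel's BIT_ULL()
helper; where exactly the define would live is an assumption):

	/* EPT entry bit 63: suppress EPT Violation #VE for this entry. */
	#define VMX_EPT_SUPPRESS_VE_BIT		BIT_ULL(63)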

Attached is a patch that does the prep work in the MMU.  The VMX usage would be:

	kvm_mmu_set_spte_init_value(VMX_EPT_SUPPRESS_VE_BIT);

when EPT Violation #VEs are enabled.  It's 64-bit only as it uses stosq to
initialize EPTEs.  32-bit could also be supported by doing memcpy() from
a static page.
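
A rough sketch of that 32-bit fallback, assuming a template page that is
filled once when the init value is configured (kvm_fill_init_page() and
kvm_clear_ptes_32() are illustrative names, not part of the attached
patch):

	/* Template page holding 512 copies of the SPTE init value. */
	static u64 shadow_init_page[4096 / sizeof(u64)];

	static void kvm_fill_init_page(u64 init_value)
	{
		int i;

		for (i = 0; i < 4096 / sizeof(u64); i++)
			shadow_init_page[i] = init_value;
	}

	/* 32-bit stand-in for kvm_clear_ptes(): copy the template. */
	static void kvm_clear_ptes_32(void *page)
	{
		memcpy(page, shadow_init_page, 4096);
	}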

[-- Attachment #2: 0001-KVM-x86-mmu-Allow-non-zero-init-value-for-shadow-PTE.patch --]
[-- Type: text/x-diff, Size: 5340 bytes --]

From 078b485e8a64e6d72ebad58bf66f950763ba30bb Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 29 Jul 2019 19:23:46 -0700
Subject: [PATCH] KVM: x86/mmu: Allow non-zero init value for shadow PTE

Add support for using a non-zero "init" value for shadow PTEs, which is
required to enable EPT Violation #VEs in hardware.  When #VEs are
enabled, KVM needs to set the "suppress #VE" bit in unused PTEs to avoid
unintentionally reflecting not-present EPT Violations into the guest.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
 arch/x86/kvm/mmu.h             |  1 +
 arch/x86/kvm/mmu/mmu.c         | 43 ++++++++++++++++++++++++++++------
 arch/x86/kvm/mmu/paging_tmpl.h |  2 +-
 3 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 8a3b1bce722a..139db8a125d6 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,7 @@ static inline u64 rsvd_bits(int s, int e)
 }
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_spte_init_value(u64 init_value);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8071952e9cf2..742ea9c254c4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -250,6 +250,8 @@ static u64 __read_mostly shadow_mmio_access_mask;
 static u64 __read_mostly shadow_present_mask;
 static u64 __read_mostly shadow_me_mask;
 
+static u64 __read_mostly shadow_init_value;
+
 /*
  * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
@@ -538,6 +540,13 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
+void kvm_mmu_set_spte_init_value(u64 init_value)
+{
+	WARN_ON(!IS_ENABLED(CONFIG_X86_64) && init_value);
+	shadow_init_value = init_value;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_spte_init_value);
+
 static u8 kvm_get_shadow_phys_bits(void)
 {
 	/*
@@ -569,6 +578,7 @@ static void kvm_mmu_reset_all_pte_masks(void)
 	shadow_mmio_mask = 0;
 	shadow_present_mask = 0;
 	shadow_acc_track_mask = 0;
+	shadow_init_value = 0;
 
 	shadow_phys_bits = kvm_get_shadow_phys_bits();
 
@@ -610,7 +620,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return (pte != 0) && !is_mmio_spte(pte);
+	return (pte != 0 && pte != shadow_init_value && !is_mmio_spte(pte));
 }
 
 static int is_large_pte(u64 pte)
@@ -921,9 +931,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	u64 old_spte = *sptep;
 
 	if (!spte_has_volatile_bits(old_spte))
-		__update_clear_spte_fast(sptep, 0ull);
+		__update_clear_spte_fast(sptep, shadow_init_value);
 	else
-		old_spte = __update_clear_spte_slow(sptep, 0ull);
+		old_spte = __update_clear_spte_slow(sptep, shadow_init_value);
 
 	if (!is_shadow_present_pte(old_spte))
 		return 0;
@@ -953,7 +963,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
  */
 static void mmu_spte_clear_no_track(u64 *sptep)
 {
-	__update_clear_spte_fast(sptep, 0ull);
+	__update_clear_spte_fast(sptep, shadow_init_value);
 }
 
 static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -2473,6 +2483,20 @@ static void clear_sp_write_flooding_count(u64 *spte)
 	__clear_sp_write_flooding_count(sp);
 }
 
+#ifdef CONFIG_X86_64
+static inline void kvm_clear_ptes(void *page)
+{
+	int ign;
+
+	asm volatile (
+		"rep stosq\n\t"
+		: "=c"(ign), "=D"(page)
+		: "a"(shadow_init_value), "c"(4096/8), "D"(page)
+		: "memory"
+	);
+}
+#endif
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -2553,7 +2577,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
 			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
 	}
-	clear_page(sp->spt);
+#ifdef CONFIG_X86_64
+	if (shadow_init_value)
+		kvm_clear_ptes(sp->spt);
+	else
+#endif
+		clear_page(sp->spt);
 	trace_kvm_mmu_get_page(sp, true);
 
 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
@@ -3515,7 +3544,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
 	bool fault_handled = false;
-	u64 spte = 0ull;
+	u64 spte = shadow_init_value;
 	uint retry_count = 0;
 
 	if (!page_fault_can_be_fast(error_code))
@@ -3951,7 +3980,7 @@ static bool
 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 {
 	struct kvm_shadow_walk_iterator iterator;
-	u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
+	u64 sptes[PT64_ROOT_MAX_LEVEL], spte = shadow_init_value;
 	struct rsvd_bits_validate *rsvd_check;
 	int root, leaf;
 	bool reserved = false;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 9bdf9b7d9a96..949deed15933 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -1025,7 +1025,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		gpa_t pte_gpa;
 		gfn_t gfn;
 
-		if (!sp->spt[i])
+		if (!sp->spt[i] || sp->spt[i] == shadow_init_value)
 			continue;
 
 		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
-- 
2.24.1
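
For anyone skimming the inline asm in kvm_clear_ptes() above: rep stosq
stores RAX (shadow_init_value) RCX times (4096/8 = 512 quadwords) starting
at RDI (the page), i.e. it behaves like this plain-C sketch (illustrative
only, not part of the patch):

	static inline void kvm_clear_ptes_c(void *page)
	{
		u64 *sptep = page;
		int i;

		/* 512 8-byte SPTEs per 4KiB page, each set to the init value. */
		for (i = 0; i < 4096 / sizeof(u64); i++)
			sptep[i] = shadow_init_value;
	}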


