linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	x86@kernel.org, linux-mm@kvack.org, luto@kernel.org,
	peterz@infradead.org, dave.hansen@intel.com, pbonzini@redhat.com,
	sean.j.christopherson@intel.com, keescook@chromium.org
Cc: kristen@linux.intel.com, deneen.t.dock@intel.com,
	Rick Edgecombe <rick.p.edgecombe@intel.com>
Subject: [RFC PATCH 05/13] kvm: Add #PF injection for KVM XO
Date: Thu,  3 Oct 2019 14:23:52 -0700	[thread overview]
Message-ID: <20191003212400.31130-6-rick.p.edgecombe@intel.com> (raw)
In-Reply-To: <20191003212400.31130-1-rick.p.edgecombe@intel.com>

If there is a read or write violation on the gfn range of an XO memslot,
then inject a page fault into the guest with the guest virtual address
that faulted. This can be done directly if the hardware provides the gva
of the access that caused the fault. Otherwise, the violating instruction
needs to be emulated to figure it out.

TODO:
Currently ACC_USER_MASK is used to mean not-readable in the EPT case,
but in the x86 page tables case it means the real user bit and so can't
be overloaded to mean not-readable. A new dedicated ACC_ flag is probably
needed to express not-readable for the XOM cases. Instead of changing that
everywhere, a conditional is added in paging_tmpl.h to check for the KVM
XO bit. This should probably be made to work with the logic in
permission_fault() instead of having a special case.

Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu.c              | 52 +++++++++++++++++++++++++++++++++
 arch/x86/kvm/paging_tmpl.h      | 29 ++++++++++++++----
 arch/x86/kvm/x86.c              |  5 +++-
 4 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b363a7fc47b0..6d06c794d720 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -785,6 +785,8 @@ struct kvm_vcpu_arch {
 	bool gva_available;
 	gva_t gva_val;
 
+	bool xo_fault;
+
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 338cc64cc821..d5ba44066b62 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -45,6 +45,7 @@
 #include <asm/io.h>
 #include <asm/vmx.h>
 #include <asm/kvm_page_track.h>
+#include <asm/traps.h>
 #include "trace.h"
 
 /*
@@ -4130,6 +4131,34 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
 	return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
 }
 
+
+/*
+ * Inject a #PF for a read or write to an XO memslot. Assumes any TDP violation
+ * on an XO memslot is a read or write. Returns 0 if the gva is unknown, else 1.
+ */
+static int try_inject_exec_only_pf(struct kvm_vcpu *vcpu, u64 error_code)
+{
+	struct x86_exception fault;
+	int cpl = kvm_x86_ops->get_cpl(vcpu);
+	u16 fault_error_code = X86_PF_PROT | (cpl == 3 ? X86_PF_USER : 0);
+
+	if (!vcpu->arch.gva_available)
+		return 0;
+
+	if (error_code & PFERR_WRITE_MASK)
+		fault_error_code |= X86_PF_WRITE;
+
+	fault.vector = PF_VECTOR;
+	fault.error_code_valid = true;
+	fault.error_code = fault_error_code;
+	fault.nested_page_fault = false;
+	fault.address = vcpu->arch.gva_val;
+	fault.async_page_fault = false;	/* genuine guest #PF, not an async PF */
+	kvm_inject_page_fault(vcpu, &fault);
+
+	return 1;
+}
+
 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 			  bool prefault)
 {
@@ -4141,12 +4170,35 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
 	bool map_writable;
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
 	if (page_fault_handle_page_track(vcpu, error_code, gfn))
 		return RET_PF_EMULATE;
 
+	/*
+	 * Set xo_fault when the fault is a read or write fault on an xo memslot
+	 * so that the emulator knows it needs to check page table permissions
+	 * and will inject a fault.
+	 */
+	vcpu->arch.xo_fault = false;
+	if (slot && unlikely((slot->flags & KVM_MEM_EXECONLY)
+		&& !(error_code & PFERR_FETCH_MASK)))
+		vcpu->arch.xo_fault = true;
+
+	/* If memslot is xo, need to inject fault */
+	if (unlikely(vcpu->arch.xo_fault)) {
+		/*
+		 * If not enough information to inject the fault,
+		 * emulate to figure it out and emulate the PF.
+		 */
+		if (!try_inject_exec_only_pf(vcpu, error_code))
+			return RET_PF_EMULATE;
+
+		return 1;
+	}
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7d5cdb3af594..eae1871c5225 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -307,7 +307,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	gpa_t pte_gpa;
 	bool have_ad;
 	int offset;
-	u64 walk_nx_mask = 0;
+	u64 walk_mask = 0;
+	u64 walk_nr_mask = 0;
+	bool kvm_xo = guest_cpuid_has(vcpu, X86_FEATURE_KVM_XO);
 	const int write_fault = access & PFERR_WRITE_MASK;
 	const int user_fault  = access & PFERR_USER_MASK;
 	const int fetch_fault = access & PFERR_FETCH_MASK;
@@ -322,7 +324,11 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
 
 #if PTTYPE == 64
-	walk_nx_mask = 1ULL << PT64_NX_SHIFT;
+	walk_mask = 1ULL << PT64_NX_SHIFT;
+	if (kvm_xo) {
+		walk_nr_mask = 1ULL << cpuid_maxphyaddr(vcpu);
+		walk_mask |= walk_nr_mask;
+	}
 	if (walker->level == PT32E_ROOT_LEVEL) {
 		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
@@ -395,7 +401,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 		 * Inverting the NX it lets us AND it like other
 		 * permission bits.
 		 */
-		pte_access = pt_access & (pte ^ walk_nx_mask);
+		pte_access = pt_access & (pte ^ walk_mask);
 
 		if (unlikely(!FNAME(is_present_gpte)(pte)))
 			goto error;
@@ -412,12 +418,25 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
 
 	/* Convert to ACC_*_MASK flags for struct guest_walker.  */
-	walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
-	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
+	walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_mask);
+	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_mask);
+
 	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
 	if (unlikely(errcode))
 		goto error;
 
+	/*
+	 * KVM XO bit is not checked in permission_fault(), so check it here and
+	 * inject appropriate fault.
+	 */
+	if (kvm_xo && !fetch_fault
+	    && (walk_nr_mask & (pte_access ^ walk_nr_mask))) {
+		errcode = PFERR_PRESENT_MASK;
+		if (write_fault)
+			errcode	|= PFERR_WRITE_MASK;
+		goto error;
+	}
+
 	gfn = gpte_to_gfn_lvl(pte, walker->level);
 	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index aa138d3a86c5..2e321d788672 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5494,8 +5494,11 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
 	 * Note, this cannot be used on string operations since string
 	 * operation using rep will only have the initial GPA from the NPF
 	 * occurred.
+	 *
+	 * If the fault was an XO fault, we need to walk the page tables to
+	 * determine the gva and emulate the PF.
 	 */
-	if (vcpu->arch.gpa_available &&
+	if (!vcpu->arch.xo_fault && vcpu->arch.gpa_available &&
 	    emulator_can_use_gpa(ctxt) &&
 	    (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
 		gpa = vcpu->arch.gpa_val;
-- 
2.17.1


  parent reply	other threads:[~2019-10-03 21:39 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-03 21:23 [RFC PATCH 00/13] XOM for KVM guest userspace Rick Edgecombe
2019-10-03 21:23 ` [RFC PATCH 01/13] kvm: Enable MTRR to work with GFNs with perm bits Rick Edgecombe
2019-10-14  6:47   ` Yu Zhang
2019-10-14 18:44     ` Edgecombe, Rick P
2019-10-03 21:23 ` [RFC PATCH 02/13] kvm: Add support for X86_FEATURE_KVM_XO Rick Edgecombe
2019-10-03 21:23 ` [RFC PATCH 03/13] kvm: Add XO memslot type Rick Edgecombe
2019-10-04  7:27   ` Paolo Bonzini
2019-10-04 19:06     ` Edgecombe, Rick P
2019-10-06 16:15       ` Paolo Bonzini
2019-10-03 21:23 ` [RFC PATCH 04/13] kvm, vmx: Add support for gva exit qualification Rick Edgecombe
2019-10-03 21:23 ` Rick Edgecombe [this message]
2019-10-04  7:42   ` [RFC PATCH 05/13] kvm: Add #PF injection for KVM XO Paolo Bonzini
2019-10-04 19:11     ` Edgecombe, Rick P
2019-10-03 21:23 ` [RFC PATCH 06/13] kvm: Add KVM_CAP_EXECONLY_MEM Rick Edgecombe
2019-10-04  7:24   ` Paolo Bonzini
2019-10-04 19:11     ` Edgecombe, Rick P
2019-10-03 21:23 ` [RFC PATCH 07/13] kvm: Add docs for KVM_CAP_EXECONLY_MEM Rick Edgecombe
2019-10-03 21:23 ` [RFC PATCH 08/13] x86/boot: Rename USE_EARLY_PGTABLE_L5 Rick Edgecombe
2019-10-03 21:23 ` [RFC PATCH 09/13] x86/cpufeature: Add detection of KVM XO Rick Edgecombe
2019-10-29 23:33   ` Kees Cook
2019-10-29 23:52     ` Edgecombe, Rick P
2019-10-30 14:55       ` Sean Christopherson
2019-10-30 21:02         ` Edgecombe, Rick P
2019-10-03 21:23 ` [RFC PATCH 10/13] x86/mm: Add NR page bit for " Rick Edgecombe
2019-10-04  7:33   ` Paolo Bonzini
2019-10-03 21:23 ` [RFC PATCH 11/13] x86, ptdump: Add NR bit to page table dump Rick Edgecombe
2019-10-03 21:23 ` [RFC PATCH 12/13] mmap: Add XO support for KVM XO Rick Edgecombe
2019-10-04  7:34   ` Paolo Bonzini
2019-10-04 19:12     ` Edgecombe, Rick P
2019-10-03 21:24 ` [RFC PATCH 13/13] x86/Kconfig: Add Kconfig for KVM based XO Rick Edgecombe
2019-10-29 23:36   ` Kees Cook
2019-10-30  0:01     ` Edgecombe, Rick P
2019-10-30 18:36       ` Kees Cook
2019-10-04  7:22 ` [RFC PATCH 00/13] XOM for KVM guest userspace Paolo Bonzini
2019-10-04 19:03   ` Edgecombe, Rick P
2019-10-04 14:56 ` Andy Lutomirski
2019-10-04 20:09   ` Edgecombe, Rick P
2019-10-05  1:33     ` Andy Lutomirski
2019-10-07 18:14       ` Edgecombe, Rick P
2019-10-29 23:40 ` Kees Cook
2019-10-30  0:27   ` Edgecombe, Rick P

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191003212400.31130-6-rick.p.edgecombe@intel.com \
    --to=rick.p.edgecombe@intel.com \
    --cc=dave.hansen@intel.com \
    --cc=deneen.t.dock@intel.com \
    --cc=keescook@chromium.org \
    --cc=kristen@linux.intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=sean.j.christopherson@intel.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).