From mboxrd@z Thu Jan  1 00:00:00 1970
From: Jun Nakajima <jun.nakajima@intel.com>
Subject: [PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h
Date: Sat, 18 May 2013 21:52:22 -0700
Message-ID: <1368939152-11406-3-git-send-email-jun.nakajima@intel.com>
References: <1368939152-11406-1-git-send-email-jun.nakajima@intel.com>
Cc: Gleb Natapov <gleb@redhat.com>, Paolo Bonzini <pbonzini@redhat.com>
To: kvm@vger.kernel.org
Return-path: <kvm-owner@vger.kernel.org>
Received: from mail-pa0-f50.google.com ([209.85.220.50]:53404 "EHLO
	mail-pa0-f50.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1751155Ab3ESEwi (ORCPT <rfc822;kvm@vger.kernel.org>);
	Sun, 19 May 2013 00:52:38 -0400
Received: by mail-pa0-f50.google.com with SMTP id fb10so4756901pad.37
        for <kvm@vger.kernel.org>; Sat, 18 May 2013 21:52:38 -0700 (PDT)
In-Reply-To: <1368939152-11406-1-git-send-email-jun.nakajima@intel.com>
Sender: kvm-owner@vger.kernel.org
List-ID: <kvm.vger.kernel.org>

From: Nadav Har'El <nyh@il.ibm.com>

This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significanlty improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
---
 arch/x86/kvm/mmu.c         |  5 +++++
 arch/x86/kvm/paging_tmpl.h | 43 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 117233f..6c1670f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
 	return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df34d4a..4c45654 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
 	#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+	#define pt_element_t u64
+	#define guest_walker guest_walkerEPT
+	#define FNAME(name) EPT_##name
+	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#ifdef CONFIG_X86_64
+	#define PT_MAX_FULL_LEVELS 4
+	#define CMPXCHG cmpxchg
+	#else
+	#define CMPXCHG cmpxchg64
+	#define PT_MAX_FULL_LEVELS 2
+	#endif
 #else
 	#error Invalid PTTYPE value
 #endif
@@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
 	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ *  Comment out this for EPT because update_accessed_dirty_bits() is not used.
+ */
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			       pt_element_t __user *ptep_user, unsigned index,
 			       pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 
 	return (ret != orig_pte);
 }
+#endif
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp, u64 *spte,
@@ -126,13 +147,21 @@ no_present:
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
 	unsigned access;
-
+#if PTTYPE == PTTYPE_EPT
+	access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+			  VMX_EPT_EXECUTABLE_MASK));
+#else
 	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
 	access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
 
 	return access;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ * EPT A/D bit support is not implemented.
+ */
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 					     struct kvm_mmu *mmu,
 					     struct guest_walker *walker,
@@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 	}
 	return 0;
 }
+#endif
 
 /*
  * Fetch a guest pte for a guest virtual address
@@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 				    gva_t addr, u32 access)
 {
-	int ret;
 	pt_element_t pte;
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
@@ -192,7 +221,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	gfn_t gfn;
 
 	trace_kvm_mmu_pagetable_walk(addr, access);
+#if PTTYPE != PTTYPE_EPT
 retry_walk:
+#endif
 	walker->level = mmu->root_level;
 	pte           = mmu->get_cr3(vcpu);
 
@@ -277,6 +308,7 @@ retry_walk:
 
 	walker->gfn = real_gpa >> PAGE_SHIFT;
 
+#if PTTYPE != PTTYPE_EPT
 	if (!write_fault)
 		protect_clean_gpte(&pte_access, pte);
 	else
@@ -287,12 +319,15 @@ retry_walk:
 		accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
 
 	if (unlikely(!accessed_dirty)) {
+		int ret;
+
 		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
 		if (unlikely(ret < 0))
 			goto error;
 		else if (ret)
 			goto retry_walk;
 	}
+#endif
 
 	walker->pt_access = pt_access;
 	walker->pte_access = pte_access;
@@ -323,6 +358,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 					access);
 }
 
+#if PTTYPE != PTTYPE_EPT
 static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 				   struct kvm_vcpu *vcpu, gva_t addr,
 				   u32 access)
@@ -330,6 +366,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
 					addr, access);
 }
+#endif
 
 static bool
 FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -754,6 +791,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	return gpa;
 }
 
+#if PTTYPE != PTTYPE_EPT
 static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 				      u32 access,
 				      struct x86_exception *exception)
@@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 
 	return gpa;
 }
+#endif
 
 /*
  * Using the cached information from sp->gfns is safe because:
-- 
1.8.1.2