* [patch 0/4] KVM: support for pinning sptes (v2)
@ 2014-07-09 19:12 mtosatti
  2014-07-09 19:12 ` [patch 1/4] KVM: x86: add pinned parameter to page_fault methods mtosatti
                   ` (4 more replies)
  0 siblings, 5 replies; 30+ messages in thread
From: mtosatti @ 2014-07-09 19:12 UTC (permalink / raw)
  To: kvm, ak; +Cc: pbonzini, xiaoguangrong, gleb, avi.kivity

Required by PEBS support as discussed at

Subject: [patch 0/4] [patch 0/5] Implement PEBS virtualization for Silvermont
Message-Id: <1401412327-14810-1-git-send-email-andi@firstfloor.org>

Thread.

----------------------

v2:
	- unify remote kick function	     (Gleb)
	- keep sptes intact on get_dirty_log (Gleb)
	- add new internal error	     (Gleb)
	- unpin sptes/shadow pages	     (Avi)
	- fix spte walker level		     (Nadav)
	- do not require get_page
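
For reference, "unify remote kick function" means the generic request
broadcast now takes an optional per-vcpu predicate, so a reload request
can be limited to vcpus which actually hold pinned ranges. A condensed
sketch (names as used in patch 2 below):

	/* only kick vcpus that registered pinned ranges */
	static bool vcpu_has_pinned(struct kvm_vcpu *vcpu)
	{
		return atomic_read(&vcpu->arch.nr_pinned_ranges);
	}

	static void mmu_reload_pinned_vcpus(struct kvm *kvm)
	{
		make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD, &vcpu_has_pinned);
	}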




^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch 1/4] KVM: x86: add pinned parameter to page_fault methods
  2014-07-09 19:12 [patch 0/4] KVM: support for pinning sptes (v2) mtosatti
@ 2014-07-09 19:12 ` mtosatti
  2014-07-09 19:12 ` [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only) mtosatti
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 30+ messages in thread
From: mtosatti @ 2014-07-09 19:12 UTC (permalink / raw)
  To: kvm, ak; +Cc: pbonzini, xiaoguangrong, gleb, avi.kivity

[-- Attachment #1: add-pinned-parameter-to-page-fault --]
[-- Type: text/plain, Size: 3675 bytes --]

To be used by next patch.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
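A minimal sketch of the new calling convention (hypothetical caller
fragment, based on how the next patch uses it; cr2, gfn and error_code
are assumed to be in scope): the new "pin" argument asks the fault path
to install a pinnable translation, and "*pinned" reports whether that
succeeded.

	bool pinned = false;
	int r;

	/* ordinary fault: pinning not requested, result pointer unused */
	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false, false, NULL);

	/* prefault a guest frame and request that its sptes be pinned */
	r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT, PFERR_WRITE_MASK,
				      false, true, &pinned);
	if (!r && !pinned)
		/* lost a race with an MMU notifier: ask for another reload */
		kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
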
 arch/x86/include/asm/kvm_host.h |    2 +-
 arch/x86/kvm/mmu.c              |   11 ++++++-----
 arch/x86/kvm/paging_tmpl.h      |    2 +-
 arch/x86/kvm/x86.c              |    2 +-
 4 files changed, 9 insertions(+), 8 deletions(-)

Index: kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/include/asm/kvm_host.h	2014-06-18 17:27:47.579549247 -0300
+++ kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h	2014-06-18 17:28:17.549456614 -0300
@@ -259,7 +259,7 @@
 	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
 	u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
-			  bool prefault);
+			  bool prefault, bool pin, bool *pinned);
 	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
 				  struct x86_exception *fault);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-06-18 17:27:47.582549238 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-06-18 17:28:17.550456611 -0300
@@ -2899,7 +2899,7 @@
 static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
-			 gfn_t gfn, bool prefault)
+			 gfn_t gfn, bool prefault, bool pin, bool *pinned)
 {
 	int r;
 	int level;
@@ -3299,7 +3299,8 @@
 }
 
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-				u32 error_code, bool prefault)
+				u32 error_code, bool prefault, bool pin,
+				bool *pinned)
 {
 	gfn_t gfn;
 	int r;
@@ -3323,7 +3324,7 @@
 	gfn = gva >> PAGE_SHIFT;
 
 	return nonpaging_map(vcpu, gva & PAGE_MASK,
-			     error_code, gfn, prefault);
+			     error_code, gfn, prefault, pin, pinned);
 }
 
 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3373,7 +3374,7 @@
 }
 
 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
-			  bool prefault)
+			  bool prefault, bool pin, bool *pinned)
 {
 	pfn_t pfn;
 	int r;
@@ -4190,7 +4191,7 @@
 	int r, emulation_type = EMULTYPE_RETRY;
 	enum emulation_result er;
 
-	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false, false, NULL);
 	if (r < 0)
 		goto out;
 
Index: kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/paging_tmpl.h	2014-06-18 17:27:47.583549234 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h	2014-06-18 17:28:17.550456611 -0300
@@ -687,7 +687,7 @@
  *           a negative value on error.
  */
 static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
-			     bool prefault)
+			     bool prefault, bool pin, bool *pinned)
 {
 	int write_fault = error_code & PFERR_WRITE_MASK;
 	int user_fault = error_code & PFERR_USER_MASK;
Index: kvm.pinned-sptes/arch/x86/kvm/x86.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/x86.c	2014-06-18 17:27:47.586549225 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/x86.c	2014-06-18 17:28:17.552456605 -0300
@@ -7415,7 +7415,7 @@
 	      work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
 		return;
 
-	vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+	vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true, false, NULL);
 }
 
 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)



^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only)
  2014-07-09 19:12 [patch 0/4] KVM: support for pinning sptes (v2) mtosatti
  2014-07-09 19:12 ` [patch 1/4] KVM: x86: add pinned parameter to page_fault methods mtosatti
@ 2014-07-09 19:12 ` mtosatti
  2014-07-17 17:18   ` Nadav Amit
  2014-07-21 21:46   ` Xiao Guangrong
  2014-07-09 19:12 ` [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path mtosatti
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 30+ messages in thread
From: mtosatti @ 2014-07-09 19:12 UTC (permalink / raw)
  To: kvm, ak; +Cc: pbonzini, xiaoguangrong, gleb, avi.kivity

[-- Attachment #1: mmu-pinned-spte --]
[-- Type: text/plain, Size: 21829 bytes --]

Allow vcpus to pin spte translations by:

1) Creating a per-vcpu list of pinned ranges.
2) On mmu reload request:
	- Fault ranges.
	- Mark sptes with a pinned bit.
	- Mark shadow pages as pinned.

3) Then modify the following actions:
	- Page age => skip spte flush.
	- MMU notifiers => force mmu reload request (which kicks cpu out of
				guest mode).
	- GET_DIRTY_LOG => force mmu reload request.
	- SLAB shrinker => skip shadow page deletion.

TDP-only.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
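A condensed sketch of the resulting flow (names from this patch; locking
and error handling trimmed):

	/* 1) a vcpu registers a pinned range; this queues KVM_REQ_MMU_RELOAD */
	kvm_mmu_register_pinned_range(vcpu, base_gfn, npages);

	/* 2) on the next kvm_mmu_load(), kvm_mmu_pin_pages() faults in every
	 *    registered gfn with pin=true, which sets SPTE_PINNED on the
	 *    sptes and sp->pinned on the shadow pages along the walk
	 */
	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link)
		for (gfn = p->base_gfn; gfn < p->base_gfn + p->npages; gfn++)
			vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
						  PFERR_WRITE_MASK, false,
						  true, &pinned);

	/* 3) paths that would otherwise drop such sptes (MMU notifiers,
	 *    unregister) call mmu_reload_pinned_vcpus() instead, kicking the
	 *    affected vcpus out of guest mode so the translations are
	 *    re-faulted and re-pinned before the next guest entry
	 */
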
 arch/x86/include/asm/kvm_host.h |   13 +
 arch/x86/kvm/mmu.c              |  294 +++++++++++++++++++++++++++++++++++++---
 arch/x86/kvm/mmu.h              |    7 
 arch/x86/kvm/mmutrace.h         |   23 +++
 arch/x86/kvm/paging_tmpl.h      |    4 
 arch/x86/kvm/x86.c              |    8 -
 include/linux/kvm_host.h        |    3 
 include/uapi/linux/kvm.h        |    2 
 virt/kvm/kvm_main.c             |   18 +-
 9 files changed, 340 insertions(+), 32 deletions(-)

Index: kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/include/asm/kvm_host.h	2014-07-09 12:05:34.836161266 -0300
+++ kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h	2014-07-09 12:08:45.341762782 -0300
@@ -97,6 +97,8 @@
 #define KVM_NR_FIXED_MTRR_REGION 88
 #define KVM_NR_VAR_MTRR 8
 
+#define KVM_MAX_PER_VCPU_PINNED_RANGE 10
+
 #define ASYNC_PF_PER_VCPU 64
 
 struct kvm_vcpu;
@@ -221,6 +223,8 @@
 	/* hold the gfn of each spte inside spt */
 	gfn_t *gfns;
 	bool unsync;
+	bool pinned;
+
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
@@ -337,6 +341,12 @@
 	KVM_DEBUGREG_WONT_EXIT = 2,
 };
 
+struct kvm_pinned_page_range {
+	gfn_t base_gfn;
+	unsigned long npages;
+	struct list_head link;
+};
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
@@ -392,6 +402,9 @@
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+	struct list_head pinned_mmu_pages;
+	atomic_t nr_pinned_ranges;
+
 	struct fpu guest_fpu;
 	u64 xcr0;
 	u64 guest_supported_xcr0;
Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 12:05:34.837161264 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 12:09:21.856684314 -0300
@@ -148,6 +148,9 @@
 
 #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+#define SPTE_PINNED		(1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT))
+
+#define SPTE_PINNED_BIT PT64_SECOND_AVAIL_BITS_SHIFT
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
@@ -327,6 +330,11 @@
 	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
 }
 
+static int is_pinned_spte(u64 spte)
+{
+	return spte & SPTE_PINNED && is_shadow_present_pte(spte);
+}
+
 static int is_large_pte(u64 pte)
 {
 	return pte & PT_PAGE_SIZE_MASK;
@@ -1176,6 +1184,16 @@
 		kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
+static bool vcpu_has_pinned(struct kvm_vcpu *vcpu)
+{
+	return atomic_read(&vcpu->arch.nr_pinned_ranges);
+}
+
+static void mmu_reload_pinned_vcpus(struct kvm *kvm)
+{
+	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD, &vcpu_has_pinned);
+}
+
 /*
  * Write-protect on the specified @sptep, @pt_protect indicates whether
  * spte write-protection is caused by protecting shadow page table.
@@ -1268,7 +1286,8 @@
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			   struct kvm_memory_slot *slot, unsigned long data)
+			   struct kvm_memory_slot *slot, unsigned long data,
+			   bool age)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1278,6 +1297,14 @@
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
 
+		if (is_pinned_spte(*sptep)) {
+			/* don't nuke pinned sptes if page aging: return
+ 			 * young=yes instead.
+ 			 */
+			if (age)
+				return 1;
+			mmu_reload_pinned_vcpus(kvm);
+		}
 		drop_spte(kvm, sptep);
 		need_tlb_flush = 1;
 	}
@@ -1286,7 +1313,8 @@
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			     struct kvm_memory_slot *slot, unsigned long data)
+			     struct kvm_memory_slot *slot, unsigned long data,
+			     bool age)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1304,6 +1332,9 @@
 
 		need_flush = 1;
 
+		if (is_pinned_spte(*sptep))
+			mmu_reload_pinned_vcpus(kvm);
+
 		if (pte_write(*ptep)) {
 			drop_spte(kvm, sptep);
 			sptep = rmap_get_first(*rmapp, &iter);
@@ -1334,7 +1365,8 @@
 				int (*handler)(struct kvm *kvm,
 					       unsigned long *rmapp,
 					       struct kvm_memory_slot *slot,
-					       unsigned long data))
+					       unsigned long data,
+					       bool age))
 {
 	int j;
 	int ret = 0;
@@ -1374,7 +1406,7 @@
 			rmapp = __gfn_to_rmap(gfn_start, j, memslot);
 
 			for (; idx <= idx_end; ++idx)
-				ret |= handler(kvm, rmapp++, memslot, data);
+				ret |= handler(kvm, rmapp++, memslot, data, false);
 		}
 	}
 
@@ -1385,7 +1417,8 @@
 			  unsigned long data,
 			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 					 struct kvm_memory_slot *slot,
-					 unsigned long data))
+					 unsigned long data,
+					 bool age))
 {
 	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
 }
@@ -1406,7 +1439,8 @@
 }
 
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			 struct kvm_memory_slot *slot, unsigned long data)
+			 struct kvm_memory_slot *slot, unsigned long data,
+			 bool age)
 {
 	u64 *sptep;
 	struct rmap_iterator uninitialized_var(iter);
@@ -1421,7 +1455,7 @@
 	 * out actively used pages or breaking up actively used hugepages.
 	 */
 	if (!shadow_accessed_mask) {
-		young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
+		young = kvm_unmap_rmapp(kvm, rmapp, slot, data, true);
 		goto out;
 	}
 
@@ -1442,7 +1476,8 @@
 }
 
 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			      struct kvm_memory_slot *slot, unsigned long data)
+			      struct kvm_memory_slot *slot, unsigned long data,
+			      bool age)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1480,7 +1515,7 @@
 
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
-	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
+	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0, false);
 	kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
@@ -2753,7 +2788,8 @@
 }
 
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
-				pfn_t pfn, unsigned access, int *ret_val)
+				pfn_t pfn, unsigned access, int *ret_val,
+				bool pin)
 {
 	bool ret = true;
 
@@ -2763,8 +2799,14 @@
 		goto exit;
 	}
 
-	if (unlikely(is_noslot_pfn(pfn)))
+	if (unlikely(is_noslot_pfn(pfn))) {
+		/* pinned sptes must point to RAM */
+		if (unlikely(pin)) {
+			*ret_val = -EFAULT;
+			goto exit;
+		}
 		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+	}
 
 	ret = false;
 exit:
@@ -2818,7 +2860,7 @@
  * - false: let the real page fault path to fix it.
  */
 static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
-			    u32 error_code)
+			    u32 error_code, bool pin)
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
@@ -2828,6 +2870,9 @@
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return false;
 
+	if (pin)
+		return false;
+
 	if (!page_fault_can_be_fast(error_code))
 		return false;
 
@@ -2895,9 +2940,71 @@
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
+			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
+			 bool pin);
 static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
 
+
+static int get_sptep_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes[4])
+
+{
+	struct kvm_shadow_walk_iterator iterator;
+	int nr_sptes = 0;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return nr_sptes;
+
+	for_each_shadow_entry(vcpu, addr, iterator) {
+		sptes[iterator.level-1] = iterator.sptep;
+		nr_sptes++;
+		if (!is_shadow_present_pte(*iterator.sptep))
+			break;
+	}
+
+	return nr_sptes;
+}
+
+static bool __direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, bool pin)
+{
+	u64 *sptes[4];
+	int r, i, level;
+
+	r = get_sptep_hierarchy(vcpu, gfn << PAGE_SHIFT, sptes);
+	if (!r)
+		return false;
+
+	level = 5 - r;
+	if (!is_last_spte(*sptes[level-1], level))
+		return false;
+	if (!is_shadow_present_pte(*sptes[level-1]))
+		return false;
+
+	for (i = 0; i < r; i++) {
+		u64 *sptep = sptes[3-i];
+		struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+		if (pin) {
+			sp->pinned = true;
+			set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
+		} else {
+			sp->pinned = false;
+			clear_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
+		}
+	}
+
+	return true;
+}
+
+static bool direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return __direct_pin_sptes(vcpu, gfn, true);
+}
+
+static bool direct_unpin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return __direct_pin_sptes(vcpu, gfn, false);
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 			 gfn_t gfn, bool prefault, bool pin, bool *pinned)
 {
@@ -2923,16 +3030,17 @@
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
 
-	if (fast_page_fault(vcpu, v, level, error_code))
+	if (fast_page_fault(vcpu, v, level, error_code, pin))
 		return 0;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable,
+			 pin))
 		return 0;
 
-	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r, pin))
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
@@ -2943,6 +3051,8 @@
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
 			 prefault);
+	if (pin)
+		*pinned = direct_pin_sptes(vcpu, gfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -3131,7 +3241,7 @@
 
 			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
 			if (lm_root == NULL)
-				return 1;
+				return -ENOMEM;
 
 			lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
 
@@ -3349,7 +3459,8 @@
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-			 gva_t gva, pfn_t *pfn, bool write, bool *writable)
+			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
+			 bool pin)
 {
 	bool async;
 
@@ -3358,7 +3469,7 @@
 	if (!async)
 		return false; /* *pfn has correct page already */
 
-	if (!prefault && can_do_async_pf(vcpu)) {
+	if (!prefault && !pin && can_do_async_pf(vcpu)) {
 		trace_kvm_try_async_get_page(gva, gfn);
 		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
 			trace_kvm_async_pf_doublefault(gva, gfn);
@@ -3406,16 +3517,17 @@
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
 
-	if (fast_page_fault(vcpu, gpa, level, error_code))
+	if (fast_page_fault(vcpu, gpa, level, error_code, pin))
 		return 0;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable,
+			 pin))
 		return 0;
 
-	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r, pin))
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
@@ -3426,6 +3538,8 @@
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, gpa, write, map_writable,
 			 level, gfn, pfn, prefault);
+	if (pin)
+		*pinned = direct_pin_sptes(vcpu, gfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
@@ -3903,6 +4017,141 @@
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 
+int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
+				  gfn_t base_gfn, unsigned long npages)
+{
+	struct kvm_pinned_page_range *p;
+
+	if (!tdp_enabled) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+		if (p->base_gfn == base_gfn && p->npages == npages) {
+			return -EEXIST;
+		}
+	}
+
+	if (atomic_read(&vcpu->arch.nr_pinned_ranges) >=
+	    KVM_MAX_PER_VCPU_PINNED_RANGE)
+		return -ENOSPC;
+
+	p = kzalloc(sizeof(struct kvm_pinned_page_range), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	atomic_inc(&vcpu->arch.nr_pinned_ranges);
+
+	trace_kvm_mmu_register_pinned_range(vcpu->vcpu_id, base_gfn, npages);
+
+	INIT_LIST_HEAD(&p->link);
+	p->base_gfn = base_gfn;
+	p->npages = npages;
+	list_add(&p->link, &vcpu->arch.pinned_mmu_pages);
+	kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+
+	return 0;
+}
+
+
+void unregister_pinned_sptes(struct kvm_vcpu *vcpu, unsigned long base_gfn,
+			     unsigned long npages)
+{
+	gfn_t gfn;
+
+	for (gfn = base_gfn; gfn < base_gfn+npages; gfn++)
+		direct_unpin_sptes(vcpu, gfn);
+
+}
+
+int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
+				    gfn_t base_gfn, unsigned long npages)
+{
+	struct kvm_pinned_page_range *p;
+
+	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+		if (p->base_gfn == base_gfn && p->npages == npages) {
+			list_del(&p->link);
+			atomic_dec(&vcpu->arch.nr_pinned_ranges);
+			spin_lock(&vcpu->kvm->mmu_lock);
+			mmu_reload_pinned_vcpus(vcpu->kvm);
+			unregister_pinned_sptes(vcpu, base_gfn, npages);
+			spin_unlock(&vcpu->kvm->mmu_lock);
+			kfree(p);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pinned_page_range *p, *p2;
+
+	list_for_each_entry_safe(p, p2, &vcpu->arch.pinned_mmu_pages, link) {
+		list_del(&p->link);
+		kfree(p);
+	}
+}
+
+/*
+ * Pin KVM MMU page translations. This guarantees, for valid
+ * addresses registered by kvm_mmu_register_pinned_range (valid address
+ * meaning an address which possesses sufficient information for the fault to
+ * be resolved), valid translations exist while in guest mode and
+ * therefore no VM-exits due to faults will occur.
+ *
+ * Failure to instantiate pages will abort guest entry.
+ *
+ * Pinning is not guaranteed while executing as L2 guest.
+ *
+ */
+
+static int kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pinned_page_range *p;
+	int r = 1;
+
+	if (is_guest_mode(vcpu))
+		return r;
+
+	if (!vcpu->arch.mmu.direct_map)
+		return r;
+
+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+		gfn_t gfn_offset;
+
+		for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
+			gfn_t gfn = p->base_gfn + gfn_offset;
+			int r;
+			bool pinned = false;
+
+			r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
+						     PFERR_WRITE_MASK, false,
+						     true, &pinned);
+			/* MMU notifier sequence window: retry */
+			if (!r && !pinned)
+				kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+			if (r) {
+				vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+				vcpu->run->internal.suberror =
+					KVM_INTERNAL_ERROR_PIN_FAILURE;
+				vcpu->run->internal.ndata = 1;
+				vcpu->run->internal.data[0] = gfn;
+				r = 0;
+				goto out;
+			}
+
+		}
+	}
+out:
+	return r;
+}
+
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -3916,6 +4165,7 @@
 		goto out;
 	/* set_cr3() should ensure TLB has been flushed */
 	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+	r = kvm_mmu_pin_pages(vcpu);
 out:
 	return r;
 }
Index: kvm.pinned-sptes/arch/x86/kvm/mmu.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.h	2014-07-09 12:05:30.018171068 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.h	2014-07-09 12:08:45.343762778 -0300
@@ -94,7 +94,7 @@
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
 	if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
-		return 0;
+		return 1;
 
 	return kvm_mmu_load(vcpu);
 }
@@ -178,4 +178,9 @@
 }
 
 void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
+				 gfn_t base_gfn, unsigned long npages);
+int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
+				 gfn_t base_gfn, unsigned long npages);
+void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu);
 #endif
Index: kvm.pinned-sptes/arch/x86/kvm/x86.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/x86.c	2014-07-09 12:05:34.838161262 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/x86.c	2014-07-09 12:08:45.346762771 -0300
@@ -6017,7 +6017,7 @@
 	}
 
 	r = kvm_mmu_reload(vcpu);
-	if (unlikely(r)) {
+	if (unlikely(r <= 0)) {
 		goto cancel_injection;
 	}
 
@@ -7049,6 +7049,8 @@
 
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
+	INIT_LIST_HEAD(&vcpu->arch.pinned_mmu_pages);
+	atomic_set(&vcpu->arch.nr_pinned_ranges, 0);
 
 	return 0;
 fail_free_wbinvd_dirty_mask:
@@ -7069,6 +7071,7 @@
 {
 	int idx;
 
+	kvm_mmu_free_pinned_ranges(vcpu);
 	kvm_pmu_destroy(vcpu);
 	kfree(vcpu->arch.mce_banks);
 	kvm_free_lapic(vcpu);
@@ -7113,6 +7116,7 @@
 	int r;
 	r = vcpu_load(vcpu);
 	BUG_ON(r);
+	kvm_mmu_free_pinned_ranges(vcpu);
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 }
@@ -7408,7 +7412,7 @@
 		return;
 
 	r = kvm_mmu_reload(vcpu);
-	if (unlikely(r))
+	if (unlikely(r <= 0))
 		return;
 
 	if (!vcpu->arch.mmu.direct_map &&
Index: kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/paging_tmpl.h	2014-07-09 12:05:34.837161264 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h	2014-07-09 12:08:45.346762771 -0300
@@ -747,11 +747,11 @@
 	smp_rmb();
 
 	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
-			 &map_writable))
+			 &map_writable, false))
 		return 0;
 
 	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
-				walker.gfn, pfn, walker.pte_access, &r))
+				walker.gfn, pfn, walker.pte_access, &r, false))
 		return r;
 
 	/*
Index: kvm.pinned-sptes/arch/x86/kvm/mmutrace.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmutrace.h	2014-07-09 12:05:30.018171068 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmutrace.h	2014-07-09 12:08:45.347762769 -0300
@@ -322,6 +322,29 @@
 		  __entry->kvm_gen == __entry->spte_gen
 	)
 );
+
+TRACE_EVENT(
+	kvm_mmu_register_pinned_range,
+	TP_PROTO(unsigned int vcpu_id, gfn_t gfn, unsigned long npages),
+	TP_ARGS(vcpu_id, gfn, npages),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	vcpu_id	)
+		__field(	gfn_t,		gfn	)
+		__field(	unsigned long,	npages	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id		= vcpu_id;
+		__entry->gfn			= gfn;
+		__entry->npages			= npages;
+	),
+
+	TP_printk("vcpu_id %u gfn %llx npages %lx",
+		  __entry->vcpu_id,
+		  __entry->gfn,
+		  __entry->npages)
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
Index: kvm.pinned-sptes/include/uapi/linux/kvm.h
===================================================================
--- kvm.pinned-sptes.orig/include/uapi/linux/kvm.h	2014-07-09 12:05:30.019171066 -0300
+++ kvm.pinned-sptes/include/uapi/linux/kvm.h	2014-07-09 12:08:45.347762769 -0300
@@ -180,6 +180,8 @@
 #define KVM_INTERNAL_ERROR_SIMUL_EX	2
 /* Encounter unexpected vm-exit due to delivery event. */
 #define KVM_INTERNAL_ERROR_DELIVERY_EV	3
+/* Failure to pin address translation. */
+#define KVM_INTERNAL_ERROR_PIN_FAILURE	4
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
Index: kvm.pinned-sptes/include/linux/kvm_host.h
===================================================================
--- kvm.pinned-sptes.orig/include/linux/kvm_host.h	2014-07-09 12:05:30.019171066 -0300
+++ kvm.pinned-sptes/include/linux/kvm_host.h	2014-07-09 12:08:45.348762767 -0300
@@ -591,6 +591,9 @@
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 
+bool make_all_cpus_request(struct kvm *kvm, unsigned int req,
+			   bool (*vcpukick)(struct kvm_vcpu *));
+
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 void kvm_make_mclock_inprogress_request(struct kvm *kvm);
Index: kvm.pinned-sptes/virt/kvm/kvm_main.c
===================================================================
--- kvm.pinned-sptes.orig/virt/kvm/kvm_main.c	2014-07-09 12:05:30.019171066 -0300
+++ kvm.pinned-sptes/virt/kvm/kvm_main.c	2014-07-09 12:08:45.349762765 -0300
@@ -152,7 +152,8 @@
 {
 }
 
-static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
+bool make_all_cpus_request(struct kvm *kvm, unsigned int req,
+			   bool (*vcpukick)(struct kvm_vcpu *))
 {
 	int i, cpu, me;
 	cpumask_var_t cpus;
@@ -163,6 +164,8 @@
 
 	me = get_cpu();
 	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpukick && !vcpukick(vcpu))
+			continue;
 		kvm_make_request(req, vcpu);
 		cpu = vcpu->cpu;
 
@@ -189,7 +192,7 @@
 	long dirty_count = kvm->tlbs_dirty;
 
 	smp_mb();
-	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH, NULL))
 		++kvm->stat.remote_tlb_flush;
 	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
@@ -197,17 +200,22 @@
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
-	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD, NULL);
+}
+
+void kvm_reload_pinned_remote_mmus(struct kvm *kvm)
+{
+	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD, NULL);
 }
 
 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
 {
-	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS, NULL);
 }
 
 void kvm_make_scan_ioapic_request(struct kvm *kvm)
 {
-	make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+	make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC, NULL);
 }
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)



^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-07-09 19:12 [patch 0/4] KVM: support for pinning sptes (v2) mtosatti
  2014-07-09 19:12 ` [patch 1/4] KVM: x86: add pinned parameter to page_fault methods mtosatti
  2014-07-09 19:12 ` [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only) mtosatti
@ 2014-07-09 19:12 ` mtosatti
  2014-07-21 13:14   ` Gleb Natapov
  2014-07-21 21:55   ` Xiao Guangrong
  2014-07-09 19:12 ` [patch 4/4] KVM: MMU: pinned sps are not candidates for deletion mtosatti
  2014-07-09 19:20 ` [patch 0/4] KVM: support for pinning sptes (v2) Marcelo Tosatti
  4 siblings, 2 replies; 30+ messages in thread
From: mtosatti @ 2014-07-09 19:12 UTC (permalink / raw)
  To: kvm, ak; +Cc: pbonzini, xiaoguangrong, gleb, avi.kivity

[-- Attachment #1: mmu-get-dirty-log-pinned --]
[-- Type: text/plain, Size: 2733 bytes --]

Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
deleting a pinned spte.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
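In short, spte_write_protect() gains a skip_pinned argument selecting
between two behaviors for pinned sptes (a condensed restatement of the
first hunk below):

	if (is_pinned_spte(spte)) {
		if (skip_pinned) {
			/* GET_DIRTY_LOG: leave the pinned spte writable and
			 * simply report its gfn as dirty again
			 */
			mark_page_dirty(kvm, gfn);
			return false;
		}
		/* the spte really is about to be write-protected: force
		 * vcpus with pinned ranges to reload and re-pin
		 */
		mmu_reload_pinned_vcpus(kvm);
	}
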
 arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
@@ -1208,7 +1208,8 @@
  *
  * Return true if tlb need be flushed.
  */
-static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
+			       bool skip_pinned)
 {
 	u64 spte = *sptep;
 
@@ -1218,6 +1219,22 @@
 
 	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
 
+	if (is_pinned_spte(spte)) {
+		/* keep pinned spte intact, mark page dirty again */
+		if (skip_pinned) {
+			struct kvm_mmu_page *sp;
+			gfn_t gfn;
+
+			sp = page_header(__pa(sptep));
+			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+			mark_page_dirty(kvm, gfn);
+			return false;
+		} else
+			mmu_reload_pinned_vcpus(kvm);
+	}
+
+
 	if (pt_protect)
 		spte &= ~SPTE_MMU_WRITEABLE;
 	spte = spte & ~PT_WRITABLE_MASK;
@@ -1226,7 +1243,7 @@
 }
 
 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
-				 bool pt_protect)
+				 bool pt_protect, bool skip_pinned)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1235,7 +1252,7 @@
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
 
-		flush |= spte_write_protect(kvm, sptep, pt_protect);
+		flush |= spte_write_protect(kvm, sptep, pt_protect, skip_pinned);
 		sptep = rmap_get_next(&iter);
 	}
 
@@ -1261,7 +1278,7 @@
 	while (mask) {
 		rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
 				      PT_PAGE_TABLE_LEVEL, slot);
-		__rmap_write_protect(kvm, rmapp, false);
+		__rmap_write_protect(kvm, rmapp, false, true);
 
 		/* clear the first set bit */
 		mask &= mask - 1;
@@ -1280,7 +1297,7 @@
 	for (i = PT_PAGE_TABLE_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		rmapp = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmapp, true);
+		write_protected |= __rmap_write_protect(kvm, rmapp, true, false);
 	}
 
 	return write_protected;
@@ -4565,7 +4582,7 @@
 
 		for (index = 0; index <= last_index; ++index, ++rmapp) {
 			if (*rmapp)
-				__rmap_write_protect(kvm, rmapp, false);
+				__rmap_write_protect(kvm, rmapp, false, false);
 
 			if (need_resched() || spin_needbreak(&kvm->mmu_lock))
 				cond_resched_lock(&kvm->mmu_lock);



^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch 4/4] KVM: MMU: pinned sps are not candidates for deletion.
  2014-07-09 19:12 [patch 0/4] KVM: support for pinning sptes (v2) mtosatti
                   ` (2 preceding siblings ...)
  2014-07-09 19:12 ` [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path mtosatti
@ 2014-07-09 19:12 ` mtosatti
  2014-07-21 21:59   ` Xiao Guangrong
  2014-07-09 19:20 ` [patch 0/4] KVM: support for pinning sptes (v2) Marcelo Tosatti
  4 siblings, 1 reply; 30+ messages in thread
From: mtosatti @ 2014-07-09 19:12 UTC (permalink / raw)
  To: kvm, ak; +Cc: pbonzini, xiaoguangrong, gleb, avi.kivity

[-- Attachment #1: mmu-pinned-spte-skip --]
[-- Type: text/plain, Size: 1732 bytes --]

Skip pinned shadow pages when selecting pages to zap.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 arch/x86/kvm/mmu.c |   26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 12:09:26.433674438 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 12:09:27.164672860 -0300
@@ -2267,16 +2267,24 @@
 static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
 					struct list_head *invalid_list)
 {
-	struct kvm_mmu_page *sp;
-
-	if (list_empty(&kvm->arch.active_mmu_pages))
-		return false;
+	struct kvm_mmu_page *sp, *nsp;
+	LIST_HEAD(pinned_list);
 
-	sp = list_entry(kvm->arch.active_mmu_pages.prev,
-			struct kvm_mmu_page, link);
-	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+	list_for_each_entry_safe_reverse(sp, nsp,
+					 &kvm->arch.active_mmu_pages, link) {
+		if (sp->pinned) {
+			list_move(&sp->link, &pinned_list);
+			continue;
+		}
+		if (!list_empty(&pinned_list))
+			list_move(&pinned_list, &kvm->arch.active_mmu_pages);
+		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+		return true;
+	}
 
-	return true;
+	if (!list_empty(&pinned_list))
+		list_move(&pinned_list, &kvm->arch.active_mmu_pages);
+	return false;
 }
 
 /*
@@ -4679,6 +4687,8 @@
 	 * Notify all vcpus to reload its shadow page table
 	 * and flush TLB. Then all vcpus will switch to new
 	 * shadow page table with the new mmu_valid_gen.
+	 * MMU reload request also forces fault of
+	 * sptes for pinned ranges.
 	 *
 	 * Note: we should do this under the protection of
 	 * mmu-lock, otherwise, vcpu would purge shadow page



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 0/4] KVM: support for pinning sptes (v2)
  2014-07-09 19:12 [patch 0/4] KVM: support for pinning sptes (v2) mtosatti
                   ` (3 preceding siblings ...)
  2014-07-09 19:12 ` [patch 4/4] KVM: MMU: pinned sps are not candidates for deletion mtosatti
@ 2014-07-09 19:20 ` Marcelo Tosatti
  4 siblings, 0 replies; 30+ messages in thread
From: Marcelo Tosatti @ 2014-07-09 19:20 UTC (permalink / raw)
  To: kvm, ak; +Cc: pbonzini, xiaoguangrong, gleb, avi.kivity

On Wed, Jul 09, 2014 at 04:12:50PM -0300, mtosatti@redhat.com wrote:
> Required by PEBS support as discussed at
> 
> Subject: [patch 0/4] [patch 0/5] Implement PEBS virtualization for Silvermont
> Message-Id: <1401412327-14810-1-git-send-email-andi@firstfloor.org>
> 
> Thread.
> 
> ----------------------
> 
> v2:
> 	- unify remote kick function	     (Gleb)
> 	- keep sptes intact on get_dirty_log (Gleb)
> 	- add new internal error	     (Gleb)
> 	- unpin sptes/shadow pages	     (Avi)
> 	- fix spte walker level		     (Nadav)
> 	- do not require get_page

Note: accounting into mlock is not necessary, as this
patchset allows swapping (mlock refers to the ability
to swap out, not to the lifetime of a page in swap).

Please review, thanks for previous comments.


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only)
  2014-07-09 19:12 ` [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only) mtosatti
@ 2014-07-17 17:18   ` Nadav Amit
  2014-07-17 21:38     ` Marcelo Tosatti
  2014-07-21 21:46   ` Xiao Guangrong
  1 sibling, 1 reply; 30+ messages in thread
From: Nadav Amit @ 2014-07-17 17:18 UTC (permalink / raw)
  To: mtosatti, kvm, ak; +Cc: pbonzini, xiaoguangrong, gleb, avi.kivity

Small question if I may regarding kvm_mmu_pin_pages:

On 7/9/14, 10:12 PM, mtosatti@redhat.com wrote:
> +
> +static int kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pinned_page_range *p;
> +	int r = 1;
> +
> +	if (is_guest_mode(vcpu))
> +		return r;
> +
> +	if (!vcpu->arch.mmu.direct_map)
> +		return r;
> +
> +	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
> +
> +	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> +		gfn_t gfn_offset;
> +
> +		for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
> +			gfn_t gfn = p->base_gfn + gfn_offset;
> +			int r;
> +			bool pinned = false;
> +
> +			r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
> +						     PFERR_WRITE_MASK, false,
> +						     true, &pinned);

I understand that the current use-case is for pinning only a few pages.
Yet, wouldn't it be better (for performance) to check whether the gfn
uses a large page and, if so, skip forward, increasing gfn_offset to
point to the next large page?
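
Something along these lines, perhaps (an untested sketch;
installed_spte_level() is a hypothetical helper that would return the
level of the just-installed translation):

	for (gfn_offset = 0; gfn_offset < p->npages; ) {
		gfn_t gfn = p->base_gfn + gfn_offset;
		bool pinned = false;
		int level, r;

		r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
					      PFERR_WRITE_MASK, false,
					      true, &pinned);
		/* ... error/retry handling as in the patch ... */

		/* if a large page backs this gfn, jump to the next
		 * large-page boundary instead of faulting every 4K frame */
		level = installed_spte_level(vcpu, gfn);	/* hypothetical */
		gfn_offset += KVM_PAGES_PER_HPAGE(level) -
			      (gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
	}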

Thanks,
Nadav

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only)
  2014-07-17 17:18   ` Nadav Amit
@ 2014-07-17 21:38     ` Marcelo Tosatti
  2014-07-24 12:16       ` Nadav Amit
  0 siblings, 1 reply; 30+ messages in thread
From: Marcelo Tosatti @ 2014-07-17 21:38 UTC (permalink / raw)
  To: Nadav Amit; +Cc: kvm, ak, pbonzini, xiaoguangrong, gleb, avi.kivity

On Thu, Jul 17, 2014 at 08:18:03PM +0300, Nadav Amit wrote:
> Small question if I may regarding kvm_mmu_pin_pages:
> 
> On 7/9/14, 10:12 PM, mtosatti@redhat.com wrote:
> >+
> >+static int kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
> >+{
> >+	struct kvm_pinned_page_range *p;
> >+	int r = 1;
> >+
> >+	if (is_guest_mode(vcpu))
> >+		return r;
> >+
> >+	if (!vcpu->arch.mmu.direct_map)
> >+		return r;
> >+
> >+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
> >+
> >+	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> >+		gfn_t gfn_offset;
> >+
> >+		for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
> >+			gfn_t gfn = p->base_gfn + gfn_offset;
> >+			int r;
> >+			bool pinned = false;
> >+
> >+			r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
> >+						     PFERR_WRITE_MASK, false,
> >+						     true, &pinned);
> 
> I understand that the current use-case is for pinning only a few
> pages. Yet, wouldn't it be better (for performance) to check whether
> the gfn uses a large page and, if so, skip forward, increasing
> gfn_offset to point to the next large page?

Sure, that can be a lazy optimization and performed when necessary?

(feel free to do it in advance if you're interested in doing it 
now).


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-07-09 19:12 ` [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path mtosatti
@ 2014-07-21 13:14   ` Gleb Natapov
  2014-09-09 15:28     ` Marcelo Tosatti
  2014-07-21 21:55   ` Xiao Guangrong
  1 sibling, 1 reply; 30+ messages in thread
From: Gleb Natapov @ 2014-07-21 13:14 UTC (permalink / raw)
  To: mtosatti; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Wed, Jul 09, 2014 at 04:12:53PM -0300, mtosatti@redhat.com wrote:
> Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> deleting a pinned spte.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
>  arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
>  1 file changed, 23 insertions(+), 6 deletions(-)
> 
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> @@ -1208,7 +1208,8 @@
>   *
>   * Return true if tlb need be flushed.
>   */
> -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> +			       bool skip_pinned)
>  {
>  	u64 spte = *sptep;
>  
> @@ -1218,6 +1219,22 @@
>  
>  	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
>  
> +	if (is_pinned_spte(spte)) {
> +		/* keep pinned spte intact, mark page dirty again */
> +		if (skip_pinned) {
> +			struct kvm_mmu_page *sp;
> +			gfn_t gfn;
> +
> +			sp = page_header(__pa(sptep));
> +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +
> +			mark_page_dirty(kvm, gfn);
> +			return false;
Why not mark all pinned gfns as dirty in kvm_vm_ioctl_get_dirty_log() while
populating dirty_bitmap_buffer?

> +		} else
> +			mmu_reload_pinned_vcpus(kvm);
Can you explain why you need this?

> +	}
> +
> +
>  	if (pt_protect)
>  		spte &= ~SPTE_MMU_WRITEABLE;
>  	spte = spte & ~PT_WRITABLE_MASK;
> @@ -1226,7 +1243,7 @@
>  }
>  
>  static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
> -				 bool pt_protect)
> +				 bool pt_protect, bool skip_pinned)
>  {
>  	u64 *sptep;
>  	struct rmap_iterator iter;
> @@ -1235,7 +1252,7 @@
>  	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
>  		BUG_ON(!(*sptep & PT_PRESENT_MASK));
>  
> -		flush |= spte_write_protect(kvm, sptep, pt_protect);
> +		flush |= spte_write_protect(kvm, sptep, pt_protect, skip_pinned);
>  		sptep = rmap_get_next(&iter);
>  	}
>  
> @@ -1261,7 +1278,7 @@
>  	while (mask) {
>  		rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
>  				      PT_PAGE_TABLE_LEVEL, slot);
> -		__rmap_write_protect(kvm, rmapp, false);
> +		__rmap_write_protect(kvm, rmapp, false, true);
>  
>  		/* clear the first set bit */
>  		mask &= mask - 1;
> @@ -1280,7 +1297,7 @@
>  	for (i = PT_PAGE_TABLE_LEVEL;
>  	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
>  		rmapp = __gfn_to_rmap(gfn, i, slot);
> -		write_protected |= __rmap_write_protect(kvm, rmapp, true);
> +		write_protected |= __rmap_write_protect(kvm, rmapp, true, false);
>  	}
>  
>  	return write_protected;
> @@ -4565,7 +4582,7 @@
>  
>  		for (index = 0; index <= last_index; ++index, ++rmapp) {
>  			if (*rmapp)
> -				__rmap_write_protect(kvm, rmapp, false);
> +				__rmap_write_protect(kvm, rmapp, false, false);
>  
>  			if (need_resched() || spin_needbreak(&kvm->mmu_lock))
>  				cond_resched_lock(&kvm->mmu_lock);
> 
> 

--
			Gleb.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only)
  2014-07-09 19:12 ` [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only) mtosatti
  2014-07-17 17:18   ` Nadav Amit
@ 2014-07-21 21:46   ` Xiao Guangrong
  2014-07-22  5:26     ` Xiao Guangrong
  1 sibling, 1 reply; 30+ messages in thread
From: Xiao Guangrong @ 2014-07-21 21:46 UTC (permalink / raw)
  To: mtosatti; +Cc: kvm, ak, pbonzini, gleb, avi.kivity



Hi Marcelo,

On Jul 10, 2014, at 3:12 AM, mtosatti@redhat.com wrote:

> struct kvm_vcpu_arch {
> 	/*
> 	 * rip and regs accesses must go through
> @@ -392,6 +402,9 @@
> 	struct kvm_mmu_memory_cache mmu_page_cache;
> 	struct kvm_mmu_memory_cache mmu_page_header_cache;
> 
> +	struct list_head pinned_mmu_pages;
> +	atomic_t nr_pinned_ranges;
> +

I’m not sure a per-vcpu pinned list is a good idea: since currently all vcpus are using
the same page table, a per-vcpu list can not reduce lock contention. Why not make it
global to the vm instead?

> 	struct fpu guest_fpu;
> 	u64 xcr0;
> 	u64 guest_supported_xcr0;
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 12:05:34.837161264 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 12:09:21.856684314 -0300
> @@ -148,6 +148,9 @@
> 
> #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> +#define SPTE_PINNED		(1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT))
> +
> +#define SPTE_PINNED_BIT PT64_SECOND_AVAIL_BITS_SHIFT
> 
> #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> 
> @@ -327,6 +330,11 @@
> 	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
> }
> 
> +static int is_pinned_spte(u64 spte)
> +{
> +	return spte & SPTE_PINNED && is_shadow_present_pte(spte);
> +}
> +
> static int is_large_pte(u64 pte)
> {
> 	return pte & PT_PAGE_SIZE_MASK;
> @@ -1176,6 +1184,16 @@
> 		kvm_flush_remote_tlbs(vcpu->kvm);
> }
> 
> +static bool vcpu_has_pinned(struct kvm_vcpu *vcpu)
> +{
> +	return atomic_read(&vcpu->arch.nr_pinned_ranges);
> +}
> +
> +static void mmu_reload_pinned_vcpus(struct kvm *kvm)
> +{
> +	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD, &vcpu_has_pinned);
> +}
> +
> /*
> * Write-protect on the specified @sptep, @pt_protect indicates whether
> * spte write-protection is caused by protecting shadow page table.
> @@ -1268,7 +1286,8 @@
> }
> 
> static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			   struct kvm_memory_slot *slot, unsigned long data)
> +			   struct kvm_memory_slot *slot, unsigned long data,
> +			   bool age)
> {
> 	u64 *sptep;
> 	struct rmap_iterator iter;
> @@ -1278,6 +1297,14 @@
> 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
> 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
> 
> +		if (is_pinned_spte(*sptep)) {
> +			/* don't nuke pinned sptes if page aging: return
> + 			 * young=yes instead.
> + 			 */
> +			if (age)
> +				return 1;
> +			mmu_reload_pinned_vcpus(kvm);
> +		}
> 		drop_spte(kvm, sptep);

This has a window between zapping the spte and re-pinning it, so the guest will fail
during that window.

> 		need_tlb_flush = 1;
> 	}
> @@ -1286,7 +1313,8 @@
> }
> 
> static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			     struct kvm_memory_slot *slot, unsigned long data)
> +			     struct kvm_memory_slot *slot, unsigned long data,
> +			     bool age)
> {
> 	u64 *sptep;
> 	struct rmap_iterator iter;
> @@ -1304,6 +1332,9 @@
> 
> 		need_flush = 1;
> 
> +		if (is_pinned_spte(*sptep))
> +			mmu_reload_pinned_vcpus(kvm);
> +
> 		if (pte_write(*ptep)) {
> 			drop_spte(kvm, sptep);
> 			sptep = rmap_get_first(*rmapp, &iter);
> @@ -1334,7 +1365,8 @@
> 				int (*handler)(struct kvm *kvm,
> 					       unsigned long *rmapp,
> 					       struct kvm_memory_slot *slot,
> -					       unsigned long data))
> +					       unsigned long data,
> +					       bool age))
> {
> 	int j;
> 	int ret = 0;
> @@ -1374,7 +1406,7 @@
> 			rmapp = __gfn_to_rmap(gfn_start, j, memslot);
> 
> 			for (; idx <= idx_end; ++idx)
> -				ret |= handler(kvm, rmapp++, memslot, data);
> +				ret |= handler(kvm, rmapp++, memslot, data, false);
> 		}
> 	}
> 
> @@ -1385,7 +1417,8 @@
> 			  unsigned long data,
> 			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
> 					 struct kvm_memory_slot *slot,
> -					 unsigned long data))
> +					 unsigned long data,
> +					 bool age))
> {
> 	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
> }
> @@ -1406,7 +1439,8 @@
> }
> 
> static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			 struct kvm_memory_slot *slot, unsigned long data)
> +			 struct kvm_memory_slot *slot, unsigned long data,
> +			 bool age)
> {
> 	u64 *sptep;
> 	struct rmap_iterator uninitialized_var(iter);
> @@ -1421,7 +1455,7 @@
> 	 * out actively used pages or breaking up actively used hugepages.
> 	 */
> 	if (!shadow_accessed_mask) {
> -		young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
> +		young = kvm_unmap_rmapp(kvm, rmapp, slot, data, true);
> 		goto out;
> 	}
> 
> @@ -1442,7 +1476,8 @@
> }
> 
> static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			      struct kvm_memory_slot *slot, unsigned long data)
> +			      struct kvm_memory_slot *slot, unsigned long data,
> +			      bool age)
> {
> 	u64 *sptep;
> 	struct rmap_iterator iter;
> @@ -1480,7 +1515,7 @@
> 
> 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
> 
> -	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
> +	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0, false);
> 	kvm_flush_remote_tlbs(vcpu->kvm);
> }
> 
> @@ -2753,7 +2788,8 @@
> }
> 
> static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
> -				pfn_t pfn, unsigned access, int *ret_val)
> +				pfn_t pfn, unsigned access, int *ret_val,
> +				bool pin)
> {
> 	bool ret = true;
> 
> @@ -2763,8 +2799,14 @@
> 		goto exit;
> 	}
> 
> -	if (unlikely(is_noslot_pfn(pfn)))
> +	if (unlikely(is_noslot_pfn(pfn))) {
> +		/* pinned sptes must point to RAM */
> +		if (unlikely(pin)) {
> +			*ret_val = -EFAULT;
> +			goto exit;
> +		}
> 		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
> +	}
> 
> 	ret = false;
> exit:
> @@ -2818,7 +2860,7 @@
> * - false: let the real page fault path to fix it.
> */
> static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> -			    u32 error_code)
> +			    u32 error_code, bool pin)
> {
> 	struct kvm_shadow_walk_iterator iterator;
> 	struct kvm_mmu_page *sp;
> @@ -2828,6 +2870,9 @@
> 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> 		return false;
> 
> +	if (pin)
> +		return false;
> +
> 	if (!page_fault_can_be_fast(error_code))
> 		return false;
> 
> @@ -2895,9 +2940,71 @@
> }
> 
> static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
> +			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
> +			 bool pin);
> static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
> 
> +
> +static int get_sptep_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes[4])
> +
> +{
> +	struct kvm_shadow_walk_iterator iterator;
> +	int nr_sptes = 0;
> +
> +	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> +		return nr_sptes;
> +
> +	for_each_shadow_entry(vcpu, addr, iterator) {
> +		sptes[iterator.level-1] = iterator.sptep;
> +		nr_sptes++;
> +		if (!is_shadow_present_pte(*iterator.sptep))
> +			break;
> +	}
> +
> +	return nr_sptes;
> +}
> +
> +static bool __direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, bool pin)
> +{
> +	u64 *sptes[4];
> +	int r, i, level;
> +
> +	r = get_sptep_hierarchy(vcpu, gfn << PAGE_SHIFT, sptes);
> +	if (!r)
> +		return false;
> +
> +	level = 5 - r;
> +	if (!is_last_spte(*sptes[level-1], level))
> +		return false;
> +	if (!is_shadow_present_pte(*sptes[level-1]))
> +		return false;
> +
> +	for (i = 0; i < r; i++) {
> +		u64 *sptep = sptes[3-i];
> +		struct kvm_mmu_page *sp = page_header(__pa(sptep));
> +
> +		if (pin) {
> +			sp->pinned = true;
> +			set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
> +		} else {
> +			sp->pinned = false;
> +			clear_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
> +		}
> +	}
> +
> +	return true;
> +}
> +
> +static bool direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +	return __direct_pin_sptes(vcpu, gfn, true);
> +}
> +
> +static bool direct_unpin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +	return __direct_pin_sptes(vcpu, gfn, false);
> +}
> +

I guess we can reuse the infrastructure which marks/clears unsync sptes
to pin/unpin pages; that could omit a lot of code in this patch.



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-07-09 19:12 ` [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path mtosatti
  2014-07-21 13:14   ` Gleb Natapov
@ 2014-07-21 21:55   ` Xiao Guangrong
  2014-09-09 15:35     ` Marcelo Tosatti
  1 sibling, 1 reply; 30+ messages in thread
From: Xiao Guangrong @ 2014-07-21 21:55 UTC (permalink / raw)
  To: mtosatti; +Cc: kvm, ak, pbonzini, gleb, avi.kivity


On Jul 10, 2014, at 3:12 AM, mtosatti@redhat.com wrote:

> Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> deleting a pinned spte.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
> arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
> 1 file changed, 23 insertions(+), 6 deletions(-)
> 
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> @@ -1208,7 +1208,8 @@
> *
> * Return true if tlb need be flushed.
> */
> -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> +			       bool skip_pinned)
> {
> 	u64 spte = *sptep;
> 
> @@ -1218,6 +1219,22 @@
> 
> 	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> 
> +	if (is_pinned_spte(spte)) {
> +		/* keep pinned spte intact, mark page dirty again */
> +		if (skip_pinned) {
> +			struct kvm_mmu_page *sp;
> +			gfn_t gfn;
> +
> +			sp = page_header(__pa(sptep));
> +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +
> +			mark_page_dirty(kvm, gfn);
> +			return false;
> +		} else
> +			mmu_reload_pinned_vcpus(kvm);
> +	}
> +
> +
> 	if (pt_protect)
> 		spte &= ~SPTE_MMU_WRITEABLE;
> 	spte = spte & ~PT_WRITABLE_MASK;

This is also a window between marking the spte read-only and re-pinning it…
IIUC, I think pinned sptes can not be zapped or write-protected at any time


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 4/4] KVM: MMU: pinned sps are not candidates for deletion.
  2014-07-09 19:12 ` [patch 4/4] KVM: MMU: pinned sps are not candidates for deletion mtosatti
@ 2014-07-21 21:59   ` Xiao Guangrong
  2014-09-09 15:41     ` Marcelo Tosatti
  0 siblings, 1 reply; 30+ messages in thread
From: Xiao Guangrong @ 2014-07-21 21:59 UTC (permalink / raw)
  To: mtosatti; +Cc: kvm, ak, pbonzini, gleb, avi.kivity


On Jul 10, 2014, at 3:12 AM, mtosatti@redhat.com wrote:

> Skip pinned shadow pages when selecting pages to zap.

It seems there is no way to prevent changing pinned spte on
zap-all path?

I am thinking we could move pinned shadow pages to another list (e.g. pinned_shadow_pages)
instead of the active list, so that they can not be touched by any other free path.
What do you think?



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only)
  2014-07-21 21:46   ` Xiao Guangrong
@ 2014-07-22  5:26     ` Xiao Guangrong
  0 siblings, 0 replies; 30+ messages in thread
From: Xiao Guangrong @ 2014-07-22  5:26 UTC (permalink / raw)
  To: Xiao Guangrong, mtosatti; +Cc: kvm, ak, pbonzini, gleb, avi.kivity

On 07/22/2014 05:46 AM, Xiao Guangrong wrote:

>> +		if (is_pinned_spte(*sptep)) {
>> +			/* don't nuke pinned sptes if page aging: return
>> + 			 * young=yes instead.
>> + 			 */
>> +			if (age)
>> +				return 1;
>> +			mmu_reload_pinned_vcpus(kvm);
>> +		}
>> 		drop_spte(kvm, sptep);
> 
> This has a window between zapping spte and re-pin spte, so guest will fail
> at this time.

I got it, mmu_reload_pinned_vcpus will kick all vcpus out of guest mode and
pin the pages again... so it is ok. :)


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only)
  2014-07-17 21:38     ` Marcelo Tosatti
@ 2014-07-24 12:16       ` Nadav Amit
  0 siblings, 0 replies; 30+ messages in thread
From: Nadav Amit @ 2014-07-24 12:16 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, ak, pbonzini, xiaoguangrong, gleb, avi.kivity

[-- Attachment #1: Type: text/plain, Size: 713 bytes --]


On Jul 18, 2014, at 12:38 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Thu, Jul 17, 2014 at 08:18:03PM +0300, Nadav Amit wrote:
>> Small question if I may regarding kvm_mmu_pin_pages:
>> 
...
>> I understand that the current use-case is for pinning only few
>> pages. Yet, wouldn't it be better (for performance) to check whether
>> the gfn uses a large page and if so to skip forward, increasing
>> gfn_offset to point to the next large page?
> 
> Sure, that can be a lazy optimization and performed when necessary?
> 
> (feel free to do it in advance if you're interested in doing it 
> now).
I would do it once your patch is applied (I don’t see it in the queue).

Thanks,
Nadav

[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-07-21 13:14   ` Gleb Natapov
@ 2014-09-09 15:28     ` Marcelo Tosatti
  2014-09-22 17:19       ` Marcelo Tosatti
  2014-10-04  7:23       ` Gleb Natapov
  0 siblings, 2 replies; 30+ messages in thread
From: Marcelo Tosatti @ 2014-09-09 15:28 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Mon, Jul 21, 2014 at 04:14:24PM +0300, Gleb Natapov wrote:
> On Wed, Jul 09, 2014 at 04:12:53PM -0300, mtosatti@redhat.com wrote:
> > Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> > deleting a pinned spte.
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > ---
> >  arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
> >  1 file changed, 23 insertions(+), 6 deletions(-)
> > 
> > Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> > ===================================================================
> > --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> > +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> > @@ -1208,7 +1208,8 @@
> >   *
> >   * Return true if tlb need be flushed.
> >   */
> > -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> > +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> > +			       bool skip_pinned)
> >  {
> >  	u64 spte = *sptep;
> >  
> > @@ -1218,6 +1219,22 @@
> >  
> >  	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> >  
> > +	if (is_pinned_spte(spte)) {
> > +		/* keep pinned spte intact, mark page dirty again */
> > +		if (skip_pinned) {
> > +			struct kvm_mmu_page *sp;
> > +			gfn_t gfn;
> > +
> > +			sp = page_header(__pa(sptep));
> > +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > +
> > +			mark_page_dirty(kvm, gfn);
> > +			return false;
> Why not mark all pinned gfns as dirty in kvm_vm_ioctl_get_dirty_log() while
> populating dirty_bitmap_buffer?

The pinned gfns are per-vcpu, so that would require looping over all
vcpus (not scalable).


> > +		} else
> > +			mmu_reload_pinned_vcpus(kvm);
> Can you explain why do you need this?

Because when skip_pinned = false we want vcpus to exit (this is called
from the dirty logging enablement codepath).


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-07-21 21:55   ` Xiao Guangrong
@ 2014-09-09 15:35     ` Marcelo Tosatti
  0 siblings, 0 replies; 30+ messages in thread
From: Marcelo Tosatti @ 2014-09-09 15:35 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: kvm, ak, pbonzini, gleb, avi.kivity

On Tue, Jul 22, 2014 at 05:55:20AM +0800, Xiao Guangrong wrote:
> 
> On Jul 10, 2014, at 3:12 AM, mtosatti@redhat.com wrote:
> 
> > Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> > deleting a pinned spte.
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > ---
> > arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
> > 1 file changed, 23 insertions(+), 6 deletions(-)
> > 
> > Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> > ===================================================================
> > --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> > +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> > @@ -1208,7 +1208,8 @@
> > *
> > * Return true if tlb need be flushed.
> > */
> > -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> > +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> > +			       bool skip_pinned)
> > {
> > 	u64 spte = *sptep;
> > 
> > @@ -1218,6 +1219,22 @@
> > 
> > 	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> > 
> > +	if (is_pinned_spte(spte)) {
> > +		/* keep pinned spte intact, mark page dirty again */
> > +		if (skip_pinned) {
> > +			struct kvm_mmu_page *sp;
> > +			gfn_t gfn;
> > +
> > +			sp = page_header(__pa(sptep));
> > +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > +
> > +			mark_page_dirty(kvm, gfn);
> > +			return false;
> > +		} else
> > +			mmu_reload_pinned_vcpus(kvm);
> > +	}
> > +
> > +
> > 	if (pt_protect)
> > 		spte &= ~SPTE_MMU_WRITEABLE;
> > 	spte = spte & ~PT_WRITABLE_MASK;
> 
> There is also a window between marking the spte read-only and re-pinning it…
> IIUC, I think pinned sptes can not be zapped or write-protected at any time

It is safe because mmu_lock is held by kvm_mmu_slot_remove_write_access?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 4/4] KVM: MMU: pinned sps are not candidates for deletion.
  2014-07-21 21:59   ` Xiao Guangrong
@ 2014-09-09 15:41     ` Marcelo Tosatti
  2014-09-22 17:17       ` Marcelo Tosatti
  2014-09-23  5:30       ` Xiao Guangrong
  0 siblings, 2 replies; 30+ messages in thread
From: Marcelo Tosatti @ 2014-09-09 15:41 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: kvm, ak, pbonzini, gleb, avi.kivity

On Tue, Jul 22, 2014 at 05:59:42AM +0800, Xiao Guangrong wrote:
> 
> On Jul 10, 2014, at 3:12 AM, mtosatti@redhat.com wrote:
> 
> > Skip pinned shadow pages when selecting pages to zap.
> 
> It seems there is no way to prevent changing pinned spte on
> zap-all path?

Xiao,

The way to do it is to reload the remote mmus, forcing the vcpus to exit,
before zapping a page; the vcpu will then page fault any necessary pages
back in via kvm_mmu_pin_pages.

kvm_mmu_invalidate_zap_all_pages does: 

- spin_lock(mmu_lock)
- kvm_reload_remote_mmus
...
- spin_unlock(mmu_lock)

So it is OK to change a pinned spte on the zap-all path.
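
In other words, the ordering relied upon is roughly the following (a sketch
of the steps listed above, not the exact upstream code):

	spin_lock(&kvm->mmu_lock);
	kvm_reload_remote_mmus(kvm);	/* every vcpu now has a reload request
					 * pending and is kicked out of guest
					 * mode */
	/* ... zap pages, including any pinned sptes ... */
	spin_unlock(&kvm->mmu_lock);
	/* No vcpu can re-enter the guest before it processes the reload
	 * request, and processing it re-creates the pinned sptes via
	 * kvm_mmu_pin_pages() on the page fault path.
	 */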

> I am thing if we could move pinned spte to another list (eg. pinned_shadow_pages)
> instead of active list so that it can not be touched by any other free paths. 
> Your idea?

As mentioned above, it is ok to zap pinned sptes as long as a reload of the
remote mmus is requested first.

That said, do you still want to consider a separate list?


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 4/4] KVM: MMU: pinned sps are not candidates for deletion.
  2014-09-09 15:41     ` Marcelo Tosatti
@ 2014-09-22 17:17       ` Marcelo Tosatti
  2014-09-23  5:30       ` Xiao Guangrong
  1 sibling, 0 replies; 30+ messages in thread
From: Marcelo Tosatti @ 2014-09-22 17:17 UTC (permalink / raw)
  To: Xiao Guangrong, Xiao Guangrong; +Cc: kvm, ak, pbonzini, gleb, avi.kivity

On Tue, Sep 09, 2014 at 12:41:27PM -0300, Marcelo Tosatti wrote:
> On Tue, Jul 22, 2014 at 05:59:42AM +0800, Xiao Guangrong wrote:
> > 
> > On Jul 10, 2014, at 3:12 AM, mtosatti@redhat.com wrote:
> > 
> > > Skip pinned shadow pages when selecting pages to zap.
> > 
> > It seems there is no way to prevent changing pinned spte on
> > zap-all path?
> 
> Xiao,
> 
> The way would be to reload remote mmus, forcing the vcpu to exit,
> zapping a page, then vcpu will pagefault any necessary page via
> kvm_mmu_pin_pages.
> 
> kvm_mmu_invalidate_zap_all_pages does: 
> 
> - spin_lock(mmu_lock)
> - kvm_reload_remote_mmus
> ...
> - spin_unlock(mmu_lock)
> 
> So its OK to change pinned spte on zap all path.
> 
> > I am thing if we could move pinned spte to another list (eg. pinned_shadow_pages)
> > instead of active list so that it can not be touched by any other free paths. 
> > Your idea?
> 
> As mentioned it above, it is ok to zap pinned sptes as long w 
> reload remote mmus request is performed.
> 
> That said, you still consider a separate list?
> 

Xiao, ping?


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-09-09 15:28     ` Marcelo Tosatti
@ 2014-09-22 17:19       ` Marcelo Tosatti
  2014-09-30 18:59         ` Marcelo Tosatti
  2014-10-04  7:23       ` Gleb Natapov
  1 sibling, 1 reply; 30+ messages in thread
From: Marcelo Tosatti @ 2014-09-22 17:19 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Tue, Sep 09, 2014 at 12:28:11PM -0300, Marcelo Tosatti wrote:
> On Mon, Jul 21, 2014 at 04:14:24PM +0300, Gleb Natapov wrote:
> > On Wed, Jul 09, 2014 at 04:12:53PM -0300, mtosatti@redhat.com wrote:
> > > Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> > > deleting a pinned spte.
> > > 
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > 
> > > ---
> > >  arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
> > >  1 file changed, 23 insertions(+), 6 deletions(-)
> > > 
> > > Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> > > ===================================================================
> > > --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> > > +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> > > @@ -1208,7 +1208,8 @@
> > >   *
> > >   * Return true if tlb need be flushed.
> > >   */
> > > -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> > > +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> > > +			       bool skip_pinned)
> > >  {
> > >  	u64 spte = *sptep;
> > >  
> > > @@ -1218,6 +1219,22 @@
> > >  
> > >  	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> > >  
> > > +	if (is_pinned_spte(spte)) {
> > > +		/* keep pinned spte intact, mark page dirty again */
> > > +		if (skip_pinned) {
> > > +			struct kvm_mmu_page *sp;
> > > +			gfn_t gfn;
> > > +
> > > +			sp = page_header(__pa(sptep));
> > > +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > > +
> > > +			mark_page_dirty(kvm, gfn);
> > > +			return false;
> > Why not mark all pinned gfns as dirty in kvm_vm_ioctl_get_dirty_log() while
> > populating dirty_bitmap_buffer?
> 
> The pinned gfns are per-vcpu. Requires looping all vcpus (not
> scalable).
> 
> 
> > > +		} else
> > > +			mmu_reload_pinned_vcpus(kvm);
> > Can you explain why do you need this?
> 
> Because if skip_pinned = false, we want vcpus to exit (called
> from enable dirty logging codepath).

Gleb, any further opinions on this?

Can you ack the patch? TIA


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 4/4] KVM: MMU: pinned sps are not candidates for deletion.
  2014-09-09 15:41     ` Marcelo Tosatti
  2014-09-22 17:17       ` Marcelo Tosatti
@ 2014-09-23  5:30       ` Xiao Guangrong
  1 sibling, 0 replies; 30+ messages in thread
From: Xiao Guangrong @ 2014-09-23  5:30 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, ak, pbonzini, gleb, avi.kivity


Hi Marcelo,

Sorry for the delay.

On Sep 9, 2014, at 11:41 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Tue, Jul 22, 2014 at 05:59:42AM +0800, Xiao Guangrong wrote:
>> 
>> On Jul 10, 2014, at 3:12 AM, mtosatti@redhat.com wrote:
>> 
>>> Skip pinned shadow pages when selecting pages to zap.
>> 
>> It seems there is no way to prevent changing pinned spte on
>> zap-all path?
> 
> Xiao,
> 
> The way would be to reload remote mmus, forcing the vcpu to exit,
> zapping a page, then vcpu will pagefault any necessary page via
> kvm_mmu_pin_pages.
> 
> kvm_mmu_invalidate_zap_all_pages does: 
> 
> - spin_lock(mmu_lock)
> - kvm_reload_remote_mmus
> ...
> - spin_unlock(mmu_lock)
> 
> So its OK to change pinned spte on zap all path.

Got it, thanks for your explanation.

> 
>> I am thing if we could move pinned spte to another list (eg. pinned_shadow_pages)
>> instead of active list so that it can not be touched by any other free paths. 
>> Your idea?
> 
> As mentioned it above, it is ok to zap pinned sptes as long w 
> reload remote mmus request is performed.
> 
> That said, you still consider a separate list?

I need to think more about this idea… let's proceed with your patch first. :)



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-09-22 17:19       ` Marcelo Tosatti
@ 2014-09-30 18:59         ` Marcelo Tosatti
  0 siblings, 0 replies; 30+ messages in thread
From: Marcelo Tosatti @ 2014-09-30 18:59 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

Ping?

On Mon, Sep 22, 2014 at 02:19:14PM -0300, Marcelo Tosatti wrote:
> On Tue, Sep 09, 2014 at 12:28:11PM -0300, Marcelo Tosatti wrote:
> > On Mon, Jul 21, 2014 at 04:14:24PM +0300, Gleb Natapov wrote:
> > > On Wed, Jul 09, 2014 at 04:12:53PM -0300, mtosatti@redhat.com wrote:
> > > > Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> > > > deleting a pinned spte.
> > > > 
> > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > 
> > > > ---
> > > >  arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
> > > >  1 file changed, 23 insertions(+), 6 deletions(-)
> > > > 
> > > > Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> > > > ===================================================================
> > > > --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> > > > +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> > > > @@ -1208,7 +1208,8 @@
> > > >   *
> > > >   * Return true if tlb need be flushed.
> > > >   */
> > > > -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> > > > +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> > > > +			       bool skip_pinned)
> > > >  {
> > > >  	u64 spte = *sptep;
> > > >  
> > > > @@ -1218,6 +1219,22 @@
> > > >  
> > > >  	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> > > >  
> > > > +	if (is_pinned_spte(spte)) {
> > > > +		/* keep pinned spte intact, mark page dirty again */
> > > > +		if (skip_pinned) {
> > > > +			struct kvm_mmu_page *sp;
> > > > +			gfn_t gfn;
> > > > +
> > > > +			sp = page_header(__pa(sptep));
> > > > +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > > > +
> > > > +			mark_page_dirty(kvm, gfn);
> > > > +			return false;
> > > Why not mark all pinned gfns as dirty in kvm_vm_ioctl_get_dirty_log() while
> > > populating dirty_bitmap_buffer?
> > 
> > The pinned gfns are per-vcpu. Requires looping all vcpus (not
> > scalable).
> > 
> > 
> > > > +		} else
> > > > +			mmu_reload_pinned_vcpus(kvm);
> > > Can you explain why do you need this?
> > 
> > Because if skip_pinned = false, we want vcpus to exit (called
> > from enable dirty logging codepath).
> 
> Gleb, any further opinions on this ?
> 
> Can you ack the patch ? TIA
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-09-09 15:28     ` Marcelo Tosatti
  2014-09-22 17:19       ` Marcelo Tosatti
@ 2014-10-04  7:23       ` Gleb Natapov
  2014-10-06 17:19         ` Marcelo Tosatti
  1 sibling, 1 reply; 30+ messages in thread
From: Gleb Natapov @ 2014-10-04  7:23 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Tue, Sep 09, 2014 at 12:28:11PM -0300, Marcelo Tosatti wrote:
> On Mon, Jul 21, 2014 at 04:14:24PM +0300, Gleb Natapov wrote:
> > On Wed, Jul 09, 2014 at 04:12:53PM -0300, mtosatti@redhat.com wrote:
> > > Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> > > deleting a pinned spte.
> > > 
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > 
> > > ---
> > >  arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
> > >  1 file changed, 23 insertions(+), 6 deletions(-)
> > > 
> > > Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> > > ===================================================================
> > > --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> > > +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> > > @@ -1208,7 +1208,8 @@
> > >   *
> > >   * Return true if tlb need be flushed.
> > >   */
> > > -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> > > +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> > > +			       bool skip_pinned)
> > >  {
> > >  	u64 spte = *sptep;
> > >  
> > > @@ -1218,6 +1219,22 @@
> > >  
> > >  	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> > >  
> > > +	if (is_pinned_spte(spte)) {
> > > +		/* keep pinned spte intact, mark page dirty again */
> > > +		if (skip_pinned) {
> > > +			struct kvm_mmu_page *sp;
> > > +			gfn_t gfn;
> > > +
> > > +			sp = page_header(__pa(sptep));
> > > +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > > +
> > > +			mark_page_dirty(kvm, gfn);
> > > +			return false;
> > Why not mark all pinned gfns as dirty in kvm_vm_ioctl_get_dirty_log() while
> > populating dirty_bitmap_buffer?
> 
> The pinned gfns are per-vcpu. Requires looping all vcpus (not
> scalable).
> 
OK, but do they really have to be per-cpu? What's the advantage?

> 
> > > +		} else
> > > +			mmu_reload_pinned_vcpus(kvm);
> > Can you explain why do you need this?
> 
> Because if skip_pinned = false, we want vcpus to exit (called
> from enable dirty logging codepath).
> 
I guess what I wanted to ask is why we need skip_pinned at all. As far as I can see it
is set to false in two cases:
1: the page is write protected for shadow MMU needs; this should not happen since the feature
   is not supported with the shadow mmu (it can happen with nested EPT, but the page will be marked
   as accessed during the next vcpu entry anyway, so how will that work)?
2: the slot is marked as read-only: such a slot cannot have PEBS pages, and if it does the guest will
   die anyway during the next guest entry, so why not maintain a list of pinned pages per slot and
   kill the guest if a slot with pinned pages is marked read-only.
 
--
			Gleb.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-10-04  7:23       ` Gleb Natapov
@ 2014-10-06 17:19         ` Marcelo Tosatti
  2014-10-08  6:56           ` Gleb Natapov
  0 siblings, 1 reply; 30+ messages in thread
From: Marcelo Tosatti @ 2014-10-06 17:19 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Sat, Oct 04, 2014 at 10:23:32AM +0300, Gleb Natapov wrote:
> On Tue, Sep 09, 2014 at 12:28:11PM -0300, Marcelo Tosatti wrote:
> > On Mon, Jul 21, 2014 at 04:14:24PM +0300, Gleb Natapov wrote:
> > > On Wed, Jul 09, 2014 at 04:12:53PM -0300, mtosatti@redhat.com wrote:
> > > > Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> > > > deleting a pinned spte.
> > > > 
> > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > 
> > > > ---
> > > >  arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
> > > >  1 file changed, 23 insertions(+), 6 deletions(-)
> > > > 
> > > > Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> > > > ===================================================================
> > > > --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> > > > +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> > > > @@ -1208,7 +1208,8 @@
> > > >   *
> > > >   * Return true if tlb need be flushed.
> > > >   */
> > > > -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> > > > +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> > > > +			       bool skip_pinned)
> > > >  {
> > > >  	u64 spte = *sptep;
> > > >  
> > > > @@ -1218,6 +1219,22 @@
> > > >  
> > > >  	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> > > >  
> > > > +	if (is_pinned_spte(spte)) {
> > > > +		/* keep pinned spte intact, mark page dirty again */
> > > > +		if (skip_pinned) {
> > > > +			struct kvm_mmu_page *sp;
> > > > +			gfn_t gfn;
> > > > +
> > > > +			sp = page_header(__pa(sptep));
> > > > +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > > > +
> > > > +			mark_page_dirty(kvm, gfn);
> > > > +			return false;
> > > Why not mark all pinned gfns as dirty in kvm_vm_ioctl_get_dirty_log() while
> > > populating dirty_bitmap_buffer?
> > 
> > The pinned gfns are per-vcpu. Requires looping all vcpus (not
> > scalable).
> > 
> OK, but do they really have to be per-cpu? What's the advantage?

The guest configures pinning on a per-cpu basis (that is, enabling PEBS 
is done on a per-cpu basis).
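
That suggests per-vcpu bookkeeping, roughly along the lines of the sketch
below; the structure and field names are illustrative, not taken from the
series:

	/* Illustrative only: one pinned range, registered by a vcpu when the
	 * guest enables PEBS on that (virtual) cpu.
	 */
	struct pinned_page_range {
		gfn_t base_gfn;
		unsigned long npages;
		struct list_head link;	/* linked on a per-vcpu list, e.g.
					 * vcpu->arch.pinned_mmu_pages */
	};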

> > 
> > > > +		} else
> > > > +			mmu_reload_pinned_vcpus(kvm);
> > > Can you explain why do you need this?
> > 
> > Because if skip_pinned = false, we want vcpus to exit (called
> > from enable dirty logging codepath).
> > 
> I guess what I wanted to ask is why do we need skip_pinned? As far as I see it
> is set to false in two cases:
> 1: page is write protected for shadow MMU needs, should not happen since the feature

Correct.

>    is not supported with shadow mmu (can happen with nested EPT, but page will be marked
>    is accessed during next vcpu entry anyway, so how will it work)?

PEBS is not supported on nested EPT.

> 2: when slot is marked as read only: such slot cannot have PEBS pages and if it will guest will die
>    anyway during next guest entry, so why not maintain list of pinned pages per slot and kill aguest
>    if slot with pinned pages is marked read only.

2: when the slot's pages have dirty logging enabled. In that case the page
is marked dirty immediately. This is the call from
kvm_mmu_slot_remove_write_access.

So when enabling dirty logging, for pinned sptes:

    - keep the pinned spte intact.
    - mark the gfn the pinned spte maps as dirty in the dirty
      log.


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-10-06 17:19         ` Marcelo Tosatti
@ 2014-10-08  6:56           ` Gleb Natapov
  2014-10-08 17:15             ` Marcelo Tosatti
  0 siblings, 1 reply; 30+ messages in thread
From: Gleb Natapov @ 2014-10-08  6:56 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Mon, Oct 06, 2014 at 02:19:32PM -0300, Marcelo Tosatti wrote:
> On Sat, Oct 04, 2014 at 10:23:32AM +0300, Gleb Natapov wrote:
> > On Tue, Sep 09, 2014 at 12:28:11PM -0300, Marcelo Tosatti wrote:
> > > On Mon, Jul 21, 2014 at 04:14:24PM +0300, Gleb Natapov wrote:
> > > > On Wed, Jul 09, 2014 at 04:12:53PM -0300, mtosatti@redhat.com wrote:
> > > > > Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> > > > > deleting a pinned spte.
> > > > > 
> > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > 
> > > > > ---
> > > > >  arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
> > > > >  1 file changed, 23 insertions(+), 6 deletions(-)
> > > > > 
> > > > > Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> > > > > ===================================================================
> > > > > --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> > > > > +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> > > > > @@ -1208,7 +1208,8 @@
> > > > >   *
> > > > >   * Return true if tlb need be flushed.
> > > > >   */
> > > > > -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> > > > > +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> > > > > +			       bool skip_pinned)
> > > > >  {
> > > > >  	u64 spte = *sptep;
> > > > >  
> > > > > @@ -1218,6 +1219,22 @@
> > > > >  
> > > > >  	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> > > > >  
> > > > > +	if (is_pinned_spte(spte)) {
> > > > > +		/* keep pinned spte intact, mark page dirty again */
> > > > > +		if (skip_pinned) {
> > > > > +			struct kvm_mmu_page *sp;
> > > > > +			gfn_t gfn;
> > > > > +
> > > > > +			sp = page_header(__pa(sptep));
> > > > > +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > > > > +
> > > > > +			mark_page_dirty(kvm, gfn);
> > > > > +			return false;
> > > > Why not mark all pinned gfns as dirty in kvm_vm_ioctl_get_dirty_log() while
> > > > populating dirty_bitmap_buffer?
> > > 
> > > The pinned gfns are per-vcpu. Requires looping all vcpus (not
> > > scalable).
> > > 
> > OK, but do they really have to be per-cpu? What's the advantage?
> 
> The guest configures pinning on a per-cpu basis (that is, enabling PEBS 
> is done on a per-cpu basis).
Is it a problem to also maintain a global pinned pages list for each memslot?

> 
> > > 
> > > > > +		} else
> > > > > +			mmu_reload_pinned_vcpus(kvm);
> > > > Can you explain why do you need this?
> > > 
> > > Because if skip_pinned = false, we want vcpus to exit (called
> > > from enable dirty logging codepath).
> > > 
> > I guess what I wanted to ask is why do we need skip_pinned? As far as I see it
> > is set to false in two cases:
> > 1: page is write protected for shadow MMU needs, should not happen since the feature
> 
> Correct.
> 
> >    is not supported with shadow mmu (can happen with nested EPT, but page will be marked
> >    is accessed during next vcpu entry anyway, so how will it work)?
> 
> PEBS is not supported on nested EPT.
> 
OK, so for this case we do not need skip_pinned. Assert if it happens.

> > 2: when slot is marked as read only: such slot cannot have PEBS pages and if it will guest will die
> >    anyway during next guest entry, so why not maintain list of pinned pages per slot and kill aguest
> >    if slot with pinned pages is marked read only.
> 
> 2: when slots pages have dirty logging enabled. In that case, the page
> is marked dirty immediately. This is the call from
> kvm_mmu_slot_remove_write_access.
> 
Right, my 2 is incorrect.

> So when enabling dirty logging, for pinned sptes:
> 
>     - maintain pinned spte intact.
>     - mark gfn for which pinned spte represents as dirty in the dirty
>       log.
> 
But you set skip_pinned to false in kvm_mmu_slot_remove_write_access(), so this is not
what is happening. Did you mean to set it to true there?

--
			Gleb.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-10-08  6:56           ` Gleb Natapov
@ 2014-10-08 17:15             ` Marcelo Tosatti
  2014-10-08 17:59               ` Gleb Natapov
  0 siblings, 1 reply; 30+ messages in thread
From: Marcelo Tosatti @ 2014-10-08 17:15 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Wed, Oct 08, 2014 at 09:56:36AM +0300, Gleb Natapov wrote:
> On Mon, Oct 06, 2014 at 02:19:32PM -0300, Marcelo Tosatti wrote:
> > On Sat, Oct 04, 2014 at 10:23:32AM +0300, Gleb Natapov wrote:
> > > On Tue, Sep 09, 2014 at 12:28:11PM -0300, Marcelo Tosatti wrote:
> > > > On Mon, Jul 21, 2014 at 04:14:24PM +0300, Gleb Natapov wrote:
> > > > > On Wed, Jul 09, 2014 at 04:12:53PM -0300, mtosatti@redhat.com wrote:
> > > > > > Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> > > > > > deleting a pinned spte.
> > > > > > 
> > > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > 
> > > > > > ---
> > > > > >  arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
> > > > > >  1 file changed, 23 insertions(+), 6 deletions(-)
> > > > > > 
> > > > > > Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> > > > > > ===================================================================
> > > > > > --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> > > > > > +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> > > > > > @@ -1208,7 +1208,8 @@
> > > > > >   *
> > > > > >   * Return true if tlb need be flushed.
> > > > > >   */
> > > > > > -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> > > > > > +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> > > > > > +			       bool skip_pinned)
> > > > > >  {
> > > > > >  	u64 spte = *sptep;
> > > > > >  
> > > > > > @@ -1218,6 +1219,22 @@
> > > > > >  
> > > > > >  	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> > > > > >  
> > > > > > +	if (is_pinned_spte(spte)) {
> > > > > > +		/* keep pinned spte intact, mark page dirty again */
> > > > > > +		if (skip_pinned) {
> > > > > > +			struct kvm_mmu_page *sp;
> > > > > > +			gfn_t gfn;
> > > > > > +
> > > > > > +			sp = page_header(__pa(sptep));
> > > > > > +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > > > > > +
> > > > > > +			mark_page_dirty(kvm, gfn);
> > > > > > +			return false;
> > > > > Why not mark all pinned gfns as dirty in kvm_vm_ioctl_get_dirty_log() while
> > > > > populating dirty_bitmap_buffer?
> > > > 
> > > > The pinned gfns are per-vcpu. Requires looping all vcpus (not
> > > > scalable).
> > > > 
> > > OK, but do they really have to be per-cpu? What's the advantage?
> > 
> > The guest configures pinning on a per-cpu basis (that is, enabling PEBS 
> > is done on a per-cpu basis).
> Is it a problem to maintain global pinned pages list for each memslot too?
> 
> > 
> > > > 
> > > > > > +		} else
> > > > > > +			mmu_reload_pinned_vcpus(kvm);
> > > > > Can you explain why do you need this?
> > > > 
> > > > Because if skip_pinned = false, we want vcpus to exit (called
> > > > from enable dirty logging codepath).
> > > > 
> > > I guess what I wanted to ask is why do we need skip_pinned? As far as I see it
> > > is set to false in two cases:
> > > 1: page is write protected for shadow MMU needs, should not happen since the feature
> > 
> > Correct.
> > 
> > >    is not supported with shadow mmu (can happen with nested EPT, but page will be marked
> > >    is accessed during next vcpu entry anyway, so how will it work)?
> > 
> > PEBS is not supported on nested EPT.
> > 
> OK, so for this case we do not need skip_pinned. Assert if it happens.
> 
> > > 2: when slot is marked as read only: such slot cannot have PEBS pages and if it will guest will die
> > >    anyway during next guest entry, so why not maintain list of pinned pages per slot and kill aguest
> > >    if slot with pinned pages is marked read only.
> > 
> > 2: when slots pages have dirty logging enabled. In that case, the page
> > is marked dirty immediately. This is the call from
> > kvm_mmu_slot_remove_write_access.
> > 
> Right, my 2 is incorrect.
> 
> > So when enabling dirty logging, for pinned sptes:
> > 
> >     - maintain pinned spte intact.
> >     - mark gfn for which pinned spte represents as dirty in the dirty
> >       log.
> > 
> But you set skip_pinned to false in kvm_mmu_slot_remove_write_access(), so this is not
> what is happening. Did you mean to set it to true there?

Argh, let's try again:

skip_pinned = true
------------------

mark page dirty, keep spte intact

called from get dirty log path.

skip_pinned = false
-------------------
reload remote mmu
destroy pinned spte.

called from: dirty log enablement, rmap write protect (unused for pinned
sptes)


Note this behaviour is your suggestion:

> > > > Why write protect it at all? mmu_reload_pinned_vcpus() will
> > > > unprotected
> > > > it anyway
> > > > on the next vmentry. Isn't it better to just report all pinned
> > > > pages as
> > > > dirty alway.


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-10-08 17:15             ` Marcelo Tosatti
@ 2014-10-08 17:59               ` Gleb Natapov
  2014-10-08 19:22                 ` Marcelo Tosatti
  0 siblings, 1 reply; 30+ messages in thread
From: Gleb Natapov @ 2014-10-08 17:59 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Wed, Oct 08, 2014 at 02:15:34PM -0300, Marcelo Tosatti wrote:
> On Wed, Oct 08, 2014 at 09:56:36AM +0300, Gleb Natapov wrote:
> > On Mon, Oct 06, 2014 at 02:19:32PM -0300, Marcelo Tosatti wrote:
> > > On Sat, Oct 04, 2014 at 10:23:32AM +0300, Gleb Natapov wrote:
> > > > On Tue, Sep 09, 2014 at 12:28:11PM -0300, Marcelo Tosatti wrote:
> > > > > On Mon, Jul 21, 2014 at 04:14:24PM +0300, Gleb Natapov wrote:
> > > > > > On Wed, Jul 09, 2014 at 04:12:53PM -0300, mtosatti@redhat.com wrote:
> > > > > > > Reload remote vcpus MMU from GET_DIRTY_LOG codepath, before
> > > > > > > deleting a pinned spte.
> > > > > > > 
> > > > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > 
> > > > > > > ---
> > > > > > >  arch/x86/kvm/mmu.c |   29 +++++++++++++++++++++++------
> > > > > > >  1 file changed, 23 insertions(+), 6 deletions(-)
> > > > > > > 
> > > > > > > Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> > > > > > > ===================================================================
> > > > > > > --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 11:23:59.290744490 -0300
> > > > > > > +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 11:24:58.449632435 -0300
> > > > > > > @@ -1208,7 +1208,8 @@
> > > > > > >   *
> > > > > > >   * Return true if tlb need be flushed.
> > > > > > >   */
> > > > > > > -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
> > > > > > > +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect,
> > > > > > > +			       bool skip_pinned)
> > > > > > >  {
> > > > > > >  	u64 spte = *sptep;
> > > > > > >  
> > > > > > > @@ -1218,6 +1219,22 @@
> > > > > > >  
> > > > > > >  	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> > > > > > >  
> > > > > > > +	if (is_pinned_spte(spte)) {
> > > > > > > +		/* keep pinned spte intact, mark page dirty again */
> > > > > > > +		if (skip_pinned) {
> > > > > > > +			struct kvm_mmu_page *sp;
> > > > > > > +			gfn_t gfn;
> > > > > > > +
> > > > > > > +			sp = page_header(__pa(sptep));
> > > > > > > +			gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > > > > > > +
> > > > > > > +			mark_page_dirty(kvm, gfn);
> > > > > > > +			return false;
> > > > > > Why not mark all pinned gfns as dirty in kvm_vm_ioctl_get_dirty_log() while
> > > > > > populating dirty_bitmap_buffer?
> > > > > 
> > > > > The pinned gfns are per-vcpu. Requires looping all vcpus (not
> > > > > scalable).
> > > > > 
> > > > OK, but do they really have to be per-cpu? What's the advantage?
> > > 
> > > The guest configures pinning on a per-cpu basis (that is, enabling PEBS 
> > > is done on a per-cpu basis).
> > Is it a problem to maintain global pinned pages list for each memslot too?
> > 
> > > 
> > > > > 
> > > > > > > +		} else
> > > > > > > +			mmu_reload_pinned_vcpus(kvm);
> > > > > > Can you explain why do you need this?
> > > > > 
> > > > > Because if skip_pinned = false, we want vcpus to exit (called
> > > > > from enable dirty logging codepath).
> > > > > 
> > > > I guess what I wanted to ask is why do we need skip_pinned? As far as I see it
> > > > is set to false in two cases:
> > > > 1: page is write protected for shadow MMU needs, should not happen since the feature
> > > 
> > > Correct.
> > > 
> > > >    is not supported with shadow mmu (can happen with nested EPT, but page will be marked
> > > >    is accessed during next vcpu entry anyway, so how will it work)?
> > > 
> > > PEBS is not supported on nested EPT.
> > > 
> > OK, so for this case we do not need skip_pinned. Assert if it happens.
> > 
> > > > 2: when slot is marked as read only: such slot cannot have PEBS pages and if it will guest will die
> > > >    anyway during next guest entry, so why not maintain list of pinned pages per slot and kill aguest
> > > >    if slot with pinned pages is marked read only.
> > > 
> > > 2: when slots pages have dirty logging enabled. In that case, the page
> > > is marked dirty immediately. This is the call from
> > > kvm_mmu_slot_remove_write_access.
> > > 
> > Right, my 2 is incorrect.
> > 
> > > So when enabling dirty logging, for pinned sptes:
> > > 
> > >     - maintain pinned spte intact.
> > >     - mark gfn for which pinned spte represents as dirty in the dirty
> > >       log.
> > > 
> > But you set skip_pinned to false in kvm_mmu_slot_remove_write_access(), so this is not
> > what is happening. Did you mean to set it to true there?
> 
> Argh, lets try again:
> 
> skip_pinned = true
> ------------------
> 
> mark page dirty, keep spte intact
> 
> called from get dirty log path.
> 
> skip_pinned = false
> -------------------
> reload remote mmu
> destroy pinned spte.
> 
> called from: dirty log enablement, rmap write protect (unused for pinned
> sptes)
> 
> 
> Note this behaviour is your suggestion:

Yes, I remember that, and I thought we would not need this skip_pinned
at all.  For the rmap write protect case there shouldn't be any pinned pages,
but why does dirty log enablement set skip_pinned to false? Why not mark
pinned pages as dirty just like you do in the get dirty log path?

> 
> > > > > Why write protect it at all? mmu_reload_pinned_vcpus() will
> > > > > unprotected
> > > > > it anyway
> > > > > on the next vmentry. Isn't it better to just report all pinned
> > > > > pages as
> > > > > dirty alway.
> 

--
			Gleb.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-10-08 17:59               ` Gleb Natapov
@ 2014-10-08 19:22                 ` Marcelo Tosatti
  2014-10-10 13:09                   ` Gleb Natapov
  0 siblings, 1 reply; 30+ messages in thread
From: Marcelo Tosatti @ 2014-10-08 19:22 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

> > 
> > Argh, lets try again:
> > 
> > skip_pinned = true
> > ------------------
> > 
> > mark page dirty, keep spte intact
> > 
> > called from get dirty log path.
> > 
> > skip_pinned = false
> > -------------------
> > reload remote mmu
> > destroy pinned spte.
> > 
> > called from: dirty log enablement, rmap write protect (unused for pinned
> > sptes)
> > 
> > 
> > Note this behaviour is your suggestion:
> 
> Yes, I remember that and I thought we will not need this skip_pinned
> at all.  For rmap write protect case there shouldn't be any pinned pages,
> but why dirty log enablement sets skip_pinned to false? Why not mark
> pinned pages as dirty just like you do in get dirty log path?

Because if it is a large spte, it must be nuked (or marked read-only,
which is not possible for pinned sptes).


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-10-08 19:22                 ` Marcelo Tosatti
@ 2014-10-10 13:09                   ` Gleb Natapov
  2014-10-13  8:52                     ` Marcelo Tosatti
  0 siblings, 1 reply; 30+ messages in thread
From: Gleb Natapov @ 2014-10-10 13:09 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Wed, Oct 08, 2014 at 04:22:31PM -0300, Marcelo Tosatti wrote:
> > > 
> > > Argh, lets try again:
> > > 
> > > skip_pinned = true
> > > ------------------
> > > 
> > > mark page dirty, keep spte intact
> > > 
> > > called from get dirty log path.
> > > 
> > > skip_pinned = false
> > > -------------------
> > > reload remote mmu
> > > destroy pinned spte.
> > > 
> > > called from: dirty log enablement, rmap write protect (unused for pinned
> > > sptes)
> > > 
> > > 
> > > Note this behaviour is your suggestion:
> > 
> > Yes, I remember that and I thought we will not need this skip_pinned
> > at all.  For rmap write protect case there shouldn't be any pinned pages,
> > but why dirty log enablement sets skip_pinned to false? Why not mark
> > pinned pages as dirty just like you do in get dirty log path?
> 
> Because if its a large spte, it must be nuked (or marked read-only,
> which for pinned sptes, is not possible).
> 
If a large page has one small page pinned inside it, its spte will
be marked as pinned, correct?  We did nuke large ptes here until very
recently (c126d94f2c90ed9d), but we cannot drop a pte here anyway without
kicking all vcpus out of guest mode. Do you really need the additional
skip_pinned parameter?  Why not check whether the spte is large instead?

So why not have a per-slot pinned page list (Xiao suggested the same) and do:

spte_write_protect() {
  if (is_pinned(spte)) {
    if (large(spte))
       // cannot drop while vcpus are running
       mmu_reload_pinned_vcpus();
    else
       return false;
  }
}

get_dirty_log() {
  for_each(pinned page i)
    mark_dirty(i);
}

--
			Gleb.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-10-10 13:09                   ` Gleb Natapov
@ 2014-10-13  8:52                     ` Marcelo Tosatti
  2014-10-15  8:03                       ` Gleb Natapov
  0 siblings, 1 reply; 30+ messages in thread
From: Marcelo Tosatti @ 2014-10-13  8:52 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Fri, Oct 10, 2014 at 04:09:29PM +0300, Gleb Natapov wrote:
> On Wed, Oct 08, 2014 at 04:22:31PM -0300, Marcelo Tosatti wrote:
> > > > 
> > > > Argh, lets try again:
> > > > 
> > > > skip_pinned = true
> > > > ------------------
> > > > 
> > > > mark page dirty, keep spte intact
> > > > 
> > > > called from get dirty log path.
> > > > 
> > > > skip_pinned = false
> > > > -------------------
> > > > reload remote mmu
> > > > destroy pinned spte.
> > > > 
> > > > called from: dirty log enablement, rmap write protect (unused for pinned
> > > > sptes)
> > > > 
> > > > 
> > > > Note this behaviour is your suggestion:
> > > 
> > > Yes, I remember that and I thought we will not need this skip_pinned
> > > at all.  For rmap write protect case there shouldn't be any pinned pages,
> > > but why dirty log enablement sets skip_pinned to false? Why not mark
> > > pinned pages as dirty just like you do in get dirty log path?
> > 
> > Because if its a large spte, it must be nuked (or marked read-only,
> > which for pinned sptes, is not possible).
> > 
> If a large page has one small page pinned inside it its spte will
> be marked as pinned, correct?  

Correct.

> We did nuke large ptes here until very
> recently: c126d94f2c90ed9d, but we cannot drop a pte here anyway without
> kicking all vcpu from a guest mode, but do you need additional skip_pinned
> parameter?  Why not check if spte is large instead?

Nuke it only if a large spte is found? Can do that instead.

> So why not have per slot pinned page list (Xiao suggested the same) and do:

The interface is per-vcpu (that is, registration of pinned pages is
performed on a per-vcpu basis).

> spte_write_protect() {
>   if (is_pinned(spte) {
>     if (large(spte))
>        // cannot drop while vcpu are running
>        mmu_reload_pinned_vcpus();
>     else
>        return false;
> }
>     
> 
> get_dirty_log() {
>  for_each(pinned pages i)
>     makr_dirty(i);
> }

That is effectively the same as what this patchset does, except that the
spte pinned bit is checked at spte_write_protect instead of looping over a
pinned page list. I fail to see a huge advantage there.

I'll drop the skip_pinned parameter and use an is_large_pte check instead.

Thanks
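
The reworked check could then look roughly like the sketch below (an
assumption about the follow-up, since the updated patch is not part of this
thread; is_pinned_spte and mmu_reload_pinned_vcpus are the series' helpers):

	if (is_pinned_spte(spte)) {
		if (is_large_pte(spte)) {
			/* A large pinned spte cannot be kept intact while
			 * write protecting, so force vcpus out of guest mode;
			 * they rebuild and re-pin on the next entry.
			 */
			mmu_reload_pinned_vcpus(kvm);
		} else {
			/* Keep the small pinned spte intact and report its
			 * gfn as dirty instead of write protecting it.
			 */
			struct kvm_mmu_page *sp = page_header(__pa(sptep));
			gfn_t gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);

			mark_page_dirty(kvm, gfn);
			return false;
		}
	}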


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path
  2014-10-13  8:52                     ` Marcelo Tosatti
@ 2014-10-15  8:03                       ` Gleb Natapov
  0 siblings, 0 replies; 30+ messages in thread
From: Gleb Natapov @ 2014-10-15  8:03 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, ak, pbonzini, xiaoguangrong, avi.kivity

On Mon, Oct 13, 2014 at 05:52:38AM -0300, Marcelo Tosatti wrote:
> On Fri, Oct 10, 2014 at 04:09:29PM +0300, Gleb Natapov wrote:
> > On Wed, Oct 08, 2014 at 04:22:31PM -0300, Marcelo Tosatti wrote:
> > > > > 
> > > > > Argh, lets try again:
> > > > > 
> > > > > skip_pinned = true
> > > > > ------------------
> > > > > 
> > > > > mark page dirty, keep spte intact
> > > > > 
> > > > > called from get dirty log path.
> > > > > 
> > > > > skip_pinned = false
> > > > > -------------------
> > > > > reload remote mmu
> > > > > destroy pinned spte.
> > > > > 
> > > > > called from: dirty log enablement, rmap write protect (unused for pinned
> > > > > sptes)
> > > > > 
> > > > > 
> > > > > Note this behaviour is your suggestion:
> > > > 
> > > > Yes, I remember that and I thought we will not need this skip_pinned
> > > > at all.  For rmap write protect case there shouldn't be any pinned pages,
> > > > but why dirty log enablement sets skip_pinned to false? Why not mark
> > > > pinned pages as dirty just like you do in get dirty log path?
> > > 
> > > Because if its a large spte, it must be nuked (or marked read-only,
> > > which for pinned sptes, is not possible).
> > > 
> > If a large page has one small page pinned inside it its spte will
> > be marked as pinned, correct?  
> 
> Correct.
> 
> > We did nuke large ptes here until very
> > recently: c126d94f2c90ed9d, but we cannot drop a pte here anyway without
> > kicking all vcpu from a guest mode, but do you need additional skip_pinned
> > parameter?  Why not check if spte is large instead?
> 
> Nuke only if large spte is found? Can do that, instead.
> 
> > So why not have per slot pinned page list (Xiao suggested the same) and do:
> 
> The interface is per-vcpu (that is registration of pinned pages is
> performed on a per-vcpu basis).
> 
PEBS is per-cpu, but that does not mean that the pinning has to be per-vcpu;
it can be done globally with reference counting.
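
A global, refcounted variant could be as small as the sketch below (purely
illustrative; nothing like this was posted in the thread):

	/* Illustrative only: one shared entry per pinned gfn. */
	struct pinned_gfn {
		gfn_t gfn;
		int refcount;		/* number of vcpus that pinned this gfn */
		struct list_head link;	/* on a per-memslot (or per-kvm) list
					 * that get_dirty_log() can walk
					 * directly */
	};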

> > spte_write_protect() {
> >   if (is_pinned(spte) {
> >     if (large(spte))
> >        // cannot drop while vcpu are running
> >        mmu_reload_pinned_vcpus();
> >     else
> >        return false;
> > }
> >     
> > 
> > get_dirty_log() {
> >  for_each(pinned pages i)
> >     makr_dirty(i);
> > }
> 
> That is effectively the same this patchset does, except that the spte
> pinned bit is checked at spte_write_protect, instead of looping over 
> page pinned list. Fail to see huge advantage there.
>
I think spte_write_protect is a strange place to mark pages dirty, but
otherwise, yes, the effect is the same, so definitely not a huge difference.
If a global pinned list is a PITA in your opinion, leave it as is.

> I'll drop the skip_pinned parameter and use is_large_pte check instead.
> 
Thanks,

--
			Gleb.

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread

Thread overview: 30+ messages
2014-07-09 19:12 [patch 0/4] KVM: support for pinning sptes (v2) mtosatti
2014-07-09 19:12 ` [patch 1/4] KVM: x86: add pinned parameter to page_fault methods mtosatti
2014-07-09 19:12 ` [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only) mtosatti
2014-07-17 17:18   ` Nadav Amit
2014-07-17 21:38     ` Marcelo Tosatti
2014-07-24 12:16       ` Nadav Amit
2014-07-21 21:46   ` Xiao Guangrong
2014-07-22  5:26     ` Xiao Guangrong
2014-07-09 19:12 ` [patch 3/4] KVM: MMU: reload request from GET_DIRTY_LOG path mtosatti
2014-07-21 13:14   ` Gleb Natapov
2014-09-09 15:28     ` Marcelo Tosatti
2014-09-22 17:19       ` Marcelo Tosatti
2014-09-30 18:59         ` Marcelo Tosatti
2014-10-04  7:23       ` Gleb Natapov
2014-10-06 17:19         ` Marcelo Tosatti
2014-10-08  6:56           ` Gleb Natapov
2014-10-08 17:15             ` Marcelo Tosatti
2014-10-08 17:59               ` Gleb Natapov
2014-10-08 19:22                 ` Marcelo Tosatti
2014-10-10 13:09                   ` Gleb Natapov
2014-10-13  8:52                     ` Marcelo Tosatti
2014-10-15  8:03                       ` Gleb Natapov
2014-07-21 21:55   ` Xiao Guangrong
2014-09-09 15:35     ` Marcelo Tosatti
2014-07-09 19:12 ` [patch 4/4] KVM: MMU: pinned sps are not candidates for deletion mtosatti
2014-07-21 21:59   ` Xiao Guangrong
2014-09-09 15:41     ` Marcelo Tosatti
2014-09-22 17:17       ` Marcelo Tosatti
2014-09-23  5:30       ` Xiao Guangrong
2014-07-09 19:20 ` [patch 0/4] KVM: support for pinning sptes (v2) Marcelo Tosatti
