* [PATCH v2 1/5] KVM: Add APIs for unlocked TLB flush
@ 2012-05-17 10:24 Avi Kivity
From: Avi Kivity @ 2012-05-17 10:24 UTC (permalink / raw)
  To: Marcelo Tosatti, Xiao Guangrong, Takuya Yoshikawa, kvm

Currently we flush the TLB while holding mmu_lock.  This
increases the lock hold time by the IPI round-trip time, increasing
contention, and makes dropping the lock (for latency reasons) harder.

This patch changes TLB management to be usable locklessly, introducing
the following APIs:

  kvm_mark_tlb_dirty() - mark the TLB as containing stale entries
  kvm_cond_flush_remote_tlbs() - flush the TLB if it was marked as
                                  dirty

These APIs can be used without holding mmu_lock (though if the TLB
became stale due to shadow page table modifications, typically it
will need to be called with the lock held to prevent other threads
from seeing the modified page tables with the TLB unmarked and unflushed).
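
For illustration, a converted call site (patch 2 converts the real ones)
would look roughly like this; the predicate below is hypothetical:

	spin_lock(&kvm->mmu_lock);
	if (spte_was_zapped)			/* hypothetical condition */
		kvm_mark_tlb_dirty(kvm);	/* bumps tlb_state.dirtied_count */
	kvm_cond_flush_remote_tlbs(kvm);	/* IPIs only if dirtied_count != flushed_count */
	spin_unlock(&kvm->mmu_lock);

Later patches in the series move the kvm_cond_flush_remote_tlbs() call
outside the lock where that is safe.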

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/locking.txt |   14 ++++++++++++++
 arch/x86/kvm/paging_tmpl.h            |    4 ++--
 include/linux/kvm_host.h              |   22 +++++++++++++++++++++-
 virt/kvm/kvm_main.c                   |   29 ++++++++++++++++++++---------
 4 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 3b4cd3b..f6c90479 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -23,3 +23,17 @@ Arch:		x86
 Protects:	- kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset}
 		- tsc offset in vmcb
 Comment:	'raw' because updating the tsc offsets must not be preempted.
+
+3. TLB control
+--------------
+
+The following APIs should be used for TLB control:
+
+ - kvm_mark_tlb_dirty() - indicates that the TLB is out of sync wrt
+      either guest or host page tables.
+ - kvm_flush_remote_tlbs() - unconditionally flush the tlbs
+ - kvm_cond_flush_remote_tlbs() - flush the TLBs if previously marked
+
+These may be used without mmu_lock, though kvm_mark_tlb_dirty() needs to be
+used while holding mmu_lock if it is called due to host page table changes
+(in contrast to guest page table changes).
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 34f9709..97e2a81 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -793,7 +793,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 			return -EINVAL;
 
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
-			vcpu->kvm->tlbs_dirty++;
+			kvm_mark_tlb_dirty(vcpu->kvm);
 			continue;
 		}
 
@@ -806,7 +806,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 		if (gfn != sp->gfns[i]) {
 			drop_spte(vcpu->kvm, &sp->spt[i]);
-			vcpu->kvm->tlbs_dirty++;
+			kvm_mark_tlb_dirty(vcpu->kvm);
 			continue;
 		}
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cae342d..7ed1c4f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -310,7 +310,14 @@ struct kvm {
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
 #endif
-	long tlbs_dirty;
+	struct {
+		/*
+		 * When these two are different, a TLB somewhere holds a
+		 * stale TLB entry.  Clean with kvm_[cond_]flush_remote_tlbs().
+		 */
+		atomic_long_t dirtied_count;
+		atomic_long_t flushed_count;
+	} tlb_state;
 };
 
 /* The guest did something we don't support. */
@@ -468,6 +475,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
+void kvm_cond_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
@@ -889,5 +897,17 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
 	}
 }
 
+/*
+ * Mark the TLB as dirty, for kvm_cond_flush_remote_tlbs().
+ */
+static inline void kvm_mark_tlb_dirty(struct kvm *kvm)
+{
+	/*
+	 * Make any changes to the page tables visible to remote flushers.
+	 */
+	smp_mb__before_atomic_inc();
+	atomic_long_inc(&kvm->tlb_state.dirtied_count);
+}
+
 #endif
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7e14068..4113a36 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -203,12 +203,21 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-	long dirty_count = kvm->tlbs_dirty;
+	long dirty_count = atomic_long_read(&kvm->tlb_state.dirtied_count);
+	long flushed_count = atomic_long_read(&kvm->tlb_state.flushed_count);
 
 	smp_mb();
 	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.remote_tlb_flush;
-	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
+	atomic_long_cmpxchg(&kvm->tlb_state.flushed_count,
+			    flushed_count, dirty_count);
+}
+
+void kvm_cond_flush_remote_tlbs(struct kvm *kvm)
+{
+	if (atomic_long_read(&kvm->tlb_state.dirtied_count)
+	    != atomic_long_read(&kvm->tlb_state.flushed_count))
+		kvm_flush_remote_tlbs(kvm);
 }
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
@@ -267,7 +276,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 					     unsigned long address)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int need_tlb_flush, idx;
+	int idx;
 
 	/*
 	 * When ->invalidate_page runs, the linux pte has been zapped
@@ -291,10 +300,10 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 	spin_lock(&kvm->mmu_lock);
 
 	kvm->mmu_notifier_seq++;
-	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
+	if (kvm_unmap_hva(kvm, address))
+		kvm_mark_tlb_dirty(kvm);
 	/* we've to flush the tlb before the pages can be freed */
-	if (need_tlb_flush)
-		kvm_flush_remote_tlbs(kvm);
+	kvm_cond_flush_remote_tlbs(kvm);
 
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
@@ -334,10 +343,12 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	kvm->mmu_notifier_count++;
 	for (; start < end; start += PAGE_SIZE)
 		need_tlb_flush |= kvm_unmap_hva(kvm, start);
-	need_tlb_flush |= kvm->tlbs_dirty;
-	/* we've to flush the tlb before the pages can be freed */
+
 	if (need_tlb_flush)
-		kvm_flush_remote_tlbs(kvm);
+		kvm_mark_tlb_dirty(kvm);
+
+	/* we've to flush the tlb before the pages can be freed */
+	kvm_cond_flush_remote_tlbs(kvm);
 
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
-- 
1.7.10.1



* [PATCH v2 2/5] KVM: MMU: Convert remote flushes to kvm_mark_tlb_dirty() and a conditional flush
From: Avi Kivity @ 2012-05-17 10:24 UTC (permalink / raw)
  To: Marcelo Tosatti, Xiao Guangrong, Takuya Yoshikawa, kvm

This allows us to later move the actual flush out of protection of the
mmu spinlock, provided there are no additional dependencies.

Constructs of the form

    if (pred)
        kvm_flush_remote_tlbs(kvm)

are converted to

    if (pred)
        kvm_mark_tlb_dirty(kvm)

    kvm_cond_flush_remote_tlbs(kvm)

so that if another thread caused pred to transition from true to false,
but has not yet flushed the tlb, we notice it and flush it before proceeding.
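
From the second thread's point of view this looks roughly as follows
(sketch only; the predicate is hypothetical):

	spin_lock(&kvm->mmu_lock);
	if (spte_is_present(sptep))		/* false: the first thread already zapped it */
		kvm_mark_tlb_dirty(kvm);
	kvm_cond_flush_remote_tlbs(kvm);	/* still flushes the first thread's pending mark */
	spin_unlock(&kvm->mmu_lock);

The first thread marked the TLB dirty before dropping mmu_lock, so
dirtied_count != flushed_count and the conditional flush fires even though
this thread's own predicate is false.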

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c   |    6 ++++--
 arch/x86/kvm/mmu.c         |   45 +++++++++++++++++++++++++++-----------------
 arch/x86/kvm/paging_tmpl.h |    4 +++-
 arch/x86/kvm/x86.c         |    4 +++-
 virt/kvm/kvm_main.c        |    4 +++-
 5 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index bd77cb5..a4a92d8 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1628,7 +1628,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 void kvm_arch_flush_shadow(struct kvm *kvm)
 {
-	kvm_flush_remote_tlbs(kvm);
+	kvm_mark_tlb_dirty(kvm);
+	kvm_cond_flush_remote_tlbs(kvm);
 }
 
 long kvm_arch_dev_ioctl(struct file *filp,
@@ -1853,10 +1854,11 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
-		kvm_flush_remote_tlbs(kvm);
+		kvm_mark_tlb_dirty(kvm);
 		n = kvm_dirty_bitmap_bytes(memslot);
 		memset(memslot->dirty_bitmap, 0, n);
 	}
+	kvm_cond_flush_remote_tlbs(kvm);
 	r = 0;
 out:
 	mutex_unlock(&kvm->slots_lock);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 07424cf..03a9c80 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1170,7 +1170,9 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	}
 
 	if (need_flush)
-		kvm_flush_remote_tlbs(kvm);
+		kvm_mark_tlb_dirty(kvm);
+
+	kvm_cond_flush_remote_tlbs(kvm);
 
 	return 0;
 }
@@ -1294,7 +1296,8 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
 	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
-	kvm_flush_remote_tlbs(vcpu->kvm);
+	kvm_mark_tlb_dirty(vcpu->kvm);
+	kvm_cond_flush_remote_tlbs(vcpu->kvm);
 }
 
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
@@ -1697,7 +1700,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
 
 		if (protected)
-			kvm_flush_remote_tlbs(vcpu->kvm);
+			kvm_mark_tlb_dirty(vcpu->kvm);
+
+		kvm_cond_flush_remote_tlbs(vcpu->kvm);
 
 		for_each_sp(pages, sp, parents, i) {
 			kvm_sync_page(vcpu, sp, &invalid_list);
@@ -1786,7 +1791,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
 	if (!direct) {
 		if (rmap_write_protect(vcpu->kvm, gfn))
-			kvm_flush_remote_tlbs(vcpu->kvm);
+			kvm_mark_tlb_dirty(vcpu->kvm);
+		kvm_cond_flush_remote_tlbs(vcpu->kvm);
 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
 			kvm_sync_pages(vcpu, gfn);
 
@@ -1861,7 +1867,8 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 	if (is_large_pte(*sptep)) {
 		drop_spte(vcpu->kvm, sptep);
 		--vcpu->kvm->stat.lpages;
-		kvm_flush_remote_tlbs(vcpu->kvm);
+		kvm_mark_tlb_dirty(vcpu->kvm);
+		kvm_cond_flush_remote_tlbs(vcpu->kvm);
 	}
 }
 
@@ -1883,7 +1890,8 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			return;
 
 		drop_parent_pte(child, sptep);
-		kvm_flush_remote_tlbs(vcpu->kvm);
+		kvm_mark_tlb_dirty(vcpu->kvm);
+		kvm_cond_flush_remote_tlbs(vcpu->kvm);
 	}
 }
 
@@ -2021,7 +2029,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	if (list_empty(invalid_list))
 		return;
 
-	kvm_flush_remote_tlbs(kvm);
+	kvm_mark_tlb_dirty(kvm);
+	kvm_cond_flush_remote_tlbs(kvm);
 
 	if (atomic_read(&kvm->arch.reader_counter)) {
 		kvm_mmu_isolate_pages(invalid_list);
@@ -2344,7 +2353,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	 * might be cached on a CPU's TLB.
 	 */
 	if (is_writable_pte(entry) && !is_writable_pte(*sptep))
-		kvm_flush_remote_tlbs(vcpu->kvm);
+		kvm_mark_tlb_dirty(vcpu->kvm);
+
+	kvm_cond_flush_remote_tlbs(vcpu->kvm);
 done:
 	return ret;
 }
@@ -2376,15 +2387,15 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, sptep);
-			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %llx new %llx\n",
 				 spte_to_pfn(*sptep), pfn);
 			drop_spte(vcpu->kvm, sptep);
-			kvm_flush_remote_tlbs(vcpu->kvm);
+			kvm_mark_tlb_dirty(vcpu->kvm);
 		} else
 			was_rmapped = 1;
 	}
+	kvm_cond_flush_remote_tlbs(vcpu->kvm);
 
 	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
 		      level, gfn, pfn, speculative, true,
@@ -3561,14 +3572,13 @@ static bool need_remote_flush(u64 old, u64 new)
 }
 
 static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
-				    bool remote_flush, bool local_flush)
+				    bool local_flush)
 {
 	if (zap_page)
 		return;
 
-	if (remote_flush)
-		kvm_flush_remote_tlbs(vcpu->kvm);
-	else if (local_flush)
+	kvm_cond_flush_remote_tlbs(vcpu->kvm);
+	if (local_flush)
 		kvm_mmu_flush_tlb(vcpu);
 }
 
@@ -3742,11 +3752,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			      & mask.word) && rmap_can_add(vcpu))
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 			if (!remote_flush && need_remote_flush(entry, *spte))
-				remote_flush = true;
+				kvm_mark_tlb_dirty(vcpu->kvm);
 			++spte;
 		}
 	}
-	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
+	mmu_pte_write_flush_tlb(vcpu, zap_page, local_flush);
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3927,7 +3937,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 						pt[i] & ~PT_WRITABLE_MASK);
 		}
 	}
-	kvm_flush_remote_tlbs(kvm);
+	kvm_mark_tlb_dirty(kvm);
+	kvm_cond_flush_remote_tlbs(kvm);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 97e2a81..9fe5f72 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -698,7 +698,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
 			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
-				kvm_flush_remote_tlbs(vcpu->kvm);
+				kvm_mark_tlb_dirty(vcpu->kvm);
+
+			kvm_cond_flush_remote_tlbs(vcpu->kvm);
 
 			if (!rmap_can_add(vcpu))
 				break;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2256f51..a2149d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3130,7 +3130,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 		kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
 	}
 	if (is_dirty)
-		kvm_flush_remote_tlbs(kvm);
+		kvm_mark_tlb_dirty(kvm);
+
+	kvm_cond_flush_remote_tlbs(kvm);
 
 	spin_unlock(&kvm->mmu_lock);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4113a36..585ab45 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -392,7 +392,9 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 
 	young = kvm_age_hva(kvm, address);
 	if (young)
-		kvm_flush_remote_tlbs(kvm);
+		kvm_mark_tlb_dirty(kvm);
+
+	kvm_cond_flush_remote_tlbs(kvm);
 
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
-- 
1.7.10.1



* [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock
From: Avi Kivity @ 2012-05-17 10:24 UTC (permalink / raw)
  To: Marcelo Tosatti, Xiao Guangrong, Takuya Yoshikawa, kvm

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c |   16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 585ab45..9f6d15d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -302,11 +302,11 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 	kvm->mmu_notifier_seq++;
 	if (kvm_unmap_hva(kvm, address))
 		kvm_mark_tlb_dirty(kvm);
-	/* we've to flush the tlb before the pages can be freed */
-	kvm_cond_flush_remote_tlbs(kvm);
-
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
+
+	/* we've to flush the tlb before the pages can be freed */
+	kvm_cond_flush_remote_tlbs(kvm);
 }
 
 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
@@ -347,11 +347,11 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	if (need_tlb_flush)
 		kvm_mark_tlb_dirty(kvm);
 
-	/* we've to flush the tlb before the pages can be freed */
-	kvm_cond_flush_remote_tlbs(kvm);
-
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
+
+	/* we've to flush the tlb before the pages can be freed */
+	kvm_cond_flush_remote_tlbs(kvm);
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -394,11 +394,11 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	if (young)
 		kvm_mark_tlb_dirty(kvm);
 
-	kvm_cond_flush_remote_tlbs(kvm);
-
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
 
+	kvm_cond_flush_remote_tlbs(kvm);
+
 	return young;
 }
 
-- 
1.7.10.1



* [PATCH v2 4/5] KVM: Flush TLB in FNAME(invlpg) without holding mmu_lock
From: Avi Kivity @ 2012-05-17 10:24 UTC (permalink / raw)
  To: Marcelo Tosatti, Xiao Guangrong, Takuya Yoshikawa, kvm

mmu_page_zap_pte() is modified to mark the TLB as dirty, but currently
only FNAME(invlpg) takes advantage of this.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         |    7 +++----
 arch/x86/kvm/paging_tmpl.h |    7 +++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 03a9c80..0e1faec 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1895,7 +1895,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 }
 
-static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			     u64 *spte)
 {
 	u64 pte;
@@ -1911,13 +1911,12 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
-		return true;
+		kvm_mark_tlb_dirty(kvm);
+		return;
 	}
 
 	if (is_mmio_spte(pte))
 		mmu_spte_clear_no_track(spte);
-
-	return false;
 }
 
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9fe5f72..72c9cf4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -697,10 +697,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
-			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
-				kvm_mark_tlb_dirty(vcpu->kvm);
-
-			kvm_cond_flush_remote_tlbs(vcpu->kvm);
+			mmu_page_zap_pte(vcpu->kvm, sp, sptep);
 
 			if (!rmap_can_add(vcpu))
 				break;
@@ -716,6 +713,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			break;
 	}
 	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	kvm_cond_flush_remote_tlbs(vcpu->kvm);
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
-- 
1.7.10.1



* [PATCH v2 5/5] KVM: Flush TLB in change_pte mmu notifier without holding mmu_lock
From: Avi Kivity @ 2012-05-17 10:24 UTC (permalink / raw)
  To: Marcelo Tosatti, Xiao Guangrong, Takuya Yoshikawa, kvm

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c  |    8 +-------
 virt/kvm/kvm_main.c |    2 ++
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0e1faec..85ed48a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1138,7 +1138,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
-	int need_flush = 0;
 	u64 new_spte;
 	pte_t *ptep = (pte_t *)data;
 	pfn_t new_pfn;
@@ -1150,8 +1149,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		BUG_ON(!is_shadow_present_pte(*sptep));
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
 
-		need_flush = 1;
-
 		if (pte_write(*ptep)) {
 			drop_spte(kvm, sptep);
 			sptep = rmap_get_first(*rmapp, &iter);
@@ -1167,12 +1164,9 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			mmu_spte_set(sptep, new_spte);
 			sptep = rmap_get_next(&iter);
 		}
-	}
 
-	if (need_flush)
 		kvm_mark_tlb_dirty(kvm);
-
-	kvm_cond_flush_remote_tlbs(kvm);
+	}
 
 	return 0;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9f6d15d..b840979 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -323,6 +323,8 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	kvm_set_spte_hva(kvm, address, pte);
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
+
+	kvm_cond_flush_remote_tlbs(kvm);
 }
 
 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
-- 
1.7.10.1



* Re: [PATCH v2 1/5] KVM: Add APIs for unlocked TLB flush
From: Avi Kivity @ 2012-05-17 11:51 UTC (permalink / raw)
  To: Marcelo Tosatti, Xiao Guangrong, Takuya Yoshikawa, kvm

On 05/17/2012 01:24 PM, Avi Kivity wrote:
> Currently we flush the TLB while holding mmu_lock.  This
> increases the lock hold time by the IPI round-trip time, increasing
> contention, and makes dropping the lock (for latency reasons) harder.
>
> This patch changes TLB management to be usable locklessly, introducing
> the following APIs:
>
>   kvm_mark_tlb_dirty() - mark the TLB as containing stale entries
>   kvm_cond_flush_remote_tlbs() - flush the TLB if it was marked as
>                                   dirty
>
> These APIs can be used without holding mmu_lock (though if the TLB
> became stale due to shadow page table modifications, typically it
> will need to be called with the lock held to prevent other threads
> from seeing the modified page tables with the TLB unmarked and unflushed).
>


Oops, forgot the cover letter and the changelog.  I'm experimenting with
posting directly from git send-email.

The change is the new patch 2, which tries to ensure we never bypass a
tlb flush.  Perhaps a better way to do it is to move the
kvm_cond_flush_remote_tlbs() _before_ the code that depends on the flush,
instead of after the code that changes the spte.

Please review carefully.

-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH v2 2/5] KVM: MMU: Convert remote flushes to kvm_mark_tlb_dirty() and a conditional flush
From: Marcelo Tosatti @ 2012-05-21 20:13 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Xiao Guangrong, Takuya Yoshikawa, kvm

On Thu, May 17, 2012 at 01:24:41PM +0300, Avi Kivity wrote:
> This allows us to later move the actual flush out of protection of the
> mmu spinlock, provided there are no additional dependencies.
> 
> Constructs of the form
> 
>     if (pred)
>         kvm_flush_remote_tlbs(kvm)
> 
> are converted to
> 
>     if (pred)
>         kvm_mark_tlb_dirty(kvm)
> 
>     kvm_cond_flush_remote_tlbs(kvm)
> 
> so that if another thread caused pred to transition from true to false,
> but has not yet flushed the tlb, we notice it and flush it before proceeding.
> 
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>  arch/ia64/kvm/kvm-ia64.c   |    6 ++++--
>  arch/x86/kvm/mmu.c         |   45 +++++++++++++++++++++++++++-----------------
>  arch/x86/kvm/paging_tmpl.h |    4 +++-
>  arch/x86/kvm/x86.c         |    4 +++-
>  virt/kvm/kvm_main.c        |    4 +++-
>  5 files changed, 41 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
> index bd77cb5..a4a92d8 100644
> --- a/arch/ia64/kvm/kvm-ia64.c
> +++ b/arch/ia64/kvm/kvm-ia64.c
> @@ -1628,7 +1628,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
>  
>  void kvm_arch_flush_shadow(struct kvm *kvm)
>  {
> -	kvm_flush_remote_tlbs(kvm);
> +	kvm_mark_tlb_dirty(kvm);
> +	kvm_cond_flush_remote_tlbs(kvm);
>  }
>  
>  long kvm_arch_dev_ioctl(struct file *filp,
> @@ -1853,10 +1854,11 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
>  
>  	/* If nothing is dirty, don't bother messing with page tables. */
>  	if (is_dirty) {
> -		kvm_flush_remote_tlbs(kvm);
> +		kvm_mark_tlb_dirty(kvm);
>  		n = kvm_dirty_bitmap_bytes(memslot);
>  		memset(memslot->dirty_bitmap, 0, n);
>  	}
> +	kvm_cond_flush_remote_tlbs(kvm);
>  	r = 0;
>  out:
>  	mutex_unlock(&kvm->slots_lock);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 07424cf..03a9c80 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1170,7 +1170,9 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
>  	}
>  
>  	if (need_flush)
> -		kvm_flush_remote_tlbs(kvm);
> +		kvm_mark_tlb_dirty(kvm);
> +
> +	kvm_cond_flush_remote_tlbs(kvm);
>  
>  	return 0;
>  }
> @@ -1294,7 +1296,8 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
>  	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
>  
>  	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
> -	kvm_flush_remote_tlbs(vcpu->kvm);
> +	kvm_mark_tlb_dirty(vcpu->kvm);
> +	kvm_cond_flush_remote_tlbs(vcpu->kvm);
>  }
>  
>  int kvm_age_hva(struct kvm *kvm, unsigned long hva)
> @@ -1697,7 +1700,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
>  			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
>  
>  		if (protected)
> -			kvm_flush_remote_tlbs(vcpu->kvm);
> +			kvm_mark_tlb_dirty(vcpu->kvm);
> +
> +		kvm_cond_flush_remote_tlbs(vcpu->kvm);
>  
>  		for_each_sp(pages, sp, parents, i) {
>  			kvm_sync_page(vcpu, sp, &invalid_list);
> @@ -1786,7 +1791,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
>  	if (!direct) {
>  		if (rmap_write_protect(vcpu->kvm, gfn))
> -			kvm_flush_remote_tlbs(vcpu->kvm);
> +			kvm_mark_tlb_dirty(vcpu->kvm);
> +		kvm_cond_flush_remote_tlbs(vcpu->kvm);
>  		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
>  			kvm_sync_pages(vcpu, gfn);
>  
> @@ -1861,7 +1867,8 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
>  	if (is_large_pte(*sptep)) {
>  		drop_spte(vcpu->kvm, sptep);
>  		--vcpu->kvm->stat.lpages;
> -		kvm_flush_remote_tlbs(vcpu->kvm);
> +		kvm_mark_tlb_dirty(vcpu->kvm);
> +		kvm_cond_flush_remote_tlbs(vcpu->kvm);
>  	}
>  }
>  
> @@ -1883,7 +1890,8 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  			return;
>  
>  		drop_parent_pte(child, sptep);
> -		kvm_flush_remote_tlbs(vcpu->kvm);
> +		kvm_mark_tlb_dirty(vcpu->kvm);
> +		kvm_cond_flush_remote_tlbs(vcpu->kvm);
>  	}
>  }
>  
> @@ -2021,7 +2029,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>  	if (list_empty(invalid_list))
>  		return;
>  
> -	kvm_flush_remote_tlbs(kvm);
> +	kvm_mark_tlb_dirty(kvm);
> +	kvm_cond_flush_remote_tlbs(kvm);
>  
>  	if (atomic_read(&kvm->arch.reader_counter)) {
>  		kvm_mmu_isolate_pages(invalid_list);
> @@ -2344,7 +2353,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  	 * might be cached on a CPU's TLB.
>  	 */
>  	if (is_writable_pte(entry) && !is_writable_pte(*sptep))
> -		kvm_flush_remote_tlbs(vcpu->kvm);
> +		kvm_mark_tlb_dirty(vcpu->kvm);
> +
> +	kvm_cond_flush_remote_tlbs(vcpu->kvm);
>  done:
>  	return ret;
>  }
> @@ -2376,15 +2387,15 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  
>  			child = page_header(pte & PT64_BASE_ADDR_MASK);
>  			drop_parent_pte(child, sptep);
> -			kvm_flush_remote_tlbs(vcpu->kvm);
>  		} else if (pfn != spte_to_pfn(*sptep)) {
>  			pgprintk("hfn old %llx new %llx\n",
>  				 spte_to_pfn(*sptep), pfn);
>  			drop_spte(vcpu->kvm, sptep);
> -			kvm_flush_remote_tlbs(vcpu->kvm);
> +			kvm_mark_tlb_dirty(vcpu->kvm);
>  		} else
>  			was_rmapped = 1;
>  	}
> +	kvm_cond_flush_remote_tlbs(vcpu->kvm);
>  
>  	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
>  		      level, gfn, pfn, speculative, true,
> @@ -3561,14 +3572,13 @@ static bool need_remote_flush(u64 old, u64 new)
>  }
>  
>  static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
> -				    bool remote_flush, bool local_flush)
> +				    bool local_flush)
>  {
>  	if (zap_page)
>  		return;
>  
> -	if (remote_flush)
> -		kvm_flush_remote_tlbs(vcpu->kvm);
> -	else if (local_flush)
> +	kvm_cond_flush_remote_tlbs(vcpu->kvm);
> +	if (local_flush)
>  		kvm_mmu_flush_tlb(vcpu);
>  }
>  
> @@ -3742,11 +3752,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
>  			      & mask.word) && rmap_can_add(vcpu))
>  				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
>  			if (!remote_flush && need_remote_flush(entry, *spte))
> -				remote_flush = true;
> +				kvm_mark_tlb_dirty(vcpu->kvm);


A remote CPU can flush the TLBs, marking the TLB as synced.
But the "remote_flush" variable remains true, so for further spte updates
the TLB is not marked dirty.

Better kill the remote_flush variable (still reviewing).
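
Something like this, perhaps (rough sketch, not a posted patch):

			if (need_remote_flush(entry, *spte))
				kvm_mark_tlb_dirty(vcpu->kvm);

i.e. drop the remote_flush local entirely and mark the TLB dirty for every
spte that needs it, so a flush done by another context cannot hide later
updates.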



* Re: [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock
From: Marcelo Tosatti @ 2012-05-21 20:58 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Xiao Guangrong, Takuya Yoshikawa, kvm

On Thu, May 17, 2012 at 01:24:42PM +0300, Avi Kivity wrote:
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>  virt/kvm/kvm_main.c |   16 ++++++++--------
>  1 file changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 585ab45..9f6d15d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -302,11 +302,11 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
>  	kvm->mmu_notifier_seq++;
>  	if (kvm_unmap_hva(kvm, address))
>  		kvm_mark_tlb_dirty(kvm);
> -	/* we've to flush the tlb before the pages can be freed */
> -	kvm_cond_flush_remote_tlbs(kvm);
> -
>  	spin_unlock(&kvm->mmu_lock);
>  	srcu_read_unlock(&kvm->srcu, idx);
> +
> +	/* we've to flush the tlb before the pages can be freed */
> +	kvm_cond_flush_remote_tlbs(kvm);
>  }

There are still sites that assumed implicitly that acquiring mmu_lock
guarantees that sptes and remote TLBs are in sync. Example:

void kvm_mmu_zap_all(struct kvm *kvm)
{
        struct kvm_mmu_page *sp, *node;
        LIST_HEAD(invalid_list);

        spin_lock(&kvm->mmu_lock);
restart:
        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages,
link)
                if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
                        goto restart;

        kvm_mmu_commit_zap_page(kvm, &invalid_list);
        spin_unlock(&kvm->mmu_lock);
}

kvm_mmu_commit_zap_page only flushes if the TLB was dirtied by this
context, not before. kvm_mmu_unprotect_page is similar.

In general:

context 1                       context 2

lock(mmu_lock)
modify spte
mark_tlb_dirty
unlock(mmu_lock)
                                lock(mmu_lock)
                                read spte
                                make a decision based on spte value
                                unlock(mmu_lock)
flush remote TLBs


Is scary.

Perhaps have a rule that says:

1) Conditionally flush remote TLB after acquiring mmu_lock, 
before anything (even perhaps inside the lock macro).
2) Except special cases where it is clear that this is not 
necessary.
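
A rough sketch of (1) (helper name made up; whether an IPI with the
spinlock held is acceptable is part of the question):

	static void kvm_mmu_lock(struct kvm *kvm)
	{
		spin_lock(&kvm->mmu_lock);
		kvm_cond_flush_remote_tlbs(kvm);	/* sptes and remote TLBs now agree */
	}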



* Re: [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock
From: Marcelo Tosatti @ 2012-05-21 21:07 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Xiao Guangrong, Takuya Yoshikawa, kvm

On Mon, May 21, 2012 at 05:58:50PM -0300, Marcelo Tosatti wrote:
> On Thu, May 17, 2012 at 01:24:42PM +0300, Avi Kivity wrote:
> > Signed-off-by: Avi Kivity <avi@redhat.com>
> > ---
> >  virt/kvm/kvm_main.c |   16 ++++++++--------
> >  1 file changed, 8 insertions(+), 8 deletions(-)
> > 
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 585ab45..9f6d15d 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -302,11 +302,11 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
> >  	kvm->mmu_notifier_seq++;
> >  	if (kvm_unmap_hva(kvm, address))
> >  		kvm_mark_tlb_dirty(kvm);
> > -	/* we've to flush the tlb before the pages can be freed */
> > -	kvm_cond_flush_remote_tlbs(kvm);
> > -
> >  	spin_unlock(&kvm->mmu_lock);
> >  	srcu_read_unlock(&kvm->srcu, idx);
> > +
> > +	/* we've to flush the tlb before the pages can be freed */
> > +	kvm_cond_flush_remote_tlbs(kvm);
> >  }
> 
> There are still sites that assumed implicitly that acquiring mmu_lock
> guarantees that sptes and remote TLBs are in sync. Example:
> 
> void kvm_mmu_zap_all(struct kvm *kvm)
> {
>         struct kvm_mmu_page *sp, *node;
>         LIST_HEAD(invalid_list);
> 
>         spin_lock(&kvm->mmu_lock);
> restart:
>         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages,
> link)
>                 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
>                         goto restart;
> 
>         kvm_mmu_commit_zap_page(kvm, &invalid_list);
>         spin_unlock(&kvm->mmu_lock);
> }
> 
> kvm_mmu_commit_zap_page only flushes if the TLB was dirtied by this
> context, not before. kvm_mmu_unprotect_page is similar.

It's tempting to say "let's fix kvm_mmu_commit_zap_page by always
conditionally flushing and be done with it".

But some variation of what's suggested below is safer (no need to think
about all this every time something touches mmu_lock).

> In general:
> 
> context 1                       context 2
> 
> lock(mmu_lock)
> modify spte
> mark_tlb_dirty
> unlock(mmu_lock)
>                                 lock(mmu_lock)
>                                 read spte
>                                 make a decision based on spte value
>                                 unlock(mmu_lock)
> flush remote TLBs
> 
> 
> Is scary.
> 
> Perhaps have a rule that says:
> 
> 1) Conditionally flush remote TLB after acquiring mmu_lock, 
> before anything (even perhaps inside the lock macro).
> 2) Except special cases where it is clear that this is not 
> necessary.



* Re: [PATCH v2 2/5] KVM: MMU: Convert remote flushes to kvm_mark_tlb_dirty() and a conditional flush
From: Takuya Yoshikawa @ 2012-05-22 14:46 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Xiao Guangrong, Takuya Yoshikawa, kvm

On Thu, 17 May 2012 13:24:41 +0300
Avi Kivity <avi@redhat.com> wrote:

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 2256f51..a2149d8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3130,7 +3130,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
>  		kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
>  	}
>  	if (is_dirty)
> -		kvm_flush_remote_tlbs(kvm);
> +		kvm_mark_tlb_dirty(kvm);
> +
> +	kvm_cond_flush_remote_tlbs(kvm);
>  
>  	spin_unlock(&kvm->mmu_lock);

Any reason not to move this flush outside of the mmu_lock in this
patch series?

Unlike other rmap write protections, this one seems to be simple.

	Takuya


* Re: [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock
From: Avi Kivity @ 2012-07-02 12:05 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Takuya Yoshikawa, kvm

Revisiting after hiatus.

On 05/21/2012 11:58 PM, Marcelo Tosatti wrote:
> On Thu, May 17, 2012 at 01:24:42PM +0300, Avi Kivity wrote:
>> Signed-off-by: Avi Kivity <avi@redhat.com>
>> ---
>>  virt/kvm/kvm_main.c |   16 ++++++++--------
>>  1 file changed, 8 insertions(+), 8 deletions(-)
>> 
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 585ab45..9f6d15d 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -302,11 +302,11 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
>>  	kvm->mmu_notifier_seq++;
>>  	if (kvm_unmap_hva(kvm, address))
>>  		kvm_mark_tlb_dirty(kvm);
>> -	/* we've to flush the tlb before the pages can be freed */
>> -	kvm_cond_flush_remote_tlbs(kvm);
>> -
>>  	spin_unlock(&kvm->mmu_lock);
>>  	srcu_read_unlock(&kvm->srcu, idx);
>> +
>> +	/* we've to flush the tlb before the pages can be freed */
>> +	kvm_cond_flush_remote_tlbs(kvm);
>>  }
> 
> There are still sites that assumed implicitly that acquiring mmu_lock
> guarantees that sptes and remote TLBs are in sync. Example:
> 
> void kvm_mmu_zap_all(struct kvm *kvm)
> {
>         struct kvm_mmu_page *sp, *node;
>         LIST_HEAD(invalid_list);
> 
>         spin_lock(&kvm->mmu_lock);
> restart:
>         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages,
> link)
>                 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
>                         goto restart;
> 
>         kvm_mmu_commit_zap_page(kvm, &invalid_list);
>         spin_unlock(&kvm->mmu_lock);
> }
> 
> kvm_mmu_commit_zap_page only flushes if the TLB was dirtied by this
> context, not before. kvm_mmu_unprotect_page is similar.
> 
> In general:
> 
> context 1                       context 2
> 
> lock(mmu_lock)
> modify spte
> mark_tlb_dirty
> unlock(mmu_lock)
>                                 lock(mmu_lock)
>                                 read spte
>                                 make a decision based on spte value
>                                 unlock(mmu_lock)
> flush remote TLBs
> 
> 
> Is scary.

It scares me too.  Could be something trivial like following an
intermediate paging structure entry or not, depending on whether it is
present.  I don't think we'll be able to audit the entire code base to
ensure nothing like that happens, or to enforce it later on.

> Perhaps have a rule that says:
> 
> 1) Conditionally flush remote TLB after acquiring mmu_lock, 
> before anything (even perhaps inside the lock macro).

I would like to avoid any flush within the lock.  We may back down on
this goal but let's try to find something that works and doesn't depend
on huge audits.

> 2) Except special cases where it is clear that this is not 
> necessary.

One option: your idea, but without taking the lock

  def mmu_begin():
      clean = False
      while not clean:
          cond_flush the tlb
          rtlb = kvm.remote_tlb_counter # atomically wrt flush
          spin_lock(mmu_lock)
          if rtlb != kvm.remote_tlb_counter:
              # raced with a new dirtying; drop the lock and retry
              spin_unlock(mmu_lock)
          else:
              clean = True

Since we're spinning over the tlb counter, there's no real advantage
here except that preemption is enabled.

A simpler option is to make mmu_end() do a cond_flush():

   def mmu_begin():
       spin_lock(mmu_lock)

   def mmu_end():
       spin_unlock(mmu_lock)
       cond_flush
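
In C, something like (names made up):

   static void kvm_mmu_begin(struct kvm *kvm)
   {
	spin_lock(&kvm->mmu_lock);
   }

   static void kvm_mmu_end(struct kvm *kvm)
   {
	spin_unlock(&kvm->mmu_lock);
	kvm_cond_flush_remote_tlbs(kvm);
   }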

We need something for lockbreaking too:

   def mmu_lockbreak():
       if not (contended or need_resched):
           return False
       remember flush counter
       cond_resched_lock
       return flush counter changed

The caller would check the return value to see if it needs to redo
anything.  But this has the danger of long operations (like write
protecting a slot) never completing.

-- 
error compiling committee.c: too many arguments to function




* Re: [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock
From: Avi Kivity @ 2012-07-02 12:41 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, Takuya Yoshikawa, kvm

On 07/02/2012 03:05 PM, Avi Kivity wrote:
> We need something for lockbreaking too:
> 
>    def mmu_lockbreak():
>        if not (contended or need_resched):
>            return False
>        remember flush counter
>        cond_resched_lock
>        return flush counter changed
> 
> The caller would check the return value to see if it needs to redo
> anything.  But this has the danger of long operations (like write
> protecting a slot) never completing.

In fact I don't think we need the return value.  All long running
operations can tolerate a lock break.

kvm_mmu_change_mmu_pages: so long as the counter is maintained
correctly, it will function.  May livelock though, so we should abort if
we don't manage to keep the counter down.

get_dirty_log: so long as a write fault updates the next bitmap, we're fine

kvm_mmu_slot_remove_write_access: same.  It's hard to continue the loop
after a lockbreak though.  We can switch it to be rmap based instead.

flush_shadow (zap_all): just restart from the beginning after dropping
the lock.  May livelock, can be fixed by using a generation counter for
kvm_mmu_page.

kvm_mmu_sync_roots: already does lockbreaking

kvm_mmu_unprotect_page: not long-running in normal operation, but a
guest can make it long running.  However, we're allowed not to be
accurate about it and just return to the guest.

kvm_mmu_pte_write: can be a long operation with crazy guests. Normal
guests will work fine.

-- 
error compiling committee.c: too many arguments to function




* Re: [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock
From: Takuya Yoshikawa @ 2012-07-02 14:09 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Xiao Guangrong, Takuya Yoshikawa, kvm

On Mon, 02 Jul 2012 15:41:30 +0300
Avi Kivity <avi@redhat.com> wrote:

> kvm_mmu_slot_remove_write_access: same.  It's hard to continue the loop
> after a lockbreak though.  We can switch it to be rmap based instead.

Switching to rmap based protection was on my queue before, but I wanted
to do that after your work.

Thanks,
	Takuya


* Re: [PATCH v2 3/5] KVM: Flush TLB in mmu notifier without holding mmu_lock
From: Avi Kivity @ 2012-07-02 14:18 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: Marcelo Tosatti, Xiao Guangrong, Takuya Yoshikawa, kvm

On 07/02/2012 05:09 PM, Takuya Yoshikawa wrote:
> On Mon, 02 Jul 2012 15:41:30 +0300
> Avi Kivity <avi@redhat.com> wrote:
> 
>> kvm_mmu_slot_remove_write_access: same.  It's hard to continue the loop
>> after a lockbreak though.  We can switch it to be rmap based instead.
> 
> Switching to rmap based protection was on my queue before, but I wanted
> to do that after your work.

It makes sense.  I'll dive back in after we agree on the way to go.

-- 
error compiling committee.c: too many arguments to function



