linuxppc-dev.lists.ozlabs.org archive mirror
* [RFC PATCH 0/5] KVM TLB flushing improvements
@ 2018-04-10 12:48 Nicholas Piggin
  2018-04-10 12:48 ` [RFC PATCH 1/5] powerpc/64s/mm: Implement LPID based TLB flushes to be used by KVM Nicholas Piggin
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Nicholas Piggin @ 2018-04-10 12:48 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Nicholas Piggin, linuxppc-dev

This series adds powerpc:tlbie tracepoints for radix partition
scoped invalidations. After I started getting some traces on a
32 vCPU radix guest, they showed a problem with partition scoped
faults/invalidates, so I had a try at fixing it. This seems to
be stable on radix so far (haven't tested hash yet).

Thanks,
Nick

Nicholas Piggin (5):
  powerpc/64s/mm: Implement LPID based TLB flushes to be used by KVM
  KVM: PPC: Book3S HV: kvmppc_radix_tlbie_page use Linux flush function
  KVM: PPC: Book3S HV: kvmhv_p9_set_lpcr use Linux flush function
  KVM: PPC: Book3S HV: handle need_tlb_flush in C before low-level guest
    entry
  KVM: PPC: Book3S HV: Radix do not clear partition scoped page table
    when page fault races with other vCPUs.

 .../include/asm/book3s/64/tlbflush-hash.h     |  2 +
 .../include/asm/book3s/64/tlbflush-radix.h    |  5 ++
 arch/powerpc/kvm/book3s_64_mmu_radix.c        | 65 +++++++-------
 arch/powerpc/kvm/book3s_hv.c                  | 21 ++++-
 arch/powerpc/kvm/book3s_hv_builtin.c          | 14 ++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S       | 43 +--------
 arch/powerpc/mm/hash_native_64.c              |  8 ++
 arch/powerpc/mm/tlb-radix.c                   | 87 +++++++++++++++++++
 8 files changed, 157 insertions(+), 88 deletions(-)

-- 
2.17.0


* [RFC PATCH 1/5] powerpc/64s/mm: Implement LPID based TLB flushes to be used by KVM
  2018-04-10 12:48 [RFC PATCH 0/5] KVM TLB flushing improvements Nicholas Piggin
@ 2018-04-10 12:48 ` Nicholas Piggin
  2018-04-10 12:48 ` [RFC PATCH 2/5] KVM: PPC: Book3S HV: kvmppc_radix_tlbie_page use Linux flush function Nicholas Piggin
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Nicholas Piggin @ 2018-04-10 12:48 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Nicholas Piggin, linuxppc-dev

Implement a local TLB flush for an entire LPID, for hash and radix,
and a global TLB flush for a partition scoped page in an LPID, for
radix.

These will be used by KVM in subsequent patches.
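
For illustration only (not part of this patch), a KVM caller is
expected to use these roughly as follows. This is a sketch of how the
later patches in the series end up calling them, with kvm and gpa
standing in for the caller's context:

/*
 * Local (tlbiel) flush of all partition scoped entries for an LPID,
 * e.g. before entering a guest on a core that may hold stale
 * translations for it:
 */
if (kvm_is_radix(kvm))
	radix__local_flush_tlb_lpid(kvm->arch.lpid);
else
	hash__local_flush_tlb_lpid(kvm->arch.lpid);

/*
 * Global (tlbie) flush of a single partition scoped page for an
 * LPID, radix only, e.g. after invalidating a guest PTE:
 */
radix__flush_tlb_lpid_page(kvm->arch.lpid, gpa, PAGE_SIZE);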

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 .../include/asm/book3s/64/tlbflush-hash.h     |  2 +
 .../include/asm/book3s/64/tlbflush-radix.h    |  5 ++
 arch/powerpc/mm/hash_native_64.c              |  8 ++
 arch/powerpc/mm/tlb-radix.c                   | 87 +++++++++++++++++++
 4 files changed, 102 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
index 64d02a704bcb..8b328fd87722 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -53,6 +53,8 @@ static inline void arch_leave_lazy_mmu_mode(void)
 
 extern void hash__tlbiel_all(unsigned int action);
 
+extern void hash__local_flush_tlb_lpid(unsigned int lpid);
+
 extern void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize,
 			    int ssize, unsigned long flags);
 extern void flush_hash_range(unsigned long number, int local);
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 19b45ba6caf9..2ddaadf3e9ea 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -51,4 +51,9 @@ extern void radix__flush_tlb_all(void);
 extern void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
 					unsigned long address);
 
+extern void radix__flush_tlb_lpid_page(unsigned int lpid,
+					unsigned long addr,
+					unsigned long page_size);
+extern void radix__local_flush_tlb_lpid(unsigned int lpid);
+
 #endif
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 1d049c78c82a..2f02cd780c19 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -294,6 +294,14 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize,
 		raw_spin_unlock(&native_tlbie_lock);
 }
 
+void hash__local_flush_tlb_lpid(unsigned int lpid)
+{
+	VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+	hash__tlbiel_all(TLB_INVAL_SCOPE_LPID);
+}
+EXPORT_SYMBOL_GPL(hash__local_flush_tlb_lpid);
+
 static inline void native_lock_hpte(struct hash_pte *hptep)
 {
 	unsigned long *word = (unsigned long *)&hptep->v;
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 2fba6170ab3f..f246fb0ac049 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -119,6 +119,22 @@ static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbiel_lpid(unsigned long lpid, int set,
+				unsigned long ric)
+{
+	unsigned long rb,rs,prs,r;
+
+	rb = PPC_BIT(52); /* IS = 2 */
+	rb |= set << PPC_BITLSHIFT(51);
+	rs = 0;  /* LPID comes from LPIDR */
+	prs = 0; /* partition scoped */
+	r = 1;   /* radix format */
+
+	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
 static inline void __tlbiel_va(unsigned long va, unsigned long pid,
 			       unsigned long ap, unsigned long ric)
 {
@@ -151,6 +167,22 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
+			      unsigned long ap, unsigned long ric)
+{
+	unsigned long rb,rs,prs,r;
+
+	rb = va & ~(PPC_BITMASK(52, 63));
+	rb |= ap << PPC_BITLSHIFT(58);
+	rs = lpid;
+	prs = 0; /* partition scoped */
+	r = 1;   /* radix format */
+
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
 static inline void fixup_tlbie(void)
 {
 	unsigned long pid = 0;
@@ -215,6 +247,34 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
+static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
+{
+	int set;
+
+	VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+	 * also flush the entire Page Walk Cache.
+	 */
+	__tlbiel_lpid(lpid, 0, ric);
+
+	/* For PWC, only one flush is needed */
+	if (ric == RIC_FLUSH_PWC) {
+		asm volatile("ptesync": : :"memory");
+		return;
+	}
+
+	/* For the remaining sets, just flush the TLB */
+	for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+		__tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
+
+	asm volatile("ptesync": : :"memory");
+	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
 static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
 				    unsigned long psize)
@@ -269,6 +329,17 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
+static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
+			      unsigned long psize, unsigned long ric)
+{
+	unsigned long ap = mmu_get_ap(psize);
+
+	asm volatile("ptesync": : :"memory");
+	__tlbie_lpid_va(va, lpid, ap, ric);
+	fixup_tlbie();
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
 static inline void _tlbie_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
 				    unsigned long psize, bool also_pwc)
@@ -535,6 +606,22 @@ static int radix_get_mmu_psize(int page_size)
 	return psize;
 }
 
+void radix__flush_tlb_lpid_page(unsigned int lpid,
+					unsigned long addr,
+					unsigned long page_size)
+{
+	int psize = radix_get_mmu_psize(page_size);
+
+	_tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page);
+
+void radix__local_flush_tlb_lpid(unsigned int lpid)
+{
+	_tlbiel_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
+
 static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
 				  unsigned long end, int psize);
 
-- 
2.17.0


* [RFC PATCH 2/5] KVM: PPC: Book3S HV: kvmppc_radix_tlbie_page use Linux flush function
  2018-04-10 12:48 [RFC PATCH 0/5] KVM TLB flushing improvements Nicholas Piggin
  2018-04-10 12:48 ` [RFC PATCH 1/5] powerpc/64s/mm: Implement LPID based TLB flushes to be used by KVM Nicholas Piggin
@ 2018-04-10 12:48 ` Nicholas Piggin
  2018-04-10 12:48 ` [RFC PATCH 3/5] KVM: PPC: Book3S HV: kvmhv_p9_set_lpcr " Nicholas Piggin
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Nicholas Piggin @ 2018-04-10 12:48 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Nicholas Piggin, linuxppc-dev

This has the advantage of consolidating TLB flush code in fewer
places, and it also implements powerpc:tlbie trace events.

1GB pages should be handled without further modification.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 81d5ad26f9a1..dab6b622011c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -139,28 +139,16 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	return 0;
 }
 
-#ifdef CONFIG_PPC_64K_PAGES
-#define MMU_BASE_PSIZE	MMU_PAGE_64K
-#else
-#define MMU_BASE_PSIZE	MMU_PAGE_4K
-#endif
-
 static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
 				    unsigned int pshift)
 {
-	int psize = MMU_BASE_PSIZE;
-
-	if (pshift >= PMD_SHIFT)
-		psize = MMU_PAGE_2M;
-	addr &= ~0xfffUL;
-	addr |= mmu_psize_defs[psize].ap << 5;
-	asm volatile("ptesync": : :"memory");
-	asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
-		     : : "r" (addr), "r" (kvm->arch.lpid) : "memory");
-	if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG))
-		asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
-			     : : "r" (addr), "r" (kvm->arch.lpid) : "memory");
-	asm volatile("eieio ; tlbsync ; ptesync": : :"memory");
+	unsigned long psize = PAGE_SIZE;
+
+	if (pshift)
+		psize = 1UL << pshift;
+
+	addr &= ~(psize - 1);
+	radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
 }
 
 unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
-- 
2.17.0


* [RFC PATCH 3/5] KVM: PPC: Book3S HV: kvmhv_p9_set_lpcr use Linux flush function
  2018-04-10 12:48 [RFC PATCH 0/5] KVM TLB flushing improvements Nicholas Piggin
  2018-04-10 12:48 ` [RFC PATCH 1/5] powerpc/64s/mm: Implement LPID based TLB flushes to be used by KVM Nicholas Piggin
  2018-04-10 12:48 ` [RFC PATCH 2/5] KVM: PPC: Book3S HV: kvmppc_radix_tlbie_page use Linux flush function Nicholas Piggin
@ 2018-04-10 12:48 ` Nicholas Piggin
  2018-04-10 12:48 ` [RFC PATCH 4/5] KVM: PPC: Book3S HV: handle need_tlb_flush in C before low-level guest entry Nicholas Piggin
  2018-04-10 12:48 ` [RFC PATCH 5/5] KVM: PPC: Book3S HV: Radix do not clear partition scoped page table when page fault races with other vCPUs Nicholas Piggin
  4 siblings, 0 replies; 9+ messages in thread
From: Nicholas Piggin @ 2018-04-10 12:48 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Nicholas Piggin, linuxppc-dev

The existing flush loops over the radix number of sets but issues
R=0 (non-radix) tlbiel instructions. This can't be quite right, but
I'm not entirely sure this is the right way to fix it.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_hv_builtin.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 0b9b8e188bfa..577769fbfae9 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -676,7 +676,7 @@ static void wait_for_sync(struct kvm_split_mode *sip, int phase)
 
 void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
 {
-	unsigned long rb, set;
+	struct kvm *kvm = local_paca->kvm_hstate.kvm_vcpu->kvm;
 
 	/* wait for every other thread to get to real mode */
 	wait_for_sync(sip, PHASE_REALMODE);
@@ -689,14 +689,10 @@ void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
 	/* Invalidate the TLB on thread 0 */
 	if (local_paca->kvm_hstate.tid == 0) {
 		sip->do_set = 0;
-		asm volatile("ptesync" : : : "memory");
-		for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) {
-			rb = TLBIEL_INVAL_SET_LPID +
-				(set << TLBIEL_INVAL_SET_SHIFT);
-			asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
-				     "r" (rb), "r" (0));
-		}
-		asm volatile("ptesync" : : : "memory");
+		if (kvm_is_radix(kvm))
+			radix__local_flush_tlb_lpid(kvm->arch.lpid);
+		else
+			hash__local_flush_tlb_lpid(kvm->arch.lpid);
 	}
 
 	/* indicate that we have done so and wait for others */
-- 
2.17.0


* [RFC PATCH 4/5] KVM: PPC: Book3S HV: handle need_tlb_flush in C before low-level guest entry
  2018-04-10 12:48 [RFC PATCH 0/5] KVM TLB flushing improvements Nicholas Piggin
                   ` (2 preceding siblings ...)
  2018-04-10 12:48 ` [RFC PATCH 3/5] KVM: PPC: Book3S HV: kvmhv_p9_set_lpcr " Nicholas Piggin
@ 2018-04-10 12:48 ` Nicholas Piggin
  2018-04-11  1:32   ` Benjamin Herrenschmidt
  2018-04-15  5:28   ` Nicholas Piggin
  2018-04-10 12:48 ` [RFC PATCH 5/5] KVM: PPC: Book3S HV: Radix do not clear partition scoped page table when page fault races with other vCPUs Nicholas Piggin
  4 siblings, 2 replies; 9+ messages in thread
From: Nicholas Piggin @ 2018-04-10 12:48 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Nicholas Piggin, linuxppc-dev

Move this flushing out of assembly and have it use Linux TLB
flush implementations introduced earlier. This allows powerpc:tlbie
trace events to be used.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_hv.c            | 21 +++++++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 43 +------------------------
 2 files changed, 21 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 81e2ea882d97..5d4783b5b47a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2680,7 +2680,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	int sub;
 	bool thr0_done;
 	unsigned long cmd_bit, stat_bit;
-	int pcpu, thr;
+	int pcpu, thr, tmp;
 	int target_threads;
 	int controlled_threads;
 	int trap;
@@ -2780,6 +2780,25 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		return;
 	}
 
+	/*
+	 * Do we need to flush the TLB for the LPAR? (see TLB comment above)
+         * On POWER9, individual threads can come in here, but the
+         * TLB is shared between the 4 threads in a core, hence
+         * invalidating on one thread invalidates for all.
+         * Thus we make all 4 threads use the same bit here.
+         */
+	tmp = pcpu;
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		tmp &= ~0x3UL;
+	if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
+		if (kvm_is_radix(vc->kvm))
+			radix__local_flush_tlb_lpid(vc->kvm->arch.lpid);
+		else
+			hash__local_flush_tlb_lpid(vc->kvm->arch.lpid);
+		/* Clear the bit after the TLB flush */
+		cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
+	}
+
 	kvmppc_clear_host_core(pcpu);
 
 	/* Decide on micro-threading (split-core) mode */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index bd63fa8a08b5..6a23a0f3ceea 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -647,49 +647,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	mtspr	SPRN_LPID,r7
 	isync
 
-	/* See if we need to flush the TLB */
-	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
-BEGIN_FTR_SECTION
-	/*
-	 * On POWER9, individual threads can come in here, but the
-	 * TLB is shared between the 4 threads in a core, hence
-	 * invalidating on one thread invalidates for all.
-	 * Thus we make all 4 threads use the same bit here.
-	 */
-	clrrdi	r6,r6,2
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
-	srdi	r6,r6,6			/* doubleword number */
-	sldi	r6,r6,3			/* address offset */
-	add	r6,r6,r9
-	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
-	li	r8,1
-	sld	r8,r8,r7
-	ld	r7,0(r6)
-	and.	r7,r7,r8
-	beq	22f
-	/* Flush the TLB of any entries for this LPID */
-	lwz	r0,KVM_TLB_SETS(r9)
-	mtctr	r0
-	li	r7,0x800		/* IS field = 0b10 */
-	ptesync
-	li	r0,0			/* RS for P9 version of tlbiel */
-	bne	cr7, 29f
-28:	tlbiel	r7			/* On P9, rs=0, RIC=0, PRS=0, R=0 */
-	addi	r7,r7,0x1000
-	bdnz	28b
-	b	30f
-29:	PPC_TLBIEL(7,0,2,1,1)		/* for radix, RIC=2, PRS=1, R=1 */
-	addi	r7,r7,0x1000
-	bdnz	29b
-30:	ptesync
-23:	ldarx	r7,0,r6			/* clear the bit after TLB flushed */
-	andc	r7,r7,r8
-	stdcx.	r7,0,r6
-	bne	23b
-
 	/* Add timebase offset onto timebase */
-22:	ld	r8,VCORE_TB_OFFSET(r5)
+	ld	r8,VCORE_TB_OFFSET(r5)
 	cmpdi	r8,0
 	beq	37f
 	mftb	r6		/* current host timebase */
-- 
2.17.0


* [RFC PATCH 5/5] KVM: PPC: Book3S HV: Radix do not clear partition scoped page table when page fault races with other vCPUs.
  2018-04-10 12:48 [RFC PATCH 0/5] KVM TLB flushing improvements Nicholas Piggin
                   ` (3 preceding siblings ...)
  2018-04-10 12:48 ` [RFC PATCH 4/5] KVM: PPC: Book3S HV: handle need_tlb_flush in C before low-level guest entry Nicholas Piggin
@ 2018-04-10 12:48 ` Nicholas Piggin
  4 siblings, 0 replies; 9+ messages in thread
From: Nicholas Piggin @ 2018-04-10 12:48 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Nicholas Piggin, linuxppc-dev

KVM with an SMP radix guest can get into storms of page faults and
tlbies because the partition scoped page table entries are invalidated
and TLB flushed whenever they are found to race with another page
fault that has already set them up.

This tends to make vCPUs pile up when several hit a common address:
the page faults get serialized on common locks, then each one
invalidates the previous entry, and the delay before the new entry is
installed is long enough that more vCPUs hit page faults, which in
turn invalidate that new entry, and so on.

There doesn't seem to be a need to invalidate when a matching entry
already exists. This solves the tlbie storms.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 39 +++++++++++++++-----------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index dab6b622011c..4af177d24f6c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -243,6 +243,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 	pmd = pmd_offset(pud, gpa);
 	if (pmd_is_leaf(*pmd)) {
 		unsigned long lgpa = gpa & PMD_MASK;
+		pte_t old_pte = *pmdp_ptep(pmd);
 
 		/*
 		 * If we raced with another CPU which has just put
@@ -252,18 +253,17 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 			ret = -EAGAIN;
 			goto out_unlock;
 		}
-		/* Valid 2MB page here already, remove it */
-		old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
-					      ~0UL, 0, lgpa, PMD_SHIFT);
-		kvmppc_radix_tlbie_page(kvm, lgpa, PMD_SHIFT);
-		if (old & _PAGE_DIRTY) {
-			unsigned long gfn = lgpa >> PAGE_SHIFT;
-			struct kvm_memory_slot *memslot;
-			memslot = gfn_to_memslot(kvm, gfn);
-			if (memslot && memslot->dirty_bitmap)
-				kvmppc_update_dirty_map(memslot,
-							gfn, PMD_SIZE);
+		WARN_ON_ONCE(pte_pfn(old_pte) != pte_pfn(pte));
+		if (pte_val(old_pte) == pte_val(pte)) {
+			ret = -EAGAIN;
+			goto out_unlock;
 		}
+
+		/* Valid 2MB page here already, remove it */
+		kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
+					0, pte_val(pte), lgpa, PMD_SHIFT);
+		ret = 0;
+		goto out_unlock;
 	} else if (level == 1 && !pmd_none(*pmd)) {
 		/*
 		 * There's a page table page here, but we wanted
@@ -274,6 +274,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 		goto out_unlock;
 	}
 	if (level == 0) {
+		pte_t old_pte;
+
 		if (pmd_none(*pmd)) {
 			if (!new_ptep)
 				goto out_unlock;
@@ -281,13 +283,16 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 			new_ptep = NULL;
 		}
 		ptep = pte_offset_kernel(pmd, gpa);
-		if (pte_present(*ptep)) {
+		old_pte = *ptep;
+		if (pte_present(old_pte)) {
 			/* PTE was previously valid, so invalidate it */
-			old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
-						      0, gpa, 0);
-			kvmppc_radix_tlbie_page(kvm, gpa, 0);
-			if (old & _PAGE_DIRTY)
-				mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+			WARN_ON_ONCE(pte_pfn(old_pte) != pte_pfn(pte));
+			if (pte_val(old_pte) == pte_val(pte)) {
+				ret = -EAGAIN;
+				goto out_unlock;
+			}
+			kvmppc_radix_update_pte(kvm, ptep, 0,
+						pte_val(pte), gpa, 0);
 		}
 		kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
 	} else {
-- 
2.17.0


* Re: [RFC PATCH 4/5] KVM: PPC: Book3S HV: handle need_tlb_flush in C before low-level guest entry
  2018-04-10 12:48 ` [RFC PATCH 4/5] KVM: PPC: Book3S HV: handle need_tlb_flush in C before low-level guest entry Nicholas Piggin
@ 2018-04-11  1:32   ` Benjamin Herrenschmidt
  2018-04-11  2:19     ` Nicholas Piggin
  2018-04-15  5:28   ` Nicholas Piggin
  1 sibling, 1 reply; 9+ messages in thread
From: Benjamin Herrenschmidt @ 2018-04-11  1:32 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev

On Tue, 2018-04-10 at 22:48 +1000, Nicholas Piggin wrote:
>  
> +       /*
> +        * Do we need to flush the TLB for the LPAR? (see TLB comment above)
> +         * On POWER9, individual threads can come in here, but the
> +         * TLB is shared between the 4 threads in a core, hence
> +         * invalidating on one thread invalidates for all.
> +         * Thus we make all 4 threads use the same bit here.
> +         */

This might be true of the P9 implementation but isn't architecturally
correct. From an ISA perspective, the threads could have dedicated
tagged TLB entries. Do we need to be careful here vs. backward
compatibility?

Also this won't flush ERAT entries for another thread afaik.

> +       tmp = pcpu;
> +       if (cpu_has_feature(CPU_FTR_ARCH_300))
> +               tmp &= ~0x3UL;
> +       if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
> +               if (kvm_is_radix(vc->kvm))
> +                       radix__local_flush_tlb_lpid(vc->kvm->arch.lpid);
> +               else
> +                       hash__local_flush_tlb_lpid(vc->kvm->arch.lpid);
> +               /* Clear the bit after the TLB flush */
> +               cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
> +       }
> +


* Re: [RFC PATCH 4/5] KVM: PPC: Book3S HV: handle need_tlb_flush in C before low-level guest entry
  2018-04-11  1:32   ` Benjamin Herrenschmidt
@ 2018-04-11  2:19     ` Nicholas Piggin
  0 siblings, 0 replies; 9+ messages in thread
From: Nicholas Piggin @ 2018-04-11  2:19 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: kvm-ppc, linuxppc-dev

On Wed, 11 Apr 2018 11:32:12 +1000
Benjamin Herrenschmidt <benh@au1.ibm.com> wrote:

> On Tue, 2018-04-10 at 22:48 +1000, Nicholas Piggin wrote:
> >  
> > +       /*
> > +        * Do we need to flush the TLB for the LPAR? (see TLB comment above)
> > +         * On POWER9, individual threads can come in here, but the
> > +         * TLB is shared between the 4 threads in a core, hence
> > +         * invalidating on one thread invalidates for all.
> > +         * Thus we make all 4 threads use the same bit here.
> > +         */  
> 
> This might be true of the P9 implementation but isn't architecturally
> correct. From an ISA perspective, the threads could have dedicated
> tagged TLB entries. Do we need to be careful here vs. backward
> compatibility?

I think so. I noticed that; I was just trying to do a like-for-like
replacement with this patch.

Yes it should have a feature bit test for this optimization IMO. That
can be expanded if other CPUs have the same ability... Is it even
a worthwhile optimisation to do at this point, I wonder? I didn't see
it being hit a lot in traces.
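
If we do keep it, something like the following sketch is what I have
in mind (CPU_FTR_CORE_TLB_SHARED is a made-up name used only for
illustration, and cpu_first_thread_sibling() stands in for the
open-coded masking):

/*
 * Only fold all threads of a core onto one need_tlb_flush bit when
 * the TLB really is shared by the whole core, rather than keying
 * that off ARCH_300.
 */
tmp = pcpu;
if (cpu_has_feature(CPU_FTR_CORE_TLB_SHARED))
	tmp = cpu_first_thread_sibling(pcpu);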

> Also this won't flush ERAT entries for another thread afaik.

Yeah, I'm still not entirely clear exactly when ERATs get invalidated.
I would like to see more commentary here to show why it's okay.

> 
> > +       tmp = pcpu;
> > +       if (cpu_has_feature(CPU_FTR_ARCH_300))
> > +               tmp &= ~0x3UL;
> > +       if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
> > +               if (kvm_is_radix(vc->kvm))
> > +                       radix__local_flush_tlb_lpid(vc->kvm->arch.lpid);
> > +               else
> > +                       hash__local_flush_tlb_lpid(vc->kvm->arch.lpid);
> > +               /* Clear the bit after the TLB flush */
> > +               cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
> > +       }
> > +  
> 


* Re: [RFC PATCH 4/5] KVM: PPC: Book3S HV: handle need_tlb_flush in C before low-level guest entry
  2018-04-10 12:48 ` [RFC PATCH 4/5] KVM: PPC: Book3S HV: handle need_tlb_flush in C before low-level guest entry Nicholas Piggin
  2018-04-11  1:32   ` Benjamin Herrenschmidt
@ 2018-04-15  5:28   ` Nicholas Piggin
  1 sibling, 0 replies; 9+ messages in thread
From: Nicholas Piggin @ 2018-04-15  5:28 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev

On Tue, 10 Apr 2018 22:48:41 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> Move this flushing out of assembly and have it use Linux TLB
> flush implementations introduced earlier. This allows powerpc:tlbie
> trace events to be used.

Ah, I kept wondering why my guests were locking up with stale PWC
after this patch... Turns out I made a significant oversight -- radix
wants to flush the LPID's process scoped translations here, to deal
with vCPU migration. My patch had it flushing the partition scoped
translations instead.

Radix and HPT have quite different requirements here, and HPT has to
flush in real-mode. So I'll update the patch and leave the asm flush
in place for HPT only, and do the right thing here for radix.

We already have other gaps with tracing HPT flushes, so they will all
have to be dealt with some other way if we want that capability.

After this is fixed, the series has been stable here so far. I'll
re-send all the tlb stuff tomorrow if it holds up.
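
Roughly, the radix side wants a PRS=1 (process scoped) variant of the
patch 1 tlbiel primitive, along the lines of the sketch below (the
_guest naming is only illustrative here, not final):

static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
				unsigned long ric)
{
	unsigned long rb, rs, prs, r;

	rb = PPC_BIT(52); /* IS = 2 */
	rb |= set << PPC_BITLSHIFT(51);
	rs = 0;  /* LPID comes from LPIDR */
	prs = 1; /* process scoped */
	r = 1;   /* radix format */

	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
	trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
}

with a set loop like _tlbiel_lpid() around it, called from
kvmppc_run_core() in place of radix__local_flush_tlb_lpid() for the
radix case.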

> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/powerpc/kvm/book3s_hv.c            | 21 +++++++++++-
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 43 +------------------------
>  2 files changed, 21 insertions(+), 43 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 81e2ea882d97..5d4783b5b47a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -2680,7 +2680,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
>  	int sub;
>  	bool thr0_done;
>  	unsigned long cmd_bit, stat_bit;
> -	int pcpu, thr;
> +	int pcpu, thr, tmp;
>  	int target_threads;
>  	int controlled_threads;
>  	int trap;
> @@ -2780,6 +2780,25 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
>  		return;
>  	}
>  
> +	/*
> +	 * Do we need to flush the TLB for the LPAR? (see TLB comment above)
> +         * On POWER9, individual threads can come in here, but the
> +         * TLB is shared between the 4 threads in a core, hence
> +         * invalidating on one thread invalidates for all.
> +         * Thus we make all 4 threads use the same bit here.
> +         */
> +	tmp = pcpu;
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
> +		tmp &= ~0x3UL;
> +	if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
> +		if (kvm_is_radix(vc->kvm))
> +			radix__local_flush_tlb_lpid(vc->kvm->arch.lpid);
> +		else
> +			hash__local_flush_tlb_lpid(vc->kvm->arch.lpid);
> +		/* Clear the bit after the TLB flush */
> +		cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
> +	}
> +
>  	kvmppc_clear_host_core(pcpu);
>  
>  	/* Decide on micro-threading (split-core) mode */
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index bd63fa8a08b5..6a23a0f3ceea 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -647,49 +647,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
>  	mtspr	SPRN_LPID,r7
>  	isync
>  
> -	/* See if we need to flush the TLB */
> -	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
> -BEGIN_FTR_SECTION
> -	/*
> -	 * On POWER9, individual threads can come in here, but the
> -	 * TLB is shared between the 4 threads in a core, hence
> -	 * invalidating on one thread invalidates for all.
> -	 * Thus we make all 4 threads use the same bit here.
> -	 */
> -	clrrdi	r6,r6,2
> -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
> -	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
> -	srdi	r6,r6,6			/* doubleword number */
> -	sldi	r6,r6,3			/* address offset */
> -	add	r6,r6,r9
> -	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
> -	li	r8,1
> -	sld	r8,r8,r7
> -	ld	r7,0(r6)
> -	and.	r7,r7,r8
> -	beq	22f
> -	/* Flush the TLB of any entries for this LPID */
> -	lwz	r0,KVM_TLB_SETS(r9)
> -	mtctr	r0
> -	li	r7,0x800		/* IS field = 0b10 */
> -	ptesync
> -	li	r0,0			/* RS for P9 version of tlbiel */
> -	bne	cr7, 29f
> -28:	tlbiel	r7			/* On P9, rs=0, RIC=0, PRS=0, R=0 */
> -	addi	r7,r7,0x1000
> -	bdnz	28b
> -	b	30f
> -29:	PPC_TLBIEL(7,0,2,1,1)		/* for radix, RIC=2, PRS=1, R=1 */
> -	addi	r7,r7,0x1000
> -	bdnz	29b
> -30:	ptesync
> -23:	ldarx	r7,0,r6			/* clear the bit after TLB flushed */
> -	andc	r7,r7,r8
> -	stdcx.	r7,0,r6
> -	bne	23b
> -
>  	/* Add timebase offset onto timebase */
> -22:	ld	r8,VCORE_TB_OFFSET(r5)
> +	ld	r8,VCORE_TB_OFFSET(r5)
>  	cmpdi	r8,0
>  	beq	37f
>  	mftb	r6		/* current host timebase */

