* [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs
@ 2011-05-18 21:04 Scott Wood
  2011-05-18 21:05 ` [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table Scott Wood
                   ` (6 more replies)
  0 siblings, 7 replies; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:04 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev

This allows a virtual page table to be used at the PMD rather than
the PTE level.

Rather than adjust the constant in pgd_index() (or ignore it, as
too-large values don't hurt as long as overly large addresses aren't
passed in), go back to using PTRS_PER_PGD.  The overflow comment seems to
apply to a very old implementation of free_pgtables that used pgd_index()
(unfortunately the commit message, if you seek it out in the historic
tree, doesn't mention any details about the overflow).  The existing
value was numerically identical to the old 4K-page PTRS_PER_PGD, so
using it shouldn't produce an overflow where it's not otherwise possible.

Also get rid of the incorrect comment at the top of pgtable-ppc64-4k.h.
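For reference, a quick sanity check of the old and new geometry
(illustration only, not part of the patch): both layouts resolve the
same 44 bits of virtual address, but a PMD directory is now 2^9
eight-byte entries, i.e. exactly one 4K page, which is what lets a
virtual linear table of PMDs be mapped with ordinary 4K TLB entries.

#include <assert.h>

int main(void)
{
	/* old: PTE 9, PMD 7, PUD 7, PGD 9; new: PTE 9, PMD 9, PUD 7, PGD 7 */
	int page_shift = 12, pte = 9, pmd = 9, pud = 7, pgd = 7;

	/* both geometries sum to the same 44 address bits */
	assert(page_shift + pte + pmd + pud + pgd == 44);
	assert(page_shift + 9 + 7 + 7 + 9 == 44);

	/* the PMD directory is now one page: 512 entries * 8 bytes */
	assert((1 << pmd) * 8 == 4096);

	/* a PMD entry maps 2 MiB (PMD_SHIFT 21), a PUD entry 1 GiB */
	assert((1L << (page_shift + pte)) == 2L * 1024 * 1024);
	assert((1L << (page_shift + pte + pmd)) == 1L << 30);

	return 0;
}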

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/pgtable-ppc64-4k.h |   12 ++++--------
 arch/powerpc/include/asm/pgtable-ppc64.h    |    3 +--
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64-4k.h b/arch/powerpc/include/asm/pgtable-ppc64-4k.h
index 6eefdcf..194005e 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-4k.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64-4k.h
@@ -1,14 +1,10 @@
 #ifndef _ASM_POWERPC_PGTABLE_PPC64_4K_H
 #define _ASM_POWERPC_PGTABLE_PPC64_4K_H
-/*
- * Entries per page directory level.  The PTE level must use a 64b record
- * for each page table entry.  The PMD and PGD level use a 32b record for
- * each entry by assuming that each entry is page aligned.
- */
+
 #define PTE_INDEX_SIZE  9
-#define PMD_INDEX_SIZE  7
+#define PMD_INDEX_SIZE  9
 #define PUD_INDEX_SIZE  7
-#define PGD_INDEX_SIZE  9
+#define PGD_INDEX_SIZE  7
 
 #ifndef __ASSEMBLY__
 #define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
@@ -19,7 +15,7 @@
 
 #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
 #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
-#define PTRS_PER_PUD	(1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PUD	(1 << PUD_INDEX_SIZE)
 #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
 
 /* PMD_SHIFT determines what a second-level page table entry can map */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 2b09cd5..8bd1cd9 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -181,8 +181,7 @@
  * Find an entry in a page-table-directory.  We combine the address region
  * (the high order N bits) and the pgd portion of the address.
  */
-/* to avoid overflow in free_pgtables we don't use PTRS_PER_PGD here */
-#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x1ff)
+#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
 
 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
 
-- 
1.7.4.1


* [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table
  2011-05-18 21:04 [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Scott Wood
@ 2011-05-18 21:05 ` Scott Wood
  2011-05-18 21:33   ` Benjamin Herrenschmidt
  2011-05-18 21:05 ` [PATCH 3/7] powerpc/mm: 64-bit tlb miss: get PACA from memory rather than SPR Scott Wood
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:05 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev

Loads with non-linear access patterns were producing a very high
ratio of recursive pt faults to regular tlb misses.  Rather than
choose between a 4-level table walk or a 1-level virtual page table
lookup, use a hybrid scheme with a virtual linear pmd, followed by a
2-level lookup in the normal handler.

This adds about 5 cycles (assuming no cache misses, and e5500 timing)
to a normal TLB miss, but greatly reduces the recursive fault rate
for loads which don't have locality within 2 MiB regions but do have
significant locality within 1 GiB regions.  Improvements of close to 50%
were seen on such benchmarks.
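In rough C terms the fast path becomes the two loads below.  This is a
loose sketch only (the real code is the tlb_low_64e.S assembly in this
patch), and VPTE_PMD_BASE is an invented name for the base of the
virtual-linear region of pmd entries; since one 4K page of that region
holds 512 pmds, each covering 2 MiB, a single TLB entry for it serves
1 GiB of address space.

static pte_t vpte_pmd_fast_path(unsigned long ea)
{
	/* level 1: one load from the virtual linear array of pmds; a TLB
	 * miss on this load recurses into the virtual page table handler */
	pmd_t *vpmd = (pmd_t *)VPTE_PMD_BASE;
	pmd_t pmd = vpmd[ea >> PMD_SHIFT];

	if (pmd_none(pmd))
		return __pte(0);	/* no pte page -> access fault path */

	/* level 2: ordinary load from the pte page the pmd points to */
	pte_t *pte = (pte_t *)pmd_page_vaddr(pmd);
	return pte[(ea >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)];
}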

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/mm/tlb_low_64e.S |   23 +++++++++++++++--------
 1 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index af08922..17726d3 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -24,7 +24,7 @@
 #ifdef CONFIG_PPC_64K_PAGES
 #define VPTE_PMD_SHIFT	(PTE_INDEX_SIZE+1)
 #else
-#define VPTE_PMD_SHIFT	(PTE_INDEX_SIZE)
+#define VPTE_PMD_SHIFT	0
 #endif
 #define VPTE_PUD_SHIFT	(VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
 #define VPTE_PGD_SHIFT	(VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
@@ -185,7 +185,7 @@ normal_tlb_miss:
 	/* Insert the bottom bits in */
 	rlwimi	r14,r15,0,16,31
 #else
-	rldicl	r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
+	rldicl	r14,r16,64-(PMD_SHIFT-3),PMD_SHIFT-3+4
 #endif
 	sldi	r15,r10,60
 	clrrdi	r14,r14,3
@@ -202,6 +202,16 @@ MMU_FTR_SECTION_ELSE
 	ld	r14,0(r10)
 ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
 
+#ifndef CONFIG_PPC_64K_PAGES
+	rldicl	r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3
+	clrrdi	r15,r15,3
+
+	cmpldi	cr0,r14,0
+	beq	normal_tlb_miss_access_fault
+
+	ldx	r14,r14,r15
+#endif
+
 finish_normal_tlb_miss:
 	/* Check if required permissions are met */
 	andc.	r15,r11,r14
@@ -353,14 +363,11 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
 #ifndef CONFIG_PPC_64K_PAGES
 	/* Get to PUD entry */
 	rldicl	r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
-	clrrdi	r10,r11,3
-	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	virt_page_table_tlb_miss_fault
-#endif /* CONFIG_PPC_64K_PAGES */
-
+#else
 	/* Get to PMD entry */
 	rldicl	r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
+#endif
+
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
 	cmpldi	cr0,r15,0
-- 
1.7.4.1


* [PATCH 3/7] powerpc/mm: 64-bit tlb miss: get PACA from memory rather than SPR
  2011-05-18 21:04 [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Scott Wood
  2011-05-18 21:05 ` [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table Scott Wood
@ 2011-05-18 21:05 ` Scott Wood
  2011-05-18 21:05 ` [PATCH 4/7] powerpc/mm: 64-bit: Don't load PACA in normal TLB miss exceptions Scott Wood
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:05 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev

This saves a few cycles, at least on e5500.
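The idea, restated as a C sketch of the initialise_paca() hunk below:
stamp the PACA's own address into each of the three TLB-miss save
areas at init time, so the prolog can fetch it with a cache-hot load
from the frame r12 already points at, instead of an mfspr.

static void stamp_tlb_paca_pointers(struct paca_struct *paca)
{
	int level;

	/* each re-entrancy level's frame carries a back-pointer to the
	 * PACA containing it, in the slot named EX_TLB_PACA */
	for (level = 0; level < 3; level++)
		paca->extlb[level][EX_TLB_PACA / 8] = (u64)paca;
}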

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/exception-64e.h |   16 +++++++---------
 arch/powerpc/kernel/paca.c               |    5 +++++
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h
index 6d53f31..6921261 100644
--- a/arch/powerpc/include/asm/exception-64e.h
+++ b/arch/powerpc/include/asm/exception-64e.h
@@ -62,16 +62,14 @@
 #define EX_TLB_ESR	( 9 * 8) /* Level 0 and 2 only */
 #define EX_TLB_SRR0	(10 * 8)
 #define EX_TLB_SRR1	(11 * 8)
-#define EX_TLB_MMUCR0	(12 * 8) /* Level 0 */
-#define EX_TLB_MAS1	(12 * 8) /* Level 0 */
-#define EX_TLB_MAS2	(13 * 8) /* Level 0 */
+#define EX_TLB_PACA	(12 * 8)
 #ifdef CONFIG_BOOK3E_MMU_TLB_STATS
-#define EX_TLB_R8	(14 * 8)
-#define EX_TLB_R9	(15 * 8)
-#define EX_TLB_LR	(16 * 8)
-#define EX_TLB_SIZE	(17 * 8)
+#define EX_TLB_R8	(13 * 8)
+#define EX_TLB_R9	(14 * 8)
+#define EX_TLB_LR	(15 * 8)
+#define EX_TLB_SIZE	(16 * 8)
 #else
-#define EX_TLB_SIZE	(14 * 8)
+#define EX_TLB_SIZE	(13 * 8)
 #endif
 
 #define	START_EXCEPTION(label)						\
@@ -98,7 +96,7 @@ exc_##label##_book3e:
 	std	r11,EX_TLB_R11(r12);					    \
 	mfspr	r11,SPRN_SPRG_TLB_SCRATCH;				    \
 	std	r13,EX_TLB_R13(r12);					    \
-	mfspr	r13,SPRN_SPRG_PACA;					    \
+	ld	r13,EX_TLB_PACA(r12);					    \
 	std	r14,EX_TLB_R14(r12);					    \
 	addi	r14,r12,EX_TLB_SIZE;					    \
 	std	r15,EX_TLB_R15(r12);					    \
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 102244e..814dae2 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -151,6 +151,11 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu)
 #ifdef CONFIG_PPC_STD_MMU_64
 	new_paca->slb_shadow_ptr = &slb_shadow[cpu];
 #endif /* CONFIG_PPC_STD_MMU_64 */
+#ifdef CONFIG_PPC_BOOK3E
+	new_paca->extlb[0][EX_TLB_PACA / 8] = (u64)new_paca;
+	new_paca->extlb[1][EX_TLB_PACA / 8] = (u64)new_paca;
+	new_paca->extlb[2][EX_TLB_PACA / 8] = (u64)new_paca;
+#endif
 }
 
 /* Put the paca pointer into r13 and SPRG_PACA */
-- 
1.7.4.1


* [PATCH 4/7] powerpc/mm: 64-bit: Don't load PACA in normal TLB miss exceptions
  2011-05-18 21:04 [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Scott Wood
  2011-05-18 21:05 ` [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table Scott Wood
  2011-05-18 21:05 ` [PATCH 3/7] powerpc/mm: 64-bit tlb miss: get PACA from memory rather than SPR Scott Wood
@ 2011-05-18 21:05 ` Scott Wood
  2011-05-18 21:05 ` [PATCH 5/7] powerpc/mm: 64-bit: don't handle non-standard page sizes Scott Wood
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:05 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev

Load it only when needed, in recursive/linear/indirect faults,
and in the stats code.

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/exception-64e.h |   28 +++++++++---------
 arch/powerpc/mm/tlb_low_64e.S            |   43 +++++++++++++++++------------
 2 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h
index 6921261..9b57a27 100644
--- a/arch/powerpc/include/asm/exception-64e.h
+++ b/arch/powerpc/include/asm/exception-64e.h
@@ -80,9 +80,9 @@ exc_##label##_book3e:
  *
  * This prolog handles re-entrancy (up to 3 levels supported in the PACA
  * though we currently don't test for overflow). It provides you with a
- * re-entrancy safe working space of r10...r16 and CR with r12 being used
- * as the exception area pointer in the PACA for that level of re-entrancy
- * and r13 containing the PACA pointer.
+ * re-entrancy safe working space of r10...r16 (except r13) and CR with r12
+ * being used as the exception area pointer in the PACA for that level of
+ * re-entrancy.
  *
  * SRR0 and SRR1 are saved, but DEAR and ESR are not, since they don't apply
  * as-is for instruction exceptions. It's up to the actual exception code
@@ -95,8 +95,6 @@ exc_##label##_book3e:
 	mfcr	r10;							    \
 	std	r11,EX_TLB_R11(r12);					    \
 	mfspr	r11,SPRN_SPRG_TLB_SCRATCH;				    \
-	std	r13,EX_TLB_R13(r12);					    \
-	ld	r13,EX_TLB_PACA(r12);					    \
 	std	r14,EX_TLB_R14(r12);					    \
 	addi	r14,r12,EX_TLB_SIZE;					    \
 	std	r15,EX_TLB_R15(r12);					    \
@@ -135,7 +133,6 @@ exc_##label##_book3e:
 	mtspr	SPRN_SPRG_TLB_EXFRAME,freg;				    \
 	ld	r11,EX_TLB_R11(r12);					    \
 	mtcr	r14;							    \
-	ld	r13,EX_TLB_R13(r12);					    \
 	ld	r14,EX_TLB_R14(r12);					    \
 	mtspr	SPRN_SRR0,r15;						    \
 	ld	r15,EX_TLB_R15(r12);					    \
@@ -148,11 +145,13 @@ exc_##label##_book3e:
 	TLB_MISS_RESTORE(r12)
 
 #define TLB_MISS_EPILOG_ERROR						    \
-	addi	r12,r13,PACA_EXTLB;					    \
+	ld	r10,EX_TLB_PACA(r12);					    \
+	addi	r12,r10,PACA_EXTLB;					    \
 	TLB_MISS_RESTORE(r12)
 
 #define TLB_MISS_EPILOG_ERROR_SPECIAL					    \
-	addi	r11,r13,PACA_EXTLB;					    \
+	ld	r10,EX_TLB_PACA(r12);					    \
+	addi	r11,r10,PACA_EXTLB;					    \
 	TLB_MISS_RESTORE(r11)
 
 #ifdef CONFIG_BOOK3E_MMU_TLB_STATS
@@ -160,25 +159,26 @@ exc_##label##_book3e:
 	mflr	r10;							    \
 	std	r8,EX_TLB_R8(r12);					    \
 	std	r9,EX_TLB_R9(r12);					    \
-	std	r10,EX_TLB_LR(r12);
+	std	r10,EX_TLB_LR(r12);					    \
+	ld	r9,EX_TLB_PACA(r12);
 #define TLB_MISS_RESTORE_STATS					            \
 	ld	r16,EX_TLB_LR(r12);					    \
 	ld	r9,EX_TLB_R9(r12);					    \
 	ld	r8,EX_TLB_R8(r12);					    \
 	mtlr	r16;
 #define TLB_MISS_STATS_D(name)						    \
-	addi	r9,r13,MMSTAT_DSTATS+name;				    \
+	addi	r9,r9,MMSTAT_DSTATS+name;				    \
 	bl	.tlb_stat_inc;
 #define TLB_MISS_STATS_I(name)						    \
-	addi	r9,r13,MMSTAT_ISTATS+name;				    \
+	addi	r9,r9,MMSTAT_ISTATS+name;				    \
 	bl	.tlb_stat_inc;
 #define TLB_MISS_STATS_X(name)						    \
-	ld	r8,PACA_EXTLB+EX_TLB_ESR(r13);				    \
+	ld	r8,PACA_EXTLB+EX_TLB_ESR(r9);				    \
 	cmpdi	cr2,r8,-1;						    \
 	beq	cr2,61f;						    \
-	addi	r9,r13,MMSTAT_DSTATS+name;				    \
+	addi	r9,r9,MMSTAT_DSTATS+name;				    \
 	b	62f;							    \
-61:	addi	r9,r13,MMSTAT_ISTATS+name;				    \
+61:	addi	r9,r9,MMSTAT_ISTATS+name;				    \
 62:	bl	.tlb_stat_inc;
 #define TLB_MISS_STATS_SAVE_INFO					    \
 	std	r14,EX_TLB_ESR(r12);	/* save ESR */			    \
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index 17726d3..922fece 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -160,7 +160,6 @@
  * r16 = faulting address
  * r15 = region ID
  * r14 = crap (free to use)
- * r13 = PACA
  * r12 = TLB exception frame in PACA
  * r11 = PTE permission mask
  * r10 = crap (free to use)
@@ -299,8 +298,7 @@ normal_tlb_miss_access_fault:
  *
  * r16 = virtual page table faulting address
  * r15 = region (top 4 bits of address)
- * r14 = crap (free to use)
- * r13 = PACA
+ * r14 = crap (we load the PACA into it)
  * r12 = TLB exception frame in PACA
  * r11 = crap (free to use)
  * r10 = crap (free to use)
@@ -318,6 +316,8 @@ normal_tlb_miss_access_fault:
  * so we could probably optimize things a bit
  */
 virt_page_table_tlb_miss:
+	ld	r14,EX_TLB_PACA(r12)
+
 	/* Are we hitting a kernel page table ? */
 	andi.	r10,r15,0x8
 
@@ -325,7 +325,7 @@ virt_page_table_tlb_miss:
 	 * and we happen to have the swapper_pg_dir at offset 8 from the user
 	 * pgdir in the PACA :-).
 	 */
-	add	r11,r10,r13
+	add	r11,r10,r14
 
 	/* If kernel, we need to clear MAS1 TID */
 	beq	1f
@@ -417,12 +417,12 @@ virt_page_table_tlb_miss_done:
 	 * offset the return address by -4 in order to replay the tlbsrx
 	 * instruction there
 	 */
-	subf	r10,r13,r12
+	subf	r10,r14,r12
 	cmpldi	cr0,r10,PACA_EXTLB+EX_TLB_SIZE
 	bne-	1f
-	ld	r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
+	ld	r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r14)
 	addi	r10,r11,-4
-	std	r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13)
+	std	r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r14)
 1:
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
 	/* Return to caller, normal case */
@@ -449,13 +449,13 @@ virt_page_table_tlb_miss_fault:
 	 * level as well. Since we are doing that, we don't need to clear or
 	 * restore the TLB reservation neither.
 	 */
-	subf	r10,r13,r12
+	subf	r10,r14,r12
 	cmpldi	cr0,r10,PACA_EXTLB+EX_TLB_SIZE
 	bne-	virt_page_table_tlb_miss_whacko_fault
 
 	/* We dig the original DEAR and ESR from slot 0 */
-	ld	r15,EX_TLB_DEAR+PACA_EXTLB(r13)
-	ld	r16,EX_TLB_ESR+PACA_EXTLB(r13)
+	ld	r15,EX_TLB_DEAR+PACA_EXTLB(r14)
+	ld	r16,EX_TLB_ESR+PACA_EXTLB(r14)
 
 	/* We check for the "special" ESR value for instruction faults */
 	cmpdi	cr0,r16,-1
@@ -489,6 +489,8 @@ virt_page_table_tlb_miss_whacko_fault:
 	START_EXCEPTION(data_tlb_miss_htw)
 	TLB_MISS_PROLOG
 
+	ld	r15,EX_TLB_PACA(r12)
+
 	/* Now we handle the fault proper. We only save DEAR in normal
 	 * fault case since that's the only interesting values here.
 	 * We could probably also optimize by not saving SRR0/1 in the
@@ -503,8 +505,12 @@ virt_page_table_tlb_miss_whacko_fault:
 
 	/* We do the user/kernel test for the PID here along with the RW test
 	 */
+	/* The cool thing now is that r11 contains 0 for user and 8 for kernel,
+	 * and we happen to have the swapper_pg_dir at offset 8 from the user
+	 * pgdir in the PACA :-).
+	 */
 	cmpldi	cr0,r11,0		/* Check for user region */
-	ld	r15,PACAPGD(r13)	/* Load user pgdir */
+	add	r15,r15,r11
 	beq	htw_tlb_miss
 
 	/* XXX replace the RMW cycles with immediate loads + writes */
@@ -512,7 +518,6 @@ virt_page_table_tlb_miss_whacko_fault:
 	cmpldi	cr0,r11,8		/* Check for vmalloc region */
 	rlwinm	r10,r10,0,16,1		/* Clear TID */
 	mtspr	SPRN_MAS1,r10
-	ld	r15,PACA_KERNELPGD(r13)	/* Load kernel pgdir */
 	beq+	htw_tlb_miss
 
 	/* We got a crappy address, just fault with whatever DEAR and ESR
@@ -526,6 +531,8 @@ virt_page_table_tlb_miss_whacko_fault:
 	START_EXCEPTION(instruction_tlb_miss_htw)
 	TLB_MISS_PROLOG
 
+	ld	r15,EX_TLB_PACA(r12)
+
 	/* If we take a recursive fault, the second level handler may need
 	 * to know whether we are handling a data or instruction fault in
 	 * order to get to the right store fault handler. We provide that
@@ -548,7 +555,7 @@ virt_page_table_tlb_miss_whacko_fault:
 	/* We do the user/kernel test for the PID here along with the RW test
 	 */
 	cmpldi	cr0,r11,0			/* Check for user region */
-	ld	r15,PACAPGD(r13)		/* Load user pgdir */
+	add	r15,r15,r11
 	beq	htw_tlb_miss
 
 	/* XXX replace the RMW cycles with immediate loads + writes */
@@ -556,7 +563,6 @@ virt_page_table_tlb_miss_whacko_fault:
 	cmpldi	cr0,r11,8			/* Check for vmalloc region */
 	rlwinm	r10,r10,0,16,1			/* Clear TID */
 	mtspr	SPRN_MAS1,r10
-	ld	r15,PACA_KERNELPGD(r13)		/* Load kernel pgdir */
 	beq+	htw_tlb_miss
 
 	/* We got a crappy address, just fault */
@@ -570,9 +576,8 @@ virt_page_table_tlb_miss_whacko_fault:
  * misses. We are entered with:
  *
  * r16 = virtual page table faulting address
- * r15 = PGD pointer
+ * r15 = pointer to PGD pointer
  * r14 = ESR
- * r13 = PACA
  * r12 = TLB exception frame in PACA
  * r11 = crap (free to use)
  * r10 = crap (free to use)
@@ -581,6 +586,8 @@ virt_page_table_tlb_miss_whacko_fault:
  * avoid too much complication, it will save/restore things for us
  */
 htw_tlb_miss:
+	ld	r15,0(r15)
+
 	/* Search if we already have a TLB entry for that virtual address, and
 	 * if we do, bail out.
 	 *
@@ -692,7 +699,6 @@ htw_tlb_miss_fault:
  * r16 = faulting address
  * r15 = crap (free to use)
  * r14 = ESR (data) or -1 (instruction)
- * r13 = PACA
  * r12 = TLB exception frame in PACA
  * r11 = crap (free to use)
  * r10 = crap (free to use)
@@ -714,7 +720,8 @@ tlb_load_linear:
 	 * we only use 1G pages for now. That might have to be changed in a
 	 * final implementation, especially when dealing with hypervisors
 	 */
-	ld	r11,PACATOC(r13)
+	ld	r11,EX_TLB_PACA(r12)
+	ld	r11,PACATOC(r11)
 	ld	r11,linear_map_top@got(r11)
 	ld	r10,0(r11)
 	cmpld	cr0,r10,r16
-- 
1.7.4.1


* [PATCH 5/7] powerpc/mm: 64-bit: don't handle non-standard page sizes
  2011-05-18 21:04 [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Scott Wood
                   ` (2 preceding siblings ...)
  2011-05-18 21:05 ` [PATCH 4/7] powerpc/mm: 64-bit: Don't load PACA in normal TLB miss exceptions Scott Wood
@ 2011-05-18 21:05 ` Scott Wood
  2011-05-18 21:36   ` Benjamin Herrenschmidt
  2011-05-18 21:05 ` [PATCH 6/7] powerpc/mm: 64-bit: tlb handler micro-optimization Scott Wood
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:05 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev

I don't see where any non-standard page size will be set in the
kernel page tables, so don't waste time checking for it.  It wouldn't
work with TLB0 on an FSL MMU anyway, so if there's something I missed
(or which is out-of-tree), it's relying on implementation-specific
behavior.  If there's an out-of-tree need for occasional 4K mappings
with CONFIG_PPC_64K_PAGES, perhaps this check could only be done when
that is defined.

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/mm/tlb_low_64e.S |   13 -------------
 1 files changed, 0 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index 922fece..e782023 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -232,19 +232,6 @@ finish_normal_tlb_miss:
 	rlwimi	r11,r14,32-19,27,31	/* Insert WIMGE */
 	mtspr	SPRN_MAS2,r11
 
-	/* Check page size, if not standard, update MAS1 */
-	rldicl	r11,r14,64-8,64-8
-#ifdef CONFIG_PPC_64K_PAGES
-	cmpldi	cr0,r11,BOOK3E_PAGESZ_64K
-#else
-	cmpldi	cr0,r11,BOOK3E_PAGESZ_4K
-#endif
-	beq-	1f
-	mfspr	r11,SPRN_MAS1
-	rlwimi	r11,r14,31,21,24
-	rlwinm	r11,r11,0,21,19
-	mtspr	SPRN_MAS1,r11
-1:
 	/* Move RPN in position */
 	rldicr	r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
 	clrldi	r15,r11,12		/* Clear crap at the top */
-- 
1.7.4.1


* [PATCH 6/7] powerpc/mm: 64-bit: tlb handler micro-optimization
  2011-05-18 21:04 [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Scott Wood
                   ` (3 preceding siblings ...)
  2011-05-18 21:05 ` [PATCH 5/7] powerpc/mm: 64-bit: don't handle non-standard page sizes Scott Wood
@ 2011-05-18 21:05 ` Scott Wood
  2011-05-18 21:37   ` Benjamin Herrenschmidt
  2011-05-18 21:05 ` [PATCH 7/7] powerpc/e5500: set MMU_FTR_USE_PAIRED_MAS Scott Wood
  2011-05-18 21:32 ` [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Benjamin Herrenschmidt
  6 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:05 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev

A little more speed up measured on e5500.

Setting of U0-3 is dropped as it is not used by Linux as far as I can
see.

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/mm/tlb_low_64e.S |   21 ++++++++-------------
 1 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index e782023..a94c87b 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -47,10 +47,10 @@
 	 * We could probably also optimize by not saving SRR0/1 in the
 	 * linear mapping case but I'll leave that for later
 	 */
-	mfspr	r14,SPRN_ESR
 	mfspr	r16,SPRN_DEAR		/* get faulting address */
 	srdi	r15,r16,60		/* get region */
 	cmpldi	cr0,r15,0xc		/* linear mapping ? */
+	mfspr	r14,SPRN_ESR
 	TLB_MISS_STATS_SAVE_INFO
 	beq	tlb_load_linear		/* yes -> go to linear map load */
 
@@ -62,11 +62,11 @@
 	andi.	r10,r15,0x1
 	bne-	virt_page_table_tlb_miss
 
-	std	r14,EX_TLB_ESR(r12);	/* save ESR */
-	std	r16,EX_TLB_DEAR(r12);	/* save DEAR */
+	/* We need _PAGE_PRESENT and  _PAGE_ACCESSED set */
 
-	 /* We need _PAGE_PRESENT and  _PAGE_ACCESSED set */
+	std	r14,EX_TLB_ESR(r12);	/* save ESR */
 	li	r11,_PAGE_PRESENT
+	std	r16,EX_TLB_DEAR(r12);	/* save DEAR */
 	oris	r11,r11,_PAGE_ACCESSED@h
 
 	/* We do the user/kernel test for the PID here along with the RW test
@@ -225,21 +225,16 @@ finish_normal_tlb_miss:
 	 *                 yet implemented for now
 	 * MAS 2   :	Defaults not useful, need to be redone
 	 * MAS 3+7 :	Needs to be done
-	 *
-	 * TODO: mix up code below for better scheduling
 	 */
 	clrrdi	r11,r16,12		/* Clear low crap in EA */
+	rldicr	r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
 	rlwimi	r11,r14,32-19,27,31	/* Insert WIMGE */
+	clrldi	r15,r15,12		/* Clear crap at the top */
 	mtspr	SPRN_MAS2,r11
-
-	/* Move RPN in position */
-	rldicr	r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
-	clrldi	r15,r11,12		/* Clear crap at the top */
-	rlwimi	r15,r14,32-8,22,25	/* Move in U bits */
+	andi.	r11,r14,_PAGE_DIRTY
 	rlwimi	r15,r14,32-2,26,31	/* Move in BAP bits */
 
 	/* Mask out SW and UW if !DIRTY (XXX optimize this !) */
-	andi.	r11,r14,_PAGE_DIRTY
 	bne	1f
 	li	r11,MAS3_SW|MAS3_UW
 	andc	r15,r15,r11
@@ -483,10 +478,10 @@ virt_page_table_tlb_miss_whacko_fault:
 	 * We could probably also optimize by not saving SRR0/1 in the
 	 * linear mapping case but I'll leave that for later
 	 */
-	mfspr	r14,SPRN_ESR
 	mfspr	r16,SPRN_DEAR		/* get faulting address */
 	srdi	r11,r16,60		/* get region */
 	cmpldi	cr0,r11,0xc		/* linear mapping ? */
+	mfspr	r14,SPRN_ESR
 	TLB_MISS_STATS_SAVE_INFO
 	beq	tlb_load_linear		/* yes -> go to linear map load */
 
-- 
1.7.4.1


* [PATCH 7/7] powerpc/e5500: set MMU_FTR_USE_PAIRED_MAS
  2011-05-18 21:04 [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Scott Wood
                   ` (4 preceding siblings ...)
  2011-05-18 21:05 ` [PATCH 6/7] powerpc/mm: 64-bit: tlb handler micro-optimization Scott Wood
@ 2011-05-18 21:05 ` Scott Wood
  2011-05-18 21:38   ` Benjamin Herrenschmidt
  2011-05-18 21:32 ` [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Benjamin Herrenschmidt
  6 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:05 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev

Signed-off-by: Scott Wood <scottwood@freescale.com>
---
Is there any 64-bit book3e chip that doesn't support this?  It
doesn't appear to be optional in the ISA.

 arch/powerpc/kernel/cputable.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 34d2722..a3b8eeb 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -1981,7 +1981,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_E5500,
 		.cpu_user_features	= COMMON_USER_BOOKE,
 		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
-			MMU_FTR_USE_TLBILX,
+			MMU_FTR_USE_TLBILX | MMU_FTR_USE_PAIRED_MAS,
 		.icache_bsize		= 64,
 		.dcache_bsize		= 64,
 		.num_pmcs		= 4,
-- 
1.7.4.1


* Re: [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs
  2011-05-18 21:04 [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Scott Wood
                   ` (5 preceding siblings ...)
  2011-05-18 21:05 ` [PATCH 7/7] powerpc/e5500: set MMU_FTR_USE_PAIRED_MAS Scott Wood
@ 2011-05-18 21:32 ` Benjamin Herrenschmidt
  2011-05-18 21:46   ` Scott Wood
  6 siblings, 1 reply; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-18 21:32 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev

On Wed, 2011-05-18 at 16:04 -0500, Scott Wood wrote:
> This allows a virtual page table to be used at the PMD rather than
> the PTE level.
> 
> Rather than adjust the constant in pgd_index() (or ignore it, as
> too-large values don't hurt as long as overly large addresses aren't
> passed in), go back to using PTRS_PER_PGD.  The overflow comment seems to
> apply to a very old implementation of free_pgtables that used pgd_index()
> (unfortunately the commit message, if you seek it out in the historic
> tree, doesn't mention any details about the overflow).  The existing
> value was numerically identical to the old 4K-page PTRS_PER_PGD, so
> using it shouldn't produce an overflow where it's not otherwise possible.
> 
> Also get rid of the incorrect comment at the top of pgtable-ppc64-4k.h.

Why do you want to create a virtual page table at the PMD level ? Also,
you are changing the geometry of the page tables which I think we don't
want. We chose that geometry so that the levels match the segment sizes
on server, I think it may have an impact with the hugetlbfs code (check
with David), it also was meant as a way to implement shared page tables
on hash64 tho we never published that.

Cheers,
Ben.

> Signed-off-by: Scott Wood <scottwood@freescale.com>
> ---
>  arch/powerpc/include/asm/pgtable-ppc64-4k.h |   12 ++++--------
>  arch/powerpc/include/asm/pgtable-ppc64.h    |    3 +--
>  2 files changed, 5 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/pgtable-ppc64-4k.h b/arch/powerpc/include/asm/pgtable-ppc64-4k.h
> index 6eefdcf..194005e 100644
> --- a/arch/powerpc/include/asm/pgtable-ppc64-4k.h
> +++ b/arch/powerpc/include/asm/pgtable-ppc64-4k.h
> @@ -1,14 +1,10 @@
>  #ifndef _ASM_POWERPC_PGTABLE_PPC64_4K_H
>  #define _ASM_POWERPC_PGTABLE_PPC64_4K_H
> -/*
> - * Entries per page directory level.  The PTE level must use a 64b record
> - * for each page table entry.  The PMD and PGD level use a 32b record for
> - * each entry by assuming that each entry is page aligned.
> - */
> +
>  #define PTE_INDEX_SIZE  9
> -#define PMD_INDEX_SIZE  7
> +#define PMD_INDEX_SIZE  9
>  #define PUD_INDEX_SIZE  7
> -#define PGD_INDEX_SIZE  9
> +#define PGD_INDEX_SIZE  7
>  
>  #ifndef __ASSEMBLY__
>  #define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
> @@ -19,7 +15,7 @@
>  
>  #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
>  #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
> -#define PTRS_PER_PUD	(1 << PMD_INDEX_SIZE)
> +#define PTRS_PER_PUD	(1 << PUD_INDEX_SIZE)
>  #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
>  
>  /* PMD_SHIFT determines what a second-level page table entry can map */
> diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
> index 2b09cd5..8bd1cd9 100644
> --- a/arch/powerpc/include/asm/pgtable-ppc64.h
> +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
> @@ -181,8 +181,7 @@
>   * Find an entry in a page-table-directory.  We combine the address region
>   * (the high order N bits) and the pgd portion of the address.
>   */
> -/* to avoid overflow in free_pgtables we don't use PTRS_PER_PGD here */
> -#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x1ff)
> +#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
>  
>  #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
>  


* Re: [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table
  2011-05-18 21:05 ` [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table Scott Wood
@ 2011-05-18 21:33   ` Benjamin Herrenschmidt
  2011-05-20 20:57     ` Scott Wood
  0 siblings, 1 reply; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-18 21:33 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev

On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> Loads with non-linear access patterns were producing a very high
> ratio of recursive pt faults to regular tlb misses.  Rather than
> choose between a 4-level table walk or a 1-level virtual page table
> lookup, use a hybrid scheme with a virtual linear pmd, followed by a
> 2-level lookup in the normal handler.
> 
> This adds about 5 cycles (assuming no cache misses, and e5500 timing)
> to a normal TLB miss, but greatly reduces the recursive fault rate
> for loads which don't have locality within 2 MiB regions but do have
> significant locality within 1 GiB regions.  Improvements of close to 50%
> were seen on such benchmarks.

Can you publish benchmarks that compare these two with no virtual at all
(4 full loads) ?

Cheers,
Ben.

> Signed-off-by: Scott Wood <scottwood@freescale.com>
> ---
>  arch/powerpc/mm/tlb_low_64e.S |   23 +++++++++++++++--------
>  1 files changed, 15 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
> index af08922..17726d3 100644
> --- a/arch/powerpc/mm/tlb_low_64e.S
> +++ b/arch/powerpc/mm/tlb_low_64e.S
> @@ -24,7 +24,7 @@
>  #ifdef CONFIG_PPC_64K_PAGES
>  #define VPTE_PMD_SHIFT	(PTE_INDEX_SIZE+1)
>  #else
> -#define VPTE_PMD_SHIFT	(PTE_INDEX_SIZE)
> +#define VPTE_PMD_SHIFT	0
>  #endif
>  #define VPTE_PUD_SHIFT	(VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
>  #define VPTE_PGD_SHIFT	(VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
> @@ -185,7 +185,7 @@ normal_tlb_miss:
>  	/* Insert the bottom bits in */
>  	rlwimi	r14,r15,0,16,31
>  #else
> -	rldicl	r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
> +	rldicl	r14,r16,64-(PMD_SHIFT-3),PMD_SHIFT-3+4
>  #endif
>  	sldi	r15,r10,60
>  	clrrdi	r14,r14,3
> @@ -202,6 +202,16 @@ MMU_FTR_SECTION_ELSE
>  	ld	r14,0(r10)
>  ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
>  
> +#ifndef CONFIG_PPC_64K_PAGES
> +	rldicl	r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3
> +	clrrdi	r15,r15,3
> +
> +	cmpldi	cr0,r14,0
> +	beq	normal_tlb_miss_access_fault
> +
> +	ldx	r14,r14,r15
> +#endif
> +
>  finish_normal_tlb_miss:
>  	/* Check if required permissions are met */
>  	andc.	r15,r11,r14
> @@ -353,14 +363,11 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
>  #ifndef CONFIG_PPC_64K_PAGES
>  	/* Get to PUD entry */
>  	rldicl	r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
> -	clrrdi	r10,r11,3
> -	ldx	r15,r10,r15
> -	cmpldi	cr0,r15,0
> -	beq	virt_page_table_tlb_miss_fault
> -#endif /* CONFIG_PPC_64K_PAGES */
> -
> +#else
>  	/* Get to PMD entry */
>  	rldicl	r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
> +#endif
> +
>  	clrrdi	r10,r11,3
>  	ldx	r15,r10,r15
>  	cmpldi	cr0,r15,0


* Re: [PATCH 5/7] powerpc/mm: 64-bit: don't handle non-standard page sizes
  2011-05-18 21:05 ` [PATCH 5/7] powerpc/mm: 64-bit: don't handle non-standard page sizes Scott Wood
@ 2011-05-18 21:36   ` Benjamin Herrenschmidt
  2011-05-18 21:50     ` Scott Wood
  0 siblings, 1 reply; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-18 21:36 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, David Gibson

On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> I don't see where any non-standard page size will be set in the
> kernel page tables, so don't waste time checking for it.  It wouldn't
> work with TLB0 on an FSL MMU anyway, so if there's something I missed
> (or which is out-of-tree), it's relying on implementation-specific
> behavior.  If there's an out-of-tree need for occasional 4K mappings
> with CONFIG_PPC_64K_PAGES, perhaps this check could only be done when
> that is defined.
> 
> Signed-off-by: Scott Wood <scottwood@freescale.com>
> ---

Do you use that in the hugetlbfs code ? Can you publish that code ? It's
long overdue...

Cheers,
Ben.

>  arch/powerpc/mm/tlb_low_64e.S |   13 -------------
>  1 files changed, 0 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
> index 922fece..e782023 100644
> --- a/arch/powerpc/mm/tlb_low_64e.S
> +++ b/arch/powerpc/mm/tlb_low_64e.S
> @@ -232,19 +232,6 @@ finish_normal_tlb_miss:
>  	rlwimi	r11,r14,32-19,27,31	/* Insert WIMGE */
>  	mtspr	SPRN_MAS2,r11
>  
> -	/* Check page size, if not standard, update MAS1 */
> -	rldicl	r11,r14,64-8,64-8
> -#ifdef CONFIG_PPC_64K_PAGES
> -	cmpldi	cr0,r11,BOOK3E_PAGESZ_64K
> -#else
> -	cmpldi	cr0,r11,BOOK3E_PAGESZ_4K
> -#endif
> -	beq-	1f
> -	mfspr	r11,SPRN_MAS1
> -	rlwimi	r11,r14,31,21,24
> -	rlwinm	r11,r11,0,21,19
> -	mtspr	SPRN_MAS1,r11
> -1:
>  	/* Move RPN in position */
>  	rldicr	r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
>  	clrldi	r15,r11,12		/* Clear crap at the top */


* Re: [PATCH 6/7] powerpc/mm: 64-bit: tlb handler micro-optimization
  2011-05-18 21:05 ` [PATCH 6/7] powerpc/mm: 64-bit: tlb handler micro-optimization Scott Wood
@ 2011-05-18 21:37   ` Benjamin Herrenschmidt
  2011-05-18 21:51     ` Scott Wood
  0 siblings, 1 reply; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-18 21:37 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev

On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> A little more speed up measured on e5500.
> 
> Setting of U0-3 is dropped as it is not used by Linux as far as I can
> see.

Please keep them for now. If your core doesn't have them, make them an
MMU feature.

Cheers,
Ben.

> Signed-off-by: Scott Wood <scottwood@freescale.com>
> ---
>  arch/powerpc/mm/tlb_low_64e.S |   21 ++++++++-------------
>  1 files changed, 8 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
> index e782023..a94c87b 100644
> --- a/arch/powerpc/mm/tlb_low_64e.S
> +++ b/arch/powerpc/mm/tlb_low_64e.S
> @@ -47,10 +47,10 @@
>  	 * We could probably also optimize by not saving SRR0/1 in the
>  	 * linear mapping case but I'll leave that for later
>  	 */
> -	mfspr	r14,SPRN_ESR
>  	mfspr	r16,SPRN_DEAR		/* get faulting address */
>  	srdi	r15,r16,60		/* get region */
>  	cmpldi	cr0,r15,0xc		/* linear mapping ? */
> +	mfspr	r14,SPRN_ESR
>  	TLB_MISS_STATS_SAVE_INFO
>  	beq	tlb_load_linear		/* yes -> go to linear map load */
>  
> @@ -62,11 +62,11 @@
>  	andi.	r10,r15,0x1
>  	bne-	virt_page_table_tlb_miss
>  
> -	std	r14,EX_TLB_ESR(r12);	/* save ESR */
> -	std	r16,EX_TLB_DEAR(r12);	/* save DEAR */
> +	/* We need _PAGE_PRESENT and  _PAGE_ACCESSED set */
>  
> -	 /* We need _PAGE_PRESENT and  _PAGE_ACCESSED set */
> +	std	r14,EX_TLB_ESR(r12);	/* save ESR */
>  	li	r11,_PAGE_PRESENT
> +	std	r16,EX_TLB_DEAR(r12);	/* save DEAR */
>  	oris	r11,r11,_PAGE_ACCESSED@h
>  
>  	/* We do the user/kernel test for the PID here along with the RW test
> @@ -225,21 +225,16 @@ finish_normal_tlb_miss:
>  	 *                 yet implemented for now
>  	 * MAS 2   :	Defaults not useful, need to be redone
>  	 * MAS 3+7 :	Needs to be done
> -	 *
> -	 * TODO: mix up code below for better scheduling
>  	 */
>  	clrrdi	r11,r16,12		/* Clear low crap in EA */
> +	rldicr	r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
>  	rlwimi	r11,r14,32-19,27,31	/* Insert WIMGE */
> +	clrldi	r15,r15,12		/* Clear crap at the top */
>  	mtspr	SPRN_MAS2,r11
> -
> -	/* Move RPN in position */
> -	rldicr	r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
> -	clrldi	r15,r11,12		/* Clear crap at the top */
> -	rlwimi	r15,r14,32-8,22,25	/* Move in U bits */
> +	andi.	r11,r14,_PAGE_DIRTY
>  	rlwimi	r15,r14,32-2,26,31	/* Move in BAP bits */
>  
>  	/* Mask out SW and UW if !DIRTY (XXX optimize this !) */
> -	andi.	r11,r14,_PAGE_DIRTY
>  	bne	1f
>  	li	r11,MAS3_SW|MAS3_UW
>  	andc	r15,r15,r11
> @@ -483,10 +478,10 @@ virt_page_table_tlb_miss_whacko_fault:
>  	 * We could probably also optimize by not saving SRR0/1 in the
>  	 * linear mapping case but I'll leave that for later
>  	 */
> -	mfspr	r14,SPRN_ESR
>  	mfspr	r16,SPRN_DEAR		/* get faulting address */
>  	srdi	r11,r16,60		/* get region */
>  	cmpldi	cr0,r11,0xc		/* linear mapping ? */
> +	mfspr	r14,SPRN_ESR
>  	TLB_MISS_STATS_SAVE_INFO
>  	beq	tlb_load_linear		/* yes -> go to linear map load */
>  


* Re: [PATCH 7/7] powerpc/e5500: set MMU_FTR_USE_PAIRED_MAS
  2011-05-18 21:05 ` [PATCH 7/7] powerpc/e5500: set MMU_FTR_USE_PAIRED_MAS Scott Wood
@ 2011-05-18 21:38   ` Benjamin Herrenschmidt
  2011-05-18 21:52     ` Scott Wood
  0 siblings, 1 reply; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-18 21:38 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev

On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> Signed-off-by: Scott Wood <scottwood@freescale.com>
> ---
> Is there any 64-bit book3e chip that doesn't support this?  It
> doesn't appear to be optional in the ISA.

Not afaik.

Cheers,
Ben.

>  arch/powerpc/kernel/cputable.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
> index 34d2722..a3b8eeb 100644
> --- a/arch/powerpc/kernel/cputable.c
> +++ b/arch/powerpc/kernel/cputable.c
> @@ -1981,7 +1981,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
>  		.cpu_features		= CPU_FTRS_E5500,
>  		.cpu_user_features	= COMMON_USER_BOOKE,
>  		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
> -			MMU_FTR_USE_TLBILX,
> +			MMU_FTR_USE_TLBILX | MMU_FTR_USE_PAIRED_MAS,
>  		.icache_bsize		= 64,
>  		.dcache_bsize		= 64,
>  		.num_pmcs		= 4,


* Re: [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs
  2011-05-18 21:32 ` [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Benjamin Herrenschmidt
@ 2011-05-18 21:46   ` Scott Wood
  2011-05-18 21:52     ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:46 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

On Thu, 19 May 2011 07:32:41 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2011-05-18 at 16:04 -0500, Scott Wood wrote:
> > This allows a virtual page table to be used at the PMD rather than
> > the PTE level.
> > 
> > Rather than adjust the constant in pgd_index() (or ignore it, as
> > too-large values don't hurt as long as overly large addresses aren't
> > passed in), go back to using PTRS_PER_PGD.  The overflow comment seems to
> > apply to a very old implementation of free_pgtables that used pgd_index()
> > (unfortunately the commit message, if you seek it out in the historic
> > tree, doesn't mention any details about the overflow).  The existing
> > value was numerically identical to the old 4K-page PTRS_PER_PGD, so
> > using it shouldn't produce an overflow where it's not otherwise possible.
> > 
> > Also get rid of the incorrect comment at the top of pgtable-ppc64-4k.h.
> 
> Why do you want to create a virtual page table at the PMD level ? Also,
> you are changing the geometry of the page tables which I think we don't
> want. We chose that geometry so that the levels match the segment sizes
> on server, I think it may have an impact with the hugetlbfs code (check
> with David), it also was meant as a way to implement shared page tables
> on hash64 tho we never published that.

The number of virtual page table misses were very high on certain loads.
Cutting back to a virtual PMD eliminates most of that for the benchmark I
tested, though it could still be painful for access patterns that are
extremely spread out through the 64-bit address space.  I'll try a full
4-level walk and see what the performance is like; I was aiming for a
compromise between random access and linear/localized access.

Why does it need to match segment sizes on server?

As for hugetlbfs, it merged easily enough with Becky's patches (you'll have
to ask her when they'll be published).

-Scott


* Re: [PATCH 5/7] powerpc/mm: 64-bit: don't handle non-standard page sizes
  2011-05-18 21:36   ` Benjamin Herrenschmidt
@ 2011-05-18 21:50     ` Scott Wood
  2011-05-18 21:54       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:50 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, David Gibson

On Thu, 19 May 2011 07:36:04 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> > I don't see where any non-standard page size will be set in the
> > kernel page tables, so don't waste time checking for it.  It wouldn't
> > work with TLB0 on an FSL MMU anyway, so if there's something I missed
> > (or which is out-of-tree), it's relying on implementation-specific
> > behavior.  If there's an out-of-tree need for occasional 4K mappings
> > with CONFIG_PPC_64K_PAGES, perhaps this check could only be done when
> > that is defined.
> > 
> > Signed-off-by: Scott Wood <scottwood@freescale.com>
> > ---
> 
> Do you use that in the hugetlbfs code ? Can you publish that code ? It's
> long overdue...

hugetlbfs entries don't get loaded by this code.  It branches to a slow
path based on seeing a positive value in a pgd/pud/pmd entry.

-Scott


* Re: [PATCH 6/7] powerpc/mm: 64-bit: tlb handler micro-optimization
  2011-05-18 21:37   ` Benjamin Herrenschmidt
@ 2011-05-18 21:51     ` Scott Wood
  2011-05-18 21:54       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:51 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

On Thu, 19 May 2011 07:37:47 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> > A little more speed up measured on e5500.
> > 
> > Setting of U0-3 is dropped as it is not used by Linux as far as I can
> > see.
> 
> Please keep them for now. If your core doesn't have them, make them an
> MMU feature.

We have them, it was just an attempt to clean out unused things to speed up
the miss handler.  I'll drop that part if you think we'll use it in the
future.

-Scott


* Re: [PATCH 7/7] powerpc/e5500: set MMU_FTR_USE_PAIRED_MAS
  2011-05-18 21:38   ` Benjamin Herrenschmidt
@ 2011-05-18 21:52     ` Scott Wood
  2011-05-18 21:58       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-18 21:52 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

On Thu, 19 May 2011 07:38:19 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> > Signed-off-by: Scott Wood <scottwood@freescale.com>
> > ---
> > Is there any 64-bit book3e chip that doesn't support this?  It
> > doesn't appear to be optional in the ISA.
> 
> Not afaik.

Any objection to just removing the feature bit?

-Scott


* Re: [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs
  2011-05-18 21:46   ` Scott Wood
@ 2011-05-18 21:52     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-18 21:52 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev


> > Why do you want to create a virtual page table at the PMD level ? Also,
> > you are changing the geometry of the page tables which I think we don't
> > want. We chose that geometry so that the levels match the segment sizes
> > on server, I think it may have an impact with the hugetlbfs code (check
> > with David), it also was meant as a way to implement shared page tables
> > on hash64 tho we never published that.
> 
> The number of virtual page table misses were very high on certain loads.
> Cutting back to a virtual PMD eliminates most of that for the benchmark I
> tested, though it could still be painful for access patterns that are
> extremely spread out through the 64-bit address space.  I'll try a full
> 4-level walk and see what the performance is like; I was aiming for a
> compromise between random access and linear/localized access.

Let's get more numbers first then :-)

> Why does it need to match segment sizes on server?

I'm not sure whether we have a dependency with hugetlbfs there, I need
to check (remember we have one page size per segment there). As for
sharing page tables, that came from us using the PMD pointer as a base
to calculate the VSIDs. But I don't think we have plans to revive those
patches in the immediate future.

Cheers,
Ben.

> As for hugetlbfs, it merged easily enough with Becky's patches (you'll have
> to ask her when they'll be published).
> 
> -Scott


* Re: [PATCH 5/7] powerpc/mm: 64-bit: don't handle non-standard page sizes
  2011-05-18 21:50     ` Scott Wood
@ 2011-05-18 21:54       ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-18 21:54 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, David Gibson

On Wed, 2011-05-18 at 16:50 -0500, Scott Wood wrote:
> On Thu, 19 May 2011 07:36:04 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> > > I don't see where any non-standard page size will be set in the
> > > kernel page tables, so don't waste time checking for it.  It wouldn't
> > > work with TLB0 on an FSL MMU anyway, so if there's something I missed
> > > (or which is out-of-tree), it's relying on implementation-specific
> > > behavior.  If there's an out-of-tree need for occasional 4K mappings
> > > with CONFIG_PPC_64K_PAGES, perhaps this check could only be done when
> > > that is defined.
> > > 
> > > Signed-off-by: Scott Wood <scottwood@freescale.com>
> > > ---
> > 
> > Do you use that in the hugetlbfs code ? Can you publish that code ? It's
> > long overdue...
> 
> hugetlbfs entries don't get loaded by this code.  It branches to a slow
> path based on seeing a positive value in a pgd/pud/pmd entry.

BTW, the "long overdue" was aimed at David, to get A2 hugetlbfs out :-)

Cheers,
Ben.


* Re: [PATCH 6/7] powerpc/mm: 64-bit: tlb handler micro-optimization
  2011-05-18 21:51     ` Scott Wood
@ 2011-05-18 21:54       ` Benjamin Herrenschmidt
  2011-05-18 22:27         ` Scott Wood
  0 siblings, 1 reply; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-18 21:54 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev

On Wed, 2011-05-18 at 16:51 -0500, Scott Wood wrote:
> On Thu, 19 May 2011 07:37:47 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> > > A little more speed up measured on e5500.
> > > 
> > > Setting of U0-3 is dropped as it is not used by Linux as far as I can
> > > see.
> > 
> > Please keep them for now. If your core doesn't have them, make them an
> > MMU feature.
> 
> We have them, it was just an attempt to clean out unused things to speed up
> the miss handler.  I'll drop that part if you think we'll use it in the
> future.

I never know for sure ... damn research people ... :-)

I'd rather keep them for now, does it make a significant difference ?

Cheers,
Ben.


* Re: [PATCH 7/7] powerpc/e5500: set MMU_FTR_USE_PAIRED_MAS
  2011-05-18 21:52     ` Scott Wood
@ 2011-05-18 21:58       ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-18 21:58 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev

On Wed, 2011-05-18 at 16:52 -0500, Scott Wood wrote:
> On Thu, 19 May 2011 07:38:19 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> > > Signed-off-by: Scott Wood <scottwood@freescale.com>
> > > ---
> > > Is there any 64-bit book3e chip that doesn't support this?  It
> > > doesn't appear to be optional in the ISA.
> > 
> > Not afaik.
> 
> Any objection to just removing the feature bit?

Nope. Wasn't it added by Kumar in the first place ?

Cheers,
Ben.


* Re: [PATCH 6/7] powerpc/mm: 64-bit: tlb handler micro-optimization
  2011-05-18 21:54       ` Benjamin Herrenschmidt
@ 2011-05-18 22:27         ` Scott Wood
  0 siblings, 0 replies; 27+ messages in thread
From: Scott Wood @ 2011-05-18 22:27 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

On Thu, 19 May 2011 07:54:48 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2011-05-18 at 16:51 -0500, Scott Wood wrote:
> > On Thu, 19 May 2011 07:37:47 +1000
> > Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> > 
> > > On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> > > > A little more speed up measured on e5500.
> > > > 
> > > > Setting of U0-3 is dropped as it is not used by Linux as far as I can
> > > > see.
> > > 
> > > Please keep them for now. If your core doesn't have them, make them an
> > > MMU feature.
> > 
> > We have them, it was just an attempt to clean out unused things to speed up
> > the miss handler.  I'll drop that part if you think we'll use it in the
> > future.
> 
> I never know for sure ... damn research people ... :-)
> 
> I'd rather keep them for now, does it make a significant difference ?

It was minor but measurable (wouldn't have been worthwhile except as part
of a series of small things that add up), but upon trying again I was able
to reorder slightly and fit it in without seeing an impact.

-Scott


* Re: [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table
  2011-05-18 21:33   ` Benjamin Herrenschmidt
@ 2011-05-20 20:57     ` Scott Wood
  2011-05-20 22:15       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-20 20:57 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

On Thu, 19 May 2011 07:33:55 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2011-05-18 at 16:05 -0500, Scott Wood wrote:
> > Loads with non-linear access patterns were producing a very high
> > ratio of recursive pt faults to regular tlb misses.  Rather than
> > choose between a 4-level table walk or a 1-level virtual page table
> > lookup, use a hybrid scheme with a virtual linear pmd, followed by a
> > 2-level lookup in the normal handler.
> > 
> > This adds about 5 cycles (assuming no cache misses, and e5500 timing)
> > to a normal TLB miss, but greatly reduces the recursive fault rate
> > for loads which don't have locality within 2 MiB regions but do have
> > significant locality within 1 GiB regions.  Improvements of close to 50%
> > were seen on such benchmarks.
> 
> Can you publish benchmarks that compare these two with no virtual at all
> (4 full loads) ?

I see a 2% cost going from virtual pmd to full 4-level walk in the
benchmark mentioned above (some type of sort), and just under 3% in
page-stride lat_mem_rd from lmbench.

OTOH, the virtual pmd approach still leaves the possibility of taking a
bunch of virtual page table misses if non-localized accesses happen over a
very large chunk of address space (tens of GiB), and we'd have one fewer
type of TLB miss to worry about complexity-wise with a straight table walk.

Let me know what you'd prefer.

-Scott


* Re: [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table
  2011-05-20 20:57     ` Scott Wood
@ 2011-05-20 22:15       ` Benjamin Herrenschmidt
  2011-05-23 18:54         ` Scott Wood
  0 siblings, 1 reply; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-20 22:15 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev

On Fri, 2011-05-20 at 15:57 -0500, Scott Wood wrote:

> I see a 2% cost going from virtual pmd to full 4-level walk in the
> benchmark mentioned above (some type of sort), and just under 3% in
> page-stride lat_mem_rd from lmbench.
> 
> OTOH, the virtual pmd approach still leaves the possibility of taking a
> bunch of virtual page table misses if non-localized accesses happen over a
> very large chunk of address space (tens of GiB), and we'd have one fewer
> type of TLB miss to worry about complexity-wise with a straight table walk.
> 
> Let me know what you'd prefer.

I'm tempted to kill the virtual linear feature altogether.. it didn't
buy us that much. Have you looked if you can snatch back some of those
cycles with hand tuning of the level walker ?

Would it work/help to have a simple cache of the last pmd & address and
compare just that ? Maybe in a SPRG or a known cache hot location like
the PACA in a line that we already load anyways ?

Cheers,
Ben.


* Re: [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table
  2011-05-20 22:15       ` Benjamin Herrenschmidt
@ 2011-05-23 18:54         ` Scott Wood
  2011-05-23 20:51           ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-23 18:54 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

On Sat, 21 May 2011 08:15:36 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Fri, 2011-05-20 at 15:57 -0500, Scott Wood wrote:
> 
> > I see a 2% cost going from virtual pmd to full 4-level walk in the
> > benchmark mentioned above (some type of sort), and just under 3% in
> > page-stride lat_mem_rd from lmbench.
> > 
> > OTOH, the virtual pmd approach still leaves the possibility of taking a
> > bunch of virtual page table misses if non-localized accesses happen over a
> > very large chunk of address space (tens of GiB), and we'd have one fewer
> > type of TLB miss to worry about complexity-wise with a straight table walk.
> > 
> > Let me know what you'd prefer.
> 
> I'm tempted to kill the virtual linear feature altogether.. it didn't
> buy us that much. Have you looked if you can snatch back some of those
> cycles with hand tuning of the level walker ?

That's after trying a bit of that (pulled the pgd load up before
normal_tlb_miss, and some other reordering).  Not sure how much more can be
squeezed out of it with such techniques, at least with e5500.

Hmm, in the normal miss case we know we're in the first EXTLB level,
right?  So we could cut out a load/mfspr by subtracting EXTLB from r12
to get the PACA (that load's latency is pretty well buried, but maybe we
could replace it with loading pgd, replacing it later if it's a kernel
region).  Maybe move pgd to the first EXTLB, so it's in the same cache line
as the state save data. The PACA cacheline containing pgd is probably
pretty hot in normal kernel code, but not so much in a long stretch of
userspace plus TLB misses (other than for pgd itself).
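
In C terms that's just container_of-style pointer arithmetic -- roughly
the sketch below, where the struct layout is invented for illustration
(the real fields are in <asm/paca.h>):

#include <stddef.h>

/* Toy stand-in for struct paca_struct; the only detail that matters
 * here is an embedded EXTLB save area.  Layout is an assumption. */
struct paca_struct {
	/* ... earlier fields ... */
	unsigned long extlb[3][8];	/* exception save levels, sizes invented */
};

/* If r12 already holds the address of extlb[0], the PACA base falls
 * out of a constant subtraction -- no mfspr or load needed. */
static inline struct paca_struct *paca_from_extlb(unsigned long *extlb0)
{
	return (struct paca_struct *)
		((char *)extlb0 - offsetof(struct paca_struct, extlb));
}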

> Would it work/help to have a simple cache of the last pmd & address, and
> compare just that?

Maybe.

It would still slow down the case where you miss that cache -- not by as
much as a virtual page table miss (and it wouldn't compete for TLB entries
with actual user pages), but it would happen more often, since you'd only be
able to cache one pmd.

> Maybe in an SPRG or a known cache-hot location like
> the PACA, in a line that we already load anyway?

A cache access is faster than an SPRG access on our chips (plus we don't
have many SPRGs to spare, especially if we want to avoid swapping SPRG4-7
on guest entry/exit in KVM), so I'd favor putting it in the PACA.

I'll try this stuff out and see what helps.

-Scott

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table
  2011-05-23 18:54         ` Scott Wood
@ 2011-05-23 20:51           ` Benjamin Herrenschmidt
  2011-05-23 23:31             ` Scott Wood
  0 siblings, 1 reply; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-23 20:51 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev

On Mon, 2011-05-23 at 13:54 -0500, Scott Wood wrote:
> On Sat, 21 May 2011 08:15:36 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > On Fri, 2011-05-20 at 15:57 -0500, Scott Wood wrote:
> > 
> > > I see a 2% cost going from virtual pmd to full 4-level walk in the
> > > benchmark mentioned above (some type of sort), and just under 3% in
> > > page-stride lat_mem_rd from lmbench.
> > > 
> > > OTOH, the virtual pmd approach still leaves the possibility of taking a
> > > bunch of virtual page table misses if non-localized accesses happen over a
> > > very large chunk of address space (tens of GiB), and we'd have one fewer
> > > type of TLB miss to worry about complexity-wise with a straight table walk.
> > > 
> > > Let me know what you'd prefer.
> > 
> > I'm tempted to kill the virtual linear feature altogether... it didn't
> > buy us that much.  Have you looked at whether you can snatch back some of
> > those cycles with hand-tuning of the level walker?
> 
> Those numbers already reflect some of that (I pulled the pgd load up
> before normal_tlb_miss, and did some other reordering).  Not sure how much
> more can be squeezed out with such techniques, at least on e5500.
> 
> Hmm, in the normal miss case we know we're in the first EXTLB level,
> right?  So we could cut out a load/mfspr by subtracting EXTLB from r12
> to get the PACA (that load's latency is pretty well buried, but maybe we
> could replace it with loading pgd, replacing it later if it's a kernel
> region).  Maybe move pgd to the first EXTLB, so it's in the same cache line
> as the state save data. The PACA cacheline containing pgd is probably
> pretty hot in normal kernel code, but not so much in a long stretch of
> userspace plus TLB misses (other than for pgd itself).

Is your linear mapping bolted?  If it is, you may be able to cut out most
of the save/restore stuff (SRR0, SRR1, ...), since with a normal walk you
won't take nested misses.
 
> > Would it work/help to have a simple cache of the last pmd & address, and
> > compare just that?
> 
> Maybe.
> 
> It would still slow down the case where you miss that cache -- not by as
> much as a virtual page table miss (and it wouldn't compete for TLB entries
> with actual user pages), but it would happen more often, since you'd only be
> able to cache one pmd.
>
> > Maybe in an SPRG or a known cache-hot location like
> > the PACA, in a line that we already load anyway?
> 
> A cache access is faster than an SPRG access on our chips (plus we don't
> have many SPRGs to spare, especially if we want to avoid swapping SPRG4-7
> on guest entry/exit in KVM), so I'd favor putting it in the PACA.
> 
> I'll try this stuff out and see what helps.

Cool,

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table
  2011-05-23 20:51           ` Benjamin Herrenschmidt
@ 2011-05-23 23:31             ` Scott Wood
  2011-05-24  2:52               ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 27+ messages in thread
From: Scott Wood @ 2011-05-23 23:31 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

On Tue, 24 May 2011 06:51:01 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> Is your linear mapping bolted?  If it is, you may be able to cut out most
> of the save/restore stuff (SRR0, SRR1, ...), since with a normal walk you
> won't take nested misses.

It is bolted -- we ignore anything we can't map with 16 entries.  The only
semi-realistic case I can think of where we might bump into that limit
(and thus want non-bolted memory) with more than negligible loss relative
to the size of memory is AMP with a non-zero start address, where
alignment forces us to stick with smaller pages.  Even so, 16 times the
alignment of the start of RAM doesn't seem that unreasonable a limit.  The
32-bit limit of 3 entries for lowmem is a bit more troublesome there.
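
(To put rough numbers on that, purely as an illustration: a partition
whose RAM starts at 0x10000000 has 256 MiB alignment, so the rule of
thumb above caps bolted coverage at 16 * 256 MiB = 4 GiB; anything we
can't fit under that gets ignored.)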

-Scott

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table
  2011-05-23 23:31             ` Scott Wood
@ 2011-05-24  2:52               ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 27+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-24  2:52 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev

On Mon, 2011-05-23 at 18:31 -0500, Scott Wood wrote:
> On Tue, 24 May 2011 06:51:01 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > Is your linear mapping bolted?  If it is, you may be able to cut out most
> > of the save/restore stuff (SRR0, SRR1, ...), since with a normal walk you
> > won't take nested misses.
> 
> It is bolted -- we ignore anything we can't map with 16 entries.  The only
> semi-realistic case I can think of where we might bump into that limit
> (and thus want non-bolted memory) with more than negligible loss relative
> to the size of memory is AMP with a non-zero start address, where
> alignment forces us to stick with smaller pages.  Even so, 16 times the
> alignment of the start of RAM doesn't seem that unreasonable a limit.  The
> 32-bit limit of 3 entries for lowmem is a bit more troublesome there.

Ok, so in this case it might be worth doing a separate set of TLB miss
handlers without all the context save/restore business... it would also
make re-entrant CRITs and MCs easier to deal with.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2011-05-24  2:52 UTC | newest]

Thread overview: 27+ messages
2011-05-18 21:04 [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Scott Wood
2011-05-18 21:05 ` [PATCH 2/7] powerpc/mm: 64-bit 4k: use a PMD-based virtual page table Scott Wood
2011-05-18 21:33   ` Benjamin Herrenschmidt
2011-05-20 20:57     ` Scott Wood
2011-05-20 22:15       ` Benjamin Herrenschmidt
2011-05-23 18:54         ` Scott Wood
2011-05-23 20:51           ` Benjamin Herrenschmidt
2011-05-23 23:31             ` Scott Wood
2011-05-24  2:52               ` Benjamin Herrenschmidt
2011-05-18 21:05 ` [PATCH 3/7] powerpc/mm: 64-bit tlb miss: get PACA from memory rather than SPR Scott Wood
2011-05-18 21:05 ` [PATCH 4/7] powerpc/mm: 64-bit: Don't load PACA in normal TLB miss exceptions Scott Wood
2011-05-18 21:05 ` [PATCH 5/7] powerpc/mm: 64-bit: don't handle non-standard page sizes Scott Wood
2011-05-18 21:36   ` Benjamin Herrenschmidt
2011-05-18 21:50     ` Scott Wood
2011-05-18 21:54       ` Benjamin Herrenschmidt
2011-05-18 21:05 ` [PATCH 6/7] powerpc/mm: 64-bit: tlb handler micro-optimization Scott Wood
2011-05-18 21:37   ` Benjamin Herrenschmidt
2011-05-18 21:51     ` Scott Wood
2011-05-18 21:54       ` Benjamin Herrenschmidt
2011-05-18 22:27         ` Scott Wood
2011-05-18 21:05 ` [PATCH 7/7] powerpc/e5500: set MMU_FTR_USE_PAIRED_MAS Scott Wood
2011-05-18 21:38   ` Benjamin Herrenschmidt
2011-05-18 21:52     ` Scott Wood
2011-05-18 21:58       ` Benjamin Herrenschmidt
2011-05-18 21:32 ` [PATCH 1/7] powerpc/mm: 64-bit 4k: use page-sized PMDs Benjamin Herrenschmidt
2011-05-18 21:46   ` Scott Wood
2011-05-18 21:52     ` Benjamin Herrenschmidt
