* [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use
@ 2018-02-10  8:11 Nicholas Piggin
  2018-02-10  8:11 ` [RFC PATCH 1/5] powerpc/mm/slice: pass pointers to struct slice_mask where possible Nicholas Piggin
                   ` (5 more replies)
  0 siblings, 6 replies; 11+ messages in thread
From: Nicholas Piggin @ 2018-02-10  8:11 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V, Christophe Leroy

This series intends to improve performance and reduce stack
consumption in the slice allocation code. It does so by keeping slice
masks in the mm_context rather than computing them for each
allocation, and by removing bitmaps and slice_masks from the stack,
using pointers instead where possible.

checkstack.pl gives, before:
0x00000de4 slice_get_unmapped_area [slice.o]:           656
0x00001b4c is_hugepage_only_range [slice.o]:            512
0x0000075c slice_find_area_topdown [slice.o]:           416
0x000004c8 slice_find_area_bottomup.isra.1 [slice.o]:   272
0x00001aa0 slice_set_range_psize [slice.o]:             240
0x00000a64 slice_find_area [slice.o]:                   176
0x00000174 slice_check_fit [slice.o]:                   112

after:
0x00000d70 slice_get_unmapped_area [slice.o]:           320
0x000008f8 slice_find_area [slice.o]:                   144
0x00001860 slice_set_range_psize [slice.o]:             144
0x000018ec is_hugepage_only_range [slice.o]:            144
0x00000750 slice_find_area_bottomup.isra.4 [slice.o]:   128

The benchmark in https://github.com/linuxppc/linux/issues/49 gives, before:
$ time ./slicemask 
real	0m20.712s
user	0m5.830s
sys	0m15.105s

after:
$ time ./slicemask 
real	0m13.197s
user	0m5.409s
sys	0m7.779s
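
For reference, the workload is essentially a tight single-page
mmap/munmap loop. A minimal sketch of such a microbenchmark (the real
program is the one at the github issue above; the page size handling
and iteration count here are illustrative only):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	unsigned long i;

	for (i = 0; i < 10000000; i++) {
		/* map and immediately unmap a single anonymous page */
		void *p = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED) {
			perror("mmap");
			exit(1);
		}
		munmap(p, page_size);
	}
	return 0;
}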

Thanks,
Nick

Nicholas Piggin (5):
  powerpc/mm/slice: pass pointers to struct slice_mask where possible
  powerpc/mm/slice: implement a slice mask cache
  powerpc/mm/slice: implement slice_check_range_fits
  powerpc/mm/slice: Use const pointers to cached slice masks where
    possible
  powerpc/mm/slice: use the dynamic high slice size to limit bitmap
    operations

 arch/powerpc/include/asm/book3s/64/mmu.h |  20 +-
 arch/powerpc/mm/slice.c                  | 302 +++++++++++++++++++------------
 2 files changed, 204 insertions(+), 118 deletions(-)

-- 
2.15.1

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [RFC PATCH 1/5] powerpc/mm/slice: pass pointers to struct slice_mask where possible
  2018-02-10  8:11 [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Nicholas Piggin
@ 2018-02-10  8:11 ` Nicholas Piggin
  2018-02-10  8:11 ` [RFC PATCH 2/5] powerpc/mm/slice: implement a slice mask cache Nicholas Piggin
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Nicholas Piggin @ 2018-02-10  8:11 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V, Christophe Leroy

Pass around const pointers to struct slice_mask where possible, rather
than copies of slice_mask, to reduce stack and call overhead.
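
As a rough illustration of why the copies matter (not part of the
patch; the 512 high slice count is the book3s/64 value assumed here):

	struct slice_mask {
		u64 low_slices;				/*  8 bytes */
		DECLARE_BITMAP(high_slices, 512);	/* 64 bytes  */
	};

Each by-value argument or local temporary of this type costs about 72
bytes of stack plus the copy itself, versus 8 bytes for a const
pointer.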

checkstack.pl gives, before:
0x00000de4 slice_get_unmapped_area [slice.o]:		656
0x00001b4c is_hugepage_only_range [slice.o]:		512
0x0000075c slice_find_area_topdown [slice.o]:		416
0x000004c8 slice_find_area_bottomup.isra.1 [slice.o]:	272
0x00001aa0 slice_set_range_psize [slice.o]:		240
0x00000a64 slice_find_area [slice.o]:			176
0x00000174 slice_check_fit [slice.o]:			112

after:
0x00000bd4 slice_get_unmapped_area [slice.o]:		496
0x000017cc is_hugepage_only_range [slice.o]:		352
0x00000758 slice_find_area [slice.o]:			144
0x00001750 slice_set_range_psize [slice.o]:		144
0x00000180 slice_check_fit [slice.o]:			128
0x000005b0 slice_find_area_bottomup.isra.2 [slice.o]:	128

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/slice.c | 83 +++++++++++++++++++++++++++----------------------
 1 file changed, 45 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 23ec2c5e3b78..e8f6922d3c9b 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -50,19 +50,21 @@ struct slice_mask {
 #ifdef DEBUG
 int _slice_debug = 1;
 
-static void slice_print_mask(const char *label, struct slice_mask mask)
+static void slice_print_mask(const char *label, const struct slice_mask *mask)
 {
 	if (!_slice_debug)
 		return;
-	pr_devel("%s low_slice: %*pbl\n", label, (int)SLICE_NUM_LOW, &mask.low_slices);
-	pr_devel("%s high_slice: %*pbl\n", label, (int)SLICE_NUM_HIGH, mask.high_slices);
+	pr_devel("%s low_slice: %*pbl\n", label,
+			(int)SLICE_NUM_LOW, &mask->low_slices);
+	pr_devel("%s high_slice: %*pbl\n", label,
+			(int)SLICE_NUM_HIGH, mask->high_slices);
 }
 
 #define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0)
 
 #else
 
-static void slice_print_mask(const char *label, struct slice_mask mask) {}
+static void slice_print_mask(const char *label, const struct slice_mask *mask) {}
 #define slice_dbg(fmt...)
 
 #endif
@@ -142,7 +144,8 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret,
 			__set_bit(i, ret->high_slices);
 }
 
-static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret,
+static void slice_mask_for_size(struct mm_struct *mm, int psize,
+				struct slice_mask *ret,
 				unsigned long high_limit)
 {
 	unsigned char *hpsizes;
@@ -171,7 +174,8 @@ static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_ma
 }
 
 static int slice_check_fit(struct mm_struct *mm,
-			   struct slice_mask mask, struct slice_mask available)
+			   const struct slice_mask *mask,
+			   const struct slice_mask *available)
 {
 	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
 	/*
@@ -180,11 +184,11 @@ static int slice_check_fit(struct mm_struct *mm,
 	 */
 	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit);
 
-	bitmap_and(result, mask.high_slices,
-		   available.high_slices, slice_count);
+	bitmap_and(result, mask->high_slices,
+		   available->high_slices, slice_count);
 
-	return (mask.low_slices & available.low_slices) == mask.low_slices &&
-		bitmap_equal(result, mask.high_slices, slice_count);
+	return (mask->low_slices & available->low_slices) == mask->low_slices &&
+		bitmap_equal(result, mask->high_slices, slice_count);
 }
 
 static void slice_flush_segments(void *parm)
@@ -202,7 +206,8 @@ static void slice_flush_segments(void *parm)
 	local_irq_restore(flags);
 }
 
-static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
+static void slice_convert(struct mm_struct *mm,
+				const struct slice_mask *mask, int psize)
 {
 	int index, mask_index;
 	/* Write the new slice psize bits */
@@ -220,7 +225,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 
 	lpsizes = mm->context.low_slices_psize;
 	for (i = 0; i < SLICE_NUM_LOW; i++)
-		if (mask.low_slices & (1u << i))
+		if (mask->low_slices & (1u << i))
 			lpsizes = (lpsizes & ~(0xful << (i * 4))) |
 				(((unsigned long)psize) << (i * 4));
 
@@ -231,7 +236,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
-		if (test_bit(i, mask.high_slices))
+		if (test_bit(i, mask->high_slices))
 			hpsizes[index] = (hpsizes[index] &
 					  ~(0xf << (mask_index * 4))) |
 				(((unsigned long)psize) << (mask_index * 4));
@@ -254,26 +259,25 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
  * 'available' slice_mark.
  */
 static bool slice_scan_available(unsigned long addr,
-				 struct slice_mask available,
-				 int end,
-				 unsigned long *boundary_addr)
+				 const struct slice_mask *available,
+				 int end, unsigned long *boundary_addr)
 {
 	unsigned long slice;
 	if (addr < SLICE_LOW_TOP) {
 		slice = GET_LOW_SLICE_INDEX(addr);
 		*boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
-		return !!(available.low_slices & (1u << slice));
+		return !!(available->low_slices & (1u << slice));
 	} else {
 		slice = GET_HIGH_SLICE_INDEX(addr);
 		*boundary_addr = (slice + end) ?
 			((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
-		return !!test_bit(slice, available.high_slices);
+		return !!test_bit(slice, available->high_slices);
 	}
 }
 
 static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 					      unsigned long len,
-					      struct slice_mask available,
+					      const struct slice_mask *available,
 					      int psize, unsigned long high_limit)
 {
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
@@ -319,7 +323,7 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 
 static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 					     unsigned long len,
-					     struct slice_mask available,
+					     const struct slice_mask *available,
 					     int psize, unsigned long high_limit)
 {
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
@@ -377,7 +381,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 
 
 static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
-				     struct slice_mask mask, int psize,
+				     const struct slice_mask *mask, int psize,
 				     int topdown, unsigned long high_limit)
 {
 	if (topdown)
@@ -386,7 +390,8 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
 		return slice_find_area_bottomup(mm, len, mask, psize, high_limit);
 }
 
-static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src)
+static inline void slice_or_mask(struct slice_mask *dst,
+					const struct slice_mask *src)
 {
 	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
 
@@ -395,7 +400,8 @@ static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src)
 	bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
 }
 
-static inline void slice_andnot_mask(struct slice_mask *dst, struct slice_mask *src)
+static inline void slice_andnot_mask(struct slice_mask *dst,
+					const struct slice_mask *src)
 {
 	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
 
@@ -482,7 +488,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	 * already
 	 */
 	slice_mask_for_size(mm, psize, &good_mask, high_limit);
-	slice_print_mask(" good_mask", good_mask);
+	slice_print_mask(" good_mask", &good_mask);
 
 	/*
 	 * Here "good" means slices that are already the right page size,
@@ -516,12 +522,12 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	if (addr != 0 || fixed) {
 		/* Build a mask for the requested range */
 		slice_range_to_mask(addr, len, &mask);
-		slice_print_mask(" mask", mask);
+		slice_print_mask(" mask", &mask);
 
 		/* Check if we fit in the good mask. If we do, we just return,
 		 * nothing else to do
 		 */
-		if (slice_check_fit(mm, mask, good_mask)) {
+		if (slice_check_fit(mm, &mask, &good_mask)) {
 			slice_dbg(" fits good !\n");
 			return addr;
 		}
@@ -529,7 +535,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 		/* Now let's see if we can find something in the existing
 		 * slices for that size
 		 */
-		newaddr = slice_find_area(mm, len, good_mask,
+		newaddr = slice_find_area(mm, len, &good_mask,
 					  psize, topdown, high_limit);
 		if (newaddr != -ENOMEM) {
 			/* Found within the good mask, we don't have to setup,
@@ -545,9 +551,10 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	 */
 	slice_mask_for_free(mm, &potential_mask, high_limit);
 	slice_or_mask(&potential_mask, &good_mask);
-	slice_print_mask(" potential", potential_mask);
+	slice_print_mask(" potential", &potential_mask);
 
-	if ((addr != 0 || fixed) && slice_check_fit(mm, mask, potential_mask)) {
+	if ((addr != 0 || fixed) &&
+			slice_check_fit(mm, &mask, &potential_mask)) {
 		slice_dbg(" fits potential !\n");
 		goto convert;
 	}
@@ -562,7 +569,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	 * anywhere in the good area.
 	 */
 	if (addr) {
-		addr = slice_find_area(mm, len, good_mask,
+		addr = slice_find_area(mm, len, &good_mask,
 				       psize, topdown, high_limit);
 		if (addr != -ENOMEM) {
 			slice_dbg(" found area at 0x%lx\n", addr);
@@ -573,14 +580,14 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* Now let's see if we can find something in the existing slices
 	 * for that size plus free slices
 	 */
-	addr = slice_find_area(mm, len, potential_mask,
+	addr = slice_find_area(mm, len, &potential_mask,
 			       psize, topdown, high_limit);
 
 #ifdef CONFIG_PPC_64K_PAGES
 	if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
 		/* retry the search with 4k-page slices included */
 		slice_or_mask(&potential_mask, &compat_mask);
-		addr = slice_find_area(mm, len, potential_mask,
+		addr = slice_find_area(mm, len, &potential_mask,
 				       psize, topdown, high_limit);
 	}
 #endif
@@ -590,13 +597,13 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 
 	slice_range_to_mask(addr, len, &mask);
 	slice_dbg(" found potential area at 0x%lx\n", addr);
-	slice_print_mask(" mask", mask);
+	slice_print_mask(" mask", &mask);
 
  convert:
 	slice_andnot_mask(&mask, &good_mask);
 	slice_andnot_mask(&mask, &compat_mask);
 	if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) {
-		slice_convert(mm, mask, psize);
+		slice_convert(mm, &mask, psize);
 		if (psize > MMU_PAGE_BASE)
 			on_each_cpu(slice_flush_segments, mm, 1);
 	}
@@ -725,7 +732,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
 	VM_BUG_ON(radix_enabled());
 
 	slice_range_to_mask(start, len, &mask);
-	slice_convert(mm, mask, psize);
+	slice_convert(mm, &mask, psize);
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
@@ -772,9 +779,9 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 #if 0 /* too verbose */
 	slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n",
 		 mm, addr, len);
-	slice_print_mask(" mask", mask);
-	slice_print_mask(" available", available);
+	slice_print_mask(" mask", &mask);
+	slice_print_mask(" available", &available);
 #endif
-	return !slice_check_fit(mm, mask, available);
+	return !slice_check_fit(mm, &mask, &available);
 }
 #endif
-- 
2.15.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH 2/5] powerpc/mm/slice: implement a slice mask cache
  2018-02-10  8:11 [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Nicholas Piggin
  2018-02-10  8:11 ` [RFC PATCH 1/5] powerpc/mm/slice: pass pointers to struct slice_mask where possible Nicholas Piggin
@ 2018-02-10  8:11 ` Nicholas Piggin
  2018-02-10  8:11 ` [RFC PATCH 3/5] powerpc/mm/slice: implement slice_check_range_fits Nicholas Piggin
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Nicholas Piggin @ 2018-02-10  8:11 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V, Christophe Leroy

Calculating the slice mask can become a significant overhead for
get_unmapped_area. This patch adds a struct slice_mask for each page
size in the mm_context, and keeps these in sync with the slice psize
arrays and slb_addr_limit.

This saves about 30% kernel time on a single-page mmap/munmap micro
benchmark.
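
The split between write and read paths, roughly (illustrative summary
only; the real hunks are in the diff below): slow paths that change
the layout recompute the cached masks under slice_convert_lock, and
the hot paths just read them:

	/* slow path: psize arrays or slb_addr_limit have changed */
	spin_lock_irqsave(&slice_convert_lock, flags);
	recalc_slice_mask_cache(mm);
	spin_unlock_irqrestore(&slice_convert_lock, flags);

	/* hot path: no recomputation, just use the cached mask */
	good_mask = *slice_mask_for_size(mm, psize);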

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 20 +++++++++-
 arch/powerpc/mm/slice.c                  | 68 ++++++++++++++++++++++++--------
 2 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 0abeb0e2d616..b6d136fd8ffd 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -80,6 +80,16 @@ struct spinlock;
 /* Maximum possible number of NPUs in a system. */
 #define NV_MAX_NPUS 8
 
+/*
+ * One bit per slice. We have lower slices which cover 256MB segments
+ * upto 4G range. That gets us 16 low slices. For the rest we track slices
+ * in 1TB size.
+ */
+struct slice_mask {
+	u64 low_slices;
+	DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);
+};
+
 typedef struct {
 	mm_context_id_t id;
 	u16 user_psize;		/* page size index */
@@ -91,9 +101,17 @@ typedef struct {
 	struct npu_context *npu_context;
 
 #ifdef CONFIG_PPC_MM_SLICES
+	unsigned long slb_addr_limit;
 	u64 low_slices_psize;	/* SLB page size encodings */
 	unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
-	unsigned long slb_addr_limit;
+# ifdef CONFIG_PPC_64K_PAGES
+	struct slice_mask mask_64k;
+# endif
+	struct slice_mask mask_4k;
+# ifdef CONFIG_HUGETLB_PAGE
+	struct slice_mask mask_16m;
+	struct slice_mask mask_16g;
+# endif
 #else
 	u16 sllp;		/* SLB page size encoding */
 #endif
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index e8f6922d3c9b..837700bb50a9 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -37,15 +37,6 @@
 #include <asm/hugetlb.h>
 
 static DEFINE_SPINLOCK(slice_convert_lock);
-/*
- * One bit per slice. We have lower slices which cover 256MB segments
- * upto 4G range. That gets us 16 low slices. For the rest we track slices
- * in 1TB size.
- */
-struct slice_mask {
-	u64 low_slices;
-	DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);
-};
 
 #ifdef DEBUG
 int _slice_debug = 1;
@@ -144,7 +135,7 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret,
 			__set_bit(i, ret->high_slices);
 }
 
-static void slice_mask_for_size(struct mm_struct *mm, int psize,
+static void calc_slice_mask_for_size(struct mm_struct *mm, int psize,
 				struct slice_mask *ret,
 				unsigned long high_limit)
 {
@@ -173,6 +164,40 @@ static void slice_mask_for_size(struct mm_struct *mm, int psize,
 	}
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+static void recalc_slice_mask_cache(struct mm_struct *mm)
+{
+	unsigned long l = mm->context.slb_addr_limit;
+	calc_slice_mask_for_size(mm, MMU_PAGE_4K, &mm->context.mask_4k, l);
+#ifdef CONFIG_PPC_64K_PAGES
+	calc_slice_mask_for_size(mm, MMU_PAGE_64K, &mm->context.mask_64k, l);
+#endif
+#ifdef CONFIG_HUGETLB_PAGE
+	calc_slice_mask_for_size(mm, MMU_PAGE_16M, &mm->context.mask_16m, l);
+	calc_slice_mask_for_size(mm, MMU_PAGE_16G, &mm->context.mask_16g, l);
+#endif
+}
+
+static const struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize)
+{
+#ifdef CONFIG_PPC_64K_PAGES
+	if (psize == MMU_PAGE_64K)
+		return &mm->context.mask_64k;
+#endif
+	if (psize == MMU_PAGE_4K)
+		return &mm->context.mask_4k;
+#ifdef CONFIG_HUGETLB_PAGE
+	if (psize == MMU_PAGE_16M)
+		return &mm->context.mask_16m;
+	if (psize == MMU_PAGE_16G)
+		return &mm->context.mask_16g;
+#endif
+	BUG();
+}
+#else
+#error "Must define the slice masks for page sizes supported by the platform"
+#endif
+
 static int slice_check_fit(struct mm_struct *mm,
 			   const struct slice_mask *mask,
 			   const struct slice_mask *available)
@@ -246,6 +271,8 @@ static void slice_convert(struct mm_struct *mm,
 		  (unsigned long)mm->context.low_slices_psize,
 		  (unsigned long)mm->context.high_slices_psize);
 
+	recalc_slice_mask_cache(mm);
+
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
 
 	copro_flush_all_slbs(mm);
@@ -448,7 +475,14 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	}
 
 	if (high_limit > mm->context.slb_addr_limit) {
+		unsigned long flags;
+
 		mm->context.slb_addr_limit = high_limit;
+
+		spin_lock_irqsave(&slice_convert_lock, flags);
+		recalc_slice_mask_cache(mm);
+		spin_unlock_irqrestore(&slice_convert_lock, flags);
+
 		on_each_cpu(slice_flush_segments, mm, 1);
 	}
 
@@ -487,7 +521,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* First make up a "good" mask of slices that have the right size
 	 * already
 	 */
-	slice_mask_for_size(mm, psize, &good_mask, high_limit);
+	good_mask = *slice_mask_for_size(mm, psize);
 	slice_print_mask(" good_mask", &good_mask);
 
 	/*
@@ -512,7 +546,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If we support combo pages, we can allow 64k pages in 4k slices */
 	if (psize == MMU_PAGE_64K) {
-		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask, high_limit);
+		compat_mask = *slice_mask_for_size(mm, MMU_PAGE_4K);
 		if (fixed)
 			slice_or_mask(&good_mask, &compat_mask);
 	}
@@ -693,7 +727,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 		goto bail;
 
 	mm->context.user_psize = psize;
-	wmb();
+	wmb(); /* Why? */
 
 	lpsizes = mm->context.low_slices_psize;
 	for (i = 0; i < SLICE_NUM_LOW; i++)
@@ -720,6 +754,9 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 		  (unsigned long)mm->context.low_slices_psize,
 		  (unsigned long)mm->context.high_slices_psize);
 
+	recalc_slice_mask_cache(mm);
+	spin_unlock_irqrestore(&slice_convert_lock, flags);
+	return;
  bail:
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
 }
@@ -760,18 +797,17 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 {
 	struct slice_mask mask, available;
 	unsigned int psize = mm->context.user_psize;
-	unsigned long high_limit = mm->context.slb_addr_limit;
 
 	if (radix_enabled())
 		return 0;
 
 	slice_range_to_mask(addr, len, &mask);
-	slice_mask_for_size(mm, psize, &available, high_limit);
+	available = *slice_mask_for_size(mm, psize);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* We need to account for 4k slices too */
 	if (psize == MMU_PAGE_64K) {
 		struct slice_mask compat_mask;
-		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask, high_limit);
+		compat_mask = *slice_mask_for_size(mm, MMU_PAGE_4K);
 		slice_or_mask(&available, &compat_mask);
 	}
 #endif
-- 
2.15.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH 3/5] powerpc/mm/slice: implement slice_check_range_fits
  2018-02-10  8:11 [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Nicholas Piggin
  2018-02-10  8:11 ` [RFC PATCH 1/5] powerpc/mm/slice: pass pointers to struct slice_mask where possible Nicholas Piggin
  2018-02-10  8:11 ` [RFC PATCH 2/5] powerpc/mm/slice: implement a slice mask cache Nicholas Piggin
@ 2018-02-10  8:11 ` Nicholas Piggin
  2018-02-10  8:11 ` [RFC PATCH 4/5] powerpc/mm/slice: Use const pointers to cached slice masks where possible Nicholas Piggin
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Nicholas Piggin @ 2018-02-10  8:11 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V, Christophe Leroy

Rather than building a slice mask from a range and then using it to
check for fit in a candidate mask, implement slice_check_range_fits()
which checks whether a range fits in a mask directly.

This allows several structures to be removed from the stack. Also,
since most of these cases do not cover a huge range, building and
comparing a full mask is more expensive than testing just the one or
two bits the range actually touches.
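
As an aside, the low-slice half of the new check builds the bit run
covering the range with a "(1 << (hi + 1)) - (1 << lo)" subtraction.
A standalone illustration of that trick (assumes 256MB low slices, so
GET_LOW_SLICE_INDEX() is just a shift by 28; the addresses are made
up):

#include <stdio.h>

#define SLICE_LOW_SHIFT			28
#define GET_LOW_SLICE_INDEX(addr)	((addr) >> SLICE_LOW_SHIFT)

int main(void)
{
	unsigned long start = 0x30000000;	/* low slice 3 */
	unsigned long end   = 0x5fffffff;	/* low slice 5 */
	unsigned long long mask;

	/* one bit per slice, set for every slice from start to end */
	mask = (1ULL << (GET_LOW_SLICE_INDEX(end) + 1))
		- (1ULL << GET_LOW_SLICE_INDEX(start));

	printf("low slice mask: 0x%llx\n", mask);	/* 0x38: bits 3-5 */
	return 0;
}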

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/slice.c | 67 ++++++++++++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 837700bb50a9..98497c105d7d 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -198,22 +198,35 @@ static const struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int ps
 #error "Must define the slice masks for page sizes supported by the platform"
 #endif
 
-static int slice_check_fit(struct mm_struct *mm,
-			   const struct slice_mask *mask,
-			   const struct slice_mask *available)
+static bool slice_check_range_fits(struct mm_struct *mm,
+			   const struct slice_mask *available,
+			   unsigned long start, unsigned long len)
 {
-	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
-	/*
-	 * Make sure we just do bit compare only to the max
-	 * addr limit and not the full bit map size.
-	 */
-	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit);
+	unsigned long end = start + len - 1;
+	u64 low_slices = 0;
+
+	if (start < SLICE_LOW_TOP) {
+		unsigned long mend = min(end, (SLICE_LOW_TOP - 1));
+
+		low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+				- (1u << GET_LOW_SLICE_INDEX(start));
+	}
+	if ((low_slices & available->low_slices) != low_slices)
+		return false;
+
+	if ((start + len) > SLICE_LOW_TOP) {
+		unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
+		unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
+		unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
+		unsigned long i;
 
-	bitmap_and(result, mask->high_slices,
-		   available->high_slices, slice_count);
+		for (i = start_index; i < start_index + count; i++) {
+			if (!test_bit(i, available->high_slices))
+				return false;
+		}
+	}
 
-	return (mask->low_slices & available->low_slices) == mask->low_slices &&
-		bitmap_equal(result, mask->high_slices, slice_count);
+	return true;
 }
 
 static void slice_flush_segments(void *parm)
@@ -486,12 +499,6 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 		on_each_cpu(slice_flush_segments, mm, 1);
 	}
 
-	/*
-	 * init different masks
-	 */
-	mask.low_slices = 0;
-	bitmap_zero(mask.high_slices, SLICE_NUM_HIGH);
-
 	/* silence stupid warning */;
 	potential_mask.low_slices = 0;
 	bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH);
@@ -553,15 +560,11 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 #endif
 
 	/* First check hint if it's valid or if we have MAP_FIXED */
-	if (addr != 0 || fixed) {
-		/* Build a mask for the requested range */
-		slice_range_to_mask(addr, len, &mask);
-		slice_print_mask(" mask", &mask);
-
+	if (addr || fixed) {
 		/* Check if we fit in the good mask. If we do, we just return,
 		 * nothing else to do
 		 */
-		if (slice_check_fit(mm, &mask, &good_mask)) {
+		if (slice_check_range_fits(mm, &good_mask, addr, len)) {
 			slice_dbg(" fits good !\n");
 			return addr;
 		}
@@ -587,10 +590,11 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	slice_or_mask(&potential_mask, &good_mask);
 	slice_print_mask(" potential", &potential_mask);
 
-	if ((addr != 0 || fixed) &&
-			slice_check_fit(mm, &mask, &potential_mask)) {
-		slice_dbg(" fits potential !\n");
-		goto convert;
+	if (addr || fixed) {
+		if (slice_check_range_fits(mm, &potential_mask, addr, len)) {
+			slice_dbg(" fits potential !\n");
+			goto convert;
+		}
 	}
 
 	/* If we have MAP_FIXED and failed the above steps, then error out */
@@ -795,13 +799,12 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 			   unsigned long len)
 {
-	struct slice_mask mask, available;
+	struct slice_mask available;
 	unsigned int psize = mm->context.user_psize;
 
 	if (radix_enabled())
 		return 0;
 
-	slice_range_to_mask(addr, len, &mask);
 	available = *slice_mask_for_size(mm, psize);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* We need to account for 4k slices too */
@@ -818,6 +821,6 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 	slice_print_mask(" mask", &mask);
 	slice_print_mask(" available", &available);
 #endif
-	return !slice_check_fit(mm, &mask, &available);
+	return !slice_check_range_fits(mm, &available, addr, len);
 }
 #endif
-- 
2.15.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH 4/5] powerpc/mm/slice: Use const pointers to cached slice masks where possible
  2018-02-10  8:11 [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Nicholas Piggin
                   ` (2 preceding siblings ...)
  2018-02-10  8:11 ` [RFC PATCH 3/5] powerpc/mm/slice: implement slice_check_range_fits Nicholas Piggin
@ 2018-02-10  8:11 ` Nicholas Piggin
  2018-02-10  8:11 ` [RFC PATCH 5/5] powerpc/mm/slice: use the dynamic high slice size to limit bitmap operations Nicholas Piggin
  2018-02-12 15:02 ` [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Christophe LEROY
  5 siblings, 0 replies; 11+ messages in thread
From: Nicholas Piggin @ 2018-02-10  8:11 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V, Christophe Leroy

The slice_mask cache was introduced as a straightforward conversion
which copied the cached mask into the caller's structures, because
that is how the original code worked. In most cases the pointer can be
used directly instead, saving a copy and an on-stack structure.

This also converts the slice_mask bit operation helpers to the usual
3-operand form, which is clearer to work with, and removes some
unnecessary intermediate bitmaps, reducing stack and copy overhead
further.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/slice.c | 78 ++++++++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 98497c105d7d..b2e6c7667bc5 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -430,25 +430,28 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
 		return slice_find_area_bottomup(mm, len, mask, psize, high_limit);
 }
 
-static inline void slice_or_mask(struct slice_mask *dst,
+static inline void slice_copy_mask(struct slice_mask *dst,
 					const struct slice_mask *src)
 {
-	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+	dst->low_slices = src->low_slices;
+	bitmap_copy(dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+}
 
-	dst->low_slices |= src->low_slices;
-	bitmap_or(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
-	bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+static inline void slice_or_mask(struct slice_mask *dst,
+					const struct slice_mask *src1,
+					const struct slice_mask *src2)
+{
+	dst->low_slices = src1->low_slices | src2->low_slices;
+	bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH);
 }
 
 static inline void slice_andnot_mask(struct slice_mask *dst,
-					const struct slice_mask *src)
+					const struct slice_mask *src1,
+					const struct slice_mask *src2)
 {
-	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
-
-	dst->low_slices &= ~src->low_slices;
+	dst->low_slices = src1->low_slices & ~src2->low_slices;
 
-	bitmap_andnot(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
-	bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+	bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH);
 }
 
 #ifdef CONFIG_PPC_64K_PAGES
@@ -461,10 +464,10 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 				      unsigned long flags, unsigned int psize,
 				      int topdown)
 {
-	struct slice_mask mask;
 	struct slice_mask good_mask;
 	struct slice_mask potential_mask;
-	struct slice_mask compat_mask;
+	const struct slice_mask *maskp;
+	const struct slice_mask *compat_maskp = NULL;
 	int fixed = (flags & MAP_FIXED);
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	unsigned long page_size = 1UL << pshift;
@@ -503,9 +506,6 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	potential_mask.low_slices = 0;
 	bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH);
 
-	compat_mask.low_slices = 0;
-	bitmap_zero(compat_mask.high_slices, SLICE_NUM_HIGH);
-
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
 	BUG_ON(mm->context.slb_addr_limit == 0);
@@ -528,7 +528,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* First make up a "good" mask of slices that have the right size
 	 * already
 	 */
-	good_mask = *slice_mask_for_size(mm, psize);
+	maskp = slice_mask_for_size(mm, psize);
 	slice_print_mask(" good_mask", &good_mask);
 
 	/*
@@ -553,11 +553,16 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If we support combo pages, we can allow 64k pages in 4k slices */
 	if (psize == MMU_PAGE_64K) {
-		compat_mask = *slice_mask_for_size(mm, MMU_PAGE_4K);
+		compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K);
 		if (fixed)
-			slice_or_mask(&good_mask, &compat_mask);
-	}
+			slice_or_mask(&good_mask, maskp, compat_maskp);
+		else
+			slice_copy_mask(&good_mask, maskp);
+	} else
 #endif
+	{
+		slice_copy_mask(&good_mask, maskp);
+	}
 
 	/* First check hint if it's valid or if we have MAP_FIXED */
 	if (addr || fixed) {
@@ -587,7 +592,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	 * empty and thus can be converted
 	 */
 	slice_mask_for_free(mm, &potential_mask, high_limit);
-	slice_or_mask(&potential_mask, &good_mask);
+	slice_or_mask(&potential_mask, &potential_mask, &good_mask);
 	slice_print_mask(" potential", &potential_mask);
 
 	if (addr || fixed) {
@@ -624,7 +629,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 #ifdef CONFIG_PPC_64K_PAGES
 	if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
 		/* retry the search with 4k-page slices included */
-		slice_or_mask(&potential_mask, &compat_mask);
+		slice_or_mask(&potential_mask, &potential_mask, compat_maskp);
 		addr = slice_find_area(mm, len, &potential_mask,
 				       psize, topdown, high_limit);
 	}
@@ -633,15 +638,17 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	if (addr == -ENOMEM)
 		return -ENOMEM;
 
-	slice_range_to_mask(addr, len, &mask);
+	slice_range_to_mask(addr, len, &potential_mask);
 	slice_dbg(" found potential area at 0x%lx\n", addr);
-	slice_print_mask(" mask", &mask);
+	slice_print_mask(" mask", maskp);
 
  convert:
-	slice_andnot_mask(&mask, &good_mask);
-	slice_andnot_mask(&mask, &compat_mask);
-	if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) {
-		slice_convert(mm, &mask, psize);
+	slice_andnot_mask(&potential_mask, &potential_mask, &good_mask);
+	if (compat_maskp && !fixed)
+		slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp);
+	if (potential_mask.low_slices ||
+		!bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH)) {
+		slice_convert(mm, &potential_mask, psize);
 		if (psize > MMU_PAGE_BASE)
 			on_each_cpu(slice_flush_segments, mm, 1);
 	}
@@ -799,19 +806,22 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 			   unsigned long len)
 {
-	struct slice_mask available;
+	const struct slice_mask *maskp;
 	unsigned int psize = mm->context.user_psize;
 
 	if (radix_enabled())
 		return 0;
 
-	available = *slice_mask_for_size(mm, psize);
+	maskp = slice_mask_for_size(mm, psize);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* We need to account for 4k slices too */
 	if (psize == MMU_PAGE_64K) {
-		struct slice_mask compat_mask;
-		compat_mask = *slice_mask_for_size(mm, MMU_PAGE_4K);
-		slice_or_mask(&available, &compat_mask);
+		const struct slice_mask *compat_maskp;
+		struct slice_mask available;
+
+		compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K);
+		slice_or_mask(&available, maskp, compat_maskp);
+		return !slice_check_range_fits(mm, &available, addr, len);
 	}
 #endif
 
@@ -821,6 +831,6 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 	slice_print_mask(" mask", &mask);
 	slice_print_mask(" available", &available);
 #endif
-	return !slice_check_range_fits(mm, &available, addr, len);
+	return !slice_check_range_fits(mm, maskp, addr, len);
 }
 #endif
-- 
2.15.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH 5/5] powerpc/mm/slice: use the dynamic high slice size to limit bitmap operations
  2018-02-10  8:11 [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Nicholas Piggin
                   ` (3 preceding siblings ...)
  2018-02-10  8:11 ` [RFC PATCH 4/5] powerpc/mm/slice: Use const pointers to cached slice masks where possible Nicholas Piggin
@ 2018-02-10  8:11 ` Nicholas Piggin
  2018-02-12 15:02 ` [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Christophe LEROY
  5 siblings, 0 replies; 11+ messages in thread
From: Nicholas Piggin @ 2018-02-10  8:11 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V, Christophe Leroy

The number of high slices a process might use now depends on its
address space size and the allocation addresses it has requested.

This patch uses that limit throughout call chains where possible,
rather than using the fixed SLICE_NUM_HIGH for bitmap operations.
This saves some cost for processes that don't use very large address
spaces.
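
To put rough numbers on it (illustrative only; assumes 1TB high
slices, a 512TB maximum address space and the default 128TB map
window, which are the book3s/64 values at the time of writing):

#include <stdio.h>

#define SLICE_HIGH_SHIFT		40	/* 1TB high slices */
#define GET_HIGH_SLICE_INDEX(addr)	((addr) >> SLICE_HIGH_SHIFT)

int main(void)
{
	unsigned long long default_window = 128ULL << 40;	/* 128TB */
	unsigned long long max_addr_space = 512ULL << 40;	/* 512TB */

	printf("high slices at 128TB limit: %llu\n",
	       GET_HIGH_SLICE_INDEX(default_window));	/* 128 */
	printf("high slices at 512TB limit: %llu\n",
	       GET_HIGH_SLICE_INDEX(max_addr_space));	/* 512 */
	return 0;
}

So a process that never maps above the default 128TB window only needs
bitmap operations over 128 bits rather than the full SLICE_NUM_HIGH
(512) bits.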

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/slice.c | 98 +++++++++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index b2e6c7667bc5..bec68ea07e29 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -61,13 +61,12 @@ static void slice_print_mask(const char *label, const struct slice_mask *mask) {
 #endif
 
 static void slice_range_to_mask(unsigned long start, unsigned long len,
-				struct slice_mask *ret)
+				struct slice_mask *ret,
+				unsigned long high_slices)
 {
 	unsigned long end = start + len - 1;
 
 	ret->low_slices = 0;
-	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
-
 	if (start < SLICE_LOW_TOP) {
 		unsigned long mend = min(end, (SLICE_LOW_TOP - 1));
 
@@ -75,6 +74,7 @@ static void slice_range_to_mask(unsigned long start, unsigned long len,
 			- (1u << GET_LOW_SLICE_INDEX(start));
 	}
 
+	bitmap_zero(ret->high_slices, high_slices);
 	if ((start + len) > SLICE_LOW_TOP) {
 		unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
 		unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
@@ -116,28 +116,27 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
 }
 
 static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret,
-				unsigned long high_limit)
+				unsigned long high_slices)
 {
 	unsigned long i;
 
 	ret->low_slices = 0;
-	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
-
 	for (i = 0; i < SLICE_NUM_LOW; i++)
 		if (!slice_low_has_vma(mm, i))
 			ret->low_slices |= 1u << i;
 
-	if (high_limit <= SLICE_LOW_TOP)
+	if (!high_slices)
 		return;
 
-	for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++)
+	bitmap_zero(ret->high_slices, high_slices);
+	for (i = 0; i < high_slices; i++)
 		if (!slice_high_has_vma(mm, i))
 			__set_bit(i, ret->high_slices);
 }
 
 static void calc_slice_mask_for_size(struct mm_struct *mm, int psize,
 				struct slice_mask *ret,
-				unsigned long high_limit)
+				unsigned long high_slices)
 {
 	unsigned char *hpsizes;
 	int index, mask_index;
@@ -145,18 +144,17 @@ static void calc_slice_mask_for_size(struct mm_struct *mm, int psize,
 	u64 lpsizes;
 
 	ret->low_slices = 0;
-	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
-
 	lpsizes = mm->context.low_slices_psize;
 	for (i = 0; i < SLICE_NUM_LOW; i++)
 		if (((lpsizes >> (i * 4)) & 0xf) == psize)
 			ret->low_slices |= 1u << i;
 
-	if (high_limit <= SLICE_LOW_TOP)
+	if (!high_slices)
 		return;
 
+	bitmap_zero(ret->high_slices, high_slices);
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) {
+	for (i = 0; i < high_slices; i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
@@ -165,16 +163,15 @@ static void calc_slice_mask_for_size(struct mm_struct *mm, int psize,
 }
 
 #ifdef CONFIG_PPC_BOOK3S_64
-static void recalc_slice_mask_cache(struct mm_struct *mm)
+static void recalc_slice_mask_cache(struct mm_struct *mm, unsigned long high_slices)
 {
-	unsigned long l = mm->context.slb_addr_limit;
-	calc_slice_mask_for_size(mm, MMU_PAGE_4K, &mm->context.mask_4k, l);
+	calc_slice_mask_for_size(mm, MMU_PAGE_4K, &mm->context.mask_4k, high_slices);
 #ifdef CONFIG_PPC_64K_PAGES
-	calc_slice_mask_for_size(mm, MMU_PAGE_64K, &mm->context.mask_64k, l);
+	calc_slice_mask_for_size(mm, MMU_PAGE_64K, &mm->context.mask_64k, high_slices);
 #endif
 #ifdef CONFIG_HUGETLB_PAGE
-	calc_slice_mask_for_size(mm, MMU_PAGE_16M, &mm->context.mask_16m, l);
-	calc_slice_mask_for_size(mm, MMU_PAGE_16G, &mm->context.mask_16g, l);
+	calc_slice_mask_for_size(mm, MMU_PAGE_16M, &mm->context.mask_16m, high_slices);
+	calc_slice_mask_for_size(mm, MMU_PAGE_16G, &mm->context.mask_16g, high_slices);
 #endif
 }
 
@@ -252,6 +249,7 @@ static void slice_convert(struct mm_struct *mm,
 	unsigned char *hpsizes;
 	u64 lpsizes;
 	unsigned long i, flags;
+	unsigned long high_slices;
 
 	slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
 	slice_print_mask(" mask", mask);
@@ -271,7 +269,8 @@ static void slice_convert(struct mm_struct *mm,
 	mm->context.low_slices_psize = lpsizes;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) {
+	high_slices = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit);
+	for (i = 0; i < high_slices; i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (test_bit(i, mask->high_slices))
@@ -284,7 +283,7 @@ static void slice_convert(struct mm_struct *mm,
 		  (unsigned long)mm->context.low_slices_psize,
 		  (unsigned long)mm->context.high_slices_psize);
 
-	recalc_slice_mask_cache(mm);
+	recalc_slice_mask_cache(mm, high_slices);
 
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
 
@@ -431,27 +430,32 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
 }
 
 static inline void slice_copy_mask(struct slice_mask *dst,
-					const struct slice_mask *src)
+					const struct slice_mask *src,
+					unsigned long high_slices)
 {
 	dst->low_slices = src->low_slices;
-	bitmap_copy(dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+	bitmap_copy(dst->high_slices, src->high_slices, high_slices);
 }
 
 static inline void slice_or_mask(struct slice_mask *dst,
 					const struct slice_mask *src1,
-					const struct slice_mask *src2)
+					const struct slice_mask *src2,
+					unsigned long high_slices)
 {
 	dst->low_slices = src1->low_slices | src2->low_slices;
-	bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH);
+	bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices,
+			high_slices);
 }
 
 static inline void slice_andnot_mask(struct slice_mask *dst,
 					const struct slice_mask *src1,
-					const struct slice_mask *src2)
+					const struct slice_mask *src2,
+					unsigned long high_slices)
 {
 	dst->low_slices = src1->low_slices & ~src2->low_slices;
 
-	bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH);
+	bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices,
+			high_slices);
 }
 
 #ifdef CONFIG_PPC_64K_PAGES
@@ -474,6 +478,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	struct mm_struct *mm = current->mm;
 	unsigned long newaddr;
 	unsigned long high_limit;
+	unsigned long high_slices;
 
 	high_limit = DEFAULT_MAP_WINDOW;
 	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
@@ -490,13 +495,14 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 			return -ENOMEM;
 	}
 
+	high_slices = GET_HIGH_SLICE_INDEX(high_limit);
 	if (high_limit > mm->context.slb_addr_limit) {
 		unsigned long flags;
 
 		mm->context.slb_addr_limit = high_limit;
 
 		spin_lock_irqsave(&slice_convert_lock, flags);
-		recalc_slice_mask_cache(mm);
+		recalc_slice_mask_cache(mm, high_slices);
 		spin_unlock_irqrestore(&slice_convert_lock, flags);
 
 		on_each_cpu(slice_flush_segments, mm, 1);
@@ -504,7 +510,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 
 	/* silence stupid warning */;
 	potential_mask.low_slices = 0;
-	bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH);
+	bitmap_zero(potential_mask.high_slices, high_slices);
 
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
@@ -555,13 +561,13 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	if (psize == MMU_PAGE_64K) {
 		compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K);
 		if (fixed)
-			slice_or_mask(&good_mask, maskp, compat_maskp);
+			slice_or_mask(&good_mask, maskp, compat_maskp, high_slices);
 		else
-			slice_copy_mask(&good_mask, maskp);
+			slice_copy_mask(&good_mask, maskp, high_slices);
 	} else
 #endif
 	{
-		slice_copy_mask(&good_mask, maskp);
+		slice_copy_mask(&good_mask, maskp, high_slices);
 	}
 
 	/* First check hint if it's valid or if we have MAP_FIXED */
@@ -591,8 +597,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	 * We don't fit in the good mask, check what other slices are
 	 * empty and thus can be converted
 	 */
-	slice_mask_for_free(mm, &potential_mask, high_limit);
-	slice_or_mask(&potential_mask, &potential_mask, &good_mask);
+	slice_mask_for_free(mm, &potential_mask, high_slices);
+	slice_or_mask(&potential_mask, &potential_mask, &good_mask, high_slices);
 	slice_print_mask(" potential", &potential_mask);
 
 	if (addr || fixed) {
@@ -629,7 +635,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 #ifdef CONFIG_PPC_64K_PAGES
 	if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
 		/* retry the search with 4k-page slices included */
-		slice_or_mask(&potential_mask, &potential_mask, compat_maskp);
+		slice_or_mask(&potential_mask, &potential_mask, compat_maskp, high_slices);
 		addr = slice_find_area(mm, len, &potential_mask,
 				       psize, topdown, high_limit);
 	}
@@ -638,16 +644,16 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	if (addr == -ENOMEM)
 		return -ENOMEM;
 
-	slice_range_to_mask(addr, len, &potential_mask);
+	slice_range_to_mask(addr, len, &potential_mask, high_slices);
 	slice_dbg(" found potential area at 0x%lx\n", addr);
 	slice_print_mask(" mask", maskp);
 
  convert:
-	slice_andnot_mask(&potential_mask, &potential_mask, &good_mask);
+	slice_andnot_mask(&potential_mask, &potential_mask, &good_mask, high_slices);
 	if (compat_maskp && !fixed)
-		slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp);
+		slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp, high_slices);
 	if (potential_mask.low_slices ||
-		!bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH)) {
+		!bitmap_empty(potential_mask.high_slices, high_slices)) {
 		slice_convert(mm, &potential_mask, psize);
 		if (psize > MMU_PAGE_BASE)
 			on_each_cpu(slice_flush_segments, mm, 1);
@@ -724,6 +730,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 	int index, mask_index;
 	unsigned char *hpsizes;
 	unsigned long flags, lpsizes;
+	unsigned long high_slices;
 	unsigned int old_psize;
 	int i;
 
@@ -749,7 +756,8 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 	mm->context.low_slices_psize = lpsizes;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+	high_slices = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit);
+	for (i = 0; i < high_slices; i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize)
@@ -765,7 +773,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 		  (unsigned long)mm->context.low_slices_psize,
 		  (unsigned long)mm->context.high_slices_psize);
 
-	recalc_slice_mask_cache(mm);
+	recalc_slice_mask_cache(mm, high_slices);
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
 	return;
  bail:
@@ -776,10 +784,12 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
 			   unsigned long len, unsigned int psize)
 {
 	struct slice_mask mask;
+	unsigned long high_slices;
 
 	VM_BUG_ON(radix_enabled());
 
-	slice_range_to_mask(start, len, &mask);
+	high_slices = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit);
+	slice_range_to_mask(start, len, &mask, high_slices);
 	slice_convert(mm, &mask, psize);
 }
 
@@ -818,9 +828,11 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 	if (psize == MMU_PAGE_64K) {
 		const struct slice_mask *compat_maskp;
 		struct slice_mask available;
+		unsigned long high_slices;
 
 		compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K);
-		slice_or_mask(&available, maskp, compat_maskp);
+		high_slices = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit);
+		slice_or_mask(&available, maskp, compat_maskp, high_slices);
 		return !slice_check_range_fits(mm, &available, addr, len);
 	}
 #endif
-- 
2.15.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use
  2018-02-10  8:11 [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Nicholas Piggin
                   ` (4 preceding siblings ...)
  2018-02-10  8:11 ` [RFC PATCH 5/5] powerpc/mm/slice: use the dynamic high slice size to limit bitmap operations Nicholas Piggin
@ 2018-02-12 15:02 ` Christophe LEROY
  2018-02-12 15:24   ` Nicholas Piggin
  5 siblings, 1 reply; 11+ messages in thread
From: Christophe LEROY @ 2018-02-12 15:02 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K . V



On 10/02/2018 at 09:11, Nicholas Piggin wrote:
> This series intends to improve performance and reduce stack
> consumption in the slice allocation code. It does it by keeping slice
> masks in the mm_context rather than compute them for each allocation,
> and by reducing bitmaps and slice_masks from stacks, using pointers
> instead where possible.
> 
> checkstack.pl gives, before:
> 0x00000de4 slice_get_unmapped_area [slice.o]:           656
> 0x00001b4c is_hugepage_only_range [slice.o]:            512
> 0x0000075c slice_find_area_topdown [slice.o]:           416
> 0x000004c8 slice_find_area_bottomup.isra.1 [slice.o]:   272
> 0x00001aa0 slice_set_range_psize [slice.o]:             240
> 0x00000a64 slice_find_area [slice.o]:                   176
> 0x00000174 slice_check_fit [slice.o]:                   112
> 
> after:
> 0x00000d70 slice_get_unmapped_area [slice.o]:           320
> 0x000008f8 slice_find_area [slice.o]:                   144
> 0x00001860 slice_set_range_psize [slice.o]:             144
> 0x000018ec is_hugepage_only_range [slice.o]:            144
> 0x00000750 slice_find_area_bottomup.isra.4 [slice.o]:   128
> 
> The benchmark in https://github.com/linuxppc/linux/issues/49 gives, before:
> $ time ./slicemask
> real	0m20.712s
> user	0m5.830s
> sys	0m15.105s
> 
> after:
> $ time ./slicemask
> real	0m13.197s
> user	0m5.409s
> sys	0m7.779s

Hi,

I tested your series on an 8xx, on top of patch
https://patchwork.ozlabs.org/patch/871675/

I don't get a result as significant as yours, but there is some
improvement anyway:

ITERATION 500000

Before:

root@vgoip:~# time ./slicemask
real    0m 33.26s
user    0m 1.94s
sys     0m 30.85s

After:
root@vgoip:~# time ./slicemask
real    0m 29.69s
user    0m 2.11s
sys     0m 27.15s

The most significant improvement is obtained with the first patch of your series:
root@vgoip:~# time ./slicemask
real    0m 30.85s
user    0m 1.80s
sys     0m 28.57s

I had to modify your series a bit; if you are interested I can post it.

Christophe


> 
> Thanks,
> Nick
> 
> Nicholas Piggin (5):
>    powerpc/mm/slice: pass pointers to struct slice_mask where possible
>    powerpc/mm/slice: implement a slice mask cache
>    powerpc/mm/slice: implement slice_check_range_fits
>    powerpc/mm/slice: Use const pointers to cached slice masks where
>      possible
>    powerpc/mm/slice: use the dynamic high slice size to limit bitmap
>      operations
> 
>   arch/powerpc/include/asm/book3s/64/mmu.h |  20 +-
>   arch/powerpc/mm/slice.c                  | 302 +++++++++++++++++++------------
>   2 files changed, 204 insertions(+), 118 deletions(-)
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use
  2018-02-12 15:02 ` [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Christophe LEROY
@ 2018-02-12 15:24   ` Nicholas Piggin
  2018-02-12 17:42     ` Christophe LEROY
  0 siblings, 1 reply; 11+ messages in thread
From: Nicholas Piggin @ 2018-02-12 15:24 UTC (permalink / raw)
  To: Christophe LEROY; +Cc: linuxppc-dev, Aneesh Kumar K . V

On Mon, 12 Feb 2018 16:02:23 +0100
Christophe LEROY <christophe.leroy@c-s.fr> wrote:

> Le 10/02/2018 à 09:11, Nicholas Piggin a écrit :
> > This series intends to improve performance and reduce stack
> > consumption in the slice allocation code. It does it by keeping slice
> > masks in the mm_context rather than compute them for each allocation,
> > and by reducing bitmaps and slice_masks from stacks, using pointers
> > instead where possible.
> > 
> > checkstack.pl gives, before:
> > 0x00000de4 slice_get_unmapped_area [slice.o]:           656
> > 0x00001b4c is_hugepage_only_range [slice.o]:            512
> > 0x0000075c slice_find_area_topdown [slice.o]:           416
> > 0x000004c8 slice_find_area_bottomup.isra.1 [slice.o]:   272
> > 0x00001aa0 slice_set_range_psize [slice.o]:             240
> > 0x00000a64 slice_find_area [slice.o]:                   176
> > 0x00000174 slice_check_fit [slice.o]:                   112
> > 
> > after:
> > 0x00000d70 slice_get_unmapped_area [slice.o]:           320
> > 0x000008f8 slice_find_area [slice.o]:                   144
> > 0x00001860 slice_set_range_psize [slice.o]:             144
> > 0x000018ec is_hugepage_only_range [slice.o]:            144
> > 0x00000750 slice_find_area_bottomup.isra.4 [slice.o]:   128
> > 
> > The benchmark in https://github.com/linuxppc/linux/issues/49 gives, before:
> > $ time ./slicemask
> > real	0m20.712s
> > user	0m5.830s
> > sys	0m15.105s
> > 
> > after:
> > $ time ./slicemask
> > real	0m13.197s
> > user	0m5.409s
> > sys	0m7.779s  
> 
> Hi,
> 
> I tested your serie on an 8xx, on top of patch 
> https://patchwork.ozlabs.org/patch/871675/
> 
> I don't get a result as significant as yours, but there is some 
> improvment anyway:
> 
> ITERATION 500000
> 
> Before:
> 
> root@vgoip:~# time ./slicemask
> real    0m 33.26s
> user    0m 1.94s
> sys     0m 30.85s
> 
> After:
> root@vgoip:~# time ./slicemask
> real    0m 29.69s
> user    0m 2.11s
> sys     0m 27.15s
> 
> Most significant improvment is obtained with the first patch of your serie:
> root@vgoip:~# time ./slicemask
> real    0m 30.85s
> user    0m 1.80s
> sys     0m 28.57s

Okay, thanks. Are you still spending significant time in the slice
code?

> 
> Had to modify your serie a bit, if you are interested I can post it.
> 

Sure, that would be good.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use
  2018-02-12 15:24   ` Nicholas Piggin
@ 2018-02-12 17:42     ` Christophe LEROY
  2018-02-13  8:40       ` Nicholas Piggin
  0 siblings, 1 reply; 11+ messages in thread
From: Christophe LEROY @ 2018-02-12 17:42 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linuxppc-dev, Aneesh Kumar K . V, Michael Ellerman



On 12/02/2018 at 16:24, Nicholas Piggin wrote:
> On Mon, 12 Feb 2018 16:02:23 +0100
> Christophe LEROY <christophe.leroy@c-s.fr> wrote:
> 
>> Le 10/02/2018 à 09:11, Nicholas Piggin a écrit :
>>> This series intends to improve performance and reduce stack
>>> consumption in the slice allocation code. It does it by keeping slice
>>> masks in the mm_context rather than compute them for each allocation,
>>> and by reducing bitmaps and slice_masks from stacks, using pointers
>>> instead where possible.
>>>
>>> checkstack.pl gives, before:
>>> 0x00000de4 slice_get_unmapped_area [slice.o]:           656
>>> 0x00001b4c is_hugepage_only_range [slice.o]:            512
>>> 0x0000075c slice_find_area_topdown [slice.o]:           416
>>> 0x000004c8 slice_find_area_bottomup.isra.1 [slice.o]:   272
>>> 0x00001aa0 slice_set_range_psize [slice.o]:             240
>>> 0x00000a64 slice_find_area [slice.o]:                   176
>>> 0x00000174 slice_check_fit [slice.o]:                   112
>>>
>>> after:
>>> 0x00000d70 slice_get_unmapped_area [slice.o]:           320
>>> 0x000008f8 slice_find_area [slice.o]:                   144
>>> 0x00001860 slice_set_range_psize [slice.o]:             144
>>> 0x000018ec is_hugepage_only_range [slice.o]:            144
>>> 0x00000750 slice_find_area_bottomup.isra.4 [slice.o]:   128
>>>
>>> The benchmark in https://github.com/linuxppc/linux/issues/49 gives, before:
>>> $ time ./slicemask
>>> real	0m20.712s
>>> user	0m5.830s
>>> sys	0m15.105s
>>>
>>> after:
>>> $ time ./slicemask
>>> real	0m13.197s
>>> user	0m5.409s
>>> sys	0m7.779s
>>
>> Hi,
>>
>> I tested your series on an 8xx, on top of patch
>> https://patchwork.ozlabs.org/patch/871675/
>>
>> I don't get a result as significant as yours, but there is some
>> improvement anyway:
>>
>> ITERATION 500000
>>
>> Before:
>>
>> root@vgoip:~# time ./slicemask
>> real    0m 33.26s
>> user    0m 1.94s
>> sys     0m 30.85s
>>
>> After:
>> root@vgoip:~# time ./slicemask
>> real    0m 29.69s
>> user    0m 2.11s
>> sys     0m 27.15s
>>
>> The most significant improvement is obtained with the first patch of your series:
>> root@vgoip:~# time ./slicemask
>> real    0m 30.85s
>> user    0m 1.80s
>> sys     0m 28.57s
> 
> Okay, thanks. Are you still spending significant time in the slice
> code?

Do you mean, am I still updating my patches? No, I hope we are at the last 
run with v4, now that Aneesh has tagged all of them with his Reviewed-by.
Once the series has been accepted, my next step will be to backport at 
least the first three of them to kernel 4.14.

> 
>>
>> I had to modify your series a bit; if you are interested, I can post it.
>>
> 
> Sure, that would be good.

OK, let's share it. The patches are not 100% clean.

Christophe

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use
  2018-02-12 17:42     ` Christophe LEROY
@ 2018-02-13  8:40       ` Nicholas Piggin
  2018-02-13 11:24         ` Christophe LEROY
  0 siblings, 1 reply; 11+ messages in thread
From: Nicholas Piggin @ 2018-02-13  8:40 UTC (permalink / raw)
  To: Christophe LEROY; +Cc: linuxppc-dev, Aneesh Kumar K . V, Michael Ellerman

On Mon, 12 Feb 2018 18:42:21 +0100
Christophe LEROY <christophe.leroy@c-s.fr> wrote:

> On 12/02/2018 at 16:24, Nicholas Piggin wrote:
> > On Mon, 12 Feb 2018 16:02:23 +0100
> > Christophe LEROY <christophe.leroy@c-s.fr> wrote:
> >   
> >> On 10/02/2018 at 09:11, Nicholas Piggin wrote:
> >>> This series intends to improve performance and reduce stack
> >>> consumption in the slice allocation code. It does it by keeping slice
> >>> masks in the mm_context rather than compute them for each allocation,
> >>> and by reducing bitmaps and slice_masks from stacks, using pointers
> >>> instead where possible.
> >>>
> >>> checkstack.pl gives, before:
> >>> 0x00000de4 slice_get_unmapped_area [slice.o]:           656
> >>> 0x00001b4c is_hugepage_only_range [slice.o]:            512
> >>> 0x0000075c slice_find_area_topdown [slice.o]:           416
> >>> 0x000004c8 slice_find_area_bottomup.isra.1 [slice.o]:   272
> >>> 0x00001aa0 slice_set_range_psize [slice.o]:             240
> >>> 0x00000a64 slice_find_area [slice.o]:                   176
> >>> 0x00000174 slice_check_fit [slice.o]:                   112
> >>>
> >>> after:
> >>> 0x00000d70 slice_get_unmapped_area [slice.o]:           320
> >>> 0x000008f8 slice_find_area [slice.o]:                   144
> >>> 0x00001860 slice_set_range_psize [slice.o]:             144
> >>> 0x000018ec is_hugepage_only_range [slice.o]:            144
> >>> 0x00000750 slice_find_area_bottomup.isra.4 [slice.o]:   128
> >>>
> >>> The benchmark in https://github.com/linuxppc/linux/issues/49 gives, before:
> >>> $ time ./slicemask
> >>> real	0m20.712s
> >>> user	0m5.830s
> >>> sys	0m15.105s
> >>>
> >>> after:
> >>> $ time ./slicemask
> >>> real	0m13.197s
> >>> user	0m5.409s
> >>> sys	0m7.779s  
> >>
> >> Hi,
> >>
> >> I tested your series on an 8xx, on top of patch
> >> https://patchwork.ozlabs.org/patch/871675/
> >>
> >> I don't get a result as significant as yours, but there is some
> >> improvement anyway:
> >>
> >> ITERATION 500000
> >>
> >> Before:
> >>
> >> root@vgoip:~# time ./slicemask
> >> real    0m 33.26s
> >> user    0m 1.94s
> >> sys     0m 30.85s
> >>
> >> After:
> >> root@vgoip:~# time ./slicemask
> >> real    0m 29.69s
> >> user    0m 2.11s
> >> sys     0m 27.15s
> >>
> >> The most significant improvement is obtained with the first patch of your series:
> >> root@vgoip:~# time ./slicemask
> >> real    0m 30.85s
> >> user    0m 1.80s
> >> sys     0m 28.57s  
> > 
> > Okay, thanks. Are you still spending significant time in the slice
> > code?  
> 
> Do you mean, am I still updating my patches? No, I hope we are at the last

Actually I was wondering about the CPU time spent on the microbenchmark :)
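
It is the one from the GitHub issue linked in the cover letter. Something
in the spirit of the sketch below exercises the same path, mapping and
unmapping anonymous memory in a tight loop so the unmapped-area search
dominates -- this is only an illustrative stand-in, not the actual
slicemask source:

/* Illustrative stand-in only -- not the slicemask benchmark itself. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#define ITERATIONS	500000			/* matches the run quoted above */
#define MAP_SIZE	(16UL * 1024 * 1024)

int main(void)
{
	long i;

	for (i = 0; i < ITERATIONS; i++) {
		/* Each iteration forces a fresh unmapped-area search. */
		void *p = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return EXIT_FAILURE;
		}
		munmap(p, MAP_SIZE);
	}
	return 0;
}

Timed with plain time(1), as in the numbers earlier in the thread.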

> run with v4, now that Aneesh has tagged all of them with his Reviewed-by.
> Once the series has been accepted, my next step will be to backport at
> least the first three of them to kernel 4.14.
> 
> >   
> >>
> >> I had to modify your series a bit; if you are interested, I can post it.
> >>  
> > 
> > Sure, that would be good.  
> 
> OK, let's share it. The patches are not 100% clean.

Those look pretty good, thanks for doing that work.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use
  2018-02-13  8:40       ` Nicholas Piggin
@ 2018-02-13 11:24         ` Christophe LEROY
  0 siblings, 0 replies; 11+ messages in thread
From: Christophe LEROY @ 2018-02-13 11:24 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linuxppc-dev, Aneesh Kumar K . V, Michael Ellerman



On 13/02/2018 at 09:40, Nicholas Piggin wrote:
> On Mon, 12 Feb 2018 18:42:21 +0100
> Christophe LEROY <christophe.leroy@c-s.fr> wrote:
> 
>> On 12/02/2018 at 16:24, Nicholas Piggin wrote:
>>> On Mon, 12 Feb 2018 16:02:23 +0100
>>> Christophe LEROY <christophe.leroy@c-s.fr> wrote:
>>>    
>>>> On 10/02/2018 at 09:11, Nicholas Piggin wrote:
>>>>> This series intends to improve performance and reduce stack
>>>>> consumption in the slice allocation code. It does it by keeping slice
>>>>> masks in the mm_context rather than compute them for each allocation,
>>>>> and by reducing bitmaps and slice_masks from stacks, using pointers
>>>>> instead where possible.
>>>>>
>>>>> checkstack.pl gives, before:
>>>>> 0x00000de4 slice_get_unmapped_area [slice.o]:           656
>>>>> 0x00001b4c is_hugepage_only_range [slice.o]:            512
>>>>> 0x0000075c slice_find_area_topdown [slice.o]:           416
>>>>> 0x000004c8 slice_find_area_bottomup.isra.1 [slice.o]:   272
>>>>> 0x00001aa0 slice_set_range_psize [slice.o]:             240
>>>>> 0x00000a64 slice_find_area [slice.o]:                   176
>>>>> 0x00000174 slice_check_fit [slice.o]:                   112
>>>>>
>>>>> after:
>>>>> 0x00000d70 slice_get_unmapped_area [slice.o]:           320
>>>>> 0x000008f8 slice_find_area [slice.o]:                   144
>>>>> 0x00001860 slice_set_range_psize [slice.o]:             144
>>>>> 0x000018ec is_hugepage_only_range [slice.o]:            144
>>>>> 0x00000750 slice_find_area_bottomup.isra.4 [slice.o]:   128
>>>>>
>>>>> The benchmark in https://github.com/linuxppc/linux/issues/49 gives, before:
>>>>> $ time ./slicemask
>>>>> real	0m20.712s
>>>>> user	0m5.830s
>>>>> sys	0m15.105s
>>>>>
>>>>> after:
>>>>> $ time ./slicemask
>>>>> real	0m13.197s
>>>>> user	0m5.409s
>>>>> sys	0m7.779s
>>>>
>>>> Hi,
>>>>
>>>> I tested your series on an 8xx, on top of patch
>>>> https://patchwork.ozlabs.org/patch/871675/
>>>>
>>>> I don't get a result as significant as yours, but there is some
>>>> improvement anyway:
>>>>
>>>> ITERATION 500000
>>>>
>>>> Before:
>>>>
>>>> root@vgoip:~# time ./slicemask
>>>> real    0m 33.26s
>>>> user    0m 1.94s
>>>> sys     0m 30.85s
>>>>
>>>> After:
>>>> root@vgoip:~# time ./slicemask
>>>> real    0m 29.69s
>>>> user    0m 2.11s
>>>> sys     0m 27.15s
>>>>
>>>> The most significant improvement is obtained with the first patch of your series:
>>>> root@vgoip:~# time ./slicemask
>>>> real    0m 30.85s
>>>> user    0m 1.80s
>>>> sys     0m 28.57s
>>>
>>> Okay, thanks. Are you still spending significant time in the slice
>>> code?
>>
>> Do you mean, am I still updating my patches? No, I hope we are at the last
> 
> Actually I was wondering about the CPU time spent on the microbenchmark :)

Lol.

I've got the following perf report (functions over 0.50%)

# Overhead  Command    Shared Object      Symbol
# ........  .........  .................  ..................................
#
      7.13%  slicemask  [kernel.kallsyms]  [k] do_brk_flags
      6.19%  slicemask  [kernel.kallsyms]  [k] DoSyscall
      5.81%  slicemask  [kernel.kallsyms]  [k] perf_event_mmap
      5.55%  slicemask  [kernel.kallsyms]  [k] do_munmap
      4.55%  slicemask  [kernel.kallsyms]  [k] sys_brk
      4.43%  slicemask  [kernel.kallsyms]  [k] find_vma
      3.42%  slicemask  [kernel.kallsyms]  [k] vma_compute_subtree_gap
      3.08%  slicemask  libc-2.23.so       [.] __brk
      2.95%  slicemask  [kernel.kallsyms]  [k] slice_get_unmapped_area
      2.81%  slicemask  [kernel.kallsyms]  [k] __vm_enough_memory
      2.78%  slicemask  [kernel.kallsyms]  [k] kmem_cache_free
      2.51%  slicemask  [kernel.kallsyms]  [k] perf_iterate_ctx.constprop.84
      2.40%  slicemask  [kernel.kallsyms]  [k] unmap_page_range
      2.27%  slicemask  [kernel.kallsyms]  [k] perf_iterate_sb
      2.21%  slicemask  [kernel.kallsyms]  [k] vmacache_find
      2.04%  slicemask  [kernel.kallsyms]  [k] vma_gap_update
      1.91%  slicemask  [kernel.kallsyms]  [k] unmap_region
      1.81%  slicemask  [kernel.kallsyms]  [k] memset_nocache_branch
      1.59%  slicemask  [kernel.kallsyms]  [k] kmem_cache_alloc
      1.57%  slicemask  [kernel.kallsyms]  [k] get_unmapped_area.part.7
      1.55%  slicemask  [kernel.kallsyms]  [k] up_write
      1.44%  slicemask  [kernel.kallsyms]  [k] vma_merge
      1.28%  slicemask  slicemask          [.] main
      1.27%  slicemask  [kernel.kallsyms]  [k] lru_add_drain
      1.22%  slicemask  [kernel.kallsyms]  [k] vma_link
      1.19%  slicemask  [kernel.kallsyms]  [k] tlb_gather_mmu
      1.17%  slicemask  [kernel.kallsyms]  [k] tlb_flush_mmu_free
      1.15%  slicemask  libc-2.23.so       [.] got_label
      1.11%  slicemask  [kernel.kallsyms]  [k] unlink_anon_vmas
      1.06%  slicemask  [kernel.kallsyms]  [k] lru_add_drain_cpu
      1.02%  slicemask  [kernel.kallsyms]  [k] free_pgtables
      1.01%  slicemask  [kernel.kallsyms]  [k] remove_vma
      0.98%  slicemask  [kernel.kallsyms]  [k] strlcpy
      0.98%  slicemask  [kernel.kallsyms]  [k] perf_event_mmap_output
      0.95%  slicemask  [kernel.kallsyms]  [k] may_expand_vm
      0.90%  slicemask  [kernel.kallsyms]  [k] unmap_vmas
      0.86%  slicemask  [kernel.kallsyms]  [k] down_write_killable
      0.83%  slicemask  [kernel.kallsyms]  [k] __vma_link_list
      0.83%  slicemask  [kernel.kallsyms]  [k] arch_vma_name
      0.81%  slicemask  [kernel.kallsyms]  [k] __vma_rb_erase
      0.80%  slicemask  [kernel.kallsyms]  [k] __rcu_read_unlock
      0.71%  slicemask  [kernel.kallsyms]  [k] tlb_flush_mmu
      0.70%  slicemask  [kernel.kallsyms]  [k] tlb_finish_mmu
      0.68%  slicemask  [kernel.kallsyms]  [k] __rb_insert_augmented
      0.63%  slicemask  [kernel.kallsyms]  [k] cap_capable
      0.61%  slicemask  [kernel.kallsyms]  [k] free_pgd_range
      0.59%  slicemask  [kernel.kallsyms]  [k] arch_tlb_finish_mmu
      0.59%  slicemask  [kernel.kallsyms]  [k] __vma_link_rb
      0.56%  slicemask  [kernel.kallsyms]  [k] __rcu_read_lock
      0.55%  slicemask  [kernel.kallsyms]  [k] arch_get_unmapped_area_topdown
      0.53%  slicemask  [kernel.kallsyms]  [k] unlink_file_vma
      0.51%  slicemask  [kernel.kallsyms]  [k] vmacache_update
      0.50%  slicemask  [kernel.kallsyms]  [k] kfree

Unfortunately I didn't run a perf report before applying the patch series.
If you are interested in the comparison, I won't be able to do it
before next week.

> 
>> run with v4, now that Aneesh has tagged all of them with his Reviewed-by.
>> Once the series has been accepted, my next step will be to backport at
>> least the first three of them to kernel 4.14.
>>
>>>    
>>>>
>>>> I had to modify your series a bit; if you are interested, I can post it.
>>>>   
>>>
>>> Sure, that would be good.
>>
>> OK, let's share it. The patches are not 100% clean.
> 
> Those look pretty good, thanks for doing that work.

You are welcome. I wanted to try your series on the 8xx. It is untested
on book3s64; I'm not sure it even compiles.

Christophe

> 
> Thanks,
> Nick
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2018-02-13 11:24 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-10  8:11 [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Nicholas Piggin
2018-02-10  8:11 ` [RFC PATCH 1/5] powerpc/mm/slice: pass pointers to struct slice_mask where possible Nicholas Piggin
2018-02-10  8:11 ` [RFC PATCH 2/5] powerpc/mm/slice: implement a slice mask cache Nicholas Piggin
2018-02-10  8:11 ` [RFC PATCH 3/5] powerpc/mm/slice: implement slice_check_range_fits Nicholas Piggin
2018-02-10  8:11 ` [RFC PATCH 4/5] powerpc/mm/slice: Use const pointers to cached slice masks where possible Nicholas Piggin
2018-02-10  8:11 ` [RFC PATCH 5/5] powerpc/mm/slice: use the dynamic high slice size to limit bitmap operations Nicholas Piggin
2018-02-12 15:02 ` [RFC PATCH 0/5] powerpc/mm/slice: improve slice speed and stack use Christophe LEROY
2018-02-12 15:24   ` Nicholas Piggin
2018-02-12 17:42     ` Christophe LEROY
2018-02-13  8:40       ` Nicholas Piggin
2018-02-13 11:24         ` Christophe LEROY
