* [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-20 17:25 Mikulas Patocka
  2018-03-20 17:35   ` Matthew Wilcox
  0 siblings, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-20 17:25 UTC (permalink / raw)
  To: Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton
  Cc: linux-mm, dm-devel, Mike Snitzer

Hi

I'm submitting this patch for the slab allocator. The patch adds a new 
flag SLAB_MINIMIZE_WASTE. When this flag is present, the slab subsystem 
will use higher-order allocations to minimize wasted space (it will not 
use higher-order allocations when there's no benefit, such as if the 
object size is a power of two).

The reason why we need this is that we are going to merge code that does 
block device deduplication (it was developed separately and sold as a 
commercial product), and the code uses block sizes that are not a power of 
two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab 
allocator rounds up the allocation to the nearest power of two, but that 
wastes a lot of memory. Performance of the solution depends on efficient 
memory usage, so we should minimize waste as much as possible.

Larger-order allocations are unreliable (they may fail at any time if the 
memory is fragmented), but it is not an issue here - the code preallocates 
a few buffers with vmalloc and then allocates buffers from the slab cache. 
If the allocation fails due to memory fragmentation, we evict and reuse an 
existing buffer, so there is no loss of functionality.
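
For illustration, a minimal sketch of the intended usage, assuming the flag 
added by this patch; the cache name and the fallback helper are hypothetical, 
the rest is standard kernel API:

#include <linux/slab.h>

static struct kmem_cache *dedup_buffer_cache;

static int dedup_create_cache(void)
{
	/* 640K objects; the new flag lets SLUB pick a 2MB slab (3 objects) */
	dedup_buffer_cache = kmem_cache_create("dedup_buffer-640k", 640 << 10,
					       0, SLAB_MINIMIZE_WASTE, NULL);
	return dedup_buffer_cache ? 0 : -ENOMEM;
}

static void *dedup_alloc_buffer(void)
{
	/* fail fast instead of stalling in reclaim for a high-order slab */
	void *buf = kmem_cache_alloc(dedup_buffer_cache,
				     GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);

	if (!buf)
		buf = dedup_reuse_existing_buffer();	/* hypothetical fallback */
	return buf;
}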

Mikulas


From: Mikulas Patocka <mpatocka@redhat.com>

This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
flag causes allocation of larger (higher-order) slab pages in order to minimize wasted
space.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 include/linux/slab.h  |    7 +++++++
 mm/slab.c             |    4 ++--
 mm/slab.h             |    7 ++++---
 mm/slab_common.c      |    2 +-
 mm/slub.c             |   25 ++++++++++++++++++++-----
 6 files changed, 35 insertions(+), 12 deletions(-)

Index: linux-2.6/include/linux/slab.h
===================================================================
--- linux-2.6.orig/include/linux/slab.h	2018-03-20 14:59:20.528030000 +0100
+++ linux-2.6/include/linux/slab.h	2018-03-20 14:59:20.518030000 +0100
@@ -108,6 +108,13 @@
 #define SLAB_KASAN		0
 #endif
 
+/*
+ * Use higher order allocations to minimize wasted space.
+ * Note: the allocation is unreliable if this flag is used, the caller
+ * must handle allocation failures gracefully.
+ */
+#define SLAB_MINIMIZE_WASTE	((slab_flags_t __force)0x10000000U)
+
 /* The following flags affect the page allocator grouping pages by mobility */
 /* Objects are reclaimable */
 #define SLAB_RECLAIM_ACCOUNT	((slab_flags_t __force)0x00020000U)
Index: linux-2.6/mm/slab_common.c
===================================================================
--- linux-2.6.orig/mm/slab_common.c	2018-03-20 14:59:20.528030000 +0100
+++ linux-2.6/mm/slab_common.c	2018-03-20 14:59:20.518030000 +0100
@@ -52,7 +52,7 @@ static DECLARE_WORK(slab_caches_to_rcu_d
 		SLAB_FAILSLAB | SLAB_KASAN)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
-			 SLAB_ACCOUNT)
+			 SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
 
 /*
  * Merge control. If this is set then no merging of slab caches will occur.
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2018-03-20 14:59:20.528030000 +0100
+++ linux-2.6/mm/slub.c	2018-03-20 14:59:20.518030000 +0100
@@ -3234,7 +3234,7 @@ static inline int slab_order(int size, i
 	return order;
 }
 
-static inline int calculate_order(int size, int reserved)
+static inline int calculate_order(int size, int reserved, slab_flags_t flags)
 {
 	int order;
 	int min_objects;
@@ -3261,7 +3261,7 @@ static inline int calculate_order(int si
 			order = slab_order(size, min_objects,
 					slub_max_order, fraction, reserved);
 			if (order <= slub_max_order)
-				return order;
+				goto ret_order;
 			fraction /= 2;
 		}
 		min_objects--;
@@ -3273,15 +3273,30 @@ static inline int calculate_order(int si
 	 */
 	order = slab_order(size, 1, slub_max_order, 1, reserved);
 	if (order <= slub_max_order)
-		return order;
+		goto ret_order;
 
 	/*
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
 	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
 	if (order < MAX_ORDER)
-		return order;
+		goto ret_order;
 	return -ENOSYS;
+
+ret_order:
+	if (flags & SLAB_MINIMIZE_WASTE) {
+		/* Increase the order if it decreases waste */
+		int test_order;
+		for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
+			unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
+			unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
+			if (test_order_objects >= min(64, MAX_OBJS_PER_PAGE))
+				break;
+			if (test_order_objects > order_objects << (test_order - order))
+				order = test_order;
+		}
+	}
+	return order;
 }
 
 static void
@@ -3546,7 +3561,7 @@ static int calculate_sizes(struct kmem_c
 	if (forced_order >= 0)
 		order = forced_order;
 	else
-		order = calculate_order(size, s->reserved);
+		order = calculate_order(size, s->reserved, flags);
 
 	if (order < 0)
 		return 0;
Index: linux-2.6/mm/slab.h
===================================================================
--- linux-2.6.orig/mm/slab.h	2018-03-20 14:59:20.528030000 +0100
+++ linux-2.6/mm/slab.h	2018-03-20 14:59:20.518030000 +0100
@@ -142,10 +142,10 @@ static inline slab_flags_t kmem_cache_fl
 #if defined(CONFIG_SLAB)
 #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
 			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
-			  SLAB_ACCOUNT)
+			  SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
 #elif defined(CONFIG_SLUB)
 #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
-			  SLAB_TEMPORARY | SLAB_ACCOUNT)
+			  SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
 #else
 #define SLAB_CACHE_FLAGS (0)
 #endif
@@ -164,7 +164,8 @@ static inline slab_flags_t kmem_cache_fl
 			      SLAB_NOLEAKTRACE | \
 			      SLAB_RECLAIM_ACCOUNT | \
 			      SLAB_TEMPORARY | \
-			      SLAB_ACCOUNT)
+			      SLAB_ACCOUNT | \
+			      SLAB_MINIMIZE_WASTE)
 
 int __kmem_cache_shutdown(struct kmem_cache *);
 void __kmem_cache_release(struct kmem_cache *);
Index: linux-2.6/mm/slab.c
===================================================================
--- linux-2.6.orig/mm/slab.c	2018-03-20 14:59:20.528030000 +0100
+++ linux-2.6/mm/slab.c	2018-03-20 14:59:20.518030000 +0100
@@ -1789,14 +1789,14 @@ static size_t calculate_slab_order(struc
 		 * as GFP_NOFS and we really don't want to have to be allocating
 		 * higher-order pages when we are unable to shrink dcache.
 		 */
-		if (flags & SLAB_RECLAIM_ACCOUNT)
+		if (flags & SLAB_RECLAIM_ACCOUNT && !(flags & SLAB_MINIMIZE_WASTE))
 			break;
 
 		/*
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
-		if (gfporder >= slab_max_order)
+		if (gfporder >= slab_max_order && !(flags & SLAB_MINIMIZE_WASTE))
 			break;
 
 		/*

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-20 17:25 [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE Mikulas Patocka
@ 2018-03-20 17:35   ` Matthew Wilcox
  0 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-20 17:35 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Tue, Mar 20, 2018 at 01:25:09PM -0400, Mikulas Patocka wrote:
> The reason why we need this is that we are going to merge code that does 
> block device deduplication (it was developed separately and sold as a 
> commercial product), and the code uses block sizes that are not a power of 
> two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab 
> allocator rounds up the allocation to the nearest power of two, but that 
> wastes a lot of memory. Performance of the solution depends on efficient 
> memory usage, so we should minimize waste as much as possible.

The SLUB allocator also falls back to using the page (buddy) allocator
for allocations above 8kB, so this patch is going to have no effect on
slub.  You'd be better off using alloc_pages_exact() for this kind of
size, or managing your own pool of pages by using something like five
192k blocks in a 1MB allocation.
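
For reference, a rough sketch of the alloc_pages_exact() route being 
suggested (illustrative only; 192K is one of the block sizes mentioned 
above):

#include <linux/gfp.h>

/*
 * 192K rounds up to 48 pages; alloc_pages_exact() internally takes a
 * power-of-two allocation and hands the unused tail pages back, so only
 * the 48 pages remain allocated.
 */
static void *alloc_dedup_block(void)
{
	return alloc_pages_exact(192 << 10,
				 GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
}

static void free_dedup_block(void *block)
{
	free_pages_exact(block, 192 << 10);
}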

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-20 17:35   ` Matthew Wilcox
@ 2018-03-20 17:54     ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-20 17:54 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Mikulas Patocka, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Tue, 20 Mar 2018, Matthew Wilcox wrote:

> On Tue, Mar 20, 2018 at 01:25:09PM -0400, Mikulas Patocka wrote:
> > The reason why we need this is that we are going to merge code that does
> > block device deduplication (it was developed separately and sold as a
> > commercial product), and the code uses block sizes that are not a power of
> > two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab
> > allocator rounds up the allocation to the nearest power of two, but that
> > wastes a lot of memory. Performance of the solution depends on efficient
> > memory usage, so we should minimize waste as much as possible.
>
> The SLUB allocator also falls back to using the page (buddy) allocator
> for allocations above 8kB, so this patch is going to have no effect on
> slub.  You'd be better off using alloc_pages_exact() for this kind of
> size, or managing your own pool of pages by using something like five
> 192k blocks in a 1MB allocation.

The fallback is only effective for kmalloc caches. Manually created caches
do not follow this rule.

Note that you can already control the page orders for allocation and
the objects per slab using

	slub_min_order
	slub_max_order
	slub_min_objects

This is documented in linux/Documentation/vm/slub.txt

Maybe do the same thing for SLAB?

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-20 17:54     ` Christopher Lameter
@ 2018-03-20 19:22       ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-20 19:22 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer



On Tue, 20 Mar 2018, Christopher Lameter wrote:

> On Tue, 20 Mar 2018, Matthew Wilcox wrote:
> 
> > On Tue, Mar 20, 2018 at 01:25:09PM -0400, Mikulas Patocka wrote:
> > > The reason why we need this is that we are going to merge code that does
> > > block device deduplication (it was developed separately and sold as a
> > > commercial product), and the code uses block sizes that are not a power of
> > > two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab
> > > allocator rounds up the allocation to the nearest power of two, but that
> > > wastes a lot of memory. Performance of the solution depends on efficient
> > > memory usage, so we should minimize waste as much as possible.
> >
> > The SLUB allocator also falls back to using the page (buddy) allocator
> > for allocations above 8kB, so this patch is going to have no effect on
> > slub.  You'd be better off using alloc_pages_exact() for this kind of
> > size, or managing your own pool of pages by using something like five
> > 192k blocks in a 1MB allocation.
> 
> The fallback is only effective for kmalloc caches. Manually created caches
> do not follow this rule.

Yes - the dm-bufio layer uses manually created caches.

> Note that you can already control the page orders for allocation and
> the objects per slab using
> 
> 	slub_min_order
> 	slub_max_order
> 	slub_min_objects
> 
> This is documented in linux/Documentation/vm/slub.txt
> 
> Maybe do the same thing for SLAB?

Yes, but I need to change it for a specific cache, not for all caches.

When the order is greater than 3 (PAGE_ALLOC_COSTLY_ORDER), the allocation 
becomes unreliable, thus it is a bad idea to increase slub_max_order 
system-wide.

Another problem with slub_max_order is that it would pad all caches to 
slub_max_order, even those that already have a power-of-two size (in that 
case, the padding is counterproductive).

BTW. the function "order_store" in mm/slub.c modifies the structure 
kmem_cache without taking any locks - is it a bug?

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-20 19:22       ` Mikulas Patocka
@ 2018-03-20 20:42         ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-20 20:42 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Tue, 20 Mar 2018, Mikulas Patocka wrote:

> > Maybe do the same thing for SLAB?
>
> Yes, but I need to change it for a specific cache, not for all caches.

Why only some caches?

> When the order is greater than 3 (PAGE_ALLOC_COSTLY_ORDER), the allocation
> becomes unreliable, thus it is a bad idea to increase slub_max_order
> system-wide.

Well, the allocation is more likely to fail, that is true, but SLUB will
fall back to a smaller order should the page allocator refuse to give us
that larger-sized page.
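
For context, an abridged paraphrase of the fallback being described (based 
on allocate_slab() in mm/slub.c; a sketch of the logic, not the exact 
source and not compilable on its own):

/* first try the calculated (possibly high) order, but do not retry hard */
alloc_gfp = flags | __GFP_NOWARN | __GFP_NORETRY;
page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
	/* the high-order attempt failed, likely due to fragmentation */
	oo = s->min;
	page = alloc_slab_page(s, flags, node, oo);
	if (page)
		stat(s, ORDER_FALLBACK);
}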

> Another problem with slub_max_order is that it would pad all caches to
> slub_max_order, even those that already have a power-of-two size (in that
> case, the padding is counterproductive).

No it does not. Slub will calculate the configuration with the least byte
wastage. It is not the standard order but the maximum order to be used.
Power of two caches below PAGE_SIZE will have order 0.

There are some corner cases where extra metadata is needed per object or
per page that will result in either object sizes that are no longer a
power of two or in page sizes smaller than the whole page. Maybe you have
a case like that? Can you show me a cache that has this issue?

> BTW. the function "order_store" in mm/slub.c modifies the structure
> kmem_cache without taking any locks - is it a bug?

The kmem_cache structure was just allocated. Only one thread can access it
thus no locking is necessary.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-20 20:42         ` Christopher Lameter
@ 2018-03-20 22:02           ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-20 22:02 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer



On Tue, 20 Mar 2018, Christopher Lameter wrote:

> On Tue, 20 Mar 2018, Mikulas Patocka wrote:
> 
> > > Maybe do the same thing for SLAB?
> >
> > Yes, but I need to change it for a specific cache, not for all caches.
> 
> Why only some caches?

I need high order for the buffer cache that holds the deduplicated data. I 
don't need to force it system-wide.

> > When the order is greater than 3 (PAGE_ALLOC_COSTLY_ORDER), the allocation
> > becomes unreliable, thus it is a bad idea to increase slub_max_order
> > system-wide.
> 
> Well, the allocation is more likely to fail, that is true, but SLUB will
> fall back to a smaller order should the page allocator refuse to give us
> that larger-sized page.

Does SLAB have this fall-back too?

> > Another problem with slub_max_order is that it would pad all caches to
> > slub_max_order, even those that already have a power-of-two size (in that
> > case, the padding is counterproductive).
> 
> No it does not. Slub will calculate the configuration with the least byte
> wastage. It is not the standard order but the maximum order to be used.
> Power of two caches below PAGE_SIZE will have order 0.

Try to boot with slub_max_order=10 and you can see this in /proc/slabinfo:
kmalloc-8192         352    352   8192   32   64 : tunables    0    0    0 : slabdata     11     11      0
                                             ^^^^

So it rounds up power-of-two sizes to high orders unnecessarily. Without 
slub_max_order=10, the number of pages for the kmalloc-8192 cache is just 
8.

I observe the same pathological rounding in dm-bufio caches.

> There are some corner cases where extra metadata is needed per object or
> per page that will result in either object sizes that are no longer a
> power of two or in page sizes smaller than the whole page. Maybe you have
> a case like that? Can you show me a cache that has this issue?

Here I have a patch set that changes the dm-bufio subsystem to support 
buffer sizes that are not a power of two:
http://people.redhat.com/~mpatocka/patches/kernel/dm-bufio-arbitrary-sector-size/

I need to change the slub cache to minimize wasted space - i.e. when 
asking for a slab cache for 640kB objects, the slub system currently 
allocates 1MB per object and 384kB is wasted. This is the reason why I'm 
making this patch.

> > BTW. the function "order_store" in mm/slub.c modifies the structure
> > kmem_cache without taking any locks - is it a bug?
> 
> The kmem_cache structure was just allocated. Only one thread can access it
> thus no locking is necessary.

No - order_store is called when writing to /sys/kernel/slab/<cache>/order 
- you can modify order for any existing cache - and the modification 
happens without any locking.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-20 22:02           ` Mikulas Patocka
@ 2018-03-21 15:35             ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 15:35 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Tue, 20 Mar 2018, Mikulas Patocka wrote:

> > > Another problem with slub_max_order is that it would pad all caches to
> > > slub_max_order, even those that already have a power-of-two size (in that
> > > case, the padding is counterproductive).
> >
> > No it does not. Slub will calculate the configuration with the least byte
> > wastage. It is not the standard order but the maximum order to be used.
> > Power of two caches below PAGE_SIZE will have order 0.
>
> Try to boot with slub_max_order=10 and you can see this in /proc/slabinfo:
> kmalloc-8192         352    352   8192   32   64 : tunables    0    0    0 : slabdata     11     11      0

Yes it tries to create a slab size that will accommodate the minimum
objects per slab.

> So it rounds up power-of-two sizes to high orders unnecessarily. Without
> slub_max_order=10, the number of pages for the kmalloc-8192 cache is just
> 8.

The kmalloc-8192 has 4 objects per slab on my system which means an
allocation size of 32k = order 3.

In this case 4 objects fit tightly into a slab. There is no waste.

But then I thought you were talking about manually created slabs not
about the kmalloc array?

> I observe the same pathological rounding in dm-bufio caches.
>
> > There are some corner cases where extra metadata is needed per object or
> > per page that will result in either object sizes that are no longer a
> > power of two or in page sizes smaller than the whole page. Maybe you have
> > a case like that? Can you show me a cache that has this issue?
>
> Here I have a patch set that changes the dm-bufio subsystem to support
> buffer sizes that are not a power of two:
> http://people.redhat.com/~mpatocka/patches/kernel/dm-bufio-arbitrary-sector-size/
>
> I need to change the slub cache to minimize wasted space - i.e. when
> asking for a slab cache for 640kB objects, the slub system currently
> allocates 1MB per object and 384kB is wasted. This is the reason why I'm
> making this patch.

You should not be using the slab allocators for these. Allocate higher
order pages or numbers of consecutive smaller pages from the page
allocator. The slab allocators are written for objects smaller than page
size.

> > > BTW. the function "order_store" in mm/slub.c modifies the structure
> > > kmem_cache without taking any locks - is it a bug?
> >
> > The kmem_cache structure was just allocated. Only one thread can access it
> > thus no locking is necessary.
>
> No - order_store is called when writing to /sys/kernel/slab/<cache>/order
> - you can modify order for any existing cache - and the modification
> happens without any locking.

Well it still does not matter. The size of the order of slab pages
can be dynamic even within a slab. You can have pages of varying sizes.

What kind of problem could be caused here?

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 15:35             ` Christopher Lameter
@ 2018-03-21 16:25               ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 16:25 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer



On Wed, 21 Mar 2018, Christopher Lameter wrote:

> On Tue, 20 Mar 2018, Mikulas Patocka wrote:
> 
> > > > Another problem with slub_max_order is that it would pad all caches to
> > > > slub_max_order, even those that already have a power-of-two size (in that
> > > > case, the padding is counterproductive).
> > >
> > > No it does not. Slub will calculate the configuration with the least byte
> > > wastage. It is not the standard order but the maximum order to be used.
> > > Power of two caches below PAGE_SIZE will have order 0.
> >
> > Try to boot with slub_max_order=10 and you can see this in /proc/slabinfo:
> > kmalloc-8192         352    352   8192   32   64 : tunables    0    0    0 : slabdata     11     11      0
> 
> Yes it tries to create a slab size that will accommodate the minimum
> objects per slab.
> 
> > So it rounds up power-of-two sizes to high orders unnecessarily. Without
> > slub_max_order=10, the number of pages for the kmalloc-8192 cache is just
> > 8.
> 
> The kmalloc-8192 has 4 objects per slab on my system which means an
> allocation size of 32k = order 3.
> 
> In this case 4 objects fit tightly into a slab. There is no waste.
> 
> But then I thought you were talking about manually created slabs not
> about the kmalloc array?

For some workloads, dm-bufio needs caches with sizes that are a power of 
two (the majority of workloads fall into this category). For other workloads 
dm-bufio needs caches with sizes that are not a power of two.

Now - we don't want higher-order allocations for power-of-two caches 
(because higher-order allocations just cause memory fragmentation without 
any benefit), but we want higher-order allocations for non-power-of-two 
caches (because higher-order allocations minimize wasted space).

For example:
for 192K block size, the ideal order is 4MB (it takes 21 blocks)
for 448K block size, the ideal order is 4MB (it takes 9 blocks)
for 512K block size, the ideal order is 512KB (there is no benefit from 
	using higher order)
for 640K block size, the ideal order is 2MB (it takes 3 blocks, increasing 
	the allocation size to 4MB doesn't result in any benefit)
for 832K block size, the ideal order is 1MB (it takes 1 block, increasing
	the allocation to 2MB or 4MB doesn't result in any benefit)
for 1M block size, the ideal order is 1MB

The problem with "slub_max_order" is that it increases the order either 
always or never, but doesn't have the capability to calculate the ideal 
order for the given object size. The patch that I sent just does this 
calculation.
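
To make the numbers above concrete, here is a small user-space sketch (not 
kernel code) of the rule the patch applies at the end of calculate_order(): 
grow the slab only while a larger allocation packs in proportionally more 
objects. The s->reserved term and the objects-per-slab cap are omitted for 
brevity; run against the block sizes above, it reproduces the table:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define MAX_ORDER	11

static int pick_order(unsigned long size)
{
	int order, test_order;

	/* smallest order whose slab holds at least one object */
	for (order = 0; (PAGE_SIZE << order) < size; order++)
		;

	for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
		unsigned long order_objects = (PAGE_SIZE << order) / size;
		unsigned long test_order_objects = (PAGE_SIZE << test_order) / size;

		/* same condition as the patch: grow only if it packs better */
		if (test_order_objects > order_objects << (test_order - order))
			order = test_order;
	}
	return order;
}

int main(void)
{
	unsigned long sizes[] = { 192 << 10, 448 << 10, 512 << 10,
				  640 << 10, 832 << 10, 1 << 20 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned long slab = PAGE_SIZE << pick_order(sizes[i]);
		unsigned long objs = slab / sizes[i];

		printf("%4luK block: %5luK slab, %2lu objects, %4luK wasted\n",
		       sizes[i] >> 10, slab >> 10, objs,
		       (slab - objs * sizes[i]) >> 10);
	}
	return 0;
}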

Another problem with "slub_max_order" is that the device driver that needs 
to create a slab cache cannot really set it - the device driver can't 
modify the kernel parameters.

> > I observe the same pathological rounding in dm-bufio caches.
> >
> > > There are some corner cases where extra metadata is needed per object or
> > > per page that will result in either object sizes that are no longer a
> > > power of two or in page sizes smaller than the whole page. Maybe you have
> > > a case like that? Can you show me a cache that has this issue?
> >
> > Here I have a patch set that changes the dm-bufio subsystem to support
> > buffer sizes that are not a power of two:
> > http://people.redhat.com/~mpatocka/patches/kernel/dm-bufio-arbitrary-sector-size/
> >
> > I need to change the slub cache to minimize wasted space - i.e. when
> > asking for a slab cache for 640kB objects, the slub system currently
> > allocates 1MB per object and 384kB is wasted. This is the reason why I'm
> > making this patch.
> 
> You should not be using the slab allocators for these. Allocate higher
> order pages or numbers of consecutive smaller pages from the page
> allocator. The slab allocators are written for objects smaller than page
> size.

So, do you argue that I need to write my own slab cache functionality 
instead of using the existing slab code?

I can do it - but duplicating code is a bad thing.

> > > > BTW. the function "order_store" in mm/slub.c modifies the structure
> > > > kmem_cache without taking any locks - is it a bug?
> > >
> > > The kmem_cache structure was just allocated. Only one thread can access it
> > > thus no locking is necessary.
> >
> > No - order_store is called when writing to /sys/kernel/slab/<cache>/order
> > - you can modify order for any existing cache - and the modification
> > happens without any locking.
> 
> Well it still does not matter. The size of the order of slab pages
> can be dynamic even within a slab. You can have pages of varying sizes.
> 
> What kind of problem could be caused here?

Unlocked accesses are generally considered bad. For example, see this 
piece of code in calculate_sizes:
        s->allocflags = 0;
        if (order)
                s->allocflags |= __GFP_COMP;

        if (s->flags & SLAB_CACHE_DMA)
                s->allocflags |= GFP_DMA;

        if (s->flags & SLAB_RECLAIM_ACCOUNT)
                s->allocflags |= __GFP_RECLAIMABLE;

If you are running this while the cache is in use (i.e. when the user 
writes /sys/kernel/slab/<cache>/order), then other processes will see 
invalid s->allocflags for a short time.
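
(Illustrative only, not part of the patch: one conventional way to avoid 
readers observing the transient state would be to build the value in a 
local variable and publish it with a single store, e.g.

        gfp_t allocflags = 0;

        if (order)
                allocflags |= __GFP_COMP;
        if (s->flags & SLAB_CACHE_DMA)
                allocflags |= GFP_DMA;
        if (s->flags & SLAB_RECLAIM_ACCOUNT)
                allocflags |= __GFP_RECLAIMABLE;
        WRITE_ONCE(s->allocflags, allocflags);

but that still would not order the update against s->oo and the other 
fields that calculate_sizes() recomputes.)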

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 16:25               ` Mikulas Patocka
@ 2018-03-21 17:10                 ` Matthew Wilcox
  -1 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-21 17:10 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Christopher Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Wed, Mar 21, 2018 at 12:25:39PM -0400, Mikulas Patocka wrote:
> Now - we don't want higher-order allocations for power-of-two caches 
> (because higher-order allocations just cause memory fragmentation without 
> any benefit)

Higher-order allocations don't cause memory fragmentation.  Indeed,
they avoid it.  They do fail as a result of fragmentation, which is
probably what you meant.

> , but we want higher-order allocations for non-power-of-two 
> caches (because higher-order allocations minimize wasted space).
> 
> For example:
> for 192K block size, the ideal order is 4MB (it takes 21 blocks)

I wonder if that's true.  You can get five blocks into 1MB, wasting 64kB.
So going up by two allocation orders lets you get an extra block in
at the cost of failing more frequently.

> > You should not be using the slab allocators for these. Allocate higher
> > order pages or numbers of consecutive smaller pages from the page
> > allocator. The slab allocators are written for objects smaller than page
> > size.
> 
> So, do you argue that I need to write my own slab cache functionality 
> instead of using the existing slab code?
> 
> I can do it - but duplicating code is a bad thing.

It is -- but writing a special-purpose allocator can be better than making
a general purpose allocator also solve a special purpose.  I don't know
whether that's true here or not.

Your allocator seems like it could be remarkably simple; you know
you're always doing high-order allocations, and you know that you're
never allocating more than a handful of blocks from a page allocation.
So you can probably store all of your metadata in the struct page
(because your metadata is basically a bitmap) and significantly save on
memory usage.  The one downside I see is that you don't get the reporting
through /proc/slabinfo.
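
A rough sketch of the special-purpose allocator being described, assuming 
one high-order page allocation per chunk, only a handful of equally-sized 
blocks per chunk, and the free-block bitmap kept in page->private (all 
names are hypothetical; locking and error handling omitted):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/bitops.h>

static struct page *chunk_create(unsigned int order, unsigned int blocks_per_chunk)
{
	struct page *chunk = alloc_pages(GFP_NOIO | __GFP_NORETRY | __GFP_COMP,
					 order);

	if (chunk)	/* mark all blocks free; bitmap fits in one long */
		set_page_private(chunk, (1UL << blocks_per_chunk) - 1);
	return chunk;
}

static void *chunk_alloc_block(struct page *chunk, unsigned int blocks_per_chunk,
			       size_t block_size)
{
	unsigned long i = find_first_bit(&chunk->private, blocks_per_chunk);

	if (i >= blocks_per_chunk)
		return NULL;			/* chunk is full */
	clear_bit(i, &chunk->private);
	return page_address(chunk) + i * block_size;
}

static void chunk_free_block(struct page *chunk, void *block, size_t block_size)
{
	set_bit((block - page_address(chunk)) / block_size, &chunk->private);
}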

So, is this an area where slub should be improved, or is this a case where
writing a special-purpose allocator makes more sense?  It seems like you
already have a special-purpose allocator, in that you know how to fall
back to vmalloc if slab-alloc fails.  So maybe have your own allocator
that interfaces to the page allocator for now; keep its interface nice
and clean, and maybe it'll get pulled out of your driver and put into mm/
some day if it becomes a useful API for everybody to share?

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 16:25               ` Mikulas Patocka
@ 2018-03-21 17:30                 ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 17:30 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Wed, 21 Mar 2018, Mikulas Patocka wrote:

> > You should not be using the slab allocators for these. Allocate higher
> > order pages or numbers of consecutive smaller pages from the page
> > allocator. The slab allocators are written for objects smaller than page
> > size.
>
> So, do you argue that I need to write my own slab cache functionality
> instead of using the existing slab code?

Just use the existing page allocator calls to allocate and free the
memory you need.

> I can do it - but duplicating code is a bad thing.

There is no need to duplicate anything. There is lots of infrastructure
already in the kernel. You just need to use the right allocation / freeing
calls.

> > What kind of problem could be caused here?
>
> Unlocked accesses are generally considered bad. For example, see this
> piece of code in calculate_sizes:
>         s->allocflags = 0;
>         if (order)
>                 s->allocflags |= __GFP_COMP;
>
>         if (s->flags & SLAB_CACHE_DMA)
>                 s->allocflags |= GFP_DMA;
>
>         if (s->flags & SLAB_RECLAIM_ACCOUNT)
>                 s->allocflags |= __GFP_RECLAIMABLE;
>
> If you are running this while the cache is in use (i.e. when the user
> writes /sys/kernel/slab/<cache>/order), then other processes will see
> invalid s->allocflags for a short time.

Calculating sizes is done when the slab has only a single accessor. Thus
no locking is needed.

Changing the size of objects in a slab cache when there is already a set
of objects allocated and under management by the slab cache would
cause the allocator to fail and lead to garbled data.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 17:30                 ` Christopher Lameter
@ 2018-03-21 17:39                   ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 17:39 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

One other thought: If you want to improve the behavior for large scale
objects allocated through kmalloc/kmemcache then we would certainly be
glad to entertain those ideas.

F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
allocate powers of two pages. It would be relatively easy to make
kmalloc_large round the allocation to the next page size and then allocate
N consecutive pages via alloc_pages_exact() and free the remaining unused
pages or some such thing.
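
A minimal sketch of that idea (illustrative; this is not what the current 
kmalloc_large() does, and the free path would also need to know the 
allocation size, which is one reason it is only a sketch):

static void *kmalloc_large_exact(size_t size, gfp_t flags)
{
	/*
	 * Round up to whole pages instead of to the next power of two;
	 * alloc_pages_exact() frees the unused power-of-two tail.
	 */
	return alloc_pages_exact(PAGE_ALIGN(size), flags);
}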

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 17:39                   ` Christopher Lameter
@ 2018-03-21 17:49                     ` Matthew Wilcox
  -1 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-21 17:49 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mikulas Patocka, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Wed, Mar 21, 2018 at 12:39:33PM -0500, Christopher Lameter wrote:
> One other thought: If you want to improve the behavior for large scale
> objects allocated through kmalloc/kmemcache then we would certainly be
> glad to entertain those ideas.
> 
> F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> allocate powers of two pages. It would be relatively easy to make
> kmalloc_large round the allocation to the next page size and then allocate
> N consecutive pages via alloc_pages_exact() and free the remainder unused
> pages or some such thing.

I don't know if that's a good idea.  That will contribute to fragmentation
if the allocation is held onto for a short-to-medium length of time.
If the allocation is for a very long period of time then those pages
would have been unavailable anyway, but if the user of the tail pages
holds them beyond the lifetime of the large allocation, then this is
probably a bad tradeoff to make.

I do see Mikulas' use case as interesting, I just don't know whether it's
worth changing slab/slub to support it.  At first blush, other than the
sheer size of the allocations, it's a good fit.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 17:49                     ` Matthew Wilcox
@ 2018-03-21 18:01                       ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:01 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Mikulas Patocka, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Wed, 21 Mar 2018, Matthew Wilcox wrote:

> I don't know if that's a good idea.  That will contribute to fragmentation
> if the allocation is held onto for a short-to-medium length of time.
> If the allocation is for a very long period of time then those pages
> would have been unavailable anyway, but if the user of the tail pages
> holds them beyond the lifetime of the large allocation, then this is
> probably a bad tradeoff to make.
>
> I do see Mikulas' use case as interesting, I just don't know whether it's
> worth changing slab/slub to support it.  At first blush, other than the
> sheer size of the allocations, it's a good fit.

Well there are numerous page pool approaches already in the kernel. Maybe
there is a better fit with those?

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 17:49                     ` Matthew Wilcox
@ 2018-03-21 18:23                       ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 18:23 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Christopher Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer



On Wed, 21 Mar 2018, Matthew Wilcox wrote:

> On Wed, Mar 21, 2018 at 12:39:33PM -0500, Christopher Lameter wrote:
> > One other thought: If you want to improve the behavior for large scale
> > objects allocated through kmalloc/kmemcache then we would certainly be
> > glad to entertain those ideas.
> > 
> > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > allocate powers of two pages. It would be relatively easy to make
> > kmalloc_large round the allocation to the next page size and then allocate
> > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > pages or some such thing.

alloc_pages_exact() has O(n*log n) complexity with respect to the number 
of requested pages. It would have to be reworked and optimized if it were 
to be used for the dm-bufio cache. (it could be optimized down to O(log n) 
if it didn't split the compound page into a lot of separate pages, but split 
it into power-of-two clusters instead).
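
(To put numbers on it: with 4KB pages a 640KB buffer is 160 pages.
alloc_pages_exact() takes an order-8 block of 256 pages, splits it and then
frees the 96 unused tail pages one by one, whereas freeing that tail as
power-of-two clusters - 64 + 32 pages - would need only two operations.)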

> I don't know if that's a good idea.  That will contribute to fragmentation
> if the allocation is held onto for a short-to-medium length of time.
> If the allocation is for a very long period of time then those pages
> would have been unavailable anyway, but if the user of the tail pages
> holds them beyond the lifetime of the large allocation, then this is
> probably a bad tradeoff to make.

The problem with alloc_pages_exact() is that it exhausts all the 
high-order pages and leaves many free low-order pages around. So you'll 
end up with a system that has a lot of free memory, but with all high-order 
pages missing. As there would be a lot of free memory, the kswapd thread 
would not be woken up to free some high-order pages.

I think that using slab with high order is better, because it at least 
doesn't leave many low-order pages behind.

> I do see Mikulas' use case as interesting, I just don't know whether it's
> worth changing slab/slub to support it.  At first blush, other than the
> sheer size of the allocations, it's a good fit.

All I need is to increase the order of a specific slab cache - I think 
it's better to implement an interface that allows doing it than to 
duplicate the slab cache code.

BTW. it could be possible to open the file 
"/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write 
the requested value there, but it seems very dirty. It would be better to 
have a kernel interface for that.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 17:30                 ` Christopher Lameter
@ 2018-03-21 18:36                   ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 18:36 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer



On Wed, 21 Mar 2018, Christopher Lameter wrote:

> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> 
> > > You should not be using the slab allocators for these. Allocate higher
> > > order pages or numbers of consecutive smaller pages from the page
> > > allocator. The slab allocators are written for objects smaller than page
> > > size.
> >
> > So, do you argue that I need to write my own slab cache functionality
> > instead of using the existing slab code?
> 
> Just use the existing page allocator calls to allocate and free the
> memory you need.
> 
> > I can do it - but duplicating code is a bad thing.
> 
> There is no need to duplicate anything. There is lots of infrastructure
> already in the kernel. You just need to use the right allocation / freeing
> calls.

So, what would you recommend for allocating 640KB objects while minimizing 
wasted space?
* alloc_pages - rounds up to the next power of two
* kmalloc - rounds up to the next power of two
* alloc_pages_exact - O(n*log n) complexity; and causes memory 
  fragmentation if used excessively
* vmalloc - horrible performance (modifies page tables and that causes 
  synchronization across all CPUs)

anything else?

The slab cache with a large order seems like the best choice for this.
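
A sketch of the intended use, assuming this patch is applied (the cache name
and the gfp flags are just examples of a caller that handles allocation
failure itself; error handling is trimmed):

	struct kmem_cache *c;
	void *buf;

	c = kmem_cache_create("dm_bufio-640k", 640 << 10, 0,
			      SLAB_MINIMIZE_WASTE, NULL);
	if (!c)
		return -ENOMEM;

	buf = kmem_cache_alloc(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
	if (!buf) {
		/* the higher-order allocation failed - fall back to
		   reusing an already allocated buffer */
	}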

> > > What kind of problem could be caused here?
> >
> > Unlocked accesses are generally considered bad. For example, see this
> > piece of code in calculate_sizes:
> >         s->allocflags = 0;
> >         if (order)
> >                 s->allocflags |= __GFP_COMP;
> >
> >         if (s->flags & SLAB_CACHE_DMA)
> >                 s->allocflags |= GFP_DMA;
> >
> >         if (s->flags & SLAB_RECLAIM_ACCOUNT)
> >                 s->allocflags |= __GFP_RECLAIMABLE;
> >
> > If you are running this while the cache is in use (i.e. when the user
> > writes /sys/kernel/slab/<cache>/order), then other processes will see
> > invalid s->allocflags for a short time.
> 
> Calculating sizes is done when the slab has only a single accessor. Thus
> no locking is needed.

The calculation is done whenever someone writes to 
"/sys/kernel/slab/*/order"

And you can obviously write to that file while the slab cache is in use. Try 
it.

So, the function calculate_sizes can actually race with allocation from 
the slab cache.

> Changing the size of objects in a slab cache when there is already a set
> of objects allocated and under management by the slab cache would
> cause the allocator to fail and lead to garbled data.

I am not talking about changing the size of objects in a slab cache. I am 
talking about changing the allocation order of a slab cache while the 
cache is in use. This can be done with the sysfs interface.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 18:23                       ` Mikulas Patocka
@ 2018-03-21 18:40                         ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:40 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Wed, 21 Mar 2018, Mikulas Patocka wrote:

> > > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > > allocate powers of two pages. It would be relatively easy to make
> > > kmalloc_large round the allocation to the next page size and then allocate
> > > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > > pages or some such thing.
>
> alloc_pages_exact() has O(n*log n) complexity with respect to the number
> of requested pages. It would have to be reworked and optimized if it were
> to be used for the dm-bufio cache. (it could be optimized down to O(log n)
> if it didn't split the compound page to a lot of separate pages, but split
> it to a power-of-two clusters instead).

Well then a memory pool of page allocator requests may address that issue?

Have a look at include/linux/mempool.h.
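
For example (illustrative values only - 16 reserved elements, each an order-8
block, i.e. 256 pages or 1MB with 4KB pages):

	#include <linux/mempool.h>

	static mempool_t *pool;

	static int setup_pool(void)
	{
		pool = mempool_create_page_pool(16, 8);
		return pool ? 0 : -ENOMEM;
	}

	static struct page *get_buffer_pages(void)
	{
		/* falls back to the reserved elements when the page
		   allocator cannot satisfy the request */
		return mempool_alloc(pool, GFP_NOIO);
	}

	static void put_buffer_pages(struct page *p)
	{
		mempool_free(p, pool);
	}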

> > I don't know if that's a good idea.  That will contribute to fragmentation
> > if the allocation is held onto for a short-to-medium length of time.
> > If the allocation is for a very long period of time then those pages
> > would have been unavailable anyway, but if the user of the tail pages
> > holds them beyond the lifetime of the large allocation, then this is
> > probably a bad tradeoff to make.

Fragmentation is sadly a big issue. You could create a mempool on bootup
or early after boot to ensure that you have a sufficient number of
contiguous pages available.

> The problem with alloc_pages_exact() is that it exhausts all the
> high-order pages and leaves many free low-order pages around. So you'll
> end up in a system with a lot of free memory, but with all high-order
> pages missing. As there would be a lot of free memory, the kswapd thread
> would not be woken up to free some high-order pages.

I think that logic is properly balanced and will take into account pages
that have been removed from the LRU expiration logic.

> I think that using slab with high order is better, because it at least
> doesn't leave many low-order pages behind.

Any request to the slab via kmalloc with a size > 2x page size will simply
lead to a page allocator request. You have the same issue. If you want to
rely on the slab allocator buffering large segments for you then a mempool
will also solve the issue for you and you have more control over the pool.

> BTW. it could be possible to open the file
> "/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
> the requested value there, but it seems very dirty. It would be better to
> have a kernel interface for that.

Hehehe you could directly write to the kmem_cache structure and increase
the order. AFAICT this would be dirty but work.

But still the increased page order will get you into trouble with
fragmentation when the system runs for a long time. That is the reason we
try to limit the allocation sizes coming from the slab allocator.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 18:40                         ` Christopher Lameter
@ 2018-03-21 18:55                           ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 18:55 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer



On Wed, 21 Mar 2018, Christopher Lameter wrote:

> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> 
> > > > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > > > allocate powers of two pages. It would be relatively easy to make
> > > > kmalloc_large round the allocation to the next page size and then allocate
> > > > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > > > pages or some such thing.
> >
> > alloc_pages_exact() has O(n*log n) complexity with respect to the number
> > of requested pages. It would have to be reworked and optimized if it were
> > to be used for the dm-bufio cache. (it could be optimized down to O(log n)
> > if it didn't split the compound page to a lot of separate pages, but split
> > it to a power-of-two clusters instead).
> 
> Well then a memory pool of page allocator requests may address that issue?
> 
> Have a look at include/linux/mempool.h.

I know the mempool interface. A mempool can keep some number of reserved 
objects available for when system memory is exhausted.

Mempool doesn't deal with object allocation at all - mempool needs to be 
hooked to an existing object allocator (slab cache, kmalloc, alloc_pages, 
or some custom allocator provided with the methods mempool_alloc_t and 
mempool_free_t).
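
(A minimal sketch of that hooking - the function names here are made up, and
the backing allocator is still the one that decides how the memory is
obtained, so the kmalloc() below would still round 640KB up to 1MB:)

	static void *backing_alloc(gfp_t gfp_mask, void *pool_data)
	{
		return kmalloc((size_t)pool_data, gfp_mask);
	}

	static void backing_free(void *element, void *pool_data)
	{
		kfree(element);
	}

	static mempool_t *pool;

	static int setup(void)
	{
		pool = mempool_create(4, backing_alloc, backing_free,
				      (void *)(unsigned long)(640 << 10));
		return pool ? 0 : -ENOMEM;
	}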

> > > I don't know if that's a good idea.  That will contribute to fragmentation
> > > if the allocation is held onto for a short-to-medium length of time.
> > > If the allocation is for a very long period of time then those pages
> > > would have been unavailable anyway, but if the user of the tail pages
> > > holds them beyond the lifetime of the large allocation, then this is
> > > probably a bad tradeoff to make.
> 
> Fragmentation is sadly a big issue. You could create a mempool on bootup
> or early after boot to ensure that you have a sufficient number of
> contiguous pages available.

The dm-bufio driver deals correctly with this - it preallocates several 
buffers with vmalloc when the dm-bufio cache is created. During operation, 
if a high-order allocation fails, the dm-bufio subsystem throws away some 
existing buffer and reuses the already allocated chunk of memory for the 
buffer that needs to be created.

So, fragmentation is not an issue with this use case. dm-bufio can make 
forward progress even if memory is totally exhausted.
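
(Schematically - a simplified sketch of that pattern, not the actual dm-bufio
code; reclaim_existing_buffer() is a made-up name and the gfp flags are just
an example of an allocation that is allowed to fail quietly:)

	buf = kmem_cache_alloc(cache, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
	if (!buf) {
		/* the high-order allocation failed: evict one of the
		   buffers allocated earlier and reuse its memory */
		buf = reclaim_existing_buffer();
	}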

> > The problem with alloc_pages_exact() is that it exhausts all the
> > high-order pages and leaves many free low-order pages around. So you'll
> > end up in a system with a lot of free memory, but with all high-order
> > pages missing. As there would be a lot of free memory, the kswapd thread
> > would not be woken up to free some high-order pages.
> 
> I think that logic is properly balanced and will take into account pages
> that have been removed from the LRU expiration logic.
> 
> > I think that using slab with high order is better, because it at least
> > doesn't leave many low-order pages behind.
> 
> Any request to the slab via kmalloc with a size > 2x page size will simply
> lead to a page allocator request. You have the same issue. If you want to
> rely on the slab allocator buffering large segments for you then a mempool
> will also solve the issue for you and you have more control over the pool.

mempool solves nothing because it needs a backing allocator. And the 
question is what this backing allocator should be.

> > BTW. it could be possible to open the file
> > "/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
> > the requested value there, but it seems very dirty. It would be better to
> > have a kernel interface for that.
> 
> Hehehe you could directly write to the kmem_cache structure and increase
> the order. AFAICT this would be dirty but work.
> 
> But still the increased page order will get you into trouble with
> fragmentation when the system runs for a long time. That is the reason we
> try to limit the allocation sizes coming from the slab allocator.

It won't - see above - if the high-order allocation fails, dm-bufio just 
reuses some existing buffer.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 18:40                         ` Christopher Lameter
@ 2018-03-21 18:55                           ` Matthew Wilcox
  -1 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-21 18:55 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mikulas Patocka, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Wed, Mar 21, 2018 at 01:40:31PM -0500, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> 
> > > > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > > > allocate powers of two pages. It would be relatively easy to make
> > > > kmalloc_large round the allocation to the next page size and then allocate
> > > > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > > > pages or some such thing.
> >
> > alloc_pages_exact() has O(n*log n) complexity with respect to the number
> > of requested pages. It would have to be reworked and optimized if it were
> > to be used for the dm-bufio cache. (it could be optimized down to O(log n)
> > if it didn't split the compound page to a lot of separate pages, but split
> > it to a power-of-two clusters instead).
> 
> Well then a memory pool of page allocator requests may address that issue?
> 
> Have a look at include/linux/mempool.h.

That's not what mempool is for.  mempool is a cache of elements that were
allocated from slab in the first place.  (OK, technically, you don't have
to use slab as the allocator, but since there is no allocator that solves
this problem, mempool doesn't solve the problem either!)

> > BTW. it could be possible to open the file
> > "/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
> > the requested value there, but it seems very dirty. It would be better to
> > have a kernel interface for that.
> 
> Hehehe you could directly write to the kmem_cache structure and increase
> the order. AFAICT this would be dirty but work.
> 
> But still the increased page order will get you into trouble with
> fragmentation when the system runs for a long time. That is the reason we
> try to limit the allocation sizes coming from the slab allocator.

Right; he has a fallback already (vmalloc).  So ... let's just add the
interface to allow slab caches to have their order tuned by users who
really know what they're doing?
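
(One possible shape for such an interface - purely hypothetical, neither the
function nor the dm_bufio_cache placeholder below exists today:)

	/* raise the slab order of a cache whose users can tolerate
	   higher-order allocation failures */
	int kmem_cache_set_order(struct kmem_cache *s, unsigned int order);

	/* in the driver: */
	kmem_cache_set_order(dm_bufio_cache, 8);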

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 18:36                   ` Mikulas Patocka
@ 2018-03-21 18:57                     ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:57 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Wed, 21 Mar 2018, Mikulas Patocka wrote:

> So, what would you recommend for allocating 640KB objects while minimizing
> wasted space?
> * alloc_pages - rounds up to the next power of two
> * kmalloc - rounds up to the next power of two
> * alloc_pages_exact - O(n*log n) complexity; and causes memory
>   fragmentation if used excessively
> * vmalloc - horrible performance (modifies page tables and that causes
>   synchronization across all CPUs)
>
> anything else?

Need to find it but there is a way to allocate N pages in sequence
somewhere. Otherwise mempools are something that would work.

> > > > What kind of problem could be caused here?
> > >
> > > Unlocked accesses are generally considered bad. For example, see this
> > > piece of code in calculate_sizes:
> > >         s->allocflags = 0;
> > >         if (order)
> > >                 s->allocflags |= __GFP_COMP;
> > >
> > >         if (s->flags & SLAB_CACHE_DMA)
> > >                 s->allocflags |= GFP_DMA;
> > >
> > >         if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > >                 s->allocflags |= __GFP_RECLAIMABLE;
> > >
> > > If you are running this while the cache is in use (i.e. when the user
> > > writes /sys/kernel/slab/<cache>/order), then other processes will see
> > > invalid s->allocflags for a short time.
> >
> > Calculating sizes is done when the slab has only a single accessor. Thus
> > no locking is needed.
>
> The calculation is done whenever someone writes to
> "/sys/kernel/slab/*/order"

But the flags you are mentioning do not change and the size of the object
does not change. What changes is the number of objects in the slab page.

> And you can obviously write to that file while the slab cache is in use. Try
> it.

You cannot change flags that would impact the size of the objects. I only
allowed changing characteristics that do not impact object size.

> I am not talking about changing the size of objects in a slab cache. I am
> talking about changing the allocation order of a slab cache while the
> cache is in use. This can be done with the sysfs interface.

But then that is something that is allowed but does not affect the object
size used by the slab allocators.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 18:55                           ` Matthew Wilcox
@ 2018-03-21 18:58                             ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:58 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Mikulas Patocka, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Wed, 21 Mar 2018, Matthew Wilcox wrote:

> > Have a look at include/linux/mempool.h.
>
> That's not what mempool is for.  mempool is a cache of elements that were
> allocated from slab in the first place.  (OK, technically, you don't have
> to use slab as the allocator, but since there is no allocator that solves
> this problem, mempool doesn't solve the problem either!)

You can put the page allocator in there instead of a slab allocator.

> > But still the increased page order will get you into trouble with
> > fragmentation when the system runs for a long time. That is the reason we
> > try to limit the allocation sizes coming from the slab allocator.
>
> Right; he has a fallback already (vmalloc).  So ... let's just add the
> interface to allow slab caches to have their order tuned by users who
> really know what they're doing?

Ok, that's trivial.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 18:57                     ` Christopher Lameter
@ 2018-03-21 19:19                       ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 19:19 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer



On Wed, 21 Mar 2018, Christopher Lameter wrote:

> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> 
> > So, what would you recommend for allocating 640KB objects while minimizing
> > wasted space?
> > * alloc_pages - rounds up to the next power of two
> > * kmalloc - rounds up to the next power of two
> > * alloc_pages_exact - O(n*log n) complexity; and causes memory
> >   fragmentation if used excessively
> > * vmalloc - horrible performance (modifies page tables and that causes
> >   synchronization across all CPUs)
> >
> > anything else?
> 
> Need to find it but there is a way to allocate N pages in sequence
> somewhere. Otherwise mempools are something that would work.

There's also the contiguous memory allocator (CMA), but it needs its memory 
to be reserved at boot time. It is intended for misdesigned hardware devices 
that need physically contiguous memory for DMA. As it's intended for one-time 
allocations when loading drivers, it lacks the performance and scalability 
of the slab cache and alloc_pages.

> > > > > What kind of problem could be caused here?
> > > >
> > > > Unlocked accesses are generally considered bad. For example, see this
> > > > piece of code in calculate_sizes:
> > > >         s->allocflags = 0;
> > > >         if (order)
> > > >                 s->allocflags |= __GFP_COMP;
> > > >
> > > >         if (s->flags & SLAB_CACHE_DMA)
> > > >                 s->allocflags |= GFP_DMA;
> > > >
> > > >         if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > > >                 s->allocflags |= __GFP_RECLAIMABLE;
> > > >
> > > > If you are running this while the cache is in use (i.e. when the user
> > > > writes /sys/kernel/slab/<cache>/order), then other processes will see
> > > > invalid s->allocflags for a short time.
> > >
> > > Calculating sizes is done when the slab has only a single accessor. Thus
> > > no locking is needed.
> >
> > The calculation is done whenever someone writes to
> > "/sys/kernel/slab/*/order"
> 
> But the flags you are mentioning do not change and the size of the object
> does not change. What changes is the number of objects in the slab page.

See this code again:
> > >         s->allocflags = 0;
> > >         if (order)
> > >                 s->allocflags |= __GFP_COMP;
> > >
> > >         if (s->flags & SLAB_CACHE_DMA)
> > >                 s->allocflags |= GFP_DMA;
> > >
> > >         if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > >                 s->allocflags |= __GFP_RECLAIMABLE;
when this function is called, the value s->allocflags does change. At the 
end, s->allocflags holds the same value as before, but it changes 
temporarily.

For example, if someone creates a slab cache with the flag SLAB_CACHE_DMA 
and an allocation from this cache races with the user writing to 
/sys/kernel/slab/cache/order, then the allocator can, for a small period of 
time, see "s->allocflags == 0" and allocate a non-DMA page. That is a bug.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 17:39                   ` Christopher Lameter
@ 2018-03-21 19:25                     ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 19:25 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer



On Wed, 21 Mar 2018, Christopher Lameter wrote:

> One other thought: If you want to improve the behavior for large scale
> objects allocated through kmalloc/kmemcache then we would certainly be
> glad to entertain those ideas.
> 
> F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> allocate powers of two pages. It would be relatively easy to make
> kmalloc_large round the allocation to the next page size and then allocate
> N consecutive pages via alloc_pages_exact() and free the remainder unused
> pages or some such thing.

It may be possible, but we'd need to improve the horrible complexity of 
alloc_pages_exact().

This is a trade-off between performance and waste. A power-of-two 
allocation can be done quickly, but it wastes a lot of space. 
alloc_pages_exact() wastes less space, but it is slow.
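
(With the 640KB buffers discussed here, for instance, a power-of-two
allocation returns 1MB and wastes the remaining 384KB, i.e. 37.5% of the
allocation, per buffer.)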

The question is - how many of these large-kmalloc allocations are 
short-lived and how many are long-lived? I don't know, I haven't measured 
it.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 19:19                       ` Mikulas Patocka
@ 2018-03-21 20:09                         ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 20:09 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Wed, 21 Mar 2018, Mikulas Patocka wrote:

> For example, if someone creates a slab cache with the flag SLAB_CACHE_DMA,
> and he allocates an object from this cache and this allocation races with
> the user writing to /sys/kernel/slab/cache/order - then the allocator can
> for a small period of time see "s->allocflags == 0" and allocate a non-DMA
> page. That is a bug.

True we need to fix that:

Subject: Avoid potentially visible allocflags without all flags set

During slab size recalculation, s->allocflags may temporarily be set
to 0, so a concurrent allocation may observe incomplete flags and end
up passing the wrong GFP flags to the page allocator. Slab size
calculation happens in two cases:

1. When a slab is created (which is safe since we cannot have
   concurrent allocations)

2. When the slab order is changed via /sysfs.

Signed-off-by: Christoph Lameter <cl@linux.com>


Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -3457,6 +3457,7 @@ static void set_cpu_partial(struct kmem_
 static int calculate_sizes(struct kmem_cache *s, int forced_order)
 {
 	slab_flags_t flags = s->flags;
+	gfp_t allocflags;
 	size_t size = s->object_size;
 	int order;

@@ -3551,16 +3552,17 @@ static int calculate_sizes(struct kmem_c
 	if (order < 0)
 		return 0;

-	s->allocflags = 0;
+	allocflags = 0;
 	if (order)
-		s->allocflags |= __GFP_COMP;
+		allocflags |= __GFP_COMP;

 	if (s->flags & SLAB_CACHE_DMA)
-		s->allocflags |= GFP_DMA;
+		allocflags |= GFP_DMA;

 	if (s->flags & SLAB_RECLAIM_ACCOUNT)
-		s->allocflags |= __GFP_RECLAIMABLE;
+		allocflags |= __GFP_RECLAIMABLE;

+	s->allocflags = allocflags;
 	/*
 	 * Determine the number of objects per slab
 	 */

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 20:09                         ` Christopher Lameter
@ 2018-03-21 20:37                           ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 20:37 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer



On Wed, 21 Mar 2018, Christopher Lameter wrote:

> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> 
> > For example, if someone creates a slab cache with the flag SLAB_CACHE_DMA,
> > and he allocates an object from this cache and this allocation races with
> > the user writing to /sys/kernel/slab/cache/order - then the allocator can
> > for a small period of time see "s->allocflags == 0" and allocate a non-DMA
> > page. That is a bug.
> 
> True we need to fix that:
> 
> Subject: Avoid potentially visible allocflags without all flags set
> 
> During slab size recalculation, s->allocflags may temporarily be set
> to 0, so a concurrent allocation may observe incomplete flags and end
> up passing the wrong GFP flags to the page allocator. Slab size
> calculation happens in two cases:
> 
> 1. When a slab is created (which is safe since we cannot have
>    concurrent allocations)
> 
> 2. When the slab order is changed via /sysfs.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> 
> Index: linux/mm/slub.c
> ===================================================================
> --- linux.orig/mm/slub.c
> +++ linux/mm/slub.c
> @@ -3457,6 +3457,7 @@ static void set_cpu_partial(struct kmem_
>  static int calculate_sizes(struct kmem_cache *s, int forced_order)
>  {
>  	slab_flags_t flags = s->flags;
> +	gfp_t allocflags;
>  	size_t size = s->object_size;
>  	int order;
> 
> @@ -3551,16 +3552,17 @@ static int calculate_sizes(struct kmem_c
>  	if (order < 0)
>  		return 0;
> 
> -	s->allocflags = 0;
> +	allocflags = 0;
>  	if (order)
> -		s->allocflags |= __GFP_COMP;
> +		allocflags |= __GFP_COMP;
> 
>  	if (s->flags & SLAB_CACHE_DMA)
> -		s->allocflags |= GFP_DMA;
> +		allocflags |= GFP_DMA;
> 
>  	if (s->flags & SLAB_RECLAIM_ACCOUNT)
> -		s->allocflags |= __GFP_RECLAIMABLE;
> +		allocflags |= __GFP_RECLAIMABLE;
> 
> +	s->allocflags = allocflags;

I'd also use "WRITE_ONCE(s->allocflags, allocflags)" here and when writing 
s->oo and s->min to avoid some possible compiler misoptimizations.

WRITE_ONCE should be used when writing a value that can be read 
simultaneously (but a lot of kernel code misses it).
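
A minimal sketch of the intended pairing (the reader line is simplified; 
only WRITE_ONCE/READ_ONCE themselves are the real kernel macros):

        /* writer - sysfs order_store() -> calculate_sizes() */
        WRITE_ONCE(s->allocflags, allocflags);

        /* reader - allocation path */
        gfp_t alloc_gfp = flags | READ_ONCE(s->allocflags);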



Another problem is that it updates s->oo and later it updates s->max:
        s->oo = oo_make(order, size, s->reserved);
        s->min = oo_make(get_order(size), size, s->reserved);
        if (oo_objects(s->oo) > oo_objects(s->max))
                s->max = s->oo;
--- so, the concurrently running code could see s->oo > s->max, which 
could trigger some memory corruption.

s->max is only used in memory allocations - 
kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long)), so 
perhaps we could fix the bug by removing s->max at all and always 
allocating enough memory for the maximum possible number of objects?

- kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL);
+ kmalloc(BITS_TO_LONGS(MAX_OBJS_PER_PAGE) * sizeof(unsigned long), GFP_KERNEL);

Mikulas

>  	/*
>  	 * Determine the number of objects per slab
>  	 */
> 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 20:37                           ` Mikulas Patocka
@ 2018-03-23 15:10                             ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-23 15:10 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Wed, 21 Mar 2018, Mikulas Patocka wrote:

> > +	s->allocflags = allocflags;
>
> I'd also use "WRITE_ONCE(s->allocflags, allocflags)" here and when writing
> s->oo and s->min to avoid some possible compiler misoptimizations.

It only matters that 0 etc is never written.

> Another problem is that it updates s->oo and later it updates s->max:
>         s->oo = oo_make(order, size, s->reserved);
>         s->min = oo_make(get_order(size), size, s->reserved);
>         if (oo_objects(s->oo) > oo_objects(s->max))
>                 s->max = s->oo;
> --- so, the concurrently running code could see s->oo > s->max, which
> could trigger some memory corruption.

Well s->max is only relevant for code that analyses the details of slab
structures for diagnostics.

> s->max is only used in memory allocations -
> kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long)), so
> perhaps we could fix the bug by removing s->max at all and always
> allocating enough memory for the maximum possible number of objects?
>
> - kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL);
> + kmalloc(BITS_TO_LONGS(MAX_OBJS_PER_PAGE) * sizeof(unsigned long), GFP_KERNEL);

MAX_OBJS_PER_PAGE is 32k. So you are looking at contiguous allocations of
256kbyte. Not good.

The simplest measure would be to disallow the changing of the order while
the slab contains objects.


Subject: slub: Disallow order changes when objects exist in a slab

There seem to be a couple of races that would have to be
addressed if the slab order were changed during active use.

Let's disallow this in the same way as we also do not allow
other changes of slab characteristics when objects are active.

Signed-off-by: Christoph Lameter <cl@linux.com>

Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -4919,6 +4919,9 @@ static ssize_t order_store(struct kmem_c
 	unsigned long order;
 	int err;

+	if (any_slab_objects(s))
+		return -EBUSY;
+
 	err = kstrtoul(buf, 10, &order);
 	if (err)
 		return err;

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-23 15:10                             ` Christopher Lameter
@ 2018-03-23 15:31                               ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-23 15:31 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer



On Fri, 23 Mar 2018, Christopher Lameter wrote:

> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> 
> > > +	s->allocflags = allocflags;
> >
> > I'd also use "WRITE_ONCE(s->allocflags, allocflags)" here and when writing
> > s->oo and s->min to avoid some possible compiler misoptimizations.
> 
> It only matters that 0 etc is never written.

The C11 standard says that reads and writes of the same variable shouldn't 
race (even if you write the same value as before), and consequently, the 
compiler can make transformations based on this assumption. For example, 
a compiler optimization may transform the following code

>       allocflags = 0;
>       if (order)
>               allocflags |= __GFP_COMP;
>       if (s->flags & SLAB_CACHE_DMA)
>               allocflags |= GFP_DMA;
>       if (s->flags & SLAB_RECLAIM_ACCOUNT)
>               allocflags |= __GFP_RECLAIMABLE;
>       s->allocflags = allocflags;

back into:
>	s->allocflags = 0;
>	if (order)
>		s->allocflags |= __GFP_COMP;
>	if (s->flags & SLAB_CACHE_DMA)
>		s->allocflags |= GFP_DMA;
>	if (s->flags & SLAB_RECLAIM_ACCOUNT)
>		s->allocflags |= __GFP_RECLAIMABLE;

AFAIK, gcc currently doesn't do this transformation, but it's better to 
write standard-compliant code and use the macro WRITE_ONCE for variables 
that may be concurrently read and written.

> > Another problem is that it updates s->oo and later it updates s->max:
> >         s->oo = oo_make(order, size, s->reserved);
> >         s->min = oo_make(get_order(size), size, s->reserved);
> >         if (oo_objects(s->oo) > oo_objects(s->max))
> >                 s->max = s->oo;
> > --- so, the concurrently running code could see s->oo > s->max, which
> > could trigger some memory corruption.
> 
> Well s->max is only relevant for code that analyses the details of slab
> structures for diagnostics.
> 
> > s->max is only used in memory allocations -
> > kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long)), so
> > perhaps we could fix the bug by removing s->max at all and always
> > allocating enough memory for the maximum possible number of objects?
> >
> > - kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL);
> > + kmalloc(BITS_TO_LONGS(MAX_OBJS_PER_PAGE) * sizeof(unsigned long), GFP_KERNEL);
> 
> MAX_OBJS_PER_PAGE is 32k. So you are looking at contiguous allocations of
> 256kbyte. Not good.

I think it's one bit per object, so the total size of the allocation is 
4k. Allocating 4k shouldn't be a problem.
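
Worked out (assuming a 64-bit kernel and MAX_OBJS_PER_PAGE == 32767, 
which is what the "32k" above refers to):

        BITS_TO_LONGS(32767) = 512 longs
        512 * sizeof(unsigned long) = 512 * 8 = 4096 bytes

so the bitmap is a single 4KB allocation - one bit per object, not one 
byte per object.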

> The simplest measure would be to disallow the changing of the order while
> the slab contains objects.
> 
> 
> Subject: slub: Disallow order changes when objects exist in a slab
> 
> There seem to be a couple of races that would have to be
> addressed if the slab order were changed during active use.
> 
> Let's disallow this in the same way as we also do not allow
> other changes of slab characteristics when objects are active.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> Index: linux/mm/slub.c
> ===================================================================
> --- linux.orig/mm/slub.c
> +++ linux/mm/slub.c
> @@ -4919,6 +4919,9 @@ static ssize_t order_store(struct kmem_c
>  	unsigned long order;
>  	int err;
> 
> +	if (any_slab_objects(s))
> +		return -EBUSY;
> +
>  	err = kstrtoul(buf, 10, &order);
>  	if (err)
>  		return err;

This test isn't locked against anything, so it may race with concurrent 
allocation. "any_slab_objects" may return false and a new object in the 
slab cache may appear immediately after that.
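
Roughly the window in question (an illustrative interleaving, not actual 
code):

        CPU0 (order_store)              CPU1
        any_slab_objects(s) == false
                                        kmem_cache_alloc(s, ...)  /* first object */
        calculate_sizes(s, order)       /* that object now coexists with
                                           half-updated s->oo / s->allocflags */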

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-23 15:31                               ` Mikulas Patocka
@ 2018-03-23 15:48                                 ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-23 15:48 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On Fri, 23 Mar 2018, Mikulas Patocka wrote:

> This test isn't locked against anything, so it may race with concurrent
> allocation. "any_slab_objects" may return false and a new object in the
> slab cache may appear immediately after that.

OK, the same reasoning applies to numerous other slab configuration
settings in /sys/kernel/slab.... So we need to disable all of that or come
up with a sane way of synchronizing them.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-03-21 18:36                   ` Mikulas Patocka
@ 2018-04-13  9:22                     ` Vlastimil Babka
  -1 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-13  9:22 UTC (permalink / raw)
  To: Mikulas Patocka, Christopher Lameter
  Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-mm, dm-devel, Mike Snitzer

On 03/21/2018 07:36 PM, Mikulas Patocka wrote:
> 
> 
> On Wed, 21 Mar 2018, Christopher Lameter wrote:
> 
>> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>>
>>>> You should not be using the slab allocators for these. Allocate higher
>>>> order pages or numbers of consecutive smaller pagess from the page
>>>> allocator. The slab allocators are written for objects smaller than page
>>>> size.
>>>
>>> So, do you argue that I need to write my own slab cache functionality
>>> instead of using the existing slab code?
>>
>> Just use the existing page allocator calls to allocate and free the
>> memory you need.
>>
>>> I can do it - but duplicating code is bad thing.
>>
>> There is no need to duplicate anything. There is lots of infrastructure
>> already in the kernel. You just need to use the right allocation / freeing
>> calls.
> 
> So, what would you recommend for allocating 640KB objects while minimizing 
> wasted space?
> * alloc_pages - rounds up to the next power of two
> * kmalloc - rounds up to the next power of two
> * alloc_pages_exact - O(n*log n) complexity, and causes memory
>   fragmentation if used excessively
> * vmalloc - horrible performance (modifies page tables and that causes 
>   synchronization across all CPUs)
> 
> anything else?
> 
> The slab cache with a large order seems like the best choice for this.

Sorry for being late, I just read this thread and tend to agree with
Mikulas, that this is a good use case for SL*B. If we extend the
use-case from "space-efficient allocator of objects smaller than page
size" to "space-efficient allocator of objects that are not power-of-two
pages" then IMHO it turns out the implementation would be almost the
same. All other variants listed above would lead to waste of memory or
fragmentation.

Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
attending, or anyone else that can vouch for your usecase?

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-13  9:22                     ` Vlastimil Babka
@ 2018-04-13 15:10                       ` Mike Snitzer
  -1 siblings, 0 replies; 109+ messages in thread
From: Mike Snitzer @ 2018-04-13 15:10 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Mikulas Patocka, Christopher Lameter, Matthew Wilcox,
	Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
	Andrew Morton

On Fri, Apr 13 2018 at  5:22am -0400,
Vlastimil Babka <vbabka@suse.cz> wrote:

> On 03/21/2018 07:36 PM, Mikulas Patocka wrote:
> > 
> > 
> > On Wed, 21 Mar 2018, Christopher Lameter wrote:
> > 
> >> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> >>
> >>>> You should not be using the slab allocators for these. Allocate higher
> >>>> order pages or numbers of consecutive smaller pagess from the page
> >>>> allocator. The slab allocators are written for objects smaller than page
> >>>> size.
> >>>
> >>> So, do you argue that I need to write my own slab cache functionality
> >>> instead of using the existing slab code?
> >>
> >> Just use the existing page allocator calls to allocate and free the
> >> memory you need.
> >>
> >>> I can do it - but duplicating code is bad thing.
> >>
> >> There is no need to duplicate anything. There is lots of infrastructure
> >> already in the kernel. You just need to use the right allocation / freeing
> >> calls.
> > 
> > So, what would you recommend for allocating 640KB objects while minimizing 
> > wasted space?
> > * alloc_pages - rounds up to the next power of two
> > * kmalloc - rounds up to the next power of two
> > * alloc_pages_exact - O(n*log n) complexity, and causes memory
> >   fragmentation if used excessively
> > * vmalloc - horrible performance (modifies page tables and that causes 
> >   synchronization across all CPUs)
> > 
> > anything else?
> > 
> > The slab cache with a large order seems like the best choice for this.
> 
> Sorry for being late, I just read this thread and tend to agree with
> Mikulas, that this is a good use case for SL*B. If we extend the
> use-case from "space-efficient allocator of objects smaller than page
> size" to "space-efficient allocator of objects that are not power-of-two
> pages" then IMHO it turns out the implementation would be almost the
> same. All other variants listed above would lead to waste of memory or
> fragmentation.
> 
> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> attending, or anyone else that can vouch for your usecase?

Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.

Mikulas won't be at LSF/MM.  But I included Mikulas' dm-bufio changes
that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
the 4.17 merge window).

Mike

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-13 15:10                       ` Mike Snitzer
@ 2018-04-16 12:38                         ` Vlastimil Babka
  -1 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-16 12:38 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Mikulas Patocka, Christopher Lameter, Matthew Wilcox,
	Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
	Andrew Morton

On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> On Fri, Apr 13 2018 at  5:22am -0400,
> Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
>> attending, or anyone else that can vouch for your usecase?
> 
> Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> 
> Mikulas won't be at LSF/MM.  But I included Mikulas' dm-bufio changes
> that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> the 4.17 merge window).

Can you or Mikulas briefly summarize how the dependency is avoided, and
whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
dm-bufio code would happily switch to it, or not?

Thanks,
Vlastimil

> Mike
> 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 12:38                         ` Vlastimil Babka
@ 2018-04-16 14:27                           ` Mike Snitzer
  -1 siblings, 0 replies; 109+ messages in thread
From: Mike Snitzer @ 2018-04-16 14:27 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Mikulas Patocka, Christopher Lameter, Matthew Wilcox,
	Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
	Andrew Morton

On Mon, Apr 16 2018 at  8:38am -0400,
Vlastimil Babka <vbabka@suse.cz> wrote:

> On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > On Fri, Apr 13 2018 at  5:22am -0400,
> > Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> >> attending, or anyone else that can vouch for your usecase?
> > 
> > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > 
> > Mikulas won't be at LSF/MM.  But I included Mikulas' dm-bufio changes
> > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > the 4.17 merge window).
> 
> Can you or Mikulas briefly summarize how the dependency is avoided, and
> whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> dm-bufio code would happily switch to it, or not?

git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c

But the most significant commit relative to SLAB_MINIMIZE_WASTE is:
359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for dm_buffer structure allocations")

So no, I don't see why dm-bufio would need to switch to
SLAB_MINIMIZE_WASTE if it were introduced in the future.

Mike

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 14:27                           ` Mike Snitzer
@ 2018-04-16 14:37                             ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 14:37 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Vlastimil Babka, Christopher Lameter, Matthew Wilcox,
	Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
	Andrew Morton



On Mon, 16 Apr 2018, Mike Snitzer wrote:

> On Mon, Apr 16 2018 at  8:38am -0400,
> Vlastimil Babka <vbabka@suse.cz> wrote:
> 
> > On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > > On Fri, Apr 13 2018 at  5:22am -0400,
> > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > >>
> > >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> > >> attending, or anyone else that can vouch for your usecase?
> > > 
> > > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > > 
> > > Mikulas won't be at LSF/MM.  But I included Mikulas' dm-bufio changes
> > > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > > the 4.17 merge window).
> > 
> > Can you or Mikulas briefly summarize how the dependency is avoided, and
> > whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> > dm-bufio code would happily switch to it, or not?
> 
> git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
> 
> But the most significant commit relative to SLAB_MINIMIZE_WASTE is: 
> 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for 
> dm_buffer structure allocations")
> 
> So no, I don't see why dm-bufio would need to switch to
> SLAB_MINIMIZE_WASTE if it were introduced in the future.

Currently, the slab cache rounds up the size of the slab to the next power 
of two (if the size is large). And that wastes memory if that memory were 
to be used for deduplication tables.

Generally, the performance of the deduplication solution depends on how 
much data can you put to memory. If you round 640KB buffer to 1MB (this is 
what the slab and slub subsystem currently do), you waste a lot of memory. 
Deduplication indices with 640KB blocks are already used in the wild, so 
it can't be easily changed.
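
For a concrete sense of the numbers (the order-10 slab below is just one 
possible layout, not necessarily what the allocator would pick):

        power-of-two rounding:  each 640KB buffer occupies 1MB
                                -> 384KB (37.5%) wasted per buffer
        order-10 (4MB) slab:    holds 6 x 640KB = 3840KB
                                -> 256KB (6.25%) wasted per slab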

> Mike

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 14:37                             ` Mikulas Patocka
@ 2018-04-16 14:46                               ` Mike Snitzer
  -1 siblings, 0 replies; 109+ messages in thread
From: Mike Snitzer @ 2018-04-16 14:46 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Vlastimil Babka, Christopher Lameter, Matthew Wilcox,
	Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
	Andrew Morton

On Mon, Apr 16 2018 at 10:37am -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> 
> 
> On Mon, 16 Apr 2018, Mike Snitzer wrote:
> 
> > On Mon, Apr 16 2018 at  8:38am -0400,
> > Vlastimil Babka <vbabka@suse.cz> wrote:
> > 
> > > On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > > > On Fri, Apr 13 2018 at  5:22am -0400,
> > > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > > >>
> > > >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> > > >> attending, or anyone else that can vouch for your usecase?
> > > > 
> > > > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > > > 
> > > > Mikulas won't be at LSF/MM.  But I included Mikulas' dm-bufio changes
> > > > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > > > the 4.17 merge window).
> > > 
> > > Can you or Mikulas briefly summarize how the dependency is avoided, and
> > > whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> > > dm-bufio code would happily switch to it, or not?
> > 
> > git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
> > 
> > But the most significant commit relative to SLAB_MINIMIZE_WASTE is: 
> > 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for 
> > dm_buffer structure allocations")
> > 
> > So no, I don't see why dm-bufio would need to switch to
> > SLAB_MINIMIZE_WASTE if it were introduced in the future.
> 
> Currently, the slab cache rounds up the size of the slab to the next power 
> of two (if the size is large). And that wastes memory if that memory were 
> to be used for deduplication tables.

You mean on an overall size of the cache level?  Or on a per-object
level?  I can only imagine you mean the former.
 
> Generally, the performance of the deduplication solution depends on how 
> much data can you put to memory. If you round 640KB buffer to 1MB (this is 
> what the slab and slub subsystem currently do), you waste a lot of memory. 
> Deduplication indices with 640KB blocks are already used in the wild, so 
> it can't be easily changed.

OK, seems you're suggesting a single object is rounded up.. so then this
header is very wrong?:

commit 359dbf19ab524652a2208a2a2cddccec2eede2ad
Author: Mikulas Patocka <mpatocka@redhat.com>
Date:   Mon Mar 26 20:29:45 2018 +0200

    dm bufio: use slab cache for dm_buffer structure allocations

    kmalloc padded to the next power of two, using a slab cache avoids this.

    Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
    Signed-off-by: Mike Snitzer <snitzer@redhat.com>

Please clarify further, thanks!
Mike

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 14:46                               ` Mike Snitzer
@ 2018-04-16 14:57                                 ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 14:57 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Vlastimil Babka, Christopher Lameter, Matthew Wilcox,
	Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
	Andrew Morton



On Mon, 16 Apr 2018, Mike Snitzer wrote:

> On Mon, Apr 16 2018 at 10:37am -0400,
> Mikulas Patocka <mpatocka@redhat.com> wrote:
> 
> > 
> > 
> > On Mon, 16 Apr 2018, Mike Snitzer wrote:
> > 
> > > On Mon, Apr 16 2018 at  8:38am -0400,
> > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > > 
> > > > On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > > > > On Fri, Apr 13 2018 at  5:22am -0400,
> > > > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > > > >>
> > > > >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> > > > >> attending, or anyone else that can vouch for your usecase?
> > > > > 
> > > > > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > > > > 
> > > > > Mikulas won't be at LSF/MM.  But I included Mikulas' dm-bufio changes
> > > > > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > > > > the 4.17 merge window).
> > > > 
> > > > Can you or Mikulas briefly summarize how the dependency is avoided, and
> > > > whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> > > > dm-bufio code would happily switch to it, or not?
> > > 
> > > git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
> > > 
> > > But the most significant commit relative to SLAB_MINIMIZE_WASTE is: 
> > > 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for 
> > > dm_buffer structure allocations")
> > > 
> > > So no, I don't see why dm-bufio would need to switch to
> > > SLAB_MINIMIZE_WASTE if it were introduced in the future.
> > 
> > Currently, the slab cache rounds up the size of the slab to the next power 
> > of two (if the size is large). And that wastes memory if that memory were 
> > to be used for deduplication tables.
> 
> You mean on an overall size of the cache level?  Or on a per-object
> level?  I can only imagine you mean the former.

Unfortunately, it rounds up every object. So, if you have six 640KB 
objects, it consumes 6MB.

> > Generally, the performance of the deduplication solution depends on how 
> > much data can you put to memory. If you round 640KB buffer to 1MB (this is 
> > what the slab and slub subsystem currently do), you waste a lot of memory. 
> > Deduplication indices with 640KB blocks are already used in the wild, so 
> > it can't be easily changed.
> 
> OK, seems you're suggesting a single object is rounded up.. so then this
> header is very wrong?:
> 
> commit 359dbf19ab524652a2208a2a2cddccec2eede2ad
> Author: Mikulas Patocka <mpatocka@redhat.com>
> Date:   Mon Mar 26 20:29:45 2018 +0200
> 
>     dm bufio: use slab cache for dm_buffer structure allocations
> 
>     kmalloc padded to the next power of two, using a slab cache avoids this.
> 
>     Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
>     Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> 
> Please clarify further, thanks!
> Mike

Yes, using a slab cache currently doesn't avoid this rounding (it needs the 
SLAB_MINIMIZE_WASTE patch to do that).

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 14:57                                 ` Mikulas Patocka
@ 2018-04-16 15:18                                   ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-16 15:18 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton

On Mon, 16 Apr 2018, Mikulas Patocka wrote:

> > Please clarify further, thanks!
> > Mike
>
> Yes, using a slab cache currently doesn't avoid this rounding (it needs the
> SLAB_MINIMIZE_WASTE patch to do that).

Or an increase in slab_max_order

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 15:18                                   ` Christopher Lameter
@ 2018-04-16 15:25                                     ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 15:25 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton



On Mon, 16 Apr 2018, Christopher Lameter wrote:

> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> 
> > > Please clarify further, thanks!
> > > Mike
> >
> > Yes, using a slab cache currently doesn't avoid this rounding (it needs the
> > SLAB_MINIMIZE_WASTE patch to do that).
> 
> Or an increase in slab_max_order

But that will increase it for all slabs (often senselessly - i.e. 
kmalloc-4096 would have order 4MB).

I need to increase it just for dm-bufio slabs.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 15:25                                     ` Mikulas Patocka
@ 2018-04-16 15:45                                       ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-16 15:45 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton

On Mon, 16 Apr 2018, Mikulas Patocka wrote:

> >
> > Or an increase in slab_max_order
>
> But that will increase it for all slabs (often senselessly - i.e.
> kmalloc-4096 would have order 4MB).

4MB? Nope.... That is a power of two slab so no wasted space even with
order 0.

It's not a senseless increase. The more objects you fit into a slab page
the higher the performance of the allocator.


> I need to increase it just for dm-bufio slabs.

If you do this then others will want the same...

^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 14:46                               ` Mike Snitzer
  (?)
  (?)
@ 2018-04-16 19:32                               ` Mikulas Patocka
  2018-04-17 14:45                                 ` Christopher Lameter
  -1 siblings, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 19:32 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Vlastimil Babka, Christopher Lameter, Matthew Wilcox,
	Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
	Andrew Morton, linux-kernel

This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
flag causes allocation of larger slab caches in order to minimize wasted
space.

This is needed because we want to use dm-bufio for a deduplication index and
there are existing installations with non-power-of-two block sizes (such
as 640KB). The performance of the whole solution depends on efficient
memory use, so we must waste as little memory as possible.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-bufio.c |    2 +-
 include/linux/slab.h  |    7 +++++++
 mm/slab.c             |    4 ++--
 mm/slab.h             |    7 ++++---
 mm/slab_common.c      |    2 +-
 mm/slub.c             |   25 ++++++++++++++++++++-----
 6 files changed, 35 insertions(+), 12 deletions(-)

Index: linux-2.6/include/linux/slab.h
===================================================================
--- linux-2.6.orig/include/linux/slab.h	2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/include/linux/slab.h	2018-04-16 21:10:45.000000000 +0200
@@ -108,6 +108,13 @@
 #define SLAB_KASAN		0
 #endif
 
+/*
+ * Use higher order allocations to minimize wasted space.
+ * Note: the allocation is unreliable if this flag is used, the caller
+ * must handle allocation failures gracefully.
+ */
+#define SLAB_MINIMIZE_WASTE	((slab_flags_t __force)0x10000000U)
+
 /* The following flags affect the page allocator grouping pages by mobility */
 /* Objects are reclaimable */
 #define SLAB_RECLAIM_ACCOUNT	((slab_flags_t __force)0x00020000U)
Index: linux-2.6/mm/slab_common.c
===================================================================
--- linux-2.6.orig/mm/slab_common.c	2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/mm/slab_common.c	2018-04-16 21:10:45.000000000 +0200
@@ -53,7 +53,7 @@ static DECLARE_WORK(slab_caches_to_rcu_d
 		SLAB_FAILSLAB | SLAB_KASAN)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
-			 SLAB_ACCOUNT)
+			 SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
 
 /*
  * Merge control. If this is set then no merging of slab caches will occur.
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/mm/slub.c	2018-04-16 21:12:41.000000000 +0200
@@ -3249,7 +3249,7 @@ static inline unsigned int slab_order(un
 	return order;
 }
 
-static inline int calculate_order(unsigned int size, unsigned int reserved)
+static inline int calculate_order(unsigned int size, unsigned int reserved, slab_flags_t flags)
 {
 	unsigned int order;
 	unsigned int min_objects;
@@ -3277,7 +3277,7 @@ static inline int calculate_order(unsign
 			order = slab_order(size, min_objects,
 					slub_max_order, fraction, reserved);
 			if (order <= slub_max_order)
-				return order;
+				goto ret_order;
 			fraction /= 2;
 		}
 		min_objects--;
@@ -3289,15 +3289,30 @@ static inline int calculate_order(unsign
 	 */
 	order = slab_order(size, 1, slub_max_order, 1, reserved);
 	if (order <= slub_max_order)
-		return order;
+		goto ret_order;
 
 	/*
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
 	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
 	if (order < MAX_ORDER)
-		return order;
+		goto ret_order;
 	return -ENOSYS;
+
+ret_order:
+	if (flags & SLAB_MINIMIZE_WASTE) {
+		/* Increase the order if it decreases waste */
+		int test_order;
+		for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
+			unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
+			unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
+			if (test_order_objects >= min(32, MAX_OBJS_PER_PAGE))
+				break;
+			if (test_order_objects > order_objects << (test_order - order))
+				order = test_order;
+		}
+	}
+	return order;
 }
 
 static void
@@ -3562,7 +3577,7 @@ static int calculate_sizes(struct kmem_c
 	if (forced_order >= 0)
 		order = forced_order;
 	else
-		order = calculate_order(size, s->reserved);
+		order = calculate_order(size, s->reserved, flags);
 
 	if ((int)order < 0)
 		return 0;
Index: linux-2.6/drivers/md/dm-bufio.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-bufio.c	2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/drivers/md/dm-bufio.c	2018-04-16 21:11:23.000000000 +0200
@@ -1683,7 +1683,7 @@ struct dm_bufio_client *dm_bufio_client_
 	    (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
 		snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", c->block_size);
 		c->slab_cache = kmem_cache_create(slab_name, c->block_size, ARCH_KMALLOC_MINALIGN,
-						  SLAB_RECLAIM_ACCOUNT, NULL);
+						  SLAB_RECLAIM_ACCOUNT | SLAB_MINIMIZE_WASTE, NULL);
 		if (!c->slab_cache) {
 			r = -ENOMEM;
 			goto bad;
Index: linux-2.6/mm/slab.h
===================================================================
--- linux-2.6.orig/mm/slab.h	2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/mm/slab.h	2018-04-16 21:10:45.000000000 +0200
@@ -142,10 +142,10 @@ static inline slab_flags_t kmem_cache_fl
 #if defined(CONFIG_SLAB)
 #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
 			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
-			  SLAB_ACCOUNT)
+			  SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
 #elif defined(CONFIG_SLUB)
 #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
-			  SLAB_TEMPORARY | SLAB_ACCOUNT)
+			  SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
 #else
 #define SLAB_CACHE_FLAGS (0)
 #endif
@@ -164,7 +164,8 @@ static inline slab_flags_t kmem_cache_fl
 			      SLAB_NOLEAKTRACE | \
 			      SLAB_RECLAIM_ACCOUNT | \
 			      SLAB_TEMPORARY | \
-			      SLAB_ACCOUNT)
+			      SLAB_ACCOUNT | \
+			      SLAB_MINIMIZE_WASTE)
 
 bool __kmem_cache_empty(struct kmem_cache *);
 int __kmem_cache_shutdown(struct kmem_cache *);
Index: linux-2.6/mm/slab.c
===================================================================
--- linux-2.6.orig/mm/slab.c	2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/mm/slab.c	2018-04-16 21:10:45.000000000 +0200
@@ -1790,14 +1790,14 @@ static size_t calculate_slab_order(struc
 		 * as GFP_NOFS and we really don't want to have to be allocating
 		 * higher-order pages when we are unable to shrink dcache.
 		 */
-		if (flags & SLAB_RECLAIM_ACCOUNT)
+		if (flags & SLAB_RECLAIM_ACCOUNT && !(flags & SLAB_MINIMIZE_WASTE))
 			break;
 
 		/*
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
-		if (gfporder >= slab_max_order)
+		if (gfporder >= slab_max_order && !(flags & SLAB_MINIMIZE_WASTE))
 			break;
 
 		/*
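
To make the slub hunk above concrete, here is a user-space sketch of the new
ret_order loop applied to a 640KB object (4KB pages, reserved = 0, and the
MAX_OBJS_PER_PAGE clamp dropped; this is only an approximation of the patch,
not the patch itself):

/* Sketch: start from the smallest order that holds one 640KB object and
 * raise the order while doing so packs objects more densely, stopping
 * once a slab would hold 32 or more objects. */
#include <stdio.h>

#define PAGE_SIZE       4096UL
#define MAX_ORDER       11

int main(void)
{
        unsigned long size = 640UL * 1024;
        int order = 8;          /* 1MB slab, the smallest fitting one object */
        int test_order;

        for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
                unsigned long order_objects = (PAGE_SIZE << order) / size;
                unsigned long test_order_objects = (PAGE_SIZE << test_order) / size;

                if (test_order_objects >= 32)
                        break;
                if (test_order_objects > order_objects << (test_order - order))
                        order = test_order;
        }

        printf("chosen order %d: %lu KB slab, %lu objects, %lu KB wasted\n",
               order, (PAGE_SIZE << order) / 1024, (PAGE_SIZE << order) / size,
               ((PAGE_SIZE << order) % size) / 1024);
        return 0;
}

Under these assumptions the loop settles on order 9: a 2MB slab holding three
640KB objects with 128KB (about 6%) wasted, instead of one object per 1MB slab
with 384KB wasted.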

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 15:45                                       ` Christopher Lameter
@ 2018-04-16 19:36                                         ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 19:36 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton



On Mon, 16 Apr 2018, Christopher Lameter wrote:

> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> 
> > >
> > > Or an increase in slab_max_order
> >
> > But that will increase it for all slabs (often senselessly - i.e.
> > kmalloc-4096 would have order 4MB).
> 
> 4MB? Nope.... That is a power of two slab so no wasted space even with
> order 0.

See this email:
https://www.redhat.com/archives/dm-devel/2018-March/msg00387.html

If you boot with slub_max_order=10, the kmalloc-8192 cache has 64 pages. 
So yes, it increases the order of all slab caches (although not up to 
4MB).

> Its not a senseless increase. The more objects you fit into a slab page
> the higher the performance of the allocator.
> 
> 
> > I need to increase it just for dm-bufio slabs.
> 
> If you do this then others will want the same...

If others need it, they can turn on the flag SLAB_MINIMIZE_WASTE too.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 14:37                             ` Mikulas Patocka
@ 2018-04-16 19:38                               ` Vlastimil Babka
  -1 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-16 19:38 UTC (permalink / raw)
  To: Mikulas Patocka, Mike Snitzer
  Cc: Christopher Lameter, Matthew Wilcox, Pekka Enberg, linux-mm,
	dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton

On 04/16/2018 04:37 PM, Mikulas Patocka wrote:
>>> Can you or Mikulas briefly summarize how the dependency is avoided, and
>>> whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
>>> dm-bufio code would happily switch to it, or not?
>>
>> git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
>>
>> But the most significant commit relative to SLAB_MINIMIZE_WASTE is: 
>> 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for 
>> dm_buffer structure allocations")
>>
>> So no, I don't see why dm-bufio would need to switch to
>> SLAB_MINIMIZE_WASTE if it were introduced in the future.
> 
> Currently, the slab cache rounds up the size of the slab to the next power 
> of two (if the size is large). And that wastes memory if that memory were 
> to be used for deduplication tables.
> 
> Generally, the performance of the deduplication solution depends on how 
> much data can you put to memory. If you round 640KB buffer to 1MB (this is 
> what the slab and slub subsystem currently do), you waste a lot of memory. 
> Deduplication indices with 640KB blocks are already used in the wild, so 
> it can't be easily changed.

Thank you both for the clarification.

Vlastimil

> 
>> Mike
> 
> Mikulas
> 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 19:36                                         ` Mikulas Patocka
  (?)
@ 2018-04-16 19:53                                         ` Vlastimil Babka
  2018-04-16 21:01                                           ` Mikulas Patocka
  2018-04-17 14:49                                             ` Christopher Lameter
  -1 siblings, 2 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-16 19:53 UTC (permalink / raw)
  To: Mikulas Patocka, Christopher Lameter
  Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
	David Rientjes, Joonsoo Kim, Andrew Morton

On 04/16/2018 09:36 PM, Mikulas Patocka wrote:
> 
> 
> On Mon, 16 Apr 2018, Christopher Lameter wrote:
> 
>> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>>
>>>>
>>>> Or an increase in slab_max_order
>>>
>>> But that will increase it for all slabs (often senselessly - i.e.
>>> kmalloc-4096 would have order 4MB).
>>
>> 4MB? Nope.... That is a power of two slab so no wasted space even with
>> order 0.
> 
> See this email:
> https://www.redhat.com/archives/dm-devel/2018-March/msg00387.html
> 
> If you boot with slub_max_order=10, the kmalloc-8192 cache has 64 pages. 
> So yes, it increases the order of all slab caches (although not up to 
> 4MB).
> 
>> Its not a senseless increase. The more objects you fit into a slab page
>> the higher the performance of the allocator.

It's not universally without a cost. It might increase internal
fragmentation of the slabs, if you end up with lots of 4MB pages
containing just a few objects. Thus, waste of memory. You also consume
high-order pages that could be used elsewhere. If you fail to allocate
4MB, then what's the fallback, order-0? I doubt it's "the highest
available order". Thus, a more conservative choice, e.g. order-3, might
succeed more often in allocating order-3, while a choice of 4MB will
have many order-0 fallbacks.

>>> I need to increase it just for dm-bufio slabs.
>>
>> If you do this then others will want the same...
> 
> If others need it, they can turn on the flag SLAB_MINIMIZE_WASTE too.

I think it should be possible without a new flag. The slub allocator
could just balance priorities (performance vs memory efficiency) better.
Currently I get the impression that "slub_max_order" is a performance
tunable. Let's add another criterion for selecting an order, one that
tries to keep wasted space below e.g. 10%, with some different kind of
max order. Pick good defaults, add tunables if you must.

I mean, anyone who's creating a cache for 640KB objects most likely
doesn't want to waste another 384KB for each such object. They shouldn't
have to add a flag to let the slub allocator figure out that using 2MB
pages is the right thing to do here.
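
A rough user-space sketch of the kind of criterion suggested here (the
function name, the 10% threshold and the order cap are all illustrative, not
an actual slub change):

/* Sketch: pick the smallest order whose internal waste stays below a
 * given percentage, falling back to the cap if nothing qualifies. */
#include <stdio.h>

#define PAGE_SIZE       4096UL

static int order_for_waste_limit(unsigned long size, int max_order,
                                 unsigned long max_waste_pct)
{
        int order;

        for (order = 0; order <= max_order; order++) {
                unsigned long slab = PAGE_SIZE << order;
                unsigned long objects = slab / size;

                if (!objects)
                        continue;       /* the object does not fit yet */
                if ((slab - objects * size) * 100 / slab <= max_waste_pct)
                        return order;
        }
        return max_order;
}

int main(void)
{
        /* a 640KB object with at most 10% waste and a cap of order 10 */
        printf("order %d\n", order_for_waste_limit(640UL * 1024, 10, 10));
        return 0;
}

With these numbers it picks order 9 (2MB slabs, roughly 6% waste) rather than
padding each 640KB object to 1MB.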

Vlastimil

> Mikulas
> 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 19:53                                         ` Vlastimil Babka
@ 2018-04-16 21:01                                           ` Mikulas Patocka
  2018-04-17 14:40                                             ` Christopher Lameter
  2018-04-17 14:49                                             ` Christopher Lameter
  1 sibling, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 21:01 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Christopher Lameter, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel



On Mon, 16 Apr 2018, Vlastimil Babka wrote:

> On 04/16/2018 09:36 PM, Mikulas Patocka wrote:
> 
> >>> I need to increase it just for dm-bufio slabs.
> >>
> >> If you do this then others will want the same...
> > 
> > If others need it, they can turn on the flag SLAB_MINIMIZE_WASTE too.
> 
> I think it should be possible without a new flag. The slub allocator
> could just balance priorities (performance vs memory efficiency) better.
> Currently I get the impression that "slub_max_order" is a performance
> tunable. Let's add another criteria for selecting an order, that would
> try to pick an order to minimize wasted space below e.g. 10% with some
> different kind of max order. Pick good defaults, add tunables if you must.
> 
> I mean, anyone who's creating a cache for 640KB objects most likely
> doesn't want to waste another 384KB by each such object. They shouldn't
> have to add a flag to let the slub allocator figure out that using 2MB
> pages is the right thing to do here.
> 
> Vlastimil

The problem is that higher-order allocations (larger than 32K) are 
unreliable. So, if you increase page order beyond that, the allocation may 
randomly fail.

dm-bufio deals gracefully with allocation failure, because it preallocates 
some buffers with vmalloc, but other subsystems may not deal with it and 
they could return ENOMEM randomly or misbehave in other ways. So, the 
"SLAB_MINIMIZE_WASTE" flag is also saying that the allocation may fail and 
the caller is prepared to deal with it.

The slub subsystem actually falls back to a lower order when the allocation 
fails (it allows a different order for each slab in the same cache), but 
slab doesn't fall back and you get NULL if the higher-order allocation fails. 
So, SLAB_MINIMIZE_WASTE is needed for slab because with a higher order it 
will just randomly fail.
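
For reference, a simplified sketch of the fallback pattern described above
(it loosely mirrors what dm-bufio does, but the signature, GFP flags and body
here are illustrative, not the actual dm-bufio code):

/* Try the possibly high-order slab cache first; if that fails, fall back
 * to vmalloc, which is slower but does not need contiguous pages. */
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *alloc_buffer_data(struct kmem_cache *cache, size_t block_size)
{
        void *data;

        data = kmem_cache_alloc(cache, GFP_NOIO | __GFP_NOWARN);
        if (data)
                return data;

        return __vmalloc(block_size, GFP_NOIO, PAGE_KERNEL);
}

The vmalloc path is what makes it safe for the slab path to rely on an
unreliable high-order allocation.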

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 12:38                         ` Vlastimil Babka
@ 2018-04-16 21:04                           ` Mikulas Patocka
  -1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 21:04 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Mike Snitzer, Christopher Lameter, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton



On Mon, 16 Apr 2018, Vlastimil Babka wrote:

> On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > On Fri, Apr 13 2018 at  5:22am -0400,
> > Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> >> attending, or anyone else that can vouch for your usecase?
> > 
> > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > 
> > Mikulas won't be at LSF/MM.  But I included Mikulas' dm-bufio changes
> > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > the 4.17 merge window).
> 
> Can you or Mikulas briefly summarize how the dependency is avoided, and

This was Mike's misconception - dm-bufio actually needs the 
SLAB_MINIMIZE_WASTE patch, otherwise it is wasting memory.

> whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> dm-bufio code would happily switch to it, or not?
> 
> Thanks,
> Vlastimil

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 21:01                                           ` Mikulas Patocka
@ 2018-04-17 14:40                                             ` Christopher Lameter
  2018-04-17 18:53                                                 ` Mikulas Patocka
  0 siblings, 1 reply; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 14:40 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Vlastimil Babka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel

On Mon, 16 Apr 2018, Mikulas Patocka wrote:

> dm-bufio deals gracefully with allocation failure, because it preallocates
> some buffers with vmalloc, but other subsystems may not deal with it and
> they could return ENOMEM randomly or misbehave in other ways. So, the
> "SLAB_MINIMIZE_WASTE" flag is also saying that the allocation may fail and
> the caller is prepared to deal with it.
>
> The slub subsystem does actual fallback to low-order when the allocation
> fails (it allows different order for each slab in the same cache), but
> slab doesn't fallback and you get NULL if higher-order allocation fails.
> So, SLAB_MINIMIZE_WASTE is needed for slab because it will just randomly
> fail with higher order.

Fix Slab instead of adding a flag that is only useful for one allocator?

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 19:32                               ` [PATCH RESEND] " Mikulas Patocka
@ 2018-04-17 14:45                                 ` Christopher Lameter
  2018-04-17 16:16                                   ` Vlastimil Babka
  2018-04-17 19:06                                     ` Mikulas Patocka
  0 siblings, 2 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 14:45 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel

On Mon, 16 Apr 2018, Mikulas Patocka wrote:

> This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
> flag causes allocation of larger slab caches in order to minimize wasted
> space.
>
> This is needed because we want to use dm-bufio for deduplication index and
> there are existing installations with non-power-of-two block sizes (such
> as 640KB). The performance of the whole solution depends on efficient
> memory use, so we must waste as little memory as possible.

Hmmm. Can we come up with a generic solution instead?

This may mean relaxing the enforcement of the allocation max order a bit
so that we can get dense allocation through higher order allocs.

But then higher order allocs are generally seen as problematic.

Note that SLUB will fall back to smallest order already if a failure
occurs so increasing slub_max_order may not be that much of an issue.

Maybe drop the max order limit completely and use MAX_ORDER instead? That
means that callers need to be able to tolerate failures.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 19:36                                         ` Mikulas Patocka
@ 2018-04-17 14:47                                           ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 14:47 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton

On Mon, 16 Apr 2018, Mikulas Patocka wrote:

> If you boot with slub_max_order=10, the kmalloc-8192 cache has 64 pages.
> So yes, it increases the order of all slab caches (although not up to
> 4MB).

Hmmm... Ok. There is another setting slub_min_objects that controls how
many objects to fit into a slab page.

We could change the allocation scheme so that it finds the minimum order
with the minimum waste. Allocator performance will drop though since fewer
objects are then in a slab page.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-16 19:53                                         ` Vlastimil Babka
@ 2018-04-17 14:49                                             ` Christopher Lameter
  2018-04-17 14:49                                             ` Christopher Lameter
  1 sibling, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 14:49 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Mikulas Patocka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton

On Mon, 16 Apr 2018, Vlastimil Babka wrote:

> >> Its not a senseless increase. The more objects you fit into a slab page
> >> the higher the performance of the allocator.
>
> It's not universally without a cost. It might increase internal
> fragmentation of the slabs, if you end up with lots of 4MB pages
> containing just few objects. Thus, waste of memory. You also consume
> high-order pages that could be used elsewhere. If you fail to allocate
> 4MB, then what's the fallback, order-0? I doubt it's "the highest
> available order". Thus, a more conservative choice, e.g. order-3, might
> succeed more often in allocating order-3, while a choice of 4MB will
> have many order-0 fallbacks.

Obviously there is a cost. But here the subsystem has a fallback.

> I think it should be possible without a new flag. The slub allocator
> could just balance priorities (performance vs memory efficiency) better.
> Currently I get the impression that "slub_max_order" is a performance
> tunable. Let's add another criteria for selecting an order, that would
> try to pick an order to minimize wasted space below e.g. 10% with some
> different kind of max order. Pick good defaults, add tunables if you must.

There is also slub_min_objects.

> I mean, anyone who's creating a cache for 640KB objects most likely
> doesn't want to waste another 384KB by each such object. They shouldn't
> have to add a flag to let the slub allocator figure out that using 2MB
> pages is the right thing to do here.

I agree if we do this then preferably without a flag.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-17 14:45                                 ` Christopher Lameter
@ 2018-04-17 16:16                                   ` Vlastimil Babka
  2018-04-17 16:38                                     ` Christopher Lameter
  2018-04-17 17:26                                     ` Mikulas Patocka
  2018-04-17 19:06                                     ` Mikulas Patocka
  1 sibling, 2 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-17 16:16 UTC (permalink / raw)
  To: Christopher Lameter, Mikulas Patocka
  Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
	David Rientjes, Joonsoo Kim, Andrew Morton, linux-kernel

On 04/17/2018 04:45 PM, Christopher Lameter wrote:
> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> 
>> This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
>> flag causes allocation of larger slab caches in order to minimize wasted
>> space.
>>
>> This is needed because we want to use dm-bufio for deduplication index and
>> there are existing installations with non-power-of-two block sizes (such
>> as 640KB). The performance of the whole solution depends on efficient
>> memory use, so we must waste as little memory as possible.
> 
> Hmmm. Can we come up with a generic solution instead?

Yes please.

> This may mean relaxing the enforcement of the allocation max order a bit
> so that we can get dense allocation through higher order allocs.
> 
> But then higher order allocs are generally seen as problematic.

I think in this case they are better than wasting/fragmenting 384kB for a
640kB object.

> Note that SLUB will fall back to smallest order already if a failure
> occurs so increasing slub_max_order may not be that much of an issue.
> 
> Maybe drop the max order limit completely and use MAX_ORDER instead?

For packing, sure. For performance, please no (i.e. don't try to
allocate MAX_ORDER for each and every cache).

> That
> means that callers need to be able to tolerate failures.

Is it any different from now? I suppose there would still be
smallest-order fallback involved in sl*b itself? And if your allocation
is so large it can fail even with the fallback (i.e. >= costly order),
you need to tolerate failures anyway?

One corner case I see is if there is anyone who would rather use their
own fallback instead of the space-wasting smallest-order fallback.
Maybe we could map some GFP flag to indicate that.

> 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-17 16:16                                   ` Vlastimil Babka
@ 2018-04-17 16:38                                     ` Christopher Lameter
  2018-04-17 19:09                                       ` Mikulas Patocka
  2018-04-17 17:26                                     ` Mikulas Patocka
  1 sibling, 1 reply; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 16:38 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Mikulas Patocka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel

On Tue, 17 Apr 2018, Vlastimil Babka wrote:

> On 04/17/2018 04:45 PM, Christopher Lameter wrote:

> > But then higher order allocs are generally seen as problematic.
>
> I think in this case they are better than wasting/fragmenting 384kB for
> 640kB object.

Well typically we have suggested that people use vmalloc in the past.


> > Note that SLUB will fall back to smallest order already if a failure
> > occurs so increasing slub_max_order may not be that much of an issue.
> >
> > Maybe drop the max order limit completely and use MAX_ORDER instead?
>
> For packing, sure. For performance, please no (i.e. don't try to
> allocate MAX_ORDER for each and every cache).

No of course not. We would have to modify the order selection on kmem
cache creation.

> > That
> > means that callers need to be able to tolerate failures.
>
> Is it any different from now? I suppose there would still be
> smallest-order fallback involved in sl*b itself? And if your allocation
> is so large it can fail even with the fallback (i.e. >= costly order),
> you need to tolerate failures anyway?

Failures can occur even with < costly order as far as I can tell. Order 0
is the only safe one.

> One corner case I see is if there is anyone who would rather use their
> own fallback instead of the space-wasting smallest-order fallback.
> Maybe we could map some GFP flag to indicate that.

Well if you have a fallback then maybe the slab allocator should not fall
back on its own but let the caller deal with it.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-17 16:16                                   ` Vlastimil Babka
  2018-04-17 16:38                                     ` Christopher Lameter
@ 2018-04-17 17:26                                     ` Mikulas Patocka
  2018-04-17 19:13                                       ` Vlastimil Babka
  1 sibling, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 17:26 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Christopher Lameter, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel



On Tue, 17 Apr 2018, Vlastimil Babka wrote:

> On 04/17/2018 04:45 PM, Christopher Lameter wrote:
> > On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> > 
> >> This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
> >> flag causes allocation of larger slab caches in order to minimize wasted
> >> space.
> >>
> >> This is needed because we want to use dm-bufio for deduplication index and
> >> there are existing installations with non-power-of-two block sizes (such
> >> as 640KB). The performance of the whole solution depends on efficient
> >> memory use, so we must waste as little memory as possible.
> > 
> > Hmmm. Can we come up with a generic solution instead?
> 
> Yes please.
> 
> > This may mean relaxing the enforcement of the allocation max order a bit
> > so that we can get dense allocation through higher order allocs.
> > 
> > But then higher order allocs are generally seen as problematic.
> 
> I think in this case they are better than wasting/fragmenting 384kB for
> 640kB object.

Wasting 37% of memory is still better than the kernel randomly returning 
-ENOMEM when higher-order allocation fails.

> > That
> > means that callers need to be able to tolerate failures.
> 
> Is it any different from now? I suppose there would still be
> smallest-order fallback involved in sl*b itself? And if your allocation
> is so large it can fail even with the fallback (i.e. >= costly order),
> you need to tolerate failures anyway?
> 
> One corner case I see is if there is anyone who would rather use their
> own fallback instead of the space-wasting smallest-order fallback.
> Maybe we could map some GFP flag to indicate that.

For example, if you create a cache with 17KB objects, the slab subsystem 
will pad it up to 32KB. You are wasting almost half the memory, but the 
allocation is reliable and it won't fail.

If you use an order larger than 32KB, you get less wasted memory, but you 
also get random -ENOMEMs (yes, we had a problem in dm-thin where it was 
randomly failing during initialization due to a 64KB allocation).
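
The same trade-off worked through for the 17KB case (illustrative user-space
arithmetic, 4KB pages assumed):

/* Sketch: waste for 17KB objects at increasing slab orders. Order 3
 * (32KB) is the reliable choice; higher orders waste less but depend on
 * larger contiguous allocations. */
#include <stdio.h>

#define PAGE_SIZE       4096UL

int main(void)
{
        unsigned long size = 17UL * 1024;
        int order;

        for (order = 3; order <= 5; order++) {
                unsigned long slab = PAGE_SIZE << order;
                unsigned long objects = slab / size;
                unsigned long waste = slab - objects * size;

                printf("order %d (%3lu KB): %lu objects, %2lu KB wasted (%lu%%)\n",
                       order, slab / 1024, objects, waste / 1024,
                       waste * 100 / slab);
        }
        return 0;
}

Under these assumptions order 3 wastes about 46% per slab, order 4 about 20%,
and order 5 about 7%.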

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-17 14:40                                             ` Christopher Lameter
@ 2018-04-17 18:53                                                 ` Mikulas Patocka
  0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 18:53 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Vlastimil Babka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel



On Tue, 17 Apr 2018, Christopher Lameter wrote:

> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> 
> > dm-bufio deals gracefully with allocation failure, because it preallocates
> > some buffers with vmalloc, but other subsystems may not deal with it and
> > they could return ENOMEM randomly or misbehave in other ways. So, the
> > "SLAB_MINIMIZE_WASTE" flag is also saying that the allocation may fail and
> > the caller is prepared to deal with it.
> >
> > The slub subsystem does actual fallback to low-order when the allocation
> > fails (it allows different order for each slab in the same cache), but
> > slab doesn't fallback and you get NULL if higher-order allocation fails.
> > So, SLAB_MINIMIZE_WASTE is needed for slab because it will just randomly
> > fail with higher order.
> 
> Fix Slab instead of adding a flag that is only useful for one allocator?

Slab assumes that all slabs have the same order, so it's not so easy to 
fix it.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-17 14:45                                 ` Christopher Lameter
@ 2018-04-17 19:06                                     ` Mikulas Patocka
  2018-04-17 19:06                                     ` Mikulas Patocka
  1 sibling, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 19:06 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel



On Tue, 17 Apr 2018, Christopher Lameter wrote:

> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> 
> > This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
> > flag causes allocation of larger slab caches in order to minimize wasted
> > space.
> >
> > This is needed because we want to use dm-bufio for deduplication index and
> > there are existing installations with non-power-of-two block sizes (such
> > as 640KB). The performance of the whole solution depends on efficient
> > memory use, so we must waste as little memory as possible.
> 
> Hmmm. Can we come up with a generic solution instead?
> 
> This may mean relaxing the enforcement of the allocation max order a bit
> so that we can get dense allocation through higher order allocs.
> 
> But then higher order allocs are generally seen as problematic.
> 
> Note that SLUB will fall back to smallest order already if a failure
> occurs so increasing slub_max_order may not be that much of an issue.
> 
> Maybe drop the max order limit completely and use MAX_ORDER instead? That
> means that callers need to be able to tolerate failures.

I can make a slub-only patch with no extra flag (on a freshly booted 
system it increases only the order of caches "TCPv6" and "sighand_cache" 
by one - so it should not have unexpected effects):

Doing a generic solution for slab would be more complicated because slab 
assumes that all slabs have the same order, so it can't fall back to 
lower-order allocations.
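
One way to check the claim about which caches change order is to read the
SLUB sysfs files before and after booting the patched kernel; a small
user-space sketch (assumes SLUB with sysfs, and that the named caches exist
and are not merged away on the running system):

/* Sketch: print the page order currently used by a few slab caches. */
#include <stdio.h>

static void show_order(const char *cache)
{
        char path[128], buf[16];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/slab/%s/order", cache);
        f = fopen(path, "r");
        if (!f) {
                printf("%-16s not found\n", cache);
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%-16s order %s", cache, buf);
        fclose(f);
}

int main(void)
{
        show_order("TCPv6");
        show_order("sighand_cache");
        show_order("kmalloc-8192");
        return 0;
}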


From: Mikulas Patocka <mpatocka@redhat.com>
Subject: [PATCH] slub: minimize wasted space

When the object size is larger than what fits into a slab of order
slub_max_order, the slub subsystem rounds the allocation up to the next
power of two. This causes a lot of wasted space - e.g. a 640KB block
consumes 1MB of memory.

This patch makes the slub subsystem increase the order if it is beneficial.
The order is increased as long as it reduces wasted space. There is a cutoff
at 32 objects per slab.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 mm/slub.c |   21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2018-04-17 19:59:49.000000000 +0200
+++ linux-2.6/mm/slub.c	2018-04-17 20:58:23.000000000 +0200
@@ -3252,6 +3252,7 @@ static inline unsigned int slab_order(un
 static inline int calculate_order(unsigned int size, unsigned int reserved)
 {
 	unsigned int order;
+	unsigned int test_order;
 	unsigned int min_objects;
 	unsigned int max_objects;
 
@@ -3277,7 +3278,7 @@ static inline int calculate_order(unsign
 			order = slab_order(size, min_objects,
 					slub_max_order, fraction, reserved);
 			if (order <= slub_max_order)
-				return order;
+				goto ret_order;
 			fraction /= 2;
 		}
 		min_objects--;
@@ -3289,15 +3290,25 @@ static inline int calculate_order(unsign
 	 */
 	order = slab_order(size, 1, slub_max_order, 1, reserved);
 	if (order <= slub_max_order)
-		return order;
+		goto ret_order;
 
 	/*
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
 	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
-	if (order < MAX_ORDER)
-		return order;
-	return -ENOSYS;
+	if (order >= MAX_ORDER)
+		return -ENOSYS;
+
+ret_order:
+	for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
+		unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
+		unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
+		if (test_order_objects > min(32, MAX_OBJS_PER_PAGE))
+			break;
+		if (test_order_objects > order_objects << (test_order - order))
+			order = test_order;
+	}
+	return order;
 }
 
 static void

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-17 19:06                                     ` Mikulas Patocka
  0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 19:06 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mike Snitzer, linux-kernel, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	Vlastimil Babka



On Tue, 17 Apr 2018, Christopher Lameter wrote:

> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> 
> > This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
> > flag causes allocation of larger slab caches in order to minimize wasted
> > space.
> >
> > This is needed because we want to use dm-bufio for deduplication index and
> > there are existing installations with non-power-of-two block sizes (such
> > as 640KB). The performance of the whole solution depends on efficient
> > memory use, so we must waste as little memory as possible.
> 
> Hmmm. Can we come up with a generic solution instead?
> 
> This may mean relaxing the enforcement of the allocation max order a bit
> so that we can get dense allocation through higher order allocs.
> 
> But then higher order allocs are generally seen as problematic.
> 
> Note that SLUB will fall back to smallest order already if a failure
> occurs so increasing slub_max_order may not be that much of an issue.
> 
> Maybe drop the max order limit completely and use MAX_ORDER instead? That
> means that callers need to be able to tolerate failures.

I can make a slub-only patch with no extra flag (on a freshly booted 
system it increases only the order of caches "TCPv6" and "sighand_cache" 
by one - so it should not have unexpected effects):

Doing a generic solution for slab would be more comlpicated because slab 
assumes that all slabs have the same order, so it can't fall-back to 
lower-order allocations.


From: Mikulas Patocka <mpatocka@redhat.com>
Subject: [PATCH] slub: minimize wasted space

When the object size is too large to fit into a slab of order
slub_max_order, the slub subsystem rounds the allocation up to the next
power of two. This causes a lot of wasted space - e.g. a 640KB block
consumes 1MB of memory.

This patch makes the slub subsystem increase the order if it is beneficial.
The order is increased as long as it reduces wasted space. There is a
cutoff at 32 objects per slab.
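
For example, with 4KB pages (and ignoring the small per-slab reserved
area): a 640KB object needs at least an order-8 (1MB) slab, which holds
one object and wastes 384KB (37.5%). An order-9 (2MB) slab holds three
such objects and wastes only 128KB (6.25%), so the loop raises the order
to 9. An order-10 (4MB) slab holds six objects, which is no denser, so
the order is not raised any further.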

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 mm/slub.c |   21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2018-04-17 19:59:49.000000000 +0200
+++ linux-2.6/mm/slub.c	2018-04-17 20:58:23.000000000 +0200
@@ -3252,6 +3252,7 @@ static inline unsigned int slab_order(un
 static inline int calculate_order(unsigned int size, unsigned int reserved)
 {
 	unsigned int order;
+	unsigned int test_order;
 	unsigned int min_objects;
 	unsigned int max_objects;
 
@@ -3277,7 +3278,7 @@ static inline int calculate_order(unsign
 			order = slab_order(size, min_objects,
 					slub_max_order, fraction, reserved);
 			if (order <= slub_max_order)
-				return order;
+				goto ret_order;
 			fraction /= 2;
 		}
 		min_objects--;
@@ -3289,15 +3290,25 @@ static inline int calculate_order(unsign
 	 */
 	order = slab_order(size, 1, slub_max_order, 1, reserved);
 	if (order <= slub_max_order)
-		return order;
+		goto ret_order;
 
 	/*
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
 	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
-	if (order < MAX_ORDER)
-		return order;
-	return -ENOSYS;
+	if (order >= MAX_ORDER)
+		return -ENOSYS;
+
+ret_order:
+	for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
+		unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
+		unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
+		if (test_order_objects > min(32, MAX_OBJS_PER_PAGE))
+			break;
+		if (test_order_objects > order_objects << (test_order - order))
+			order = test_order;
+	}
+	return order;
 }
 
 static void

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-17 16:38                                     ` Christopher Lameter
@ 2018-04-17 19:09                                       ` Mikulas Patocka
  0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 19:09 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Vlastimil Babka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel



On Tue, 17 Apr 2018, Christopher Lameter wrote:

> On Tue, 17 Apr 2018, Vlastimil Babka wrote:
> 
> > On 04/17/2018 04:45 PM, Christopher Lameter wrote:
> 
> > > But then higher order allocs are generally seen as problematic.
> >
> > I think in this case they are better than wasting/fragmenting 384kB for
> > 640kB object.
> 
> Well typically we have suggested that people use vmalloc in the past.

vmalloc is slow - it is unusable for a buffer cache.

> > > That
> > > means that callers need to be able to tolerate failures.
> >
> > Is it any different from now? I suppose there would still be
> > smallest-order fallback involved in sl*b itself? And if your allocation
> > is so large it can fail even with the fallback (i.e. >= costly order),
> > you need to tolerate failures anyway?
> 
> Failures can occur even with < costly order as far as I can tell. Order 0
> is the only safe one.

The alloc_pages function seems to retry indefinitely for order <= 
PAGE_ALLOC_COSTLY_ORDER. Do you have some explanation of why it should fail?

> > One corner case I see is if there is anyone who would rather use their
> > own fallback instead of the space-wasting smallest-order fallback.
> > Maybe we could map some GFP flag to indicate that.
> 
> Well if you have a fallback then maybe the slab allocator should not fall
> back on its own but let the caller deal with it.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-17 17:26                                     ` Mikulas Patocka
@ 2018-04-17 19:13                                       ` Vlastimil Babka
  0 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-17 19:13 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Christopher Lameter, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel

On 04/17/2018 07:26 PM, Mikulas Patocka wrote:
> 
> 
> On Tue, 17 Apr 2018, Vlastimil Babka wrote:
> 
>> On 04/17/2018 04:45 PM, Christopher Lameter wrote:
>>> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>>>
>>>> This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
>>>> flag causes allocation of larger slab caches in order to minimize wasted
>>>> space.
>>>>
>>>> This is needed because we want to use dm-bufio for deduplication index and
>>>> there are existing installations with non-power-of-two block sizes (such
>>>> as 640KB). The performance of the whole solution depends on efficient
>>>> memory use, so we must waste as little memory as possible.
>>>
>>> Hmmm. Can we come up with a generic solution instead?
>>
>> Yes please.
>>
>>> This may mean relaxing the enforcement of the allocation max order a bit
>>> so that we can get dense allocation through higher order allocs.
>>>
>>> But then higher order allocs are generally seen as problematic.
>>
>> I think in this case they are better than wasting/fragmenting 384kB for
>> 640kB object.
> 
> Wasting 37% of memory is still better than the kernel randomly returning 
> -ENOMEM when higher-order allocation fails.

Of course, see below.

>>> That
>>> means that callers need to be able to tolerate failures.
>>
>> Is it any different from now? I suppose there would still be
>> smallest-order fallback involved in sl*b itself? And if your allocation

^ There: "I suppose there would still be smallest-order fallback
involved in sl*b itself?"

If SLAB doesn't currently support fallback to different order, it either
learns to do that, or keeps wasting memory and more people will migrate
to SLUB. Simple.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-17 18:53                                                 ` Mikulas Patocka
  (?)
@ 2018-04-17 21:42                                                 ` Christopher Lameter
  -1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 21:42 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Vlastimil Babka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel

On Tue, 17 Apr 2018, Mikulas Patocka wrote:

> > > The slub subsystem does actual fallback to low-order when the allocation
> > > fails (it allows different order for each slab in the same cache), but
> > > slab doesn't fallback and you get NULL if higher-order allocation fails.
> > > So, SLAB_MINIMIZE_WASTE is needed for slab because it will just randomly
> > > fail with higher order.
> >
> > Fix Slab instead of adding a flag that is only useful for one allocator?
>
> Slab assumes that all slabs have the same order, so it's not so easy to
> fix it.

Well since SLAB uses compound pages one could easily determine the order
of the page and thus also support multiple page orders there.
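
Roughly along these lines, just as a sketch of the idea (not a tested
change - slab would also have to stop assuming a single cachep->gfporder
everywhere):

	struct page *page = virt_to_head_page(objp);
	unsigned int order = compound_order(page);
	unsigned int slab_size = PAGE_SIZE << order;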

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-17 19:06                                     ` Mikulas Patocka
  (?)
@ 2018-04-18 14:55                                     ` Christopher Lameter
  2018-04-25 21:04                                       ` Mikulas Patocka
  -1 siblings, 1 reply; 109+ messages in thread
From: Christopher Lameter @ 2018-04-18 14:55 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel

On Tue, 17 Apr 2018, Mikulas Patocka wrote:

> I can make a slub-only patch with no extra flag (on a freshly booted
> system it increases only the order of caches "TCPv6" and "sighand_cache"
> by one - so it should not have unexpected effects):
>
> Doing a generic solution for slab would be more complicated because slab
> assumes that all slabs have the same order, so it can't fall-back to
> lower-order allocations.

Well again SLAB uses compound pages and thus would be able to detect the
size of the page. It may be some work but it could be done.

>
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c	2018-04-17 19:59:49.000000000 +0200
> +++ linux-2.6/mm/slub.c	2018-04-17 20:58:23.000000000 +0200
> @@ -3252,6 +3252,7 @@ static inline unsigned int slab_order(un
>  static inline int calculate_order(unsigned int size, unsigned int reserved)
>  {
>  	unsigned int order;
> +	unsigned int test_order;
>  	unsigned int min_objects;
>  	unsigned int max_objects;
>
> @@ -3277,7 +3278,7 @@ static inline int calculate_order(unsign
>  			order = slab_order(size, min_objects,
>  					slub_max_order, fraction, reserved);
>  			if (order <= slub_max_order)
> -				return order;
> +				goto ret_order;
>  			fraction /= 2;
>  		}
>  		min_objects--;
> @@ -3289,15 +3290,25 @@ static inline int calculate_order(unsign
>  	 */
>  	order = slab_order(size, 1, slub_max_order, 1, reserved);

The slab order is determined in slab_order()

>  	if (order <= slub_max_order)
> -		return order;
> +		goto ret_order;
>
>  	/*
>  	 * Doh this slab cannot be placed using slub_max_order.
>  	 */
>  	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
> -	if (order < MAX_ORDER)
> -		return order;
> -	return -ENOSYS;
> +	if (order >= MAX_ORDER)
> +		return -ENOSYS;
> +
> +ret_order:
> +	for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
> +		unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
> +		unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
> +		if (test_order_objects > min(32, MAX_OBJS_PER_PAGE))
> +			break;
> +		if (test_order_objects > order_objects << (test_order - order))
> +			order = test_order;
> +	}
> +	return order;

Could yo move that logic into slab_order()? It does something awfully
similar.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-18 14:55                                     ` Christopher Lameter
@ 2018-04-25 21:04                                       ` Mikulas Patocka
  2018-04-25 23:24                                         ` Mikulas Patocka
  2018-04-26 18:51                                         ` Christopher Lameter
  0 siblings, 2 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-25 21:04 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel



On Wed, 18 Apr 2018, Christopher Lameter wrote:

> On Tue, 17 Apr 2018, Mikulas Patocka wrote:
> 
> > I can make a slub-only patch with no extra flag (on a freshly booted
> > system it increases only the order of caches "TCPv6" and "sighand_cache"
> > by one - so it should not have unexpected effects):
> >
> > Doing a generic solution for slab would be more complicated because slab
> > assumes that all slabs have the same order, so it can't fall-back to
> > lower-order allocations.
> 
> Well again SLAB uses compound pages and thus would be able to detect the
> size of the page. It may be some work but it could be done.
> 
> >
> > Index: linux-2.6/mm/slub.c
> > ===================================================================
> > --- linux-2.6.orig/mm/slub.c	2018-04-17 19:59:49.000000000 +0200
> > +++ linux-2.6/mm/slub.c	2018-04-17 20:58:23.000000000 +0200
> > @@ -3252,6 +3252,7 @@ static inline unsigned int slab_order(un
> >  static inline int calculate_order(unsigned int size, unsigned int reserved)
> >  {
> >  	unsigned int order;
> > +	unsigned int test_order;
> >  	unsigned int min_objects;
> >  	unsigned int max_objects;
> >
> > @@ -3277,7 +3278,7 @@ static inline int calculate_order(unsign
> >  			order = slab_order(size, min_objects,
> >  					slub_max_order, fraction, reserved);
> >  			if (order <= slub_max_order)
> > -				return order;
> > +				goto ret_order;
> >  			fraction /= 2;
> >  		}
> >  		min_objects--;
> > @@ -3289,15 +3290,25 @@ static inline int calculate_order(unsign
> >  	 */
> >  	order = slab_order(size, 1, slub_max_order, 1, reserved);
> 
> The slab order is determined in slab_order()
> 
> >  	if (order <= slub_max_order)
> > -		return order;
> > +		goto ret_order;
> >
> >  	/*
> >  	 * Doh this slab cannot be placed using slub_max_order.
> >  	 */
> >  	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
> > -	if (order < MAX_ORDER)
> > -		return order;
> > -	return -ENOSYS;
> > +	if (order >= MAX_ORDER)
> > +		return -ENOSYS;
> > +
> > +ret_order:
> > +	for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
> > +		unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
> > +		unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
> > +		if (test_order_objects > min(32, MAX_OBJS_PER_PAGE))
> > +			break;
> > +		if (test_order_objects > order_objects << (test_order - order))
> > +			order = test_order;
> > +	}
> > +	return order;
> 
> Could yo move that logic into slab_order()? It does something awfully
> similar.

But slab_order (and its caller) limits the order to "max_order" and we 
want more.

Perhaps slab_order should be dropped and calculate_order totally 
rewritten?

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-25 21:04                                       ` Mikulas Patocka
@ 2018-04-25 23:24                                         ` Mikulas Patocka
  2018-04-26 19:01                                           ` Christopher Lameter
  2018-04-26 18:51                                         ` Christopher Lameter
  1 sibling, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-25 23:24 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel



On Wed, 25 Apr 2018, Mikulas Patocka wrote:

> 
> 
> On Wed, 18 Apr 2018, Christopher Lameter wrote:
> 
> > On Tue, 17 Apr 2018, Mikulas Patocka wrote:
> > 
> > > I can make a slub-only patch with no extra flag (on a freshly booted
> > > system it increases only the order of caches "TCPv6" and "sighand_cache"
> > > by one - so it should not have unexpected effects):
> > >
> > > Doing a generic solution for slab would be more complicated because slab
> > > assumes that all slabs have the same order, so it can't fall-back to
> > > lower-order allocations.
> > 
> > Well again SLAB uses compound pages and thus would be able to detect the
> > size of the page. It may be some work but it could be done.
> > 
> > >
> > > Index: linux-2.6/mm/slub.c
> > > ===================================================================
> > > --- linux-2.6.orig/mm/slub.c	2018-04-17 19:59:49.000000000 +0200
> > > +++ linux-2.6/mm/slub.c	2018-04-17 20:58:23.000000000 +0200
> > > @@ -3252,6 +3252,7 @@ static inline unsigned int slab_order(un
> > >  static inline int calculate_order(unsigned int size, unsigned int reserved)
> > >  {
> > >  	unsigned int order;
> > > +	unsigned int test_order;
> > >  	unsigned int min_objects;
> > >  	unsigned int max_objects;
> > >
> > > @@ -3277,7 +3278,7 @@ static inline int calculate_order(unsign
> > >  			order = slab_order(size, min_objects,
> > >  					slub_max_order, fraction, reserved);
> > >  			if (order <= slub_max_order)
> > > -				return order;
> > > +				goto ret_order;
> > >  			fraction /= 2;
> > >  		}
> > >  		min_objects--;
> > > @@ -3289,15 +3290,25 @@ static inline int calculate_order(unsign
> > >  	 */
> > >  	order = slab_order(size, 1, slub_max_order, 1, reserved);
> > 
> > The slab order is determined in slab_order()
> > 
> > >  	if (order <= slub_max_order)
> > > -		return order;
> > > +		goto ret_order;
> > >
> > >  	/*
> > >  	 * Doh this slab cannot be placed using slub_max_order.
> > >  	 */
> > >  	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
> > > -	if (order < MAX_ORDER)
> > > -		return order;
> > > -	return -ENOSYS;
> > > +	if (order >= MAX_ORDER)
> > > +		return -ENOSYS;
> > > +
> > > +ret_order:
> > > +	for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
> > > +		unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
> > > +		unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
> > > +		if (test_order_objects > min(32, MAX_OBJS_PER_PAGE))
> > > +			break;
> > > +		if (test_order_objects > order_objects << (test_order - order))
> > > +			order = test_order;
> > > +	}
> > > +	return order;
> > 
> > Could yo move that logic into slab_order()? It does something awfully
> > similar.
> 
> But slab_order (and its caller) limits the order to "max_order" and we 
> want more.
> 
> Perhaps slab_order should be dropped and calculate_order totally 
> rewritten?
> 
> Mikulas

Do you want this? It deletes slab_order and replaces it with the 
"minimize_waste" logic directly.

The patch starts with a minimal order for a given size and increases the 
order if one of these conditions is met:
* we are below slub_min_order
* we are below min_objects and slub_max_order
* we go above slub_max_order only if it minimizes waste and if we don't 
  increase the object count above 32

It simplifies the code and it is very similar to the old algorithm - most 
slab caches keep the same order as before, so it shouldn't cause any regressions.

This patch changes order of these slabs:
TCPv6: 3 -> 4
sighand_cache: 3 -> 4
task_struct: 3 -> 4

---
 mm/slub.c |   76 +++++++++++++++++++++-----------------------------------------
 1 file changed, 26 insertions(+), 50 deletions(-)

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2018-04-26 00:07:30.000000000 +0200
+++ linux-2.6/mm/slub.c	2018-04-26 00:21:37.000000000 +0200
@@ -3224,34 +3224,10 @@ static unsigned int slub_min_objects;
  * requested a higher mininum order then we start with that one instead of
  * the smallest order which will fit the object.
  */
-static inline unsigned int slab_order(unsigned int size,
-		unsigned int min_objects, unsigned int max_order,
-		unsigned int fract_leftover, unsigned int reserved)
-{
-	unsigned int min_order = slub_min_order;
-	unsigned int order;
-
-	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
-		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
-
-	for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved));
-			order <= max_order; order++) {
-
-		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
-		unsigned int rem;
-
-		rem = (slab_size - reserved) % size;
-
-		if (rem <= slab_size / fract_leftover)
-			break;
-	}
-
-	return order;
-}
-
 static inline int calculate_order(unsigned int size, unsigned int reserved)
 {
 	unsigned int order;
+	unsigned int test_order;
 	unsigned int min_objects;
 	unsigned int max_objects;
 
@@ -3269,35 +3245,35 @@ static inline int calculate_order(unsign
 	max_objects = order_objects(slub_max_order, size, reserved);
 	min_objects = min(min_objects, max_objects);
 
-	while (min_objects > 1) {
-		unsigned int fraction;
+	/* Get the minimum acceptable order for one object */
+	order = get_order(size + reserved);
+
+	for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
+		unsigned order_obj = order_objects(order, size, reserved);
+		unsigned test_order_obj = order_objects(test_order, size, reserved);
+
+		/* If there are too many objects, stop searching */
+		if (test_order_obj > MAX_OBJS_PER_PAGE)
+			break;
 
-		fraction = 16;
-		while (fraction >= 4) {
-			order = slab_order(size, min_objects,
-					slub_max_order, fraction, reserved);
-			if (order <= slub_max_order)
-				return order;
-			fraction /= 2;
-		}
-		min_objects--;
+		/* Always increase up to slub_min_order */
+		if (test_order <= slub_min_order)
+			order = test_order;
+
+		/* If we are below min_objects and slub_max_order, increase order */
+		if (order_obj < min_objects && test_order <= slub_max_order)
+			order = test_order;
+
+		/* Increase order even more, but only if it reduces waste */
+		if (test_order_obj <= 32 &&
+		    test_order_obj > order_obj << (test_order - order))
+			order = test_order;
 	}
 
-	/*
-	 * We were unable to place multiple objects in a slab. Now
-	 * lets see if we can place a single object there.
-	 */
-	order = slab_order(size, 1, slub_max_order, 1, reserved);
-	if (order <= slub_max_order)
-		return order;
+	if (order >= MAX_ORDER)
+		return -ENOSYS;
 
-	/*
-	 * Doh this slab cannot be placed using slub_max_order.
-	 */
-	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
-	if (order < MAX_ORDER)
-		return order;
-	return -ENOSYS;
+	return order;
 }
 
 static void

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-25 21:04                                       ` Mikulas Patocka
  2018-04-25 23:24                                         ` Mikulas Patocka
@ 2018-04-26 18:51                                         ` Christopher Lameter
  1 sibling, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-26 18:51 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel

On Wed, 25 Apr 2018, Mikulas Patocka wrote:

> >
> > Could yo move that logic into slab_order()? It does something awfully
> > similar.
>
> But slab_order (and its caller) limits the order to "max_order" and we
> want more.
>
> Perhaps slab_order should be dropped and calculate_order totally
> rewritten?

Yes you likely need to do something creative with max_order if not with
more stuff.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-25 23:24                                         ` Mikulas Patocka
@ 2018-04-26 19:01                                           ` Christopher Lameter
  2018-04-26 21:09                                             ` Mikulas Patocka
  0 siblings, 1 reply; 109+ messages in thread
From: Christopher Lameter @ 2018-04-26 19:01 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel

On Wed, 25 Apr 2018, Mikulas Patocka wrote:

> Do you want this? It deletes slab_order and replaces it with the
> "minimize_waste" logic directly.

Well yes that looks better. Now we need to make it easy to read and less
complicated. Maybe try to keep as much as possible of the old code
and also the names of variables to make it easier to review?

> It simplifies the code and it is very similar to the old algorithms, most
> slab caches have the same order, so it shouldn't cause any regressions.
>
> This patch changes order of these slabs:
> TCPv6: 3 -> 4
> sighand_cache: 3 -> 4
> task_struct: 3 -> 4

Hmmm... order 4 for these caches may cause some concern. These should stay
under costly order I think. Otherwise allocations are no longer
guaranteed.

> @@ -3269,35 +3245,35 @@ static inline int calculate_order(unsign
>  	max_objects = order_objects(slub_max_order, size, reserved);
>  	min_objects = min(min_objects, max_objects);
>
> -	while (min_objects > 1) {
> -		unsigned int fraction;
> +	/* Get the minimum acceptable order for one object */
> +	order = get_order(size + reserved);
> +
> +	for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
> +		unsigned order_obj = order_objects(order, size, reserved);
> +		unsigned test_order_obj = order_objects(test_order, size, reserved);
> +
> +		/* If there are too many objects, stop searching */
> +		if (test_order_obj > MAX_OBJS_PER_PAGE)
> +			break;
>
> -		fraction = 16;
> -		while (fraction >= 4) {
> -			order = slab_order(size, min_objects,
> -					slub_max_order, fraction, reserved);
> -			if (order <= slub_max_order)
> -				return order;
> -			fraction /= 2;
> -		}
> -		min_objects--;
> +		/* Always increase up to slub_min_order */
> +		if (test_order <= slub_min_order)
> +			order = test_order;

Well that is a significant change. In our current scheme the order
boundary wins.


> +
> +		/* If we are below min_objects and slub_max_order, increase order */
> +		if (order_obj < min_objects && test_order <= slub_max_order)
> +			order = test_order;
> +
> +		/* Increase order even more, but only if it reduces waste */
> +		if (test_order_obj <= 32 &&

Where does the 32 come from?

> +		    test_order_obj > order_obj << (test_order - order))

Add more () to make the condition more readable.

> +			order = test_order;

Can we just call test_order order and avoid using the long variable names
here? Variable names in functions are typically short.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-26 19:01                                           ` Christopher Lameter
@ 2018-04-26 21:09                                             ` Mikulas Patocka
  2018-04-27 16:41                                               ` Christopher Lameter
  0 siblings, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-26 21:09 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel



On Thu, 26 Apr 2018, Christopher Lameter wrote:

> On Wed, 25 Apr 2018, Mikulas Patocka wrote:
> 
> > Do you want this? It deletes slab_order and replaces it with the
> > "minimize_waste" logic directly.
> 
> Well yes that looks better. Now we need to make it easy to read and less
> complicated. Maybe try to keep as much as possible of the old code
> and also the names of variables to make it easier to review?
> 
> > It simplifies the code and it is very similar to the old algorithms, most
> > slab caches have the same order, so it shouldn't cause any regressions.
> >
> > This patch changes order of these slabs:
> > TCPv6: 3 -> 4
> > sighand_cache: 3 -> 4
> > task_struct: 3 -> 4
> 
> Hmmm... order 4 for these caches may cause some concern. These should stay
> under costly order I think. Otherwise allocations are no longer
> guaranteed.

You said that slub has fallback to smaller order allocations.

The whole purpose of this "minimize waste" approach is to use higher-order 
allocations to use memory more efficiently, so it is just doing its job. 
(for these 3 caches, order-4 really wastes less memory than order-3 - on 
my system TCPv6 and sighand_cache have size 2112, task_struct 2752).
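
(Roughly, with 4KB pages and ignoring the reserved area: an order-3 slab
is 32KB and holds 15 objects of size 2112, wasting 1088 bytes, while an
order-4 slab holds 31 of them and wastes only 64 bytes. For task_struct
at 2752 bytes, order-3 holds 11 objects and wastes 2496 bytes, order-4
holds 23 and wastes 2240 bytes.)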

We could improve the fallback code, so that if order-4 allocation fails, 
it tries order-3 allocation, and then falls back to order-0. But I think 
that these failures are rare enough that it is not a problem.

> > @@ -3269,35 +3245,35 @@ static inline int calculate_order(unsign
> >  	max_objects = order_objects(slub_max_order, size, reserved);
> >  	min_objects = min(min_objects, max_objects);
> >
> > -	while (min_objects > 1) {
> > -		unsigned int fraction;
> > +	/* Get the minimum acceptable order for one object */
> > +	order = get_order(size + reserved);
> > +
> > +	for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
> > +		unsigned order_obj = order_objects(order, size, reserved);
> > +		unsigned test_order_obj = order_objects(test_order, size, reserved);
> > +
> > +		/* If there are too many objects, stop searching */
> > +		if (test_order_obj > MAX_OBJS_PER_PAGE)
> > +			break;
> >
> > -		fraction = 16;
> > -		while (fraction >= 4) {
> > -			order = slab_order(size, min_objects,
> > -					slub_max_order, fraction, reserved);
> > -			if (order <= slub_max_order)
> > -				return order;
> > -			fraction /= 2;
> > -		}
> > -		min_objects--;
> > +		/* Always increase up to slub_min_order */
> > +		if (test_order <= slub_min_order)
> > +			order = test_order;
> 
> Well that is a significant change. In our current scheme the order
> boundary wins.

I think it's not a change. The existing function slab_order() starts with 
min_order (unless it overshoots MAX_OBJS_PER_PAGE) and then goes upwards. 
My code does the same - my code tests for MAX_OBJS_PER_PAGE (and bails out 
if we would overshoot it) and increases the order until it reaches 
slub_min_order (and then increases it even more if it satisfies the other 
conditions).

If you believe that it behaves differently, please describe the situation 
in detail.

> > +
> > +		/* If we are below min_objects and slub_max_order, increase order */
> > +		if (order_obj < min_objects && test_order <= slub_max_order)
> > +			order = test_order;
> > +
> > +		/* Increase order even more, but only if it reduces waste */
> > +		if (test_order_obj <= 32 &&
> 
> Where does the 32 come from?

It is to avoid extremely high order for extremely small slabs.

For example, see kmalloc-96.
10922 96-byte objects would fit into 1MiB
21845 96-byte objects would fit into 2MiB

The algorithm would count the one extra object that fits into a 2MiB slab 
as a "waste reduction" and increase the slab size to 2MiB - and we don't 
want that.

So, the general reasoning is - once we have 32 objects in a slab, the 
wasted space is already considered reasonably low and we don't want to 
increase the order any further.

Currently, kmalloc-96 uses order-0 - that is reasonable (we already have 
42 objects in a 4k page, so we don't need a higher order, even if it would 
waste one object less).
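
Concretely, with 4KB pages: order-0 already holds 42 kmalloc-96 objects
(only 64 bytes wasted), but an order-1 slab would hold 85 (one more than
2 * 42), an order-3 slab 341 (one more than 4 * 85), and so on - the
"one more object fits" test alone would keep pushing the order up. The
32-object cutoff stops this immediately, because 85 > 32.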

> > +		    test_order_obj > order_obj << (test_order - order))
> 
> Add more () to make the condition more readable.
> 
> > +			order = test_order;
> 
> Can we just call test_order order and avoid using the long variable names
> here? Variable names in functions are typically short.

You need two variables - "order" and "test_order".

"order" is the best order found so far and "test_order" is the order that 
we are now testing. If "test_order" wastes less space than "order", we 
assign order = test_order.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-26 21:09                                             ` Mikulas Patocka
@ 2018-04-27 16:41                                               ` Christopher Lameter
  2018-04-27 19:19                                                 ` Mikulas Patocka
  0 siblings, 1 reply; 109+ messages in thread
From: Christopher Lameter @ 2018-04-27 16:41 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel

On Thu, 26 Apr 2018, Mikulas Patocka wrote:

> > Hmmm... order 4 for these caches may cause some concern. These should stay
> > under costly order I think. Otherwise allocations are no longer
> > guaranteed.
>
> You said that slub has fallback to smaller order allocations.

Yes it does...

> The whole purpose of this "minimize waste" approach is to use higher-order
> allocations to use memory more efficiently, so it is just doing its job.
> (for these 3 caches, order-4 really wastes less memory than order-3 - on
> my system TCPv6 and sighand_cache have size 2112, task_struct 2752).

Hmmm... Ok if the others are fine with this as well. I got some pushback
there in the past.

> We could improve the fallback code, so that if order-4 allocation fails,
> it tries order-3 allocation, and then falls back to order-0. But I think
> that these failures are rare enough that it is not a problem.

I also think that would be too many fallbacks.

> > > +		/* Increase order even more, but only if it reduces waste */
> > > +		if (test_order_obj <= 32 &&
> >
> > Where does the 32 come from?
>
> It is to avoid extremely high order for extremely small slabs.
>
> For example, see kmalloc-96.
> 10922 96-byte objects would fit into 1MiB
> 21845 96-byte objects would fit into 2MiB

That is the result of considering absolute byte wastage.

> The algorithm would recognize this one more object that fits into 2MiB
> slab as "waste reduction" and increase the order to 2MiB - and we don't
> want this.
>
> So, the general reasoning is - if we have 32 objects in a slab, then it is
> already considered that wasted space is reasonably low and we don't want
> to increase the order more.
>
> Currently, kmalloc-96 uses order-0 - that is reasonable (we already have
> 42 objects in 4k page, so we don't need to use higher order, even if it
> wastes one-less object).


The old code uses the concept of a "fraction" to calculate overhead. The
code here uses absolute counts of bytes. Fraction looks better to me.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-27 16:41                                               ` Christopher Lameter
@ 2018-04-27 19:19                                                 ` Mikulas Patocka
  2018-06-13 17:01                                                   ` Mikulas Patocka
  0 siblings, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-27 19:19 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel



On Fri, 27 Apr 2018, Christopher Lameter wrote:

> On Thu, 26 Apr 2018, Mikulas Patocka wrote:
> 
> > > Hmmm... order 4 for these caches may cause some concern. These should stay
> > > under costly order I think. Otherwise allocations are no longer
> > > guaranteed.
> >
> > You said that slub has fallback to smaller order allocations.
> 
> Yes it does...
> 
> > The whole purpose of this "minimize waste" approach is to use higher-order
> > allocations to use memory more efficiently, so it is just doing its job.
> > (for these 3 caches, order-4 really wastes less memory than order-3 - on
> > my system TCPv6 and sighand_cache have size 2112, task_struct 2752).
> 
> Hmmm... Ok if the others are fine with this as well. I got some pushback
> there in the past.
> 
> > We could improve the fallback code, so that if order-4 allocation fails,
> > it tries order-3 allocation, and then falls back to order-0. But I think
> > that these failures are rare enough that it is not a problem.
> 
> I also think that would be too many fallbacks.

You are right - it's better to fall back to the minimum possible size, so 
that the allocation is faster.

> The old code uses the concept of a "fraction" to calculate overhead. The
> code here uses absolute counts of bytes. Fraction looks better to me.

OK - I reworked the patch using the same "fraction" calculation as before.  
The existing logic targets 1/16 wasted space, so I used this target in 
this patch too.

This patch increases only the order of task_struct (from 3 to 4); all the 
other caches have the same order as before.

Mikulas



From: Mikulas Patocka <mpatocka@redhat.com>
Subject: [PATCH] slub: use higher order to reduce wasted space

If we create a slub cache with a large object size (larger than what
slub_max_order allows), the slub subsystem currently rounds up the slab
size to the next power of two.

This is inefficient, because it wastes too much space. We use the slab
cache as a buffer cache in dm-bufio; in order to use the memory
efficiently, we need to reduce wasted space.

This patch reworks the slub order calculation algorithm, so that it uses
higher-order allocations if that would reduce wasted space. The slub
subsystem has a fallback if a higher-order allocation fails, so using an
order higher than PAGE_ALLOC_COSTLY_ORDER is ok.

The new algorithm first calculates the minimum order that can be used for
a given object size and then increases the order according to these
conditions:
* if we would overshoot MAX_OBJS_PER_PAGE, don't increase
* if we are below slub_min_order, increase
* if we are below slub_max_order and below min_objects, increase
* we increase above slub_max_order only if it reduces wasted space and if
  we already waste at least 1/16 of the compound page

The new algorithm gives very similar results to the old one, all the
caches on my system have the same order as before, only the order of
task_struct (size 2752) is increased from 3 to 4.
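
For example, with 4KB pages and ignoring the reserved area: task_struct
(2752 bytes) wastes 2496 bytes in an order-3 slab, which is more than
32768/16 = 2048, and an order-4 slab holds 23 objects (more than 2 * 11),
so the order is raised to 4; there only 2240 bytes are wasted, under the
1/16 threshold, so it stops. TCPv6 and sighand_cache (2112 bytes) waste
only 1088 bytes at order 3, already below 1/16, so their order stays at 3.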

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 mm/slub.c |   82 +++++++++++++++++++++++---------------------------------------
 1 file changed, 31 insertions(+), 51 deletions(-)

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2018-04-27 19:30:34.000000000 +0200
+++ linux-2.6/mm/slub.c	2018-04-27 21:05:53.000000000 +0200
@@ -3224,34 +3224,10 @@ static unsigned int slub_min_objects;
  * requested a higher mininum order then we start with that one instead of
  * the smallest order which will fit the object.
  */
-static inline unsigned int slab_order(unsigned int size,
-		unsigned int min_objects, unsigned int max_order,
-		unsigned int fract_leftover, unsigned int reserved)
+static int calculate_order(unsigned int size, unsigned int reserved)
 {
-	unsigned int min_order = slub_min_order;
-	unsigned int order;
-
-	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
-		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
-
-	for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved));
-			order <= max_order; order++) {
-
-		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
-		unsigned int rem;
-
-		rem = (slab_size - reserved) % size;
-
-		if (rem <= slab_size / fract_leftover)
-			break;
-	}
-
-	return order;
-}
-
-static inline int calculate_order(unsigned int size, unsigned int reserved)
-{
-	unsigned int order;
+	unsigned int best_order;
+	unsigned int test_order;
 	unsigned int min_objects;
 	unsigned int max_objects;
 
@@ -3269,34 +3245,38 @@ static inline int calculate_order(unsign
 	max_objects = order_objects(slub_max_order, size, reserved);
 	min_objects = min(min_objects, max_objects);
 
-	while (min_objects > 1) {
-		unsigned int fraction;
+	/* Get the minimum acceptable order for one object */
+	best_order = get_order(size + reserved);
+
+	for (test_order = best_order + 1; test_order < MAX_ORDER; test_order++) {
+		unsigned best_order_obj = order_objects(best_order, size, reserved);
+		unsigned test_order_obj = order_objects(test_order, size, reserved);
+
+		unsigned best_order_slab_size = (unsigned int)PAGE_SIZE << best_order;
+		unsigned best_order_rem = (best_order_slab_size - reserved) % size;
+
+		/* If there would be too many objects, stop searching */
+		if (test_order_obj > MAX_OBJS_PER_PAGE)
+			break;
 
-		fraction = 16;
-		while (fraction >= 4) {
-			order = slab_order(size, min_objects,
-					slub_max_order, fraction, reserved);
-			if (order <= slub_max_order)
-				return order;
-			fraction /= 2;
-		}
-		min_objects--;
+		/* Always increase up to slub_min_order */
+		if (test_order <= slub_min_order)
+			best_order = test_order;
+
+		/* If we are below min_objects and slub_max_order, increase the order */
+		if (best_order_obj < min_objects && test_order <= slub_max_order)
+			best_order = test_order;
+
+		/* Increase the order even more, but only if it reduces waste */
+		/* If we already waste less than 1/16, don't increase it */
+		if (best_order_rem >= (best_order_slab_size / 16) &&
+		    test_order_obj > (best_order_obj << (test_order - best_order)))
+			best_order = test_order;
 	}
 
-	/*
-	 * We were unable to place multiple objects in a slab. Now
-	 * lets see if we can place a single object there.
-	 */
-	order = slab_order(size, 1, slub_max_order, 1, reserved);
-	if (order <= slub_max_order)
-		return order;
+	if (best_order < MAX_ORDER)
+		return best_order;
 
-	/*
-	 * Doh this slab cannot be placed using slub_max_order.
-	 */
-	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
-	if (order < MAX_ORDER)
-		return order;
 	return -ENOSYS;
 }
 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-04-27 19:19                                                 ` Mikulas Patocka
@ 2018-06-13 17:01                                                   ` Mikulas Patocka
  2018-06-13 18:16                                                     ` Christoph Hellwig
  0 siblings, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-06-13 17:01 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
	linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
	linux-kernel

Hi

I'd like to ask about this patch - will you commit it, or do you want to 
make some more changes to it?

Mikulas




On Fri, 27 Apr 2018, Mikulas Patocka wrote:

> 
> 
> On Fri, 27 Apr 2018, Christopher Lameter wrote:
> 
> > On Thu, 26 Apr 2018, Mikulas Patocka wrote:
> > 
> > > > Hmmm... order 4 for these caches may cause some concern. These should stay
> > > > under costly order I think. Otherwise allocations are no longer
> > > > guaranteed.
> > >
> > > You said that slub has fallback to smaller order allocations.
> > 
> > Yes it does...
> > 
> > > The whole purpose of this "minimize waste" approach is to use higher-order
> > > allocations to use memory more efficiently, so it is just doing its job.
> > > (for these 3 caches, order-4 really wastes less memory than order-3 - on
> > > my system TCPv6 and sighand_cache have size 2112, task_struct 2752).
> > 
> > Hmmm... Ok if the others are fine with this as well. I got some pushback
> > there in the past.
> > 
> > > We could improve the fallback code, so that if order-4 allocation fails,
> > > it tries order-3 allocation, and then falls back to order-0. But I think
> > > that these failures are rare enough that it is not a problem.
> > 
> > I also think that would be too many fallbacks.
> 
> You are right - it's better to fallback to the minimum possible size, so 
> that the allocation is faster.
> 
> > The old code uses the concept of a "fraction" to calculate overhead. The
> > code here uses absolute counts of bytes. Fraction looks better to me.
> 
> OK - I reworked the patch using the same "fraction" calculation as before.  
> The existing logic targets 1/16 wasted space, so I used this target in 
> this patch too.
> 
> This patch increases only the order of task_struct (from 3 to 4); all the 
> other caches have the same order as before.
> 
> Mikulas
> 
> 
> 
> From: Mikulas Patocka <mpatocka@redhat.com>
> Subject: [PATCH] slub: use higher order to reduce wasted space
> 
> If we create a slub cache with a large object size (larger than what
> slub_max_order allows), the slub subsystem currently rounds up the slab
> size to the next power of two.
> 
> This is inefficient, because it wastes too much space. We use the slab
> cache as a buffer cache in dm-bufio; in order to use the memory
> efficiently, we need to reduce wasted space.
> 
> This patch reworks the slub order calculation algorithm, so that it uses
> higher-order allocations if that would reduce wasted space. The slub
> subsystem has a fallback if a higher-order allocation fails, so using an
> order higher than PAGE_ALLOC_COSTLY_ORDER is ok.
> 
> The new algorithm first calculates the minimum order that can be used for
> a given object size and then increases the order according to these
> conditions:
> * if we would overshoot MAX_OBJS_PER_PAGE, don't increase
> * if we are below slub_min_order, increase
> * if we are below slub_max_order and below min_objects, increase
> * we increase above slub_max_order only if it reduces wasted space and if
>   we already waste at least 1/16 of the compound page
> 
> The new algorithm gives very similar results to the old one, all the
> caches on my system have the same order as before, only the order of
> task_struct (size 2752) is increased from 3 to 4.
> 
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> 
> ---
>  mm/slub.c |   82 +++++++++++++++++++++++---------------------------------------
>  1 file changed, 31 insertions(+), 51 deletions(-)
> 
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c	2018-04-27 19:30:34.000000000 +0200
> +++ linux-2.6/mm/slub.c	2018-04-27 21:05:53.000000000 +0200
> @@ -3224,34 +3224,10 @@ static unsigned int slub_min_objects;
>   * requested a higher mininum order then we start with that one instead of
>   * the smallest order which will fit the object.
>   */
> -static inline unsigned int slab_order(unsigned int size,
> -		unsigned int min_objects, unsigned int max_order,
> -		unsigned int fract_leftover, unsigned int reserved)
> +static int calculate_order(unsigned int size, unsigned int reserved)
>  {
> -	unsigned int min_order = slub_min_order;
> -	unsigned int order;
> -
> -	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
> -		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
> -
> -	for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved));
> -			order <= max_order; order++) {
> -
> -		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
> -		unsigned int rem;
> -
> -		rem = (slab_size - reserved) % size;
> -
> -		if (rem <= slab_size / fract_leftover)
> -			break;
> -	}
> -
> -	return order;
> -}
> -
> -static inline int calculate_order(unsigned int size, unsigned int reserved)
> -{
> -	unsigned int order;
> +	unsigned int best_order;
> +	unsigned int test_order;
>  	unsigned int min_objects;
>  	unsigned int max_objects;
>  
> @@ -3269,34 +3245,38 @@ static inline int calculate_order(unsign
>  	max_objects = order_objects(slub_max_order, size, reserved);
>  	min_objects = min(min_objects, max_objects);
>  
> -	while (min_objects > 1) {
> -		unsigned int fraction;
> +	/* Get the minimum acceptable order for one object */
> +	best_order = get_order(size + reserved);
> +
> +	for (test_order = best_order + 1; test_order < MAX_ORDER; test_order++) {
> +		unsigned best_order_obj = order_objects(best_order, size, reserved);
> +		unsigned test_order_obj = order_objects(test_order, size, reserved);
> +
> +		unsigned best_order_slab_size = (unsigned int)PAGE_SIZE << best_order;
> +		unsigned best_order_rem = (best_order_slab_size - reserved) % size;
> +
> +		/* If there would be too many objects, stop searching */
> +		if (test_order_obj > MAX_OBJS_PER_PAGE)
> +			break;
>  
> -		fraction = 16;
> -		while (fraction >= 4) {
> -			order = slab_order(size, min_objects,
> -					slub_max_order, fraction, reserved);
> -			if (order <= slub_max_order)
> -				return order;
> -			fraction /= 2;
> -		}
> -		min_objects--;
> +		/* Always increase up to slub_min_order */
> +		if (test_order <= slub_min_order)
> +			best_order = test_order;
> +
> +		/* If we are below min_objects and slub_max_order, increase the order */
> +		if (best_order_obj < min_objects && test_order <= slub_max_order)
> +			best_order = test_order;
> +
> +		/* Increase the order even more, but only if it reduces waste */
> +		/* If we already waste less than 1/16, don't increase it */
> +		if (best_order_rem >= (best_order_slab_size / 16) &&
> +		    test_order_obj > (best_order_obj << (test_order - best_order)))
> +			best_order = test_order;
>  	}
>  
> -	/*
> -	 * We were unable to place multiple objects in a slab. Now
> -	 * lets see if we can place a single object there.
> -	 */
> -	order = slab_order(size, 1, slub_max_order, 1, reserved);
> -	if (order <= slub_max_order)
> -		return order;
> +	if (best_order < MAX_ORDER)
> +		return best_order;
>  
> -	/*
> -	 * Doh this slab cannot be placed using slub_max_order.
> -	 */
> -	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
> -	if (order < MAX_ORDER)
> -		return order;
>  	return -ENOSYS;
>  }
>  
> 

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-06-13 17:01                                                   ` Mikulas Patocka
@ 2018-06-13 18:16                                                     ` Christoph Hellwig
  2018-06-13 18:53                                                       ` Mikulas Patocka
  0 siblings, 1 reply; 109+ messages in thread
From: Christoph Hellwig @ 2018-06-13 18:16 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Christopher Lameter, Mike Snitzer, Vlastimil Babka,
	Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel, David Rientjes,
	Joonsoo Kim, Andrew Morton, linux-kernel

On Wed, Jun 13, 2018 at 01:01:22PM -0400, Mikulas Patocka wrote:
> Hi
> 
> I'd like to ask about this patch - will you commit it, or do you want to 
> make some more changes to it?

How about you resend it with the series adding an actual user once
ready?  I haven't actually seen patches using it posted on any list yet.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
  2018-06-13 18:16                                                     ` Christoph Hellwig
@ 2018-06-13 18:53                                                       ` Mikulas Patocka
  0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-06-13 18:53 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Christopher Lameter, Mike Snitzer, Vlastimil Babka,
	Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel, David Rientjes,
	Joonsoo Kim, Andrew Morton, linux-kernel



On Wed, 13 Jun 2018, Christoph Hellwig wrote:

> On Wed, Jun 13, 2018 at 01:01:22PM -0400, Mikulas Patocka wrote:
> > Hi
> > 
> > I'd like to ask about this patch - will you commit it, or do you want to 
> > make some more changes to it?
> 
> How about you resend it with the series adding an actual user once
> ready?  I haven't actually seen patches using it posted on any list yet.

dm-bufio is already using it. Starting with kernel 4.17 (f51f2e0a7fb1 
- "dm bufio: support non-power-of-two block sizes"), dm-bufio has the 
capability to use non-power-of-two buffers. It uses a slab cache for its 
buffers - so we would like to have this slab optimization to avoid 
wasting excessive memory.
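
For illustration only, a minimal sketch of such a non-power-of-two cache
(the names and flags here are made up for the example, this is not the
actual dm-bufio code):

	#include <linux/slab.h>

	struct kmem_cache *c;
	void *buf;

	/* 640KB objects; without the order rework this is rounded up to
	   a 1MB slab, wasting 384KB per object */
	c = kmem_cache_create("example-640k", 640 * 1024, 0,
			      SLAB_RECLAIM_ACCOUNT, NULL);
	buf = kmem_cache_alloc(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
	if (!buf) {
		/* the higher-order allocation failed - reuse an existing
		   buffer or fall back to a vmalloc-ed one */
	}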

Originally, the slub patch used a new flag SLAB_MINIMIZE_WASTE, but after 
a suggestion from others, I reworked the patch so that it minimizes waste 
of all slub caches and doesn't need an extra flag to activate.

Mikulas

^ permalink raw reply	[flat|nested] 109+ messages in thread

end of thread, other threads:[~2018-06-13 18:53 UTC | newest]

Thread overview: 109+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-20 17:25 [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE Mikulas Patocka
2018-03-20 17:35 ` Matthew Wilcox
2018-03-20 17:35   ` Matthew Wilcox
2018-03-20 17:54   ` Christopher Lameter
2018-03-20 17:54     ` Christopher Lameter
2018-03-20 19:22     ` Mikulas Patocka
2018-03-20 19:22       ` Mikulas Patocka
2018-03-20 20:42       ` Christopher Lameter
2018-03-20 20:42         ` Christopher Lameter
2018-03-20 22:02         ` Mikulas Patocka
2018-03-20 22:02           ` Mikulas Patocka
2018-03-21 15:35           ` Christopher Lameter
2018-03-21 15:35             ` Christopher Lameter
2018-03-21 16:25             ` Mikulas Patocka
2018-03-21 16:25               ` Mikulas Patocka
2018-03-21 17:10               ` Matthew Wilcox
2018-03-21 17:10                 ` Matthew Wilcox
2018-03-21 17:30               ` Christopher Lameter
2018-03-21 17:30                 ` Christopher Lameter
2018-03-21 17:39                 ` Christopher Lameter
2018-03-21 17:39                   ` Christopher Lameter
2018-03-21 17:49                   ` Matthew Wilcox
2018-03-21 17:49                     ` Matthew Wilcox
2018-03-21 18:01                     ` Christopher Lameter
2018-03-21 18:01                       ` Christopher Lameter
2018-03-21 18:23                     ` Mikulas Patocka
2018-03-21 18:23                       ` Mikulas Patocka
2018-03-21 18:40                       ` Christopher Lameter
2018-03-21 18:40                         ` Christopher Lameter
2018-03-21 18:55                         ` Mikulas Patocka
2018-03-21 18:55                           ` Mikulas Patocka
2018-03-21 18:55                         ` Matthew Wilcox
2018-03-21 18:55                           ` Matthew Wilcox
2018-03-21 18:58                           ` Christopher Lameter
2018-03-21 18:58                             ` Christopher Lameter
2018-03-21 19:25                   ` Mikulas Patocka
2018-03-21 19:25                     ` Mikulas Patocka
2018-03-21 18:36                 ` Mikulas Patocka
2018-03-21 18:36                   ` Mikulas Patocka
2018-03-21 18:57                   ` Christopher Lameter
2018-03-21 18:57                     ` Christopher Lameter
2018-03-21 19:19                     ` Mikulas Patocka
2018-03-21 19:19                       ` Mikulas Patocka
2018-03-21 20:09                       ` Christopher Lameter
2018-03-21 20:09                         ` Christopher Lameter
2018-03-21 20:37                         ` Mikulas Patocka
2018-03-21 20:37                           ` Mikulas Patocka
2018-03-23 15:10                           ` Christopher Lameter
2018-03-23 15:10                             ` Christopher Lameter
2018-03-23 15:31                             ` Mikulas Patocka
2018-03-23 15:31                               ` Mikulas Patocka
2018-03-23 15:48                               ` Christopher Lameter
2018-03-23 15:48                                 ` Christopher Lameter
2018-04-13  9:22                   ` Vlastimil Babka
2018-04-13  9:22                     ` Vlastimil Babka
2018-04-13 15:10                     ` Mike Snitzer
2018-04-13 15:10                       ` Mike Snitzer
2018-04-16 12:38                       ` Vlastimil Babka
2018-04-16 12:38                         ` Vlastimil Babka
2018-04-16 14:27                         ` Mike Snitzer
2018-04-16 14:27                           ` Mike Snitzer
2018-04-16 14:37                           ` Mikulas Patocka
2018-04-16 14:37                             ` Mikulas Patocka
2018-04-16 14:46                             ` Mike Snitzer
2018-04-16 14:46                               ` Mike Snitzer
2018-04-16 14:57                               ` Mikulas Patocka
2018-04-16 14:57                                 ` Mikulas Patocka
2018-04-16 15:18                                 ` Christopher Lameter
2018-04-16 15:18                                   ` Christopher Lameter
2018-04-16 15:25                                   ` Mikulas Patocka
2018-04-16 15:25                                     ` Mikulas Patocka
2018-04-16 15:45                                     ` Christopher Lameter
2018-04-16 15:45                                       ` Christopher Lameter
2018-04-16 19:36                                       ` Mikulas Patocka
2018-04-16 19:36                                         ` Mikulas Patocka
2018-04-16 19:53                                         ` Vlastimil Babka
2018-04-16 21:01                                           ` Mikulas Patocka
2018-04-17 14:40                                             ` Christopher Lameter
2018-04-17 18:53                                               ` Mikulas Patocka
2018-04-17 18:53                                                 ` Mikulas Patocka
2018-04-17 21:42                                                 ` Christopher Lameter
2018-04-17 14:49                                           ` Christopher Lameter
2018-04-17 14:49                                             ` Christopher Lameter
2018-04-17 14:47                                         ` Christopher Lameter
2018-04-17 14:47                                           ` Christopher Lameter
2018-04-16 19:32                               ` [PATCH RESEND] " Mikulas Patocka
2018-04-17 14:45                                 ` Christopher Lameter
2018-04-17 16:16                                   ` Vlastimil Babka
2018-04-17 16:38                                     ` Christopher Lameter
2018-04-17 19:09                                       ` Mikulas Patocka
2018-04-17 17:26                                     ` Mikulas Patocka
2018-04-17 19:13                                       ` Vlastimil Babka
2018-04-17 19:06                                   ` Mikulas Patocka
2018-04-17 19:06                                     ` Mikulas Patocka
2018-04-18 14:55                                     ` Christopher Lameter
2018-04-25 21:04                                       ` Mikulas Patocka
2018-04-25 23:24                                         ` Mikulas Patocka
2018-04-26 19:01                                           ` Christopher Lameter
2018-04-26 21:09                                             ` Mikulas Patocka
2018-04-27 16:41                                               ` Christopher Lameter
2018-04-27 19:19                                                 ` Mikulas Patocka
2018-06-13 17:01                                                   ` Mikulas Patocka
2018-06-13 18:16                                                     ` Christoph Hellwig
2018-06-13 18:53                                                       ` Mikulas Patocka
2018-04-26 18:51                                         ` Christopher Lameter
2018-04-16 19:38                             ` Vlastimil Babka
2018-04-16 19:38                               ` Vlastimil Babka
2018-04-16 21:04                         ` Mikulas Patocka
2018-04-16 21:04                           ` Mikulas Patocka
