* [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-20 17:25 Mikulas Patocka
2018-03-20 17:35 ` Matthew Wilcox
0 siblings, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-20 17:25 UTC (permalink / raw)
To: Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton
Cc: linux-mm, dm-devel, Mike Snitzer
Hi
I'm submitting this patch for the slab allocator. The patch adds a new
flag SLAB_MINIMIZE_WASTE. When this flag is present, the slab subsystem
will use higher-order allocations to minimize wasted space (it will not
use higher-order allocations when there's no benefit, such as if the
object size is a power of two).
The reason why we need this is that we are going to merge code that does
block device deduplication (it was developed separately and sold as a
commercial product), and the code uses block sizes that are not a power of
two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab
allocator rounds up the allocation to the nearest power of two, but that
wastes a lot of memory. Performance of the solution depends on efficient
memory usage, so we should minimize waste as much as possible.
Larger-order allocations are unreliable (they may fail at any time if the
memory is fragmented), but it is not an issue here - the code preallocates
a few buffers with vmalloc and then allocates buffers from the slab cache.
If the allocation fails due to memory fragmentation, we throw away and
reuse some existing buffer, so there is no functionality loss.
Mikulas
From: Mikulas Patocka <mpatocka@redhat.com>
This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
flag causes allocation of larger slab caches in order to minimize wasted
space.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
include/linux/slab.h | 7 +++++++
mm/slab.c | 4 ++--
mm/slab.h | 7 ++++---
mm/slab_common.c | 2 +-
mm/slub.c | 25 ++++++++++++++++++++-----
5 files changed, 35 insertions(+), 12 deletions(-)
Index: linux-2.6/include/linux/slab.h
===================================================================
--- linux-2.6.orig/include/linux/slab.h 2018-03-20 14:59:20.528030000 +0100
+++ linux-2.6/include/linux/slab.h 2018-03-20 14:59:20.518030000 +0100
@@ -108,6 +108,13 @@
#define SLAB_KASAN 0
#endif
+/*
+ * Use higher order allocations to minimize wasted space.
+ * Note: the allocation is unreliable if this flag is used, the caller
+ * must handle allocation failures gracefully.
+ */
+#define SLAB_MINIMIZE_WASTE ((slab_flags_t __force)0x10000000U)
+
/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U)
Index: linux-2.6/mm/slab_common.c
===================================================================
--- linux-2.6.orig/mm/slab_common.c 2018-03-20 14:59:20.528030000 +0100
+++ linux-2.6/mm/slab_common.c 2018-03-20 14:59:20.518030000 +0100
@@ -52,7 +52,7 @@ static DECLARE_WORK(slab_caches_to_rcu_d
SLAB_FAILSLAB | SLAB_KASAN)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
- SLAB_ACCOUNT)
+ SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
/*
* Merge control. If this is set then no merging of slab caches will occur.
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2018-03-20 14:59:20.528030000 +0100
+++ linux-2.6/mm/slub.c 2018-03-20 14:59:20.518030000 +0100
@@ -3234,7 +3234,7 @@ static inline int slab_order(int size, i
return order;
}
-static inline int calculate_order(int size, int reserved)
+static inline int calculate_order(int size, int reserved, slab_flags_t flags)
{
int order;
int min_objects;
@@ -3261,7 +3261,7 @@ static inline int calculate_order(int si
order = slab_order(size, min_objects,
slub_max_order, fraction, reserved);
if (order <= slub_max_order)
- return order;
+ goto ret_order;
fraction /= 2;
}
min_objects--;
@@ -3273,15 +3273,30 @@ static inline int calculate_order(int si
*/
order = slab_order(size, 1, slub_max_order, 1, reserved);
if (order <= slub_max_order)
- return order;
+ goto ret_order;
/*
* Doh this slab cannot be placed using slub_max_order.
*/
order = slab_order(size, 1, MAX_ORDER, 1, reserved);
if (order < MAX_ORDER)
- return order;
+ goto ret_order;
return -ENOSYS;
+
+ret_order:
+ if (flags & SLAB_MINIMIZE_WASTE) {
+ /* Increase the order if it decreases waste */
+ int test_order;
+ for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
+ unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
+ unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
+ if (test_order_objects >= min(64, MAX_OBJS_PER_PAGE))
+ break;
+ if (test_order_objects > order_objects << (test_order - order))
+ order = test_order;
+ }
+ }
+ return order;
}
static void
@@ -3546,7 +3561,7 @@ static int calculate_sizes(struct kmem_c
if (forced_order >= 0)
order = forced_order;
else
- order = calculate_order(size, s->reserved);
+ order = calculate_order(size, s->reserved, flags);
if (order < 0)
return 0;
Index: linux-2.6/mm/slab.h
===================================================================
--- linux-2.6.orig/mm/slab.h 2018-03-20 14:59:20.528030000 +0100
+++ linux-2.6/mm/slab.h 2018-03-20 14:59:20.518030000 +0100
@@ -142,10 +142,10 @@ static inline slab_flags_t kmem_cache_fl
#if defined(CONFIG_SLAB)
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
- SLAB_ACCOUNT)
+ SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT)
+ SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
#else
#define SLAB_CACHE_FLAGS (0)
#endif
@@ -164,7 +164,8 @@ static inline slab_flags_t kmem_cache_fl
SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | \
- SLAB_ACCOUNT)
+ SLAB_ACCOUNT | \
+ SLAB_MINIMIZE_WASTE)
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
Index: linux-2.6/mm/slab.c
===================================================================
--- linux-2.6.orig/mm/slab.c 2018-03-20 14:59:20.528030000 +0100
+++ linux-2.6/mm/slab.c 2018-03-20 14:59:20.518030000 +0100
@@ -1789,14 +1789,14 @@ static size_t calculate_slab_order(struc
* as GFP_NOFS and we really don't want to have to be allocating
* higher-order pages when we are unable to shrink dcache.
*/
- if (flags & SLAB_RECLAIM_ACCOUNT)
+ if (flags & SLAB_RECLAIM_ACCOUNT && !(flags & SLAB_MINIMIZE_WASTE))
break;
/*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
- if (gfporder >= slab_max_order)
+ if (gfporder >= slab_max_order && !(flags & SLAB_MINIMIZE_WASTE))
break;
/*
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-20 17:25 [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE Mikulas Patocka
@ 2018-03-20 17:35 ` Matthew Wilcox
0 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-20 17:35 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Tue, Mar 20, 2018 at 01:25:09PM -0400, Mikulas Patocka wrote:
> The reason why we need this is that we are going to merge code that does
> block device deduplication (it was developed separatedly and sold as a
> commercial product), and the code uses block sizes that are not a power of
> two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab
> allocator rounds up the allocation to the nearest power of two, but that
> wastes a lot of memory. Performance of the solution depends on efficient
> memory usage, so we should minimize wasted as much as possible.
The SLUB allocator also falls back to using the page (buddy) allocator
for allocations above 8kB, so this patch is going to have no effect on
slub. You'd be better off using alloc_pages_exact() for this kind of
size, or managing your own pool of pages by using something like five
192k blocks in a 1MB allocation.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-20 17:35 ` Matthew Wilcox
0 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-20 17:35 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Andrew Morton, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Christoph Lameter, Joonsoo Kim
On Tue, Mar 20, 2018 at 01:25:09PM -0400, Mikulas Patocka wrote:
> The reason why we need this is that we are going to merge code that does
> block device deduplication (it was developed separatedly and sold as a
> commercial product), and the code uses block sizes that are not a power of
> two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab
> allocator rounds up the allocation to the nearest power of two, but that
> wastes a lot of memory. Performance of the solution depends on efficient
> memory usage, so we should minimize wasted as much as possible.
The SLUB allocator also falls back to using the page (buddy) allocator
for allocations above 8kB, so this patch is going to have no effect on
slub. You'd be better off using alloc_pages_exact() for this kind of
size, or managing your own pool of pages by using something like five
192k blocks in a 1MB allocation.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-20 17:35 ` Matthew Wilcox
@ 2018-03-20 17:54 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-20 17:54 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Mikulas Patocka, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Tue, 20 Mar 2018, Matthew Wilcox wrote:
> On Tue, Mar 20, 2018 at 01:25:09PM -0400, Mikulas Patocka wrote:
> > The reason why we need this is that we are going to merge code that does
> > block device deduplication (it was developed separatedly and sold as a
> > commercial product), and the code uses block sizes that are not a power of
> > two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab
> > allocator rounds up the allocation to the nearest power of two, but that
> > wastes a lot of memory. Performance of the solution depends on efficient
> > memory usage, so we should minimize wasted as much as possible.
>
> The SLUB allocator also falls back to using the page (buddy) allocator
> for allocations above 8kB, so this patch is going to have no effect on
> slub. You'd be better off using alloc_pages_exact() for this kind of
> size, or managing your own pool of pages by using something like five
> 192k blocks in a 1MB allocation.
The fallback is only effective for kmalloc caches. Manually created caches
do not follow this rule.
Note that you can already control the page orders for allocation and
the objects per slab using
slub_min_order
slub_max_order
slub_min_objects
This is documented in linux/Documentation/vm/slub.txt
Maybe do the same thing for SLAB?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-20 17:54 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-20 17:54 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Mike Snitzer, Pekka Enberg, linux-mm, dm-devel, Mikulas Patocka,
David Rientjes, Joonsoo Kim, Andrew Morton
On Tue, 20 Mar 2018, Matthew Wilcox wrote:
> On Tue, Mar 20, 2018 at 01:25:09PM -0400, Mikulas Patocka wrote:
> > The reason why we need this is that we are going to merge code that does
> > block device deduplication (it was developed separatedly and sold as a
> > commercial product), and the code uses block sizes that are not a power of
> > two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab
> > allocator rounds up the allocation to the nearest power of two, but that
> > wastes a lot of memory. Performance of the solution depends on efficient
> > memory usage, so we should minimize wasted as much as possible.
>
> The SLUB allocator also falls back to using the page (buddy) allocator
> for allocations above 8kB, so this patch is going to have no effect on
> slub. You'd be better off using alloc_pages_exact() for this kind of
> size, or managing your own pool of pages by using something like five
> 192k blocks in a 1MB allocation.
The fallback is only effective for kmalloc caches. Manually created caches
do not follow this rule.
Note that you can already control the page orders for allocation and
the objects per slab using
slub_min_order
slub_max_order
slub_min_objects
This is documented in linux/Documentation/vm/slub.txt
Maybe do the same thing for SLAB?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-20 17:54 ` Christopher Lameter
@ 2018-03-20 19:22 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-20 19:22 UTC (permalink / raw)
To: Christopher Lameter
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Tue, 20 Mar 2018, Christopher Lameter wrote:
> On Tue, 20 Mar 2018, Matthew Wilcox wrote:
>
> > On Tue, Mar 20, 2018 at 01:25:09PM -0400, Mikulas Patocka wrote:
> > > The reason why we need this is that we are going to merge code that does
> > > block device deduplication (it was developed separatedly and sold as a
> > > commercial product), and the code uses block sizes that are not a power of
> > > two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab
> > > allocator rounds up the allocation to the nearest power of two, but that
> > > wastes a lot of memory. Performance of the solution depends on efficient
> > > memory usage, so we should minimize wasted as much as possible.
> >
> > The SLUB allocator also falls back to using the page (buddy) allocator
> > for allocations above 8kB, so this patch is going to have no effect on
> > slub. You'd be better off using alloc_pages_exact() for this kind of
> > size, or managing your own pool of pages by using something like five
> > 192k blocks in a 1MB allocation.
>
> The fallback is only effective for kmalloc caches. Manually created caches
> do not follow this rule.
Yes - the dm-bufio layer uses manually created caches.
> Note that you can already control the page orders for allocation and
> the objects per slab using
>
> slub_min_order
> slub_max_order
> slub_min_objects
>
> This is documented in linux/Documentation/vm/slub.txt
>
> Maybe do the same thing for SLAB?
Yes, but I need to change it for a specific cache, not for all caches.
When the order is greater than 3 (PAGE_ALLOC_COSTLY_ORDER), the allocation
becomes unreliable, thus it is a bad idea to increase slub_max_order
system-wide.
Another problem with slub_max_order is that it would pad all caches to
slub_max_order, even those that already have a power-of-two size (in that
case, the padding is counterproductive).
BTW. the function "order_store" in mm/slub.c modifies the structure
kmem_cache without taking any locks - is it a bug?
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-20 19:22 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-20 19:22 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Tue, 20 Mar 2018, Christopher Lameter wrote:
> On Tue, 20 Mar 2018, Matthew Wilcox wrote:
>
> > On Tue, Mar 20, 2018 at 01:25:09PM -0400, Mikulas Patocka wrote:
> > > The reason why we need this is that we are going to merge code that does
> > > block device deduplication (it was developed separatedly and sold as a
> > > commercial product), and the code uses block sizes that are not a power of
> > > two (block sizes 192K, 448K, 640K, 832K are used in the wild). The slab
> > > allocator rounds up the allocation to the nearest power of two, but that
> > > wastes a lot of memory. Performance of the solution depends on efficient
> > > memory usage, so we should minimize wasted as much as possible.
> >
> > The SLUB allocator also falls back to using the page (buddy) allocator
> > for allocations above 8kB, so this patch is going to have no effect on
> > slub. You'd be better off using alloc_pages_exact() for this kind of
> > size, or managing your own pool of pages by using something like five
> > 192k blocks in a 1MB allocation.
>
> The fallback is only effective for kmalloc caches. Manually created caches
> do not follow this rule.
Yes - the dm-bufio layer uses manually created caches.
> Note that you can already control the page orders for allocation and
> the objects per slab using
>
> slub_min_order
> slub_max_order
> slub_min_objects
>
> This is documented in linux/Documentation/vm/slub.txt
>
> Maybe do the same thing for SLAB?
Yes, but I need to change it for a specific cache, not for all caches.
When the order is greater than 3 (PAGE_ALLOC_COSTLY_ORDER), the allocation
becomes unreliable, thus it is a bad idea to increase slub_max_order
system-wide.
Another problem with slub_max_order is that it would pad all caches to
slub_max_order, even those that already have a power-of-two size (in that
case, the padding is counterproductive).
BTW. the function "order_store" in mm/slub.c modifies the structure
kmem_cache without taking any locks - is it a bug?
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-20 19:22 ` Mikulas Patocka
@ 2018-03-20 20:42 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-20 20:42 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Tue, 20 Mar 2018, Mikulas Patocka wrote:
> > Maybe do the same thing for SLAB?
>
> Yes, but I need to change it for a specific cache, not for all caches.
Why only some caches?
> When the order is greater than 3 (PAGE_ALLOC_COSTLY_ORDER), the allocation
> becomes unreliable, thus it is a bad idea to increase slub_max_order
> system-wide.
Well the allocations is more likely to fail that is true but SLUB will
fall back to a smaller order should the page allocator refuse to give us
that larger sized page.
> Another problem with slub_max_order is that it would pad all caches to
> slub_max_order, even those that already have a power-of-two size (in that
> case, the padding is counterproductive).
No it does not. Slub will calculate the configuration with the least byte
wastage. It is not the standard order but the maximum order to be used.
Power of two caches below PAGE_SIZE will have order 0.
There are some corner cases where extra metadata is needed per object or
per page that will result in either object sizes that are no longer a
power of two or in page sizes smaller than the whole page. Maybe you have
a case like that? Can you show me a cache that has this issue?
> BTW. the function "order_store" in mm/slub.c modifies the structure
> kmem_cache without taking any locks - is it a bug?
The kmem_cache structure was just allocated. Only one thread can access it
thus no locking is necessary.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-20 20:42 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-20 20:42 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Tue, 20 Mar 2018, Mikulas Patocka wrote:
> > Maybe do the same thing for SLAB?
>
> Yes, but I need to change it for a specific cache, not for all caches.
Why only some caches?
> When the order is greater than 3 (PAGE_ALLOC_COSTLY_ORDER), the allocation
> becomes unreliable, thus it is a bad idea to increase slub_max_order
> system-wide.
Well the allocations is more likely to fail that is true but SLUB will
fall back to a smaller order should the page allocator refuse to give us
that larger sized page.
> Another problem with slub_max_order is that it would pad all caches to
> slub_max_order, even those that already have a power-of-two size (in that
> case, the padding is counterproductive).
No it does not. Slub will calculate the configuration with the least byte
wastage. It is not the standard order but the maximum order to be used.
Power of two caches below PAGE_SIZE will have order 0.
There are some corner cases where extra metadata is needed per object or
per page that will result in either object sizes that are no longer a
power of two or in page sizes smaller than the whole page. Maybe you have
a case like that? Can you show me a cache that has this issue?
> BTW. the function "order_store" in mm/slub.c modifies the structure
> kmem_cache without taking any locks - is it a bug?
The kmem_cache structure was just allocated. Only one thread can access it
thus no locking is necessary.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-20 20:42 ` Christopher Lameter
@ 2018-03-20 22:02 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-20 22:02 UTC (permalink / raw)
To: Christopher Lameter
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Tue, 20 Mar 2018, Christopher Lameter wrote:
> On Tue, 20 Mar 2018, Mikulas Patocka wrote:
>
> > > Maybe do the same thing for SLAB?
> >
> > Yes, but I need to change it for a specific cache, not for all caches.
>
> Why only some caches?
I need high order for the buffer cache that holds the deduplicated data. I
don't need to force it system-wide.
> > When the order is greater than 3 (PAGE_ALLOC_COSTLY_ORDER), the allocation
> > becomes unreliable, thus it is a bad idea to increase slub_max_order
> > system-wide.
>
> Well the allocations is more likely to fail that is true but SLUB will
> fall back to a smaller order should the page allocator refuse to give us
> that larger sized page.
Does SLAB have this fall-back too?
> > Another problem with slub_max_order is that it would pad all caches to
> > slub_max_order, even those that already have a power-of-two size (in that
> > case, the padding is counterproductive).
>
> No it does not. Slub will calculate the configuration with the least byte
> wastage. It is not the standard order but the maximum order to be used.
> Power of two caches below PAGE_SIZE will have order 0.
Try to boot with slub_max_order=10 and you can see this in /proc/slabinfo:
kmalloc-8192 352 352 8192 32 64 : tunables 0 0 0 : slabdata 11 11 0
^^^^
So it rounds up power-of-two sizes to high orders unnecessarily. Without
slub_max_order=10, the number of pages for the kmalloc-8192 cache is just
8.
I observe the same pathological rounding in dm-bufio caches.
> There are some corner cases where extra metadata is needed per object or
> per page that will result in either object sizes that are no longer a
> power of two or in page sizes smaller than the whole page. Maybe you have
> a case like that? Can you show me a cache that has this issue?
Here I have a patch set that changes the dm-bufio subsystem to support
buffer sizes that are not a power of two:
http://people.redhat.com/~mpatocka/patches/kernel/dm-bufio-arbitrary-sector-size/
I need to change the slub cache to minimize wasted space - i.e. when
asking for a slab cache for 640kB objects, the slub system currently
allocates 1MB per object and 384kB is wasted. This is the reason why I'm
making this patch.
> > BTW. the function "order_store" in mm/slub.c modifies the structure
> > kmem_cache without taking any locks - is it a bug?
>
> The kmem_cache structure was just allocated. Only one thread can access it
> thus no locking is necessary.
No - order_store is called when writing to /sys/kernel/slab/<cache>/order
- you can modify order for any existing cache - and the modification
happens without any locking.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-20 22:02 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-20 22:02 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Tue, 20 Mar 2018, Christopher Lameter wrote:
> On Tue, 20 Mar 2018, Mikulas Patocka wrote:
>
> > > Maybe do the same thing for SLAB?
> >
> > Yes, but I need to change it for a specific cache, not for all caches.
>
> Why only some caches?
I need high order for the buffer cache that holds the deduplicated data. I
don't need to force it system-wide.
> > When the order is greater than 3 (PAGE_ALLOC_COSTLY_ORDER), the allocation
> > becomes unreliable, thus it is a bad idea to increase slub_max_order
> > system-wide.
>
> Well the allocations is more likely to fail that is true but SLUB will
> fall back to a smaller order should the page allocator refuse to give us
> that larger sized page.
Does SLAB have this fall-back too?
> > Another problem with slub_max_order is that it would pad all caches to
> > slub_max_order, even those that already have a power-of-two size (in that
> > case, the padding is counterproductive).
>
> No it does not. Slub will calculate the configuration with the least byte
> wastage. It is not the standard order but the maximum order to be used.
> Power of two caches below PAGE_SIZE will have order 0.
Try to boot with slub_max_order=10 and you can see this in /proc/slabinfo:
kmalloc-8192 352 352 8192 32 64 : tunables 0 0 0 : slabdata 11 11 0
^^^^
So it rounds up power-of-two sizes to high orders unnecessarily. Without
slub_max_order=10, the number of pages for the kmalloc-8192 cache is just
8.
I observe the same pathological rounding in dm-bufio caches.
> There are some corner cases where extra metadata is needed per object or
> per page that will result in either object sizes that are no longer a
> power of two or in page sizes smaller than the whole page. Maybe you have
> a case like that? Can you show me a cache that has this issue?
Here I have a patch set that changes the dm-bufio subsystem to support
buffer sizes that are not a power of two:
http://people.redhat.com/~mpatocka/patches/kernel/dm-bufio-arbitrary-sector-size/
I need to change the slub cache to minimize wasted space - i.e. when
asking for a slab cache for 640kB objects, the slub system currently
allocates 1MB per object and 384kB is wasted. This is the reason why I'm
making this patch.
> > BTW. the function "order_store" in mm/slub.c modifies the structure
> > kmem_cache without taking any locks - is it a bug?
>
> The kmem_cache structure was just allocated. Only one thread can access it
> thus no locking is necessary.
No - order_store is called when writing to /sys/kernel/slab/<cache>/order
- you can modify order for any existing cache - and the modification
happens without any locking.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-20 22:02 ` Mikulas Patocka
@ 2018-03-21 15:35 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 15:35 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Tue, 20 Mar 2018, Mikulas Patocka wrote:
> > > Another problem with slub_max_order is that it would pad all caches to
> > > slub_max_order, even those that already have a power-of-two size (in that
> > > case, the padding is counterproductive).
> >
> > No it does not. Slub will calculate the configuration with the least byte
> > wastage. It is not the standard order but the maximum order to be used.
> > Power of two caches below PAGE_SIZE will have order 0.
>
> Try to boot with slub_max_order=10 and you can see this in /proc/slabinfo:
> kmalloc-8192 352 352 8192 32 64 : tunables 0 0 0 : slabdata 11 11 0
Yes it tries to create a slab size that will accommodate the minimum
objects per slab.
> So it rounds up power-of-two sizes to high orders unnecessarily. Without
> slub_max_order=10, the number of pages for the kmalloc-8192 cache is just
> 8.
The kmalloc-8192 has 4 objects per slab on my system which means an
allocation size of 32k = order 4.
In this case 4 objects fit tightly into a slab. There is no waste.
But then I thought you were talking about manually created slabs not
about the kmalloc array?
> I observe the same pathological rounding in dm-bufio caches.
>
> > There are some corner cases where extra metadata is needed per object or
> > per page that will result in either object sizes that are no longer a
> > power of two or in page sizes smaller than the whole page. Maybe you have
> > a case like that? Can you show me a cache that has this issue?
>
> Here I have a patch set that changes the dm-bufio subsystem to support
> buffer sizes that are not a power of two:
> http://people.redhat.com/~mpatocka/patches/kernel/dm-bufio-arbitrary-sector-size/
>
> I need to change the slub cache to minimize wasted space - i.e. when
> asking for a slab cache for 640kB objects, the slub system currently
> allocates 1MB per object and 384kB is wasted. This is the reason why I'm
> making this patch.
You should not be using the slab allocators for these. Allocate higher
order pages or numbers of consecutive smaller pagess from the page
allocator. The slab allocators are written for objects smaller than page
size.
> > > BTW. the function "order_store" in mm/slub.c modifies the structure
> > > kmem_cache without taking any locks - is it a bug?
> >
> > The kmem_cache structure was just allocated. Only one thread can access it
> > thus no locking is necessary.
>
> No - order_store is called when writing to /sys/kernel/slab/<cache>/order
> - you can modify order for any existing cache - and the modification
> happens without any locking.
Well it still does not matter. The size of the order of slab pages
can be dynamic even within a slab. You can have pages of varying sizes.
What kind of problem could be caused here?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 15:35 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 15:35 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Tue, 20 Mar 2018, Mikulas Patocka wrote:
> > > Another problem with slub_max_order is that it would pad all caches to
> > > slub_max_order, even those that already have a power-of-two size (in that
> > > case, the padding is counterproductive).
> >
> > No it does not. Slub will calculate the configuration with the least byte
> > wastage. It is not the standard order but the maximum order to be used.
> > Power of two caches below PAGE_SIZE will have order 0.
>
> Try to boot with slub_max_order=10 and you can see this in /proc/slabinfo:
> kmalloc-8192 352 352 8192 32 64 : tunables 0 0 0 : slabdata 11 11 0
Yes it tries to create a slab size that will accommodate the minimum
objects per slab.
> So it rounds up power-of-two sizes to high orders unnecessarily. Without
> slub_max_order=10, the number of pages for the kmalloc-8192 cache is just
> 8.
The kmalloc-8192 has 4 objects per slab on my system which means an
allocation size of 32k = order 4.
In this case 4 objects fit tightly into a slab. There is no waste.
But then I thought you were talking about manually created slabs not
about the kmalloc array?
> I observe the same pathological rounding in dm-bufio caches.
>
> > There are some corner cases where extra metadata is needed per object or
> > per page that will result in either object sizes that are no longer a
> > power of two or in page sizes smaller than the whole page. Maybe you have
> > a case like that? Can you show me a cache that has this issue?
>
> Here I have a patch set that changes the dm-bufio subsystem to support
> buffer sizes that are not a power of two:
> http://people.redhat.com/~mpatocka/patches/kernel/dm-bufio-arbitrary-sector-size/
>
> I need to change the slub cache to minimize wasted space - i.e. when
> asking for a slab cache for 640kB objects, the slub system currently
> allocates 1MB per object and 384kB is wasted. This is the reason why I'm
> making this patch.
You should not be using the slab allocators for these. Allocate higher
order pages or numbers of consecutive smaller pages from the page
allocator. The slab allocators are written for objects smaller than page
size.
> > > BTW. the function "order_store" in mm/slub.c modifies the structure
> > > kmem_cache without taking any locks - is it a bug?
> >
> > The kmem_cache structure was just allocated. Only one thread can access it
> > thus no locking is necessary.
>
> No - order_store is called when writing to /sys/kernel/slab/<cache>/order
> - you can modify order for any existing cache - and the modification
> happens without any locking.
Well it still does not matter. The size of the order of slab pages
can be dynamic even within a slab. You can have pages of varying sizes.
What kind of problem could be caused here?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 15:35 ` Christopher Lameter
@ 2018-03-21 16:25 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 16:25 UTC (permalink / raw)
To: Christopher Lameter
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> On Tue, 20 Mar 2018, Mikulas Patocka wrote:
>
> > > > Another problem with slub_max_order is that it would pad all caches to
> > > > slub_max_order, even those that already have a power-of-two size (in that
> > > > case, the padding is counterproductive).
> > >
> > > No it does not. Slub will calculate the configuration with the least byte
> > > wastage. It is not the standard order but the maximum order to be used.
> > > Power of two caches below PAGE_SIZE will have order 0.
> >
> > Try to boot with slub_max_order=10 and you can see this in /proc/slabinfo:
> > kmalloc-8192 352 352 8192 32 64 : tunables 0 0 0 : slabdata 11 11 0
>
> Yes it tries to create a slab size that will accommodate the minimum
> objects per slab.
>
> > So it rounds up power-of-two sizes to high orders unnecessarily. Without
> > slub_max_order=10, the number of pages for the kmalloc-8192 cache is just
> > 8.
>
> The kmalloc-8192 has 4 objects per slab on my system which means an
> allocation size of 32k = order 4.
>
> In this case 4 objects fit tightly into a slab. There is no waste.
>
> But then I thought you were talking about manually created slabs not
> about the kmalloc array?
For some workloads, dm-bufio needs caches with sizes that are a power of
two (majority of workloads fall into this category). For other workloads
dm-bufio needs caches with sizes that are not a power of two.
Now - we don't want higher-order allocations for power-of-two caches
(because higher-order allocations just cause memory fragmentation without
any benefit), but we want higher-order allocations for non-power-of-two
caches (because higher-order allocations minimize wasted space).
For example:
for 192K block size, the ideal order is 4MB (it takes 21 blocks)
for 448K block size, the ideal order is 4MB (it takes 9 blocks)
for 512K block size, the ideal order is 512KB (there is no benefit from
using higher order)
for 640K block size, the ideal order is 2MB (it takes 3 blocks, increasing
the allocation size to 4MB doesn't result in any benefit)
for 832K block size, the ideal order is 1MB (it takes 1 block, increasing
the allocation to 2MB or 4MB doesn't result in any benefit)
for 1M block size, the ideal order is 1MB
The problem with "slub_max_order" is that it increases the order either
always or never, but doesn't have the capability to calculate the ideal
order for the given object size. The patch that I send just does this
calculation.
Another problem with "slub_max_order" is that the device driver that needs
to create a slab cache cannot really set it - the device driver can't
modify the kernel parameters.
> > I observe the same pathological rounding in dm-bufio caches.
> >
> > > There are some corner cases where extra metadata is needed per object or
> > > per page that will result in either object sizes that are no longer a
> > > power of two or in page sizes smaller than the whole page. Maybe you have
> > > a case like that? Can you show me a cache that has this issue?
> >
> > Here I have a patch set that changes the dm-bufio subsystem to support
> > buffer sizes that are not a power of two:
> > http://people.redhat.com/~mpatocka/patches/kernel/dm-bufio-arbitrary-sector-size/
> >
> > I need to change the slub cache to minimize wasted space - i.e. when
> > asking for a slab cache for 640kB objects, the slub system currently
> > allocates 1MB per object and 384kB is wasted. This is the reason why I'm
> > making this patch.
>
> You should not be using the slab allocators for these. Allocate higher
> order pages or numbers of consecutive smaller pages from the page
> allocator. The slab allocators are written for objects smaller than page
> size.
So, do you argue that I need to write my own slab cache functionality
instead of using the existing slab code?
I can do it - but duplicating code is a bad thing.
> > > > BTW. the function "order_store" in mm/slub.c modifies the structure
> > > > kmem_cache without taking any locks - is it a bug?
> > >
> > > The kmem_cache structure was just allocated. Only one thread can access it
> > > thus no locking is necessary.
> >
> > No - order_store is called when writing to /sys/kernel/slab/<cache>/order
> > - you can modify order for any existing cache - and the modification
> > happens without any locking.
>
> Well it still does not matter. The size of the order of slab pages
> can be dynamic even within a slab. You can have pages of varying sizes.
>
> What kind of problem could be caused here?
Unlocked accesses are generally considered bad. For example, see this
piece of code in calculate_sizes:
s->allocflags = 0;
if (order)
s->allocflags |= __GFP_COMP;
if (s->flags & SLAB_CACHE_DMA)
s->allocflags |= GFP_DMA;
if (s->flags & SLAB_RECLAIM_ACCOUNT)
s->allocflags |= __GFP_RECLAIMABLE;
If you are running this while the cache is in use (i.e. when the user
writes /sys/kernel/slab/<cache>/order), then other processes will see
invalid s->allocflags for a short time.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 16:25 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 16:25 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> On Tue, 20 Mar 2018, Mikulas Patocka wrote:
>
> > > > Another problem with slub_max_order is that it would pad all caches to
> > > > slub_max_order, even those that already have a power-of-two size (in that
> > > > case, the padding is counterproductive).
> > >
> > > No it does not. Slub will calculate the configuration with the least byte
> > > wastage. It is not the standard order but the maximum order to be used.
> > > Power of two caches below PAGE_SIZE will have order 0.
> >
> > Try to boot with slub_max_order=10 and you can see this in /proc/slabinfo:
> > kmalloc-8192 352 352 8192 32 64 : tunables 0 0 0 : slabdata 11 11 0
>
> Yes it tries to create a slab size that will accommodate the minimum
> objects per slab.
>
> > So it rounds up power-of-two sizes to high orders unnecessarily. Without
> > slub_max_order=10, the number of pages for the kmalloc-8192 cache is just
> > 8.
>
> The kmalloc-8192 has 4 objects per slab on my system which means an
> allocation size of 32k = order 4.
>
> In this case 4 objects fit tightly into a slab. There is no waste.
>
> But then I thought you were talking about manually created slabs not
> about the kmalloc array?
For some workloads, dm-bufio needs caches with sizes that are a power of
two (majority of workloads fall into this category). For other workloads
dm-bufio needs caches with sizes that are not a power of two.
Now - we don't want higher-order allocations for power-of-two caches
(because higher-order allocations just cause memory fragmentation without
any benefit), but we want higher-order allocations for non-power-of-two
caches (because higher-order allocations minimize wasted space).
For example:
for 192K block size, the ideal order is 4MB (it takes 21 blocks)
for 448K block size, the ideal order is 4MB (it takes 9 blocks)
for 512K block size, the ideal order is 512KB (there is no benefit from
using higher order)
for 640K block size, the ideal order is 2MB (it takes 3 blocks, increasing
the allocation size to 4MB doesn't result in any benefit)
for 832K block size, the ideal order is 1MB (it takes 1 block, increasing
the allocation to 2MB or 4MB doesn't result in any benefit)
for 1M block size, the ideal order is 1MB
The problem with "slub_max_order" is that it increases the order either
always or never, but doesn't have the capability to calculate the ideal
order for the given object size. The patch that I send just does this
calculation.
Another problem with "slub_max_order" is that the device driver that needs
to create a slab cache cannot really set it - the device driver can't
modify the kernel parameters.
> > I observe the same pathological rounding in dm-bufio caches.
> >
> > > There are some corner cases where extra metadata is needed per object or
> > > per page that will result in either object sizes that are no longer a
> > > power of two or in page sizes smaller than the whole page. Maybe you have
> > > a case like that? Can you show me a cache that has this issue?
> >
> > Here I have a patch set that changes the dm-bufio subsystem to support
> > buffer sizes that are not a power of two:
> > http://people.redhat.com/~mpatocka/patches/kernel/dm-bufio-arbitrary-sector-size/
> >
> > I need to change the slub cache to minimize wasted space - i.e. when
> > asking for a slab cache for 640kB objects, the slub system currently
> > allocates 1MB per object and 384kB is wasted. This is the reason why I'm
> > making this patch.
>
> You should not be using the slab allocators for these. Allocate higher
> order pages or numbers of consecutive smaller pages from the page
> allocator. The slab allocators are written for objects smaller than page
> size.
So, do you argue that I need to write my own slab cache functionality
instead of using the existing slab code?
I can do it - but duplicating code is a bad thing.
> > > > BTW. the function "order_store" in mm/slub.c modifies the structure
> > > > kmem_cache without taking any locks - is it a bug?
> > >
> > > The kmem_cache structure was just allocated. Only one thread can access it
> > > thus no locking is necessary.
> >
> > No - order_store is called when writing to /sys/kernel/slab/<cache>/order
> > - you can modify order for any existing cache - and the modification
> > happens without any locking.
>
> Well it still does not matter. The size of the order of slab pages
> can be dynamic even within a slab. You can have pages of varying sizes.
>
> What kind of problem could be caused here?
Unlocked accesses are generally considered bad. For example, see this
piece of code in calculate_sizes:
s->allocflags = 0;
if (order)
s->allocflags |= __GFP_COMP;
if (s->flags & SLAB_CACHE_DMA)
s->allocflags |= GFP_DMA;
if (s->flags & SLAB_RECLAIM_ACCOUNT)
s->allocflags |= __GFP_RECLAIMABLE;
If you are running this while the cache is in use (i.e. when the user
writes /sys/kernel/slab/<cache>/order), then other processes will see
invalid s->allocflags for a short time.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 16:25 ` Mikulas Patocka
@ 2018-03-21 17:10 ` Matthew Wilcox
-1 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-21 17:10 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Christopher Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, Mar 21, 2018 at 12:25:39PM -0400, Mikulas Patocka wrote:
> Now - we don't want higher-order allocations for power-of-two caches
> (because higher-order allocations just cause memory fragmentation without
> any benefit)
Higher-order allocations don't cause memory fragmentation. Indeed,
they avoid it. They do fail as a result of fragmentation, which is
probably what you meant.
> , but we want higher-order allocations for non-power-of-two
> caches (because higher-order allocations minimize wasted space).
>
> For example:
> for 192K block size, the ideal order is 4MB (it takes 21 blocks)
I wonder if that's true. You can get five blocks into 1MB, wasting 64kB.
So going up by two orders of magnitude lets you get an extra block in
at the cost of failing more frequently.
> > You should not be using the slab allocators for these. Allocate higher
> > order pages or numbers of consecutive smaller pages from the page
> > allocator. The slab allocators are written for objects smaller than page
> > size.
>
> So, do you argue that I need to write my own slab cache functionality
> instead of using the existing slab code?
>
> I can do it - but duplicating code is a bad thing.
It is -- but writing a special-purpose allocator can be better than making
a general purpose allocator also solve a special purpose. I don't know
whether that's true here or not.
Your allocator seems like it could be remarkably simple; you know
you're always doing high-order allocations, and you know that you're
never allocating more than a handful of blocks from a page allocation.
So you can probably store all of your metadata in the struct page
(because your metadata is basically a bitmap) and significantly save on
memory usage. The one downside I see is that you don't get the reporting
through /proc/slabinfo.
So, is this an area where slub should be improved, or is this a case where
writing a special-purpose allocator makes more sense? It seems like you
already have a special-purpose allocator, in that you know how to fall
back to vmalloc if slab-alloc fails. So maybe have your own allocator
that interfaces to the page allocator for now; keep its interface nice
and clean, and maybe it'll get pulled out of your driver and put into mm/
some day if it becomes a useful API for everybody to share?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 17:10 ` Matthew Wilcox
0 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-21 17:10 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Andrew Morton, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Christopher Lameter, Joonsoo Kim
On Wed, Mar 21, 2018 at 12:25:39PM -0400, Mikulas Patocka wrote:
> Now - we don't want higher-order allocations for power-of-two caches
> (because higher-order allocations just cause memory fragmentation without
> any benefit)
Higher-order allocations don't cause memory fragmentation. Indeed,
they avoid it. They do fail as a result of fragmentation, which is
probably what you meant.
> , but we want higher-order allocations for non-power-of-two
> caches (because higher-order allocations minimize wasted space).
>
> For example:
> for 192K block size, the ideal order is 4MB (it takes 21 blocks)
I wonder if that's true. You can get five blocks into 1MB, wasting 64kB.
So going up by two orders of magnitude lets you get an extra block in
at the cost of failing more frequently.
> > You should not be using the slab allocators for these. Allocate higher
> > order pages or numbers of consecutive smaller pagess from the page
> > allocator. The slab allocators are written for objects smaller than page
> > size.
>
> So, do you argue that I need to write my own slab cache functionality
> instead of using the existing slab code?
>
> I can do it - but duplicating code is a bad thing.
It is -- but writing a special-purpose allocator can be better than making
a general purpose allocator also solve a special purpose. I don't know
whether that's true here or not.
Your allocator seems like it could be remarkably simple; you know
you're always doing high-order allocations, and you know that you're
never allocating more than a handful of blocks from a page allocation.
So you can probably store all of your metadata in the struct page
(because your metadata is basically a bitmap) and significantly save on
memory usage. The one downside I see is that you don't get the reporting
through /proc/slabinfo.
So, is this an area where slub should be improved, or is this a case where
writing a special-purpose allocator makes more sense? It seems like you
already have a special-purpose allocator, in that you know how to fall
back to vmalloc if slab-alloc fails. So maybe have your own allocator
that interfaces to the page allocator for now; keep its interface nice
and clean, and maybe it'll get pulled out of your driver and put into mm/
some day if it becomes a useful API for everybody to share?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 16:25 ` Mikulas Patocka
@ 2018-03-21 17:30 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 17:30 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> > You should not be using the slab allocators for these. Allocate higher
> > order pages or numbers of consecutive smaller pages from the page
> > allocator. The slab allocators are written for objects smaller than page
> > size.
>
> So, do you argue that I need to write my own slab cache functionality
> instead of using the existing slab code?
Just use the existing page allocator calls to allocate and free the
memory you need.
> I can do it - but duplicating code is a bad thing.
There is no need to duplicate anything. There is lots of infrastructure
already in the kernel. You just need to use the right allocation / freeing
calls.
> > What kind of problem could be caused here?
>
> Unlocked accesses are generally considered bad. For example, see this
> piece of code in calculate_sizes:
> s->allocflags = 0;
> if (order)
> s->allocflags |= __GFP_COMP;
>
> if (s->flags & SLAB_CACHE_DMA)
> s->allocflags |= GFP_DMA;
>
> if (s->flags & SLAB_RECLAIM_ACCOUNT)
> s->allocflags |= __GFP_RECLAIMABLE;
>
> If you are running this while the cache is in use (i.e. when the user
> writes /sys/kernel/slab/<cache>/order), then other processes will see
> invalid s->allocflags for a short time.
Calculating sizes is done when the slab has only a single accessor. Thus
no locking is needed.
Changing the size of objects in a slab cache when there is already a set
of objects allocated and under management by the slab cache would
cause the allocator to fail and lead to garbled data.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 17:30 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 17:30 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> > You should not be using the slab allocators for these. Allocate higher
> > order pages or numbers of consecutive smaller pages from the page
> > allocator. The slab allocators are written for objects smaller than page
> > size.
>
> So, do you argue that I need to write my own slab cache functionality
> instead of using the existing slab code?
Just use the existing page allocator calls to allocate and free the
memory you need.
> I can do it - but duplicating code is a bad thing.
There is no need to duplicate anything. There is lots of infrastructure
already in the kernel. You just need to use the right allocation / freeing
calls.
> > What kind of problem could be caused here?
>
> Unlocked accesses are generally considered bad. For example, see this
> piece of code in calculate_sizes:
> s->allocflags = 0;
> if (order)
> s->allocflags |= __GFP_COMP;
>
> if (s->flags & SLAB_CACHE_DMA)
> s->allocflags |= GFP_DMA;
>
> if (s->flags & SLAB_RECLAIM_ACCOUNT)
> s->allocflags |= __GFP_RECLAIMABLE;
>
> If you are running this while the cache is in use (i.e. when the user
> writes /sys/kernel/slab/<cache>/order), then other processes will see
> invalid s->allocflags for a short time.
Calculating sizes is done when the slab has only a single accessor. Thus
no locking is needed.
Changing the size of objects in a slab cache when there is already a set
of objects allocated and under management by the slab cache would
cause the allocator to fail and lead to garbled data.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 17:30 ` Christopher Lameter
@ 2018-03-21 17:39 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 17:39 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
One other thought: If you want to improve the behavior for large scale
objects allocated through kmalloc/kmemcache then we would certainly be
glad to entertain those ideas.
F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
allocate powers of two pages. It would be relatively easy to make
kmalloc_large round the allocation to the next page size and then allocate
N consecutive pages via alloc_pages_exact() and free the remainder unused
pages or some such thing.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 17:39 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 17:39 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
One other thought: If you want to improve the behavior for large scale
objects allocated through kmalloc/kmemcache then we would certainly be
glad to entertain those ideas.
F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
allocate powers of two pages. It would be relatively easy to make
kmalloc_large round the allocation to the next page size and then allocate
N consecutive pages via alloc_pages_exact() and free the remainder unused
pages or some such thing.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 17:39 ` Christopher Lameter
@ 2018-03-21 17:49 ` Matthew Wilcox
-1 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-21 17:49 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mikulas Patocka, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, Mar 21, 2018 at 12:39:33PM -0500, Christopher Lameter wrote:
> One other thought: If you want to improve the behavior for large scale
> objects allocated through kmalloc/kmemcache then we would certainly be
> glad to entertain those ideas.
>
> F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> allocate powers of two pages. It would be relatively easy to make
> kmalloc_large round the allocation to the next page size and then allocate
> N consecutive pages via alloc_pages_exact() and free the remainder unused
> pages or some such thing.
I don't know if that's a good idea. That will contribute to fragmentation
if the allocation is held onto for a short-to-medium length of time.
If the allocation is for a very long period of time then those pages
would have been unavailable anyway, but if the user of the tail pages
holds them beyond the lifetime of the large allocation, then this is
probably a bad tradeoff to make.
I do see Mikulas' use case as interesting, I just don't know whether it's
worth changing slab/slub to support it. At first blush, other than the
sheer size of the allocations, it's a good fit.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 17:49 ` Matthew Wilcox
0 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-21 17:49 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Pekka Enberg, linux-mm, dm-devel, Mikulas Patocka,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, Mar 21, 2018 at 12:39:33PM -0500, Christopher Lameter wrote:
> One other thought: If you want to improve the behavior for large scale
> objects allocated through kmalloc/kmemcache then we would certainly be
> glad to entertain those ideas.
>
> F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> allocate powers of two pages. It would be relatively easy to make
> kmalloc_large round the allocation to the next page size and then allocate
> N consecutive pages via alloc_pages_exact() and free the remainder unused
> pages or some such thing.
I don't know if that's a good idea. That will contribute to fragmentation
if the allocation is held onto for a short-to-medium length of time.
If the allocation is for a very long period of time then those pages
would have been unavailable anyway, but if the user of the tail pages
holds them beyond the lifetime of the large allocation, then this is
probably a bad tradeoff to make.
I do see Mikulas' use case as interesting, I just don't know whether it's
worth changing slab/slub to support it. At first blush, other than the
sheer size of the allocations, it's a good fit.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 17:49 ` Matthew Wilcox
@ 2018-03-21 18:01 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:01 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Mikulas Patocka, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Matthew Wilcox wrote:
> I don't know if that's a good idea. That will contribute to fragmentation
> if the allocation is held onto for a short-to-medium length of time.
> If the allocation is for a very long period of time then those pages
> would have been unavailable anyway, but if the user of the tail pages
> holds them beyond the lifetime of the large allocation, then this is
> probably a bad tradeoff to make.
>
> I do see Mikulas' use case as interesting, I just don't know whether it's
> worth changing slab/slub to support it. At first blush, other than the
> sheer size of the allocations, it's a good fit.
Well there are numerous page pool approaches already in the kernel. Maybe
there is a better fit with those?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 18:01 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:01 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Mike Snitzer, Pekka Enberg, linux-mm, dm-devel, Mikulas Patocka,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Matthew Wilcox wrote:
> I don't know if that's a good idea. That will contribute to fragmentation
> if the allocation is held onto for a short-to-medium length of time.
> If the allocation is for a very long period of time then those pages
> would have been unavailable anyway, but if the user of the tail pages
> holds them beyond the lifetime of the large allocation, then this is
> probably a bad tradeoff to make.
>
> I do see Mikulas' use case as interesting, I just don't know whether it's
> worth changing slab/slub to support it. At first blush, other than the
> sheer size of the allocations, it's a good fit.
Well there are numerous page pool approaches already in the kernel. Maybe
there is a better fit with those?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 17:49 ` Matthew Wilcox
@ 2018-03-21 18:23 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 18:23 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Christopher Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Matthew Wilcox wrote:
> On Wed, Mar 21, 2018 at 12:39:33PM -0500, Christopher Lameter wrote:
> > One other thought: If you want to improve the behavior for large scale
> > objects allocated through kmalloc/kmemcache then we would certainly be
> > glad to entertain those ideas.
> >
> > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > allocate powers of two pages. It would be relatively easy to make
> > kmalloc_large round the allocation to the next page size and then allocate
> > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > pages or some such thing.
alloc_pages_exact() has O(n*log n) complexity with respect to the number
of requested pages. It would have to be reworked and optimized if it were
to be used for the dm-bufio cache. (it could be optimized down to O(log n)
if it didn't split the compound page to a lot of separate pages, but split
it to a power-of-two clusters instead).
> I don't know if that's a good idea. That will contribute to fragmentation
> if the allocation is held onto for a short-to-medium length of time.
> If the allocation is for a very long period of time then those pages
> would have been unavailable anyway, but if the user of the tail pages
> holds them beyond the lifetime of the large allocation, then this is
> probably a bad tradeoff to make.
The problem with alloc_pages_exact() is that it exhausts all the
high-order pages and leaves many free low-order pages around. So you'll
end up in a system with a lot of free memory, but with all high-order
pages missing. As there would be a lot of free memory, the kswapd thread
would not be woken up to free some high-order pages.
I think that using slab with high order is better, because it at least
doesn't leave many low-order pages behind.
> I do see Mikulas' use case as interesting, I just don't know whether it's
> worth changing slab/slub to support it. At first blush, other than the
> sheer size of the allocations, it's a good fit.
All I need is to increase the order of a specific slab cache - I think
it's better to implement an interface that allows doing it than to
duplicate the slab cache code.
BTW. it could be possible to open the file
"/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
the requested value there, but it seems very dirty. It would be better to
have a kernel interface for that.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 18:23 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 18:23 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Mike Snitzer, Andrew Morton, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Christopher Lameter, Joonsoo Kim
On Wed, 21 Mar 2018, Matthew Wilcox wrote:
> On Wed, Mar 21, 2018 at 12:39:33PM -0500, Christopher Lameter wrote:
> > One other thought: If you want to improve the behavior for large scale
> > objects allocated through kmalloc/kmemcache then we would certainly be
> > glad to entertain those ideas.
> >
> > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > allocate powers of two pages. It would be relatively easy to make
> > kmalloc_large round the allocation to the next page size and then allocate
> > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > pages or some such thing.
alloc_pages_exact() has O(n*log n) complexity with respect to the number
of requested pages. It would have to be reworked and optimized if it were
to be used for the dm-bufio cache. (it could be optimized down to O(log n)
if it didn't split the compound page to a lot of separate pages, but split
it to a power-of-two clusters instead).
> I don't know if that's a good idea. That will contribute to fragmentation
> if the allocation is held onto for a short-to-medium length of time.
> If the allocation is for a very long period of time then those pages
> would have been unavailable anyway, but if the user of the tail pages
> holds them beyond the lifetime of the large allocation, then this is
> probably a bad tradeoff to make.
The problem with alloc_pages_exact() is that it exhausts all the
high-order pages and leaves many free low-order pages around. So you'll
end up in a system with a lot of free memory, but with all high-order
pages missing. As there would be a lot of free memory, the kswapd thread
would not be woken up to free some high-order pages.
I think that using slab with high order is better, because it at least
doesn't leave many low-order pages behind.
> I do see Mikulas' use case as interesting, I just don't know whether it's
> worth changing slab/slub to support it. At first blush, other than the
> sheer size of the allocations, it's a good fit.
All I need is to increase the order of a specific slab cache - I think
it's better to implement an interface that allows doing it than to
duplicate the slab cache code.
BTW. it could be possible to open the file
"/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
the requested value there, but it seems very dirty. It would be better to
have a kernel interface for that.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 17:30 ` Christopher Lameter
@ 2018-03-21 18:36 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 18:36 UTC (permalink / raw)
To: Christopher Lameter
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > > You should not be using the slab allocators for these. Allocate higher
> > > order pages or numbers of consecutive smaller pages from the page
> > > allocator. The slab allocators are written for objects smaller than page
> > > size.
> >
> > So, do you argue that I need to write my own slab cache functionality
> > instead of using the existing slab code?
>
> Just use the existing page allocator calls to allocate and free the
> memory you need.
>
> > I can do it - but duplicating code is bad thing.
>
> There is no need to duplicate anything. There is lots of infrastructure
> already in the kernel. You just need to use the right allocation / freeing
> calls.
So, what would you recommend for allocating 640KB objects while minimizing
wasted space?
* alloc_pages - rounds up to the next power of two
* kmalloc - rounds up to the next power of two
* alloc_pages_exact - O(n*log n) complexity; and causes memory
fragmentation if used excessively
* vmalloc - horrible performance (modifies page tables and that causes
synchronization across all CPUs)
anything else?
The slab cache with large order seems like the best choice for this.
> > > What kind of problem could be caused here?
> >
> > Unlocked accesses are generally considered bad. For example, see this
> > piece of code in calculate_sizes:
> > s->allocflags = 0;
> > if (order)
> > s->allocflags |= __GFP_COMP;
> >
> > if (s->flags & SLAB_CACHE_DMA)
> > s->allocflags |= GFP_DMA;
> >
> > if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > s->allocflags |= __GFP_RECLAIMABLE;
> >
> > If you are running this while the cache is in use (i.e. when the user
> > writes /sys/kernel/slab/<cache>/order), then other processes will see
> > invalid s->allocflags for a short time.
>
> Calculating sizes is done when the slab has only a single accessor. Thus
> no locking is needed.
The calculation is done whenever someone writes to
"/sys/kernel/slab/*/order"
And you can obviously write to that file while the slab cache is in use. Try
it.
So, the function calculate_sizes can actually race with allocation from
the slab cache.
> Changing the size of objects in a slab cache when there is already a set
> of objects allocated and under management by the slab cache would
> cause the allocator to fail and lead to garbled data.
I am not talking about changing the size of objects in a slab cache. I am
talking about changing the allocation order of a slab cache while the
cache is in use. This can be done with the sysfs interface.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 18:36 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 18:36 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > > You should not be using the slab allocators for these. Allocate higher
> > > order pages or numbers of consecutive smaller pages from the page
> > > allocator. The slab allocators are written for objects smaller than page
> > > size.
> >
> > So, do you argue that I need to write my own slab cache functionality
> > instead of using the existing slab code?
>
> Just use the existing page allocator calls to allocate and free the
> memory you need.
>
> > I can do it - but duplicating code is bad thing.
>
> There is no need to duplicate anything. There is lots of infrastructure
> already in the kernel. You just need to use the right allocation / freeing
> calls.
So, what would you recommend for allocating 640KB objects while minimizing
wasted space?
* alloc_pages - rounds up to the next power of two
* kmalloc - rounds up to the next power of two
* alloc_pages_exact - O(n*log n) complexity; and causes memory
fragmentation if used excessively
* vmalloc - horrible performance (modifies page tables and that causes
synchronization across all CPUs)
anything else?
The slab cache with large order seems like the best choice for this.
> > > What kind of problem could be caused here?
> >
> > Unlocked accesses are generally considered bad. For example, see this
> > piece of code in calculate_sizes:
> > s->allocflags = 0;
> > if (order)
> > s->allocflags |= __GFP_COMP;
> >
> > if (s->flags & SLAB_CACHE_DMA)
> > s->allocflags |= GFP_DMA;
> >
> > if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > s->allocflags |= __GFP_RECLAIMABLE;
> >
> > If you are running this while the cache is in use (i.e. when the user
> > writes /sys/kernel/slab/<cache>/order), then other processes will see
> > invalid s->allocflags for a short time.
>
> Calculating sizes is done when the slab has only a single accessor. Thus
> no locking is needed.
The calculation is done whenever someone writes to
"/sys/kernel/slab/*/order"
And you can obviously write to that file while the slab cache is in use. Try
it.
So, the function calculate_sizes can actually race with allocation from
the slab cache.
> Changing the size of objects in a slab cache when there is already a set
> of objects allocated and under management by the slab cache would
> cause the allocator to fail and lead to garbled data.
I am not talking about changing the size of objects in a slab cache. I am
talking about changing the allocation order of a slab cache while the
cache is in use. This can be done with the sysfs interface.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 18:23 ` Mikulas Patocka
@ 2018-03-21 18:40 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:40 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> > > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > > allocate powers of two pages. It would be relatively easy to make
> > > kmalloc_large round the allocation to the next page size and then allocate
> > > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > > pages or some such thing.
>
> alloc_pages_exact() has O(n*log n) complexity with respect to the number
> of requested pages. It would have to be reworked and optimized if it were
> to be used for the dm-bufio cache. (it could be optimized down to O(log n)
> if it didn't split the compound page to a lot of separate pages, but split
> it to a power-of-two clusters instead).
Well then a memory pool of page allocator requests may address that issue?
Have a look at include/linux/mempool.h.
> > I don't know if that's a good idea. That will contribute to fragmentation
> > if the allocation is held onto for a short-to-medium length of time.
> > If the allocation is for a very long period of time then those pages
> > would have been unavailable anyway, but if the user of the tail pages
> > holds them beyond the lifetime of the large allocation, then this is
> > probably a bad tradeoff to make.
Fragmentation is sadly a big issue. You could create a mempool on bootup
or early after boot to ensure that you have a sufficient number of
contiguous pages available.
> The problem with alloc_pages_exact() is that it exhausts all the
> high-order pages and leaves many free low-order pages around. So you'll
> end up in a system with a lot of free memory, but with all high-order
> pages missing. As there would be a lot of free memory, the kswapd thread
> would not be woken up to free some high-order pages.
I think that logic is properly balanced and will take into account pages
that have been removed from the LRU expiration logic.
> I think that using slab with high order is better, because it at least
> doesn't leave many low-order pages behind.
Any request to the slab via kmalloc with a size > 2x page size will simply
lead to a page allocator request. You have the same issue. If you want to
rely on the slab allocator buffering large segments for you then a mempool
will also solve the issue for you and you have more control over the pool.
> BTW. it could be possible to open the file
> "/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
> the requested value there, but it seems very dirty. It would be better to
> have a kernel interface for that.
Hehehe you could directly write to the kmem_cache structure and increase
the order. AFAICT this would be dirty but work.
But still the increased page order will get you into trouble with
fragmentation when the system runs for a long time. That is the reason we
try to limit the allocation sizes coming from the slab allocator.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 18:40 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:40 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> > > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > > allocate powers of two pages. It would be relatively easy to make
> > > kmalloc_large round the allocation to the next page size and then allocate
> > > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > > pages or some such thing.
>
> alloc_pages_exact() has O(n*log n) complexity with respect to the number
> of requested pages. It would have to be reworked and optimized if it were
> to be used for the dm-bufio cache. (it could be optimized down to O(log n)
> if it didn't split the compound page to a lot of separate pages, but split
> it to a power-of-two clusters instead).
Well then a memory pool of page allocator requests may address that issue?
Have a look at include/linux/mempool.h.
> > I don't know if that's a good idea. That will contribute to fragmentation
> > if the allocation is held onto for a short-to-medium length of time.
> > If the allocation is for a very long period of time then those pages
> > would have been unavailable anyway, but if the user of the tail pages
> > holds them beyond the lifetime of the large allocation, then this is
> > probably a bad tradeoff to make.
Fragmentation is sadly a big issue. You could create a mempool on bootup
or early after boot to ensure that you have a sufficient number of
contiguous pages available.
> The problem with alloc_pages_exact() is that it exhausts all the
> high-order pages and leaves many free low-order pages around. So you'll
> end up in a system with a lot of free memory, but with all high-order
> pages missing. As there would be a lot of free memory, the kswapd thread
> would not be woken up to free some high-order pages.
I think that logic is properly balanced and will take into account pages
that have been removed from the LRU expiration logic.
> I think that using slab with high order is better, because it at least
> doesn't leave many low-order pages behind.
Any request to the slab via kmalloc with a size > 2x page size will simply
lead to a page allocator request. You have the same issue. If you want to
rely on the slab allocator buffering large segments for you then a mempool
will also solve the issue for you and you have more control over the pool.
> BTW. it could be possible to open the file
> "/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
> the requested value there, but it seems very dirty. It would be better to
> have a kernel interface for that.
Hehehe you could directly write to the kmem_cache structure and increase
the order. AFAICT this would be dirty but work.
But still the increased page order will get you into trouble with
fragmentation when the system runs for a long time. That is the reason we
try to limit the allocation sizes coming from the slab allocator.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 18:40 ` Christopher Lameter
@ 2018-03-21 18:55 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 18:55 UTC (permalink / raw)
To: Christopher Lameter
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > > > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > > > allocate powers of two pages. It would be relatively easy to make
> > > > kmalloc_large round the allocation to the next page size and then allocate
> > > > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > > > pages or some such thing.
> >
> > alloc_pages_exact() has O(n*log n) complexity with respect to the number
> > of requested pages. It would have to be reworked and optimized if it were
> > to be used for the dm-bufio cache. (it could be optimized down to O(log n)
> > if it didn't split the compound page to a lot of separate pages, but split
> > it to a power-of-two clusters instead).
>
> Well then a memory pool of page allocator requests may address that issue?
>
> Have a look at include/linux/mempool.h.
I know the mempool interface. Mempool can keep some amount reserved
objects if the system memory is exhausted.
Mempool doesn't deal with object allocation at all - mempool needs to be
hooked to an existing object allocator (slab cache, kmalloc, alloc_pages,
or some custom allocator provided with the methods mempool_alloc_t and
mempool_free_t).
> > > I don't know if that's a good idea. That will contribute to fragmentation
> > > if the allocation is held onto for a short-to-medium length of time.
> > > If the allocation is for a very long period of time then those pages
> > > would have been unavailable anyway, but if the user of the tail pages
> > > holds them beyond the lifetime of the large allocation, then this is
> > > probably a bad tradeoff to make.
>
> Fragmentation is sadly a big issue. You could create a mempool on bootup
> or early after boot to ensure that you have a sufficient number of
> contiguous pages available.
The dm-bufio driver deals correctly with this - it preallocates several
buffers with vmalloc when the dm-bufio cache is created. During operation,
if a high-order allocation fails, the dm-bufio subsystem throws away some
existing buffer and reuses the already allocated chunk of memory for the
buffer that needs to be created.
So, fragmentation is not an issue with this use case. dm-bufio can make
forward progress even if memory is totally exhausted.
> > The problem with alloc_pages_exact() is that it exhausts all the
> > high-order pages and leaves many free low-order pages around. So you'll
> > end up in a system with a lot of free memory, but with all high-order
> > pages missing. As there would be a lot of free memory, the kswapd thread
> > would not be woken up to free some high-order pages.
>
> I think that logic is properly balanced and will take into account pages
> that have been removed from the LRU expiration logic.
>
> > I think that using slab with high order is better, because it at least
> > doesn't leave many low-order pages behind.
>
> Any request to the slab via kmalloc with a size > 2x page size will simply
> lead to a page allocator request. You have the same issue. If you want to
> rely on the slab allocator buffering large segments for you then a mempool
> will also solve the issue for you and you have more control over the pool.
mempool solves nothing because it needs a backing allocator. And the
question is what this backing allocator should be.
> > BTW. it could be possible to open the file
> > "/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
> > the requested value there, but it seems very dirty. It would be better to
> > have a kernel interface for that.
>
> Hehehe you could directly write to the kmem_cache structure and increase
> the order. AFAICT this would be dirty but work.
>
> But still the increased page order will get you into trouble with
> fragmentation when the system runs for a long time. That is the reason we
> try to limit the allocation sizes coming from the slab allocator.
It won't - see above - if the high-order allocation fails, dm-bufio just
reuses some existing buffer.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 18:55 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 18:55 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > > > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > > > allocate powers of two pages. It would be relatively easy to make
> > > > kmalloc_large round the allocation to the next page size and then allocate
> > > > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > > > pages or some such thing.
> >
> > alloc_pages_exact() has O(n*log n) complexity with respect to the number
> > of requested pages. It would have to be reworked and optimized if it were
> > to be used for the dm-bufio cache. (it could be optimized down to O(log n)
> > if it didn't split the compound page to a lot of separate pages, but split
> > it to a power-of-two clusters instead).
>
> Well then a memory pool of page allocator requests may address that issue?
>
> Have a look at include/linux/mempool.h.
I know the mempool interface. Mempool can keep some amount reserved
objects if the system memory is exhausted.
Mempool doesn't deal with object allocation at all - mempool needs to be
hooked to an existing object allocator (slab cache, kmalloc, alloc_pages,
or some custom allocator provided with the methods mempool_alloc_t and
mempool_free_t).
> > > I don't know if that's a good idea. That will contribute to fragmentation
> > > if the allocation is held onto for a short-to-medium length of time.
> > > If the allocation is for a very long period of time then those pages
> > > would have been unavailable anyway, but if the user of the tail pages
> > > holds them beyond the lifetime of the large allocation, then this is
> > > probably a bad tradeoff to make.
>
> Fragmentation is sadly a big issue. You could create a mempool on bootup
> or early after boot to ensure that you have a sufficient number of
> contiguous pages available.
The dm-bufio driver deals correctly with this - it preallocates several
buffers with vmalloc when the dm-bufio cache is created. During operation,
if a high-order allocation fails, the dm-bufio subsystem throws away some
existing buffer and reuses the already allocated chunk of memory for the
buffer that needs to be created.
So, fragmentation is not an issue with this use case. dm-bufio can make
forward progress even if memory is totally exhausted.
> > The problem with alloc_pages_exact() is that it exhausts all the
> > high-order pages and leaves many free low-order pages around. So you'll
> > end up in a system with a lot of free memory, but with all high-order
> > pages missing. As there would be a lot of free memory, the kswapd thread
> > would not be woken up to free some high-order pages.
>
> I think that logic is properly balanced and will take into account pages
> that have been removed from the LRU expiration logic.
>
> > I think that using slab with high order is better, because it at least
> > doesn't leave many low-order pages behind.
>
> Any request to the slab via kmalloc with a size > 2x page size will simply
> lead to a page allocator request. You have the same issue. If you want to
> rely on the slab allocator buffering large segments for you then a mempool
> will also solve the issue for you and you have more control over the pool.
mempool solves nothing because it needs a backing allocator. And the
question is what this backing allocator should be.
> > BTW. it could be possible to open the file
> > "/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
> > the requested value there, but it seems very dirty. It would be better to
> > have a kernel interface for that.
>
> Hehehe you could directly write to the kmem_cache structure and increase
> the order. AFAICT this would be dirty but work.
>
> But still the increased page order will get you into trouble with
> fragmentation when the system runs for a long time. That is the reason we
> try to limit the allocation sizes coming from the slab allocator.
It won't - see above - if the high-order allocation fails, dm-bufio just
reuses some existing buffer.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 18:40 ` Christopher Lameter
@ 2018-03-21 18:55 ` Matthew Wilcox
-1 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-21 18:55 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mikulas Patocka, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, Mar 21, 2018 at 01:40:31PM -0500, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > > > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > > > allocate powers of two pages. It would be relatively easy to make
> > > > kmalloc_large round the allocation to the next page size and then allocate
> > > > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > > > pages or some such thing.
> >
> > alloc_pages_exact() has O(n*log n) complexity with respect to the number
> > of requested pages. It would have to be reworked and optimized if it were
> > to be used for the dm-bufio cache. (it could be optimized down to O(log n)
> > if it didn't split the compound page to a lot of separate pages, but split
> > it to a power-of-two clusters instead).
>
> Well then a memory pool of page allocator requests may address that issue?
>
> Have a look at include/linux/mempool.h.
That's not what mempool is for. mempool is a cache of elements that were
allocated from slab in the first place. (OK, technically, you don't have
to use slab as the allocator, but since there is no allocator that solves
this problem, mempool doesn't solve the problem either!)
> > BTW. it could be possible to open the file
> > "/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
> > the requested value there, but it seems very dirty. It would be better to
> > have a kernel interface for that.
>
> Hehehe you could directly write to the kmem_cache structure and increase
> the order. AFAICT this would be dirty but work.
>
> But still the increased page order will get you into trouble with
> fragmentation when the system runs for a long time. That is the reason we
> try to limit the allocation sizes coming from the slab allocator.
Right; he has a fallback already (vmalloc). So ... let's just add the
interface to allow slab caches to have their order tuned by users who
really know what they're doing?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 18:55 ` Matthew Wilcox
0 siblings, 0 replies; 109+ messages in thread
From: Matthew Wilcox @ 2018-03-21 18:55 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Pekka Enberg, linux-mm, dm-devel, Mikulas Patocka,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, Mar 21, 2018 at 01:40:31PM -0500, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > > > F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> > > > allocate powers of two pages. It would be relatively easy to make
> > > > kmalloc_large round the allocation to the next page size and then allocate
> > > > N consecutive pages via alloc_pages_exact() and free the remainder unused
> > > > pages or some such thing.
> >
> > alloc_pages_exact() has O(n*log n) complexity with respect to the number
> > of requested pages. It would have to be reworked and optimized if it were
> > to be used for the dm-bufio cache. (it could be optimized down to O(log n)
> > if it didn't split the compound page to a lot of separate pages, but split
> > it to a power-of-two clusters instead).
>
> Well then a memory pool of page allocator requests may address that issue?
>
> Have a look at include/linux/mempool.h.
That's not what mempool is for. mempool is a cache of elements that were
allocated from slab in the first place. (OK, technically, you don't have
to use slab as the allocator, but since there is no allocator that solves
this problem, mempool doesn't solve the problem either!)
> > BTW. it could be possible to open the file
> > "/sys/kernel/slab/<cache>/order" from the dm-bufio kernel driver and write
> > the requested value there, but it seems very dirty. It would be better to
> > have a kernel interface for that.
>
> Hehehe you could directly write to the kmem_cache structure and increase
> the order. AFAICT this would be dirty but work.
>
> But still the increased page order will get you into trouble with
> fragmentation when the system runs for a long time. That is the reason we
> try to limit the allocation sizes coming from the slab allocator.
Right; he has a fallback already (vmalloc). So ... let's just add the
interface to allow slab caches to have their order tuned by users who
really know what they're doing?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 18:36 ` Mikulas Patocka
@ 2018-03-21 18:57 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:57 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> So, what would you recommend for allocating 640KB objects while minimizing
> wasted space?
> * alloc_pages - rounds up to the next power of two
> * kmalloc - rounds up to the next power of two
> * alloc_pages_exact - O(n*log n) complexity; and causes memory
> fragmentation if used excessively
> * vmalloc - horrible performance (modifies page tables and that causes
> synchronization across all CPUs)
>
> anything else?
Need to find it but there is a way to allocate N pages in sequence
somewhere. Otherwise mempools are something that would work.
> > > > What kind of problem could be caused here?
> > >
> > > Unlocked accesses are generally considered bad. For example, see this
> > > piece of code in calculate_sizes:
> > > s->allocflags = 0;
> > > if (order)
> > > s->allocflags |= __GFP_COMP;
> > >
> > > if (s->flags & SLAB_CACHE_DMA)
> > > s->allocflags |= GFP_DMA;
> > >
> > > if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > > s->allocflags |= __GFP_RECLAIMABLE;
> > >
> > > If you are running this while the cache is in use (i.e. when the user
> > > writes /sys/kernel/slab/<cache>/order), then other processes will see
> > > invalid s->allocflags for a short time.
> >
> > Calculating sizes is done when the slab has only a single accessor. Thus
> > no locking is needed.
>
> The calculation is done whenever someone writes to
> "/sys/kernel/slab/*/order"
But the flags you are mentioning do not change and the size of the object
does not change. What changes is the number of objects in the slab page.
> And you can obviously write to that file while the slab cache is in use. Try
> it.
You cannot change flags that would impact the size of the objects. I only
allowed changing characteristics that do not impact object size.
> I am not talking about changing the size of objects in a slab cache. I am
> talking about changing the allocation order of a slab cache while the
> cache is in use. This can be done with the sysfs interface.
But then that is something that is allowed but does not affect the object
size used by the slab allocators.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 18:57 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:57 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> So, what would you recommend for allocating 640KB objects while minimizing
> wasted space?
> * alloc_pages - rounds up to the next power of two
> * kmalloc - rounds up to the next power of two
> * alloc_pages_exact - O(n*log n) complexity; and causes memory
> fragmentation if used excessively
> * vmalloc - horrible performance (modifies page tables and that causes
> synchronization across all CPUs)
>
> anything else?
Need to find it but there is a way to allocate N pages in sequence
somewhere. Otherwise mempools are something that would work.
> > > > What kind of problem could be caused here?
> > >
> > > Unlocked accesses are generally considered bad. For example, see this
> > > piece of code in calculate_sizes:
> > > s->allocflags = 0;
> > > if (order)
> > > s->allocflags |= __GFP_COMP;
> > >
> > > if (s->flags & SLAB_CACHE_DMA)
> > > s->allocflags |= GFP_DMA;
> > >
> > > if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > > s->allocflags |= __GFP_RECLAIMABLE;
> > >
> > > If you are running this while the cache is in use (i.e. when the user
> > > writes /sys/kernel/slab/<cache>/order), then other processes will see
> > > invalid s->allocflags for a short time.
> >
> > Calculating sizes is done when the slab has only a single accessor. Thus
> > no locking is needed.
>
> The calculation is done whenever someone writes to
> "/sys/kernel/slab/*/order"
But the flags you are mentioning do not change and the size of the object
does not change. What changes is the number of objects in the slab page.
> And you can obviously write to that file while the slab cache is in use. Try
> it.
You cannot change flags that would impact the size of the objects. I only
allowed changing characteristics that do not impact object size.
> I am not talking about changing the size of objects in a slab cache. I am
> talking about changing the allocation order of a slab cache while the
> cache is in use. This can be done with the sysfs interface.
But then that is something that is allowed but does not affect the object
size used by the slab allocators.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 18:55 ` Matthew Wilcox
@ 2018-03-21 18:58 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:58 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Mikulas Patocka, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Matthew Wilcox wrote:
> > Have a look at include/linux/mempool.h.
>
> That's not what mempool is for. mempool is a cache of elements that were
> allocated from slab in the first place. (OK, technically, you don't have
> to use slab as the allocator, but since there is no allocator that solves
> this problem, mempool doesn't solve the problem either!)
You can put the page allocator in there instead of a slab allocator.
> > But still the increased page order will get you into trouble with
> > fragmentation when the system runs for a long time. That is the reason we
> > try to limit the allocation sizes coming from the slab allocator.
>
> Right; he has a fallback already (vmalloc). So ... let's just add the
> interface to allow slab caches to have their order tuned by users who
> really know what they're doing?
Ok thats trivial.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 18:58 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 18:58 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Mike Snitzer, Pekka Enberg, linux-mm, dm-devel, Mikulas Patocka,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Matthew Wilcox wrote:
> > Have a look at include/linux/mempool.h.
>
> That's not what mempool is for. mempool is a cache of elements that were
> allocated from slab in the first place. (OK, technically, you don't have
> to use slab as the allocator, but since there is no allocator that solves
> this problem, mempool doesn't solve the problem either!)
You can put the page allocator in there instead of a slab allocator.
> > But still the increased page order will get you into trouble with
> > fragmentation when the system runs for a long time. That is the reason we
> > try to limit the allocation sizes coming from the slab allocator.
>
> Right; he has a fallback already (vmalloc). So ... let's just add the
> interface to allow slab caches to have their order tuned by users who
> really know what they're doing?
Ok thats trivial.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 18:57 ` Christopher Lameter
@ 2018-03-21 19:19 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 19:19 UTC (permalink / raw)
To: Christopher Lameter
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > So, what would you recommend for allocating 640KB objects while minimizing
> > wasted space?
> > * alloc_pages - rounds up to the next power of two
> > * kmalloc - rounds up to the next power of two
> > * alloc_pages_exact - O(n*log n) complexity; and causes memory
> > fragmentation if used excessively
> > * vmalloc - horrible performance (modifies page tables and that causes
> > synchronization across all CPUs)
> >
> > anything else?
>
> Need to find it but there is a way to allocate N pages in sequence
> somewhere. Otherwise mempools are something that would work.
There's also the contiguous memory allocator (CMA), but it needs its memory to be
reserved at boot time. It is intended for misdesigned hardware devices
that need contiguous memory for DMA. As it's intended for one-time
allocations when loading drivers, it lacks the performance and scalability
of the slab cache and alloc_pages.
> > > > > What kind of problem could be caused here?
> > > >
> > > > Unlocked accesses are generally considered bad. For example, see this
> > > > piece of code in calculate_sizes:
> > > > s->allocflags = 0;
> > > > if (order)
> > > > s->allocflags |= __GFP_COMP;
> > > >
> > > > if (s->flags & SLAB_CACHE_DMA)
> > > > s->allocflags |= GFP_DMA;
> > > >
> > > > if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > > > s->allocflags |= __GFP_RECLAIMABLE;
> > > >
> > > > If you are running this while the cache is in use (i.e. when the user
> > > > writes /sys/kernel/slab/<cache>/order), then other processes will see
> > > > invalid s->allocflags for a short time.
> > >
> > > Calculating sizes is done when the slab has only a single accessor. Thus
> > > no locking is needed.
> >
> > The calculation is done whenever someone writes to
> > "/sys/kernel/slab/*/order"
>
> But the flags you are mentioning do not change and the size of the object
> does not change. What changes is the number of objects in the slab page.
See this code again:
> > > s->allocflags = 0;
> > > if (order)
> > > s->allocflags |= __GFP_COMP;
> > >
> > > if (s->flags & SLAB_CACHE_DMA)
> > > s->allocflags |= GFP_DMA;
> > >
> > > if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > > s->allocflags |= __GFP_RECLAIMABLE;
when this function is called, the value s->allocflags does change. At the
end, s->allocflags holds the same value as before, but it changes
temporarily.
For example, if someone creates a slab cache with the flag SLAB_CACHE_DMA,
and he allocates an object from this cache and this allocation races with
the user writing to /sys/kernel/slab/cache/order - then the allocator can
for a small period of time see "s->allocflags == 0" and allocate a non-DMA
page. That is a bug.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 19:19 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 19:19 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > So, what would you recommend for allocating 640KB objects while minimizing
> > wasted space?
> > * alloc_pages - rounds up to the next power of two
> > * kmalloc - rounds up to the next power of two
> > * alloc_pages_exact - O(n*log n) complexity; and causes memory
> > fragmentation if used excessively
> > * vmalloc - horrible performance (modifies page tables and that causes
> > synchronization across all CPUs)
> >
> > anything else?
>
> Need to find it but there is a way to allocate N pages in sequence
> somewhere. Otherwise mempools are something that would work.
There's also the contiguous memory allocator (CMA), but it needs its memory to be
reserved at boot time. It is intended for misdesigned hardware devices
that need contiguous memory for DMA. As it's intended for one-time
allocations when loading drivers, it lacks the performance and scalability
of the slab cache and alloc_pages.
> > > > > What kind of problem could be caused here?
> > > >
> > > > Unlocked accesses are generally considered bad. For example, see this
> > > > piece of code in calculate_sizes:
> > > > s->allocflags = 0;
> > > > if (order)
> > > > s->allocflags |= __GFP_COMP;
> > > >
> > > > if (s->flags & SLAB_CACHE_DMA)
> > > > s->allocflags |= GFP_DMA;
> > > >
> > > > if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > > > s->allocflags |= __GFP_RECLAIMABLE;
> > > >
> > > > If you are running this while the cache is in use (i.e. when the user
> > > > writes /sys/kernel/slab/<cache>/order), then other processes will see
> > > > invalid s->allocflags for a short time.
> > >
> > > Calculating sizes is done when the slab has only a single accessor. Thus
> > > no locking is needed.
> >
> > The calculation is done whenever someone writes to
> > "/sys/kernel/slab/*/order"
>
> But the flags you are mentioning do not change and the size of the object
> does not change. What changes is the number of objects in the slab page.
See this code again:
> > > s->allocflags = 0;
> > > if (order)
> > > s->allocflags |= __GFP_COMP;
> > >
> > > if (s->flags & SLAB_CACHE_DMA)
> > > s->allocflags |= GFP_DMA;
> > >
> > > if (s->flags & SLAB_RECLAIM_ACCOUNT)
> > > s->allocflags |= __GFP_RECLAIMABLE;
when this function is called, the value s->allocflags does change. At the
end, s->allocflags holds the same value as before, but it changes
temporarily.
For example, if someone creates a slab cache with the flag SLAB_CACHE_DMA,
and he allocates an object from this cache and this allocation races with
the user writing to /sys/kernel/slab/cache/order - then the allocator can
for a small period of time see "s->allocflags == 0" and allocate a non-DMA
page. That is a bug.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 17:39 ` Christopher Lameter
@ 2018-03-21 19:25 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 19:25 UTC (permalink / raw)
To: Christopher Lameter
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> One other thought: If you want to improve the behavior for large scale
> objects allocated through kmalloc/kmemcache then we would certainly be
> glad to entertain those ideas.
>
> F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> allocate powers of two pages. It would be relatively easy to make
> kmalloc_large round the allocation to the next page size and then allocate
> N consecutive pages via alloc_pages_exact() and free the remainder unused
> pages or some such thing.
It may be possible, but we'd need to improve the horrible complexity of
alloc_pages_exact().
This is a trade-off between performance and waste. A power-of-two
allocation can be done quickly, but it wastes a lot of space.
alloc_pages_exact() wastes less space, but it is slow.
The question is - how many of these large-kmalloc allocations are
short-lived and how many are long-lived? I don't know, I haven't measured
it.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 19:25 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 19:25 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> One other thought: If you want to improve the behavior for large scale
> objects allocated through kmalloc/kmemcache then we would certainly be
> glad to entertain those ideas.
>
> F.e. you could optimize the allocations > 2x PAGE_SIZE so that they do not
> allocate powers of two pages. It would be relatively easy to make
> kmalloc_large round the allocation to the next page size and then allocate
> N consecutive pages via alloc_pages_exact() and free the remainder unused
> pages or some such thing.
It may be possible, but we'd need to improve the horrible complexity of
alloc_pages_exact().
This is a trade-off between performance and waste. A power-of-two
allocation can be done quickly, but it wastes a lot of space.
alloc_pages_exact() wastes less space, but it is slow.
The question is - how many of these large-kmalloc allocations are
short-lived and how many are long-lived? I don't know, I haven't measured
it.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 19:19 ` Mikulas Patocka
@ 2018-03-21 20:09 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 20:09 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> For example, if someone creates a slab cache with the flag SLAB_CACHE_DMA,
> and he allocates an object from this cache and this allocation races with
> the user writing to /sys/kernel/slab/cache/order - then the allocator can
> for a small period of time see "s->allocflags == 0" and allocate a non-DMA
> page. That is a bug.
True we need to fix that:
Subject: Avoid potentially visible allocflags without all flags set
During slab size recalculation s->allocflags may be temporarily set
to 0 and thus the flags may not be set which may result in the wrong
flags being passed. Slab size calculation happens in two cases:
1. When a slab is created (which is safe since we cannot have
concurrent allocations)
2. When the slab order is changed via /sysfs.
Signed-off-by: Christoph Lameter <cl@linux.com>
Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -3457,6 +3457,7 @@ static void set_cpu_partial(struct kmem_
static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
slab_flags_t flags = s->flags;
+ gfp_t allocflags;
size_t size = s->object_size;
int order;
@@ -3551,16 +3552,17 @@ static int calculate_sizes(struct kmem_c
if (order < 0)
return 0;
- s->allocflags = 0;
+ allocflags = 0;
if (order)
- s->allocflags |= __GFP_COMP;
+ allocflags |= __GFP_COMP;
if (s->flags & SLAB_CACHE_DMA)
- s->allocflags |= GFP_DMA;
+ allocflags |= GFP_DMA;
if (s->flags & SLAB_RECLAIM_ACCOUNT)
- s->allocflags |= __GFP_RECLAIMABLE;
+ allocflags |= __GFP_RECLAIMABLE;
+ s->allocflags = allocflags;
/*
* Determine the number of objects per slab
*/
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 20:09 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-21 20:09 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> For example, if someone creates a slab cache with the flag SLAB_CACHE_DMA,
> and he allocates an object from this cache and this allocation races with
> the user writing to /sys/kernel/slab/cache/order - then the allocator can
> for a small period of time see "s->allocflags == 0" and allocate a non-DMA
> page. That is a bug.
True we need to fix that:
Subject: Avoid potentially visible allocflags without all flags set
During slab size recalculation s->allocflags may be temporarily set
to 0 and thus the flags may not be set which may result in the wrong
flags being passed. Slab size calculation happens in two cases:
1. When a slab is created (which is safe since we cannot have
concurrent allocations)
2. When the slab order is changed via /sysfs.
Signed-off-by: Christoph Lameter <cl@linux.com>
Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -3457,6 +3457,7 @@ static void set_cpu_partial(struct kmem_
static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
slab_flags_t flags = s->flags;
+ gfp_t allocflags;
size_t size = s->object_size;
int order;
@@ -3551,16 +3552,17 @@ static int calculate_sizes(struct kmem_c
if (order < 0)
return 0;
- s->allocflags = 0;
+ allocflags = 0;
if (order)
- s->allocflags |= __GFP_COMP;
+ allocflags |= __GFP_COMP;
if (s->flags & SLAB_CACHE_DMA)
- s->allocflags |= GFP_DMA;
+ allocflags |= GFP_DMA;
if (s->flags & SLAB_RECLAIM_ACCOUNT)
- s->allocflags |= __GFP_RECLAIMABLE;
+ allocflags |= __GFP_RECLAIMABLE;
+ s->allocflags = allocflags;
/*
* Determine the number of objects per slab
*/
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 20:09 ` Christopher Lameter
@ 2018-03-21 20:37 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 20:37 UTC (permalink / raw)
To: Christopher Lameter
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > For example, if someone creates a slab cache with the flag SLAB_CACHE_DMA,
> > and he allocates an object from this cache and this allocation races with
> > the user writing to /sys/kernel/slab/cache/order - then the allocator can
> > for a small period of time see "s->allocflags == 0" and allocate a non-DMA
> > page. That is a bug.
>
> True we need to fix that:
>
> Subject: Avoid potentially visible allocflags without all flags set
>
> During slab size recalculation s->allocflags may be temporarily set
> to 0 and thus the flags may not be set which may result in the wrong
> flags being passed. Slab size calculation happens in two cases:
>
> 1. When a slab is created (which is safe since we cannot have
> concurrent allocations)
>
> 2. When the slab order is changed via /sysfs.
>
> Signed-off-by: Christoph Lameter <cl@linux.com>
>
>
> Index: linux/mm/slub.c
> ===================================================================
> --- linux.orig/mm/slub.c
> +++ linux/mm/slub.c
> @@ -3457,6 +3457,7 @@ static void set_cpu_partial(struct kmem_
> static int calculate_sizes(struct kmem_cache *s, int forced_order)
> {
> slab_flags_t flags = s->flags;
> + gfp_t allocflags;
> size_t size = s->object_size;
> int order;
>
> @@ -3551,16 +3552,17 @@ static int calculate_sizes(struct kmem_c
> if (order < 0)
> return 0;
>
> - s->allocflags = 0;
> + allocflags = 0;
> if (order)
> - s->allocflags |= __GFP_COMP;
> + allocflags |= __GFP_COMP;
>
> if (s->flags & SLAB_CACHE_DMA)
> - s->allocflags |= GFP_DMA;
> + allocflags |= GFP_DMA;
>
> if (s->flags & SLAB_RECLAIM_ACCOUNT)
> - s->allocflags |= __GFP_RECLAIMABLE;
> + allocflags |= __GFP_RECLAIMABLE;
>
> + s->allocflags = allocflags;
I'd also use "WRITE_ONCE(s->allocflags, allocflags)" here and when writing
s->oo and s->min to avoid some possible compiler misoptimizations.
WRITE_ONCE should be used when writing a value that can be read
simultaneously (but a lot of kernel code misses it).
Another problem is that it updates s->oo and later it updates s->max:
s->oo = oo_make(order, size, s->reserved);
s->min = oo_make(get_order(size), size, s->reserved);
if (oo_objects(s->oo) > oo_objects(s->max))
s->max = s->oo;
--- so, the concurrently running code could see s->oo > s->max, which
could trigger some memory corruption.
s->max is only used in memory allocations -
kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long)), so
perhaps we could fix the bug by removing s->max at all and always
allocating enough memory for the maximum possible number of objects?
- kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL);
+ kmalloc(BITS_TO_LONGS(MAX_OBJS_PER_PAGE) * sizeof(unsigned long), GFP_KERNEL);
Mikulas
> /*
> * Determine the number of objects per slab
> */
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-21 20:37 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-21 20:37 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > For example, if someone creates a slab cache with the flag SLAB_CACHE_DMA,
> > and he allocates an object from this cache and this allocation races with
> > the user writing to /sys/kernel/slab/cache/order - then the allocator can
> > for a small period of time see "s->allocflags == 0" and allocate a non-DMA
> > page. That is a bug.
>
> True we need to fix that:
>
> Subject: Avoid potentially visible allocflags without all flags set
>
> During slab size recalculation s->allocflags may be temporarily set
> to 0 and thus the flags may not be set which may result in the wrong
> flags being passed. Slab size calculation happens in two cases:
>
> 1. When a slab is created (which is safe since we cannot have
> concurrent allocations)
>
> 2. When the slab order is changed via /sysfs.
>
> Signed-off-by: Christoph Lameter <cl@linux.com>
>
>
> Index: linux/mm/slub.c
> ===================================================================
> --- linux.orig/mm/slub.c
> +++ linux/mm/slub.c
> @@ -3457,6 +3457,7 @@ static void set_cpu_partial(struct kmem_
> static int calculate_sizes(struct kmem_cache *s, int forced_order)
> {
> slab_flags_t flags = s->flags;
> + gfp_t allocflags;
> size_t size = s->object_size;
> int order;
>
> @@ -3551,16 +3552,17 @@ static int calculate_sizes(struct kmem_c
> if (order < 0)
> return 0;
>
> - s->allocflags = 0;
> + allocflags = 0;
> if (order)
> - s->allocflags |= __GFP_COMP;
> + allocflags |= __GFP_COMP;
>
> if (s->flags & SLAB_CACHE_DMA)
> - s->allocflags |= GFP_DMA;
> + allocflags |= GFP_DMA;
>
> if (s->flags & SLAB_RECLAIM_ACCOUNT)
> - s->allocflags |= __GFP_RECLAIMABLE;
> + allocflags |= __GFP_RECLAIMABLE;
>
> + s->allocflags = allocflags;
I'd also use "WRITE_ONCE(s->allocflags, allocflags)" here and when writing
s->oo and s->min to avoid some possible compiler misoptimizations.
WRITE_ONCE should be used when writing a value that can be read
simultaneously (but a lot of kernel code misses it).
Another problem is that it updates s->oo and later it updates s->max:
s->oo = oo_make(order, size, s->reserved);
s->min = oo_make(get_order(size), size, s->reserved);
if (oo_objects(s->oo) > oo_objects(s->max))
s->max = s->oo;
--- so, the concurrently running code could see s->oo > s->max, which
could trigger some memory corruption.
s->max is only used in memory allocations -
kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long)), so
perhaps we could fix the bug by removing s->max at all and always
allocating enough memory for the maximum possible number of objects?
- kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL);
+ kmalloc(BITS_TO_LONGS(MAX_OBJS_PER_PAGE) * sizeof(unsigned long), GFP_KERNEL);
Mikulas
> /*
> * Determine the number of objects per slab
> */
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 20:37 ` Mikulas Patocka
@ 2018-03-23 15:10 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-23 15:10 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> > + s->allocflags = allocflags;
>
> I'd also use "WRITE_ONCE(s->allocflags, allocflags)" here and when writing
> s->oo and s->min to avoid some possible compiler misoptimizations.
It only matters that 0 etc is never written.
> Another problem is that it updates s->oo and later it updates s->max:
> s->oo = oo_make(order, size, s->reserved);
> s->min = oo_make(get_order(size), size, s->reserved);
> if (oo_objects(s->oo) > oo_objects(s->max))
> s->max = s->oo;
> --- so, the concurrently running code could see s->oo > s->max, which
> could trigger some memory corruption.
Well s->max is only relevant for code that analyses the details of slab
structures for diagnostics.
> s->max is only used in memory allocations -
> kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long)), so
> perhaps we could fix the bug by removing s->max at all and always
> allocating enough memory for the maximum possible number of objects?
>
> - kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL);
> + kmalloc(BITS_TO_LONGS(MAX_OBJS_PER_PAGE) * sizeof(unsigned long), GFP_KERNEL);
MAX_OBJS_PER_PAGE is 32k. So you are looking at contiguous allocations of
256kbyte. Not good.
The simplest measure would be to disallow the changing of the order while
the slab contains objects.
Subject: slub: Disallow order changes when objects exist in a slab
There seems to be a couple of races that would have to be
addressed if the slab order would be changed during active use.
Let's disallow this in the same way as we also do not allow
other changes of slab characteristics when objects are active.
Signed-off-by: Christoph Lameter <cl@linux.com>
Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -4919,6 +4919,9 @@ static ssize_t order_store(struct kmem_c
unsigned long order;
int err;
+ if (any_slab_objects(s))
+ return -EBUSY;
+
err = kstrtoul(buf, 10, &order);
if (err)
return err;
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-23 15:10 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-23 15:10 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> > + s->allocflags = allocflags;
>
> I'd also use "WRITE_ONCE(s->allocflags, allocflags)" here and when writing
> s->oo and s->min to avoid some possible compiler misoptimizations.
It only matters that 0 etc is never written.
> Another problem is that it updates s->oo and later it updates s->max:
> s->oo = oo_make(order, size, s->reserved);
> s->min = oo_make(get_order(size), size, s->reserved);
> if (oo_objects(s->oo) > oo_objects(s->max))
> s->max = s->oo;
> --- so, the concurrently running code could see s->oo > s->max, which
> could trigger some memory corruption.
Well s->max is only relevant for code that analyses the details of slab
structures for diagnostics.
> s->max is only used in memory allocations -
> kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long)), so
> perhaps we could fix the bug by removing s->max at all and always
> allocating enough memory for the maximum possible number of objects?
>
> - kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL);
> + kmalloc(BITS_TO_LONGS(MAX_OBJS_PER_PAGE) * sizeof(unsigned long), GFP_KERNEL);
MAX_OBJS_PER_PAGE is 32k. So you are looking at contiguous allocations of
256kbyte. Not good.
The simplest measure would be to disallow the changing of the order while
the slab contains objects.
Subject: slub: Disallow order changes when objects exist in a slab
There seems to be a couple of races that would have to be
addressed if the slab order would be changed during active use.
Let's disallow this in the same way as we also do not allow
other changes of slab characteristics when objects are active.
Signed-off-by: Christoph Lameter <cl@linux.com>
Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -4919,6 +4919,9 @@ static ssize_t order_store(struct kmem_c
unsigned long order;
int err;
+ if (any_slab_objects(s))
+ return -EBUSY;
+
err = kstrtoul(buf, 10, &order);
if (err)
return err;
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-23 15:10 ` Christopher Lameter
@ 2018-03-23 15:31 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-23 15:31 UTC (permalink / raw)
To: Christopher Lameter
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Fri, 23 Mar 2018, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > > + s->allocflags = allocflags;
> >
> > I'd also use "WRITE_ONCE(s->allocflags, allocflags)" here and when writing
> > s->oo and s->min to avoid some possible compiler misoptimizations.
>
> It only matters that 0 etc is never written.
The C11 standard says that reads and writes of the same variable shouldn't
race (even if you write the same value as before), and consequently, the
compiler can make transformations based on this assumption. For example,
the compiler optimization may transform the following code
> allocflags = 0;
> if (order)
> allocflags |= __GFP_COMP;
> if (s->flags & SLAB_CACHE_DMA)
> allocflags |= GFP_DMA;
> if (s->flags & SLAB_RECLAIM_ACCOUNT)
> allocflags |= __GFP_RECLAIMABLE;
> s->allocflags = allocflags;
back into:
> s->allocflags = 0;
> if (order)
> s->allocflags |= __GFP_COMP;
> if (s->flags & SLAB_CACHE_DMA)
> s->allocflags |= GFP_DMA;
> if (s->flags & SLAB_RECLAIM_ACCOUNT)
> s->allocflags |= __GFP_RECLAIMABLE;
Afaik, gcc currently doesn't do this transformation, but it's better to
write standard-compliant code and use the macro WRITE_ONCE for variables
that may be concurrently read and written.
> > Another problem is that it updates s->oo and later it updates s->max:
> > s->oo = oo_make(order, size, s->reserved);
> > s->min = oo_make(get_order(size), size, s->reserved);
> > if (oo_objects(s->oo) > oo_objects(s->max))
> > s->max = s->oo;
> > --- so, the concurrently running code could see s->oo > s->max, which
> > could trigger some memory corruption.
>
> Well s->max is only relevant for code that analyses the details of slab
> structures for diagnostics.
>
> > s->max is only used in memory allocations -
> > kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long)), so
> > perhaps we could fix the bug by removing s->max at all and always
> > allocating enough memory for the maximum possible number of objects?
> >
> > - kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL);
> > + kmalloc(BITS_TO_LONGS(MAX_OBJS_PER_PAGE) * sizeof(unsigned long), GFP_KERNEL);
>
> MAX_OBJS_PER_PAGE is 32k. So you are looking at contiguous allocations of
> 256kbyte. Not good.
I think it's one bit per object, so the total size of the allocation is
4k. Allocating 4k shouldn't be a problem.
> The simplest measure would be to disallow the changing of the order while
> the slab contains objects.
>
>
> Subject: slub: Disallow order changes when objects exist in a slab
>
> There seems to be a couple of races that would have to be
> addressed if the slab order would be changed during active use.
>
> Let's disallow this in the same way as we also do not allow
> other changes of slab characteristics when objects are active.
>
> Signed-off-by: Christoph Lameter <cl@linux.com>
>
> Index: linux/mm/slub.c
> ===================================================================
> --- linux.orig/mm/slub.c
> +++ linux/mm/slub.c
> @@ -4919,6 +4919,9 @@ static ssize_t order_store(struct kmem_c
> unsigned long order;
> int err;
>
> + if (any_slab_objects(s))
> + return -EBUSY;
> +
> err = kstrtoul(buf, 10, &order);
> if (err)
> return err;
This test isn't locked against anything, so it may race with concurrent
allocation. "any_slab_objects" may return false and a new object in the
slab cache may appear immediately after that.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-23 15:31 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-03-23 15:31 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Fri, 23 Mar 2018, Christopher Lameter wrote:
> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>
> > > + s->allocflags = allocflags;
> >
> > I'd also use "WRITE_ONCE(s->allocflags, allocflags)" here and when writing
> > s->oo and s->min to avoid some possible compiler misoptimizations.
>
> It only matters that 0 etc is never written.
The C11 standard says that reads and writes of the same variable shouldn't
race (even if you write the same value as before), and consequently, the
compiler can make transformations based on this assumption. For example,
the compiler optimization may transform the following code
> allocflags = 0;
> if (order)
> allocflags |= __GFP_COMP;
> if (s->flags & SLAB_CACHE_DMA)
> allocflags |= GFP_DMA;
> if (s->flags & SLAB_RECLAIM_ACCOUNT)
> allocflags |= __GFP_RECLAIMABLE;
> s->allocflags = allocflags;
back into:
> s->allocflags = 0;
> if (order)
> s->allocflags |= __GFP_COMP;
> if (s->flags & SLAB_CACHE_DMA)
> s->allocflags |= GFP_DMA;
> if (s->flags & SLAB_RECLAIM_ACCOUNT)
> s->allocflags |= __GFP_RECLAIMABLE;
Afaik, gcc currently doesn't do this transformation, but it's better to
write standard-compliant code and use the macro WRITE_ONCE for variables
that may be concurrently read and written.
> > Another problem is that it updates s->oo and later it updates s->max:
> > s->oo = oo_make(order, size, s->reserved);
> > s->min = oo_make(get_order(size), size, s->reserved);
> > if (oo_objects(s->oo) > oo_objects(s->max))
> > s->max = s->oo;
> > --- so, the concurrently running code could see s->oo > s->max, which
> > could trigger some memory corruption.
>
> Well s->max is only relevant for code that analyses the details of slab
> structures for diagnostics.
>
> > s->max is only used in memory allocations -
> > kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long)), so
> > perhaps we could fix the bug by removing s->max at all and always
> > allocating enough memory for the maximum possible number of objects?
> >
> > - kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL);
> > + kmalloc(BITS_TO_LONGS(MAX_OBJS_PER_PAGE) * sizeof(unsigned long), GFP_KERNEL);
>
> MAX_OBJS_PER_PAGE is 32k. So you are looking at contiguous allocations of
> 256kbyte. Not good.
I think it's one bit per object, so the total size of the allocation is
4k. Allocating 4k shouldn't be a problem.
> The simplest measure would be to disallow the changing of the order while
> the slab contains objects.
>
>
> Subject: slub: Disallow order changes when objects exist in a slab
>
> There seems to be a couple of races that would have to be
> addressed if the slab order would be changed during active use.
>
> Let's disallow this in the same way as we also do not allow
> other changes of slab characteristics when objects are active.
>
> Signed-off-by: Christoph Lameter <cl@linux.com>
>
> Index: linux/mm/slub.c
> ===================================================================
> --- linux.orig/mm/slub.c
> +++ linux/mm/slub.c
> @@ -4919,6 +4919,9 @@ static ssize_t order_store(struct kmem_c
> unsigned long order;
> int err;
>
> + if (any_slab_objects(s))
> + return -EBUSY;
> +
> err = kstrtoul(buf, 10, &order);
> if (err)
> return err;
This test isn't locked against anything, so it may race with concurrent
allocation. "any_slab_objects" may return false and a new object in the
slab cache may appear immediately after that.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-23 15:31 ` Mikulas Patocka
@ 2018-03-23 15:48 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-23 15:48 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On Fri, 23 Mar 2018, Mikulas Patocka wrote:
> This test isn't locked against anything, so it may race with concurrent
> allocation. "any_slab_objects" may return false and a new object in the
> slab cache may appear immediatelly after that.
Ok the same reasoning applies to numerous other slab configuration
settings in /sys/kernel/slab.... So we need to disable all of that or come
up with a sane way of synchronization.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-03-23 15:48 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-03-23 15:48 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On Fri, 23 Mar 2018, Mikulas Patocka wrote:
> This test isn't locked against anything, so it may race with concurrent
> allocation. "any_slab_objects" may return false and a new object in the
> slab cache may appear immediatelly after that.
Ok the same reasoning applies to numerous other slab configuration
settings in /sys/kernel/slab.... So we need to disable all of that or come
up with a sane way of synchronization.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-03-21 18:36 ` Mikulas Patocka
@ 2018-04-13 9:22 ` Vlastimil Babka
-1 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-13 9:22 UTC (permalink / raw)
To: Mikulas Patocka, Christopher Lameter
Cc: Matthew Wilcox, Pekka Enberg, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-mm, dm-devel, Mike Snitzer
On 03/21/2018 07:36 PM, Mikulas Patocka wrote:
>
>
> On Wed, 21 Mar 2018, Christopher Lameter wrote:
>
>> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>>
>>>> You should not be using the slab allocators for these. Allocate higher
>>>> order pages or numbers of consecutive smaller pagess from the page
>>>> allocator. The slab allocators are written for objects smaller than page
>>>> size.
>>>
>>> So, do you argue that I need to write my own slab cache functionality
>>> instead of using the existing slab code?
>>
>> Just use the existing page allocator calls to allocate and free the
>> memory you need.
>>
>>> I can do it - but duplicating code is bad thing.
>>
>> There is no need to duplicate anything. There is lots of infrastructure
>> already in the kernel. You just need to use the right allocation / freeing
>> calls.
>
> So, what would you recommend for allocating 640KB objects while minimizing
> wasted space?
> * alloc_pages - rounds up to the next power of two
> * kmalloc - rounds up to the next power of two
> * alloc_pages_exact - O(n*log n) complexity; and causes memory
> fragmentation if used excessively
> * vmalloc - horrible performance (modifies page tables and that causes
> synchronization across all CPUs)
>
> anything else?
>
> The slab cache with large order seems as a best choice for this.
Sorry for being late, I just read this thread and tend to agree with
Mikulas, that this is a good use case for SL*B. If we extend the
use-case from "space-efficient allocator of objects smaller than page
size" to "space-efficient allocator of objects that are not power-of-two
pages" then IMHO it turns out the implementation would be almost the
same. All other variants listed above would lead to waste of memory or
fragmentation.
Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
attending, or anyone else that can vouch for your usecase?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-13 9:22 ` Vlastimil Babka
0 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-13 9:22 UTC (permalink / raw)
To: Mikulas Patocka, Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On 03/21/2018 07:36 PM, Mikulas Patocka wrote:
>
>
> On Wed, 21 Mar 2018, Christopher Lameter wrote:
>
>> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
>>
>>>> You should not be using the slab allocators for these. Allocate higher
>>>> order pages or numbers of consecutive smaller pagess from the page
>>>> allocator. The slab allocators are written for objects smaller than page
>>>> size.
>>>
>>> So, do you argue that I need to write my own slab cache functionality
>>> instead of using the existing slab code?
>>
>> Just use the existing page allocator calls to allocate and free the
>> memory you need.
>>
>>> I can do it - but duplicating code is bad thing.
>>
>> There is no need to duplicate anything. There is lots of infrastructure
>> already in the kernel. You just need to use the right allocation / freeing
>> calls.
>
> So, what would you recommend for allocating 640KB objects while minimizing
> wasted space?
> * alloc_pages - rounds up to the next power of two
> * kmalloc - rounds up to the next power of two
> * alloc_pages_exact - O(n*log n) complexity; and causes memory
> fragmentation if used excessively
> * vmalloc - horrible performance (modifies page tables and that causes
> synchronization across all CPUs)
>
> anything else?
>
> The slab cache with large order seems as a best choice for this.
Sorry for being late, I just read this thread and tend to agree with
Mikulas, that this is a good use case for SL*B. If we extend the
use-case from "space-efficient allocator of objects smaller than page
size" to "space-efficient allocator of objects that are not power-of-two
pages" then IMHO it turns out the implementation would be almost the
same. All other variants listed above would lead to waste of memory or
fragmentation.
Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
attending, or anyone else that can vouch for your usecase?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-13 9:22 ` Vlastimil Babka
@ 2018-04-13 15:10 ` Mike Snitzer
-1 siblings, 0 replies; 109+ messages in thread
From: Mike Snitzer @ 2018-04-13 15:10 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Mikulas Patocka, Christopher Lameter, Matthew Wilcox,
Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
Andrew Morton
On Fri, Apr 13 2018 at 5:22am -0400,
Vlastimil Babka <vbabka@suse.cz> wrote:
> On 03/21/2018 07:36 PM, Mikulas Patocka wrote:
> >
> >
> > On Wed, 21 Mar 2018, Christopher Lameter wrote:
> >
> >> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> >>
> >>>> You should not be using the slab allocators for these. Allocate higher
> >>>> order pages or numbers of consecutive smaller pagess from the page
> >>>> allocator. The slab allocators are written for objects smaller than page
> >>>> size.
> >>>
> >>> So, do you argue that I need to write my own slab cache functionality
> >>> instead of using the existing slab code?
> >>
> >> Just use the existing page allocator calls to allocate and free the
> >> memory you need.
> >>
> >>> I can do it - but duplicating code is bad thing.
> >>
> >> There is no need to duplicate anything. There is lots of infrastructure
> >> already in the kernel. You just need to use the right allocation / freeing
> >> calls.
> >
> > So, what would you recommend for allocating 640KB objects while minimizing
> > wasted space?
> > * alloc_pages - rounds up to the next power of two
> > * kmalloc - rounds up to the next power of two
> > * alloc_pages_exact - O(n*log n) complexity; and causes memory
> > fragmentation if used excessively
> > * vmalloc - horrible performance (modifies page tables and that causes
> > synchronization across all CPUs)
> >
> > anything else?
> >
> > The slab cache with large order seems as a best choice for this.
>
> Sorry for being late, I just read this thread and tend to agree with
> Mikulas, that this is a good use case for SL*B. If we extend the
> use-case from "space-efficient allocator of objects smaller than page
> size" to "space-efficient allocator of objects that are not power-of-two
> pages" then IMHO it turns out the implementation would be almost the
> same. All other variants listed above would lead to waste of memory or
> fragmentation.
>
> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> attending, or anyone else that can vouch for your usecase?
Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
the 4.17 merge window).
Mike
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-13 15:10 ` Mike Snitzer
0 siblings, 0 replies; 109+ messages in thread
From: Mike Snitzer @ 2018-04-13 15:10 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Andrew Morton, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
Mikulas Patocka, David Rientjes, Christopher Lameter,
Joonsoo Kim
On Fri, Apr 13 2018 at 5:22am -0400,
Vlastimil Babka <vbabka@suse.cz> wrote:
> On 03/21/2018 07:36 PM, Mikulas Patocka wrote:
> >
> >
> > On Wed, 21 Mar 2018, Christopher Lameter wrote:
> >
> >> On Wed, 21 Mar 2018, Mikulas Patocka wrote:
> >>
> >>>> You should not be using the slab allocators for these. Allocate higher
> >>>> order pages or numbers of consecutive smaller pagess from the page
> >>>> allocator. The slab allocators are written for objects smaller than page
> >>>> size.
> >>>
> >>> So, do you argue that I need to write my own slab cache functionality
> >>> instead of using the existing slab code?
> >>
> >> Just use the existing page allocator calls to allocate and free the
> >> memory you need.
> >>
> >>> I can do it - but duplicating code is bad thing.
> >>
> >> There is no need to duplicate anything. There is lots of infrastructure
> >> already in the kernel. You just need to use the right allocation / freeing
> >> calls.
> >
> > So, what would you recommend for allocating 640KB objects while minimizing
> > wasted space?
> > * alloc_pages - rounds up to the next power of two
> > * kmalloc - rounds up to the next power of two
> > * alloc_pages_exact - O(n*log n) complexity; and causes memory
> > fragmentation if used excessively
> > * vmalloc - horrible performance (modifies page tables and that causes
> > synchronization across all CPUs)
> >
> > anything else?
> >
> > The slab cache with large order seems as a best choice for this.
>
> Sorry for being late, I just read this thread and tend to agree with
> Mikulas, that this is a good use case for SL*B. If we extend the
> use-case from "space-efficient allocator of objects smaller than page
> size" to "space-efficient allocator of objects that are not power-of-two
> pages" then IMHO it turns out the implementation would be almost the
> same. All other variants listed above would lead to waste of memory or
> fragmentation.
>
> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> attending, or anyone else that can vouch for your usecase?
Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
the 4.17 merge window).
Mike
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-13 15:10 ` Mike Snitzer
@ 2018-04-16 12:38 ` Vlastimil Babka
-1 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-16 12:38 UTC (permalink / raw)
To: Mike Snitzer
Cc: Mikulas Patocka, Christopher Lameter, Matthew Wilcox,
Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
Andrew Morton
On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> On Fri, Apr 13 2018 at 5:22am -0400,
> Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
>> attending, or anyone else that can vouch for your usecase?
>
> Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
>
> Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> the 4.17 merge window).
Can you or Mikulas briefly summarize how the dependency is avoided, and
whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
dm-bufio code would happily switch to it, or not?
Thanks,
Vlastimil
> Mike
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 12:38 ` Vlastimil Babka
0 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-16 12:38 UTC (permalink / raw)
To: Mike Snitzer
Cc: Andrew Morton, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
Mikulas Patocka, David Rientjes, Christopher Lameter,
Joonsoo Kim
On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> On Fri, Apr 13 2018 at 5:22am -0400,
> Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
>> attending, or anyone else that can vouch for your usecase?
>
> Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
>
> Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> the 4.17 merge window).
Can you or Mikulas briefly summarize how the dependency is avoided, and
whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
dm-bufio code would happily switch to it, or not?
Thanks,
Vlastimil
> Mike
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 12:38 ` Vlastimil Babka
@ 2018-04-16 14:27 ` Mike Snitzer
-1 siblings, 0 replies; 109+ messages in thread
From: Mike Snitzer @ 2018-04-16 14:27 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Mikulas Patocka, Christopher Lameter, Matthew Wilcox,
Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
Andrew Morton
On Mon, Apr 16 2018 at 8:38am -0400,
Vlastimil Babka <vbabka@suse.cz> wrote:
> On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > On Fri, Apr 13 2018 at 5:22am -0400,
> > Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> >> attending, or anyone else that can vouch for your usecase?
> >
> > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> >
> > Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > the 4.17 merge window).
>
> Can you or Mikulas briefly summarize how the dependency is avoided, and
> whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> dm-bufio code would happily switch to it, or not?
git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
But the most significant commit relative to SLAB_MINIMIZE_WASTE is:
359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for dm_buffer structure allocations")
So no, I don't see why dm-bufio would need to switch to
SLAB_MINIMIZE_WASTE if it were introduced in the future.
Mike
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 14:27 ` Mike Snitzer
0 siblings, 0 replies; 109+ messages in thread
From: Mike Snitzer @ 2018-04-16 14:27 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Andrew Morton, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
Mikulas Patocka, David Rientjes, Christopher Lameter,
Joonsoo Kim
On Mon, Apr 16 2018 at 8:38am -0400,
Vlastimil Babka <vbabka@suse.cz> wrote:
> On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > On Fri, Apr 13 2018 at 5:22am -0400,
> > Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> >> attending, or anyone else that can vouch for your usecase?
> >
> > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> >
> > Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > the 4.17 merge window).
>
> Can you or Mikulas briefly summarize how the dependency is avoided, and
> whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> dm-bufio code would happily switch to it, or not?
git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
But the most significant commit relative to SLAB_MINIMIZE_WASTE is:
359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for dm_buffer structure allocations")
So no, I don't see why dm-bufio would need to switch to
SLAB_MINIMIZE_WASTE if it were introduced in the future.
Mike
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 14:27 ` Mike Snitzer
@ 2018-04-16 14:37 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 14:37 UTC (permalink / raw)
To: Mike Snitzer
Cc: Vlastimil Babka, Christopher Lameter, Matthew Wilcox,
Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
Andrew Morton
On Mon, 16 Apr 2018, Mike Snitzer wrote:
> On Mon, Apr 16 2018 at 8:38am -0400,
> Vlastimil Babka <vbabka@suse.cz> wrote:
>
> > On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > > On Fri, Apr 13 2018 at 5:22am -0400,
> > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > >>
> > >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> > >> attending, or anyone else that can vouch for your usecase?
> > >
> > > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > >
> > > Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> > > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > > the 4.17 merge window).
> >
> > Can you or Mikulas briefly summarize how the dependency is avoided, and
> > whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> > dm-bufio code would happily switch to it, or not?
>
> git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
>
> But the most signficant commit relative to SLAB_MINIMIZE_WASTE is:
> 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for
> dm_buffer structure allocations")
>
> So no, I don't see why dm-bufio would need to switch to
> SLAB_MINIMIZE_WASTE if it were introduced in the future.
Currently, the slab cache rounds up the size of the slab to the next power
of two (if the size is large). And that wastes memory if that memory were
to be used for deduplication tables.
Generally, the performance of the deduplication solution depends on how
much data can you put to memory. If you round 640KB buffer to 1MB (this is
what the slab and slub subsystem currently do), you waste a lot of memory.
Deduplication indices with 640KB blocks are already used in the wild, so
it can't be easily changed.
> Mike
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 14:37 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 14:37 UTC (permalink / raw)
To: Mike Snitzer
Cc: Andrew Morton, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Christopher Lameter, Joonsoo Kim,
Vlastimil Babka
On Mon, 16 Apr 2018, Mike Snitzer wrote:
> On Mon, Apr 16 2018 at 8:38am -0400,
> Vlastimil Babka <vbabka@suse.cz> wrote:
>
> > On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > > On Fri, Apr 13 2018 at 5:22am -0400,
> > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > >>
> > >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> > >> attending, or anyone else that can vouch for your usecase?
> > >
> > > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > >
> > > Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> > > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > > the 4.17 merge window).
> >
> > Can you or Mikulas briefly summarize how the dependency is avoided, and
> > whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> > dm-bufio code would happily switch to it, or not?
>
> git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
>
> But the most signficant commit relative to SLAB_MINIMIZE_WASTE is:
> 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for
> dm_buffer structure allocations")
>
> So no, I don't see why dm-bufio would need to switch to
> SLAB_MINIMIZE_WASTE if it were introduced in the future.
Currently, the slab cache rounds up the size of the slab to the next power
of two (if the size is large). And that wastes memory if that memory were
to be used for deduplication tables.
Generally, the performance of the deduplication solution depends on how
much data can you put to memory. If you round 640KB buffer to 1MB (this is
what the slab and slub subsystem currently do), you waste a lot of memory.
Deduplication indices with 640KB blocks are already used in the wild, so
it can't be easily changed.
> Mike
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 14:37 ` Mikulas Patocka
@ 2018-04-16 14:46 ` Mike Snitzer
-1 siblings, 0 replies; 109+ messages in thread
From: Mike Snitzer @ 2018-04-16 14:46 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Vlastimil Babka, Christopher Lameter, Matthew Wilcox,
Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
Andrew Morton
On Mon, Apr 16 2018 at 10:37am -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:
>
>
> On Mon, 16 Apr 2018, Mike Snitzer wrote:
>
> > On Mon, Apr 16 2018 at 8:38am -0400,
> > Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > > On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > > > On Fri, Apr 13 2018 at 5:22am -0400,
> > > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > > >>
> > > >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> > > >> attending, or anyone else that can vouch for your usecase?
> > > >
> > > > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > > >
> > > > Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> > > > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > > > the 4.17 merge window).
> > >
> > > Can you or Mikulas briefly summarize how the dependency is avoided, and
> > > whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> > > dm-bufio code would happily switch to it, or not?
> >
> > git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
> >
> > But the most signficant commit relative to SLAB_MINIMIZE_WASTE is:
> > 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for
> > dm_buffer structure allocations")
> >
> > So no, I don't see why dm-bufio would need to switch to
> > SLAB_MINIMIZE_WASTE if it were introduced in the future.
>
> Currently, the slab cache rounds up the size of the slab to the next power
> of two (if the size is large). And that wastes memory if that memory were
> to be used for deduplication tables.
You mean on an overall size of the cache level? Or on a per-object
level? I can only imagine you mean the former.
> Generally, the performance of the deduplication solution depends on how
> much data can you put to memory. If you round 640KB buffer to 1MB (this is
> what the slab and slub subsystem currently do), you waste a lot of memory.
> Deduplication indices with 640KB blocks are already used in the wild, so
> it can't be easily changed.
OK, seems you're suggesting a single object is rounded up.. so then this
header is very wrong?:
commit 359dbf19ab524652a2208a2a2cddccec2eede2ad
Author: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon Mar 26 20:29:45 2018 +0200
dm bufio: use slab cache for dm_buffer structure allocations
kmalloc padded to the next power of two, using a slab cache avoids this.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Please clarify further, thanks!
Mike
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 14:46 ` Mike Snitzer
0 siblings, 0 replies; 109+ messages in thread
From: Mike Snitzer @ 2018-04-16 14:46 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Andrew Morton, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Christopher Lameter, Joonsoo Kim,
Vlastimil Babka
On Mon, Apr 16 2018 at 10:37am -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:
>
>
> On Mon, 16 Apr 2018, Mike Snitzer wrote:
>
> > On Mon, Apr 16 2018 at 8:38am -0400,
> > Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > > On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > > > On Fri, Apr 13 2018 at 5:22am -0400,
> > > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > > >>
> > > >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> > > >> attending, or anyone else that can vouch for your usecase?
> > > >
> > > > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > > >
> > > > Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> > > > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > > > the 4.17 merge window).
> > >
> > > Can you or Mikulas briefly summarize how the dependency is avoided, and
> > > whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> > > dm-bufio code would happily switch to it, or not?
> >
> > git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
> >
> > But the most significant commit relative to SLAB_MINIMIZE_WASTE is:
> > 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for
> > dm_buffer structure allocations")
> >
> > So no, I don't see why dm-bufio would need to switch to
> > SLAB_MINIMIZE_WASTE if it were introduced in the future.
>
> Currently, the slab cache rounds up the size of the slab to the next power
> of two (if the size is large). And that wastes memory if that memory were
> to be used for deduplication tables.
You mean on an overall size of the cache level? Or on a per-object
level? I can only imagine you mean the former.
> Generally, the performance of the deduplication solution depends on how
> much data can you put to memory. If you round 640KB buffer to 1MB (this is
> what the slab and slub subsystem currently do), you waste a lot of memory.
> Deduplication indices with 640KB blocks are already used in the wild, so
> it can't be easily changed.
OK, seems you're suggesting a single object is rounded up.. so then this
header is very wrong?:
commit 359dbf19ab524652a2208a2a2cddccec2eede2ad
Author: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon Mar 26 20:29:45 2018 +0200
dm bufio: use slab cache for dm_buffer structure allocations
kmalloc padded to the next power of two, using a slab cache avoids this.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Please clarify further, thanks!
Mike
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 14:46 ` Mike Snitzer
@ 2018-04-16 14:57 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 14:57 UTC (permalink / raw)
To: Mike Snitzer
Cc: Vlastimil Babka, Christopher Lameter, Matthew Wilcox,
Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
Andrew Morton
On Mon, 16 Apr 2018, Mike Snitzer wrote:
> On Mon, Apr 16 2018 at 10:37am -0400,
> Mikulas Patocka <mpatocka@redhat.com> wrote:
>
> >
> >
> > On Mon, 16 Apr 2018, Mike Snitzer wrote:
> >
> > > On Mon, Apr 16 2018 at 8:38am -0400,
> > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > >
> > > > On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > > > > On Fri, Apr 13 2018 at 5:22am -0400,
> > > > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > > > >>
> > > > >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> > > > >> attending, or anyone else that can vouch for your usecase?
> > > > >
> > > > > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > > > >
> > > > > Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> > > > > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > > > > the 4.17 merge window).
> > > >
> > > > Can you or Mikulas briefly summarize how the dependency is avoided, and
> > > > whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> > > > dm-bufio code would happily switch to it, or not?
> > >
> > > git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
> > >
> > > But the most significant commit relative to SLAB_MINIMIZE_WASTE is:
> > > 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for
> > > dm_buffer structure allocations")
> > >
> > > So no, I don't see why dm-bufio would need to switch to
> > > SLAB_MINIMIZE_WASTE if it were introduced in the future.
> >
> > Currently, the slab cache rounds up the size of the slab to the next power
> > of two (if the size is large). And that wastes memory if that memory were
> > to be used for deduplication tables.
>
> You mean on an overall size of the cache level? Or on a per-object
> level? I can only imagine you mean the former.
Unfortunately, it rounds up every object. So, if you have six 640KB
objects, it consumes 6MB.
> > Generally, the performance of the deduplication solution depends on how
> > much data can you put to memory. If you round 640KB buffer to 1MB (this is
> > what the slab and slub subsystem currently do), you waste a lot of memory.
> > Deduplication indices with 640KB blocks are already used in the wild, so
> > it can't be easily changed.
>
> OK, seems you're suggesting a single object is rounded up.. so then this
> header is very wrong?:
>
> commit 359dbf19ab524652a2208a2a2cddccec2eede2ad
> Author: Mikulas Patocka <mpatocka@redhat.com>
> Date: Mon Mar 26 20:29:45 2018 +0200
>
> dm bufio: use slab cache for dm_buffer structure allocations
>
> kmalloc padded to the next power of two, using a slab cache avoids this.
>
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
>
> Please clarify further, thanks!
> Mike
Yes, using a slab cache currently doesn't avoid this rounding (it needs the
SLAB_MINIMIZE_WASTE patch to do that).
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 14:57 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 14:57 UTC (permalink / raw)
To: Mike Snitzer
Cc: Andrew Morton, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Christopher Lameter, Joonsoo Kim,
Vlastimil Babka
On Mon, 16 Apr 2018, Mike Snitzer wrote:
> On Mon, Apr 16 2018 at 10:37am -0400,
> Mikulas Patocka <mpatocka@redhat.com> wrote:
>
> >
> >
> > On Mon, 16 Apr 2018, Mike Snitzer wrote:
> >
> > > On Mon, Apr 16 2018 at 8:38am -0400,
> > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > >
> > > > On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > > > > On Fri, Apr 13 2018 at 5:22am -0400,
> > > > > Vlastimil Babka <vbabka@suse.cz> wrote:
> > > > >>
> > > > >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> > > > >> attending, or anyone else that can vouch for your usecase?
> > > > >
> > > > > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> > > > >
> > > > > Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> > > > > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > > > > the 4.17 merge window).
> > > >
> > > > Can you or Mikulas briefly summarize how the dependency is avoided, and
> > > > whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> > > > dm-bufio code would happily switch to it, or not?
> > >
> > > git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
> > >
> > > But the most significant commit relative to SLAB_MINIMIZE_WASTE is:
> > > 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for
> > > dm_buffer structure allocations")
> > >
> > > So no, I don't see why dm-bufio would need to switch to
> > > SLAB_MINIMIZE_WASTE if it were introduced in the future.
> >
> > Currently, the slab cache rounds up the size of the slab to the next power
> > of two (if the size is large). And that wastes memory if that memory were
> > to be used for deduplication tables.
>
> You mean on an overall size of the cache level? Or on a per-object
> level? I can only imagine you mean the former.
Unfortunately, it rounds up every object. So, if you have six 640KB
objects, it consumes 6MB.
> > Generally, the performance of the deduplication solution depends on how
> > much data can you put to memory. If you round 640KB buffer to 1MB (this is
> > what the slab and slub subsystem currently do), you waste a lot of memory.
> > Deduplication indices with 640KB blocks are already used in the wild, so
> > it can't be easily changed.
>
> OK, seems you're suggesting a single object is rounded up.. so then this
> header is very wrong?:
>
> commit 359dbf19ab524652a2208a2a2cddccec2eede2ad
> Author: Mikulas Patocka <mpatocka@redhat.com>
> Date: Mon Mar 26 20:29:45 2018 +0200
>
> dm bufio: use slab cache for dm_buffer structure allocations
>
> kmalloc padded to the next power of two, using a slab cache avoids this.
>
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
>
> Please clarify further, thanks!
> Mike
Yes, using a slab cache currently doesn't avoid this rounding (it needs the
SLAB_MINIMIZE_WASTE patch to do that).
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 14:57 ` Mikulas Patocka
@ 2018-04-16 15:18 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-16 15:18 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton
On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> > Please clarify further, thanks!
> > Mike
>
> Yes, using a slab cache currently doesn't avoid this rounding (it needs the
> SLAB_MINIMIZE_WASTE patch to do that).
Or an increase in slab_max_order
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 15:18 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-16 15:18 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton, Vlastimil Babka
On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> > Please clarify further, thanks!
> > Mike
>
> Yes, using a slab cache currently doesn't avoid this rounding (it needs the
> SLAB_MINIMIZE_WASTE patch to do that).
Or an increase in slab_max_order
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 15:18 ` Christopher Lameter
@ 2018-04-16 15:25 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 15:25 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton
On Mon, 16 Apr 2018, Christopher Lameter wrote:
> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>
> > > Please clarify further, thanks!
> > > Mike
> >
> > Yes, using a slab cache currently doesn't avoid this rounding (it needs the
> > SLAB_MINIMIZE_WASTE patch to do that).
>
> Or an increase in slab_max_order
But that will increase it for all slabs (often senselessly - i.e.
kmalloc-4096 would have order 4MB).
I need to increase it just for dm-bufio slabs.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 15:25 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 15:25 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton, Vlastimil Babka
On Mon, 16 Apr 2018, Christopher Lameter wrote:
> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>
> > > Please clarify further, thanks!
> > > Mike
> >
> > Yes, using a slab cache currently doesn't avoid this rounding (it needs the
> > SLAB_MINIMIZE_WASTE patch to do that).
>
> Or an increase in slab_max_order
But that will increase it for all slabs (often senselessly - i.e.
kmalloc-4096 would have order 4MB).
I need to increase it just for dm-bufio slabs.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 15:25 ` Mikulas Patocka
@ 2018-04-16 15:45 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-16 15:45 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton
On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> >
> > Or an increase in slab_max_order
>
> But that will increase it for all slabs (often senselessly - i.e.
> kmalloc-4096 would have order 4MB).
4MB? Nope.... That is a power of two slab so no wasted space even with
order 0.
It's not a senseless increase. The more objects you fit into a slab page
the higher the performance of the allocator.
> I need to increase it just for dm-bufio slabs.
If you do this then others will want the same...
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 15:45 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-16 15:45 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton, Vlastimil Babka
On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> >
> > Or an increase in slab_max_order
>
> But that will increase it for all slabs (often senselessly - i.e.
> kmalloc-4096 would have order 4MB).
4MB? Nope.... That is a power of two slab so no wasted space even with
order 0.
It's not a senseless increase. The more objects you fit into a slab page
the higher the performance of the allocator.
> I need to increase it just for dm-bufio slabs.
If you do this then others will want the same...
^ permalink raw reply [flat|nested] 109+ messages in thread
* [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 14:46 ` Mike Snitzer
(?)
(?)
@ 2018-04-16 19:32 ` Mikulas Patocka
2018-04-17 14:45 ` Christopher Lameter
-1 siblings, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 19:32 UTC (permalink / raw)
To: Mike Snitzer
Cc: Vlastimil Babka, Christopher Lameter, Matthew Wilcox,
Pekka Enberg, linux-mm, dm-devel, David Rientjes, Joonsoo Kim,
Andrew Morton, linux-kernel
This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
flag causes allocation of larger slab caches in order to minimize wasted
space.
This is needed because we want to use dm-bufio for deduplication index and
there are existing installations with non-power-of-two block sizes (such
as 640KB). The performance of the whole solution depends on efficient
memory use, so we must waste as little memory as possible.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
drivers/md/dm-bufio.c | 2 +-
include/linux/slab.h | 7 +++++++
mm/slab.c | 4 ++--
mm/slab.h | 7 ++++---
mm/slab_common.c | 2 +-
mm/slub.c | 25 ++++++++++++++++++++-----
6 files changed, 35 insertions(+), 12 deletions(-)
Index: linux-2.6/include/linux/slab.h
===================================================================
--- linux-2.6.orig/include/linux/slab.h 2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/include/linux/slab.h 2018-04-16 21:10:45.000000000 +0200
@@ -108,6 +108,13 @@
#define SLAB_KASAN 0
#endif
+/*
+ * Use higher order allocations to minimize wasted space.
+ * Note: the allocation is unreliable if this flag is used, the caller
+ * must handle allocation failures gracefully.
+ */
+#define SLAB_MINIMIZE_WASTE ((slab_flags_t __force)0x10000000U)
+
/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U)
Index: linux-2.6/mm/slab_common.c
===================================================================
--- linux-2.6.orig/mm/slab_common.c 2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/mm/slab_common.c 2018-04-16 21:10:45.000000000 +0200
@@ -53,7 +53,7 @@ static DECLARE_WORK(slab_caches_to_rcu_d
SLAB_FAILSLAB | SLAB_KASAN)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
- SLAB_ACCOUNT)
+ SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
/*
* Merge control. If this is set then no merging of slab caches will occur.
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/mm/slub.c 2018-04-16 21:12:41.000000000 +0200
@@ -3249,7 +3249,7 @@ static inline unsigned int slab_order(un
return order;
}
-static inline int calculate_order(unsigned int size, unsigned int reserved)
+static inline int calculate_order(unsigned int size, unsigned int reserved, slab_flags_t flags)
{
unsigned int order;
unsigned int min_objects;
@@ -3277,7 +3277,7 @@ static inline int calculate_order(unsign
order = slab_order(size, min_objects,
slub_max_order, fraction, reserved);
if (order <= slub_max_order)
- return order;
+ goto ret_order;
fraction /= 2;
}
min_objects--;
@@ -3289,15 +3289,30 @@ static inline int calculate_order(unsign
*/
order = slab_order(size, 1, slub_max_order, 1, reserved);
if (order <= slub_max_order)
- return order;
+ goto ret_order;
/*
* Doh this slab cannot be placed using slub_max_order.
*/
order = slab_order(size, 1, MAX_ORDER, 1, reserved);
if (order < MAX_ORDER)
- return order;
+ goto ret_order;
return -ENOSYS;
+
+ret_order:
+ if (flags & SLAB_MINIMIZE_WASTE) {
+ /* Increase the order if it decreases waste */
+ int test_order;
+ for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
+ unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
+ unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
+ if (test_order_objects >= min(32, MAX_OBJS_PER_PAGE))
+ break;
+ if (test_order_objects > order_objects << (test_order - order))
+ order = test_order;
+ }
+ }
+ return order;
}
static void
@@ -3562,7 +3577,7 @@ static int calculate_sizes(struct kmem_c
if (forced_order >= 0)
order = forced_order;
else
- order = calculate_order(size, s->reserved);
+ order = calculate_order(size, s->reserved, flags);
if ((int)order < 0)
return 0;
Index: linux-2.6/drivers/md/dm-bufio.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-bufio.c 2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/drivers/md/dm-bufio.c 2018-04-16 21:11:23.000000000 +0200
@@ -1683,7 +1683,7 @@ struct dm_bufio_client *dm_bufio_client_
(block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", c->block_size);
c->slab_cache = kmem_cache_create(slab_name, c->block_size, ARCH_KMALLOC_MINALIGN,
- SLAB_RECLAIM_ACCOUNT, NULL);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MINIMIZE_WASTE, NULL);
if (!c->slab_cache) {
r = -ENOMEM;
goto bad;
Index: linux-2.6/mm/slab.h
===================================================================
--- linux-2.6.orig/mm/slab.h 2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/mm/slab.h 2018-04-16 21:10:45.000000000 +0200
@@ -142,10 +142,10 @@ static inline slab_flags_t kmem_cache_fl
#if defined(CONFIG_SLAB)
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
- SLAB_ACCOUNT)
+ SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT)
+ SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_MINIMIZE_WASTE)
#else
#define SLAB_CACHE_FLAGS (0)
#endif
@@ -164,7 +164,8 @@ static inline slab_flags_t kmem_cache_fl
SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | \
- SLAB_ACCOUNT)
+ SLAB_ACCOUNT | \
+ SLAB_MINIMIZE_WASTE)
bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
Index: linux-2.6/mm/slab.c
===================================================================
--- linux-2.6.orig/mm/slab.c 2018-04-16 21:10:45.000000000 +0200
+++ linux-2.6/mm/slab.c 2018-04-16 21:10:45.000000000 +0200
@@ -1790,14 +1790,14 @@ static size_t calculate_slab_order(struc
* as GFP_NOFS and we really don't want to have to be allocating
* higher-order pages when we are unable to shrink dcache.
*/
- if (flags & SLAB_RECLAIM_ACCOUNT)
+ if (flags & SLAB_RECLAIM_ACCOUNT && !(flags & SLAB_MINIMIZE_WASTE))
break;
/*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
- if (gfporder >= slab_max_order)
+ if (gfporder >= slab_max_order && !(flags & SLAB_MINIMIZE_WASTE))
break;
/*
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 15:45 ` Christopher Lameter
@ 2018-04-16 19:36 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 19:36 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton
On Mon, 16 Apr 2018, Christopher Lameter wrote:
> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>
> > >
> > > Or an increase in slab_max_order
> >
> > But that will increase it for all slabs (often senselessly - i.e.
> > kmalloc-4096 would have order 4MB).
>
> 4MB? Nope.... That is a power of two slab so no wasted space even with
> order 0.
See this email:
https://www.redhat.com/archives/dm-devel/2018-March/msg00387.html
If you boot with slub_max_order=10, the kmalloc-8192 cache has 64 pages.
So yes, it increases the order of all slab caches (although not up to
4MB).
> It's not a senseless increase. The more objects you fit into a slab page
> the higher the performance of the allocator.
>
>
> > I need to increase it just for dm-bufio slabs.
>
> If you do this then others will want the same...
If others need it, they can turn on the flag SLAB_MINIMIZE_WASTE too.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 19:36 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 19:36 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton, Vlastimil Babka
On Mon, 16 Apr 2018, Christopher Lameter wrote:
> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>
> > >
> > > Or an increase in slab_max_order
> >
> > But that will increase it for all slabs (often senselessly - i.e.
> > kmalloc-4096 would have order 4MB).
>
> 4MB? Nope.... That is a power of two slab so no wasted space even with
> order 0.
See this email:
https://www.redhat.com/archives/dm-devel/2018-March/msg00387.html
If you boot with slub_max_order=10, the kmalloc-8192 cache has 64 pages.
So yes, it increases the order of all slab caches (although not up to
4MB).
> It's not a senseless increase. The more objects you fit into a slab page
> the higher the performance of the allocator.
>
>
> > I need to increase it just for dm-bufio slabs.
>
> If you do this then others will want the same...
If others need it, they can turn on the flag SLAB_MINIMIZE_WASTE too.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 14:37 ` Mikulas Patocka
@ 2018-04-16 19:38 ` Vlastimil Babka
-1 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-16 19:38 UTC (permalink / raw)
To: Mikulas Patocka, Mike Snitzer
Cc: Christopher Lameter, Matthew Wilcox, Pekka Enberg, linux-mm,
dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton
On 04/16/2018 04:37 PM, Mikulas Patocka wrote:
>>> Can you or Mikulas briefly summarize how the dependency is avoided, and
>>> whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
>>> dm-bufio code would happily switch to it, or not?
>>
>> git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
>>
>> But the most significant commit relative to SLAB_MINIMIZE_WASTE is:
>> 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for
>> dm_buffer structure allocations")
>>
>> So no, I don't see why dm-bufio would need to switch to
>> SLAB_MINIMIZE_WASTE if it were introduced in the future.
>
> Currently, the slab cache rounds up the size of the slab to the next power
> of two (if the size is large). And that wastes memory if that memory were
> to be used for deduplication tables.
>
> Generally, the performance of the deduplication solution depends on how
> much data can you put to memory. If you round 640KB buffer to 1MB (this is
> what the slab and slub subsystem currently do), you waste a lot of memory.
> Deduplication indices with 640KB blocks are already used in the wild, so
> it can't be easily changed.
Thank you both for the clarification.
Vlastimil
>
>> Mike
>
> Mikulas
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 19:38 ` Vlastimil Babka
0 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-16 19:38 UTC (permalink / raw)
To: Mikulas Patocka, Mike Snitzer
Cc: Andrew Morton, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Christopher Lameter, Joonsoo Kim
On 04/16/2018 04:37 PM, Mikulas Patocka wrote:
>>> Can you or Mikulas briefly summarize how the dependency is avoided, and
>>> whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
>>> dm-bufio code would happily switch to it, or not?
>>
>> git log eeb67a0ba04df^..45354f1eb67224669a1 -- drivers/md/dm-bufio.c
>>
>> But the most significant commit relative to SLAB_MINIMIZE_WASTE is:
>> 359dbf19ab524652a2208a2a2cddccec2eede2ad ("dm bufio: use slab cache for
>> dm_buffer structure allocations")
>>
>> So no, I don't see why dm-bufio would need to switch to
>> SLAB_MINIMIZE_WASTE if it were introduced in the future.
>
> Currently, the slab cache rounds up the size of the slab to the next power
> of two (if the size is large). And that wastes memory if that memory were
> to be used for deduplication tables.
>
> Generally, the performance of the deduplication solution depends on how
> much data can you put to memory. If you round 640KB buffer to 1MB (this is
> what the slab and slub subsystem currently do), you waste a lot of memory.
> Deduplication indices with 640KB blocks are already used in the wild, so
> it can't be easily changed.
Thank you both for the clarification.
Vlastimil
>
>> Mike
>
> Mikulas
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 19:36 ` Mikulas Patocka
(?)
@ 2018-04-16 19:53 ` Vlastimil Babka
2018-04-16 21:01 ` Mikulas Patocka
2018-04-17 14:49 ` Christopher Lameter
-1 siblings, 2 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-16 19:53 UTC (permalink / raw)
To: Mikulas Patocka, Christopher Lameter
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton
On 04/16/2018 09:36 PM, Mikulas Patocka wrote:
>
>
> On Mon, 16 Apr 2018, Christopher Lameter wrote:
>
>> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>>
>>>>
>>>> Or an increase in slab_max_order
>>>
>>> But that will increase it for all slabs (often senselessly - i.e.
>>> kmalloc-4096 would have order 4MB).
>>
>> 4MB? Nope.... That is a power of two slab so no wasted space even with
>> order 0.
>
> See this email:
> https://www.redhat.com/archives/dm-devel/2018-March/msg00387.html
>
> If you boot with slub_max_order=10, the kmalloc-8192 cache has 64 pages.
> So yes, it increases the order of all slab caches (although not up to
> 4MB).
>
>> It's not a senseless increase. The more objects you fit into a slab page
>> the higher the performance of the allocator.
It's not universally without a cost. It might increase internal
fragmentation of the slabs, if you end up with lots of 4MB pages
containing just few objects. Thus, waste of memory. You also consume
high-order pages that could be used elsewhere. If you fail to allocate
4MB, then what's the fallback, order-0? I doubt it's "the highest
available order". Thus, a more conservative choice e.g. order-3 will
might succeed more in allocating order-3, while a choice of 4MB will
have many order-0 fallbacks.
>>> I need to increase it just for dm-bufio slabs.
>>
>> If you do this then others will want the same...
>
> If others need it, they can turn on the flag SLAB_MINIMIZE_WASTE too.
I think it should be possible without a new flag. The slub allocator
could just balance priorities (performance vs memory efficiency) better.
Currently I get the impression that "slub_max_order" is a performance
tunable. Let's add another criteria for selecting an order, that would
try to pick an order to minimize wasted space below e.g. 10% with some
different kind of max order. Pick good defaults, add tunables if you must.
I mean, anyone who's creating a cache for 640KB objects most likely
doesn't want to waste another 384KB by each such object. They shouldn't
have to add a flag to let the slub allocator figure out that using 2MB
pages is the right thing to do here.
Vlastimil
> Mikulas
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 19:53 ` Vlastimil Babka
@ 2018-04-16 21:01 ` Mikulas Patocka
2018-04-17 14:40 ` Christopher Lameter
2018-04-17 14:49 ` Christopher Lameter
1 sibling, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 21:01 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Christopher Lameter, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Mon, 16 Apr 2018, Vlastimil Babka wrote:
> On 04/16/2018 09:36 PM, Mikulas Patocka wrote:
>
> >>> I need to increase it just for dm-bufio slabs.
> >>
> >> If you do this then others will want the same...
> >
> > If others need it, they can turn on the flag SLAB_MINIMIZE_WASTE too.
>
> I think it should be possible without a new flag. The slub allocator
> could just balance priorities (performance vs memory efficiency) better.
> Currently I get the impression that "slub_max_order" is a performance
> tunable. Let's add another criteria for selecting an order, that would
> try to pick an order to minimize wasted space below e.g. 10% with some
> different kind of max order. Pick good defaults, add tunables if you must.
>
> I mean, anyone who's creating a cache for 640KB objects most likely
> doesn't want to waste another 384KB by each such object. They shouldn't
> have to add a flag to let the slub allocator figure out that using 2MB
> pages is the right thing to do here.
>
> Vlastimil
The problem is that higher-order allocations (larger than 32K) are
unreliable. So, if you increase page order beyond that, the allocation may
randomly fail.
dm-bufio deals gracefully with allocation failure, because it preallocates
some buffers with vmalloc, but other subsystems may not deal with it and
they cound return ENOMEM randomly or misbehave in other ways. So, the
"SLAB_MINIMIZE_WASTE" flag is also saying that the allocation may fail and
the caller is prepared to deal with it.
The slub subsystem does actual fallback to low-order when the allocation
fails (it allows different order for each slab in the same cache), but
slab doesn't fallback and you get NULL if higher-order allocation fails.
So, SLAB_MINIMIZE_WASTE is needed for slab because it will just randomly
fail with higher order.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 12:38 ` Vlastimil Babka
@ 2018-04-16 21:04 ` Mikulas Patocka
-1 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 21:04 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Mike Snitzer, Christopher Lameter, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton
On Mon, 16 Apr 2018, Vlastimil Babka wrote:
> On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > On Fri, Apr 13 2018 at 5:22am -0400,
> > Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> >> attending, or anyone else that can vouch for your usecase?
> >
> > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> >
> > Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > the 4.17 merge window).
>
> Can you or Mikulas briefly summarize how the dependency is avoided, and
This was Mike's misconception - dm-bufio actually needs the
SLAB_MINIMIZE_WASTE patch, otherwise it is wasting memory.
> whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> dm-bufio code would happily switch to it, or not?
>
> Thanks,
> Vlastimil
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-16 21:04 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-16 21:04 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Andrew Morton, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Christopher Lameter,
Joonsoo Kim
On Mon, 16 Apr 2018, Vlastimil Babka wrote:
> On 04/13/2018 05:10 PM, Mike Snitzer wrote:
> > On Fri, Apr 13 2018 at 5:22am -0400,
> > Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> Would this perhaps be a good LSF/MM discussion topic? Mikulas, are you
> >> attending, or anyone else that can vouch for your usecase?
> >
> > Any further discussion on SLAB_MINIMIZE_WASTE should continue on list.
> >
> > Mikulas won't be at LSF/MM. But I included Mikulas' dm-bufio changes
> > that no longer depend on this proposed SLAB_MINIMIZE_WASTE (as part of
> > the 4.17 merge window).
>
> Can you or Mikulas briefly summarize how the dependency is avoided, and
This was Mike's misconception - dm-bufio actually needs the
SLAB_MINIMIZE_WASTE patch, otherwise it is wasting memory.
> whether if (something like) SLAB_MINIMIZE_WASTE were implemented, the
> dm-bufio code would happily switch to it, or not?
>
> Thanks,
> Vlastimil
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 21:01 ` Mikulas Patocka
@ 2018-04-17 14:40 ` Christopher Lameter
2018-04-17 18:53 ` Mikulas Patocka
0 siblings, 1 reply; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 14:40 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Vlastimil Babka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> dm-bufio deals gracefully with allocation failure, because it preallocates
> some buffers with vmalloc, but other subsystems may not deal with it and
> they could return ENOMEM randomly or misbehave in other ways. So, the
> "SLAB_MINIMIZE_WASTE" flag is also saying that the allocation may fail and
> the caller is prepared to deal with it.
>
> The slub subsystem does actual fallback to low-order when the allocation
> fails (it allows different order for each slab in the same cache), but
> slab doesn't fallback and you get NULL if higher-order allocation fails.
> So, SLAB_MINIMIZE_WASTE is needed for slab because it will just randomly
> fail with higher order.
Fix Slab instead of adding a flag that is only useful for one allocator?
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 19:32 ` [PATCH RESEND] " Mikulas Patocka
@ 2018-04-17 14:45 ` Christopher Lameter
2018-04-17 16:16 ` Vlastimil Babka
2018-04-17 19:06 ` Mikulas Patocka
0 siblings, 2 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 14:45 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
> flag causes allocation of larger slab caches in order to minimize wasted
> space.
>
> This is needed because we want to use dm-bufio for deduplication index and
> there are existing installations with non-power-of-two block sizes (such
> as 640KB). The performance of the whole solution depends on efficient
> memory use, so we must waste as little memory as possible.
Hmmm. Can we come up with a generic solution instead?
This may mean relaxing the enforcement of the allocation max order a bit
so that we can get dense allocation through higher order allocs.
But then higher order allocs are generally seen as problematic.
Note that SLUB will fall back to smallest order already if a failure
occurs so increasing slub_max_order may not be that much of an issue.
Maybe drop the max order limit completely and use MAX_ORDER instead? That
means that callers need to be able to tolerate failures.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 19:36 ` Mikulas Patocka
@ 2018-04-17 14:47 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 14:47 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton
On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> If you boot with slub_max_order=10, the kmalloc-8192 cache has 64 pages.
> So yes, it increases the order of all slab caches (although not up to
> 4MB).
Hmmm... Ok. There is another setting slub_min_objects that controls how
many objects to fit into a slab page.
We could change the allocation scheme so that it finds the minimum order
with the minimum waste. Allocator performance will drop though since fewer
objects are then in a slab page.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-17 14:47 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 14:47 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton, Vlastimil Babka
On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> If you boot with slub_max_order=10, the kmalloc-8192 cache has 64 pages.
> So yes, it increases the order of all slab caches (although not up to
> 4MB).
Hmmm... Ok. There is another setting slub_min_objects that controls how
many objects to fit into a slab page.
We could change the allocation scheme so that it finds the minimum order
with the minimum waste. Allocator performance will drop though since fewer
objects are then in a slab page.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-16 19:53 ` Vlastimil Babka
@ 2018-04-17 14:49 ` Christopher Lameter
2018-04-17 14:49 ` Christopher Lameter
1 sibling, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 14:49 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Mikulas Patocka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton
On Mon, 16 Apr 2018, Vlastimil Babka wrote:
> >> Its not a senseless increase. The more objects you fit into a slab page
> >> the higher the performance of the allocator.
>
> It's not universally without a cost. It might increase internal
> fragmentation of the slabs, if you end up with lots of 4MB pages
> containing just few objects. Thus, waste of memory. You also consume
> high-order pages that could be used elsewhere. If you fail to allocate
> 4MB, then what's the fallback, order-0? I doubt it's "the highest
> available order". Thus, a more conservative choice e.g. order-3
> might succeed more in allocating order-3, while a choice of 4MB will
> have many order-0 fallbacks.
Obviously there is a cost. But here the subsystem has a fallback.
> I think it should be possible without a new flag. The slub allocator
> could just balance priorities (performance vs memory efficiency) better.
> Currently I get the impression that "slub_max_order" is a performance
> tunable. Let's add another criteria for selecting an order, that would
> try to pick an order to minimize wasted space below e.g. 10% with some
> different kind of max order. Pick good defaults, add tunables if you must.
There is also slub_min_objects.
> I mean, anyone who's creating a cache for 640KB objects most likely
> doesn't want to waste another 384KB by each such object. They shouldn't
> have to add a flag to let the slub allocator figure out that using 2MB
> pages is the right thing to do here.
I agree if we do this then preferably without a flag.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-17 14:49 ` Christopher Lameter
0 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 14:49 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
Mikulas Patocka, David Rientjes, Joonsoo Kim, Andrew Morton
On Mon, 16 Apr 2018, Vlastimil Babka wrote:
> >> Its not a senseless increase. The more objects you fit into a slab page
> >> the higher the performance of the allocator.
>
> It's not universally without a cost. It might increase internal
> fragmentation of the slabs, if you end up with lots of 4MB pages
> containing just few objects. Thus, waste of memory. You also consume
> high-order pages that could be used elsewhere. If you fail to allocate
> 4MB, then what's the fallback, order-0? I doubt it's "the highest
> available order". Thus, a more conservative choice e.g. order-3
> might succeed more in allocating order-3, while a choice of 4MB will
> have many order-0 fallbacks.
Obviously there is a cost. But here the subsystem has a fallback.
> I think it should be possible without a new flag. The slub allocator
> could just balance priorities (performance vs memory efficiency) better.
> Currently I get the impression that "slub_max_order" is a performance
> tunable. Let's add another criteria for selecting an order, that would
> try to pick an order to minimize wasted space below e.g. 10% with some
> different kind of max order. Pick good defaults, add tunables if you must.
There is also slub_min_objects.
> I mean, anyone who's creating a cache for 640KB objects most likely
> doesn't want to waste another 384KB by each such object. They shouldn't
> have to add a flag to let the slub allocator figure out that using 2MB
> pages is the right thing to do here.
I agree if we do this then preferably without a flag.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-17 14:45 ` Christopher Lameter
@ 2018-04-17 16:16 ` Vlastimil Babka
2018-04-17 16:38 ` Christopher Lameter
2018-04-17 17:26 ` Mikulas Patocka
2018-04-17 19:06 ` Mikulas Patocka
1 sibling, 2 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-17 16:16 UTC (permalink / raw)
To: Christopher Lameter, Mikulas Patocka
Cc: Mike Snitzer, Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel,
David Rientjes, Joonsoo Kim, Andrew Morton, linux-kernel
On 04/17/2018 04:45 PM, Christopher Lameter wrote:
> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>
>> This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
>> flag causes allocation of larger slab caches in order to minimize wasted
>> space.
>>
>> This is needed because we want to use dm-bufio for deduplication index and
>> there are existing installations with non-power-of-two block sizes (such
>> as 640KB). The performance of the whole solution depends on efficient
>> memory use, so we must waste as little memory as possible.
>
> Hmmm. Can we come up with a generic solution instead?
Yes please.
> This may mean relaxing the enforcement of the allocation max order a bit
> so that we can get dense allocation through higher order allocs.
>
> But then higher order allocs are generally seen as problematic.
I think in this case they are better than wasting/fragmenting 384kB for
640kB object.
> Note that SLUB will fall back to smallest order already if a failure
> occurs so increasing slub_max_order may not be that much of an issue.
>
> Maybe drop the max order limit completely and use MAX_ORDER instead?
For packing, sure. For performance, please no (i.e. don't try to
allocate MAX_ORDER for each and every cache).
> That
> means that callers need to be able to tolerate failures.
Is it any different from now? I suppose there would still be
smallest-order fallback involved in sl*b itself? And if your allocation
is so large it can fail even with the fallback (i.e. >= costly order),
you need to tolerate failures anyway?
One corner case I see is if there is anyone who would rather use their
own fallback instead of the space-wasting smallest-order fallback.
Maybe we could map some GFP flag to indicate that.
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-17 16:16 ` Vlastimil Babka
@ 2018-04-17 16:38 ` Christopher Lameter
2018-04-17 19:09 ` Mikulas Patocka
2018-04-17 17:26 ` Mikulas Patocka
1 sibling, 1 reply; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 16:38 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Mikulas Patocka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Tue, 17 Apr 2018, Vlastimil Babka wrote:
> On 04/17/2018 04:45 PM, Christopher Lameter wrote:
> > But then higher order allocs are generally seen as problematic.
>
> I think in this case they are better than wasting/fragmenting 384kB for
> 640kB object.
Well typically we have suggested that people use vmalloc in the past.
> > Note that SLUB will fall back to smallest order already if a failure
> > occurs so increasing slub_max_order may not be that much of an issue.
> >
> > Maybe drop the max order limit completely and use MAX_ORDER instead?
>
> For packing, sure. For performance, please no (i.e. don't try to
> allocate MAX_ORDER for each and every cache).
No of course not. We would have to modify the order selection on kmem
cache creation.
> > That
> > means that callers need to be able to tolerate failures.
>
> Is it any different from now? I suppose there would still be
> smallest-order fallback involved in sl*b itself? And if your allocation
> is so large it can fail even with the fallback (i.e. >= costly order),
> you need to tolerate failures anyway?
Failures can occur even with < costly order as far as I can tell. Order 0
is the only safe one.
> One corner case I see is if there is anyone who would rather use their
> own fallback instead of the space-wasting smallest-order fallback.
> Maybe we could map some GFP flag to indicate that.
Well if you have a fallback then maybe the slab allocator should not fall
back on its own but let the caller deal with it.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-17 16:16 ` Vlastimil Babka
2018-04-17 16:38 ` Christopher Lameter
@ 2018-04-17 17:26 ` Mikulas Patocka
2018-04-17 19:13 ` Vlastimil Babka
1 sibling, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 17:26 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Christopher Lameter, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Tue, 17 Apr 2018, Vlastimil Babka wrote:
> On 04/17/2018 04:45 PM, Christopher Lameter wrote:
> > On Mon, 16 Apr 2018, Mikulas Patocka wrote:
> >
> >> This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
> >> flag causes allocation of larger slab caches in order to minimize wasted
> >> space.
> >>
> >> This is needed because we want to use dm-bufio for deduplication index and
> >> there are existing installations with non-power-of-two block sizes (such
> >> as 640KB). The performance of the whole solution depends on efficient
> >> memory use, so we must waste as little memory as possible.
> >
> > Hmmm. Can we come up with a generic solution instead?
>
> Yes please.
>
> > This may mean relaxing the enforcement of the allocation max order a bit
> > so that we can get dense allocation through higher order allocs.
> >
> > But then higher order allocs are generally seen as problematic.
>
> I think in this case they are better than wasting/fragmenting 384kB for
> 640kB object.
Wasting 37% of memory is still better than the kernel randomly returning
-ENOMEM when higher-order allocation fails.
> > That
> > means that callers need to be able to tolerate failures.
>
> Is it any different from now? I suppose there would still be
> smallest-order fallback involved in sl*b itself? And if your allocation
> is so large it can fail even with the fallback (i.e. >= costly order),
> you need to tolerate failures anyway?
>
> One corner case I see is if there is anyone who would rather use their
> own fallback instead of the space-wasting smallest-order fallback.
> Maybe we could map some GFP flag to indicate that.
For example, if you create a cache with 17KB objects, the slab subsystem
will pad it up to 32KB. You are wasting almost 1/2 memory, but the
allocation is reliable and it won't fail.
If you use order higher than 32KB, you get less wasted memory, but you
also get random -ENOMEMs (yes, we had a problem in dm-thin that it was
randomly failing during initialization due to 64KB allocation).
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-17 14:40 ` Christopher Lameter
@ 2018-04-17 18:53 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 18:53 UTC (permalink / raw)
To: Christopher Lameter
Cc: Vlastimil Babka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Tue, 17 Apr 2018, Christopher Lameter wrote:
> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>
> > dm-bufio deals gracefully with allocation failure, because it preallocates
> > some buffers with vmalloc, but other subsystems may not deal with it and
> > they could return ENOMEM randomly or misbehave in other ways. So, the
> > "SLAB_MINIMIZE_WASTE" flag is also saying that the allocation may fail and
> > the caller is prepared to deal with it.
> >
> > The slub subsystem does actual fallback to low-order when the allocation
> > fails (it allows different order for each slab in the same cache), but
> > slab doesn't fallback and you get NULL if higher-order allocation fails.
> > So, SLAB_MINIMIZE_WASTE is needed for slab because it will just randomly
> > fail with higher order.
>
> Fix Slab instead of adding a flag that is only useful for one allocator?
Slab assumes that all slabs have the same order, so it's not so easy to
fix it.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-17 18:53 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 18:53 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, linux-kernel, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
Vlastimil Babka
On Tue, 17 Apr 2018, Christopher Lameter wrote:
> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>
> > dm-bufio deals gracefully with allocation failure, because it preallocates
> > some buffers with vmalloc, but other subsystems may not deal with it and
> > they could return ENOMEM randomly or misbehave in other ways. So, the
> > "SLAB_MINIMIZE_WASTE" flag is also saying that the allocation may fail and
> > the caller is prepared to deal with it.
> >
> > The slub subsystem does actual fallback to low-order when the allocation
> > fails (it allows different order for each slab in the same cache), but
> > slab doesn't fallback and you get NULL if higher-order allocation fails.
> > So, SLAB_MINIMIZE_WASTE is needed for slab because it will just randomly
> > fail with higher order.
>
> Fix Slab instead of adding a flag that is only useful for one allocator?
Slab assumes that all slabs have the same order, so it's not so easy to
fix it.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-17 14:45 ` Christopher Lameter
@ 2018-04-17 19:06 ` Mikulas Patocka
2018-04-17 19:06 ` Mikulas Patocka
1 sibling, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 19:06 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Tue, 17 Apr 2018, Christopher Lameter wrote:
> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>
> > This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
> > flag causes allocation of larger slab caches in order to minimize wasted
> > space.
> >
> > This is needed because we want to use dm-bufio for deduplication index and
> > there are existing installations with non-power-of-two block sizes (such
> > as 640KB). The performance of the whole solution depends on efficient
> > memory use, so we must waste as little memory as possible.
>
> Hmmm. Can we come up with a generic solution instead?
>
> This may mean relaxing the enforcement of the allocation max order a bit
> so that we can get dense allocation through higher order allocs.
>
> But then higher order allocs are generally seen as problematic.
>
> Note that SLUB will fall back to smallest order already if a failure
> occurs so increasing slub_max_order may not be that much of an issue.
>
> Maybe drop the max order limit completely and use MAX_ORDER instead? That
> means that callers need to be able to tolerate failures.
I can make a slub-only patch with no extra flag (on a freshly booted
system it increases only the order of caches "TCPv6" and "sighand_cache"
by one - so it should not have unexpected effects):
Doing a generic solution for slab would be more complicated because slab
assumes that all slabs have the same order, so it can't fall-back to
lower-order allocations.
From: Mikulas Patocka <mpatocka@redhat.com>
Subject: [PATCH] slub: minimize wasted space
When object size is greater than slub_max_order, the slub subsystem rounds
up the size to the next power of two. This causes a lot of wasted space -
i.e. 640KB block consumes 1MB of memory.
This patch makes the slub subsystem increase the order if it is beneficial.
The order is increased as long as it reduces wasted space. There is cutoff
at 32 objects per slab.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
mm/slub.c | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2018-04-17 19:59:49.000000000 +0200
+++ linux-2.6/mm/slub.c 2018-04-17 20:58:23.000000000 +0200
@@ -3252,6 +3252,7 @@ static inline unsigned int slab_order(un
static inline int calculate_order(unsigned int size, unsigned int reserved)
{
unsigned int order;
+ unsigned int test_order;
unsigned int min_objects;
unsigned int max_objects;
@@ -3277,7 +3278,7 @@ static inline int calculate_order(unsign
order = slab_order(size, min_objects,
slub_max_order, fraction, reserved);
if (order <= slub_max_order)
- return order;
+ goto ret_order;
fraction /= 2;
}
min_objects--;
@@ -3289,15 +3290,25 @@ static inline int calculate_order(unsign
*/
order = slab_order(size, 1, slub_max_order, 1, reserved);
if (order <= slub_max_order)
- return order;
+ goto ret_order;
/*
* Doh this slab cannot be placed using slub_max_order.
*/
order = slab_order(size, 1, MAX_ORDER, 1, reserved);
- if (order < MAX_ORDER)
- return order;
- return -ENOSYS;
+ if (order >= MAX_ORDER)
+ return -ENOSYS;
+
+ret_order:
+ for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
+ unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
+ unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
+ if (test_order_objects > min(32, MAX_OBJS_PER_PAGE))
+ break;
+ if (test_order_objects > order_objects << (test_order - order))
+ order = test_order;
+ }
+ return order;
}
static void
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
@ 2018-04-17 19:06 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 19:06 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, linux-kernel, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
Vlastimil Babka
On Tue, 17 Apr 2018, Christopher Lameter wrote:
> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>
> > This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
> > flag causes allocation of larger slab caches in order to minimize wasted
> > space.
> >
> > This is needed because we want to use dm-bufio for deduplication index and
> > there are existing installations with non-power-of-two block sizes (such
> > as 640KB). The performance of the whole solution depends on efficient
> > memory use, so we must waste as little memory as possible.
>
> Hmmm. Can we come up with a generic solution instead?
>
> This may mean relaxing the enforcement of the allocation max order a bit
> so that we can get dense allocation through higher order allocs.
>
> But then higher order allocs are generally seen as problematic.
>
> Note that SLUB will fall back to smallest order already if a failure
> occurs so increasing slub_max_order may not be that much of an issue.
>
> Maybe drop the max order limit completely and use MAX_ORDER instead? That
> means that callers need to be able to tolerate failures.
I can make a slub-only patch with no extra flag (on a freshly booted
system it increases only the order of caches "TCPv6" and "sighand_cache"
by one - so it should not have unexpected effects):
Doing a generic solution for slab would be more complicated because slab
assumes that all slabs have the same order, so it can't fall-back to
lower-order allocations.
From: Mikulas Patocka <mpatocka@redhat.com>
Subject: [PATCH] slub: minimize wasted space
When object size is greater than slub_max_order, the slub subsystem rounds
up the size to the next power of two. This causes a lot of wasted space -
i.e. 640KB block consumes 1MB of memory.
This patch makes the slub subsystem increase the order if it is beneficial.
The order is increased as long as it reduces wasted space. There is cutoff
at 32 objects per slab.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
mm/slub.c | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2018-04-17 19:59:49.000000000 +0200
+++ linux-2.6/mm/slub.c 2018-04-17 20:58:23.000000000 +0200
@@ -3252,6 +3252,7 @@ static inline unsigned int slab_order(un
static inline int calculate_order(unsigned int size, unsigned int reserved)
{
unsigned int order;
+ unsigned int test_order;
unsigned int min_objects;
unsigned int max_objects;
@@ -3277,7 +3278,7 @@ static inline int calculate_order(unsign
order = slab_order(size, min_objects,
slub_max_order, fraction, reserved);
if (order <= slub_max_order)
- return order;
+ goto ret_order;
fraction /= 2;
}
min_objects--;
@@ -3289,15 +3290,25 @@ static inline int calculate_order(unsign
*/
order = slab_order(size, 1, slub_max_order, 1, reserved);
if (order <= slub_max_order)
- return order;
+ goto ret_order;
/*
* Doh this slab cannot be placed using slub_max_order.
*/
order = slab_order(size, 1, MAX_ORDER, 1, reserved);
- if (order < MAX_ORDER)
- return order;
- return -ENOSYS;
+ if (order >= MAX_ORDER)
+ return -ENOSYS;
+
+ret_order:
+ for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
+ unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
+ unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
+ if (test_order_objects > min(32, MAX_OBJS_PER_PAGE))
+ break;
+ if (test_order_objects > order_objects << (test_order - order))
+ order = test_order;
+ }
+ return order;
}
static void
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-17 16:38 ` Christopher Lameter
@ 2018-04-17 19:09 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-17 19:09 UTC (permalink / raw)
To: Christopher Lameter
Cc: Vlastimil Babka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Tue, 17 Apr 2018, Christopher Lameter wrote:
> On Tue, 17 Apr 2018, Vlastimil Babka wrote:
>
> > On 04/17/2018 04:45 PM, Christopher Lameter wrote:
>
> > > But then higher order allocs are generally seen as problematic.
> >
> > I think in this case they are better than wasting/fragmenting 384kB for
> > 640kB object.
>
> Well typically we have suggested that people use vmalloc in the past.
vmalloc is slow - it is unuseable for a buffer cache.
> > > That
> > > means that callers need to be able to tolerate failures.
> >
> > Is it any different from now? I suppose there would still be
> > smallest-order fallback involved in sl*b itself? And if your allocation
> > is so large it can fail even with the fallback (i.e. >= costly order),
> > you need to tolerate failures anyway?
>
> Failures can occur even with < costly order as far as I can telkl. Order 0
> is the only safe one.
The alloc_pages function seems to retry indefinitely for order <=
PAGE_ALLOC_COSTLY_ORDER. Do you have some explanation why it should fail?
> > One corner case I see is if there is anyone who would rather use their
> > own fallback instead of the space-wasting smallest-order fallback.
> > Maybe we could map some GFP flag to indicate that.
>
> Well if you have a fallback then maybe the slab allocator should not fall
> back on its own but let the caller deal with it.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-17 17:26 ` Mikulas Patocka
@ 2018-04-17 19:13 ` Vlastimil Babka
0 siblings, 0 replies; 109+ messages in thread
From: Vlastimil Babka @ 2018-04-17 19:13 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Christopher Lameter, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On 04/17/2018 07:26 PM, Mikulas Patocka wrote:
>
>
> On Tue, 17 Apr 2018, Vlastimil Babka wrote:
>
>> On 04/17/2018 04:45 PM, Christopher Lameter wrote:
>>> On Mon, 16 Apr 2018, Mikulas Patocka wrote:
>>>
>>>> This patch introduces a flag SLAB_MINIMIZE_WASTE for slab and slub. This
>>>> flag causes allocation of larger slab caches in order to minimize wasted
>>>> space.
>>>>
>>>> This is needed because we want to use dm-bufio for deduplication index and
>>>> there are existing installations with non-power-of-two block sizes (such
>>>> as 640KB). The performance of the whole solution depends on efficient
>>>> memory use, so we must waste as little memory as possible.
>>>
>>> Hmmm. Can we come up with a generic solution instead?
>>
>> Yes please.
>>
>>> This may mean relaxing the enforcement of the allocation max order a bit
>>> so that we can get dense allocation through higher order allocs.
>>>
>>> But then higher order allocs are generally seen as problematic.
>>
>> I think in this case they are better than wasting/fragmenting 384kB for
>> 640kB object.
>
> Wasting 37% of memory is still better than the kernel randomly returning
> -ENOMEM when higher-order allocation fails.
Of course, see below.
>>> That
>>> means that callers need to be able to tolerate failures.
>>
>> Is it any different from now? I suppose there would still be
>> smallest-order fallback involved in sl*b itself? And if your allocation
^ There: "I suppose there would still be smallest-order fallback
involved in sl*b itself?"
If SLAB doesn't currently support fallback to different order, it either
learns to do that, or keeps wasting memory and more people will migrate
to SLUB. Simple.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-17 18:53 ` Mikulas Patocka
(?)
@ 2018-04-17 21:42 ` Christopher Lameter
-1 siblings, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-17 21:42 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Vlastimil Babka, Mike Snitzer, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Tue, 17 Apr 2018, Mikulas Patocka wrote:
> > > The slub subsystem does actual fallback to low-order when the allocation
> > > fails (it allows different order for each slab in the same cache), but
> > > slab doesn't fallback and you get NULL if higher-order allocation fails.
> > > So, SLAB_MINIMIZE_WASTE is needed for slab because it will just randomly
> > > fail with higher order.
> >
> > Fix Slab instead of adding a flag that is only useful for one allocator?
>
> Slab assumes that all slabs have the same order, so it's not so easy to
> fix it.
Well since SLAB uses compound pages one could easily determine the order
of the page and thus also support multiple page orders there.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-17 19:06 ` Mikulas Patocka
(?)
@ 2018-04-18 14:55 ` Christopher Lameter
2018-04-25 21:04 ` Mikulas Patocka
-1 siblings, 1 reply; 109+ messages in thread
From: Christopher Lameter @ 2018-04-18 14:55 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Tue, 17 Apr 2018, Mikulas Patocka wrote:
> I can make a slub-only patch with no extra flag (on a freshly booted
> system it increases only the order of caches "TCPv6" and "sighand_cache"
> by one - so it should not have unexpected effects):
>
> Doing a generic solution for slab would be more complicated because slab
> assumes that all slabs have the same order, so it can't fall-back to
> lower-order allocations.
Well again SLAB uses compound pages and thus would be able to detect the
size of the page. It may be some work but it could be done.
>
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c 2018-04-17 19:59:49.000000000 +0200
> +++ linux-2.6/mm/slub.c 2018-04-17 20:58:23.000000000 +0200
> @@ -3252,6 +3252,7 @@ static inline unsigned int slab_order(un
> static inline int calculate_order(unsigned int size, unsigned int reserved)
> {
> unsigned int order;
> + unsigned int test_order;
> unsigned int min_objects;
> unsigned int max_objects;
>
> @@ -3277,7 +3278,7 @@ static inline int calculate_order(unsign
> order = slab_order(size, min_objects,
> slub_max_order, fraction, reserved);
> if (order <= slub_max_order)
> - return order;
> + goto ret_order;
> fraction /= 2;
> }
> min_objects--;
> @@ -3289,15 +3290,25 @@ static inline int calculate_order(unsign
> */
> order = slab_order(size, 1, slub_max_order, 1, reserved);
The slab order is determined in slab_order()
> if (order <= slub_max_order)
> - return order;
> + goto ret_order;
>
> /*
> * Doh this slab cannot be placed using slub_max_order.
> */
> order = slab_order(size, 1, MAX_ORDER, 1, reserved);
> - if (order < MAX_ORDER)
> - return order;
> - return -ENOSYS;
> + if (order >= MAX_ORDER)
> + return -ENOSYS;
> +
> +ret_order:
> + for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
> + unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
> + unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
> + if (test_order_objects > min(32, MAX_OBJS_PER_PAGE))
> + break;
> + if (test_order_objects > order_objects << (test_order - order))
> + order = test_order;
> + }
> + return order;
Could you move that logic into slab_order()? It does something awfully
similar.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-18 14:55 ` Christopher Lameter
@ 2018-04-25 21:04 ` Mikulas Patocka
2018-04-25 23:24 ` Mikulas Patocka
2018-04-26 18:51 ` Christopher Lameter
0 siblings, 2 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-25 21:04 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Wed, 18 Apr 2018, Christopher Lameter wrote:
> On Tue, 17 Apr 2018, Mikulas Patocka wrote:
>
> > I can make a slub-only patch with no extra flag (on a freshly booted
> > system it increases only the order of caches "TCPv6" and "sighand_cache"
> > by one - so it should not have unexpected effects):
> >
> > Doing a generic solution for slab would be more comlpicated because slab
> > assumes that all slabs have the same order, so it can't fall-back to
> > lower-order allocations.
>
> Well again SLAB uses compound pages and thus would be able to detect the
> size of the page. It may be some work but it could be done.
>
> >
> > Index: linux-2.6/mm/slub.c
> > ===================================================================
> > --- linux-2.6.orig/mm/slub.c 2018-04-17 19:59:49.000000000 +0200
> > +++ linux-2.6/mm/slub.c 2018-04-17 20:58:23.000000000 +0200
> > @@ -3252,6 +3252,7 @@ static inline unsigned int slab_order(un
> > static inline int calculate_order(unsigned int size, unsigned int reserved)
> > {
> > unsigned int order;
> > + unsigned int test_order;
> > unsigned int min_objects;
> > unsigned int max_objects;
> >
> > @@ -3277,7 +3278,7 @@ static inline int calculate_order(unsign
> > order = slab_order(size, min_objects,
> > slub_max_order, fraction, reserved);
> > if (order <= slub_max_order)
> > - return order;
> > + goto ret_order;
> > fraction /= 2;
> > }
> > min_objects--;
> > @@ -3289,15 +3290,25 @@ static inline int calculate_order(unsign
> > */
> > order = slab_order(size, 1, slub_max_order, 1, reserved);
>
> The slab order is determined in slab_order()
>
> > if (order <= slub_max_order)
> > - return order;
> > + goto ret_order;
> >
> > /*
> > * Doh this slab cannot be placed using slub_max_order.
> > */
> > order = slab_order(size, 1, MAX_ORDER, 1, reserved);
> > - if (order < MAX_ORDER)
> > - return order;
> > - return -ENOSYS;
> > + if (order >= MAX_ORDER)
> > + return -ENOSYS;
> > +
> > +ret_order:
> > + for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
> > + unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
> > + unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
> > + if (test_order_objects > min(32, MAX_OBJS_PER_PAGE))
> > + break;
> > + if (test_order_objects > order_objects << (test_order - order))
> > + order = test_order;
> > + }
> > + return order;
>
> Could yo move that logic into slab_order()? It does something awfully
> similar.
But slab_order (and its caller) limits the order to "max_order" and we
want more.
Perhaps slab_order should be dropped and calculate_order totally
rewritten?
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-25 21:04 ` Mikulas Patocka
@ 2018-04-25 23:24 ` Mikulas Patocka
2018-04-26 19:01 ` Christopher Lameter
2018-04-26 18:51 ` Christopher Lameter
1 sibling, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-25 23:24 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Wed, 25 Apr 2018, Mikulas Patocka wrote:
>
>
> On Wed, 18 Apr 2018, Christopher Lameter wrote:
>
> > On Tue, 17 Apr 2018, Mikulas Patocka wrote:
> >
> > > I can make a slub-only patch with no extra flag (on a freshly booted
> > > system it increases only the order of caches "TCPv6" and "sighand_cache"
> > > by one - so it should not have unexpected effects):
> > >
> > > Doing a generic solution for slab would be more comlpicated because slab
> > > assumes that all slabs have the same order, so it can't fall-back to
> > > lower-order allocations.
> >
> > Well again SLAB uses compound pages and thus would be able to detect the
> > size of the page. It may be some work but it could be done.
> >
> > >
> > > Index: linux-2.6/mm/slub.c
> > > ===================================================================
> > > --- linux-2.6.orig/mm/slub.c 2018-04-17 19:59:49.000000000 +0200
> > > +++ linux-2.6/mm/slub.c 2018-04-17 20:58:23.000000000 +0200
> > > @@ -3252,6 +3252,7 @@ static inline unsigned int slab_order(un
> > > static inline int calculate_order(unsigned int size, unsigned int reserved)
> > > {
> > > unsigned int order;
> > > + unsigned int test_order;
> > > unsigned int min_objects;
> > > unsigned int max_objects;
> > >
> > > @@ -3277,7 +3278,7 @@ static inline int calculate_order(unsign
> > > order = slab_order(size, min_objects,
> > > slub_max_order, fraction, reserved);
> > > if (order <= slub_max_order)
> > > - return order;
> > > + goto ret_order;
> > > fraction /= 2;
> > > }
> > > min_objects--;
> > > @@ -3289,15 +3290,25 @@ static inline int calculate_order(unsign
> > > */
> > > order = slab_order(size, 1, slub_max_order, 1, reserved);
> >
> > The slab order is determined in slab_order()
> >
> > > if (order <= slub_max_order)
> > > - return order;
> > > + goto ret_order;
> > >
> > > /*
> > > * Doh this slab cannot be placed using slub_max_order.
> > > */
> > > order = slab_order(size, 1, MAX_ORDER, 1, reserved);
> > > - if (order < MAX_ORDER)
> > > - return order;
> > > - return -ENOSYS;
> > > + if (order >= MAX_ORDER)
> > > + return -ENOSYS;
> > > +
> > > +ret_order:
> > > + for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
> > > + unsigned long order_objects = ((PAGE_SIZE << order) - reserved) / size;
> > > + unsigned long test_order_objects = ((PAGE_SIZE << test_order) - reserved) / size;
> > > + if (test_order_objects > min(32, MAX_OBJS_PER_PAGE))
> > > + break;
> > > + if (test_order_objects > order_objects << (test_order - order))
> > > + order = test_order;
> > > + }
> > > + return order;
> >
> > Could yo move that logic into slab_order()? It does something awfully
> > similar.
>
> But slab_order (and its caller) limits the order to "max_order" and we
> want more.
>
> Perhaps slab_order should be dropped and calculate_order totally
> rewritten?
>
> Mikulas
Do you want this? It deletes slab_order and replaces it with the
"minimize_waste" logic directly.
The patch starts with a minimal order for a given size and increases the
order if one of these conditions is met:
* we are below slub_min_order
* we are below min_objects and slub_max_order
* we go above slub_max_order only if it minimizes waste and if we don't
increase the object count above 32
It simplifies the code and it is very similar to the old algorithms, most
slab caches have the same order, so it shouldn't cause any regressions.
This patch changes order of these slabs:
TCPv6: 3 -> 4
sighand_cache: 3 -> 4
task_struct: 3 -> 4
---
mm/slub.c | 76 +++++++++++++++++++++-----------------------------------------
1 file changed, 26 insertions(+), 50 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2018-04-26 00:07:30.000000000 +0200
+++ linux-2.6/mm/slub.c 2018-04-26 00:21:37.000000000 +0200
@@ -3224,34 +3224,10 @@ static unsigned int slub_min_objects;
* requested a higher mininum order then we start with that one instead of
* the smallest order which will fit the object.
*/
-static inline unsigned int slab_order(unsigned int size,
- unsigned int min_objects, unsigned int max_order,
- unsigned int fract_leftover, unsigned int reserved)
-{
- unsigned int min_order = slub_min_order;
- unsigned int order;
-
- if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
- return get_order(size * MAX_OBJS_PER_PAGE) - 1;
-
- for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved));
- order <= max_order; order++) {
-
- unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
- unsigned int rem;
-
- rem = (slab_size - reserved) % size;
-
- if (rem <= slab_size / fract_leftover)
- break;
- }
-
- return order;
-}
-
static inline int calculate_order(unsigned int size, unsigned int reserved)
{
unsigned int order;
+ unsigned int test_order;
unsigned int min_objects;
unsigned int max_objects;
@@ -3269,35 +3245,35 @@ static inline int calculate_order(unsign
max_objects = order_objects(slub_max_order, size, reserved);
min_objects = min(min_objects, max_objects);
- while (min_objects > 1) {
- unsigned int fraction;
+ /* Get the minimum acceptable order for one object */
+ order = get_order(size + reserved);
+
+ for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
+ unsigned order_obj = order_objects(order, size, reserved);
+ unsigned test_order_obj = order_objects(test_order, size, reserved);
+
+ /* If there are too many objects, stop searching */
+ if (test_order_obj > MAX_OBJS_PER_PAGE)
+ break;
- fraction = 16;
- while (fraction >= 4) {
- order = slab_order(size, min_objects,
- slub_max_order, fraction, reserved);
- if (order <= slub_max_order)
- return order;
- fraction /= 2;
- }
- min_objects--;
+ /* Always increase up to slub_min_order */
+ if (test_order <= slub_min_order)
+ order = test_order;
+
+ /* If we are below min_objects and slub_max_order, increase order */
+ if (order_obj < min_objects && test_order <= slub_max_order)
+ order = test_order;
+
+ /* Increase order even more, but only if it reduces waste */
+ if (test_order_obj <= 32 &&
+ test_order_obj > order_obj << (test_order - order))
+ order = test_order;
}
- /*
- * We were unable to place multiple objects in a slab. Now
- * lets see if we can place a single object there.
- */
- order = slab_order(size, 1, slub_max_order, 1, reserved);
- if (order <= slub_max_order)
- return order;
+ if (order >= MAX_ORDER)
+ return -ENOSYS;
- /*
- * Doh this slab cannot be placed using slub_max_order.
- */
- order = slab_order(size, 1, MAX_ORDER, 1, reserved);
- if (order < MAX_ORDER)
- return order;
- return -ENOSYS;
+ return order;
}
static void
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-25 21:04 ` Mikulas Patocka
2018-04-25 23:24 ` Mikulas Patocka
@ 2018-04-26 18:51 ` Christopher Lameter
1 sibling, 0 replies; 109+ messages in thread
From: Christopher Lameter @ 2018-04-26 18:51 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Wed, 25 Apr 2018, Mikulas Patocka wrote:
> >
> > Could yo move that logic into slab_order()? It does something awfully
> > similar.
>
> But slab_order (and its caller) limits the order to "max_order" and we
> want more.
>
> Perhaps slab_order should be dropped and calculate_order totally
> rewritten?
Yes you likely need to do something creative with max_order if not with
more stuff.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-25 23:24 ` Mikulas Patocka
@ 2018-04-26 19:01 ` Christopher Lameter
2018-04-26 21:09 ` Mikulas Patocka
0 siblings, 1 reply; 109+ messages in thread
From: Christopher Lameter @ 2018-04-26 19:01 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Wed, 25 Apr 2018, Mikulas Patocka wrote:
> Do you want this? It deletes slab_order and replaces it with the
> "minimize_waste" logic directly.
Well yes that looks better. Now we need to make it easy to read and less
complicated. Maybe try to keep as much as possible of the old code
and also the names of variables to make it easier to review?
> It simplifies the code and it is very similar to the old algorithms, most
> slab caches have the same order, so it shouldn't cause any regressions.
>
> This patch changes order of these slabs:
> TCPv6: 3 -> 4
> sighand_cache: 3 -> 4
> task_struct: 3 -> 4
Hmmm... order 4 for these caches may cause some concern. These should stay
under costly order I think. Otherwise allocations are no longer
guaranteed.
> @@ -3269,35 +3245,35 @@ static inline int calculate_order(unsign
> max_objects = order_objects(slub_max_order, size, reserved);
> min_objects = min(min_objects, max_objects);
>
> - while (min_objects > 1) {
> - unsigned int fraction;
> + /* Get the minimum acceptable order for one object */
> + order = get_order(size + reserved);
> +
> + for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
> + unsigned order_obj = order_objects(order, size, reserved);
> + unsigned test_order_obj = order_objects(test_order, size, reserved);
> +
> + /* If there are too many objects, stop searching */
> + if (test_order_obj > MAX_OBJS_PER_PAGE)
> + break;
>
> - fraction = 16;
> - while (fraction >= 4) {
> - order = slab_order(size, min_objects,
> - slub_max_order, fraction, reserved);
> - if (order <= slub_max_order)
> - return order;
> - fraction /= 2;
> - }
> - min_objects--;
> + /* Always increase up to slub_min_order */
> + if (test_order <= slub_min_order)
> + order = test_order;
Well that is a significant change. In our current scheme the order
boundary wins.
> +
> + /* If we are below min_objects and slub_max_order, increase order */
> + if (order_obj < min_objects && test_order <= slub_max_order)
> + order = test_order;
> +
> + /* Increase order even more, but only if it reduces waste */
> + if (test_order_obj <= 32 &&
Where does the 32 come from?
> + test_order_obj > order_obj << (test_order - order))
Add more () to make the condition better readable.
> + order = test_order;
Can we just call test_order order and avoid using the long variable names
here? Variable names in functions are typically short.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-26 19:01 ` Christopher Lameter
@ 2018-04-26 21:09 ` Mikulas Patocka
2018-04-27 16:41 ` Christopher Lameter
0 siblings, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-26 21:09 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Thu, 26 Apr 2018, Christopher Lameter wrote:
> On Wed, 25 Apr 2018, Mikulas Patocka wrote:
>
> > Do you want this? It deletes slab_order and replaces it with the
> > "minimize_waste" logic directly.
>
> Well yes that looks better. Now we need to make it easy to read and less
> complicated. Maybe try to keep as much as possible of the old code
> and also the names of variables to make it easier to review?
>
> > It simplifies the code and it is very similar to the old algorithms, most
> > slab caches have the same order, so it shouldn't cause any regressions.
> >
> > This patch changes order of these slabs:
> > TCPv6: 3 -> 4
> > sighand_cache: 3 -> 4
> > task_struct: 3 -> 4
>
> Hmmm... order 4 for these caches may cause some concern. These should stay
> under costly order I think. Otherwise allocations are no longer
> guaranteed.
You said that slub has fallback to smaller order allocations.
The whole purpose of this "minimize waste" approach is to use higher-order
allocations to use memory more efficiently, so it is just doing its job.
(for these 3 caches, order-4 really wastes less memory than order-3 - on
my system TCPv6 and sighand_cache have size 2112, task_struct 2752).
We could improve the fallback code, so that if order-4 allocation fails,
it tries order-3 allocation, and then falls back to order-0. But I think
that these failures are rare enough that it is not a problem.
> > @@ -3269,35 +3245,35 @@ static inline int calculate_order(unsign
> > max_objects = order_objects(slub_max_order, size, reserved);
> > min_objects = min(min_objects, max_objects);
> >
> > - while (min_objects > 1) {
> > - unsigned int fraction;
> > + /* Get the minimum acceptable order for one object */
> > + order = get_order(size + reserved);
> > +
> > + for (test_order = order + 1; test_order < MAX_ORDER; test_order++) {
> > + unsigned order_obj = order_objects(order, size, reserved);
> > + unsigned test_order_obj = order_objects(test_order, size, reserved);
> > +
> > + /* If there are too many objects, stop searching */
> > + if (test_order_obj > MAX_OBJS_PER_PAGE)
> > + break;
> >
> > - fraction = 16;
> > - while (fraction >= 4) {
> > - order = slab_order(size, min_objects,
> > - slub_max_order, fraction, reserved);
> > - if (order <= slub_max_order)
> > - return order;
> > - fraction /= 2;
> > - }
> > - min_objects--;
> > + /* Always increase up to slub_min_order */
> > + if (test_order <= slub_min_order)
> > + order = test_order;
>
> Well that is a significant change. In our current scheme the order
> boundart wins.
I think it's not a change. The existing function slab_order() starts with
min_order (unless it overshoots MAX_OBJS_PER_PAGE) and then goes upwards.
My code does the same - my code tests for MAX_OBJS_PER_PAGE (and bails out
if we would overshoot it) and increases the order until it reaches
slub_min_order (and then increases it even more if it satisfies the other
conditions).
If you believe that it behaves differently, please describe the situation
in detail.
> > +
> > + /* If we are below min_objects and slub_max_order, increase order */
> > + if (order_obj < min_objects && test_order <= slub_max_order)
> > + order = test_order;
> > +
> > + /* Increase order even more, but only if it reduces waste */
> > + if (test_order_obj <= 32 &&
>
> Where does the 32 come from?
It is to avoid extremely high order for extremely small slabs.
For example, see kmalloc-96.
10922 96-byte objects would fit into 1MiB
21845 96-byte objects would fit into 2MiB
The algorithm would recognize this one more object that fits into 2MiB
slab as "waste reduction" and increase the order to 2MiB - and we don't
want this.
So, the general reasoning is - if we have 32 objects in a slab, then it is
already considered that wasted space is reasonably low and we don't want
to increase the order more.
Currently, kmalloc-96 uses order-0 - that is reasonable (we already have
42 objects in 4k page, so we don't need to use higher order, even if it
wastes one-less object).
> > + test_order_obj > order_obj << (test_order - order))
>
> Add more () to make the condition better readable.
>
> > + order = test_order;
>
> Can we just call test_order order and avoid using the long variable names
> here? Variable names in functions are typically short.
You need two variables - "order" and "test_order".
"order" is the best order found so far and "test_order" is the order that
we are now testing. If "test_order" wastes less space than "order", we
assign order = test_order.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-26 21:09 ` Mikulas Patocka
@ 2018-04-27 16:41 ` Christopher Lameter
2018-04-27 19:19 ` Mikulas Patocka
0 siblings, 1 reply; 109+ messages in thread
From: Christopher Lameter @ 2018-04-27 16:41 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Thu, 26 Apr 2018, Mikulas Patocka wrote:
> > Hmmm... order 4 for these caches may cause some concern. These should stay
> > under costly order I think. Otherwise allocations are no longer
> > guaranteed.
>
> You said that slub has fallback to smaller order allocations.
Yes it does...
> The whole purpose of this "minimize waste" approach is to use higher-order
> allocations to use memory more efficiently, so it is just doing its job.
> (for these 3 caches, order-4 really wastes less memory than order-3 - on
> my system TCPv6 and sighand_cache have size 2112, task_struct 2752).
Hmmm... Ok if the others are fine with this as well. I got some pushback
there in the past.
> We could improve the fallback code, so that if order-4 allocation fails,
> it tries order-3 allocation, and then falls back to order-0. But I think
> that these failures are rare enough that it is not a problem.
I also think that would be too many fallbacks.
> > > + /* Increase order even more, but only if it reduces waste */
> > > + if (test_order_obj <= 32 &&
> >
> > Where does the 32 come from?
>
> It is to avoid extremely high order for extremely small slabs.
>
> For example, see kmalloc-96.
> 10922 96-byte objects would fit into 1MiB
> 21845 96-byte objects would fit into 2MiB
That is the result of considering absolute byte wastage..
> The algorithm would recognize this one more object that fits into 2MiB
> slab as "waste reduction" and increase the order to 2MiB - and we don't
> want this.
>
> So, the general reasoning is - if we have 32 objects in a slab, then it is
> already considered that wasted space is reasonably low and we don't want
> to increase the order more.
>
> Currently, kmalloc-96 uses order-0 - that is reasonable (we already have
> 42 objects in 4k page, so we don't need to use higher order, even if it
> wastes one-less object).
The old code uses the concept of a "fraction" to calculate overhead. The
code here uses absolute counts of bytes. Fraction looks better to me.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-27 16:41 ` Christopher Lameter
@ 2018-04-27 19:19 ` Mikulas Patocka
2018-06-13 17:01 ` Mikulas Patocka
0 siblings, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-04-27 19:19 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
On Fri, 27 Apr 2018, Christopher Lameter wrote:
> On Thu, 26 Apr 2018, Mikulas Patocka wrote:
>
> > > Hmmm... order 4 for these caches may cause some concern. These should stay
> > > under costly order I think. Otherwise allocations are no longer
> > > guaranteed.
> >
> > You said that slub has fallback to smaller order allocations.
>
> Yes it does...
>
> > The whole purpose of this "minimize waste" approach is to use higher-order
> > allocations to use memory more efficiently, so it is just doing its job.
> > (for these 3 caches, order-4 really wastes less memory than order-3 - on
> > my system TCPv6 and sighand_cache have size 2112, task_struct 2752).
>
> Hmmm... Ok if the others are fine with this as well. I got some pushback
> there in the past.
>
> > We could improve the fallback code, so that if order-4 allocation fails,
> > it tries order-3 allocation, and then falls back to order-0. But I think
> > that these failures are rare enough that it is not a problem.
>
> I also think that would be too many fallbacks.
You are right - it's better to fallback to the minimum possible size, so
that the allocation is faster.
> The old code uses the concept of a "fraction" to calculate overhead. The
> code here uses absolute counts of bytes. Fraction looks better to me.
OK - I reworked the patch using the same "fraction" calculation as before.
The existing logic targets 1/16 wasted space, so I used this target in
this patch too.
This patch increases only the order of task_struct (from 3 to 4), all the
other caches have the same order as before.
Mikulas
From: Mikulas Patocka <mpatocka@redhat.com>
Subject: [PATCH] slub: use higher order to reduce wasted space
If we create a slub cache with large object size (larger than
slub_max_order), the slub subsystem currently rounds up the object size to
the next power of two.
This is inefficient, because it wastes too much space. We use the slab
cache as a buffer cache in dm-bufio, in order to use the memory
efficiently, we need to reduce wasted space.
This patch reworks the slub order calculation algorithm, so that it uses
higher order allocations if it would reduce wasted space. The slub
subsystem has fallback if the higher-order allocations fails, so using
order higher than PAGE_ALLOC_COSTLY_ORDER is ok.
The new algorithm first calculates the minimum order that can be used for
a given object size and then increases the order according to these
conditions:
* if we would overshoot MAX_OBJS_PER_PAGE, don't increase
* if we are below slub_min_order, increase
* if we are below slub_max_order and below min_objects, increase
* we increase above slub_max_order only if it reduces wasted space and if
we already waste at least 1/16 of the compound page
The new algorithm gives very similar results to the old one, all the
caches on my system have the same order as before, only the order of
task_struct (size 2752) is increased from 3 to 4.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
mm/slub.c | 82 +++++++++++++++++++++++---------------------------------------
1 file changed, 31 insertions(+), 51 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2018-04-27 19:30:34.000000000 +0200
+++ linux-2.6/mm/slub.c 2018-04-27 21:05:53.000000000 +0200
@@ -3224,34 +3224,10 @@ static unsigned int slub_min_objects;
* requested a higher mininum order then we start with that one instead of
* the smallest order which will fit the object.
*/
-static inline unsigned int slab_order(unsigned int size,
- unsigned int min_objects, unsigned int max_order,
- unsigned int fract_leftover, unsigned int reserved)
+static int calculate_order(unsigned int size, unsigned int reserved)
{
- unsigned int min_order = slub_min_order;
- unsigned int order;
-
- if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
- return get_order(size * MAX_OBJS_PER_PAGE) - 1;
-
- for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved));
- order <= max_order; order++) {
-
- unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
- unsigned int rem;
-
- rem = (slab_size - reserved) % size;
-
- if (rem <= slab_size / fract_leftover)
- break;
- }
-
- return order;
-}
-
-static inline int calculate_order(unsigned int size, unsigned int reserved)
-{
- unsigned int order;
+ unsigned int best_order;
+ unsigned int test_order;
unsigned int min_objects;
unsigned int max_objects;
@@ -3269,34 +3245,38 @@ static inline int calculate_order(unsign
max_objects = order_objects(slub_max_order, size, reserved);
min_objects = min(min_objects, max_objects);
- while (min_objects > 1) {
- unsigned int fraction;
+ /* Get the minimum acceptable order for one object */
+ best_order = get_order(size + reserved);
+
+ for (test_order = best_order + 1; test_order < MAX_ORDER; test_order++) {
+ unsigned best_order_obj = order_objects(best_order, size, reserved);
+ unsigned test_order_obj = order_objects(test_order, size, reserved);
+
+ unsigned best_order_slab_size = (unsigned int)PAGE_SIZE << best_order;
+ unsigned best_order_rem = (best_order_slab_size - reserved) % size;
+
+ /* If there would be too many objects, stop searching */
+ if (test_order_obj > MAX_OBJS_PER_PAGE)
+ break;
- fraction = 16;
- while (fraction >= 4) {
- order = slab_order(size, min_objects,
- slub_max_order, fraction, reserved);
- if (order <= slub_max_order)
- return order;
- fraction /= 2;
- }
- min_objects--;
+ /* Always increase up to slub_min_order */
+ if (test_order <= slub_min_order)
+ best_order = test_order;
+
+ /* If we are below min_objects and slub_max_order, increase the order */
+ if (best_order_obj < min_objects && test_order <= slub_max_order)
+ best_order = test_order;
+
+ /* Increase the order even more, but only if it reduces waste */
+ /* If we already waste less than 1/16, don't increase it */
+ if (best_order_rem >= (best_order_slab_size / 16) &&
+ test_order_obj > (best_order_obj << (test_order - best_order)))
+ best_order = test_order;
}
- /*
- * We were unable to place multiple objects in a slab. Now
- * lets see if we can place a single object there.
- */
- order = slab_order(size, 1, slub_max_order, 1, reserved);
- if (order <= slub_max_order)
- return order;
+ if (best_order < MAX_ORDER)
+ return best_order;
- /*
- * Doh this slab cannot be placed using slub_max_order.
- */
- order = slab_order(size, 1, MAX_ORDER, 1, reserved);
- if (order < MAX_ORDER)
- return order;
return -ENOSYS;
}
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-04-27 19:19 ` Mikulas Patocka
@ 2018-06-13 17:01 ` Mikulas Patocka
2018-06-13 18:16 ` Christoph Hellwig
0 siblings, 1 reply; 109+ messages in thread
From: Mikulas Patocka @ 2018-06-13 17:01 UTC (permalink / raw)
To: Christopher Lameter
Cc: Mike Snitzer, Vlastimil Babka, Matthew Wilcox, Pekka Enberg,
linux-mm, dm-devel, David Rientjes, Joonsoo Kim, Andrew Morton,
linux-kernel
Hi
I'd like to ask about this patch - will you commit it, or do you want to
make some more changes to it?
Mikulas
On Fri, 27 Apr 2018, Mikulas Patocka wrote:
>
>
> On Fri, 27 Apr 2018, Christopher Lameter wrote:
>
> > On Thu, 26 Apr 2018, Mikulas Patocka wrote:
> >
> > > > Hmmm... order 4 for these caches may cause some concern. These should stay
> > > > under costly order I think. Otherwise allocations are no longer
> > > > guaranteed.
> > >
> > > You said that slub has fallback to smaller order allocations.
> >
> > Yes it does...
> >
> > > The whole purpose of this "minimize waste" approach is to use higher-order
> > > allocations to use memory more efficiently, so it is just doing its job.
> > > (for these 3 caches, order-4 really wastes less memory than order-3 - on
> > > my system TCPv6 and sighand_cache have size 2112, task_struct 2752).
> >
> > Hmmm... Ok if the others are fine with this as well. I got some pushback
> > there in the past.
> >
> > > We could improve the fallback code, so that if order-4 allocation fails,
> > > it tries order-3 allocation, and then falls back to order-0. But I think
> > > that these failures are rare enough that it is not a problem.
> >
> > I also think that would be too many fallbacks.
>
> You are right - it's better to fallback to the minimum possible size, so
> that the allocation is faster.
>
> > The old code uses the concept of a "fraction" to calculate overhead. The
> > code here uses absolute counts of bytes. Fraction looks better to me.
>
> OK - I reworked the patch using the same "fraction" calculation as before.
> The existing logic targets 1/16 wasted space, so I used this target in
> this patch too.
>
> This patch increases only the order of task_struct (from 3 to 4), all the
> other caches have the same order as before.
>
> Mikulas
>
>
>
> From: Mikulas Patocka <mpatocka@redhat.com>
> Subject: [PATCH] slub: use higher order to reduce wasted space
>
> If we create a slub cache with large object size (larger than
> slub_max_order), the slub subsystem currently rounds up the object size to
> the next power of two.
>
> This is inefficient, because it wastes too much space. We use the slab
> cache as a buffer cache in dm-bufio, in order to use the memory
> efficiently, we need to reduce wasted space.
>
> This patch reworks the slub order calculation algorithm, so that it uses
> higher order allocations if it would reduce wasted space. The slub
> subsystem has a fallback if the higher-order allocation fails, so using
> order higher than PAGE_ALLOC_COSTLY_ORDER is ok.
>
> The new algorithm first calculates the minimum order that can be used for
> a given object size and then increases the order according to these
> conditions:
> * if we would overshoot MAX_OBJS_PER_PAGE, don't increase
> * if we are below slub_min_order, increase
> * if we are below slub_max_order and below min_objects, increase
> * we increase above slub_max_order only if it reduces wasted space and if
> we already waste at least 1/16 of the compound page
>
> The new algorithm gives very similar results to the old one, all the
> caches on my system have the same order as before, only the order of
> task_struct (size 2752) is increased from 3 to 4.
>
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
>
> ---
> mm/slub.c | 82 +++++++++++++++++++++++---------------------------------------
> 1 file changed, 31 insertions(+), 51 deletions(-)
>
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c 2018-04-27 19:30:34.000000000 +0200
> +++ linux-2.6/mm/slub.c 2018-04-27 21:05:53.000000000 +0200
> @@ -3224,34 +3224,10 @@ static unsigned int slub_min_objects;
> * requested a higher mininum order then we start with that one instead of
> * the smallest order which will fit the object.
> */
> -static inline unsigned int slab_order(unsigned int size,
> - unsigned int min_objects, unsigned int max_order,
> - unsigned int fract_leftover, unsigned int reserved)
> +static int calculate_order(unsigned int size, unsigned int reserved)
> {
> - unsigned int min_order = slub_min_order;
> - unsigned int order;
> -
> - if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
> - return get_order(size * MAX_OBJS_PER_PAGE) - 1;
> -
> - for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved));
> - order <= max_order; order++) {
> -
> - unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
> - unsigned int rem;
> -
> - rem = (slab_size - reserved) % size;
> -
> - if (rem <= slab_size / fract_leftover)
> - break;
> - }
> -
> - return order;
> -}
> -
> -static inline int calculate_order(unsigned int size, unsigned int reserved)
> -{
> - unsigned int order;
> + unsigned int best_order;
> + unsigned int test_order;
> unsigned int min_objects;
> unsigned int max_objects;
>
> @@ -3269,34 +3245,38 @@ static inline int calculate_order(unsign
> max_objects = order_objects(slub_max_order, size, reserved);
> min_objects = min(min_objects, max_objects);
>
> - while (min_objects > 1) {
> - unsigned int fraction;
> + /* Get the minimum acceptable order for one object */
> + best_order = get_order(size + reserved);
> +
> + for (test_order = best_order + 1; test_order < MAX_ORDER; test_order++) {
> + unsigned best_order_obj = order_objects(best_order, size, reserved);
> + unsigned test_order_obj = order_objects(test_order, size, reserved);
> +
> + unsigned best_order_slab_size = (unsigned int)PAGE_SIZE << best_order;
> + unsigned best_order_rem = (best_order_slab_size - reserved) % size;
> +
> + /* If there would be too many objects, stop searching */
> + if (test_order_obj > MAX_OBJS_PER_PAGE)
> + break;
>
> - fraction = 16;
> - while (fraction >= 4) {
> - order = slab_order(size, min_objects,
> - slub_max_order, fraction, reserved);
> - if (order <= slub_max_order)
> - return order;
> - fraction /= 2;
> - }
> - min_objects--;
> + /* Always increase up to slub_min_order */
> + if (test_order <= slub_min_order)
> + best_order = test_order;
> +
> + /* If we are below min_objects and slub_max_order, increase the order */
> + if (best_order_obj < min_objects && test_order <= slub_max_order)
> + best_order = test_order;
> +
> + /* Increase the order even more, but only if it reduces waste */
> + /* If we already waste less than 1/16, don't increase it */
> + if (best_order_rem >= (best_order_slab_size / 16) &&
> + test_order_obj > (best_order_obj << (test_order - best_order)))
> + best_order = test_order;
> }
>
> - /*
> - * We were unable to place multiple objects in a slab. Now
> - * lets see if we can place a single object there.
> - */
> - order = slab_order(size, 1, slub_max_order, 1, reserved);
> - if (order <= slub_max_order)
> - return order;
> + if (best_order < MAX_ORDER)
> + return best_order;
>
> - /*
> - * Doh this slab cannot be placed using slub_max_order.
> - */
> - order = slab_order(size, 1, MAX_ORDER, 1, reserved);
> - if (order < MAX_ORDER)
> - return order;
> return -ENOSYS;
> }
>
>
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-06-13 17:01 ` Mikulas Patocka
@ 2018-06-13 18:16 ` Christoph Hellwig
2018-06-13 18:53 ` Mikulas Patocka
0 siblings, 1 reply; 109+ messages in thread
From: Christoph Hellwig @ 2018-06-13 18:16 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Christopher Lameter, Mike Snitzer, Vlastimil Babka,
Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel, David Rientjes,
Joonsoo Kim, Andrew Morton, linux-kernel
On Wed, Jun 13, 2018 at 01:01:22PM -0400, Mikulas Patocka wrote:
> Hi
>
> I'd like to ask about this patch - will you commit it, or do you want to
> make some more changes to it?
How about you resend it with the series adding an actual user once
ready? I haven't actually seen patches using it posted on any list yet.
^ permalink raw reply [flat|nested] 109+ messages in thread
* Re: [PATCH RESEND] slab: introduce the flag SLAB_MINIMIZE_WASTE
2018-06-13 18:16 ` Christoph Hellwig
@ 2018-06-13 18:53 ` Mikulas Patocka
0 siblings, 0 replies; 109+ messages in thread
From: Mikulas Patocka @ 2018-06-13 18:53 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Christopher Lameter, Mike Snitzer, Vlastimil Babka,
Matthew Wilcox, Pekka Enberg, linux-mm, dm-devel, David Rientjes,
Joonsoo Kim, Andrew Morton, linux-kernel
On Wed, 13 Jun 2018, Christoph Hellwig wrote:
> On Wed, Jun 13, 2018 at 01:01:22PM -0400, Mikulas Patocka wrote:
> > Hi
> >
> > I'd like to ask about this patch - will you commit it, or do you want to
> > make some more changes to it?
>
> How about you resend it with the series adding an actual user once
> ready? I haven't actually seen patches using it posted on any list yet.
dm-bufio is already using it. Starting with the kernel 4.17 (f51f2e0a7fb1
- "dm bufio: support non-power-of-two block sizes"), dm-bufio has the
capability to use non-power-of-two buffers. It uses slab cache for its
buffers - so we would like to have this slab optimization - to avoid
excessive memory wasting.
Originally, the slub patch used a new flag SLAB_MINIMIZE_WASTE, but after
a suggestion from others, I reworked the patch so that it minimizes waste
of all slub caches and doesn't need an extra flag to activate.
Mikulas
^ permalink raw reply [flat|nested] 109+ messages in thread
end of thread, other threads:[~2018-06-13 18:53 UTC | newest]
Thread overview: 109+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-20 17:25 [PATCH] slab: introduce the flag SLAB_MINIMIZE_WASTE Mikulas Patocka
2018-03-20 17:35 ` Matthew Wilcox
2018-03-20 17:35 ` Matthew Wilcox
2018-03-20 17:54 ` Christopher Lameter
2018-03-20 17:54 ` Christopher Lameter
2018-03-20 19:22 ` Mikulas Patocka
2018-03-20 19:22 ` Mikulas Patocka
2018-03-20 20:42 ` Christopher Lameter
2018-03-20 20:42 ` Christopher Lameter
2018-03-20 22:02 ` Mikulas Patocka
2018-03-20 22:02 ` Mikulas Patocka
2018-03-21 15:35 ` Christopher Lameter
2018-03-21 15:35 ` Christopher Lameter
2018-03-21 16:25 ` Mikulas Patocka
2018-03-21 16:25 ` Mikulas Patocka
2018-03-21 17:10 ` Matthew Wilcox
2018-03-21 17:10 ` Matthew Wilcox
2018-03-21 17:30 ` Christopher Lameter
2018-03-21 17:30 ` Christopher Lameter
2018-03-21 17:39 ` Christopher Lameter
2018-03-21 17:39 ` Christopher Lameter
2018-03-21 17:49 ` Matthew Wilcox
2018-03-21 17:49 ` Matthew Wilcox
2018-03-21 18:01 ` Christopher Lameter
2018-03-21 18:01 ` Christopher Lameter
2018-03-21 18:23 ` Mikulas Patocka
2018-03-21 18:23 ` Mikulas Patocka
2018-03-21 18:40 ` Christopher Lameter
2018-03-21 18:40 ` Christopher Lameter
2018-03-21 18:55 ` Mikulas Patocka
2018-03-21 18:55 ` Mikulas Patocka
2018-03-21 18:55 ` Matthew Wilcox
2018-03-21 18:55 ` Matthew Wilcox
2018-03-21 18:58 ` Christopher Lameter
2018-03-21 18:58 ` Christopher Lameter
2018-03-21 19:25 ` Mikulas Patocka
2018-03-21 19:25 ` Mikulas Patocka
2018-03-21 18:36 ` Mikulas Patocka
2018-03-21 18:36 ` Mikulas Patocka
2018-03-21 18:57 ` Christopher Lameter
2018-03-21 18:57 ` Christopher Lameter
2018-03-21 19:19 ` Mikulas Patocka
2018-03-21 19:19 ` Mikulas Patocka
2018-03-21 20:09 ` Christopher Lameter
2018-03-21 20:09 ` Christopher Lameter
2018-03-21 20:37 ` Mikulas Patocka
2018-03-21 20:37 ` Mikulas Patocka
2018-03-23 15:10 ` Christopher Lameter
2018-03-23 15:10 ` Christopher Lameter
2018-03-23 15:31 ` Mikulas Patocka
2018-03-23 15:31 ` Mikulas Patocka
2018-03-23 15:48 ` Christopher Lameter
2018-03-23 15:48 ` Christopher Lameter
2018-04-13 9:22 ` Vlastimil Babka
2018-04-13 9:22 ` Vlastimil Babka
2018-04-13 15:10 ` Mike Snitzer
2018-04-13 15:10 ` Mike Snitzer
2018-04-16 12:38 ` Vlastimil Babka
2018-04-16 12:38 ` Vlastimil Babka
2018-04-16 14:27 ` Mike Snitzer
2018-04-16 14:27 ` Mike Snitzer
2018-04-16 14:37 ` Mikulas Patocka
2018-04-16 14:37 ` Mikulas Patocka
2018-04-16 14:46 ` Mike Snitzer
2018-04-16 14:46 ` Mike Snitzer
2018-04-16 14:57 ` Mikulas Patocka
2018-04-16 14:57 ` Mikulas Patocka
2018-04-16 15:18 ` Christopher Lameter
2018-04-16 15:18 ` Christopher Lameter
2018-04-16 15:25 ` Mikulas Patocka
2018-04-16 15:25 ` Mikulas Patocka
2018-04-16 15:45 ` Christopher Lameter
2018-04-16 15:45 ` Christopher Lameter
2018-04-16 19:36 ` Mikulas Patocka
2018-04-16 19:36 ` Mikulas Patocka
2018-04-16 19:53 ` Vlastimil Babka
2018-04-16 21:01 ` Mikulas Patocka
2018-04-17 14:40 ` Christopher Lameter
2018-04-17 18:53 ` Mikulas Patocka
2018-04-17 18:53 ` Mikulas Patocka
2018-04-17 21:42 ` Christopher Lameter
2018-04-17 14:49 ` Christopher Lameter
2018-04-17 14:49 ` Christopher Lameter
2018-04-17 14:47 ` Christopher Lameter
2018-04-17 14:47 ` Christopher Lameter
2018-04-16 19:32 ` [PATCH RESEND] " Mikulas Patocka
2018-04-17 14:45 ` Christopher Lameter
2018-04-17 16:16 ` Vlastimil Babka
2018-04-17 16:38 ` Christopher Lameter
2018-04-17 19:09 ` Mikulas Patocka
2018-04-17 17:26 ` Mikulas Patocka
2018-04-17 19:13 ` Vlastimil Babka
2018-04-17 19:06 ` Mikulas Patocka
2018-04-17 19:06 ` Mikulas Patocka
2018-04-18 14:55 ` Christopher Lameter
2018-04-25 21:04 ` Mikulas Patocka
2018-04-25 23:24 ` Mikulas Patocka
2018-04-26 19:01 ` Christopher Lameter
2018-04-26 21:09 ` Mikulas Patocka
2018-04-27 16:41 ` Christopher Lameter
2018-04-27 19:19 ` Mikulas Patocka
2018-06-13 17:01 ` Mikulas Patocka
2018-06-13 18:16 ` Christoph Hellwig
2018-06-13 18:53 ` Mikulas Patocka
2018-04-26 18:51 ` Christopher Lameter
2018-04-16 19:38 ` Vlastimil Babka
2018-04-16 19:38 ` Vlastimil Babka
2018-04-16 21:04 ` Mikulas Patocka
2018-04-16 21:04 ` Mikulas Patocka
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.