* slub: remove dynamic dma slab allocation
@ 2010-06-15 19:07 Christoph Lameter
  2010-06-15 19:11 ` [RFC] slub: Simplify boot kmem_cache_cpu allocations Christoph Lameter
  2010-06-18 22:30 ` slub: remove dynamic dma slab allocation David Rientjes
  0 siblings, 2 replies; 21+ messages in thread
From: Christoph Lameter @ 2010-06-15 19:07 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: David Rientjes, linux-mm

Subject: slub: remove dynamic dma slab allocation

Remove the dynamic dma slab allocation since it causes too many issues with
nested locks and the like. The change also avoids passing gfpflags into many functions.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 mm/slub.c |  153 ++++++++++++++++----------------------------------------------
 1 file changed, 41 insertions(+), 112 deletions(-)

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2010-06-15 12:40:58.000000000 -0500
+++ linux-2.6/mm/slub.c	2010-06-15 12:41:36.000000000 -0500
@@ -2070,7 +2070,7 @@ init_kmem_cache_node(struct kmem_cache_n

 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);

-static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
 {
 	if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
 		/*
@@ -2097,7 +2097,7 @@ static inline int alloc_kmem_cache_cpus(
  * when allocating for the kmalloc_node_cache. This is used for bootstrapping
  * memory on a fresh node that has no slab structures yet.
  */
-static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
+static void early_kmem_cache_node_alloc(int node)
 {
 	struct page *page;
 	struct kmem_cache_node *n;
@@ -2105,7 +2105,7 @@ static void early_kmem_cache_node_alloc(

 	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));

-	page = new_slab(kmalloc_caches, gfpflags, node);
+	page = new_slab(kmalloc_caches, GFP_KERNEL, node);

 	BUG_ON(!page);
 	if (page_to_nid(page) != node) {
@@ -2149,7 +2149,7 @@ static void free_kmem_cache_nodes(struct
 	}
 }

-static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
+static int init_kmem_cache_nodes(struct kmem_cache *s)
 {
 	int node;

@@ -2157,11 +2157,11 @@ static int init_kmem_cache_nodes(struct
 		struct kmem_cache_node *n;

 		if (slab_state == DOWN) {
-			early_kmem_cache_node_alloc(gfpflags, node);
+			early_kmem_cache_node_alloc(node);
 			continue;
 		}
 		n = kmem_cache_alloc_node(kmalloc_caches,
-						gfpflags, node);
+						GFP_KERNEL, node);

 		if (!n) {
 			free_kmem_cache_nodes(s);
@@ -2178,7 +2178,7 @@ static void free_kmem_cache_nodes(struct
 {
 }

-static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
+static int init_kmem_cache_nodes(struct kmem_cache *s)
 {
 	init_kmem_cache_node(&s->local_node, s);
 	return 1;
@@ -2318,7 +2318,7 @@ static int calculate_sizes(struct kmem_c

 }

-static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
+static int kmem_cache_open(struct kmem_cache *s,
 		const char *name, size_t size,
 		size_t align, unsigned long flags,
 		void (*ctor)(void *))
@@ -2354,10 +2354,10 @@ static int kmem_cache_open(struct kmem_c
 #ifdef CONFIG_NUMA
 	s->remote_node_defrag_ratio = 1000;
 #endif
-	if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
+	if (!init_kmem_cache_nodes(s))
 		goto error;

-	if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
+	if (alloc_kmem_cache_cpus(s))
 		return 1;

 	free_kmem_cache_nodes(s);
@@ -2517,6 +2517,10 @@ EXPORT_SYMBOL(kmem_cache_destroy);
 struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
 EXPORT_SYMBOL(kmalloc_caches);

+#ifdef CONFIG_ZONE_DMA
+static struct kmem_cache kmalloc_dma_caches[SLUB_PAGE_SHIFT];
+#endif
+
 static int __init setup_slub_min_order(char *str)
 {
 	get_option(&str, &slub_min_order);
@@ -2553,116 +2557,26 @@ static int __init setup_slub_nomerge(cha

 __setup("slub_nomerge", setup_slub_nomerge);

-static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
-		const char *name, int size, gfp_t gfp_flags)
+static void create_kmalloc_cache(struct kmem_cache *s,
+		const char *name, int size, unsigned int flags)
 {
-	unsigned int flags = 0;
-
-	if (gfp_flags & SLUB_DMA)
-		flags = SLAB_CACHE_DMA;
-
 	/*
 	 * This function is called with IRQs disabled during early-boot on
 	 * single CPU so there's no need to take slub_lock here.
 	 */
-	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
+	if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
 								flags, NULL))
 		goto panic;

 	list_add(&s->list, &slab_caches);

-	if (sysfs_slab_add(s))
-		goto panic;
-	return s;
+	if (!sysfs_slab_add(s))
+		return;

 panic:
 	panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
 }

-#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
-
-static void sysfs_add_func(struct work_struct *w)
-{
-	struct kmem_cache *s;
-
-	down_write(&slub_lock);
-	list_for_each_entry(s, &slab_caches, list) {
-		if (s->flags & __SYSFS_ADD_DEFERRED) {
-			s->flags &= ~__SYSFS_ADD_DEFERRED;
-			sysfs_slab_add(s);
-		}
-	}
-	up_write(&slub_lock);
-}
-
-static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
-
-static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
-{
-	struct kmem_cache *s;
-	char *text;
-	size_t realsize;
-	unsigned long slabflags;
-	int i;
-
-	s = kmalloc_caches_dma[index];
-	if (s)
-		return s;
-
-	/* Dynamically create dma cache */
-	if (flags & __GFP_WAIT)
-		down_write(&slub_lock);
-	else {
-		if (!down_write_trylock(&slub_lock))
-			goto out;
-	}
-
-	if (kmalloc_caches_dma[index])
-		goto unlock_out;
-
-	realsize = kmalloc_caches[index].objsize;
-	text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
-			 (unsigned int)realsize);
-
-	s = NULL;
-	for (i = 0; i < KMALLOC_CACHES; i++)
-		if (!kmalloc_caches[i].size)
-			break;
-
-	BUG_ON(i >= KMALLOC_CACHES);
-	s = kmalloc_caches + i;
-
-	/*
-	 * Must defer sysfs creation to a workqueue because we don't know
-	 * what context we are called from. Before sysfs comes up, we don't
-	 * need to do anything because our sysfs initcall will start by
-	 * adding all existing slabs to sysfs.
-	 */
-	slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
-	if (slab_state >= SYSFS)
-		slabflags |= __SYSFS_ADD_DEFERRED;
-
-	if (!text || !kmem_cache_open(s, flags, text,
-			realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
-		s->size = 0;
-		kfree(text);
-		goto unlock_out;
-	}
-
-	list_add(&s->list, &slab_caches);
-	kmalloc_caches_dma[index] = s;
-
-	if (slab_state >= SYSFS)
-		schedule_work(&sysfs_add_work);
-
-unlock_out:
-	up_write(&slub_lock);
-out:
-	return kmalloc_caches_dma[index];
-}
-#endif
-
 /*
  * Conversion table for small slabs sizes / 8 to the index in the
  * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -2715,7 +2629,7 @@ static struct kmem_cache *get_slab(size_

 #ifdef CONFIG_ZONE_DMA
 	if (unlikely((flags & SLUB_DMA)))
-		return dma_kmalloc_cache(index, flags);
+		return &kmalloc_dma_caches[index];

 #endif
 	return &kmalloc_caches[index];
@@ -3053,7 +2967,7 @@ void __init kmem_cache_init(void)
 	 * kmem_cache_open for slab_state == DOWN.
 	 */
 	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
-		sizeof(struct kmem_cache_node), GFP_NOWAIT);
+		sizeof(struct kmem_cache_node), 0);
 	kmalloc_caches[0].refcount = -1;
 	caches++;

@@ -3066,18 +2980,18 @@ void __init kmem_cache_init(void)
 	/* Caches that are not of the two-to-the-power-of size */
 	if (KMALLOC_MIN_SIZE <= 32) {
 		create_kmalloc_cache(&kmalloc_caches[1],
-				"kmalloc-96", 96, GFP_NOWAIT);
+				"kmalloc-96", 96, 0);
 		caches++;
 	}
 	if (KMALLOC_MIN_SIZE <= 64) {
 		create_kmalloc_cache(&kmalloc_caches[2],
-				"kmalloc-192", 192, GFP_NOWAIT);
+				"kmalloc-192", 192, 0);
 		caches++;
 	}

 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
-			"kmalloc", 1 << i, GFP_NOWAIT);
+			"kmalloc", 1 << i, 0);
 		caches++;
 	}

@@ -3124,7 +3038,7 @@ void __init kmem_cache_init(void)

 	/* Provide the correct kmalloc names now that the caches are up */
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
-		kmalloc_caches[i]. name =
+		kmalloc_caches[i].name =
 			kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);

 #ifdef CONFIG_SMP
@@ -3147,6 +3061,21 @@ void __init kmem_cache_init(void)

 void __init kmem_cache_init_late(void)
 {
+#ifdef CONFIG_ZONE_DMA
+	int i;
+
+	for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
+		struct kmem_cache *s = &kmalloc_caches[i];
+
+		if (s && s->size) {
+			char *name = kasprintf(GFP_KERNEL,
+				 "dma-kmalloc-%d", s->objsize);
+
+			create_kmalloc_cache(&kmalloc_dma_caches[i],
+				name, s->objsize, SLAB_CACHE_DMA);
+		}
+	}
+#endif
 }

 /*
@@ -3241,7 +3170,7 @@ struct kmem_cache *kmem_cache_create(con

 	s = kmalloc(kmem_size, GFP_KERNEL);
 	if (s) {
-		if (kmem_cache_open(s, GFP_KERNEL, name,
+		if (kmem_cache_open(s, name,
 				size, align, flags, ctor)) {
 			list_add(&s->list, &slab_caches);
 			up_write(&slub_lock);


* [RFC] slub: Simplify boot kmem_cache_cpu allocations
  2010-06-15 19:07 slub: remove dynamic dma slab allocation Christoph Lameter
@ 2010-06-15 19:11 ` Christoph Lameter
  2010-06-16  8:53   ` Tejun Heo
  2010-06-18 22:30 ` slub: remove dynamic dma slab allocation David Rientjes
  1 sibling, 1 reply; 21+ messages in thread
From: Christoph Lameter @ 2010-06-15 19:11 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: David Rientjes, linux-mm, Tejun Heo

Maybe this one can also be applied after the other patch?

Tejun: Is it somehow possible to reliably use the alloc_percpu() on all
platforms during early boot before the slab allocator is up?



Subject: slub: Simplify boot kmem_cache_cpu allocations

There is no longer a need for a large number of kmem_cache_cpu
structures during bootup since the DMA slabs are allocated late
during boot. The tight 1:1 association with the indexing
of the kmalloc array is also no longer necessary.

Simply take SLUB_PAGE_SHIFT entries from static memory.

Many arches could avoid static kmem_cache_cpu allocations
entirely since they have the ability to do alloc_percpu() early
in boot. But at least i386 needs this for now since an alloc_percpu()
triggers a call to kmalloc.

Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 mm/slub.c |   20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2010-06-03 13:48:41.000000000 -0500
+++ linux-2.6/mm/slub.c	2010-06-03 14:24:30.000000000 -0500
@@ -2068,23 +2068,25 @@ init_kmem_cache_node(struct kmem_cache_n
 #endif
 }

-static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
+/*
+ * Some arches need some assist during bootup since the percpu allocator is
+ * not available early during boot.
+ */
+static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[SLUB_PAGE_SHIFT]);
+static struct kmem_cache_cpu *boot_kmem_cache_cpu = &kmalloc_percpu[0];

 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
 {
-	if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
+	if (boot_kmem_cache_cpu - kmalloc_percpu < SLUB_PAGE_SHIFT)
 		/*
 		 * Boot time creation of the kmalloc array. Use static per cpu data
-		 * since the per cpu allocator is not available yet.
+		 * since the per cpu allocator may not be available yet.
 		 */
-		s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
+		s->cpu_slab = boot_kmem_cache_cpu++;
 	else
-		s->cpu_slab =  alloc_percpu(struct kmem_cache_cpu);
+		s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);

-	if (!s->cpu_slab)
-		return 0;
-
-	return 1;
+	return s->cpu_slab != NULL;
 }

 #ifdef CONFIG_NUMA


* Re: [RFC] slub: Simplify boot kmem_cache_cpu allocations
  2010-06-15 19:11 ` [RFC] slub: Simplify boot kmem_cache_cpu allocations Christoph Lameter
@ 2010-06-16  8:53   ` Tejun Heo
  2010-06-16 16:33     ` Christoph Lameter
  0 siblings, 1 reply; 21+ messages in thread
From: Tejun Heo @ 2010-06-16  8:53 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Pekka Enberg, David Rientjes, linux-mm

Hello, Christoph.

On 06/15/2010 09:11 PM, Christoph Lameter wrote:
> Maybe this one can also be applied after the other patch?
> 
> Tejun: Is it somehow possible to reliably use the alloc_percpu() on all
> platforms during early boot before the slab allocator is up?

Hmmm... first chunk allocation is done using bootmem, so if we give it
enough room (for both the chunk itself and the alloc map) so that it can
serve till slab comes up, it should work fine.  I think what's
important here is making up our minds and deciding on how to order them.
If the order is well defined, things can be made to work one way or
the other.  What happened to the get-rid-of-bootmem effort?  Wouldn't
that also interact with this?

Thanks.

-- 
tejun


* Re: [RFC] slub: Simplify boot kmem_cache_cpu allocations
  2010-06-16  8:53   ` Tejun Heo
@ 2010-06-16 16:33     ` Christoph Lameter
  2010-06-16 17:18       ` Tejun Heo
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Lameter @ 2010-06-16 16:33 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Pekka Enberg, David Rientjes, linux-mm

On Wed, 16 Jun 2010, Tejun Heo wrote:

> > Tejun: Is it somehow possible to reliably use the alloc_percpu() on all
> > platforms during early boot before the slab allocator is up?
>
> Hmmm... first chunk allocation is done using bootmem, so if we give it
> enough to room (for both chunk itself and alloc map) so that it can
> serve till slab comes up, it should work fine.  I think what's
> important here is making up our minds and decide on how to order them.
> If the order is well defined, things can be made to work one way or
> the other.  What happened to the get-rid-of-bootmem effort?  Wouldn't
> that also interact with this?

Ok how do we make sure that the first chunk has enough room?

Slab bootstrap occurs after the page allocator has taken over from
bootmem and after the per cpu areas have been initialized.



* Re: [RFC] slub: Simplify boot kmem_cache_cpu allocations
  2010-06-16 16:33     ` Christoph Lameter
@ 2010-06-16 17:18       ` Tejun Heo
  2010-06-16 17:35         ` Christoph Lameter
  0 siblings, 1 reply; 21+ messages in thread
From: Tejun Heo @ 2010-06-16 17:18 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Pekka Enberg, David Rientjes, linux-mm

Hello,

On 06/16/2010 06:33 PM, Christoph Lameter wrote:
> On Wed, 16 Jun 2010, Tejun Heo wrote:
>>> Tejun: Is it somehow possible to reliably use the alloc_percpu() on all
>>> platforms during early boot before the slab allocator is up?
>>
>> Hmmm... first chunk allocation is done using bootmem, so if we give it
>> enough to room (for both chunk itself and alloc map) so that it can
>> serve till slab comes up, it should work fine.  I think what's
>> important here is making up our minds and decide on how to order them.
>> If the order is well defined, things can be made to work one way or
>> the other.  What happened to the get-rid-of-bootmem effort?  Wouldn't
>> that also interact with this?
> 
> Ok how do we make sure that the first chunk has enough room?

It's primarily controlled by PERCPU_DYNAMIC_RESERVE.  I don't think
there will be any systematic way to do it other than sizing it
sufficiently.  Can you calculate the upper bound?  The constant has
been used primarily for optimization so how it's used needs to be
audited if we wanna guarantee free space in the first chunk but I
don't think it would be too difficult.

Thanks.

-- 
tejun


* Re: [RFC] slub: Simplify boot kmem_cache_cpu allocations
  2010-06-16 17:18       ` Tejun Heo
@ 2010-06-16 17:35         ` Christoph Lameter
  2010-06-17  8:49           ` Tejun Heo
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Lameter @ 2010-06-16 17:35 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Pekka Enberg, David Rientjes, linux-mm

On Wed, 16 Jun 2010, Tejun Heo wrote:

> Hello,
>
> On 06/16/2010 06:33 PM, Christoph Lameter wrote:
> > On Wed, 16 Jun 2010, Tejun Heo wrote:
> >>> Tejun: Is it somehow possible to reliably use the alloc_percpu() on all
> >>> platforms during early boot before the slab allocator is up?
> >>
> >> Hmmm... first chunk allocation is done using bootmem, so if we give it
> >> enough to room (for both chunk itself and alloc map) so that it can
> >> serve till slab comes up, it should work fine.  I think what's
> >> important here is making up our minds and decide on how to order them.
> >> If the order is well defined, things can be made to work one way or
> >> the other.  What happened to the get-rid-of-bootmem effort?  Wouldn't
> >> that also interact with this?
> >
> > Ok how do we make sure that the first chunk has enough room?
>
> It's primarily controlled by PERCPU_DYNAMIC_RESERVE.  I don't think
> there will be any systematic way to do it other than sizing it
> sufficiently.  Can you calculate the upper bound?  The constant has
> been used primarily for optimization so how it's used needs to be
> audited if we wanna guarantee free space in the first chunk but I
> don't think it would be too difficult.

The upper bound is SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu).

That's usually 14 * 104 bytes = 1456 bytes. This may increase to more
than 8k given the future plans to add queues into kmem_cache_cpu.




* Re: [RFC] slub: Simplify boot kmem_cache_cpu allocations
  2010-06-16 17:35         ` Christoph Lameter
@ 2010-06-17  8:49           ` Tejun Heo
  2010-06-17  9:01             ` Pekka Enberg
  2010-06-17 13:43             ` Christoph Lameter
  0 siblings, 2 replies; 21+ messages in thread
From: Tejun Heo @ 2010-06-17  8:49 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Pekka Enberg, David Rientjes, linux-mm

Hello,

On 06/16/2010 07:35 PM, Christoph Lameter wrote:
>> It's primarily controlled by PERCPU_DYNAMIC_RESERVE.  I don't think
>> there will be any systematic way to do it other than sizing it
>> sufficiently.  Can you calculate the upper bound?  The constant has
>> been used primarily for optimization so how it's used needs to be
>> audited if we wanna guarantee free space in the first chunk but I
>> don't think it would be too difficult.
> 
> The upper bound is SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu).
> 
> Thats usually 14 * 104 bytes = 1456 bytes. This may increase to more
> than 8k given the future plans to add queues into kmem_cache_cpu.

Alright, will work on that.  Does the slab allocator guarantee to return
NULL if called before it is initialized, or is that undefined?  If the
latter, is there a way to determine whether slab is initialized yet?

Thanks.

-- 
tejun


* Re: [RFC] slub: Simplify boot kmem_cache_cpu allocations
  2010-06-17  8:49           ` Tejun Heo
@ 2010-06-17  9:01             ` Pekka Enberg
  2010-06-17 13:43             ` Christoph Lameter
  1 sibling, 0 replies; 21+ messages in thread
From: Pekka Enberg @ 2010-06-17  9:01 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Christoph Lameter, David Rientjes, linux-mm

On 6/17/10 11:49 AM, Tejun Heo wrote:
> Hello,
>
> On 06/16/2010 07:35 PM, Christoph Lameter wrote:
>>> It's primarily controlled by PERCPU_DYNAMIC_RESERVE.  I don't think
>>> there will be any systematic way to do it other than sizing it
>>> sufficiently.  Can you calculate the upper bound?  The constant has
>>> been used primarily for optimization so how it's used needs to be
>>> audited if we wanna guarantee free space in the first chunk but I
>>> don't think it would be too difficult.
>>
>> The upper bound is SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu).
>>
>> Thats usually 14 * 104 bytes = 1456 bytes. This may increase to more
>> than 8k given the future plans to add queues into kmem_cache_cpu.
>
> Alright, will work on that.  Does slab allocator guarantee to return
> NULL if called before initialized or is it undefined?  If latter, is
> there a way to determine whether slab is initialized yet?

It's undefined and you can use slab_is_available() to check if it's 
available or not.
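
As a rough sketch of that pattern (not code from any patch in this thread;
the simplified body is an assumption), a percpu-side guard could look like:

	static void *pcpu_mem_alloc(size_t size)
	{
		/* Bail out until the slab allocator is usable. */
		if (!slab_is_available())
			return NULL;

		return kzalloc(size, GFP_KERNEL);
	}

Patch 2/2 later in the thread adds essentially this check, via
WARN_ON_ONCE(), to the real pcpu_mem_alloc().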


* Re: [RFC] slub: Simplify boot kmem_cache_cpu allocations
  2010-06-17  8:49           ` Tejun Heo
  2010-06-17  9:01             ` Pekka Enberg
@ 2010-06-17 13:43             ` Christoph Lameter
  2010-06-18 16:58               ` [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions Tejun Heo
  2010-06-18 16:58               ` [PATCH 2/2] percpu: allow limited allocation before slab is online Tejun Heo
  1 sibling, 2 replies; 21+ messages in thread
From: Christoph Lameter @ 2010-06-17 13:43 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Pekka Enberg, David Rientjes, linux-mm

On Thu, 17 Jun 2010, Tejun Heo wrote:

> On 06/16/2010 07:35 PM, Christoph Lameter wrote:
> >> It's primarily controlled by PERCPU_DYNAMIC_RESERVE.  I don't think
> >> there will be any systematic way to do it other than sizing it
> >> sufficiently.  Can you calculate the upper bound?  The constant has
> >> been used primarily for optimization so how it's used needs to be
> >> audited if we wanna guarantee free space in the first chunk but I
> >> don't think it would be too difficult.
> >
> > The upper bound is SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu).
> >
> > Thats usually 14 * 104 bytes = 1456 bytes. This may increase to more
> > than 8k given the future plans to add queues into kmem_cache_cpu.
>
> Alright, will work on that.  Does slab allocator guarantee to return
> NULL if called before initialized or is it undefined?  If latter, is
> there a way to determine whether slab is initialized yet?

Behavior is undefined. slab_is_available() determines if slab is up yet.


* [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions
  2010-06-17 13:43             ` Christoph Lameter
@ 2010-06-18 16:58               ` Tejun Heo
  2010-06-18 17:29                 ` Christoph Lameter
  2010-06-18 17:31                 ` Christoph Lameter
  2010-06-18 16:58               ` [PATCH 2/2] percpu: allow limited allocation before slab is online Tejun Heo
  1 sibling, 2 replies; 21+ messages in thread
From: Tejun Heo @ 2010-06-18 16:58 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Pekka Enberg, David Rientjes, linux-mm

In pcpu_alloc_info() and pcpu_embed_first_chunk(), @dyn_size was
ssize_t, -1 meant auto-size, 0 forced 0 and positive meant minimum
size.  There's no use case for forcing 0 and the upcoming early alloc
support always requires non-zero dynamic size.  Make @dyn_size always
mean minimum dyn_size.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
These two patches are on top of percpu#for-linus.  The branch is
available at...

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu.git early-alloc

It would be a good idea to add BUILD_BUG_ON() in slab to verify
allocation limits against PERCPU_DYNAMIC_EARLY_SIZE/SLOTS.  Please
note that two alloc slots might be necessary for each allocation and
there can be gaps in allocation due to alignment, so giving it some
headroom would be better.

Please let me know if it's okay for slab.  I'll push it through
percpu#for-next then.

Thanks.

 include/linux/percpu.h |    4 ++--
 mm/percpu.c            |   33 +++++++++------------------------
 2 files changed, 11 insertions(+), 26 deletions(-)

Index: work/include/linux/percpu.h
===================================================================
--- work.orig/include/linux/percpu.h
+++ work/include/linux/percpu.h
@@ -105,7 +105,7 @@ extern struct pcpu_alloc_info * __init p
 extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai);

 extern struct pcpu_alloc_info * __init pcpu_build_alloc_info(
-				size_t reserved_size, ssize_t dyn_size,
+				size_t reserved_size, size_t dyn_size,
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);

@@ -113,7 +113,7 @@ extern int __init pcpu_setup_first_chunk
 					 void *base_addr);

 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
-extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
+extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
 				pcpu_fc_alloc_fn_t alloc_fn,
Index: work/mm/percpu.c
===================================================================
--- work.orig/mm/percpu.c
+++ work/mm/percpu.c
@@ -1013,20 +1013,6 @@ phys_addr_t per_cpu_ptr_to_phys(void *ad
 		return page_to_phys(pcpu_addr_to_page(addr));
 }

-static inline size_t pcpu_calc_fc_sizes(size_t static_size,
-					size_t reserved_size,
-					ssize_t *dyn_sizep)
-{
-	size_t size_sum;
-
-	size_sum = PFN_ALIGN(static_size + reserved_size +
-			     (*dyn_sizep >= 0 ? *dyn_sizep : 0));
-	if (*dyn_sizep != 0)
-		*dyn_sizep = size_sum - static_size - reserved_size;
-
-	return size_sum;
-}
-
 /**
  * pcpu_alloc_alloc_info - allocate percpu allocation info
  * @nr_groups: the number of groups
@@ -1085,7 +1071,7 @@ void __init pcpu_free_alloc_info(struct
 /**
  * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
  * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: free size for dynamic allocation in bytes
  * @atom_size: allocation atom size
  * @cpu_distance_fn: callback to determine distance between cpus, optional
  *
@@ -1104,7 +1090,7 @@ void __init pcpu_free_alloc_info(struct
  * failure, ERR_PTR value is returned.
  */
 struct pcpu_alloc_info * __init pcpu_build_alloc_info(
-				size_t reserved_size, ssize_t dyn_size,
+				size_t reserved_size, size_t dyn_size,
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
 {
@@ -1123,13 +1109,15 @@ struct pcpu_alloc_info * __init pcpu_bui
 	memset(group_map, 0, sizeof(group_map));
 	memset(group_cnt, 0, sizeof(group_cnt));

+	size_sum = PFN_ALIGN(static_size + reserved_size + dyn_size);
+	dyn_size = size_sum - static_size - reserved_size;
+
 	/*
 	 * Determine min_unit_size, alloc_size and max_upa such that
 	 * alloc_size is multiple of atom_size and is the smallest
 	 * which can accomodate 4k aligned segments which are equal to
 	 * or larger than min_unit_size.
 	 */
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

 	alloc_size = roundup(min_unit_size, atom_size);
@@ -1532,7 +1520,7 @@ early_param("percpu_alloc", percpu_alloc
 /**
  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
  * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: minimum free size for dynamic allocation in bytes
  * @atom_size: allocation atom size
  * @cpu_distance_fn: callback to determine distance between cpus, optional
  * @alloc_fn: function to allocate percpu page
@@ -1553,10 +1541,7 @@ early_param("percpu_alloc", percpu_alloc
  * vmalloc space is not orders of magnitude larger than distances
  * between node memory addresses (ie. 32bit NUMA machines).
  *
- * When @dyn_size is positive, dynamic area might be larger than
- * specified to fill page alignment.  When @dyn_size is auto,
- * @dyn_size is just big enough to fill page alignment after static
- * and reserved areas.
+ * @dyn_size specifies the minimum dynamic area size.
  *
  * If the needed size is smaller than the minimum or specified unit
  * size, the leftover is returned using @free_fn.
@@ -1564,7 +1549,7 @@ early_param("percpu_alloc", percpu_alloc
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
+int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 				  size_t atom_size,
 				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
 				  pcpu_fc_alloc_fn_t alloc_fn,
@@ -1695,7 +1680,7 @@ int __init pcpu_page_first_chunk(size_t

 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

-	ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
+	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
 	if (IS_ERR(ai))
 		return PTR_ERR(ai);
 	BUG_ON(ai->nr_groups != 1);


* [PATCH 2/2] percpu: allow limited allocation before slab is online
  2010-06-17 13:43             ` Christoph Lameter
  2010-06-18 16:58               ` [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions Tejun Heo
@ 2010-06-18 16:58               ` Tejun Heo
  1 sibling, 0 replies; 21+ messages in thread
From: Tejun Heo @ 2010-06-18 16:58 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Pekka Enberg, David Rientjes, linux-mm

This patch updates the percpu allocator so that it can serve a limited
amount of allocation before slab comes online.  This is primarily to
allow slab to depend on a working percpu allocator.

Two parameters, PERCPU_DYNAMIC_EARLY_SIZE and SLOTS, determine how
much memory space and allocation map slots are reserved.  If this
reserved area is exhausted, WARN_ON_ONCE() will trigger and allocation
will fail till slab comes online.

The following changes are made to implement early alloc.

* pcpu_mem_alloc() now checks slab_is_available()

* Chunks are allocated using pcpu_mem_alloc()

* Init paths make sure ai->dyn_size is at least as large as
  PERCPU_DYNAMIC_EARLY_SIZE.

* Initial alloc maps are allocated in __initdata and copied to
  kmalloc'd areas once slab is online.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
---
 include/linux/percpu.h |   13 ++++++++++++
 init/main.c            |    1
 mm/percpu.c            |   52 +++++++++++++++++++++++++++++++++++++------------
 3 files changed, 54 insertions(+), 12 deletions(-)

Index: work/mm/percpu.c
===================================================================
--- work.orig/mm/percpu.c
+++ work/mm/percpu.c
@@ -282,6 +282,9 @@ static void __maybe_unused pcpu_next_pop
  */
 static void *pcpu_mem_alloc(size_t size)
 {
+	if (WARN_ON_ONCE(!slab_is_available()))
+		return NULL;
+
 	if (size <= PAGE_SIZE)
 		return kzalloc(size, GFP_KERNEL);
 	else {
@@ -392,13 +395,6 @@ static int pcpu_extend_area_map(struct p
 	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
 	memcpy(new, chunk->map, old_size);

-	/*
-	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
-	 * one of the first chunks and still using static map.
-	 */
-	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-		old = chunk->map;
-
 	chunk->map_alloc = new_alloc;
 	chunk->map = new;
 	new = NULL;
@@ -604,7 +600,7 @@ static struct pcpu_chunk *pcpu_alloc_chu
 {
 	struct pcpu_chunk *chunk;

-	chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
+	chunk = pcpu_mem_alloc(pcpu_chunk_struct_size);
 	if (!chunk)
 		return NULL;

@@ -1109,7 +1105,9 @@ struct pcpu_alloc_info * __init pcpu_bui
 	memset(group_map, 0, sizeof(group_map));
 	memset(group_cnt, 0, sizeof(group_cnt));

-	size_sum = PFN_ALIGN(static_size + reserved_size + dyn_size);
+	/* calculate size_sum and ensure dyn_size is enough for early alloc */
+	size_sum = PFN_ALIGN(static_size + reserved_size +
+			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
 	dyn_size = size_sum - static_size - reserved_size;

 	/*
@@ -1338,7 +1336,8 @@ int __init pcpu_setup_first_chunk(const
 				  void *base_addr)
 {
 	static char cpus_buf[4096] __initdata;
-	static int smap[2], dmap[2];
+	static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
+	static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
 	size_t dyn_size = ai->dyn_size;
 	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
 	struct pcpu_chunk *schunk, *dchunk = NULL;
@@ -1361,14 +1360,13 @@ int __init pcpu_setup_first_chunk(const
 } while (0)

 	/* sanity checks */
-	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
-		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
 	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
 	PCPU_SETUP_BUG_ON(!ai->static_size);
 	PCPU_SETUP_BUG_ON(!base_addr);
 	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
 	PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
 	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
 	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

 	/* process group information and build config tables accordingly */
@@ -1806,3 +1804,33 @@ void __init setup_per_cpu_areas(void)
 		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
 }
 #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+
+/*
+ * First and reserved chunks are initialized with temporary allocation
+ * map in initdata so that they can be used before slab is online.
+ * This function is called after slab is brought up and replaces those
+ * with properly allocated maps.
+ */
+void __init percpu_init_late(void)
+{
+	struct pcpu_chunk *target_chunks[] =
+		{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };
+	struct pcpu_chunk *chunk;
+	unsigned long flags;
+	int i;
+
+	for (i = 0; (chunk = target_chunks[i]); i++) {
+		int *map;
+		const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
+
+		BUILD_BUG_ON(size > PAGE_SIZE);
+
+		map = pcpu_mem_alloc(size);
+		BUG_ON(!map);
+
+		spin_lock_irqsave(&pcpu_lock, flags);
+		memcpy(map, chunk->map, size);
+		chunk->map = map;
+		spin_unlock_irqrestore(&pcpu_lock, flags);
+	}
+}
Index: work/init/main.c
===================================================================
--- work.orig/init/main.c
+++ work/init/main.c
@@ -522,6 +522,7 @@ static void __init mm_init(void)
 	page_cgroup_init_flatmem();
 	mem_init();
 	kmem_cache_init();
+	percpu_init_late();
 	pgtable_cache_init();
 	vmalloc_init();
 }
Index: work/include/linux/percpu.h
===================================================================
--- work.orig/include/linux/percpu.h
+++ work/include/linux/percpu.h
@@ -45,6 +45,16 @@
 #define PCPU_MIN_UNIT_SIZE		PFN_ALIGN(64 << 10)

 /*
+ * Percpu allocator can serve percpu allocations before slab is
+ * initialized which allows slab to depend on the percpu allocator.
+ * The following two parameters decide how much resource to
+ * preallocate for this.  Keep PERCPU_DYNAMIC_RESERVE equal to or
+ * larger than PERCPU_DYNAMIC_EARLY_SIZE.
+ */
+#define PERCPU_DYNAMIC_EARLY_SLOTS	128
+#define PERCPU_DYNAMIC_EARLY_SIZE	(12 << 10)
+
+/*
  * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
  * back on the first chunk for dynamic percpu allocation if arch is
  * manually allocating and mapping it for faster access (as a part of
@@ -140,6 +150,7 @@ extern bool is_kernel_percpu_address(uns
 #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 extern void __init setup_per_cpu_areas(void);
 #endif
+extern void __init percpu_init_late(void);

 #else /* CONFIG_SMP */

@@ -153,6 +164,8 @@ static inline bool is_kernel_percpu_addr

 static inline void __init setup_per_cpu_areas(void) { }

+static inline void __init percpu_init_late(void) { }
+
 static inline void *pcpu_lpage_remapped(void *kaddr)
 {
 	return NULL;


* Re: [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions
  2010-06-18 16:58               ` [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions Tejun Heo
@ 2010-06-18 17:29                 ` Christoph Lameter
  2010-06-18 17:31                 ` Christoph Lameter
  1 sibling, 0 replies; 21+ messages in thread
From: Christoph Lameter @ 2010-06-18 17:29 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Pekka Enberg, David Rientjes, linux-mm

Ok, this worked when testing on x86 32-bit SMP with no NUMA (after removal of
the bootstrap percpu areas in slub). The same config fails without
the patches (x86 64-bit NUMA works fine).

So if this is merged then we can drop the static percpu allocation and the
special boot handling.




* Re: [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions
  2010-06-18 16:58               ` [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions Tejun Heo
  2010-06-18 17:29                 ` Christoph Lameter
@ 2010-06-18 17:31                 ` Christoph Lameter
  2010-06-18 17:39                   ` Tejun Heo
  1 sibling, 1 reply; 21+ messages in thread
From: Christoph Lameter @ 2010-06-18 17:31 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Pekka Enberg, David Rientjes, linux-mm

On Fri, 18 Jun 2010, Tejun Heo wrote:

> It would be a good idea to add BUILD_BUG_ON() in slab to verify
> allocation limits against PERCPU_DYNAMIC_EARLY_SIZE/SLOTS.  Please
> note that two alloc slots might be necessary for each allocation and
> there can be gaps in allocation due to alignment, so giving it some
> headroom would be better.
>
> Please let me know if it's okay for slab.  I'll push it through
> percpu#for-next then.

We need SLUB_PAGE_SHIFT * sizeof(kmem_cache_cpu). So it would be

BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < SLUB_PAGE_SHIFT * sizeof(struct
kmem_cache_cpu))?

What is the role of SLOTS?

Each kmem_cache_cpu structure is a separate percpu allocation.


* Re: [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions
  2010-06-18 17:31                 ` Christoph Lameter
@ 2010-06-18 17:39                   ` Tejun Heo
  2010-06-18 18:03                     ` Christoph Lameter
  0 siblings, 1 reply; 21+ messages in thread
From: Tejun Heo @ 2010-06-18 17:39 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Pekka Enberg, David Rientjes, linux-mm

Hello,

On 06/18/2010 07:31 PM, Christoph Lameter wrote:
> We need SLUB_PAGE_SHIFT * sizeof(kmem_cache_cpu). So it would be
> 
> BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < SLUB_PAGE_SHIFT * sizeof(struct
> kmem_cache_cpu))?

Yeah, something like that but I would add some buffer there for
alignment and whatnot.

> What is the role of SLOTS?

It's the allocation map.  Each consecutive alloc consumes one slot if
alignment doesn't require padding but two if it does, i.e. it limits how
many items one can allocate.

> Each kmem_cache_cpu structure is a separate percpu allocation.

If it's a single item.  Nothing to worry about.

Thanks.

-- 
tejun


* Re: [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions
  2010-06-18 17:39                   ` Tejun Heo
@ 2010-06-18 18:03                     ` Christoph Lameter
  2010-06-19  8:23                       ` Tejun Heo
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Lameter @ 2010-06-18 18:03 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Pekka Enberg, David Rientjes, linux-mm

On Fri, 18 Jun 2010, Tejun Heo wrote:

> Hello,
>
> On 06/18/2010 07:31 PM, Christoph Lameter wrote:
> > We need SLUB_PAGE_SHIFT * sizeof(kmem_cache_cpu). So it would be
> >
> > BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < SLUB_PAGE_SHIFT * sizeof(struct
> > kmem_cache_cpu))?
>
> Yeah, something like that but I would add some buffer there for
> alignment and whatnot.

Only the percpu allocator would know the waste for alignment and
"whatnot". What would you like me to add to the above formula to make it
safe?

> > What is the role of SLOTS?
>
> It's allocation map.  Each consecutive allocs consume one if alignment
> doesn't require padding but two if it does.  ie. It limits how many
> items one can allocate.
>
> > Each kmem_cache_cpu structure is a separate percpu allocation.
>
> If it's a single item.  Nothing to worry about.

ok so

BUILD_BUG_ON(SLUB_PAGE_SHIFT * <fuzz-factor> > SLOTS);

I don't know what fuzz factor would be needed.

Maybe it's best to have a macro provided by percpu?

VERIFY_EARLY_ALLOCS(<nr-of-allocs>,<total-size-consumed>)

The macro would generate the proper BUILD_BUG_ON?
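
As an illustration only (this macro is hypothetical and not something percpu
provides), it could expand to the two limits with the factor-of-two slack
suggested later in the thread:

	/* hypothetical sketch, not an existing percpu API */
	#define VERIFY_EARLY_ALLOCS(nr_allocs, total_size)			\
	do {									\
		BUILD_BUG_ON(2 * (nr_allocs) > PERCPU_DYNAMIC_EARLY_SLOTS);	\
		BUILD_BUG_ON(2 * (total_size) > PERCPU_DYNAMIC_EARLY_SIZE);	\
	} while (0)

Slub could then call VERIFY_EARLY_ALLOCS(SLUB_PAGE_SHIFT,
SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)) from kmem_cache_init().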


* Re: slub: remove dynamic dma slab allocation
  2010-06-15 19:07 slub: remove dynamic dma slab allocation Christoph Lameter
  2010-06-15 19:11 ` [RFC] slub: Simplify boot kmem_cache_cpu allocations Christoph Lameter
@ 2010-06-18 22:30 ` David Rientjes
  2010-06-21 14:25   ` Christoph Lameter
  1 sibling, 1 reply; 21+ messages in thread
From: David Rientjes @ 2010-06-18 22:30 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Pekka Enberg, linux-mm

On Tue, 15 Jun 2010, Christoph Lameter wrote:

> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c	2010-06-15 12:40:58.000000000 -0500
> +++ linux-2.6/mm/slub.c	2010-06-15 12:41:36.000000000 -0500
> @@ -2070,7 +2070,7 @@ init_kmem_cache_node(struct kmem_cache_n
> 
>  static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
> 
> -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
> +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
>  {
>  	if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)

Looks like it'll conflict with "SLUB: is_kmalloc_cache" in slub/cleanups.

>  		/*
> @@ -2097,7 +2097,7 @@ static inline int alloc_kmem_cache_cpus(
>   * when allocating for the kmalloc_node_cache. This is used for bootstrapping
>   * memory on a fresh node that has no slab structures yet.
>   */
> -static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
> +static void early_kmem_cache_node_alloc(int node)
>  {
>  	struct page *page;
>  	struct kmem_cache_node *n;
> @@ -2105,7 +2105,7 @@ static void early_kmem_cache_node_alloc(
> 
>  	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
> 
> -	page = new_slab(kmalloc_caches, gfpflags, node);
> +	page = new_slab(kmalloc_caches, GFP_KERNEL, node);
> 
>  	BUG_ON(!page);
>  	if (page_to_nid(page) != node) {

Hmm, not sure of this.  We can't do GFP_KERNEL allocations in 
kmem_cache_init(), they must be deferred to kmem_cache_init_late().  So 
this will be allocating the kmem_cache_node cache while slab_state is 
still DOWN and yet passing GFP_KERNEL via early_kmem_cache_node_alloc().

I think this has to be GFP_NOWAIT instead.
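
Concretely, the suggestion amounts to something like this in the patched
early_kmem_cache_node_alloc() (a sketch of the proposed fix, not a tested
change):

	page = new_slab(kmalloc_caches, GFP_NOWAIT, node);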

> @@ -2149,7 +2149,7 @@ static void free_kmem_cache_nodes(struct
>  	}
>  }
> 
> -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
> +static int init_kmem_cache_nodes(struct kmem_cache *s)
>  {
>  	int node;
> 
> @@ -2157,11 +2157,11 @@ static int init_kmem_cache_nodes(struct
>  		struct kmem_cache_node *n;
> 
>  		if (slab_state == DOWN) {
> -			early_kmem_cache_node_alloc(gfpflags, node);
> +			early_kmem_cache_node_alloc(node);
>  			continue;
>  		}
>  		n = kmem_cache_alloc_node(kmalloc_caches,
> -						gfpflags, node);
> +						GFP_KERNEL, node);
> 
>  		if (!n) {
>  			free_kmem_cache_nodes(s);

Same here, this can still lead to GFP_KERNEL allocations from 
kmem_cache_init() because slab_state is PARTIAL or UP.

> @@ -2178,7 +2178,7 @@ static void free_kmem_cache_nodes(struct
>  {
>  }
> 
> -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
> +static int init_kmem_cache_nodes(struct kmem_cache *s)
>  {
>  	init_kmem_cache_node(&s->local_node, s);
>  	return 1;
> @@ -2318,7 +2318,7 @@ static int calculate_sizes(struct kmem_c
> 
>  }
> 
> -static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
> +static int kmem_cache_open(struct kmem_cache *s,
>  		const char *name, size_t size,
>  		size_t align, unsigned long flags,
>  		void (*ctor)(void *))
> @@ -2354,10 +2354,10 @@ static int kmem_cache_open(struct kmem_c
>  #ifdef CONFIG_NUMA
>  	s->remote_node_defrag_ratio = 1000;
>  #endif
> -	if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
> +	if (!init_kmem_cache_nodes(s))
>  		goto error;
> 
> -	if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
> +	if (alloc_kmem_cache_cpus(s))
>  		return 1;
> 
>  	free_kmem_cache_nodes(s);
> @@ -2517,6 +2517,10 @@ EXPORT_SYMBOL(kmem_cache_destroy);
>  struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
>  EXPORT_SYMBOL(kmalloc_caches);
> 
> +#ifdef CONFIG_ZONE_DMA
> +static struct kmem_cache kmalloc_dma_caches[SLUB_PAGE_SHIFT];
> +#endif
> +
>  static int __init setup_slub_min_order(char *str)
>  {
>  	get_option(&str, &slub_min_order);
> @@ -2553,116 +2557,26 @@ static int __init setup_slub_nomerge(cha
> 
>  __setup("slub_nomerge", setup_slub_nomerge);
> 
> -static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
> -		const char *name, int size, gfp_t gfp_flags)
> +static void create_kmalloc_cache(struct kmem_cache *s,
> +		const char *name, int size, unsigned int flags)
>  {
> -	unsigned int flags = 0;
> -
> -	if (gfp_flags & SLUB_DMA)
> -		flags = SLAB_CACHE_DMA;
> -
>  	/*
>  	 * This function is called with IRQs disabled during early-boot on
>  	 * single CPU so there's no need to take slub_lock here.
>  	 */
> -	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
> +	if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
>  								flags, NULL))
>  		goto panic;
> 
>  	list_add(&s->list, &slab_caches);
> 
> -	if (sysfs_slab_add(s))
> -		goto panic;
> -	return s;
> +	if (!sysfs_slab_add(s))
> +		return;
> 
>  panic:
>  	panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
>  }
> 
> -#ifdef CONFIG_ZONE_DMA
> -static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
> -
> -static void sysfs_add_func(struct work_struct *w)
> -{
> -	struct kmem_cache *s;
> -
> -	down_write(&slub_lock);
> -	list_for_each_entry(s, &slab_caches, list) {
> -		if (s->flags & __SYSFS_ADD_DEFERRED) {
> -			s->flags &= ~__SYSFS_ADD_DEFERRED;
> -			sysfs_slab_add(s);
> -		}
> -	}
> -	up_write(&slub_lock);
> -}
> -
> -static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
> -
> -static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
> -{
> -	struct kmem_cache *s;
> -	char *text;
> -	size_t realsize;
> -	unsigned long slabflags;
> -	int i;
> -
> -	s = kmalloc_caches_dma[index];
> -	if (s)
> -		return s;
> -
> -	/* Dynamically create dma cache */
> -	if (flags & __GFP_WAIT)
> -		down_write(&slub_lock);
> -	else {
> -		if (!down_write_trylock(&slub_lock))
> -			goto out;
> -	}
> -
> -	if (kmalloc_caches_dma[index])
> -		goto unlock_out;
> -
> -	realsize = kmalloc_caches[index].objsize;
> -	text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
> -			 (unsigned int)realsize);
> -
> -	s = NULL;
> -	for (i = 0; i < KMALLOC_CACHES; i++)
> -		if (!kmalloc_caches[i].size)
> -			break;
> -
> -	BUG_ON(i >= KMALLOC_CACHES);
> -	s = kmalloc_caches + i;
> -
> -	/*
> -	 * Must defer sysfs creation to a workqueue because we don't know
> -	 * what context we are called from. Before sysfs comes up, we don't
> -	 * need to do anything because our sysfs initcall will start by
> -	 * adding all existing slabs to sysfs.
> -	 */
> -	slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
> -	if (slab_state >= SYSFS)
> -		slabflags |= __SYSFS_ADD_DEFERRED;
> -
> -	if (!text || !kmem_cache_open(s, flags, text,
> -			realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
> -		s->size = 0;
> -		kfree(text);
> -		goto unlock_out;
> -	}
> -
> -	list_add(&s->list, &slab_caches);
> -	kmalloc_caches_dma[index] = s;
> -
> -	if (slab_state >= SYSFS)
> -		schedule_work(&sysfs_add_work);
> -
> -unlock_out:
> -	up_write(&slub_lock);
> -out:
> -	return kmalloc_caches_dma[index];
> -}
> -#endif
> -
>  /*
>   * Conversion table for small slabs sizes / 8 to the index in the
>   * kmalloc array. This is necessary for slabs < 192 since we have non power
> @@ -2715,7 +2629,7 @@ static struct kmem_cache *get_slab(size_
> 
>  #ifdef CONFIG_ZONE_DMA
>  	if (unlikely((flags & SLUB_DMA)))
> -		return dma_kmalloc_cache(index, flags);
> +		return &kmalloc_dma_caches[index];
> 
>  #endif
>  	return &kmalloc_caches[index];
> @@ -3053,7 +2967,7 @@ void __init kmem_cache_init(void)
>  	 * kmem_cache_open for slab_state == DOWN.
>  	 */
>  	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
> -		sizeof(struct kmem_cache_node), GFP_NOWAIT);
> +		sizeof(struct kmem_cache_node), 0);
>  	kmalloc_caches[0].refcount = -1;
>  	caches++;
> 
> @@ -3066,18 +2980,18 @@ void __init kmem_cache_init(void)
>  	/* Caches that are not of the two-to-the-power-of size */
>  	if (KMALLOC_MIN_SIZE <= 32) {
>  		create_kmalloc_cache(&kmalloc_caches[1],
> -				"kmalloc-96", 96, GFP_NOWAIT);
> +				"kmalloc-96", 96, 0);
>  		caches++;
>  	}
>  	if (KMALLOC_MIN_SIZE <= 64) {
>  		create_kmalloc_cache(&kmalloc_caches[2],
> -				"kmalloc-192", 192, GFP_NOWAIT);
> +				"kmalloc-192", 192, 0);
>  		caches++;
>  	}
> 
>  	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
>  		create_kmalloc_cache(&kmalloc_caches[i],
> -			"kmalloc", 1 << i, GFP_NOWAIT);
> +			"kmalloc", 1 << i, 0);
>  		caches++;
>  	}
> 
> @@ -3124,7 +3038,7 @@ void __init kmem_cache_init(void)
> 
>  	/* Provide the correct kmalloc names now that the caches are up */
>  	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
> -		kmalloc_caches[i]. name =
> +		kmalloc_caches[i].name =
>  			kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
> 
>  #ifdef CONFIG_SMP
> @@ -3147,6 +3061,21 @@ void __init kmem_cache_init(void)
> 
>  void __init kmem_cache_init_late(void)
>  {
> +#ifdef CONFIG_ZONE_DMA
> +	int i;
> +
> +	for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
> +		struct kmem_cache *s = &kmalloc_caches[i];
> +
> +		if (s && s->size) {
> +			char *name = kasprintf(GFP_KERNEL,
> +				 "dma-kmalloc-%d", s->objsize);

kasprintf() can return NULL which isn't caught by kmem_cache_open().

> +
> +			create_kmalloc_cache(&kmalloc_dma_caches[i],
> +				name, s->objsize, SLAB_CACHE_DMA);
> +		}
> +	}
> +#endif
>  }
> 
>  /*
> @@ -3241,7 +3170,7 @@ struct kmem_cache *kmem_cache_create(con
> 
>  	s = kmalloc(kmem_size, GFP_KERNEL);
>  	if (s) {
> -		if (kmem_cache_open(s, GFP_KERNEL, name,
> +		if (kmem_cache_open(s, name,
>  				size, align, flags, ctor)) {
>  			list_add(&s->list, &slab_caches);
>  			up_write(&slub_lock);
> 


* Re: [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions
  2010-06-18 18:03                     ` Christoph Lameter
@ 2010-06-19  8:23                       ` Tejun Heo
  0 siblings, 0 replies; 21+ messages in thread
From: Tejun Heo @ 2010-06-19  8:23 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Pekka Enberg, David Rientjes, linux-mm

Hello,

On 06/18/2010 08:03 PM, Christoph Lameter wrote:
>> Yeah, something like that but I would add some buffer there for
>> alignment and whatnot.
> 
> Only the percpu allocator would know the waste for alignment and
> "whatnot". What would you like me to add to the above formula to make it
> safe?

I'm not sure, some sensible slack.  :-)

>>> What is the role of SLOTS?
>>
>> It's allocation map.  Each consecutive allocs consume one if alignment
>> doesn't require padding but two if it does.  ie. It limits how many
>> items one can allocate.
>>
>>> Each kmem_cache_cpu structure is a separate percpu allocation.
>>
>> If it's a single item.  Nothing to worry about.
> 
> ok so
> 
> BUILD_BUG_ON(SLUB_PAGE_SHIFT * <fuzz-factor> > SLOTS);
> 
> I dont know what fuzz factor would be needed.
> 
> Maybe its best to have a macro provided by percpu?
> 
> VERIFY_EARLY_ALLOCS(<nr-of-allocs>,<total-size-consumed>)
> 
> The macro would generate the proper BUILD_BUG_ON?

The problem is that the alignment of each item and their allocation order
also matter.  Even the percpu allocator itself can't tell for sure
before actually allocating.  As it's going to be used only by the slab
allocator at least for now, and those preallocated areas aren't wasted
anyway, just giving it enough room should work well enough, I think.  Say,
multiply everything by two.

Thanks.

-- 
tejun


* Re: slub: remove dynamic dma slab allocation
  2010-06-18 22:30 ` slub: remove dynamic dma slab allocation David Rientjes
@ 2010-06-21 14:25   ` Christoph Lameter
  2010-06-21 19:56     ` David Rientjes
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Lameter @ 2010-06-21 14:25 UTC (permalink / raw)
  To: David Rientjes; +Cc: Pekka Enberg, linux-mm

On Fri, 18 Jun 2010, David Rientjes wrote:

> On Tue, 15 Jun 2010, Christoph Lameter wrote:
>
> > Index: linux-2.6/mm/slub.c
> > ===================================================================
> > --- linux-2.6.orig/mm/slub.c	2010-06-15 12:40:58.000000000 -0500
> > +++ linux-2.6/mm/slub.c	2010-06-15 12:41:36.000000000 -0500
> > @@ -2070,7 +2070,7 @@ init_kmem_cache_node(struct kmem_cache_n
> >
> >  static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
> >
> > -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
> > +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
> >  {
> >  	if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
>
> Looks like it'll conflict with "SLUB: is_kmalloc_cache" in slub/cleanups.

Yes, I thought we dropped those.

> > @@ -2105,7 +2105,7 @@ static void early_kmem_cache_node_alloc(
> >
> >  	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
> >
> > -	page = new_slab(kmalloc_caches, gfpflags, node);
> > +	page = new_slab(kmalloc_caches, GFP_KERNEL, node);
> >
> >  	BUG_ON(!page);
> >  	if (page_to_nid(page) != node) {
>
> Hmm, not sure of this.  We can't do GFP_KERNEL allocations in
> kmem_cache_init(), they must be deferred to kmem_cache_init_late().  So
> this will be allocating the kmem_cache_node cache while slab_state is
> still DOWN and yet passing GFP_KERNEL via early_kmem_cache_node_alloc().
>
> I think this has to be GFP_NOWAIT instead.

Ok we could use GFP_NOWAIT in this case.

> >  		if (slab_state == DOWN) {
> > -			early_kmem_cache_node_alloc(gfpflags, node);
> > +			early_kmem_cache_node_alloc(node);
> >  			continue;
> >  		}
> >  		n = kmem_cache_alloc_node(kmalloc_caches,
> > -						gfpflags, node);
> > +						GFP_KERNEL, node);
> >
> >  		if (!n) {
> >  			free_kmem_cache_nodes(s);
>
> Same here, this can still lead to GFP_KERNEL allocations from
> kmem_cache_init() because slab_state is PARTIAL or UP.

You cannot do that here because this function is also used later when the
slab is up. There is more in the percpu allocator which we are also trying
to use to avoid having static kmem_cache_cpu declarations. GFP_KERNEL
needs to be usable during early boot otherwise functions will have to add
special casing for boot situations.

> > +	for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
> > +		struct kmem_cache *s = &kmalloc_caches[i];
> > +
> > +		if (s && s->size) {
> > +			char *name = kasprintf(GFP_KERNEL,
> > +				 "dma-kmalloc-%d", s->objsize);
>
> kasprintf() can return NULL which isn't caught by kmem_cache_open().

Then we will have a nameless cache. We could catch this with a WARN_ON(),
but does that work that early?


* Re: slub: remove dynamic dma slab allocation
  2010-06-21 14:25   ` Christoph Lameter
@ 2010-06-21 19:56     ` David Rientjes
  2010-06-21 20:32       ` Christoph Lameter
  0 siblings, 1 reply; 21+ messages in thread
From: David Rientjes @ 2010-06-21 19:56 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Pekka Enberg, linux-mm

On Mon, 21 Jun 2010, Christoph Lameter wrote:

> > >  		if (slab_state == DOWN) {
> > > -			early_kmem_cache_node_alloc(gfpflags, node);
> > > +			early_kmem_cache_node_alloc(node);
> > >  			continue;
> > >  		}
> > >  		n = kmem_cache_alloc_node(kmalloc_caches,
> > > -						gfpflags, node);
> > > +						GFP_KERNEL, node);
> > >
> > >  		if (!n) {
> > >  			free_kmem_cache_nodes(s);
> >
> > Same here, this can still lead to GFP_KERNEL allocations from
> > kmem_cache_init() because slab_state is PARTIAL or UP.
> 
> You cannot do that here because this function is also used later when the
> slab is up. There is more in the percpu allocator which we are also trying
> to use to avoid having static kmem_cache_cpu declarations. GFP_KERNEL
> needs to be usable during early boot otherwise functions will have to add
> special casing for boot situations.
> 

The gfp_allowed_mask only changes once irqs are enabled, so either the 
gfpflags need to be passed into init_kmem_cache_nodes again or we need to 
do something like

	gfp_t gfpflags = irqs_disabled() ? GFP_NOWAIT : GFP_KERNEL;

locally.
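
For illustration, a sketch of how that local selection could look (the
boot_gfp_flags() helper is hypothetical, not something in the patch):

	/*
	 * Sketch only: pick the flags locally instead of passing gfpflags in.
	 * GFP_NOWAIT while irqs are still disabled during early boot,
	 * GFP_KERNEL once they are enabled.
	 */
	static inline gfp_t boot_gfp_flags(void)
	{
		return irqs_disabled() ? GFP_NOWAIT : GFP_KERNEL;
	}

init_kmem_cache_nodes() could then call kmem_cache_alloc_node(kmalloc_caches,
boot_gfp_flags(), node) without needing a gfp_t parameter of its own.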

The cleanest solution would probably be to extend slab_state to be set in 
kmem_cache_init_late() to determine when we're fully initialized, though.

> > > +	for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
> > > +		struct kmem_cache *s = &kmalloc_caches[i];
> > > +
> > > +		if (s && s->size) {
> > > +			char *name = kasprintf(GFP_KERNEL,
> > > +				 "dma-kmalloc-%d", s->objsize);
> >
> > kasprintf() can return NULL which isn't caught by kmem_cache_open().
> 
> Then we will have a nameless cache. We could catch this with a WARN_ON()
> but does this work that early?
> 

It works, but this seems to be a forced

	if (WARN_ON(!name))
		continue;

because although it appears that s->name can be NULL within the slub 
layer, the sysfs layer would result in a NULL pointer dereference for 
things like strcmp() when looking up the cache's dirent.
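
Spelled out, the loop in kmem_cache_init_late() would then look something
like this (a sketch combining the quoted hunk with the check above, not
the final patch):

	for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
		struct kmem_cache *s = &kmalloc_caches[i];

		if (s && s->size) {
			char *name = kasprintf(GFP_KERNEL,
				 "dma-kmalloc-%d", s->objsize);

			/* Skip the DMA cache rather than register a NULL name */
			if (WARN_ON(!name))
				continue;

			create_kmalloc_cache(&kmalloc_dma_caches[i],
				name, s->objsize, SLAB_CACHE_DMA);
		}
	}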


* Re: slub: remove dynamic dma slab allocation
  2010-06-21 19:56     ` David Rientjes
@ 2010-06-21 20:32       ` Christoph Lameter
  2010-06-21 21:08         ` David Rientjes
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Lameter @ 2010-06-21 20:32 UTC (permalink / raw)
  To: David Rientjes; +Cc: Pekka Enberg, linux-mm

On Mon, 21 Jun 2010, David Rientjes wrote:

> > You cannot do that here because this function is also used later when the
> > slab is up. There is more in the percpu allocator which we are also trying
> > to use to avoid having static kmem_cache_cpu declarations. GFP_KERNEL
> > needs to be usable during early boot otherwise functions will have to add
> > special casing for boot situations.
> >
>
> The gfp_allowed_mask only changes once irqs are enabled, so either the
> gfpflags need to be passed into init_kmem_cache_nodes again or we need to
> do something like
>
> 	gfp_t gfpflags = irqs_disabled() ? GFP_NOWAIT : GFP_KERNEL;
>
> locally.

What a mess....

> The cleanest solution would probably be to extend slab_state to be set in
> kmem_cache_init_late() to determine when we're fully initialized, though.

Not sure what the point would be. Changing slab_state does not change the
interrupt enabled/disabled state of the processor.

Is gfp_allowed_mask properly updated during boot? Then we could just use

	GFP_KERNEL & gfp_allowed_mask

in these locations? Still bad since we are wasting code on correctness
checks.

No one thought about this when designing these checks? Can the checks not
be fixed up to consider boot time, so that we do not have to do acrobatics
in the code?


* Re: slub: remove dynamic dma slab allocation
  2010-06-21 20:32       ` Christoph Lameter
@ 2010-06-21 21:08         ` David Rientjes
  0 siblings, 0 replies; 21+ messages in thread
From: David Rientjes @ 2010-06-21 21:08 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Pekka Enberg, linux-mm

On Mon, 21 Jun 2010, Christoph Lameter wrote:

> > > You cannot do that here because this function is also used later when the
> > > slab is up. There is more in the percpu allocator which we are also trying
> > > to use to avoid having static kmem_cache_cpu declarations. GFP_KERNEL
> > > needs to be usable during early boot otherwise functions will have to add
> > > special casing for boot situations.
> > >
> >
> > The gfp_allowed_mask only changes once irqs are enabled, so either the
> > gfpflags need to be passed into init_kmem_cache_nodes again or we need to
> > do something like
> >
> > 	gfp_t gfpflags = irqs_disabled() ? GFP_NOWAIT : GFP_KERNEL;
> >
> > locally.
> 
> What a mess....
> 
> > The cleanest solution would probably be to extend slab_state to be set in
> > kmem_cache_init_late() to determine when we're fully initialized, though.
> 
> Not sure what the point would be. Changing slab_state does not change the
> interrupt enabled/disabled state of the processor.
> 

If you added a slab_state level higher than UP and set it in
kmem_cache_init_late(), then you could check for it in
init_kmem_cache_nodes() to choose between GFP_NOWAIT and GFP_KERNEL,
rather than checking irqs_disabled(); enabling irqs is the only real
event that requires kmem_cache_init_late() to exist in the first place.
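
As a sketch, with a hypothetical name for the extra level (FULL here is
just illustrative):

	/*
	 * Sketch: assumes slab_state gains a level above UP, set from
	 * kmem_cache_init_late() once GFP_KERNEL is safe to use.
	 */
	gfp_t gfpflags = slab_state >= FULL ? GFP_KERNEL : GFP_NOWAIT;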

I'm not sure if you'd ever use that state again, but it's robust if 
anything is ever added in the space between kmem_cache_init() and 
kmem_cache_init_late() for a reason.  slab_is_available() certainly 
doesn't need it because we don't kmem_cache_create() in between the two.

When you consider those solutions, it doesn't appear as though removing 
the gfp_t formal in init_kmem_cache_nodes() is really that much of a 
cleanup.

> Is gfp_allowed_mask properly updated during boot? Then we could just use
> 
> 	GFP_KERNEL & gfp_allowed_mask
> 
> in these locations? Still bad since we are wasting code on correctness
> checks.
> 

That certainly does get us GFP_NOWAIT (same as GFP_BOOT_MASK) before irqs 
are enabled and GFP_KERNEL afterwards since gfp_allowed_mask is updated at 
the same time.  If it's worth getting rid of the gfp_t formal in 
init_kmem_cache_nodes() so much, then that masking would deserve a big fat 
comment :)
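
Something like this, with the comment doing the explaining (a sketch of
the masking idea, not a tested change):

	/*
	 * Big fat comment: until irqs are enabled, gfp_allowed_mask is
	 * GFP_BOOT_MASK, so this quietly degrades GFP_KERNEL to a
	 * GFP_NOWAIT-style allocation during early boot and is a no-op
	 * afterwards.
	 */
	n = kmem_cache_alloc_node(kmalloc_caches,
					GFP_KERNEL & gfp_allowed_mask, node);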

> No one thought about this when designing these checks? Can the checks not
> be fixed up to consider boot time, so that we do not have to do acrobatics
> in the code?
> 

I think gfp_allowed_mask is the intended solution, since it simply masks
GFP_KERNEL down to GFP_BOOT_MASK for those allocations until it gets
updated.


Thread overview: 21+ messages
2010-06-15 19:07 slub: remove dynamic dma slab allocation Christoph Lameter
2010-06-15 19:11 ` [RFC] slub: Simplify boot kmem_cache_cpu allocations Christoph Lameter
2010-06-16  8:53   ` Tejun Heo
2010-06-16 16:33     ` Christoph Lameter
2010-06-16 17:18       ` Tejun Heo
2010-06-16 17:35         ` Christoph Lameter
2010-06-17  8:49           ` Tejun Heo
2010-06-17  9:01             ` Pekka Enberg
2010-06-17 13:43             ` Christoph Lameter
2010-06-18 16:58               ` [PATCH 1/2] percpu: make @dyn_size always mean min dyn_size in first chunk init functions Tejun Heo
2010-06-18 17:29                 ` Christoph Lameter
2010-06-18 17:31                 ` Christoph Lameter
2010-06-18 17:39                   ` Tejun Heo
2010-06-18 18:03                     ` Christoph Lameter
2010-06-19  8:23                       ` Tejun Heo
2010-06-18 16:58               ` [PATCH 2/2] percpu: allow limited allocation before slab is online Tejun Heo
2010-06-18 22:30 ` slub: remove dynamic dma slab allocation David Rientjes
2010-06-21 14:25   ` Christoph Lameter
2010-06-21 19:56     ` David Rientjes
2010-06-21 20:32       ` Christoph Lameter
2010-06-21 21:08         ` David Rientjes
