linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH] Manage jbd allocations from its own slabs
@ 2006-08-23 23:08 Badari Pulavarty
  2006-08-23 23:34 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Badari Pulavarty @ 2006-08-23 23:08 UTC (permalink / raw)
  To: Herbert Xu, akpm; +Cc: lkml, ext2-devel

Hi,

Here is the fix to "bh: Ensure bh fits within a page" problem
caused by JBD.

BTW, I realized that this problem can happen only with 1k, 2k
filesystems - as 4k, 8k allocations disable slab debug 
automatically. But for completeness, I created slabs for those
also.

What do you think ? I ran basic tests and things are fine.

Thanks,
Badari

JBD currently allocates commit and frozen buffers from slabs.
With CONFIG_SLAB_DEBUG, its possible for an allocation to
cross the page boundary causing IO problems.

https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=200127

So, instead of allocating these from regular slabs - manage
allocation from its own slabs and disable slab debug for these slabs.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
---
 fs/jbd/commit.c      |    6 +--
 fs/jbd/journal.c     |   88 ++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/jbd/transaction.c |    9 ++---
 include/linux/jbd.h  |    3 +
 4 files changed, 95 insertions(+), 11 deletions(-)

Index: linux-2.6.18-rc4/fs/jbd/journal.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/journal.c	2006-08-23 14:47:25.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/journal.c	2006-08-23 15:59:06.000000000 -0700
@@ -328,10 +328,10 @@
 		char *tmp;
 
 		jbd_unlock_bh_state(bh_in);
-		tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS);
+		tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS);
 		jbd_lock_bh_state(bh_in);
 		if (jh_in->b_frozen_data) {
-			kfree(tmp);
+			jbd_slab_free(tmp, bh_in->b_size);
 			goto repeat;
 		}
 
@@ -1612,6 +1612,89 @@
 }
 
 /*
+ * jbd slab management: create 1k, 2k, 4k, 8k slabs and allocate
+ * frozen and commit buffers from these slabs.
+ *
+ * Reason for doing this is to avoid, SLAB_DEBUG - since it could
+ * cause bh to cross page boundary.
+ */
+
+static kmem_cache_t *jbd_slab[4];
+static const char *jbd_slab_names[4] = {
+	"jbd_1k",
+	"jbd_2k",
+	"jbd_4k",
+	"jbd_8k",
+};
+
+static void journal_destroy_jbd_slabs(void)
+{
+	int i;
+
+	for (i=0; i<4; i++) {
+		if (jbd_slab[i])
+			kmem_cache_destroy(jbd_slab[i]);
+		jbd_slab[i] = NULL;
+	}
+}
+
+static int journal_init_jbd_slabs(void)
+{
+	int i = 0;
+	int retval = 0;
+
+	for (i=0; i<4; i++) {
+		unsigned long slab_size = 1024 << i;
+		jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
+					slab_size, slab_size,
+					0, NULL, NULL);
+		if (jbd_slab[i] == 0) {
+			journal_destroy_jbd_slabs();
+			retval = -ENOMEM;
+			printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
+			goto out;
+		}
+	}
+out:
+	return retval;
+}
+
+static int jbd_find_slab_index(size_t size)
+{
+	int idx = 0;
+
+	switch (size) {
+	case 1024:	idx = 0;
+			break;
+	case 2048:	idx = 1;
+			break;
+	case 4096:	idx = 2;
+			break;
+	case 8192:	idx = 3;
+			break;
+	default:	printk("JBD unknown slab size %ld\n", size);
+			BUG();
+	}
+	return idx;
+}
+
+void * jbd_slab_alloc(size_t size, gfp_t flags)
+{
+	int idx;
+
+	idx = jbd_find_slab_index(size);
+	return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
+}
+
+void jbd_slab_free(void *ptr,  size_t size)
+{
+	int idx;
+
+	idx = jbd_find_slab_index(size);
+	kmem_cache_free(jbd_slab[idx], ptr);
+}
+
+/*
  * Journal_head storage management
  */
 static kmem_cache_t *journal_head_cache;
@@ -1799,13 +1882,13 @@
 				printk(KERN_WARNING "%s: freeing "
 						"b_frozen_data\n",
 						__FUNCTION__);
-				kfree(jh->b_frozen_data);
+				jbd_slab_free(jh->b_frozen_data, bh->b_size);
 			}
 			if (jh->b_committed_data) {
 				printk(KERN_WARNING "%s: freeing "
 						"b_committed_data\n",
 						__FUNCTION__);
-				kfree(jh->b_committed_data);
+				jbd_slab_free(jh->b_committed_data, bh->b_size);
 			}
 			bh->b_private = NULL;
 			jh->b_bh = NULL;	/* debug, really */
@@ -1953,6 +2036,8 @@
 		ret = journal_init_journal_head_cache();
 	if (ret == 0)
 		ret = journal_init_handle_cache();
+	if (ret == 0)
+		ret = journal_init_jbd_slabs();
 	return ret;
 }
 
@@ -1961,6 +2046,7 @@
 	journal_destroy_revoke_caches();
 	journal_destroy_journal_head_cache();
 	journal_destroy_handle_cache();
+	journal_destroy_jbd_slabs();
 }
 
 static int __init journal_init(void)
Index: linux-2.6.18-rc4/fs/jbd/transaction.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/transaction.c	2006-08-23 14:05:57.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/transaction.c	2006-08-23 14:50:17.000000000 -0700
@@ -666,8 +666,9 @@
 			if (!frozen_buffer) {
 				JBUFFER_TRACE(jh, "allocate memory for buffer");
 				jbd_unlock_bh_state(bh);
-				frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
-							    GFP_NOFS);
+				frozen_buffer =
+					jbd_slab_alloc(jh2bh(jh)->b_size,
+							 GFP_NOFS);
 				if (!frozen_buffer) {
 					printk(KERN_EMERG
 					       "%s: OOM for frozen_buffer\n",
@@ -879,7 +880,7 @@
 
 repeat:
 	if (!jh->b_committed_data) {
-		committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS);
+		committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
 		if (!committed_data) {
 			printk(KERN_EMERG "%s: No memory for committed data\n",
 				__FUNCTION__);
@@ -906,7 +907,7 @@
 out:
 	journal_put_journal_head(jh);
 	if (unlikely(committed_data))
-		kfree(committed_data);
+		jbd_slab_free(committed_data, bh->b_size);
 	return err;
 }
 
Index: linux-2.6.18-rc4/fs/jbd/commit.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/commit.c	2006-08-23 14:05:57.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/commit.c	2006-08-23 14:50:17.000000000 -0700
@@ -261,7 +261,7 @@
 			struct buffer_head *bh = jh2bh(jh);
 
 			jbd_lock_bh_state(bh);
-			kfree(jh->b_committed_data);
+			jbd_slab_free(jh->b_committed_data, bh->b_size);
 			jh->b_committed_data = NULL;
 			jbd_unlock_bh_state(bh);
 		}
@@ -745,14 +745,14 @@
 		 * Otherwise, we can just throw away the frozen data now.
 		 */
 		if (jh->b_committed_data) {
-			kfree(jh->b_committed_data);
+			jbd_slab_free(jh->b_committed_data, bh->b_size);
 			jh->b_committed_data = NULL;
 			if (jh->b_frozen_data) {
 				jh->b_committed_data = jh->b_frozen_data;
 				jh->b_frozen_data = NULL;
 			}
 		} else if (jh->b_frozen_data) {
-			kfree(jh->b_frozen_data);
+			jbd_slab_free(jh->b_frozen_data, bh->b_size);
 			jh->b_frozen_data = NULL;
 		}
 
Index: linux-2.6.18-rc4/include/linux/jbd.h
===================================================================
--- linux-2.6.18-rc4.orig/include/linux/jbd.h	2006-08-23 14:05:57.000000000 -0700
+++ linux-2.6.18-rc4/include/linux/jbd.h	2006-08-23 14:50:17.000000000 -0700
@@ -72,6 +72,9 @@
 #endif
 
 extern void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry);
+extern void * jbd_slab_alloc(size_t size, gfp_t flags);
+extern void jbd_slab_free(void *ptr, size_t size);
+
 #define jbd_kmalloc(size, flags) \
 	__jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry)
 #define jbd_rep_kmalloc(size, flags) \





^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] Manage jbd allocations from its own slabs
  2006-08-23 23:08 [RFC][PATCH] Manage jbd allocations from its own slabs Badari Pulavarty
@ 2006-08-23 23:34 ` Andrew Morton
  2006-08-24 12:41   ` [Ext2-devel] " Dave Kleikamp
  2006-08-25  0:00   ` Badari Pulavarty
  2006-08-24 12:29 ` [Ext2-devel] " Dave Kleikamp
  2006-08-24 18:53 ` Christoph Hellwig
  2 siblings, 2 replies; 9+ messages in thread
From: Andrew Morton @ 2006-08-23 23:34 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Herbert Xu, lkml, ext2-devel

On Wed, 23 Aug 2006 16:08:15 -0700
Badari Pulavarty <pbadari@us.ibm.com> wrote:

> Hi,
> 
> Here is the fix to "bh: Ensure bh fits within a page" problem
> caused by JBD.
> 
> BTW, I realized that this problem can happen only with 1k, 2k
> filesystems - as 4k, 8k allocations disable slab debug 
> automatically. But for completeness, I created slabs for those
> also.
> 
> What do you think ? I ran basic tests and things are fine.
> 

Thanks for working on this.

> ...
>
>  /*
> + * jbd slab management: create 1k, 2k, 4k, 8k slabs and allocate
> + * frozen and commit buffers from these slabs.
> + *
> + * Reason for doing this is to avoid, SLAB_DEBUG - since it could
> + * cause bh to cross page boundary.
> + */
> +
> +static kmem_cache_t *jbd_slab[4];
> +static const char *jbd_slab_names[4] = {
> +	"jbd_1k",
> +	"jbd_2k",
> +	"jbd_4k",
> +	"jbd_8k",
> +};
> +
> +static void journal_destroy_jbd_slabs(void)
> +{
> +	int i;
> +
> +	for (i=0; i<4; i++) {
> +		if (jbd_slab[i])
> +			kmem_cache_destroy(jbd_slab[i]);
> +		jbd_slab[i] = NULL;
> +	}
> +}
> +
> +static int journal_init_jbd_slabs(void)
> +{
> +	int i = 0;
> +	int retval = 0;
> +
> +	for (i=0; i<4; i++) {
> +		unsigned long slab_size = 1024 << i;
> +		jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
> +					slab_size, slab_size,
> +					0, NULL, NULL);

OK, passing align=slab_size fixes the bug.

> +		if (jbd_slab[i] == 0) {
> +			journal_destroy_jbd_slabs();
> +			retval = -ENOMEM;
> +			printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
> +			goto out;
> +		}
> +	}
> +out:
> +	return retval;
> +}

Do we have to create all four slabs up-front?  Perhaps we can defer that
until mount-time, after we have determined the filesystem's block size.

That way, most people's machines will only ever create a single slab cache:
jbd_4k.

> +static int jbd_find_slab_index(size_t size)
> +{
> +	int idx = 0;
> +
> +	switch (size) {
> +	case 1024:	idx = 0;
> +			break;
> +	case 2048:	idx = 1;
> +			break;
> +	case 4096:	idx = 2;
> +			break;
> +	case 8192:	idx = 3;
> +			break;
> +	default:	printk("JBD unknown slab size %ld\n", size);
> +			BUG();
> +	}
> +	return idx;
> +}

I'd suggest this function be changed to directly return a kmem_cache_t *.

> +void * jbd_slab_alloc(size_t size, gfp_t flags)
> +{
> +	int idx;
> +
> +	idx = jbd_find_slab_index(size);
> +	return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
> +}
> +
> +void jbd_slab_free(void *ptr,  size_t size)
> +{
> +	int idx;
> +
> +	idx = jbd_find_slab_index(size);
> +	kmem_cache_free(jbd_slab[idx], ptr);
> +}

Then these become simpler.



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Ext2-devel] [RFC][PATCH] Manage jbd allocations from its own slabs
  2006-08-23 23:08 [RFC][PATCH] Manage jbd allocations from its own slabs Badari Pulavarty
  2006-08-23 23:34 ` Andrew Morton
@ 2006-08-24 12:29 ` Dave Kleikamp
  2006-08-24 17:12   ` Badari Pulavarty
  2006-08-24 18:53 ` Christoph Hellwig
  2 siblings, 1 reply; 9+ messages in thread
From: Dave Kleikamp @ 2006-08-24 12:29 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Herbert Xu, akpm, ext2-devel, lkml

On Wed, 2006-08-23 at 16:08 -0700, Badari Pulavarty wrote:
> Hi,
> 
> Here is the fix to "bh: Ensure bh fits within a page" problem
> caused by JBD.
> 
> BTW, I realized that this problem can happen only with 1k, 2k
> filesystems - as 4k, 8k allocations disable slab debug 
> automatically. But for completeness, I created slabs for those
> also.

With a larger base page size, you could run into the same problems for
4K and 8K allocations, so it's a good thing to do them all.

> What do you think ? I ran basic tests and things are fine.

Looks sane to me.

Shaggy
-- 
David Kleikamp
IBM Linux Technology Center


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Ext2-devel] [RFC][PATCH] Manage jbd allocations from its own slabs
  2006-08-23 23:34 ` Andrew Morton
@ 2006-08-24 12:41   ` Dave Kleikamp
  2006-08-25  0:00   ` Badari Pulavarty
  1 sibling, 0 replies; 9+ messages in thread
From: Dave Kleikamp @ 2006-08-24 12:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Badari Pulavarty, ext2-devel, Herbert Xu, lkml

On Wed, 2006-08-23 at 16:34 -0700, Andrew Morton wrote:
> On Wed, 23 Aug 2006 16:08:15 -0700
> Badari Pulavarty <pbadari@us.ibm.com> wrote:

> >  /*
> > + * jbd slab management: create 1k, 2k, 4k, 8k slabs and allocate
> > + * frozen and commit buffers from these slabs.
> > + *
> > + * Reason for doing this is to avoid, SLAB_DEBUG - since it could
> > + * cause bh to cross page boundary.
> > + */
> > +
> > +static kmem_cache_t *jbd_slab[4];
> > +static const char *jbd_slab_names[4] = {
> > +	"jbd_1k",
> > +	"jbd_2k",
> > +	"jbd_4k",
> > +	"jbd_8k",
> > +};
> > +
> > +static void journal_destroy_jbd_slabs(void)
> > +{
> > +	int i;
> > +
> > +	for (i=0; i<4; i++) {
> > +		if (jbd_slab[i])
> > +			kmem_cache_destroy(jbd_slab[i]);
> > +		jbd_slab[i] = NULL;
> > +	}
> > +}
> > +
> > +static int journal_init_jbd_slabs(void)
> > +{
> > +	int i = 0;
> > +	int retval = 0;
> > +
> > +	for (i=0; i<4; i++) {
> > +		unsigned long slab_size = 1024 << i;
> > +		jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
> > +					slab_size, slab_size,
> > +					0, NULL, NULL);
> 
> OK, passing align=slab_size fixes the bug.

The comments above don't mention the alignment of the slabs, so a
comment here may help explain that the alignment is the key to avoiding
the page-straddling.  Or you could elaborate above.

Shaggy
-- 
David Kleikamp
IBM Linux Technology Center


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Ext2-devel] [RFC][PATCH] Manage jbd allocations from its own slabs
  2006-08-24 12:29 ` [Ext2-devel] " Dave Kleikamp
@ 2006-08-24 17:12   ` Badari Pulavarty
  0 siblings, 0 replies; 9+ messages in thread
From: Badari Pulavarty @ 2006-08-24 17:12 UTC (permalink / raw)
  To: Dave Kleikamp; +Cc: Herbert Xu, akpm, ext2-devel, lkml

Dave Kleikamp wrote:
> On Wed, 2006-08-23 at 16:08 -0700, Badari Pulavarty wrote:
>   
>> Hi,
>>
>> Here is the fix to "bh: Ensure bh fits within a page" problem
>> caused by JBD.
>>
>> BTW, I realized that this problem can happen only with 1k, 2k
>> filesystems - as 4k, 8k allocations disable slab debug 
>> automatically. But for completeness, I created slabs for those
>> also.
>>     
>
> With a larger base page size, you could run into the same problems for
> 4K and 8K allocations, so it's a good thing to do them all.
>
>   
Yes, with bigger base page size - this can happen for 4k, 8k also.

And also, I am re-doing the patch to address Andrew's comments - I will 
send out latest
in few hours (currently testing).

Thanks,
Badari



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] Manage jbd allocations from its own slabs
  2006-08-23 23:08 [RFC][PATCH] Manage jbd allocations from its own slabs Badari Pulavarty
  2006-08-23 23:34 ` Andrew Morton
  2006-08-24 12:29 ` [Ext2-devel] " Dave Kleikamp
@ 2006-08-24 18:53 ` Christoph Hellwig
  2006-08-24 19:11   ` Badari Pulavarty
  2 siblings, 1 reply; 9+ messages in thread
From: Christoph Hellwig @ 2006-08-24 18:53 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Herbert Xu, akpm, lkml, ext2-devel

On Wed, Aug 23, 2006 at 04:08:15PM -0700, Badari Pulavarty wrote:
> Hi,
> 
> Here is the fix to "bh: Ensure bh fits within a page" problem
> caused by JBD.
> 
> BTW, I realized that this problem can happen only with 1k, 2k
> filesystems - as 4k, 8k allocations disable slab debug 
> automatically. But for completeness, I created slabs for those
> also.
> 
> What do you think ? I ran basic tests and things are fine.

Why can't you just use alloc_page?  I bet the whole slab overhead
eats more memory than what's wasted when using alloc_pages.  Especially
as the typical usecase is a 4k blocks filesystem with 4k pagesize
where the overhead of alloc_page is non-existant.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] Manage jbd allocations from its own slabs
  2006-08-24 18:53 ` Christoph Hellwig
@ 2006-08-24 19:11   ` Badari Pulavarty
  2006-08-25  2:24     ` [Ext2-devel] " Theodore Tso
  0 siblings, 1 reply; 9+ messages in thread
From: Badari Pulavarty @ 2006-08-24 19:11 UTC (permalink / raw)
  To: Christoph Hellwig, Badari Pulavarty, Herbert Xu, akpm, lkml, ext2-devel

Christoph Hellwig wrote:
> On Wed, Aug 23, 2006 at 04:08:15PM -0700, Badari Pulavarty wrote:
>   
>> Hi,
>>
>> Here is the fix to "bh: Ensure bh fits within a page" problem
>> caused by JBD.
>>
>> BTW, I realized that this problem can happen only with 1k, 2k
>> filesystems - as 4k, 8k allocations disable slab debug 
>> automatically. But for completeness, I created slabs for those
>> also.
>>
>> What do you think ? I ran basic tests and things are fine.
>>     
>
> Why can't you just use alloc_page?  I bet the whole slab overhead
> eats more memory than what's wasted when using alloc_pages.  Especially
> as the typical usecase is a 4k blocks filesystem with 4k pagesize
> where the overhead of alloc_page is non-existant.
>   

Yes. That was what proposed earlier. But for 1k, 2k allocations we end 
up wasting whole page.
Isn't it ? Thats why I created right sized slabs and disable slab-debug. 
I guess, I can do this
only for 1k, 2k filesystems and directly use alloc_page() for 4k and 8k 
- but that would make
code ugly and also it doesn't handle cases for bigger base pagesize 
systems (64k power).

Thanks,
Badari



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH] Manage jbd allocations from its own slabs
  2006-08-23 23:34 ` Andrew Morton
  2006-08-24 12:41   ` [Ext2-devel] " Dave Kleikamp
@ 2006-08-25  0:00   ` Badari Pulavarty
  1 sibling, 0 replies; 9+ messages in thread
From: Badari Pulavarty @ 2006-08-25  0:00 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Herbert Xu, lkml, ext2-devel

Andrew,

Here is the latest patch. Unfortunately, its not surviving my stress
tests on 1k filesystem. I keep running into 

fs/buffer.c: 2791 assert in submit_bh()
	        BUG_ON(!buffer_mapped(bh));

I haven't touched "bh" itself. So, I am not sure whats happening
here. I am trying to reproduce it on mainline 2.6.18-rc4 (seen
it once so far - but not consistently).

Changes since the last patch:

- create appropriate slabs only when we mount the filesystem with
that blocksize.
- simplify the find slab idx process and get it by shifting size.

Thanks,
Badari

JBD currently allocates commit and frozen buffers from slabs.
With CONFIG_SLAB_DEBUG, its possible for an allocation to
cross the page boundary causing IO problems.

https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=200127

So, instead of allocating these from regular slabs - manage
allocation from its own slabs and disable slab debug for these slabs.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
---
 fs/jbd/commit.c      |    6 +--
 fs/jbd/journal.c     |   86 ++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/jbd/transaction.c |    9 ++---
 include/linux/jbd.h  |    3 +
 4 files changed, 93 insertions(+), 11 deletions(-)

Index: linux-2.6.18-rc4/fs/jbd/journal.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/journal.c	2006-08-24 13:23:28.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/journal.c	2006-08-24 16:19:27.000000000 -0700
@@ -84,6 +84,7 @@
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static int journal_create_jbd_slab(size_t slab_size);
 
 /*
  * Helper function used to manage commit timeouts
@@ -328,10 +329,10 @@
 		char *tmp;
 
 		jbd_unlock_bh_state(bh_in);
-		tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS);
+		tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS);
 		jbd_lock_bh_state(bh_in);
 		if (jh_in->b_frozen_data) {
-			kfree(tmp);
+			jbd_slab_free(tmp, bh_in->b_size);
 			goto repeat;
 		}
 
@@ -1090,6 +1091,13 @@
 		}
 	}
 
+	/*
+	 * Make sure to create a slab for this blocksize
+	 */
+	err = journal_create_jbd_slab(cpu_to_be32(journal->j_superblock->s_blocksize));
+	if (err)
+		return err;
+
 	/* Let the recovery code check whether it needs to recover any
 	 * data from the journal. */
 	if (journal_recover(journal))
@@ -1612,6 +1620,76 @@
 }
 
 /*
+ * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
+ * and allocate frozen and commit buffers from these slabs.
+ *
+ * Reason for doing this is to avoid, SLAB_DEBUG - since it could
+ * cause bh to cross page boundary.
+ */
+
+#define JBD_MAX_SLABS 5
+#define JBD_SLAB_INDEX(size)  (size >> 11)
+
+static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
+static const char *jbd_slab_names[JBD_MAX_SLABS] = {
+	"jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k"
+};
+
+static void journal_destroy_jbd_slabs(void)
+{
+	int i;
+
+	for (i=0; i<JBD_MAX_SLABS; i++) {
+		if (jbd_slab[i])
+			kmem_cache_destroy(jbd_slab[i]);
+		jbd_slab[i] = NULL;
+	}
+}
+
+static int journal_create_jbd_slab(size_t slab_size)
+{
+	int i = JBD_SLAB_INDEX(slab_size);
+
+	BUG_ON(i >= JBD_MAX_SLABS);
+	/*
+	 * Check if we already have a slab created for this size
+	 */
+	if (jbd_slab[i])
+		return 0;
+
+	/*
+	 * Create a slab and force alignment to be same as slabsize -
+	 * this will make sure that allocations won't cross the page
+	 * boundary.
+	 */
+	jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
+				slab_size, slab_size, 0, NULL, NULL);
+	if (!jbd_slab[i]) {
+		printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void * jbd_slab_alloc(size_t size, gfp_t flags)
+{
+	int idx;
+
+	idx = JBD_SLAB_INDEX(size);
+	BUG_ON(jbd_slab[idx] == NULL);
+	return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
+}
+
+void jbd_slab_free(void *ptr,  size_t size)
+{
+	int idx;
+
+	idx = JBD_SLAB_INDEX(size);
+	BUG_ON(jbd_slab[idx] == NULL);
+	kmem_cache_free(jbd_slab[idx], ptr);
+}
+
+/*
  * Journal_head storage management
  */
 static kmem_cache_t *journal_head_cache;
@@ -1799,13 +1877,13 @@
 				printk(KERN_WARNING "%s: freeing "
 						"b_frozen_data\n",
 						__FUNCTION__);
-				kfree(jh->b_frozen_data);
+				jbd_slab_free(jh->b_frozen_data, bh->b_size);
 			}
 			if (jh->b_committed_data) {
 				printk(KERN_WARNING "%s: freeing "
 						"b_committed_data\n",
 						__FUNCTION__);
-				kfree(jh->b_committed_data);
+				jbd_slab_free(jh->b_committed_data, bh->b_size);
 			}
 			bh->b_private = NULL;
 			jh->b_bh = NULL;	/* debug, really */
@@ -1961,6 +2039,7 @@
 	journal_destroy_revoke_caches();
 	journal_destroy_journal_head_cache();
 	journal_destroy_handle_cache();
+	journal_destroy_jbd_slabs();
 }
 
 static int __init journal_init(void)
Index: linux-2.6.18-rc4/fs/jbd/transaction.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/transaction.c	2006-08-24 13:23:28.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/transaction.c	2006-08-24 13:23:55.000000000 -0700
@@ -666,8 +666,9 @@
 			if (!frozen_buffer) {
 				JBUFFER_TRACE(jh, "allocate memory for buffer");
 				jbd_unlock_bh_state(bh);
-				frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
-							    GFP_NOFS);
+				frozen_buffer =
+					jbd_slab_alloc(jh2bh(jh)->b_size,
+							 GFP_NOFS);
 				if (!frozen_buffer) {
 					printk(KERN_EMERG
 					       "%s: OOM for frozen_buffer\n",
@@ -879,7 +880,7 @@
 
 repeat:
 	if (!jh->b_committed_data) {
-		committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS);
+		committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
 		if (!committed_data) {
 			printk(KERN_EMERG "%s: No memory for committed data\n",
 				__FUNCTION__);
@@ -906,7 +907,7 @@
 out:
 	journal_put_journal_head(jh);
 	if (unlikely(committed_data))
-		kfree(committed_data);
+		jbd_slab_free(committed_data, bh->b_size);
 	return err;
 }
 
Index: linux-2.6.18-rc4/fs/jbd/commit.c
===================================================================
--- linux-2.6.18-rc4.orig/fs/jbd/commit.c	2006-08-24 13:23:28.000000000 -0700
+++ linux-2.6.18-rc4/fs/jbd/commit.c	2006-08-24 13:23:55.000000000 -0700
@@ -261,7 +261,7 @@
 			struct buffer_head *bh = jh2bh(jh);
 
 			jbd_lock_bh_state(bh);
-			kfree(jh->b_committed_data);
+			jbd_slab_free(jh->b_committed_data, bh->b_size);
 			jh->b_committed_data = NULL;
 			jbd_unlock_bh_state(bh);
 		}
@@ -745,14 +745,14 @@
 		 * Otherwise, we can just throw away the frozen data now.
 		 */
 		if (jh->b_committed_data) {
-			kfree(jh->b_committed_data);
+			jbd_slab_free(jh->b_committed_data, bh->b_size);
 			jh->b_committed_data = NULL;
 			if (jh->b_frozen_data) {
 				jh->b_committed_data = jh->b_frozen_data;
 				jh->b_frozen_data = NULL;
 			}
 		} else if (jh->b_frozen_data) {
-			kfree(jh->b_frozen_data);
+			jbd_slab_free(jh->b_frozen_data, bh->b_size);
 			jh->b_frozen_data = NULL;
 		}
 
Index: linux-2.6.18-rc4/include/linux/jbd.h
===================================================================
--- linux-2.6.18-rc4.orig/include/linux/jbd.h	2006-08-24 13:23:28.000000000 -0700
+++ linux-2.6.18-rc4/include/linux/jbd.h	2006-08-24 13:23:55.000000000 -0700
@@ -72,6 +72,9 @@
 #endif
 
 extern void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry);
+extern void * jbd_slab_alloc(size_t size, gfp_t flags);
+extern void jbd_slab_free(void *ptr, size_t size);
+
 #define jbd_kmalloc(size, flags) \
 	__jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry)
 #define jbd_rep_kmalloc(size, flags) \




^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Ext2-devel] [RFC][PATCH] Manage jbd allocations from its own slabs
  2006-08-24 19:11   ` Badari Pulavarty
@ 2006-08-25  2:24     ` Theodore Tso
  0 siblings, 0 replies; 9+ messages in thread
From: Theodore Tso @ 2006-08-25  2:24 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Christoph Hellwig, Herbert Xu, akpm, lkml, ext2-devel

On Thu, Aug 24, 2006 at 12:11:25PM -0700, Badari Pulavarty wrote:
> > Why can't you just use alloc_page?  I bet the whole slab overhead
> > eats more memory than what's wasted when using alloc_pages.  Especially
> > as the typical usecase is a 4k blocks filesystem with 4k pagesize
> > where the overhead of alloc_page is non-existant.
> 
> Yes. That was what proposed earlier. But for 1k, 2k allocations we
> end up wasting whole page.  Isn't it ? Thats why I created right
> sized slabs and disable slab-debug.  I guess, I can do this only for
> 1k, 2k filesystems and directly use alloc_page() for 4k and 8k - but
> that would make code ugly and also it doesn't handle cases for
> bigger base pagesize systems (64k power).

Is there some way we can cleanly have a shortcut case where we use
alloc_page() if fs_blocksize == PAGE_SIZE?  The efficiency gains for
what will be the common case on many architectures will probably make
this worthwhile....

						- Ted

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2006-08-25  2:24 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-08-23 23:08 [RFC][PATCH] Manage jbd allocations from its own slabs Badari Pulavarty
2006-08-23 23:34 ` Andrew Morton
2006-08-24 12:41   ` [Ext2-devel] " Dave Kleikamp
2006-08-25  0:00   ` Badari Pulavarty
2006-08-24 12:29 ` [Ext2-devel] " Dave Kleikamp
2006-08-24 17:12   ` Badari Pulavarty
2006-08-24 18:53 ` Christoph Hellwig
2006-08-24 19:11   ` Badari Pulavarty
2006-08-25  2:24     ` [Ext2-devel] " Theodore Tso

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).