* [PATCH 1/2] jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer()
@ 2009-12-02  1:14 Theodore Ts'o
  2009-12-02  1:14 ` [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations Theodore Ts'o
  0 siblings, 1 reply; 8+ messages in thread
From: Theodore Ts'o @ 2009-12-02  1:14 UTC
  To: Ext4 Developers List; +Cc: Theodore Ts'o

OOM happens.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c  |    4 ++++
 fs/jbd2/journal.c |    4 ++++
 2 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d4cfd6d..8896c1d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		JBUFFER_TRACE(jh, "ph3: write metadata");
 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 						      jh, &new_jh, blocknr);
+		if (flags < 0) {
+			jbd2_journal_abort(journal, flags);
+			continue;
+		}
 		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 		wbuf[bufs++] = jh2bh(new_jh);
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index af60d98..3f473fa 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -358,6 +358,10 @@ repeat:
 
 		jbd_unlock_bh_state(bh_in);
 		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
+		if (!tmp) {
+			jbd2_journal_put_journal_head(new_jh);
+			return -ENOMEM;
+		}
 		jbd_lock_bh_state(bh_in);
 		if (jh_in->b_frozen_data) {
 			jbd2_free(tmp, bh_in->b_size);
-- 
1.6.5.216.g5288a.dirty



* [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations
  2009-12-02  1:14 [PATCH 1/2] jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer() Theodore Ts'o
@ 2009-12-02  1:14 ` Theodore Ts'o
  2009-12-02 16:48   ` Josef Bacik
  2009-12-02 16:53   ` Eric Sandeen
  0 siblings, 2 replies; 8+ messages in thread
From: Theodore Ts'o @ 2009-12-02  1:14 UTC
  To: Ext4 Developers List; +Cc: Theodore Ts'o

Now that SLUB seems to be fixed so that it respects the requested
alignment, use kmem_cache_alloc() as the allocator when the block size
of the buffer heads to be allocated is less than the page size.
Previously, we were using a full 16k page on a Power system for each
buffer, even when the file system was using a 1k or 4k block size.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c    |  116 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/jbd2.h |   11 +----
 2 files changed, 118 insertions(+), 9 deletions(-)
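
As a quick illustration of the size-to-cache mapping introduced below
(a standalone userspace sketch, not part of the patch; slab_index()
mirrors the patch's order_base_2(size) - 10 for power-of-two sizes):

#include <stdio.h>

static const char *jbd2_slab_names[8] = {
	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
};

/* ilog2(size) - 10; valid only for power-of-two sizes >= 1k */
static int slab_index(size_t size)
{
	int log = 0;

	while (size > 1) {
		size >>= 1;
		log++;
	}
	return log - 10;	/* 1k -> 0 ("jbd2_1k"), 4k -> 2 ("jbd2_4k") */
}

int main(void)
{
	size_t sizes[] = { 1024, 4096, 65536 };
	int i;

	for (i = 0; i < 3; i++)
		printf("%6zu -> %s\n", sizes[i],
		       jbd2_slab_names[slab_index(sizes[i])]);
	return 0;
}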

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 3f473fa..8e0c0a9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
 #include <linux/seq_file.h>
 #include <linux/math64.h>
 #include <linux/hash.h>
+#include <linux/log2.h>
+#include <linux/vmalloc.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -92,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_slab(size_t slab_size);
 
 /*
  * Helper function used to manage commit timeouts
@@ -1247,6 +1250,13 @@ int jbd2_journal_load(journal_t *journal)
 		}
 	}
 
+	/*
+	 * Create a slab for this blocksize
+	 */
+	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
+	if (err)
+		return err;
+
 	/* Let the recovery code check whether it needs to recover any
 	 * data from the journal. */
 	if (jbd2_journal_recover(journal))
@@ -1806,6 +1816,111 @@ size_t journal_tag_bytes(journal_t *journal)
 }
 
 /*
+ * JBD memory management
+ */
+
+#define JBD2_MAX_SLABS 8
+static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
+
+static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
+	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
+	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
+};
+
+
+static void jbd2_journal_destroy_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < JBD2_MAX_SLABS; i++) {
+		if (jbd2_slab[i])
+			kmem_cache_destroy(jbd2_slab[i]);
+		jbd2_slab[i] = NULL;
+	}
+}
+
+static int jbd2_journal_create_slab(size_t size)
+{
+	int i = order_base_2(size) - 10;
+	size_t slab_size;
+
+	if (size == PAGE_SIZE)
+		return 0;
+
+	if (i >= JBD2_MAX_SLABS)
+		return -EINVAL;
+
+	if (unlikely(i < 0))
+		i = 0;
+	if (jbd2_slab[i])
+		return 0;	/* Already created */
+
+	slab_size = 1 << (i+10);
+	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
+					 slab_size, 0, NULL);
+	if (!jbd2_slab[i]) {
+		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static struct kmem_cache *get_slab(size_t size)
+{
+	int i = order_base_2(size) - 10;
+
+	BUG_ON(i >= JBD2_MAX_SLABS);
+	if (unlikely(i < 0))
+		i = 0;
+	BUG_ON(jbd2_slab[i] == 0);
+	return jbd2_slab[i];
+}
+
+void *jbd2_alloc(size_t size, gfp_t flags)
+{
+	void *ptr;
+
+	BUG_ON(size & (size-1)); /* Must be a power of 2 */
+
+	flags |= __GFP_REPEAT;
+	if (size == PAGE_SIZE)
+		ptr = (void *)__get_free_pages(flags, 0);
+	else if (size > PAGE_SIZE) {
+		int order = get_order(size);
+
+		if (order < 3)
+			ptr = (void *)__get_free_pages(flags, order);
+		else
+			ptr = vmalloc(size);
+	} else
+		ptr = kmem_cache_alloc(get_slab(size), flags);
+
+	/* Check alignment; SLUB has gotten this wrong in the past,
+	 * and this can lead to user data corruption! */
+	BUG_ON(((unsigned long) ptr) & (size-1));
+
+	return ptr;
+}
+
+void jbd2_free(void *ptr, size_t size)
+{
+	if (size == PAGE_SIZE) {
+		free_pages((unsigned long)ptr, 0);
+		return;
+	}
+	if (size > PAGE_SIZE) {
+		int order = get_order(size);
+
+		if (order < 3)
+			free_pages((unsigned long)ptr, order);
+		else
+			vfree(ptr);
+		return;
+	}
+	kmem_cache_free(get_slab(size), ptr);
+};
+
+/*
  * Journal_head storage management
  */
 static struct kmem_cache *jbd2_journal_head_cache;
@@ -2202,6 +2317,7 @@ static void jbd2_journal_destroy_caches(void)
 	jbd2_journal_destroy_revoke_caches();
 	jbd2_journal_destroy_jbd2_journal_head_cache();
 	jbd2_journal_destroy_handle_cache();
+	jbd2_journal_destroy_slabs();
 }
 
 static int __init journal_init(void)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f1011f7..a7a8c7a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug;
 #define jbd_debug(f, a...)	/**/
 #endif
 
-static inline void *jbd2_alloc(size_t size, gfp_t flags)
-{
-	return (void *)__get_free_pages(flags, get_order(size));
-}
-
-static inline void jbd2_free(void *ptr, size_t size)
-{
-	free_pages((unsigned long)ptr, get_order(size));
-};
+extern void *jbd2_alloc(size_t size, gfp_t flags);
+extern void jbd2_free(void *ptr, size_t size);
 
 #define JBD2_MIN_JOURNAL_BLOCKS 1024


* Re: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations
  2009-12-02  1:14 ` [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations Theodore Ts'o
@ 2009-12-02 16:48   ` Josef Bacik
  2009-12-02 16:53   ` Eric Sandeen
  1 sibling, 0 replies; 8+ messages in thread
From: Josef Bacik @ 2009-12-02 16:48 UTC
  To: Theodore Ts'o; +Cc: Ext4 Developers List

On Tue, Dec 01, 2009 at 08:14:11PM -0500, Theodore Ts'o wrote:
> Now that SLUB seems to be fixed so that it respects the requested
> alignment, use kmem_cache_alloc() as the allocator when the block size
> of the buffer heads to be allocated is less than the page size.
> Previously, we were using a full 16k page on a Power system for each
> buffer, even when the file system was using a 1k or 4k block size.
> 
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Heh, funny you should bring this up; we just fixed a problem in RHEL5 related
to using slabs.  If you have a non-jbd-based root filesystem and then mount a
bunch of jbd-based fs's at the same time, you will race with the cache
creation code and end up getting errors about creating the same slab cache
multiple times.  The way this was fixed in RHEL5 was to put a mutex around
the slab cache creation and deletion to prevent this sort of race.  It seems
that this patch has the same issue as the 2.6.18 jbd had.  Thanks,

Josef
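
A minimal sketch of the serialization Josef describes (names here are
illustrative, not the actual RHEL5 code; the -v2 patch later in this
thread does the equivalent with a semaphore around
jbd2_journal_create_slab()):

#include <linux/mutex.h>
#include <linux/slab.h>

#define JBD_MAX_SLABS 8
static struct kmem_cache *jbd_slab[JBD_MAX_SLABS];
static const char *jbd_slab_names[JBD_MAX_SLABS] = {
	"jbd_1k", "jbd_2k", "jbd_4k", "jbd_8k",
	"jbd_16k", "jbd_32k", "jbd_64k", "jbd_128k"
};
static DEFINE_MUTEX(jbd_slab_mutex);

/* Serialize check-then-create so that two simultaneous mounts cannot
 * both see jbd_slab[i] == NULL and both call kmem_cache_create()
 * with the same cache name. */
static int jbd_create_slab(int i, size_t size)
{
	int ret = 0;

	mutex_lock(&jbd_slab_mutex);
	if (!jbd_slab[i])	/* re-check under the lock */
		jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
						size, size, 0, NULL);
	if (!jbd_slab[i])
		ret = -ENOMEM;
	mutex_unlock(&jbd_slab_mutex);
	return ret;
}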


* Re: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations
  2009-12-02  1:14 ` [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations Theodore Ts'o
  2009-12-02 16:48   ` Josef Bacik
@ 2009-12-02 16:53   ` Eric Sandeen
  2009-12-02 19:31     ` Andreas Dilger
                       ` (2 more replies)
  1 sibling, 3 replies; 8+ messages in thread
From: Eric Sandeen @ 2009-12-02 16:53 UTC
  To: Theodore Ts'o; +Cc: Ext4 Developers List

Theodore Ts'o wrote:
> Now that SLUB seems to be fixed so that it respects the requested
> alignment, use kmem_cache_alloc() as the allocator when the block size
> of the buffer heads to be allocated is less than the page size.
> Previously, we were using a full 16k page on a Power system for each
> buffer, even when the file system was using a 1k or 4k block size.

So, this undoes commit c089d490dfbf53bc0893dc9ef57cf3ee6448314d
more or less, right?

JBD: Replace slab allocations with page allocations

Author: Mingming Cao <cmm@us.ibm.com>

JBD allocates memory for committed_data and frozen_data from the slab.
However, JBD should not pass slab pages down to the block layer.  Use
page allocator pages instead.  This will also prepare JBD for the
large blocksize patchset.

Was alignment the only reason that commit went in?


Also, I think there is a race issue here that we've had bugs
on internally, but with the above commit it didn't matter anymore:

If you mount several jbd(2)-based filesystems simultaneously,
before the slabs get created, nothing protects jbd2_slab[]
and we can race on the creation/initialization of that array.
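
The window is the classic check-then-create gap; an illustrative
interleaving (not an actual trace):

/*
 *   mount A: if (jbd2_slab[i])  -> NULL, so proceed to create
 *   mount B: if (jbd2_slab[i])  -> still NULL, so also proceeds
 *   mount A: jbd2_slab[i] = kmem_cache_create("jbd2_4k", ...)  -> OK
 *   mount B: jbd2_slab[i] = kmem_cache_create("jbd2_4k", ...)  -> duplicate
 *            cache name: complains, and can overwrite A's pointer
 */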

See also https://bugzilla.lustre.org/show_bug.cgi?id=19184

and a proposed patch:

https://bugzilla.lustre.org/attachment.cgi?id=22906

but I think that patch is a little heavy-handed; the down_reads
of the slab lock seem extraneous, because if the fs is mounted,
nobody will be tearing down that slab anyway, since that only
happens on module unload, right?

-Eric

> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
>  fs/jbd2/journal.c    |  116 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/jbd2.h |   11 +----
>  2 files changed, 118 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 3f473fa..8e0c0a9 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -39,6 +39,8 @@
>  #include <linux/seq_file.h>
>  #include <linux/math64.h>
>  #include <linux/hash.h>
> +#include <linux/log2.h>
> +#include <linux/vmalloc.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/jbd2.h>
> @@ -92,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
>  
>  static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
>  static void __journal_abort_soft (journal_t *journal, int errno);
> +static int jbd2_journal_create_slab(size_t slab_size);
>  
>  /*
>   * Helper function used to manage commit timeouts
> @@ -1247,6 +1250,13 @@ int jbd2_journal_load(journal_t *journal)
>  		}
>  	}
>  
> +	/*
> +	 * Create a slab for this blocksize
> +	 */
> +	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
> +	if (err)
> +		return err;
> +
>  	/* Let the recovery code check whether it needs to recover any
>  	 * data from the journal. */
>  	if (jbd2_journal_recover(journal))
> @@ -1806,6 +1816,111 @@ size_t journal_tag_bytes(journal_t *journal)
>  }
>  
>  /*
> + * JBD memory management
> + */
> +
> +#define JBD2_MAX_SLABS 8
> +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
> +
> +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
> +	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
> +	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
> +};
> +
> +
> +static void jbd2_journal_destroy_slabs(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < JBD2_MAX_SLABS; i++) {
> +		if (jbd2_slab[i])
> +			kmem_cache_destroy(jbd2_slab[i]);
> +		jbd2_slab[i] = NULL;
> +	}
> +}
> +
> +static int jbd2_journal_create_slab(size_t size)
> +{
> +	int i = order_base_2(size) - 10;
> +	size_t slab_size;
> +
> +	if (size == PAGE_SIZE)
> +		return 0;
> +
> +	if (i >= JBD2_MAX_SLABS)
> +		return -EINVAL;
> +
> +	if (unlikely(i < 0))
> +		i = 0;
> +	if (jbd2_slab[i])
> +		return 0;	/* Already created */
> +
> +	slab_size = 1 << (i+10);
> +	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
> +					 slab_size, 0, NULL);
> +	if (!jbd2_slab[i]) {
> +		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
> +		return -ENOMEM;
> +	}
> +	return 0;
> +}
> +
> +static struct kmem_cache *get_slab(size_t size)
> +{
> +	int i = order_base_2(size) - 10;
> +
> +	BUG_ON(i >= JBD2_MAX_SLABS);
> +	if (unlikely(i < 0))
> +		i = 0;
> +	BUG_ON(jbd2_slab[i] == 0);
> +	return jbd2_slab[i];
> +}
> +
> +void *jbd2_alloc(size_t size, gfp_t flags)
> +{
> +	void *ptr;
> +
> +	BUG_ON(size & (size-1)); /* Must be a power of 2 */
> +
> +	flags |= __GFP_REPEAT;
> +	if (size == PAGE_SIZE)
> +		ptr = (void *)__get_free_pages(flags, 0);
> +	else if (size > PAGE_SIZE) {
> +		int order = get_order(size);
> +
> +		if (order < 3)
> +			ptr = (void *)__get_free_pages(flags, order);
> +		else
> +			ptr = vmalloc(size);
> +	} else
> +		ptr = kmem_cache_alloc(get_slab(size), flags);
> +
> +	/* Check alignment; SLUB has gotten this wrong in the past,
> +	 * and this can lead to user data corruption! */
> +	BUG_ON(((unsigned long) ptr) & (size-1));
> +
> +	return ptr;
> +}
> +
> +void jbd2_free(void *ptr, size_t size)
> +{
> +	if (size == PAGE_SIZE) {
> +		free_pages((unsigned long)ptr, 0);
> +		return;
> +	}
> +	if (size > PAGE_SIZE) {
> +		int order = get_order(size);
> +
> +		if (order < 3)
> +			free_pages((unsigned long)ptr, order);
> +		else
> +			vfree(ptr);
> +		return;
> +	}
> +	kmem_cache_free(get_slab(size), ptr);
> +};
> +
> +/*
>   * Journal_head storage management
>   */
>  static struct kmem_cache *jbd2_journal_head_cache;
> @@ -2202,6 +2317,7 @@ static void jbd2_journal_destroy_caches(void)
>  	jbd2_journal_destroy_revoke_caches();
>  	jbd2_journal_destroy_jbd2_journal_head_cache();
>  	jbd2_journal_destroy_handle_cache();
> +	jbd2_journal_destroy_slabs();
>  }
>  
>  static int __init journal_init(void)
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index f1011f7..a7a8c7a 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug;
>  #define jbd_debug(f, a...)	/**/
>  #endif
>  
> -static inline void *jbd2_alloc(size_t size, gfp_t flags)
> -{
> -	return (void *)__get_free_pages(flags, get_order(size));
> -}
> -
> -static inline void jbd2_free(void *ptr, size_t size)
> -{
> -	free_pages((unsigned long)ptr, get_order(size));
> -};
> +extern void *jbd2_alloc(size_t size, gfp_t flags);
> +extern void jbd2_free(void *ptr, size_t size);
>  
>  #define JBD2_MIN_JOURNAL_BLOCKS 1024
>  



* Re: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations
  2009-12-02 16:53   ` Eric Sandeen
@ 2009-12-02 19:31     ` Andreas Dilger
  2009-12-02 20:12     ` tytso
  2009-12-05  6:43     ` [PATCH 2/2] " Eric Sandeen
  2 siblings, 0 replies; 8+ messages in thread
From: Andreas Dilger @ 2009-12-02 19:31 UTC
  To: Eric Sandeen; +Cc: Theodore Ts'o, Brian Behlendorf, Ext4 Developers List

On 2009-12-02, at 09:53, Eric Sandeen wrote:
> So, this undoes commit c089d490dfbf53bc0893dc9ef57cf3ee6448314d
> more or less, right?
>
> JBD: Replace slab allocations with page allocations
>
> Author: Mingming Cao <cmm@us.ibm.com>
>
> JBD allocates memory for committed_data and frozen_data from the slab.
> However, JBD should not pass slab pages down to the block layer.  Use
> page allocator pages instead.  This will also prepare JBD for the
> large blocksize patchset.
>
> Was alignment the only reason that commit went in?

The reason had to do with the block layer, but I don't recall if  
alignment was the only reason for it.

> Also, I think there is a race issue here that we've had bugs
> on internally, but with the above commit it didn't matter anymore:
>
> If you mount several jbd(2)-based filesystems simultaneously,
> before the slabs get created, nothing protects jbd2_slab[]
> and we can race on the creation/initialization of that array.
>
> See also https://bugzilla.lustre.org/show_bug.cgi?id=19184
>
> and a proposed patch:
>
> https://bugzilla.lustre.org/attachment.cgi?id=22906
>
> but I think that patch is a little heavy-handed; the down_reads
> of the slab lock seem extraneous, because if the fs is mounted,
> nobody will be tearing down that slab anyway, since that only
> happens on module unload, right?

Right, the slabs should only be created once, and they can never be
destroyed while in use, so the rest of the locking is superfluous,
though it at least deserves to be replaced by a comment explaining
why no locking is needed.

>> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
>> ---
>> fs/jbd2/journal.c    |  116 ++++++++++++++++++++++++++++++++++++++++ 
>> ++++++++++
>> include/linux/jbd2.h |   11 +----
>> 2 files changed, 118 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
>> index 3f473fa..8e0c0a9 100644
>> --- a/fs/jbd2/journal.c
>> +++ b/fs/jbd2/journal.c
>> @@ -39,6 +39,8 @@
>> #include <linux/seq_file.h>
>> #include <linux/math64.h>
>> #include <linux/hash.h>
>> +#include <linux/log2.h>
>> +#include <linux/vmalloc.h>
>>
>> #define CREATE_TRACE_POINTS
>> #include <trace/events/jbd2.h>
>> @@ -92,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
>>
>> static int journal_convert_superblock_v1(journal_t *,  
>> journal_superblock_t *);
>> static void __journal_abort_soft (journal_t *journal, int errno);
>> +static int jbd2_journal_create_slab(size_t slab_size);
>>
>> /*
>>  * Helper function used to manage commit timeouts
>> @@ -1247,6 +1250,13 @@ int jbd2_journal_load(journal_t *journal)
>> 		}
>> 	}
>>
>> +	/*
>> +	 * Create a slab for this blocksize
>> +	 */
>> +	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
>> +	if (err)
>> +		return err;
>> +
>> 	/* Let the recovery code check whether it needs to recover any
>> 	 * data from the journal. */
>> 	if (jbd2_journal_recover(journal))
>> @@ -1806,6 +1816,111 @@ size_t journal_tag_bytes(journal_t *journal)
>> }
>>
>> /*
>> + * JBD memory management
>> + */
>> +
>> +#define JBD2_MAX_SLABS 8
>> +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
>> +
>> +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
>> +	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
>> +	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
>> +};
>> +
>> +
>> +static void jbd2_journal_destroy_slabs(void)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < JBD2_MAX_SLABS; i++) {
>> +		if (jbd2_slab[i])
>> +			kmem_cache_destroy(jbd2_slab[i]);
>> +		jbd2_slab[i] = NULL;
>> +	}
>> +}
>> +
>> +static int jbd2_journal_create_slab(size_t size)
>> +{
>> +	int i = order_base_2(size) - 10;
>> +	size_t slab_size;
>> +
>> +	if (size == PAGE_SIZE)
>> +		return 0;
>> +
>> +	if (i >= JBD2_MAX_SLABS)
>> +		return -EINVAL;
>> +
>> +	if (unlikely(i < 0))
>> +		i = 0;
>> +	if (jbd2_slab[i])
>> +		return 0;	/* Already created */
>> +
>> +	slab_size = 1 << (i+10);
>> +	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
>> +					 slab_size, 0, NULL);
>> +	if (!jbd2_slab[i]) {
>> +		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
>> +		return -ENOMEM;
>> +	}
>> +	return 0;
>> +}
>> +
>> +static struct kmem_cache *get_slab(size_t size)
>> +{
>> +	int i = order_base_2(size) - 10;
>> +
>> +	BUG_ON(i >= JBD2_MAX_SLABS);
>> +	if (unlikely(i < 0))
>> +		i = 0;
>> +	BUG_ON(jbd2_slab[i] == 0);
>> +	return jbd2_slab[i];
>> +}
>> +
>> +void *jbd2_alloc(size_t size, gfp_t flags)
>> +{
>> +	void *ptr;
>> +
>> +	BUG_ON(size & (size-1)); /* Must be a power of 2 */
>> +
>> +	flags |= __GFP_REPEAT;
>> +	if (size == PAGE_SIZE)
>> +		ptr = (void *)__get_free_pages(flags, 0);
>> +	else if (size > PAGE_SIZE) {
>> +		int order = get_order(size);
>> +
>> +		if (order < 3)
>> +			ptr = (void *)__get_free_pages(flags, order);
>> +		else
>> +			ptr = vmalloc(size);
>> +	} else
>> +		ptr = kmem_cache_alloc(get_slab(size), flags);
>> +
>> +	/* Check alignment; SLUB has gotten this wrong in the past,
>> +	 * and this can lead to user data corruption! */
>> +	BUG_ON(((unsigned long) ptr) & (size-1));
>> +
>> +	return ptr;
>> +}
>> +
>> +void jbd2_free(void *ptr, size_t size)
>> +{
>> +	if (size == PAGE_SIZE) {
>> +		free_pages((unsigned long)ptr, 0);
>> +		return;
>> +	}
>> +	if (size > PAGE_SIZE) {
>> +		int order = get_order(size);
>> +
>> +		if (order < 3)
>> +			free_pages((unsigned long)ptr, order);
>> +		else
>> +			vfree(ptr);
>> +		return;
>> +	}
>> +	kmem_cache_free(get_slab(size), ptr);
>> +};
>> +
>> +/*
>>  * Journal_head storage management
>>  */
>> static struct kmem_cache *jbd2_journal_head_cache;
>> @@ -2202,6 +2317,7 @@ static void jbd2_journal_destroy_caches(void)
>> 	jbd2_journal_destroy_revoke_caches();
>> 	jbd2_journal_destroy_jbd2_journal_head_cache();
>> 	jbd2_journal_destroy_handle_cache();
>> +	jbd2_journal_destroy_slabs();
>> }
>>
>> static int __init journal_init(void)
>> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
>> index f1011f7..a7a8c7a 100644
>> --- a/include/linux/jbd2.h
>> +++ b/include/linux/jbd2.h
>> @@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug;
>> #define jbd_debug(f, a...)	/**/
>> #endif
>>
>> -static inline void *jbd2_alloc(size_t size, gfp_t flags)
>> -{
>> -	return (void *)__get_free_pages(flags, get_order(size));
>> -}
>> -
>> -static inline void jbd2_free(void *ptr, size_t size)
>> -{
>> -	free_pages((unsigned long)ptr, get_order(size));
>> -};
>> +extern void *jbd2_alloc(size_t size, gfp_t flags);
>> +extern void jbd2_free(void *ptr, size_t size);
>>
>> #define JBD2_MIN_JOURNAL_BLOCKS 1024
>>


Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.



* Re: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations
  2009-12-02 16:53   ` Eric Sandeen
  2009-12-02 19:31     ` Andreas Dilger
@ 2009-12-02 20:12     ` tytso
  2009-12-03 23:45       ` [PATCH -v2] " Theodore Ts'o
  2009-12-05  6:43     ` [PATCH 2/2] " Eric Sandeen
  2 siblings, 1 reply; 8+ messages in thread
From: tytso @ 2009-12-02 20:12 UTC
  To: Eric Sandeen; +Cc: Ext4 Developers List

On Wed, Dec 02, 2009 at 10:53:26AM -0600, Eric Sandeen wrote:
> 
> So, this undoes commit c089d490dfbf53bc0893dc9ef57cf3ee6448314d
> more or less, right?
> 
> JBD: Replace slab allocations with page allocations

Close, but not quite.  We still use __get_free_pages() in the case
where blocksize == pagesize.  So in the most common case of 4k blocks
on x86/x86_64, we won't be suffering the overhead of using the
SLAB/SLUB/SLOB/SLQB machinery.

> Was alignment the only reason that commit went in?

Alignment, and also the fact that with slab debugging enabled, the
debug metadata pushed each object past its nominal size, which meant
that you needed two contiguous pages for every 4k shadow buffer_head
used by the JBD layer, not to mention the overhead of tracking each
page in the slab cache.

> Also, I think there is a race issue here that we've had bugs
> on internally, but with the above commit it didn't matter anymore:
>
> If you mount several jbd(2)-based filesystems simultaneously,
> before the slabs get created, nothing protects jbd2_slab[]
> and we can race on the creation/initialization of that array.
> 
> See a proposed patch:
> 
> https://bugzilla.lustre.org/attachment.cgi?id=22906
> 
> but I think that patch is a little heavy-handed; the down_reads
> of the slab lock seem extraneous, because if the fs is mounted,
> nobody will be tearing down that slab anyway, since that only
> happens on module unload, right?

Yeah, we only need the lock at mount time, when we are filling in the
jbd2_slab[] array.  Thanks for pointing that out; I'll fix it and
resend the patch.

					- Ted


* [PATCH -v2] ext4: Use slab allocator for sub-page sized allocations
  2009-12-02 20:12     ` tytso
@ 2009-12-03 23:45       ` Theodore Ts'o
  0 siblings, 0 replies; 8+ messages in thread
From: Theodore Ts'o @ 2009-12-03 23:45 UTC
  To: Ext4 Developers List; +Cc: Theodore Ts'o

Now that SLUB seems to be fixed so that it respects the requested
alignment, use kmem_cache_alloc() as the allocator when the block size
of the buffer heads to be allocated is less than the page size.
Previously, we were using a full 16k page on a Power system for each
buffer, even when the file system was using a 1k or 4k block size.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
Address the concerns found in https://bugzilla.lustre.org/show_bug.cgi?id=19184

 fs/jbd2/journal.c    |  132 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/jbd2.h |   11 +---
 2 files changed, 134 insertions(+), 9 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 3f473fa..5824353 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
 #include <linux/seq_file.h>
 #include <linux/math64.h>
 #include <linux/hash.h>
+#include <linux/log2.h>
+#include <linux/vmalloc.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -92,6 +94,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_slab(size_t slab_size);
 
 /*
  * Helper function used to manage commit timeouts
@@ -1247,6 +1250,13 @@ int jbd2_journal_load(journal_t *journal)
 		}
 	}
 
+	/*
+	 * Create a slab for this blocksize
+	 */
+	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
+	if (err)
+		return err;
+
 	/* Let the recovery code check whether it needs to recover any
 	 * data from the journal. */
 	if (jbd2_journal_recover(journal))
@@ -1806,6 +1816,127 @@ size_t journal_tag_bytes(journal_t *journal)
 }
 
 /*
+ * JBD memory management
+ *
+ * These functions are used to allocate block-sized chunks of memory
+ * used for making copies of buffer_head data.  Very often it will be
+ * page-sized chunks of data, but sometimes it will be in
+ * sub-page-size chunks.  (For example, 16k pages on Power systems
+ * with a 4k block file system.)  For blocks smaller than a page, we
+ * use a SLAB allocator.  There are slab caches for each block size,
+ * which are allocated at mount time, if necessary, and we only free
+ * (all of) the slab caches when/if the jbd2 module is unloaded.  For
+ * this reason we don't need a mutex to protect access to
+ * jbd2_slab[] when allocating or releasing memory; we only need
+ * one in jbd2_journal_create_slab().
+ */
+#define JBD2_MAX_SLABS 8
+static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
+static DECLARE_MUTEX(jbd2_slab_create_sem);
+
+static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
+	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
+	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
+};
+
+
+static void jbd2_journal_destroy_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < JBD2_MAX_SLABS; i++) {
+		if (jbd2_slab[i])
+			kmem_cache_destroy(jbd2_slab[i]);
+		jbd2_slab[i] = NULL;
+	}
+}
+
+static int jbd2_journal_create_slab(size_t size)
+{
+	int i = order_base_2(size) - 10;
+	size_t slab_size;
+
+	if (size == PAGE_SIZE)
+		return 0;
+
+	if (i >= JBD2_MAX_SLABS)
+		return -EINVAL;
+
+	if (unlikely(i < 0))
+		i = 0;
+	down(&jbd2_slab_create_sem);
+	if (jbd2_slab[i]) {
+		up(&jbd2_slab_create_sem);
+		return 0;	/* Already created */
+	}
+
+	slab_size = 1 << (i+10);
+	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
+					 slab_size, 0, NULL);
+	up(&jbd2_slab_create_sem);
+	if (!jbd2_slab[i]) {
+		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static struct kmem_cache *get_slab(size_t size)
+{
+	int i = order_base_2(size) - 10;
+
+	BUG_ON(i >= JBD2_MAX_SLABS);
+	if (unlikely(i < 0))
+		i = 0;
+	BUG_ON(jbd2_slab[i] == 0);
+	return jbd2_slab[i];
+}
+
+void *jbd2_alloc(size_t size, gfp_t flags)
+{
+	void *ptr;
+
+	BUG_ON(size & (size-1)); /* Must be a power of 2 */
+
+	flags |= __GFP_REPEAT;
+	if (size == PAGE_SIZE)
+		ptr = (void *)__get_free_pages(flags, 0);
+	else if (size > PAGE_SIZE) {
+		int order = get_order(size);
+
+		if (order < 3)
+			ptr = (void *)__get_free_pages(flags, order);
+		else
+			ptr = vmalloc(size);
+	} else
+		ptr = kmem_cache_alloc(get_slab(size), flags);
+
+	/* Check alignment; SLUB has gotten this wrong in the past,
+	 * and this can lead to user data corruption! */
+	BUG_ON(((unsigned long) ptr) & (size-1));
+
+	return ptr;
+}
+
+void jbd2_free(void *ptr, size_t size)
+{
+	if (size == PAGE_SIZE) {
+		free_pages((unsigned long)ptr, 0);
+		return;
+	}
+	if (size > PAGE_SIZE) {
+		int order = get_order(size);
+
+		if (order < 3)
+			free_pages((unsigned long)ptr, order);
+		else
+			vfree(ptr);
+		return;
+	}
+	kmem_cache_free(get_slab(size), ptr);
+};
+
+/*
  * Journal_head storage management
  */
 static struct kmem_cache *jbd2_journal_head_cache;
@@ -2202,6 +2333,7 @@ static void jbd2_journal_destroy_caches(void)
 	jbd2_journal_destroy_revoke_caches();
 	jbd2_journal_destroy_jbd2_journal_head_cache();
 	jbd2_journal_destroy_handle_cache();
+	jbd2_journal_destroy_slabs();
 }
 
 static int __init journal_init(void)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f1011f7..a7a8c7a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug;
 #define jbd_debug(f, a...)	/**/
 #endif
 
-static inline void *jbd2_alloc(size_t size, gfp_t flags)
-{
-	return (void *)__get_free_pages(flags, get_order(size));
-}
-
-static inline void jbd2_free(void *ptr, size_t size)
-{
-	free_pages((unsigned long)ptr, get_order(size));
-};
+extern void *jbd2_alloc(size_t size, gfp_t flags);
+extern void jbd2_free(void *ptr, size_t size);
 
 #define JBD2_MIN_JOURNAL_BLOCKS 1024


* Re: [PATCH 2/2] ext4: Use slab allocator for sub-page sized allocations
  2009-12-02 16:53   ` Eric Sandeen
  2009-12-02 19:31     ` Andreas Dilger
  2009-12-02 20:12     ` tytso
@ 2009-12-05  6:43     ` Eric Sandeen
  2 siblings, 0 replies; 8+ messages in thread
From: Eric Sandeen @ 2009-12-05  6:43 UTC
  To: Theodore Ts'o; +Cc: Ext4 Developers List

Eric Sandeen wrote:
> Theodore Ts'o wrote:
>> Now that SLUB seems to be fixed so that it respects the requested
>> alignment, use kmem_cache_alloc() as the allocator when the block size
>> of the buffer heads to be allocated is less than the page size.
>> Previously, we were using a full 16k page on a Power system for each
>> buffer, even when the file system was using a 1k or 4k block size.
> 
> So, this undoes commit c089d490dfbf53bc0893dc9ef57cf3ee6448314d
> more or less, right?
> 
> JBD: Replace slab allocations with page allocations
> 
> Author: Mingming Cao <cmm@us.ibm.com>
> 
> JBD allocates memory for committed_data and frozen_data from the slab.
> However, JBD should not pass slab pages down to the block layer.  Use
> page allocator pages instead.  This will also prepare JBD for the
> large blocksize patchset.
> 
> Was alignment the only reason that commit went in?

Actually, Christoph reminded me that iscsi & co will not like this.

See commit 1fa40b01ae4d1b00e366d4949edcc230f5cd6d99 for xfs moving
in the opposite direction back in 2007...

-Eric

