All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
@ 2010-08-24 10:50 David Rientjes
  2010-08-24 10:50 ` [patch 2/5] mm: add nofail variant of kmem_cache_zalloc David Rientjes
                   ` (6 more replies)
  0 siblings, 7 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-24 10:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jens Axboe, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail().  These
functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
respectively, except that they will never return NULL and instead loop
forever trying to allocate memory.

If the first allocation attempt fails, a warning will be emitted,
including a call trace.  Subsequent failures will suppress this warning.

These were added as helper functions for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 drivers/md/dm-region-hash.c |    2 +-
 fs/btrfs/inode.c            |    2 +-
 fs/gfs2/log.c               |    2 +-
 fs/gfs2/rgrp.c              |   18 ++++---------
 fs/jbd/transaction.c        |   11 ++------
 fs/reiserfs/journal.c       |    3 +-
 include/linux/slab.h        |   55 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -290,7 +290,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
 
 	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
 	if (unlikely(!nreg))
-		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
+		nreg = kmalloc_nofail(sizeof(*nreg), GFP_NOIO);
 
 	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
 		      DM_RH_CLEAN : DM_RH_NOSYNC;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1967,7 +1967,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
 	if (atomic_add_unless(&inode->i_count, -1, 1))
 		return;
 
-	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+	delayed = kmalloc_nofail(sizeof(*delayed), GFP_NOFS);
 	delayed->inode = inode;
 
 	spin_lock(&fs_info->delayed_iput_lock);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -709,7 +709,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	}
 	trace_gfs2_log_flush(sdp, 1);
 
-	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
+	ai = kzalloc_nofail(sizeof(struct gfs2_ail), GFP_NOFS);
 	INIT_LIST_HEAD(&ai->ai_ail1_list);
 	INIT_LIST_HEAD(&ai->ai_ail2_list);
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1440,8 +1440,8 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 		rgrp_blk++;
 
 		if (!bi->bi_clone) {
-			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
-					       GFP_NOFS | __GFP_NOFAIL);
+			bi->bi_clone = kmalloc_nofail(bi->bi_bh->b_size,
+					       GFP_NOFS);
 			memcpy(bi->bi_clone + bi->bi_offset,
 			       bi->bi_bh->b_data + bi->bi_offset,
 			       bi->bi_len);
@@ -1759,9 +1759,6 @@ fail:
  * @block: the block
  *
  * Figure out what RG a block belongs to and add that RG to the list
- *
- * FIXME: Don't use NOFAIL
- *
  */
 
 void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
@@ -1789,8 +1786,8 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
 	if (rlist->rl_rgrps == rlist->rl_space) {
 		new_space = rlist->rl_space + 10;
 
-		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
-			      GFP_NOFS | __GFP_NOFAIL);
+		tmp = kcalloc_nofail(new_space, sizeof(struct gfs2_rgrpd *),
+			      GFP_NOFS);
 
 		if (rlist->rl_rgd) {
 			memcpy(tmp, rlist->rl_rgd,
@@ -1811,17 +1808,14 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
  * @rlist: the list of resource groups
  * @state: the lock state to acquire the RG lock in
  * @flags: the modifier flags for the holder structures
- *
- * FIXME: Don't use NOFAIL
- *
  */
 
 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
 {
 	unsigned int x;
 
-	rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
-				GFP_NOFS | __GFP_NOFAIL);
+	rlist->rl_ghs = kcalloc_nofail(rlist->rl_rgrps,
+				sizeof(struct gfs2_holder), GFP_NOFS);
 	for (x = 0; x < rlist->rl_rgrps; x++)
 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
 				state, 0,
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -98,14 +98,9 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
 	}
 
 alloc_transaction:
-	if (!journal->j_running_transaction) {
-		new_transaction = kzalloc(sizeof(*new_transaction),
-						GFP_NOFS|__GFP_NOFAIL);
-		if (!new_transaction) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
+	if (!journal->j_running_transaction)
+		new_transaction = kzalloc_nofail(sizeof(*new_transaction),
+						GFP_NOFS);
 
 	jbd_debug(3, "New handle %p going live.\n", handle);
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2593,8 +2593,7 @@ static int journal_read(struct super_block *sb)
 static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
 {
 	struct reiserfs_journal_list *jl;
-	jl = kzalloc(sizeof(struct reiserfs_journal_list),
-		     GFP_NOFS | __GFP_NOFAIL);
+	jl = kzalloc_nofail(sizeof(struct reiserfs_journal_list), GFP_NOFS);
 	INIT_LIST_HEAD(&jl->j_list);
 	INIT_LIST_HEAD(&jl->j_working_list);
 	INIT_LIST_HEAD(&jl->j_tail_bh_list);
diff --git a/include/linux/slab.h b/include/linux/slab.h
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -334,6 +334,61 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
 	return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
 
+/**
+ * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kmalloc).
+ *
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+static inline void *kmalloc_nofail(size_t size, gfp_t flags)
+{
+	void *ret;
+
+	for (;;) {
+		ret = kmalloc(size, flags);
+		if (ret)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+				"(size=%lu flags=0x%x)\n",
+				size, flags);
+	}
+}
+
+/**
+ * kcalloc_nofail - infinitely loop until kcalloc() succeeds.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kcalloc).
+ *
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+static inline void *kcalloc_nofail(size_t n, size_t size, gfp_t flags)
+{
+	void *ret;
+
+	for (;;) {
+		ret = kcalloc(n, size, flags);
+		if (ret)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+				"(n=%lu size=%lu flags=0x%x)\n",
+				n, size, flags);
+	}
+}
+
+/**
+ * kzalloc_nofail - infinitely loop until kzalloc() succeeds.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kzalloc).
+ */
+static inline void *kzalloc_nofail(size_t size, gfp_t flags)
+{
+	return kmalloc_nofail(size, flags | __GFP_ZERO);
+}
+
 void __init kmem_cache_init_late(void);
 
 #endif	/* _LINUX_SLAB_H */

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [patch 2/5] mm: add nofail variant of kmem_cache_zalloc
  2010-08-24 10:50 [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc David Rientjes
@ 2010-08-24 10:50 ` David Rientjes
  2010-08-24 10:50 ` [patch 3/5] fs: add nofail variant of alloc_buffer_head David Rientjes
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-24 10:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Steven Whitehouse, Jens Axboe, cluster-devel, linux-kernel

Add kmem_cache_zalloc_nofail().  This function is equivalent to
kmem_cache_zalloc(), except that it will never return NULL and instead
loop forever trying to allocate memory.

If the first allocation attempt fails, a warning will be emitted,
including a call trace.  Subsequent failures will suppress this warning.

This was added as a helper function for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/gfs2/meta_io.c    |    2 +-
 include/linux/slab.h |   18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletions(-)

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -289,7 +289,7 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
 		return;
 	}
 
-	bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
+	bd = kmem_cache_zalloc_nofail(gfs2_bufdata_cachep, GFP_NOFS);
 	bd->bd_bh = bh;
 	bd->bd_gl = gl;
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -313,6 +313,24 @@ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
 	return kmem_cache_alloc(k, flags | __GFP_ZERO);
 }
 
+/*
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+static inline void *kmem_cache_zalloc_nofail(struct kmem_cache *k, gfp_t flags)
+{
+	void *ret;
+
+	for (;;) {
+		ret = kmem_cache_zalloc(k, flags);
+		if (ret)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+				"(flags=0x%x)\n",
+				flags);
+	}
+}
+
 /**
  * kzalloc - allocate memory. The memory is set to zero.
  * @size: how many bytes of memory are required.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [patch 3/5] fs: add nofail variant of alloc_buffer_head
  2010-08-24 10:50 [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc David Rientjes
  2010-08-24 10:50 ` [patch 2/5] mm: add nofail variant of kmem_cache_zalloc David Rientjes
@ 2010-08-24 10:50 ` David Rientjes
  2010-08-24 12:17     ` [Cluster-devel] " Jan Kara
  2010-08-24 10:50 ` [patch 4/5] btrfs: add nofail variant of set_extent_dirty David Rientjes
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 87+ messages in thread
From: David Rientjes @ 2010-08-24 10:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Steven Whitehouse, Jens Axboe, Jan Kara, cluster-devel,
	linux-ext4, linux-kernel

Add alloc_buffer_head_nofail().  This function is equivalent to
alloc_buffer_head(), except that it will never return NULL and instead
loop forever trying to allocate memory.

If the first allocation attempt fails, a warning will be emitted,
including a call trace.  Subsequent failures will suppress this warning.

This was added as a helper function for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/buffer.c                 |   18 ++++++++++++++++++
 fs/gfs2/log.c               |    2 +-
 fs/jbd/journal.c            |    2 +-
 include/linux/buffer_head.h |    1 +
 4 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3238,6 +3238,24 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 }
 EXPORT_SYMBOL(alloc_buffer_head);
 
+/*
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+struct buffer_head *alloc_buffer_head_nofail(gfp_t gfp_flags)
+{
+	struct buffer_head *ret;
+
+	for (;;) {
+		ret = alloc_buffer_head(gfp_flags);
+		if (ret)
+			return ret;
+		WARN_ONCE(1, "Out of memory; no fallback implemented "
+				"(flags=0x%x)\n",
+				gfp_flags);
+	}
+}
+
 void free_buffer_head(struct buffer_head *bh)
 {
 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -523,7 +523,7 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
 	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
 	struct buffer_head *bh;
 
-	bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
+	bh = alloc_buffer_head_nofail(GFP_NOFS);
 	atomic_set(&bh->b_count, 1);
 	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
 	set_bh_page(bh, real->b_page, bh_offset(real));
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -301,7 +301,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
 	 */
 	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
 
-	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+	new_bh = alloc_buffer_head_nofail(GFP_NOFS);
 	/* keep subsequent assertions sane */
 	new_bh->b_state = 0;
 	init_buffer(new_bh, NULL, NULL);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -176,6 +176,7 @@ void __breadahead(struct block_device *, sector_t block, unsigned int size);
 struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
 void invalidate_bh_lrus(void);
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
+struct buffer_head *alloc_buffer_head_nofail(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void unlock_buffer(struct buffer_head *bh);
 void __lock_buffer(struct buffer_head *bh);

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [patch 4/5] btrfs: add nofail variant of set_extent_dirty
  2010-08-24 10:50 [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc David Rientjes
  2010-08-24 10:50 ` [patch 2/5] mm: add nofail variant of kmem_cache_zalloc David Rientjes
  2010-08-24 10:50 ` [patch 3/5] fs: add nofail variant of alloc_buffer_head David Rientjes
@ 2010-08-24 10:50 ` David Rientjes
  2010-08-24 13:30   ` Peter Zijlstra
  2010-08-24 10:50 ` [patch 5/5] ntfs: remove dependency on __GFP_NOFAIL David Rientjes
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 87+ messages in thread
From: David Rientjes @ 2010-08-24 10:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Chris Mason, linux-btrfs, linux-kernel

Add set_extent_dirty_nofail().  This function is equivalent to
set_extent_dirty(), except that it will never fail because of allocation
failure and instead loop forever trying to allocate memory.

If the first allocation attempt fails, a warning will be emitted,
including a call trace.  Subsequent failures will suppress this warning.

This was added as a helper function for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/btrfs/extent-tree.c |    8 ++++----
 fs/btrfs/extent_io.c   |   19 +++++++++++++++++++
 fs/btrfs/extent_io.h   |    2 ++
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3831,9 +3831,9 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 
-			set_extent_dirty(info->pinned_extents,
+			set_extent_dirty_nofail(info->pinned_extents,
 					 bytenr, bytenr + num_bytes - 1,
-					 GFP_NOFS | __GFP_NOFAIL);
+					 GFP_NOFS);
 		}
 		btrfs_put_block_group(cache);
 		total -= num_bytes;
@@ -3872,8 +3872,8 @@ static int pin_down_extent(struct btrfs_root *root,
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
 
-	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
-			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+	set_extent_dirty_nofail(root->fs_info->pinned_extents, bytenr,
+			 bytenr + num_bytes - 1, GFP_NOFS);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -940,6 +940,25 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 			      NULL, mask);
 }
 
+/*
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	int ret;
+
+	for (;;) {
+		ret = set_extent_dirty(tree, start, end, mask);
+		if (ret != -ENOMEM)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+				"(flags=0x%x)\n",
+				mask);
+	}
+}
+
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -197,6 +197,8 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		   gfp_t mask);
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
+int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [patch 5/5] ntfs: remove dependency on __GFP_NOFAIL
  2010-08-24 10:50 [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc David Rientjes
                   ` (2 preceding siblings ...)
  2010-08-24 10:50 ` [patch 4/5] btrfs: add nofail variant of set_extent_dirty David Rientjes
@ 2010-08-24 10:50 ` David Rientjes
  2010-08-24 12:15   ` [Cluster-devel] " Jan Kara
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-24 10:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Anton Altaparmakov, linux-ntfs-dev, linux-kernel

Reimplement ntfs_malloc_nofs_nofail() to loop forever calling
ntfs_malloc_nofs() until the allocation succeeds.

If the first allocation attempt fails, a warning will be emitted,
including a call trace.  Subsequent failures will suppress this warning.

This was added as a helper function for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/ntfs/malloc.h |   14 ++++++++++++--
 1 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -76,11 +76,21 @@ static inline void *ntfs_malloc_nofs(unsigned long size)
  * This function guarantees that the allocation will succeed.  It will sleep
  * for as long as it takes to complete the allocation.
  *
- * If there was insufficient memory to complete the request, return NULL.
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
  */
 static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
 {
-	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
+	void *ret;
+
+	for (;;) {
+		ret = ntfs_malloc_nofs(size);
+		if (ret)
+			return ret;
+		WARN_ONCE(1, "Out of memory, no fallback implemented "
+				"(size=%lu)\n",
+				size);
+	}
 }
 
 static inline void ntfs_free(void *addr)

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-24 10:50 [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc David Rientjes
@ 2010-08-24 12:15   ` Jan Kara
  2010-08-24 10:50 ` [patch 3/5] fs: add nofail variant of alloc_buffer_head David Rientjes
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 87+ messages in thread
From: Jan Kara @ 2010-08-24 12:15 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jens Axboe, Jan Kara, Frederic Weisbecker,
	linux-raid, linux-btrfs, cluster-devel, linux-ext4,
	reiserfs-devel, linux-kernel

On Tue 24-08-10 03:50:19, David Rientjes wrote:
> Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail().  These
> functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
> respectively, except that they will never return NULL and instead loop
> forever trying to allocate memory.
> 
> If the first allocation attempt fails, a warning will be emitted,
> including a call trace.  Subsequent failures will suppress this warning.
> 
> These were added as helper functions for documentation and auditability.
> No future callers should be added.
  This looks like a cleaner approach than the previous one. You can add
Acked-by: Jan Kara <jack@suse.cz>
  for the JBD part.

								Honza
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  drivers/md/dm-region-hash.c |    2 +-
>  fs/btrfs/inode.c            |    2 +-
>  fs/gfs2/log.c               |    2 +-
>  fs/gfs2/rgrp.c              |   18 ++++---------
>  fs/jbd/transaction.c        |   11 ++------
>  fs/reiserfs/journal.c       |    3 +-
>  include/linux/slab.h        |   55 +++++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 68 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
> --- a/drivers/md/dm-region-hash.c
> +++ b/drivers/md/dm-region-hash.c
> @@ -290,7 +290,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
>  
>  	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
>  	if (unlikely(!nreg))
> -		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
> +		nreg = kmalloc_nofail(sizeof(*nreg), GFP_NOIO);
>  
>  	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
>  		      DM_RH_CLEAN : DM_RH_NOSYNC;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1967,7 +1967,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
>  	if (atomic_add_unless(&inode->i_count, -1, 1))
>  		return;
>  
> -	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
> +	delayed = kmalloc_nofail(sizeof(*delayed), GFP_NOFS);
>  	delayed->inode = inode;
>  
>  	spin_lock(&fs_info->delayed_iput_lock);
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -709,7 +709,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
>  	}
>  	trace_gfs2_log_flush(sdp, 1);
>  
> -	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
> +	ai = kzalloc_nofail(sizeof(struct gfs2_ail), GFP_NOFS);
>  	INIT_LIST_HEAD(&ai->ai_ail1_list);
>  	INIT_LIST_HEAD(&ai->ai_ail2_list);
>  
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -1440,8 +1440,8 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
>  		rgrp_blk++;
>  
>  		if (!bi->bi_clone) {
> -			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
> -					       GFP_NOFS | __GFP_NOFAIL);
> +			bi->bi_clone = kmalloc_nofail(bi->bi_bh->b_size,
> +					       GFP_NOFS);
>  			memcpy(bi->bi_clone + bi->bi_offset,
>  			       bi->bi_bh->b_data + bi->bi_offset,
>  			       bi->bi_len);
> @@ -1759,9 +1759,6 @@ fail:
>   * @block: the block
>   *
>   * Figure out what RG a block belongs to and add that RG to the list
> - *
> - * FIXME: Don't use NOFAIL
> - *
>   */
>  
>  void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
> @@ -1789,8 +1786,8 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
>  	if (rlist->rl_rgrps == rlist->rl_space) {
>  		new_space = rlist->rl_space + 10;
>  
> -		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
> -			      GFP_NOFS | __GFP_NOFAIL);
> +		tmp = kcalloc_nofail(new_space, sizeof(struct gfs2_rgrpd *),
> +			      GFP_NOFS);
>  
>  		if (rlist->rl_rgd) {
>  			memcpy(tmp, rlist->rl_rgd,
> @@ -1811,17 +1808,14 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
>   * @rlist: the list of resource groups
>   * @state: the lock state to acquire the RG lock in
>   * @flags: the modifier flags for the holder structures
> - *
> - * FIXME: Don't use NOFAIL
> - *
>   */
>  
>  void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
>  {
>  	unsigned int x;
>  
> -	rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
> -				GFP_NOFS | __GFP_NOFAIL);
> +	rlist->rl_ghs = kcalloc_nofail(rlist->rl_rgrps,
> +				sizeof(struct gfs2_holder), GFP_NOFS);
>  	for (x = 0; x < rlist->rl_rgrps; x++)
>  		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
>  				state, 0,
> diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
> --- a/fs/jbd/transaction.c
> +++ b/fs/jbd/transaction.c
> @@ -98,14 +98,9 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
>  	}
>  
>  alloc_transaction:
> -	if (!journal->j_running_transaction) {
> -		new_transaction = kzalloc(sizeof(*new_transaction),
> -						GFP_NOFS|__GFP_NOFAIL);
> -		if (!new_transaction) {
> -			ret = -ENOMEM;
> -			goto out;
> -		}
> -	}
> +	if (!journal->j_running_transaction)
> +		new_transaction = kzalloc_nofail(sizeof(*new_transaction),
> +						GFP_NOFS);
>  
>  	jbd_debug(3, "New handle %p going live.\n", handle);
>  
> diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
> --- a/fs/reiserfs/journal.c
> +++ b/fs/reiserfs/journal.c
> @@ -2593,8 +2593,7 @@ static int journal_read(struct super_block *sb)
>  static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
>  {
>  	struct reiserfs_journal_list *jl;
> -	jl = kzalloc(sizeof(struct reiserfs_journal_list),
> -		     GFP_NOFS | __GFP_NOFAIL);
> +	jl = kzalloc_nofail(sizeof(struct reiserfs_journal_list), GFP_NOFS);
>  	INIT_LIST_HEAD(&jl->j_list);
>  	INIT_LIST_HEAD(&jl->j_working_list);
>  	INIT_LIST_HEAD(&jl->j_tail_bh_list);
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -334,6 +334,61 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
>  	return kmalloc_node(size, flags | __GFP_ZERO, node);
>  }
>  
> +/**
> + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> + * @size: how many bytes of memory are required.
> + * @flags: the type of memory to allocate (see kmalloc).
> + *
> + * NOTE: no new callers of this function should be implemented!
> + * All memory allocations should be failable whenever possible.
> + */
> +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> +{
> +	void *ret;
> +
> +	for (;;) {
> +		ret = kmalloc(size, flags);
> +		if (ret)
> +			return ret;
> +		WARN_ONCE(1, "Out of memory, no fallback implemented "
> +				"(size=%lu flags=0x%x)\n",
> +				size, flags);
> +	}
> +}
> +
> +/**
> + * kcalloc_nofail - infinitely loop until kcalloc() succeeds.
> + * @n: number of elements.
> + * @size: element size.
> + * @flags: the type of memory to allocate (see kcalloc).
> + *
> + * NOTE: no new callers of this function should be implemented!
> + * All memory allocations should be failable whenever possible.
> + */
> +static inline void *kcalloc_nofail(size_t n, size_t size, gfp_t flags)
> +{
> +	void *ret;
> +
> +	for (;;) {
> +		ret = kcalloc(n, size, flags);
> +		if (ret)
> +			return ret;
> +		WARN_ONCE(1, "Out of memory, no fallback implemented "
> +				"(n=%lu size=%lu flags=0x%x)\n",
> +				n, size, flags);
> +	}
> +}
> +
> +/**
> + * kzalloc_nofail - infinitely loop until kzalloc() succeeds.
> + * @size: how many bytes of memory are required.
> + * @flags: the type of memory to allocate (see kzalloc).
> + */
> +static inline void *kzalloc_nofail(size_t size, gfp_t flags)
> +{
> +	return kmalloc_nofail(size, flags | __GFP_ZERO);
> +}
> +
>  void __init kmem_cache_init_late(void);
>  
>  #endif	/* _LINUX_SLAB_H */
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [Cluster-devel] [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
@ 2010-08-24 12:15   ` Jan Kara
  0 siblings, 0 replies; 87+ messages in thread
From: Jan Kara @ 2010-08-24 12:15 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On Tue 24-08-10 03:50:19, David Rientjes wrote:
> Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail().  These
> functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
> respectively, except that they will never return NULL and instead loop
> forever trying to allocate memory.
> 
> If the first allocation attempt fails, a warning will be emitted,
> including a call trace.  Subsequent failures will suppress this warning.
> 
> These were added as helper functions for documentation and auditability.
> No future callers should be added.
  This looks like a cleaner approach than the previous one. You can add
Acked-by: Jan Kara <jack@suse.cz>
  for the JBD part.

								Honza
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  drivers/md/dm-region-hash.c |    2 +-
>  fs/btrfs/inode.c            |    2 +-
>  fs/gfs2/log.c               |    2 +-
>  fs/gfs2/rgrp.c              |   18 ++++---------
>  fs/jbd/transaction.c        |   11 ++------
>  fs/reiserfs/journal.c       |    3 +-
>  include/linux/slab.h        |   55 +++++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 68 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
> --- a/drivers/md/dm-region-hash.c
> +++ b/drivers/md/dm-region-hash.c
> @@ -290,7 +290,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
>  
>  	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
>  	if (unlikely(!nreg))
> -		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
> +		nreg = kmalloc_nofail(sizeof(*nreg), GFP_NOIO);
>  
>  	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
>  		      DM_RH_CLEAN : DM_RH_NOSYNC;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1967,7 +1967,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
>  	if (atomic_add_unless(&inode->i_count, -1, 1))
>  		return;
>  
> -	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
> +	delayed = kmalloc_nofail(sizeof(*delayed), GFP_NOFS);
>  	delayed->inode = inode;
>  
>  	spin_lock(&fs_info->delayed_iput_lock);
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -709,7 +709,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
>  	}
>  	trace_gfs2_log_flush(sdp, 1);
>  
> -	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
> +	ai = kzalloc_nofail(sizeof(struct gfs2_ail), GFP_NOFS);
>  	INIT_LIST_HEAD(&ai->ai_ail1_list);
>  	INIT_LIST_HEAD(&ai->ai_ail2_list);
>  
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -1440,8 +1440,8 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
>  		rgrp_blk++;
>  
>  		if (!bi->bi_clone) {
> -			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
> -					       GFP_NOFS | __GFP_NOFAIL);
> +			bi->bi_clone = kmalloc_nofail(bi->bi_bh->b_size,
> +					       GFP_NOFS);
>  			memcpy(bi->bi_clone + bi->bi_offset,
>  			       bi->bi_bh->b_data + bi->bi_offset,
>  			       bi->bi_len);
> @@ -1759,9 +1759,6 @@ fail:
>   * @block: the block
>   *
>   * Figure out what RG a block belongs to and add that RG to the list
> - *
> - * FIXME: Don't use NOFAIL
> - *
>   */
>  
>  void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
> @@ -1789,8 +1786,8 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
>  	if (rlist->rl_rgrps == rlist->rl_space) {
>  		new_space = rlist->rl_space + 10;
>  
> -		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
> -			      GFP_NOFS | __GFP_NOFAIL);
> +		tmp = kcalloc_nofail(new_space, sizeof(struct gfs2_rgrpd *),
> +			      GFP_NOFS);
>  
>  		if (rlist->rl_rgd) {
>  			memcpy(tmp, rlist->rl_rgd,
> @@ -1811,17 +1808,14 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
>   * @rlist: the list of resource groups
>   * @state: the lock state to acquire the RG lock in
>   * @flags: the modifier flags for the holder structures
> - *
> - * FIXME: Don't use NOFAIL
> - *
>   */
>  
>  void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
>  {
>  	unsigned int x;
>  
> -	rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
> -				GFP_NOFS | __GFP_NOFAIL);
> +	rlist->rl_ghs = kcalloc_nofail(rlist->rl_rgrps,
> +				sizeof(struct gfs2_holder), GFP_NOFS);
>  	for (x = 0; x < rlist->rl_rgrps; x++)
>  		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
>  				state, 0,
> diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
> --- a/fs/jbd/transaction.c
> +++ b/fs/jbd/transaction.c
> @@ -98,14 +98,9 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
>  	}
>  
>  alloc_transaction:
> -	if (!journal->j_running_transaction) {
> -		new_transaction = kzalloc(sizeof(*new_transaction),
> -						GFP_NOFS|__GFP_NOFAIL);
> -		if (!new_transaction) {
> -			ret = -ENOMEM;
> -			goto out;
> -		}
> -	}
> +	if (!journal->j_running_transaction)
> +		new_transaction = kzalloc_nofail(sizeof(*new_transaction),
> +						GFP_NOFS);
>  
>  	jbd_debug(3, "New handle %p going live.\n", handle);
>  
> diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
> --- a/fs/reiserfs/journal.c
> +++ b/fs/reiserfs/journal.c
> @@ -2593,8 +2593,7 @@ static int journal_read(struct super_block *sb)
>  static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
>  {
>  	struct reiserfs_journal_list *jl;
> -	jl = kzalloc(sizeof(struct reiserfs_journal_list),
> -		     GFP_NOFS | __GFP_NOFAIL);
> +	jl = kzalloc_nofail(sizeof(struct reiserfs_journal_list), GFP_NOFS);
>  	INIT_LIST_HEAD(&jl->j_list);
>  	INIT_LIST_HEAD(&jl->j_working_list);
>  	INIT_LIST_HEAD(&jl->j_tail_bh_list);
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -334,6 +334,61 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
>  	return kmalloc_node(size, flags | __GFP_ZERO, node);
>  }
>  
> +/**
> + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> + * @size: how many bytes of memory are required.
> + * @flags: the type of memory to allocate (see kmalloc).
> + *
> + * NOTE: no new callers of this function should be implemented!
> + * All memory allocations should be failable whenever possible.
> + */
> +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> +{
> +	void *ret;
> +
> +	for (;;) {
> +		ret = kmalloc(size, flags);
> +		if (ret)
> +			return ret;
> +		WARN_ONCE(1, "Out of memory, no fallback implemented "
> +				"(size=%lu flags=0x%x)\n",
> +				size, flags);
> +	}
> +}
> +
> +/**
> + * kcalloc_nofail - infinitely loop until kcalloc() succeeds.
> + * @n: number of elements.
> + * @size: element size.
> + * @flags: the type of memory to allocate (see kcalloc).
> + *
> + * NOTE: no new callers of this function should be implemented!
> + * All memory allocations should be failable whenever possible.
> + */
> +static inline void *kcalloc_nofail(size_t n, size_t size, gfp_t flags)
> +{
> +	void *ret;
> +
> +	for (;;) {
> +		ret = kcalloc(n, size, flags);
> +		if (ret)
> +			return ret;
> +		WARN_ONCE(1, "Out of memory, no fallback implemented "
> +				"(n=%lu size=%lu flags=0x%x)\n",
> +				n, size, flags);
> +	}
> +}
> +
> +/**
> + * kzalloc_nofail - infinitely loop until kzalloc() succeeds.
> + * @size: how many bytes of memory are required.
> + * @flags: the type of memory to allocate (see kzalloc).
> + */
> +static inline void *kzalloc_nofail(size_t size, gfp_t flags)
> +{
> +	return kmalloc_nofail(size, flags | __GFP_ZERO);
> +}
> +
>  void __init kmem_cache_init_late(void);
>  
>  #endif	/* _LINUX_SLAB_H */
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR



^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 3/5] fs: add nofail variant of alloc_buffer_head
  2010-08-24 10:50 ` [patch 3/5] fs: add nofail variant of alloc_buffer_head David Rientjes
@ 2010-08-24 12:17     ` Jan Kara
  0 siblings, 0 replies; 87+ messages in thread
From: Jan Kara @ 2010-08-24 12:17 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Steven Whitehouse, Jens Axboe, Jan Kara,
	cluster-devel, linux-ext4, linux-kernel

On Tue 24-08-10 03:50:30, David Rientjes wrote:
> Add alloc_buffer_head_nofail().  This function is equivalent to
> alloc_buffer_head(), except that it will never return NULL and instead
> loop forever trying to allocate memory.
> 
> If the first allocation attempt fails, a warning will be emitted,
> including a call trace.  Subsequent failures will suppress this warning.
> 
> This was added as a helper function for documentation and auditability.
> No future callers should be added.
  Acked-by: Jan Kara <jack@suse.cz>
for the JBD part here as well.

								Honza
> 
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  fs/buffer.c                 |   18 ++++++++++++++++++
>  fs/gfs2/log.c               |    2 +-
>  fs/jbd/journal.c            |    2 +-
>  include/linux/buffer_head.h |    1 +
>  4 files changed, 21 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -3238,6 +3238,24 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
>  }
>  EXPORT_SYMBOL(alloc_buffer_head);
>  
> +/*
> + * NOTE: no new callers of this function should be implemented!
> + * All memory allocations should be failable whenever possible.
> + */
> +struct buffer_head *alloc_buffer_head_nofail(gfp_t gfp_flags)
> +{
> +	struct buffer_head *ret;
> +
> +	for (;;) {
> +		ret = alloc_buffer_head(gfp_flags);
> +		if (ret)
> +			return ret;
> +		WARN_ONCE(1, "Out of memory; no fallback implemented "
> +				"(flags=0x%x)\n",
> +				gfp_flags);
> +	}
> +}
> +
>  void free_buffer_head(struct buffer_head *bh)
>  {
>  	BUG_ON(!list_empty(&bh->b_assoc_buffers));
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -523,7 +523,7 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
>  	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
>  	struct buffer_head *bh;
>  
> -	bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
> +	bh = alloc_buffer_head_nofail(GFP_NOFS);
>  	atomic_set(&bh->b_count, 1);
>  	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
>  	set_bh_page(bh, real->b_page, bh_offset(real));
> diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
> --- a/fs/jbd/journal.c
> +++ b/fs/jbd/journal.c
> @@ -301,7 +301,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
>  	 */
>  	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
>  
> -	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
> +	new_bh = alloc_buffer_head_nofail(GFP_NOFS);
>  	/* keep subsequent assertions sane */
>  	new_bh->b_state = 0;
>  	init_buffer(new_bh, NULL, NULL);
> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
> --- a/include/linux/buffer_head.h
> +++ b/include/linux/buffer_head.h
> @@ -176,6 +176,7 @@ void __breadahead(struct block_device *, sector_t block, unsigned int size);
>  struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
>  void invalidate_bh_lrus(void);
>  struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
> +struct buffer_head *alloc_buffer_head_nofail(gfp_t gfp_flags);
>  void free_buffer_head(struct buffer_head * bh);
>  void unlock_buffer(struct buffer_head *bh);
>  void __lock_buffer(struct buffer_head *bh);
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [Cluster-devel] [patch 3/5] fs: add nofail variant of alloc_buffer_head
@ 2010-08-24 12:17     ` Jan Kara
  0 siblings, 0 replies; 87+ messages in thread
From: Jan Kara @ 2010-08-24 12:17 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On Tue 24-08-10 03:50:30, David Rientjes wrote:
> Add alloc_buffer_head_nofail().  This function is equivalent to
> alloc_buffer_head(), except that it will never return NULL and instead
> loop forever trying to allocate memory.
> 
> If the first allocation attempt fails, a warning will be emitted,
> including a call trace.  Subsequent failures will suppress this warning.
> 
> This was added as a helper function for documentation and auditability.
> No future callers should be added.
  Acked-by: Jan Kara <jack@suse.cz>
for the JBD part here as well.

								Honza
> 
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  fs/buffer.c                 |   18 ++++++++++++++++++
>  fs/gfs2/log.c               |    2 +-
>  fs/jbd/journal.c            |    2 +-
>  include/linux/buffer_head.h |    1 +
>  4 files changed, 21 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -3238,6 +3238,24 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
>  }
>  EXPORT_SYMBOL(alloc_buffer_head);
>  
> +/*
> + * NOTE: no new callers of this function should be implemented!
> + * All memory allocations should be failable whenever possible.
> + */
> +struct buffer_head *alloc_buffer_head_nofail(gfp_t gfp_flags)
> +{
> +	struct buffer_head *ret;
> +
> +	for (;;) {
> +		ret = alloc_buffer_head(gfp_flags);
> +		if (ret)
> +			return ret;
> +		WARN_ONCE(1, "Out of memory; no fallback implemented "
> +				"(flags=0x%x)\n",
> +				gfp_flags);
> +	}
> +}
> +
>  void free_buffer_head(struct buffer_head *bh)
>  {
>  	BUG_ON(!list_empty(&bh->b_assoc_buffers));
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -523,7 +523,7 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
>  	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
>  	struct buffer_head *bh;
>  
> -	bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
> +	bh = alloc_buffer_head_nofail(GFP_NOFS);
>  	atomic_set(&bh->b_count, 1);
>  	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
>  	set_bh_page(bh, real->b_page, bh_offset(real));
> diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
> --- a/fs/jbd/journal.c
> +++ b/fs/jbd/journal.c
> @@ -301,7 +301,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
>  	 */
>  	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
>  
> -	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
> +	new_bh = alloc_buffer_head_nofail(GFP_NOFS);
>  	/* keep subsequent assertions sane */
>  	new_bh->b_state = 0;
>  	init_buffer(new_bh, NULL, NULL);
> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
> --- a/include/linux/buffer_head.h
> +++ b/include/linux/buffer_head.h
> @@ -176,6 +176,7 @@ void __breadahead(struct block_device *, sector_t block, unsigned int size);
>  struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
>  void invalidate_bh_lrus(void);
>  struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
> +struct buffer_head *alloc_buffer_head_nofail(gfp_t gfp_flags);
>  void free_buffer_head(struct buffer_head * bh);
>  void unlock_buffer(struct buffer_head *bh);
>  void __lock_buffer(struct buffer_head *bh);
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR



^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-24 10:50 [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc David Rientjes
                   ` (4 preceding siblings ...)
  2010-08-24 12:15   ` [Cluster-devel] " Jan Kara
@ 2010-08-24 13:29 ` Peter Zijlstra
  2010-08-24 13:33   ` Jens Axboe
                     ` (2 more replies)
  2010-09-02  1:02 ` [patch v2 " David Rientjes
  6 siblings, 3 replies; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-24 13:29 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jens Axboe, Jan Kara, Frederic Weisbecker,
	linux-raid, linux-btrfs, cluster-devel, linux-ext4,
	reiserfs-devel, linux-kernel

On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
> These were added as helper functions for documentation and auditability.
> No future callers should be added.

git grep GFP_NOFAIL isn't auditable enough?

might as well declare these functions depricated if you really want to
do this.

FWIW I think mason said he'd fix btrfs to not suck like this 'soon'.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 4/5] btrfs: add nofail variant of set_extent_dirty
  2010-08-24 10:50 ` [patch 4/5] btrfs: add nofail variant of set_extent_dirty David Rientjes
@ 2010-08-24 13:30   ` Peter Zijlstra
  0 siblings, 0 replies; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-24 13:30 UTC (permalink / raw)
  To: David Rientjes; +Cc: Andrew Morton, Chris Mason, linux-btrfs, linux-kernel

On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
> Add set_extent_dirty_nofail().  This function is equivalent to
> set_extent_dirty(), except that it will never fail because of allocation
> failure and instead loop forever trying to allocate memory.
> 
> If the first allocation attempt fails, a warning will be emitted,
> including a call trace.  Subsequent failures will suppress this warning.
> 
> This was added as a helper function for documentation and auditability.
> No future callers should be added.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  fs/btrfs/extent-tree.c |    8 ++++----
>  fs/btrfs/extent_io.c   |   19 +++++++++++++++++++
>  fs/btrfs/extent_io.h   |    2 ++
>  3 files changed, 25 insertions(+), 4 deletions(-)

I'd much rather someone help mason to clean this up.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and  kzalloc
  2010-08-24 13:29 ` Peter Zijlstra
@ 2010-08-24 13:33   ` Jens Axboe
  2010-08-24 20:11       ` David Rientjes
  2010-08-24 13:55   ` Dave Chinner
  2010-08-24 20:08   ` David Rientjes
  2 siblings, 1 reply; 87+ messages in thread
From: Jens Axboe @ 2010-08-24 13:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Rientjes, Andrew Morton, Neil Brown, Alasdair G Kergon,
	Chris Mason, Steven Whitehouse, Jan Kara, Frederic Weisbecker,
	linux-raid, linux-btrfs, cluster-devel, linux-ext4,
	reiserfs-devel, linux-kernel

On 2010-08-24 15:29, Peter Zijlstra wrote:
> On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
>> These were added as helper functions for documentation and auditability.
>> No future callers should be added.
> 
> git grep GFP_NOFAIL isn't auditable enough?
> 
> might as well declare these functions depricated if you really want to
> do this.

Agree, adding a helper function would seem to be going in the reverse
direction imho. Nobody will read that comment on top of the function.

Should be possible to warn at build time for anyone using __GFP_NOFAIL
without wrapping it in a function.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-24 13:29 ` Peter Zijlstra
  2010-08-24 13:33   ` Jens Axboe
@ 2010-08-24 13:55   ` Dave Chinner
  2010-08-24 14:03     ` Peter Zijlstra
  2010-08-24 20:12     ` David Rientjes
  2010-08-24 20:08   ` David Rientjes
  2 siblings, 2 replies; 87+ messages in thread
From: Dave Chinner @ 2010-08-24 13:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Rientjes, Andrew Morton, Neil Brown, Alasdair G Kergon,
	Chris Mason, Steven Whitehouse, Jens Axboe, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Tue, Aug 24, 2010 at 03:29:18PM +0200, Peter Zijlstra wrote:
> On Tue, 2010-08-24 at 03:50 -0700, David Rientjes wrote:
> > These were added as helper functions for documentation and auditability.
> > No future callers should be added.
> 
> git grep GFP_NOFAIL isn't auditable enough?
> 
> might as well declare these functions depricated if you really want to
> do this.

Also, if you are going to add tight loops, you might want to put a
backoff in the loops like "congestion_wait(BLK_RW_ASYNC, HZ/50);" so
that they don't spin....

FWIW, in all this "allocations can't fail" churn, no one has noticed
that XFS has been doing these "allocations can't fail" loop in
kmem_alloc() and kmem_zone_alloc(), well, forever. I can't ever
remember seeing it report a potential deadlock, though....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-24 13:55   ` Dave Chinner
@ 2010-08-24 14:03     ` Peter Zijlstra
  2010-08-24 20:12     ` David Rientjes
  1 sibling, 0 replies; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-24 14:03 UTC (permalink / raw)
  To: Dave Chinner
  Cc: David Rientjes, Andrew Morton, Neil Brown, Alasdair G Kergon,
	Chris Mason, Steven Whitehouse, Jens Axboe, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Tue, 2010-08-24 at 23:55 +1000, Dave Chinner wrote:
> no one has noticed
> that XFS has been doing these "allocations can't fail" loop in
> kmem_alloc() and kmem_zone_alloc(), well, forever 

I occasionally check if its still there and cry a little ;-)

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-24 13:29 ` Peter Zijlstra
  2010-08-24 13:33   ` Jens Axboe
  2010-08-24 13:55   ` Dave Chinner
@ 2010-08-24 20:08   ` David Rientjes
  2 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-24 20:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jens Axboe, Jan Kara, Frederic Weisbecker,
	linux-raid, linux-btrfs, cluster-devel, linux-ext4,
	reiserfs-devel, linux-kernel

On Tue, 24 Aug 2010, Peter Zijlstra wrote:

> > These were added as helper functions for documentation and auditability.
> > No future callers should be added.
> 
> git grep GFP_NOFAIL isn't auditable enough?
> 

I'm trying to remove that bit, and __GFP_REPEAT, to simplify and remove 
several branches from the page allocator.

> might as well declare these functions depricated if you really want to
> do this.
> 
> FWIW I think mason said he'd fix btrfs to not suck like this 'soon'.
> 

Great!

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-24 13:33   ` Jens Axboe
@ 2010-08-24 20:11       ` David Rientjes
  0 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-24 20:11 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Peter Zijlstra, Andrew Morton, Neil Brown, Alasdair G Kergon,
	Chris Mason, Steven Whitehouse, Jan Kara, Frederic Weisbecker,
	linux-raid, linux-btrfs, cluster-devel, linux-ext4,
	reiserfs-devel, linux-kernel

On Tue, 24 Aug 2010, Jens Axboe wrote:

> Should be possible to warn at build time for anyone using __GFP_NOFAIL
> without wrapping it in a function.
> 

We could make this __deprecated functions as Peter suggested if you think 
build time warnings for existing users would be helpful.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and  kzalloc
@ 2010-08-24 20:11       ` David Rientjes
  0 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-24 20:11 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Peter Zijlstra, Andrew Morton, Neil Brown, Alasdair G Kergon,
	Chris Mason, Steven Whitehouse, Jan Kara, Frederic Weisbecker,
	linux-raid, linux-btrfs, cluster-devel, linux-ext4,
	reiserfs-devel, linux-kernel

On Tue, 24 Aug 2010, Jens Axboe wrote:

> Should be possible to warn at build time for anyone using __GFP_NOFAIL
> without wrapping it in a function.
> 

We could make this __deprecated functions as Peter suggested if you think 
build time warnings for existing users would be helpful.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-24 13:55   ` Dave Chinner
  2010-08-24 14:03     ` Peter Zijlstra
@ 2010-08-24 20:12     ` David Rientjes
  1 sibling, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-24 20:12 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Peter Zijlstra, Andrew Morton, Neil Brown, Alasdair G Kergon,
	Chris Mason, Steven Whitehouse, Jens Axboe, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Tue, 24 Aug 2010, Dave Chinner wrote:

> > git grep GFP_NOFAIL isn't auditable enough?
> > 
> > might as well declare these functions depricated if you really want to
> > do this.
> 
> Also, if you are going to add tight loops, you might want to put a
> backoff in the loops like "congestion_wait(BLK_RW_ASYNC, HZ/50);" so
> that they don't spin....
> 

These loops don't actually loop at all, all users are passing 
order < PAGE_ALLOC_COSTLY_ORDER which implicitly loop forever in the page 
allocator without killing anything (they are all GFP_NOIO or GFP_NOFS, so 
the oom killer isn't involved).

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-24 20:11       ` David Rientjes
  (?)
@ 2010-08-25 11:24       ` Ted Ts'o
  2010-08-25 11:35         ` Peter Zijlstra
  2010-08-25 14:13         ` Peter Zijlstra
  -1 siblings, 2 replies; 87+ messages in thread
From: Ted Ts'o @ 2010-08-25 11:24 UTC (permalink / raw)
  To: David Rientjes
  Cc: Jens Axboe, Peter Zijlstra, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Tue, Aug 24, 2010 at 01:11:26PM -0700, David Rientjes wrote:
> On Tue, 24 Aug 2010, Jens Axboe wrote:
> 
> > Should be possible to warn at build time for anyone using __GFP_NOFAIL
> > without wrapping it in a function.
> > 
> 
> We could make this __deprecated functions as Peter suggested if you think 
> build time warnings for existing users would be helpful.

Let me take a few steps backwards and look at the problem from a
somewhat higher level.

Part of the problem is that we have a few places in the kernel where
failure is really not an option --- or rather, if we're going to fail
while we're in the middle of doing a commit, our choices really are
(a) retry the loop in the jbd layer (which Andrew really doesn't
like), (b) keep our own private cache of free memory so we don't fail
and/or loop, (c) fail the file system and mark it read-only, or (d)
panic.

There are other places where we can fail safely (for example, in jbd's
start_this_handle, although that just pushes the layer up the stack,
and ultimately, to userspace where most userspace programs don't
really expect ENOMEM to get returned by a random disk write --- how
much do _you_ trust a random GNOME or KDE developer to do correct
error checking and do something sane at the application?)

So we can mark the retry loop helper function as deprecated, and that
will make some of these cases go away, but ultimately if we're going
to fail the memory allocation, something bad is going to happen, and
the only question is whether we want to have something bad happen by
looping in the memory allocator, or to force the file system to
panic/oops the system, or have random application die and/or lose user
data because they don't expect write() to return ENOMEM.

So at some level it would be nice if we had a few different levels of
"we *really* need this memory".  Level 0 might be, "if we can't get
it, no biggie, we'll figure out some other way around it.  I doubt
there is much at level 0, but in theory, we could have some good
citizens that fall in that camp and who simply will bypass some
optimization if they can't get the memory.  Level 1 might be, if you
can't get the memory, we will propagate a failure up to userspace, but
it's probably a relatively "safe" place to fail (i.e., when the user
is opening a file).  Level 2 might be, "if you can't get the memory,
we will propgate a failure up to userspace, but it's at a space where
most applications are lazy f*ckers, and this may lead to serious
application errors" (example: close(2), and this is a file system that
only pushes the file to the server at close time, e.g. AFS).  Level 3
might be, "if you can't get the memory, I'm going to fail the file
system, or some other global subsystem, that will probably result in
the system crashing or needing to be rebooted".

We can ignore this problem and pretend it doesn't exist at the memory
allocator level, but that means the callers are going to be doing
their own thing to try to avoid having really bad things happening at
really-low-memory occasions.  And this may mean looping, whether we
mark the function as deprecated or not.

This is becoming even more of an issue now given that with
containerization, we have jokers who are forcing tasks to run in very
small memory containers, which means that failures can happen far more
frequently --- and in some cases, just because the process running the
task happens to be in an extremely constrained memory cgroup, doesn't
mean that failing the entire system is really such a great idea.  Or
maybe that means memory cgroups are kinda busted.  :-)

      	   	 		    	  	   - Ted

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 11:24       ` Ted Ts'o
@ 2010-08-25 11:35         ` Peter Zijlstra
  2010-08-25 11:57           ` Ted Ts'o
  2010-08-25 14:13         ` Peter Zijlstra
  1 sibling, 1 reply; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 11:35 UTC (permalink / raw)
  To: Ted Ts'o
  Cc: David Rientjes, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> Part of the problem is that we have a few places in the kernel where
> failure is really not an option --- or rather, if we're going to fail
> while we're in the middle of doing a commit, our choices really are
> (a) retry the loop in the jbd layer (which Andrew really doesn't
> like), (b) keep our own private cache of free memory so we don't fail
> and/or loop, (c) fail the file system and mark it read-only, or (d)
> panic. 

d) do the allocation before you're committed to going fwd and can still
fail and back out.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 11:35         ` Peter Zijlstra
@ 2010-08-25 11:57           ` Ted Ts'o
  2010-08-25 12:48             ` Peter Zijlstra
  0 siblings, 1 reply; 87+ messages in thread
From: Ted Ts'o @ 2010-08-25 11:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Rientjes, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, Aug 25, 2010 at 01:35:32PM +0200, Peter Zijlstra wrote:
> On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> > Part of the problem is that we have a few places in the kernel where
> > failure is really not an option --- or rather, if we're going to fail
> > while we're in the middle of doing a commit, our choices really are
> > (a) retry the loop in the jbd layer (which Andrew really doesn't
> > like), (b) keep our own private cache of free memory so we don't fail
> > and/or loop, (c) fail the file system and mark it read-only, or (d)
> > panic. 
> 
> d) do the allocation before you're committed to going fwd and can still
> fail and back out.

Sure in some cases that can be done, but the commit has to happen at
some point, or we run out of journal space, at which point we're back
to (c) or (d).

						- Ted



^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 11:57           ` Ted Ts'o
@ 2010-08-25 12:48             ` Peter Zijlstra
  2010-08-25 12:52               ` Peter Zijlstra
  2010-08-25 13:24               ` Dave Chinner
  0 siblings, 2 replies; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 12:48 UTC (permalink / raw)
  To: Ted Ts'o
  Cc: David Rientjes, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 07:57 -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 01:35:32PM +0200, Peter Zijlstra wrote:
> > On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> > > Part of the problem is that we have a few places in the kernel where
> > > failure is really not an option --- or rather, if we're going to fail
> > > while we're in the middle of doing a commit, our choices really are
> > > (a) retry the loop in the jbd layer (which Andrew really doesn't
> > > like), (b) keep our own private cache of free memory so we don't fail
> > > and/or loop, (c) fail the file system and mark it read-only, or (d)
> > > panic. 
> > 
> > d) do the allocation before you're committed to going fwd and can still
> > fail and back out.
> 
> Sure in some cases that can be done, but the commit has to happen at
> some point, or we run out of journal space, at which point we're back
> to (c) or (d).

Well (b) sounds a lot saner than either of those. Simply revert to a
state that is sub-optimal but has bounded memory use and reserve that
memory up-front. That way you can always get out of a tight memory spot.

Its what the block layer has always done to avoid the memory deadlock
situation, it has a private stash of BIOs that is big enough to always
service some IO, and as long as IO is happening stuff keeps moving fwd
and we don't deadlock.

Filesystems might have a slightly harder time creating such a bounded
state because there might be more involved like journals and the like,
but still it should be possible to create something like that (my swap
over nfs patches created such a state for the network rx side of
things).

Also, we cannot let our fear of crappy userspace get in the way of doing
sensible things. Your example of write(2) returning -ENOMEM is not
correct though, the syscall (and the page_mkwrite callback for mmap()s)
happens before we actually dirty the data and need to write things out,
so we can always simply wait for memory to become available to dirty.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 12:48             ` Peter Zijlstra
@ 2010-08-25 12:52               ` Peter Zijlstra
  2010-08-25 13:20                 ` Theodore Tso
  2010-08-25 13:24               ` Dave Chinner
  1 sibling, 1 reply; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 12:52 UTC (permalink / raw)
  To: Ted Ts'o
  Cc: David Rientjes, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 14:48 +0200, Peter Zijlstra wrote:

> > > On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:

> > > > (a) retry the loop in the jbd layer (which Andrew really doesn't
> > > > like), (b) keep our own private cache of free memory so we don't fail
> > > > and/or loop, 

> Well (b) sounds a lot saner than either of those. Simply revert to a
> state that is sub-optimal but has bounded memory use and reserve that
> memory up-front. That way you can always get out of a tight memory spot.

Also, there's a good reason for disliking (a), its a deadlock scenario,
suppose we need to write out data to get free pages, but the writing out
is blocked on requiring free pages.

There's really nothing the page allocator can do to help you there, its
a situation you have to avoid getting into.



^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 12:52               ` Peter Zijlstra
@ 2010-08-25 13:20                 ` Theodore Tso
  2010-08-25 13:31                   ` Peter Zijlstra
  2010-08-25 13:34                   ` Peter Zijlstra
  0 siblings, 2 replies; 87+ messages in thread
From: Theodore Tso @ 2010-08-25 13:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Rientjes, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel


On Aug 25, 2010, at 8:52 AM, Peter Zijlstra wrote:
> Also, there's a good reason for disliking (a), its a deadlock scenario,
> suppose we need to write out data to get free pages, but the writing out
> is blocked on requiring free pages.
> 
> There's really nothing the page allocator can do to help you there, its
> a situation you have to avoid getting into.

Well, if all of these users start having their own private pools of emergency memory, I'm not sure that's such a great idea either.

And in some cases, there *is* extra memory.  For example, if the reason why the page allocator failed is because there isn't enough memory in the current process's cgroup, maybe it's important enough that the kernel code might decide to say, "violate the cgroup constraints --- it's more important that we not bring down the entire system" than to honor whatever yahoo decided that a particular cgroup has been set down to something ridiculous like 512mb, when there's plenty of free physical memory --- but not in that cgroup.

-- Ted


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 12:48             ` Peter Zijlstra
  2010-08-25 12:52               ` Peter Zijlstra
@ 2010-08-25 13:24               ` Dave Chinner
  2010-08-25 13:35                 ` Peter Zijlstra
  1 sibling, 1 reply; 87+ messages in thread
From: Dave Chinner @ 2010-08-25 13:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ted Ts'o, David Rientjes, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jan Kara, Frederic Weisbecker, linux-raid, linux-btrfs,
	cluster-devel, linux-ext4, reiserfs-devel, linux-kernel

On Wed, Aug 25, 2010 at 02:48:36PM +0200, Peter Zijlstra wrote:
> On Wed, 2010-08-25 at 07:57 -0400, Ted Ts'o wrote:
> > On Wed, Aug 25, 2010 at 01:35:32PM +0200, Peter Zijlstra wrote:
> > > On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> > > > Part of the problem is that we have a few places in the kernel where
> > > > failure is really not an option --- or rather, if we're going to fail
> > > > while we're in the middle of doing a commit, our choices really are
> > > > (a) retry the loop in the jbd layer (which Andrew really doesn't
> > > > like), (b) keep our own private cache of free memory so we don't fail
> > > > and/or loop, (c) fail the file system and mark it read-only, or (d)
> > > > panic. 
> > > 
> > > d) do the allocation before you're committed to going fwd and can still
> > > fail and back out.
> > 
> > Sure in some cases that can be done, but the commit has to happen at
> > some point, or we run out of journal space, at which point we're back
> > to (c) or (d).
> 
> Well (b) sounds a lot saner than either of those. Simply revert to a
> state that is sub-optimal but has bounded memory use and reserve that
> memory up-front. That way you can always get out of a tight memory spot.
> 
> Its what the block layer has always done to avoid the memory deadlock
> situation, it has a private stash of BIOs that is big enough to always
> service some IO, and as long as IO is happening stuff keeps moving fwd
> and we don't deadlock.
> 
> Filesystems might have a slightly harder time creating such a bounded
> state because there might be more involved like journals and the like,
> but still it should be possible to create something like that (my swap
> over nfs patches created such a state for the network rx side of
> things).

Filesystems are way more complex than the block layer - the block
layer simply doesn't have to handle situations were thread X is
holding A, B and C, while thread Y needs C to complete the
transaction. thread Y is the user of the low memory pool, but has
almost depleted it and so even if we swith to thread X, the pool doe
snot have enouhg memory for X to complete and allow us to switch
back to Y and have it complete, freeing the memory from the pool
that it holds.

That is, the guarantee that we will always make progress simply does
not exist in filesystems, so a mempool-like concept seems to me to
be doomed from the start....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 13:20                 ` Theodore Tso
@ 2010-08-25 13:31                   ` Peter Zijlstra
  2010-08-25 20:43                     ` David Rientjes
  2010-08-25 13:34                   ` Peter Zijlstra
  1 sibling, 1 reply; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 13:31 UTC (permalink / raw)
  To: Theodore Tso
  Cc: David Rientjes, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 09:20 -0400, Theodore Tso wrote:
> On Aug 25, 2010, at 8:52 AM, Peter Zijlstra wrote:
> > Also, there's a good reason for disliking (a), its a deadlock scenario,
> > suppose we need to write out data to get free pages, but the writing out
> > is blocked on requiring free pages.
> > 
> > There's really nothing the page allocator can do to help you there, its
> > a situation you have to avoid getting into.
> 
> Well, if all of these users start having their own private pools of
> emergency memory, I'm not sure that's such a great idea either.
> 
> And in some cases, there *is* extra memory.  For example, if the
> reason why the page allocator failed is because there isn't enough
> memory in the current process's cgroup, maybe it's important enough
> that the kernel code might decide to say, "violate the cgroup
> constraints --- it's more important that we not bring down the entire
> system" than to honor whatever yahoo decided that a particular cgroup
> has been set down to something ridiculous like 512mb, when there's
> plenty of free physical memory --- but not in that cgroup.

I'm not sure, but I think the cgroup thing doesn't account kernel
allocations, in which case the above problem doesn't exist.

For the cpuset case we punch through the cpuset constraints for kernel
allocations (unless __GFP_HARDWALL).

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 13:20                 ` Theodore Tso
  2010-08-25 13:31                   ` Peter Zijlstra
@ 2010-08-25 13:34                   ` Peter Zijlstra
  1 sibling, 0 replies; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 13:34 UTC (permalink / raw)
  To: Theodore Tso
  Cc: David Rientjes, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 09:20 -0400, Theodore Tso wrote:
> Well, if all of these users start having their own private pools of
> emergency memory, I'm not sure that's such a great idea either.

That's a secondary problem, and could be reduced by something like the
memory reservation system I provided in the swap-over-nfs patches. Of
course, when all these users can actually happen concurrently there's
really nothing much you can do about it.



^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 13:24               ` Dave Chinner
@ 2010-08-25 13:35                 ` Peter Zijlstra
  2010-08-25 20:53                   ` Ted Ts'o
                                     ` (2 more replies)
  0 siblings, 3 replies; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 13:35 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Ted Ts'o, David Rientjes, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jan Kara, Frederic Weisbecker, linux-raid, linux-btrfs,
	cluster-devel, linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 23:24 +1000, Dave Chinner wrote:
> 
> That is, the guarantee that we will always make progress simply does
> not exist in filesystems, so a mempool-like concept seems to me to
> be doomed from the start.... 

While I appreciate that it might be somewhat (a lot) harder for a
filesystem to provide that guarantee, I'd be deeply worried about your
claim that its impossible.

It would render a system without swap very prone to deadlocks. Even with
the very tight dirty page accounting we currently have you can fill all
your memory with anonymous pages, at which point there's nothing free
and you require writeout of dirty pages to succeed.


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 11:24       ` Ted Ts'o
  2010-08-25 11:35         ` Peter Zijlstra
@ 2010-08-25 14:13         ` Peter Zijlstra
  1 sibling, 0 replies; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 14:13 UTC (permalink / raw)
  To: Ted Ts'o
  Cc: David Rientjes, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 07:24 -0400, Ted Ts'o wrote:
> There are other places where we can fail safely (for example, in jbd's
> start_this_handle, although that just pushes the layer up the stack,
> and ultimately, to userspace where most userspace programs don't
> really expect ENOMEM to get returned by a random disk write 

While talking with Chris about this, if you can indeed push the error
out that far you can basically ensure this particular fs-op does not
complicate the journal commit and thereby limit the number of extra
entries in your journal, and thus the amount of memory required.

So once stuff starts failing, push out ops back out of the filesystem
code, force a journal commit, and then let these ops retry. There is no
need to actually push the -ENOMEM all the way back to userspace.



^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 13:31                   ` Peter Zijlstra
@ 2010-08-25 20:43                     ` David Rientjes
  2010-08-25 20:55                       ` Peter Zijlstra
  0 siblings, 1 reply; 87+ messages in thread
From: David Rientjes @ 2010-08-25 20:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Theodore Tso, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 25 Aug 2010, Peter Zijlstra wrote:

> I'm not sure, but I think the cgroup thing doesn't account kernel
> allocations, in which case the above problem doesn't exist.
> 

Right, the only cgroup that does is cpusets because it binds the memory 
allocations to a set of nodes.

> For the cpuset case we punch through the cpuset constraints for kernel
> allocations (unless __GFP_HARDWALL).
> 

__GFP_HARDWALL doesn't mean that the allocation won't be constrained, this 
is a common misconception.  __GFP_HARDWALL only prevents us from looking 
at our cpuset.mem_exclusive flag and checking our nearest common ancestor 
cpuset if we can block.

The cpusets case is actually the easiest to fix: use GFP_ATOMIC.  
GFP_ATOMIC allocations aren't bound by any cpuset and, in the general 
case, can allocate below the min watermark because of 
ALLOC_HARD | ALLOC_HARDER in the page allocator which creates the notion 
of "memory reserves" available to these tasks.  Then, success really 
depends on the setting of the watermarks instead.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 13:35                 ` Peter Zijlstra
@ 2010-08-25 20:53                   ` Ted Ts'o
  2010-08-25 20:59                     ` David Rientjes
                                       ` (4 more replies)
  2010-08-25 20:58                   ` David Rientjes
  2010-08-26  0:09                   ` Dave Chinner
  2 siblings, 5 replies; 87+ messages in thread
From: Ted Ts'o @ 2010-08-25 20:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Chinner, David Rientjes, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jan Kara, Frederic Weisbecker, linux-raid, linux-btrfs,
	cluster-devel, linux-ext4, reiserfs-devel, linux-kernel

On Wed, Aug 25, 2010 at 03:35:42PM +0200, Peter Zijlstra wrote:
> 
> While I appreciate that it might be somewhat (a lot) harder for a
> filesystem to provide that guarantee, I'd be deeply worried about your
> claim that its impossible.
> 
> It would render a system without swap very prone to deadlocks. Even with
> the very tight dirty page accounting we currently have you can fill all
> your memory with anonymous pages, at which point there's nothing free
> and you require writeout of dirty pages to succeed.

For file systems that do delayed allocation, the situation is very
similar to swapping over NFS.  Sometimes in order to make some free
memory, you need to spend some free memory...  which implies that for
these file systems, being more aggressive about triggering writeout,
and being more aggressive about throttling processes which are
creating too many dirty pages, especially dirty delayed allocaiton
pages (regardless of whether this is via write(2) or accessing mmapped
memory), is a really good idea.

A pool of free pages which is reserved for routines that are doing
page cleaning would probably also be a good idea.  Maybe that's just
retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
need our own special pool, or maybe we need to dynamically resize the
GFP_ATOMIC pool based on how many subsystems might need to use it....

Just brainstorming here; what do people think?

							- Ted

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 20:43                     ` David Rientjes
@ 2010-08-25 20:55                       ` Peter Zijlstra
  2010-08-25 21:11                         ` David Rientjes
  0 siblings, 1 reply; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 20:55 UTC (permalink / raw)
  To: David Rientjes
  Cc: Theodore Tso, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 13:43 -0700, David Rientjes wrote:
> The cpusets case is actually the easiest to fix: use GFP_ATOMIC.  

I don't think that's a valid usage of GFP_ATOMIC, I think we should
fallback to outside the cpuset for kernel allocations by default.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 13:35                 ` Peter Zijlstra
  2010-08-25 20:53                   ` Ted Ts'o
@ 2010-08-25 20:58                   ` David Rientjes
  2010-08-25 21:11                     ` Christoph Lameter
  2010-08-26  0:09                   ` Dave Chinner
  2 siblings, 1 reply; 87+ messages in thread
From: David Rientjes @ 2010-08-25 20:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Chinner, Ted Ts'o, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jan Kara, Frederic Weisbecker, linux-raid, linux-btrfs,
	cluster-devel, linux-ext4, reiserfs-devel, linux-kernel

On Wed, 25 Aug 2010, Peter Zijlstra wrote:

> While I appreciate that it might be somewhat (a lot) harder for a
> filesystem to provide that guarantee, I'd be deeply worried about your
> claim that its impossible.
> 
> It would render a system without swap very prone to deadlocks. Even with
> the very tight dirty page accounting we currently have you can fill all
> your memory with anonymous pages, at which point there's nothing free
> and you require writeout of dirty pages to succeed.
> 

While I'd really love for callers to be able to gracefully handle getting 
a NULL back from the page allocator in all cases, it's not a prerequisite 
for this patchset.  This patchset actually does nothing interesting except 
removing the __GFP_NOFAIL bit from their gfp mask.  All of these 
allocations already loop looking for memory because they have orders that 
are less than PAGE_ALLOC_COSTLY_ORDER (which defaults to 3).  So the loops 
in kzalloc_nofail(), etc., never actually loop.

Demanding that the page allocator return order-3 memory in any context is 
a non-starter, so I'm not really interested in that.  I'm more concerned 
about proper error handling being implemented for these callers iff 
someone redefines PAGE_ALLOC_COSTLY_ORDER to something else, perhaps even 
0.

Callers can, when desperate for memory, use GFP_ATOMIC to use some memory 
reserves across zones, hopefully order-0 and not an egregious amount.  But 
the remainder of the burden really is back on the caller when this is 
depleted or it needs higher order allocs to be fixed in a way that doesn't 
rely on memory that doesn't exist.  That's an implementation choice by the 
caller and I agree that some failsafe behavior is the only way that we 
don't get really bad results like corrupted user data or filesystems.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 20:53                   ` Ted Ts'o
@ 2010-08-25 20:59                     ` David Rientjes
  2010-08-25 20:59                     ` David Rientjes
                                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-25 20:59 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Dave Chinner, Jens Axboe,
	Andrew Morton, Ne

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> A pool of free pages which is reserved for routines that are doing
> page cleaning would probably also be a good idea.  Maybe that's just
> retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
> need our own special pool, or maybe we need to dynamically resize the
> GFP_ATOMIC pool based on how many subsystems might need to use it....
> 

PF_MEMALLOC is used for those tasks, not GFP_ATOMIC.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 20:53                   ` Ted Ts'o
@ 2010-08-25 20:59                       ` David Rientjes
  2010-08-25 20:59                     ` David Rientjes
                                         ` (3 subsequent siblings)
  4 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-25 20:59 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Dave Chinner, Jens Axboe,
	Andrew Morton, Ne

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> A pool of free pages which is reserved for routines that are doing
> page cleaning would probably also be a good idea.  Maybe that's just
> retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
> need our own special pool, or maybe we need to dynamically resize the
> GFP_ATOMIC pool based on how many subsystems might need to use it....
> 

PF_MEMALLOC is used for those tasks, not GFP_ATOMIC.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
@ 2010-08-25 20:59                       ` David Rientjes
  0 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-25 20:59 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Dave Chinner, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> A pool of free pages which is reserved for routines that are doing
> page cleaning would probably also be a good idea.  Maybe that's just
> retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
> need our own special pool, or maybe we need to dynamically resize the
> GFP_ATOMIC pool based on how many subsystems might need to use it....
> 

PF_MEMALLOC is used for those tasks, not GFP_ATOMIC.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 20:53                   ` Ted Ts'o
  2010-08-25 20:59                     ` David Rientjes
@ 2010-08-25 20:59                     ` David Rientjes
  2010-08-25 20:59                     ` David Rientjes
                                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-25 20:59 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Dave Chinner, Jens Axboe,
	Andrew Morton, Ne

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> A pool of free pages which is reserved for routines that are doing
> page cleaning would probably also be a good idea.  Maybe that's just
> retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
> need our own special pool, or maybe we need to dynamically resize the
> GFP_ATOMIC pool based on how many subsystems might need to use it....
> 

PF_MEMALLOC is used for those tasks, not GFP_ATOMIC.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 20:53                   ` Ted Ts'o
  2010-08-25 20:59                     ` David Rientjes
  2010-08-25 20:59                     ` David Rientjes
@ 2010-08-25 20:59                     ` David Rientjes
  2010-08-25 20:59                       ` David Rientjes
  2010-08-25 21:35                     ` Peter Zijlstra
  4 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-25 20:59 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Dave Chinner, Jens Axboe,
	Andrew Morton, Ne

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> A pool of free pages which is reserved for routines that are doing
> page cleaning would probably also be a good idea.  Maybe that's just
> retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
> need our own special pool, or maybe we need to dynamically resize the
> GFP_ATOMIC pool based on how many subsystems might need to use it....
> 

PF_MEMALLOC is used for those tasks, not GFP_ATOMIC.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 20:58                   ` David Rientjes
@ 2010-08-25 21:11                     ` Christoph Lameter
  2010-08-25 21:21                       ` Peter Zijlstra
  2010-08-25 21:23                       ` David Rientjes
  0 siblings, 2 replies; 87+ messages in thread
From: Christoph Lameter @ 2010-08-25 21:11 UTC (permalink / raw)
  To: David Rientjes
  Cc: Peter Zijlstra, Dave Chinner, Ted Ts'o, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

We already have __GFP_NOFAIL behavior for slab allocations since
a __GFP_NOFAIL flag is passed through to the page allocator if no objects
are available.

The page allocator will do the same as when called directly with
__GFP_NOFAIL and so we have consistent behavior regardless of the
allocator used.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 20:55                       ` Peter Zijlstra
@ 2010-08-25 21:11                         ` David Rientjes
  2010-08-25 21:27                           ` Peter Zijlstra
  0 siblings, 1 reply; 87+ messages in thread
From: David Rientjes @ 2010-08-25 21:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Theodore Tso, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 25 Aug 2010, Peter Zijlstra wrote:

> > The cpusets case is actually the easiest to fix: use GFP_ATOMIC.  
> 
> I don't think that's a valid usage of GFP_ATOMIC, I think we should
> fallback to outside the cpuset for kernel allocations by default.

Cpusets doesn't enforce isolation for only user memory, it's always bound 
_all_ allocations that aren't atomic or in irq context (or oom killed 
tasks).  Allowing slab, for example, to be allocated in other cpusets 
could cause them to oom themselves since they are bound by the same memory 
isolation policy that all other cpusets are.  We'd get random oom 
conditions in cpusets only depending on where the slab was allocated at 
now fault to those applications themselves, and that's certainly not a 
situation we want.  The memory controller cgroup also has slab accounting 
on their TODO list.

If you think GFP_ATOMIC is inappropriate in these contexts, then they are 
by definition blockable.  So this seems like a good candidate for using 
memory compaction since we're talking only about PAGE_ALLOC_COSTLY_ORDER 
and higher allocs, even though it's only currently configurable for 
hugepages.

There's still no hard guarantee that the memory will allocatable 
(GFP_KERNEL, the compaction, then GFP_ATOMIC could all still fail), but I 
don't see how continuously looping the page allocator is possibly supposed 
to help in these situations.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 21:11                     ` Christoph Lameter
@ 2010-08-25 21:21                       ` Peter Zijlstra
  2010-08-25 21:23                       ` David Rientjes
  1 sibling, 0 replies; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 21:21 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David Rientjes, Dave Chinner, Ted Ts'o, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Wed, 2010-08-25 at 16:11 -0500, Christoph Lameter wrote:
> We already have __GFP_NOFAIL behavior for slab allocations since
> a __GFP_NOFAIL flag is passed through to the page allocator if no objects
> are available.
> 
> The page allocator will do the same as when called directly with
> __GFP_NOFAIL and so we have consistent behavior regardless of the
> allocator used.

Thank's for quoting the context to which you're replying, that really
helps parsing things..


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 21:11                     ` Christoph Lameter
  2010-08-25 21:21                       ` Peter Zijlstra
@ 2010-08-25 21:23                       ` David Rientjes
  2010-08-25 21:35                         ` Christoph Lameter
  1 sibling, 1 reply; 87+ messages in thread
From: David Rientjes @ 2010-08-25 21:23 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, Dave Chinner, Ted Ts'o, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Wed, 25 Aug 2010, Christoph Lameter wrote:

> We already have __GFP_NOFAIL behavior for slab allocations since
> a __GFP_NOFAIL flag is passed through to the page allocator if no objects
> are available.
> 

It all depends on what flags are passed to kmalloc(), slab nor slub 
enforce __GFP_NOFAIL behavior themselves.  In slab, cache_grow() will 
return NULL depending on whether the page allocator returns NULL, and that 
would only happen for __GFP_NORETRY or
cachep->gfp->gfporder >= PAGE_ALLOC_COSTLY_ORDER.  In slub, the default 
order is tried with __GFP_NORETRY and if it returns NULL, the higher order 
alloc will fail under the same circumstances.  So the nofail behavior for 
slab depends only on the flags passed from the caller.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 21:11                         ` David Rientjes
@ 2010-08-25 21:27                           ` Peter Zijlstra
  2010-08-25 23:11                             ` David Rientjes
  0 siblings, 1 reply; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 21:27 UTC (permalink / raw)
  To: David Rientjes
  Cc: Theodore Tso, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 14:11 -0700, David Rientjes wrote:
> 
> There's still no hard guarantee that the memory will allocatable 
> (GFP_KERNEL, the compaction, then GFP_ATOMIC could all still fail), but I 
> don't see how continuously looping the page allocator is possibly supposed 
> to help in these situations. 

Why do you think I'm a proponent of that behaviour?

I've been arguing that the existance of GFP_NOFAIL is the bug, and I
started the whole discussion because your patchset didn't outline the
purpose of its existance, it merely changes __GFP_NOFAIL usage into
$foo_nofail() functions, which on its own is a rather daft change.

Optimizing the page allocator by removing those conditional from its
innards into an outer loop not used by most callers seems a fine goal,
but you didn't state that.

Also, I like the postfix proposed by Andi better: _i_suck() :-)

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 21:23                       ` David Rientjes
@ 2010-08-25 21:35                         ` Christoph Lameter
  2010-08-25 23:05                           ` David Rientjes
  0 siblings, 1 reply; 87+ messages in thread
From: Christoph Lameter @ 2010-08-25 21:35 UTC (permalink / raw)
  To: David Rientjes
  Cc: Peter Zijlstra, Dave Chinner, Ted Ts'o, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Wed, 25 Aug 2010, David Rientjes wrote:

> It all depends on what flags are passed to kmalloc(), slab nor slub
> enforce __GFP_NOFAIL behavior themselves.  In slab, cache_grow() will
> return NULL depending on whether the page allocator returns NULL, and that
> would only happen for __GFP_NORETRY or
> cachep->gfp->gfporder >= PAGE_ALLOC_COSTLY_ORDER.  In slub, the default
> order is tried with __GFP_NORETRY and if it returns NULL, the higher order
> alloc will fail under the same circumstances.  So the nofail behavior for
> slab depends only on the flags passed from the caller.

If the higher order fails in slub then an order 0 alloc is attempted
without __GFP_NORETRY. In both cases the nofail behavior of the page
allocator determines the outcode.

True if the caller mixes in __GFP_NORETRY then you may still get NULL. But
that is an issue that can be resolved by the caller.


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 20:53                   ` Ted Ts'o
                                       ` (3 preceding siblings ...)
  2010-08-25 20:59                       ` David Rientjes
@ 2010-08-25 21:35                     ` Peter Zijlstra
  4 siblings, 0 replies; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-25 21:35 UTC (permalink / raw)
  To: Ted Ts'o
  Cc: Dave Chinner, David Rientjes, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jan Kara, Frederic Weisbecker, linux-raid, linux-btrfs,
	cluster-devel, linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 16:53 -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 03:35:42PM +0200, Peter Zijlstra wrote:
> > 
> > While I appreciate that it might be somewhat (a lot) harder for a
> > filesystem to provide that guarantee, I'd be deeply worried about your
> > claim that its impossible.
> > 
> > It would render a system without swap very prone to deadlocks. Even with
> > the very tight dirty page accounting we currently have you can fill all
> > your memory with anonymous pages, at which point there's nothing free
> > and you require writeout of dirty pages to succeed.
> 
> For file systems that do delayed allocation, the situation is very
> similar to swapping over NFS.  Sometimes in order to make some free
> memory, you need to spend some free memory... 

Which means you need to be able to compute a bounded amount of that
memory.

>  which implies that for
> these file systems, being more aggressive about triggering writeout,
> and being more aggressive about throttling processes which are
> creating too many dirty pages, especially dirty delayed allocaiton
> pages (regardless of whether this is via write(2) or accessing mmapped
> memory), is a really good idea.

That seems unrelated, the VM has a strict dirty limit and controls
writeback when needed. That part works.

> A pool of free pages which is reserved for routines that are doing
> page cleaning would probably also be a good idea.  Maybe that's just
> retrying with GFP_ATOMIC if a normal allocation fails, or maybe we
> need our own special pool, or maybe we need to dynamically resize the
> GFP_ATOMIC pool based on how many subsystems might need to use it....

We have a smallish reserve, accessible with PF_MEMALLOC, but its use is
not regulated nor bounded, it just mostly works good enough.



^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 21:35                         ` Christoph Lameter
@ 2010-08-25 23:05                           ` David Rientjes
  2010-08-26  1:30                             ` Christoph Lameter
  0 siblings, 1 reply; 87+ messages in thread
From: David Rientjes @ 2010-08-25 23:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, Dave Chinner, Ted Ts'o, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Wed, 25 Aug 2010, Christoph Lameter wrote:

> If the higher order fails in slub then an order 0 alloc is attempted
> without __GFP_NORETRY. In both cases the nofail behavior of the page
> allocator determines the outcode.
> 

Right, I thought you said the slab layer passes __GFP_NOFAIL when there's 
no objects available.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 21:27                           ` Peter Zijlstra
@ 2010-08-25 23:11                             ` David Rientjes
  2010-08-26  0:19                               ` Ted Ts'o
  0 siblings, 1 reply; 87+ messages in thread
From: David Rientjes @ 2010-08-25 23:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Theodore Tso, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 25 Aug 2010, Peter Zijlstra wrote:

> > There's still no hard guarantee that the memory will allocatable 
> > (GFP_KERNEL, the compaction, then GFP_ATOMIC could all still fail), but I 
> > don't see how continuously looping the page allocator is possibly supposed 
> > to help in these situations. 
> 
> Why do you think I'm a proponent of that behaviour?
> 

I don't, I was agreeing with what you're saying :)

> I've been arguing that the existance of GFP_NOFAIL is the bug, and I
> started the whole discussion because your patchset didn't outline the
> purpose of its existance, it merely changes __GFP_NOFAIL usage into
> $foo_nofail() functions, which on its own is a rather daft change.
> 

I originally pushed these to the callers, but Andrew thought it would be 
better so that we could audit the existing users that have no fallback 
behavior, even though they could go implement it on their own (like 
getblk() which actually does try _some_ memory freeing).  It eliminates 
the flag from the page allocator and saves branches in the slowpath.  We 
don't need this logic in the allocator itself, it can exist at a higher 
level and, with deprecation, will hopefully be incentive enough for those 
subsystems to fix the issue.

I'll repropose the patchset with __deprecated as you suggested.  Thanks!

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 13:35                 ` Peter Zijlstra
  2010-08-25 20:53                   ` Ted Ts'o
  2010-08-25 20:58                   ` David Rientjes
@ 2010-08-26  0:09                   ` Dave Chinner
  2 siblings, 0 replies; 87+ messages in thread
From: Dave Chinner @ 2010-08-26  0:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ted Ts'o, David Rientjes, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jan Kara, Frederic Weisbecker, linux-raid, linux-btrfs,
	cluster-devel, linux-ext4, reiserfs-devel, linux-kernel

On Wed, Aug 25, 2010 at 03:35:42PM +0200, Peter Zijlstra wrote:
> On Wed, 2010-08-25 at 23:24 +1000, Dave Chinner wrote:
> > 
> > That is, the guarantee that we will always make progress simply does
> > not exist in filesystems, so a mempool-like concept seems to me to
> > be doomed from the start.... 
> 
> While I appreciate that it might be somewhat (a lot) harder for a
> filesystem to provide that guarantee, I'd be deeply worried about your
> claim that its impossible.

I didn't say impossible, just that there's no way we can always
guarantee of forward progress with a specific, bound pool of memory.

Sure, we know what the worst case amount of log space is needed for
each transaction (i.e. how many pages that will be dirtied), but
that does not take into account all the blocks that need to be read
to make those modifications, the memory needed for stuff like btree
cursors, log tickets, transaction commit vectors, btree blocks
needed to do the searches, etc.  A typical transaction reservation
on a 4k block filesystem is between 200-400k (it's worst case), and
if you add in all the other allocations that might be required,
we're at the order of requiring megabytes of RAM to guarantee a
single transaction will succeed in low memory conditions. The exact
requirement is very difficult to quantify algorithmically, but for a
single transaction it should be possible.

However, consider the case of running a thousand concurrent
transactions and in the middle of that the system runs out of
memory. All the transactions need memory allocation to succeed, some
are blocked waiting for resources held in other transactions, etc.
Firstly, how to you stop all the transactions from making further
progress to serialise access to the low memory pool?  Secondly, how
do you select which transaction you want to use the low memory pool?
What do you do if the selected transaction then blocks on a resource
held by another transaction (which you can't know ahead of time)? Do
you switch to another thread and hope the pool doesn't run dry? What
do you do when (not if) the memory pool runs dry?

I'm sure this could be done, but it's lot of difficult, unrewarding
work that greatly increases code complexity, touches a massive
amount of the filesystem code base, exponentially increases the test
matrix, is likely to have significant operational overhead, and even
then there's no guarantee that we've got it right. That doesn't
sound like a good solution to me.

> It would render a system without swap very prone to deadlocks. Even with
> the very tight dirty page accounting we currently have you can fill all
> your memory with anonymous pages, at which point there's nothing free
> and you require writeout of dirty pages to succeed.

Then don't allow anonymous pages to fill all of memory when there is
no swap available - i.e. keep a larger pool of free memory when
there is no swap available. That's a much simpler solution than
turning all the filesystems upside down to try to make them not need
allocation....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 23:11                             ` David Rientjes
@ 2010-08-26  0:19                               ` Ted Ts'o
  2010-08-26  0:30                                 ` David Rientjes
                                                   ` (3 more replies)
  0 siblings, 4 replies; 87+ messages in thread
From: Ted Ts'o @ 2010-08-26  0:19 UTC (permalink / raw)
  To: David Rientjes
  Cc: Peter Zijlstra, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, Aug 25, 2010 at 04:11:38PM -0700, David Rientjes wrote:
> 
> I'll repropose the patchset with __deprecated as you suggested.  Thanks!

And what Dave and I are saying is that we'll either need to do our on
loop to avoid the deprecation warning, or the use of the deprecated
function will probably be used forever...

	      	       	     	 - Ted

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  0:19                               ` Ted Ts'o
@ 2010-08-26  0:30                                   ` David Rientjes
  2010-08-26  0:30                                 ` David Rientjes
                                                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26  0:30 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > I'll repropose the patchset with __deprecated as you suggested.  Thanks!
> 
> And what Dave and I are saying is that we'll either need to do our on
> loop to avoid the deprecation warning, or the use of the deprecated
> function will probably be used forever...
> 

We certainly hope that nobody will reimplement the same function without 
the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
where there's no looping at a higher level.  So perhaps the best 
alternative is to implement the same _nofail() functions but do a 
WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?

I think it's really sad that the caller can't know what the upper bounds 
of its memory requirement are ahead of time or at least be able to 
implement a memory freeing function when kmalloc() returns NULL.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  0:19                               ` Ted Ts'o
@ 2010-08-26  0:30                                 ` David Rientjes
  2010-08-26  0:30                                 ` David Rientjes
                                                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26  0:30 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > I'll repropose the patchset with __deprecated as you suggested.  Thanks!
> 
> And what Dave and I are saying is that we'll either need to do our on
> loop to avoid the deprecation warning, or the use of the deprecated
> function will probably be used forever...
> 

We certainly hope that nobody will reimplement the same function without 
the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
where there's no looping at a higher level.  So perhaps the best 
alternative is to implement the same _nofail() functions but do a 
WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?

I think it's really sad that the caller can't know what the upper bounds 
of its memory requirement are ahead of time or at least be able to 
implement a memory freeing function when kmalloc() returns NULL.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
@ 2010-08-26  0:30                                   ` David Rientjes
  0 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26  0:30 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jan Kara, Frederic Weisbecker, linux-raid, linux-btrfs,
	cluster-devel, linux-ext4, reiserfs-devel, linux-kernel

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > I'll repropose the patchset with __deprecated as you suggested.  Thanks!
> 
> And what Dave and I are saying is that we'll either need to do our on
> loop to avoid the deprecation warning, or the use of the deprecated
> function will probably be used forever...
> 

We certainly hope that nobody will reimplement the same function without 
the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
where there's no looping at a higher level.  So perhaps the best 
alternative is to implement the same _nofail() functions but do a 
WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?

I think it's really sad that the caller can't know what the upper bounds 
of its memory requirement are ahead of time or at least be able to 
implement a memory freeing function when kmalloc() returns NULL.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  0:19                               ` Ted Ts'o
                                                   ` (2 preceding siblings ...)
  2010-08-26  0:30                                   ` David Rientjes
@ 2010-08-26  0:30                                 ` David Rientjes
  3 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26  0:30 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > I'll repropose the patchset with __deprecated as you suggested.  Thanks!
> 
> And what Dave and I are saying is that we'll either need to do our on
> loop to avoid the deprecation warning, or the use of the deprecated
> function will probably be used forever...
> 

We certainly hope that nobody will reimplement the same function without 
the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
where there's no looping at a higher level.  So perhaps the best 
alternative is to implement the same _nofail() functions but do a 
WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?

I think it's really sad that the caller can't know what the upper bounds 
of its memory requirement are ahead of time or at least be able to 
implement a memory freeing function when kmalloc() returns NULL.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  0:19                               ` Ted Ts'o
  2010-08-26  0:30                                 ` David Rientjes
@ 2010-08-26  0:30                                 ` David Rientjes
  2010-08-26  0:30                                   ` David Rientjes
  2010-08-26  0:30                                 ` David Rientjes
  3 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26  0:30 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > I'll repropose the patchset with __deprecated as you suggested.  Thanks!
> 
> And what Dave and I are saying is that we'll either need to do our on
> loop to avoid the deprecation warning, or the use of the deprecated
> function will probably be used forever...
> 

We certainly hope that nobody will reimplement the same function without 
the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
where there's no looping at a higher level.  So perhaps the best 
alternative is to implement the same _nofail() functions but do a 
WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?

I think it's really sad that the caller can't know what the upper bounds 
of its memory requirement are ahead of time or at least be able to 
implement a memory freeing function when kmalloc() returns NULL.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-25 23:05                           ` David Rientjes
@ 2010-08-26  1:30                             ` Christoph Lameter
  2010-08-26  3:12                               ` David Rientjes
  0 siblings, 1 reply; 87+ messages in thread
From: Christoph Lameter @ 2010-08-26  1:30 UTC (permalink / raw)
  To: David Rientjes
  Cc: Peter Zijlstra, Dave Chinner, Ted Ts'o, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Wed, 25 Aug 2010, David Rientjes wrote:

> On Wed, 25 Aug 2010, Christoph Lameter wrote:
>
> > If the higher order fails in slub then an order 0 alloc is attempted
> > without __GFP_NORETRY. In both cases the nofail behavior of the page
> > allocator determines the outcode.
> >
>
> Right, I thought you said the slab layer passes __GFP_NOFAIL when there's
> no objects available.

Yes, the slab layer calls the page allocator when there are no objects
available and passes the __GFP_NOFAIL that the user may have set in the
call to the page allocator.

Why then add new functions that do the same?




^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  0:30                                   ` David Rientjes
  (?)
@ 2010-08-26  1:48                                   ` Ted Ts'o
  2010-08-26  3:09                                     ` David Rientjes
                                                       ` (7 more replies)
  -1 siblings, 8 replies; 87+ messages in thread
From: Ted Ts'o @ 2010-08-26  1:48 UTC (permalink / raw)
  To: David Rientjes
  Cc: Peter Zijlstra, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, Aug 25, 2010 at 05:30:42PM -0700, David Rientjes wrote:
> 
> We certainly hope that nobody will reimplement the same function without 
> the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> where there's no looping at a higher level.  So perhaps the best 
> alternative is to implement the same _nofail() functions but do a 
> WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?

Yeah, that sounds better.

> I think it's really sad that the caller can't know what the upper bounds 
> of its memory requirement are ahead of time or at least be able to 
> implement a memory freeing function when kmalloc() returns NULL.

Oh, we can determine an upper bound.  You might just not like it.
Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
be around 400k for a transaction.  My guess is that the worst case for
ext3/ext4 is probably around 256k or so; like XFS, most of the time,
it would be a lot less.  (At least, if data != journalled; if we are
doing data journalling and every single data block begins with
0xc03b3998U, we'll need to allocate a 4k page for every single data
block written.)  We could dynamically calculate an upper bound if we
had to.  Of course, if ext3/ext4 is attached to a network block
device, then it could get a lot worse than 256k, of course.

	      	      	      	    	- Ted

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  1:48                                   ` Ted Ts'o
  2010-08-26  3:09                                     ` David Rientjes
  2010-08-26  3:09                                     ` David Rientjes
@ 2010-08-26  3:09                                     ` David Rientjes
  2010-08-26  3:09                                       ` David Rientjes
                                                       ` (4 subsequent siblings)
  7 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26  3:09 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > We certainly hope that nobody will reimplement the same function without 
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> > where there's no looping at a higher level.  So perhaps the best 
> > alternative is to implement the same _nofail() functions but do a 
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
> 
> Yeah, that sounds better.
> 

Ok, and we'll make it a WARN_ON_ONCE() to be nice to the kernel log.  
Although the current patchset does this with WARN_ON_ONCE(1, ...) instead, 
this serves to ensure that we aren't dependent on the page allocator's 
implementation to always loop for order < PAGE_ALLOC_COSTLY_ORDER in which 
case the loop in the _nofail() functions would actually do something.

> > I think it's really sad that the caller can't know what the upper bounds 
> > of its memory requirement are ahead of time or at least be able to 
> > implement a memory freeing function when kmalloc() returns NULL.
> 
> Oh, we can determine an upper bound.  You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.  My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less.  (At least, if data != journalled; if we are
> doing data journalling and every single data block begins with
> 0xc03b3998U, we'll need to allocate a 4k page for every single data
> block written.)  We could dynamically calculate an upper bound if we
> had to.  Of course, if ext3/ext4 is attached to a network block
> device, then it could get a lot worse than 256k, of course.
> 

On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL 
is 5086 pages, or ~20MB.  GFP_ATOMIC would allow access to ~12MB of that, 
so perhaps we should consider this is an acceptable abuse of GFP_ATOMIC as 
a fallback behavior when GFP_NOFS or GFP_NOIO fails?

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  1:48                                   ` Ted Ts'o
  2010-08-26  3:09                                     ` David Rientjes
@ 2010-08-26  3:09                                     ` David Rientjes
  2010-08-26  3:09                                     ` David Rientjes
                                                       ` (5 subsequent siblings)
  7 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26  3:09 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > We certainly hope that nobody will reimplement the same function without 
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> > where there's no looping at a higher level.  So perhaps the best 
> > alternative is to implement the same _nofail() functions but do a 
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
> 
> Yeah, that sounds better.
> 

Ok, and we'll make it a WARN_ON_ONCE() to be nice to the kernel log.  
Although the current patchset does this with WARN_ON_ONCE(1, ...) instead, 
this serves to ensure that we aren't dependent on the page allocator's 
implementation to always loop for order < PAGE_ALLOC_COSTLY_ORDER in which 
case the loop in the _nofail() functions would actually do something.

> > I think it's really sad that the caller can't know what the upper bounds 
> > of its memory requirement are ahead of time or at least be able to 
> > implement a memory freeing function when kmalloc() returns NULL.
> 
> Oh, we can determine an upper bound.  You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.  My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less.  (At least, if data != journalled; if we are
> doing data journalling and every single data block begins with
> 0xc03b3998U, we'll need to allocate a 4k page for every single data
> block written.)  We could dynamically calculate an upper bound if we
> had to.  Of course, if ext3/ext4 is attached to a network block
> device, then it could get a lot worse than 256k, of course.
> 

On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL 
is 5086 pages, or ~20MB.  GFP_ATOMIC would allow access to ~12MB of that, 
so perhaps we should consider this is an acceptable abuse of GFP_ATOMIC as 
a fallback behavior when GFP_NOFS or GFP_NOIO fails?

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  1:48                                   ` Ted Ts'o
@ 2010-08-26  3:09                                       ` David Rientjes
  2010-08-26  3:09                                     ` David Rientjes
                                                         ` (6 subsequent siblings)
  7 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26  3:09 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jan Kara, Frederic Weisbecker, linux-raid, linux-btrfs,
	cluster-devel, linux-ext4, reiserfs-devel, linux-kernel

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > We certainly hope that nobody will reimplement the same function without 
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> > where there's no looping at a higher level.  So perhaps the best 
> > alternative is to implement the same _nofail() functions but do a 
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
> 
> Yeah, that sounds better.
> 

Ok, and we'll make it a WARN_ON_ONCE() to be nice to the kernel log.  
Although the current patchset does this with WARN_ON_ONCE(1, ...) instead, 
this serves to ensure that we aren't dependent on the page allocator's 
implementation to always loop for order < PAGE_ALLOC_COSTLY_ORDER in which 
case the loop in the _nofail() functions would actually do something.

> > I think it's really sad that the caller can't know what the upper bounds 
> > of its memory requirement are ahead of time or at least be able to 
> > implement a memory freeing function when kmalloc() returns NULL.
> 
> Oh, we can determine an upper bound.  You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.  My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less.  (At least, if data != journalled; if we are
> doing data journalling and every single data block begins with
> 0xc03b3998U, we'll need to allocate a 4k page for every single data
> block written.)  We could dynamically calculate an upper bound if we
> had to.  Of course, if ext3/ext4 is attached to a network block
> device, then it could get a lot worse than 256k, of course.
> 

On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL 
is 5086 pages, or ~20MB.  GFP_ATOMIC would allow access to ~12MB of that, 
so perhaps we should consider this is an acceptable abuse of GFP_ATOMIC as 
a fallback behavior when GFP_NOFS or GFP_NOIO fails?

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
@ 2010-08-26  3:09                                       ` David Rientjes
  0 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26  3:09 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > We certainly hope that nobody will reimplement the same function without 
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> > where there's no looping at a higher level.  So perhaps the best 
> > alternative is to implement the same _nofail() functions but do a 
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
> 
> Yeah, that sounds better.
> 

Ok, and we'll make it a WARN_ON_ONCE() to be nice to the kernel log.  
Although the current patchset does this with WARN_ON_ONCE(1, ...) instead, 
this serves to ensure that we aren't dependent on the page allocator's 
implementation to always loop for order < PAGE_ALLOC_COSTLY_ORDER in which 
case the loop in the _nofail() functions would actually do something.

> > I think it's really sad that the caller can't know what the upper bounds 
> > of its memory requirement are ahead of time or at least be able to 
> > implement a memory freeing function when kmalloc() returns NULL.
> 
> Oh, we can determine an upper bound.  You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.  My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less.  (At least, if data != journalled; if we are
> doing data journalling and every single data block begins with
> 0xc03b3998U, we'll need to allocate a 4k page for every single data
> block written.)  We could dynamically calculate an upper bound if we
> had to.  Of course, if ext3/ext4 is attached to a network block
> device, then it could get a lot worse than 256k, of course.
> 

On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL 
is 5086 pages, or ~20MB.  GFP_ATOMIC would allow access to ~12MB of that, 
so perhaps we should consider this is an acceptable abuse of GFP_ATOMIC as 
a fallback behavior when GFP_NOFS or GFP_NOIO fails?

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  1:48                                   ` Ted Ts'o
@ 2010-08-26  3:09                                     ` David Rientjes
  2010-08-26  3:09                                     ` David Rientjes
                                                       ` (6 subsequent siblings)
  7 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26  3:09 UTC (permalink / raw)
  To: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair

On Wed, 25 Aug 2010, Ted Ts'o wrote:

> > We certainly hope that nobody will reimplement the same function without 
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> > where there's no looping at a higher level.  So perhaps the best 
> > alternative is to implement the same _nofail() functions but do a 
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
> 
> Yeah, that sounds better.
> 

Ok, and we'll make it a WARN_ON_ONCE() to be nice to the kernel log.  
Although the current patchset does this with WARN_ON_ONCE(1, ...) instead, 
this serves to ensure that we aren't dependent on the page allocator's 
implementation to always loop for order < PAGE_ALLOC_COSTLY_ORDER in which 
case the loop in the _nofail() functions would actually do something.

> > I think it's really sad that the caller can't know what the upper bounds 
> > of its memory requirement are ahead of time or at least be able to 
> > implement a memory freeing function when kmalloc() returns NULL.
> 
> Oh, we can determine an upper bound.  You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.  My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less.  (At least, if data != journalled; if we are
> doing data journalling and every single data block begins with
> 0xc03b3998U, we'll need to allocate a 4k page for every single data
> block written.)  We could dynamically calculate an upper bound if we
> had to.  Of course, if ext3/ext4 is attached to a network block
> device, then it could get a lot worse than 256k, of course.
> 

On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL 
is 5086 pages, or ~20MB.  GFP_ATOMIC would allow access to ~12MB of that, 
so perhaps we should consider this is an acceptable abuse of GFP_ATOMIC as 
a fallback behavior when GFP_NOFS or GFP_NOIO fails?

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  1:30                             ` Christoph Lameter
@ 2010-08-26  3:12                               ` David Rientjes
  2010-08-26 14:16                                 ` Christoph Lameter
  0 siblings, 1 reply; 87+ messages in thread
From: David Rientjes @ 2010-08-26  3:12 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, Dave Chinner, Ted Ts'o, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Wed, 25 Aug 2010, Christoph Lameter wrote:

> > Right, I thought you said the slab layer passes __GFP_NOFAIL when there's
> > no objects available.
> 
> Yes, the slab layer calls the page allocator when there are no objects
> available and passes the __GFP_NOFAIL that the user may have set in the
> call to the page allocator.
> 
> Why then add new functions that do the same?
> 

Because we can remove the flag, remove branches from the page allocator 
slowpath, and none of these allocations actually are dependent on 
__GFP_NOFAIL since they are all under PAGE_ALLOC_COSTLY_ORDER.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  1:48                                   ` Ted Ts'o
                                                       ` (5 preceding siblings ...)
  2010-08-26  6:38                                     ` Dave Chinner
@ 2010-08-26  6:38                                     ` Dave Chinner
  2010-08-26  6:38                                       ` Dave Chinner
  7 siblings, 0 replies; 87+ messages in thread
From: Dave Chinner @ 2010-08-26  6:38 UTC (permalink / raw)
  To: Ted Ts'o, David Rientjes, Peter Zijlstra, Jens Axboe,
	Andrew Morton, Ne

On Wed, Aug 25, 2010 at 09:48:47PM -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 05:30:42PM -0700, David Rientjes wrote:
> > 
> > We certainly hope that nobody will reimplement the same function without 
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> > where there's no looping at a higher level.  So perhaps the best 
> > alternative is to implement the same _nofail() functions but do a 
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
> 
> Yeah, that sounds better.
> 
> > I think it's really sad that the caller can't know what the upper bounds 
> > of its memory requirement are ahead of time or at least be able to 
> > implement a memory freeing function when kmalloc() returns NULL.
> 
> Oh, we can determine an upper bound.  You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.

For a 4k block size filesystem.

If I use 64k block size directories (which XFS can even on 4k page
size machines), the maximum transaction reservation goes up to at
around 3MB, and that's just for blocks being _modified_. It's not
the limit on the amount of memory that may need to be allocated
during a transaction....

> My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less. 

Right, it usually is a lot less, but one of the big problems is that
during low memory situations memory reclaim of the metadata page
cache actually causes _more_ memory allocation during tranactions
than otherwise would occur.......

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  1:48                                   ` Ted Ts'o
                                                       ` (4 preceding siblings ...)
  2010-08-26  6:38                                     ` Dave Chinner
@ 2010-08-26  6:38                                     ` Dave Chinner
  2010-08-26  6:38                                     ` Dave Chinner
  2010-08-26  6:38                                       ` Dave Chinner
  7 siblings, 0 replies; 87+ messages in thread
From: Dave Chinner @ 2010-08-26  6:38 UTC (permalink / raw)
  To: Ted Ts'o, David Rientjes, Peter Zijlstra, Jens Axboe,
	Andrew Morton, Ne

On Wed, Aug 25, 2010 at 09:48:47PM -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 05:30:42PM -0700, David Rientjes wrote:
> > 
> > We certainly hope that nobody will reimplement the same function without 
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> > where there's no looping at a higher level.  So perhaps the best 
> > alternative is to implement the same _nofail() functions but do a 
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
> 
> Yeah, that sounds better.
> 
> > I think it's really sad that the caller can't know what the upper bounds 
> > of its memory requirement are ahead of time or at least be able to 
> > implement a memory freeing function when kmalloc() returns NULL.
> 
> Oh, we can determine an upper bound.  You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.

For a 4k block size filesystem.

If I use 64k block size directories (which XFS can even on 4k page
size machines), the maximum transaction reservation goes up to at
around 3MB, and that's just for blocks being _modified_. It's not
the limit on the amount of memory that may need to be allocated
during a transaction....

> My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less. 

Right, it usually is a lot less, but one of the big problems is that
during low memory situations memory reclaim of the metadata page
cache actually causes _more_ memory allocation during tranactions
than otherwise would occur.......

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  1:48                                   ` Ted Ts'o
@ 2010-08-26  6:38                                       ` Dave Chinner
  2010-08-26  3:09                                     ` David Rientjes
                                                         ` (6 subsequent siblings)
  7 siblings, 0 replies; 87+ messages in thread
From: Dave Chinner @ 2010-08-26  6:38 UTC (permalink / raw)
  To: Ted Ts'o, David Rientjes, Peter Zijlstra, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Wed, Aug 25, 2010 at 09:48:47PM -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 05:30:42PM -0700, David Rientjes wrote:
> > 
> > We certainly hope that nobody will reimplement the same function without 
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> > where there's no looping at a higher level.  So perhaps the best 
> > alternative is to implement the same _nofail() functions but do a 
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
> 
> Yeah, that sounds better.
> 
> > I think it's really sad that the caller can't know what the upper bounds 
> > of its memory requirement are ahead of time or at least be able to 
> > implement a memory freeing function when kmalloc() returns NULL.
> 
> Oh, we can determine an upper bound.  You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.

For a 4k block size filesystem.

If I use 64k block size directories (which XFS can even on 4k page
size machines), the maximum transaction reservation goes up to at
around 3MB, and that's just for blocks being _modified_. It's not
the limit on the amount of memory that may need to be allocated
during a transaction....

> My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less. 

Right, it usually is a lot less, but one of the big problems is that
during low memory situations memory reclaim of the metadata page
cache actually causes _more_ memory allocation during tranactions
than otherwise would occur.......

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
@ 2010-08-26  6:38                                       ` Dave Chinner
  0 siblings, 0 replies; 87+ messages in thread
From: Dave Chinner @ 2010-08-26  6:38 UTC (permalink / raw)
  To: Ted Ts'o, David Rientjes, Peter Zijlstra, Jens Axboe,
	Andrew Morton, Ne

On Wed, Aug 25, 2010 at 09:48:47PM -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 05:30:42PM -0700, David Rientjes wrote:
> > 
> > We certainly hope that nobody will reimplement the same function without 
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> > where there's no looping at a higher level.  So perhaps the best 
> > alternative is to implement the same _nofail() functions but do a 
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
> 
> Yeah, that sounds better.
> 
> > I think it's really sad that the caller can't know what the upper bounds 
> > of its memory requirement are ahead of time or at least be able to 
> > implement a memory freeing function when kmalloc() returns NULL.
> 
> Oh, we can determine an upper bound.  You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.

For a 4k block size filesystem.

If I use 64k block size directories (which XFS can even on 4k page
size machines), the maximum transaction reservation goes up to at
around 3MB, and that's just for blocks being _modified_. It's not
the limit on the amount of memory that may need to be allocated
during a transaction....

> My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less. 

Right, it usually is a lot less, but one of the big problems is that
during low memory situations memory reclaim of the metadata page
cache actually causes _more_ memory allocation during tranactions
than otherwise would occur.......

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  1:48                                   ` Ted Ts'o
                                                       ` (3 preceding siblings ...)
  2010-08-26  3:09                                       ` David Rientjes
@ 2010-08-26  6:38                                     ` Dave Chinner
  2010-08-26  6:38                                     ` Dave Chinner
                                                       ` (2 subsequent siblings)
  7 siblings, 0 replies; 87+ messages in thread
From: Dave Chinner @ 2010-08-26  6:38 UTC (permalink / raw)
  To: Ted Ts'o, David Rientjes, Peter Zijlstra, Jens Axboe,
	Andrew Morton, Ne

On Wed, Aug 25, 2010 at 09:48:47PM -0400, Ted Ts'o wrote:
> On Wed, Aug 25, 2010 at 05:30:42PM -0700, David Rientjes wrote:
> > 
> > We certainly hope that nobody will reimplement the same function without 
> > the __deprecated warning, especially for order < PAGE_ALLOC_COSTLY_ORDER 
> > where there's no looping at a higher level.  So perhaps the best 
> > alternative is to implement the same _nofail() functions but do a 
> > WARN_ON(get_order(size) > PAGE_ALLOC_COSTLY_ORDER) instead?
> 
> Yeah, that sounds better.
> 
> > I think it's really sad that the caller can't know what the upper bounds 
> > of its memory requirement are ahead of time or at least be able to 
> > implement a memory freeing function when kmalloc() returns NULL.
> 
> Oh, we can determine an upper bound.  You might just not like it.
> Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> be around 400k for a transaction.

For a 4k block size filesystem.

If I use 64k block size directories (which XFS can even on 4k page
size machines), the maximum transaction reservation goes up to at
around 3MB, and that's just for blocks being _modified_. It's not
the limit on the amount of memory that may need to be allocated
during a transaction....

> My guess is that the worst case for
> ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> it would be a lot less. 

Right, it usually is a lot less, but one of the big problems is that
during low memory situations memory reclaim of the metadata page
cache actually causes _more_ memory allocation during tranactions
than otherwise would occur.......

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  3:09                                       ` David Rientjes
  (?)
@ 2010-08-26  7:06                                       ` Dave Chinner
  -1 siblings, 0 replies; 87+ messages in thread
From: Dave Chinner @ 2010-08-26  7:06 UTC (permalink / raw)
  To: David Rientjes
  Cc: Ted Ts'o, Peter Zijlstra, Jens Axboe, Andrew Morton,
	Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jan Kara, Frederic Weisbecker, linux-raid, linux-btrfs,
	cluster-devel, linux-ext4, reiserfs-devel, linux-kernel

On Wed, Aug 25, 2010 at 08:09:21PM -0700, David Rientjes wrote:
> On Wed, 25 Aug 2010, Ted Ts'o wrote:
> > > I think it's really sad that the caller can't know what the upper bounds 
> > > of its memory requirement are ahead of time or at least be able to 
> > > implement a memory freeing function when kmalloc() returns NULL.
> > 
> > Oh, we can determine an upper bound.  You might just not like it.
> > Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> > be around 400k for a transaction.  My guess is that the worst case for
> > ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> > it would be a lot less.  (At least, if data != journalled; if we are
> > doing data journalling and every single data block begins with
> > 0xc03b3998U, we'll need to allocate a 4k page for every single data
> > block written.)  We could dynamically calculate an upper bound if we
> > had to.  Of course, if ext3/ext4 is attached to a network block
> > device, then it could get a lot worse than 256k, of course.
> > 
> 
> On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL 
> is 5086 pages, or ~20MB.  GFP_ATOMIC would allow access to ~12MB of that, 
> so perhaps we should consider this is an acceptable abuse of GFP_ATOMIC as 
> a fallback behavior when GFP_NOFS or GFP_NOIO fails?

It would take a handful of concurrent transactions in XFS with
worst case memory allocation requirements to exhaust that pool, and
then we really would be in trouble.  Alternatively, it would take a
few allocations from each of a couple of thousand concurrent
transactions to get to the same point.

Bound memory pools only work when serialised access to the pool can
be enforced and there are no dependencies on other operations in
progress for completion of the work and freeing of the memory.
This is where it becomes exceedingly difficult to guarantee
progress.

One of the ideas that has floated around (I think Mel Gorman came up
with it first) was that if hardening the filesystem is so difficult,
why not just harden a single path via a single thread? e.g. we allow
the bdi flusher thread to have a separate reserve pool of free
pages, and when memory allocations start to fail, then that thread
can dip into it's pool to complete the writeback of the dirty pages
being flushed.  When a fileystem attaches to a bdi, it can specify
the size of the reserve pool it needs. 

This can be easily tested for during allocation (say a PF_ flag) and
switched to the reserve pool as necessary. because it is per-thread,
access to the pool is guaranteed to serialised. Memory reclaim can
then refill these pools before putting pages on freelists. This
could give us a mechanism for ensuring that allocations succeed in
the ->writepage path without needing to care about filesystem
implementation details.

And in the case of ext3/4, a pool could be attached to the jbd
thread as well so that it never starves of memory when commits are
required...

So, rather than turning filesystems upside down, maybe we should
revisit per-thread reserve pools for threads that are tasked with
cleaning pages for the VM?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  3:09                                       ` David Rientjes
  (?)
  (?)
@ 2010-08-26  8:29                                       ` Peter Zijlstra
  -1 siblings, 0 replies; 87+ messages in thread
From: Peter Zijlstra @ 2010-08-26  8:29 UTC (permalink / raw)
  To: David Rientjes
  Cc: Ted Ts'o, Jens Axboe, Andrew Morton, Neil Brown,
	Alasdair G Kergon, Chris Mason, Steven Whitehouse, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Wed, 2010-08-25 at 20:09 -0700, David Rientjes wrote:
> > Oh, we can determine an upper bound.  You might just not like it.
> > Actually ext3/ext4 shouldn't be as bad as XFS, which Dave estimated to
> > be around 400k for a transaction.  My guess is that the worst case for
> > ext3/ext4 is probably around 256k or so; like XFS, most of the time,
> > it would be a lot less.  (At least, if data != journalled; if we are
> > doing data journalling and every single data block begins with
> > 0xc03b3998U, we'll need to allocate a 4k page for every single data
> > block written.)  We could dynamically calculate an upper bound if we
> > had to.  Of course, if ext3/ext4 is attached to a network block
> > device, then it could get a lot worse than 256k, of course.
> > 
> On my 8GB machine, /proc/zoneinfo says the min watermark for ZONE_NORMAL 
> is 5086 pages, or ~20MB.  GFP_ATOMIC would allow access to ~12MB of that, 
> so perhaps we should consider this is an acceptable abuse of GFP_ATOMIC as 
> a fallback behavior when GFP_NOFS or GFP_NOIO fails? 

Agreed with the fact that 400k isn't much to worry about.

Not agreed with the GFP_ATOMIC stmt.

Direct reclaim already has PF_MEMALLOC, but then we also need a
concurrency limit on that, otherwise you can still easily blow though
your reserves by having 100 concurrency users of it, resulting in an
upper bound of 40000k instead, which will be too much.

There were patches to limit the direct reclaim contexts, not sure they
ever got anywhere..

It is something to consider in the re-design of the whole
direct-reclaim/writeback paths though..

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26  3:12                               ` David Rientjes
@ 2010-08-26 14:16                                 ` Christoph Lameter
  2010-08-26 22:31                                   ` David Rientjes
  0 siblings, 1 reply; 87+ messages in thread
From: Christoph Lameter @ 2010-08-26 14:16 UTC (permalink / raw)
  To: David Rientjes
  Cc: Peter Zijlstra, Dave Chinner, Ted Ts'o, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Wed, 25 Aug 2010, David Rientjes wrote:

> Because we can remove the flag, remove branches from the page allocator
> slowpath, and none of these allocations actually are dependent on
> __GFP_NOFAIL since they are all under PAGE_ALLOC_COSTLY_ORDER.

Then we can simply remove __GFP_NOFAIL? Functions are only needed for
higher order allocs that can fail?


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-26 14:16                                 ` Christoph Lameter
@ 2010-08-26 22:31                                   ` David Rientjes
  0 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-08-26 22:31 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, Dave Chinner, Ted Ts'o, Jens Axboe,
	Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Thu, 26 Aug 2010, Christoph Lameter wrote:

> > Because we can remove the flag, remove branches from the page allocator
> > slowpath, and none of these allocations actually are dependent on
> > __GFP_NOFAIL since they are all under PAGE_ALLOC_COSTLY_ORDER.
> 
> Then we can simply remove __GFP_NOFAIL? Functions are only needed for
> higher order allocs that can fail?
> 

Yes, that's the intent.  We'd like to add the 
WARN_ON_ONCE(get_order(size) >= PAGE_ALLOC_COSTLY_ORDER) warning, though, 
so we're ensured that redefinition of that #define doesn't cause 
allocations to fail to that don't have appropriate error handling or 
future callers use higher order allocs.  The _nofail() functions help that 
and do some due diligence in ensuring that we aren't changing gfp flags 
based only on the current page allocator implementation which may later 
change with very specialized corner cases.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-08-24 10:50 [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc David Rientjes
                   ` (5 preceding siblings ...)
  2010-08-24 13:29 ` Peter Zijlstra
@ 2010-09-02  1:02 ` David Rientjes
  2010-09-02  1:03   ` [patch v2 2/5] mm: add nofail variant of kmem_cache_zalloc David Rientjes
                     ` (5 more replies)
  6 siblings, 6 replies; 87+ messages in thread
From: David Rientjes @ 2010-09-02  1:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jens Axboe, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail().  These
functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
respectively, except that they will never return NULL and instead loop
forever trying to allocate memory.

If the first allocation attempt fails because the page allocator doesn't 
implicitly loop, a warning will be emitted, including a call trace.
Subsequent failures will suppress this warning.

These were added as helper functions for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 drivers/md/dm-region-hash.c |    2 +-
 fs/btrfs/inode.c            |    2 +-
 fs/gfs2/log.c               |    2 +-
 fs/gfs2/rgrp.c              |   18 +++++----------
 fs/jbd/transaction.c        |   11 ++------
 fs/reiserfs/journal.c       |    3 +-
 include/linux/slab.h        |   51 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 64 insertions(+), 25 deletions(-)

diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -290,7 +290,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
 
 	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
 	if (unlikely(!nreg))
-		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
+		nreg = kmalloc_nofail(sizeof(*nreg), GFP_NOIO);
 
 	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
 		      DM_RH_CLEAN : DM_RH_NOSYNC;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1967,7 +1967,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
 	if (atomic_add_unless(&inode->i_count, -1, 1))
 		return;
 
-	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+	delayed = kmalloc_nofail(sizeof(*delayed), GFP_NOFS);
 	delayed->inode = inode;
 
 	spin_lock(&fs_info->delayed_iput_lock);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -709,7 +709,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	}
 	trace_gfs2_log_flush(sdp, 1);
 
-	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
+	ai = kzalloc_nofail(sizeof(struct gfs2_ail), GFP_NOFS);
 	INIT_LIST_HEAD(&ai->ai_ail1_list);
 	INIT_LIST_HEAD(&ai->ai_ail2_list);
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1440,8 +1440,8 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 		rgrp_blk++;
 
 		if (!bi->bi_clone) {
-			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
-					       GFP_NOFS | __GFP_NOFAIL);
+			bi->bi_clone = kmalloc_nofail(bi->bi_bh->b_size,
+					       GFP_NOFS);
 			memcpy(bi->bi_clone + bi->bi_offset,
 			       bi->bi_bh->b_data + bi->bi_offset,
 			       bi->bi_len);
@@ -1759,9 +1759,6 @@ fail:
  * @block: the block
  *
  * Figure out what RG a block belongs to and add that RG to the list
- *
- * FIXME: Don't use NOFAIL
- *
  */
 
 void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
@@ -1789,8 +1786,8 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
 	if (rlist->rl_rgrps == rlist->rl_space) {
 		new_space = rlist->rl_space + 10;
 
-		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
-			      GFP_NOFS | __GFP_NOFAIL);
+		tmp = kcalloc_nofail(new_space, sizeof(struct gfs2_rgrpd *),
+			      GFP_NOFS);
 
 		if (rlist->rl_rgd) {
 			memcpy(tmp, rlist->rl_rgd,
@@ -1811,17 +1808,14 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
  * @rlist: the list of resource groups
  * @state: the lock state to acquire the RG lock in
  * @flags: the modifier flags for the holder structures
- *
- * FIXME: Don't use NOFAIL
- *
  */
 
 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
 {
 	unsigned int x;
 
-	rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
-				GFP_NOFS | __GFP_NOFAIL);
+	rlist->rl_ghs = kcalloc_nofail(rlist->rl_rgrps,
+				sizeof(struct gfs2_holder), GFP_NOFS);
 	for (x = 0; x < rlist->rl_rgrps; x++)
 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
 				state, 0,
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -98,14 +98,9 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
 	}
 
 alloc_transaction:
-	if (!journal->j_running_transaction) {
-		new_transaction = kzalloc(sizeof(*new_transaction),
-						GFP_NOFS|__GFP_NOFAIL);
-		if (!new_transaction) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
+	if (!journal->j_running_transaction)
+		new_transaction = kzalloc_nofail(sizeof(*new_transaction),
+						GFP_NOFS);
 
 	jbd_debug(3, "New handle %p going live.\n", handle);
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2593,8 +2593,7 @@ static int journal_read(struct super_block *sb)
 static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
 {
 	struct reiserfs_journal_list *jl;
-	jl = kzalloc(sizeof(struct reiserfs_journal_list),
-		     GFP_NOFS | __GFP_NOFAIL);
+	jl = kzalloc_nofail(sizeof(struct reiserfs_journal_list), GFP_NOFS);
 	INIT_LIST_HEAD(&jl->j_list);
 	INIT_LIST_HEAD(&jl->j_working_list);
 	INIT_LIST_HEAD(&jl->j_tail_bh_list);
diff --git a/include/linux/slab.h b/include/linux/slab.h
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
 	return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
 
+/**
+ * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kmalloc).
+ *
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+static inline void *kmalloc_nofail(size_t size, gfp_t flags)
+{
+	void *ret;
+
+	for (;;) {
+		ret = kmalloc(size, flags);
+		if (ret)
+			return ret;
+		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
+	}
+}
+
+/**
+ * kcalloc_nofail - infinitely loop until kcalloc() succeeds.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kcalloc).
+ *
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+static inline void *kcalloc_nofail(size_t n, size_t size, gfp_t flags)
+{
+	void *ret;
+
+	for (;;) {
+		ret = kcalloc(n, size, flags);
+		if (ret)
+			return ret;
+		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
+	}
+}
+
+/**
+ * kzalloc_nofail - infinitely loop until kzalloc() succeeds.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kzalloc).
+ */
+static inline void *kzalloc_nofail(size_t size, gfp_t flags)
+{
+	return kmalloc_nofail(size, flags | __GFP_ZERO);
+}
+
 void __init kmem_cache_init_late(void);
 
 #endif	/* _LINUX_SLAB_H */

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [patch v2 2/5] mm: add nofail variant of kmem_cache_zalloc
  2010-09-02  1:02 ` [patch v2 " David Rientjes
@ 2010-09-02  1:03   ` David Rientjes
  2010-09-02  1:03   ` [patch v2 3/5] fs: add nofail variant of alloc_buffer_head David Rientjes
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-09-02  1:03 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Steven Whitehouse, Jens Axboe, cluster-devel, linux-kernel

Add kmem_cache_zalloc_nofail().  This function is equivalent to
kmem_cache_zalloc(), except that it will never return NULL and instead
loop forever trying to allocate memory.

If the first allocation attempt fails because the page allocator doesn't 
implicitly loop, a warning will be emitted, including a call trace.
Subsequent failures will suppress this warning.

This was added as a helper function for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/gfs2/meta_io.c    |    2 +-
 include/linux/slab.h |   17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletions(-)

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -289,7 +289,7 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
 		return;
 	}
 
-	bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
+	bd = kmem_cache_zalloc_nofail(gfs2_bufdata_cachep, GFP_NOFS);
 	bd->bd_bh = bh;
 	bd->bd_gl = gl;
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -313,6 +313,23 @@ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
 	return kmem_cache_alloc(k, flags | __GFP_ZERO);
 }
 
+/*
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+static inline void *kmem_cache_zalloc_nofail(struct kmem_cache *k, gfp_t flags)
+{
+	void *ret;
+
+	for (;;) {
+		ret = kmem_cache_zalloc(k, flags);
+		if (ret)
+			return ret;
+		WARN_ON_ONCE(get_order(kmem_cache_size(k)) >
+						PAGE_ALLOC_COSTLY_ORDER);
+	}
+}
+
 /**
  * kzalloc - allocate memory. The memory is set to zero.
  * @size: how many bytes of memory are required.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [patch v2 3/5] fs: add nofail variant of alloc_buffer_head
  2010-09-02  1:02 ` [patch v2 " David Rientjes
  2010-09-02  1:03   ` [patch v2 2/5] mm: add nofail variant of kmem_cache_zalloc David Rientjes
@ 2010-09-02  1:03   ` David Rientjes
  2010-09-02  1:03   ` [patch v2 4/5] btrfs: add nofail variant of set_extent_dirty David Rientjes
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-09-02  1:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Steven Whitehouse, Jens Axboe, Jan Kara, cluster-devel,
	linux-ext4, linux-kernel

Add alloc_buffer_head_nofail().  This function is equivalent to
alloc_buffer_head(), except that it will never return NULL and instead
loop forever trying to allocate memory.

If the first allocation attempt fails because the page allocator doesn't
implicitly loop, a warning will be emitted, including a call trace.
Subsequent failures will suppress this warning.

This was added as a helper function for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/buffer.c                 |   17 +++++++++++++++++
 fs/gfs2/log.c               |    2 +-
 fs/jbd/journal.c            |    2 +-
 include/linux/buffer_head.h |    1 +
 4 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3238,6 +3238,23 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 }
 EXPORT_SYMBOL(alloc_buffer_head);
 
+/*
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+struct buffer_head *alloc_buffer_head_nofail(gfp_t gfp_flags)
+{
+	struct buffer_head *ret;
+
+	for (;;) {
+		ret = alloc_buffer_head(gfp_flags);
+		if (ret)
+			return ret;
+		WARN_ON_ONCE(get_order(sizeof(struct buffer_head)) >
+						PAGE_ALLOC_COSTLY_ORDER);
+	}
+}
+
 void free_buffer_head(struct buffer_head *bh)
 {
 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -523,7 +523,7 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
 	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
 	struct buffer_head *bh;
 
-	bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
+	bh = alloc_buffer_head_nofail(GFP_NOFS);
 	atomic_set(&bh->b_count, 1);
 	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
 	set_bh_page(bh, real->b_page, bh_offset(real));
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -301,7 +301,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
 	 */
 	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
 
-	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+	new_bh = alloc_buffer_head_nofail(GFP_NOFS);
 	/* keep subsequent assertions sane */
 	new_bh->b_state = 0;
 	init_buffer(new_bh, NULL, NULL);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -176,6 +176,7 @@ void __breadahead(struct block_device *, sector_t block, unsigned int size);
 struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
 void invalidate_bh_lrus(void);
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
+struct buffer_head *alloc_buffer_head_nofail(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void unlock_buffer(struct buffer_head *bh);
 void __lock_buffer(struct buffer_head *bh);

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [patch v2 4/5] btrfs: add nofail variant of set_extent_dirty
  2010-09-02  1:02 ` [patch v2 " David Rientjes
  2010-09-02  1:03   ` [patch v2 2/5] mm: add nofail variant of kmem_cache_zalloc David Rientjes
  2010-09-02  1:03   ` [patch v2 3/5] fs: add nofail variant of alloc_buffer_head David Rientjes
@ 2010-09-02  1:03   ` David Rientjes
  2010-09-02  1:03   ` [patch v2 5/5] ntfs: remove dependency on __GFP_NOFAIL David Rientjes
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-09-02  1:03 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Chris Mason, linux-btrfs, linux-kernel

Add set_extent_dirty_nofail().  This function is equivalent to
set_extent_dirty(), except that it will never fail because of allocation
failure and instead loop forever trying to allocate memory.

If the first allocation attempt fails because the page allocator doesn't
implicitly loop, a warning will be emitted, including a call trace.
Subsequent failures will suppress this warning.

This was added as a helper function for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/btrfs/extent-tree.c |    8 ++++----
 fs/btrfs/extent_io.c   |   18 ++++++++++++++++++
 fs/btrfs/extent_io.h   |    2 ++
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3831,9 +3831,9 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 
-			set_extent_dirty(info->pinned_extents,
+			set_extent_dirty_nofail(info->pinned_extents,
 					 bytenr, bytenr + num_bytes - 1,
-					 GFP_NOFS | __GFP_NOFAIL);
+					 GFP_NOFS);
 		}
 		btrfs_put_block_group(cache);
 		total -= num_bytes;
@@ -3872,8 +3872,8 @@ static int pin_down_extent(struct btrfs_root *root,
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
 
-	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
-			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+	set_extent_dirty_nofail(root->fs_info->pinned_extents, bytenr,
+			 bytenr + num_bytes - 1, GFP_NOFS);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -940,6 +940,24 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 			      NULL, mask);
 }
 
+/*
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
+ */
+int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	int ret;
+
+	for (;;) {
+		ret = set_extent_dirty(tree, start, end, mask);
+		if (ret != -ENOMEM)
+			return ret;
+		WARN_ON_ONCE(get_order(sizeof(struct extent_state)) >
+						PAGE_ALLOC_COSTLY_ORDER);
+	}
+}
+
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -197,6 +197,8 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		   gfp_t mask);
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
+int set_extent_dirty_nofail(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [patch v2 5/5] ntfs: remove dependency on __GFP_NOFAIL
  2010-09-02  1:02 ` [patch v2 " David Rientjes
                     ` (2 preceding siblings ...)
  2010-09-02  1:03   ` [patch v2 4/5] btrfs: add nofail variant of set_extent_dirty David Rientjes
@ 2010-09-02  1:03   ` David Rientjes
  2010-09-02  9:08     ` Anton Altaparmakov
  2010-09-02  7:59   ` [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc Jiri Slaby
  2010-09-06  9:05   ` David Rientjes
  5 siblings, 1 reply; 87+ messages in thread
From: David Rientjes @ 2010-09-02  1:03 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Anton Altaparmakov, linux-ntfs-dev, linux-kernel

Reimplement ntfs_malloc_nofs_nofail() to loop forever calling
ntfs_malloc_nofs() until the allocation succeeds.

If the first allocation attempt fails because the page allocator doesn't
implicitly loop, a warning will be emitted, including a call trace.
Subsequent failures will suppress this warning.

This was added as a helper function for documentation and auditability.
No future callers should be added.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/ntfs/malloc.h |   12 ++++++++++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -76,11 +76,19 @@ static inline void *ntfs_malloc_nofs(unsigned long size)
  * This function guarantees that the allocation will succeed.  It will sleep
  * for as long as it takes to complete the allocation.
  *
- * If there was insufficient memory to complete the request, return NULL.
+ * NOTE: no new callers of this function should be implemented!
+ * All memory allocations should be failable whenever possible.
  */
 static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
 {
-	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
+	void *ret;
+
+	for (;;) {
+		ret = ntfs_malloc_nofs(size);
+		if (ret)
+			return ret;
+		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
+	}
 }
 
 static inline void ntfs_free(void *addr)

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-09-02  1:02 ` [patch v2 " David Rientjes
                     ` (3 preceding siblings ...)
  2010-09-02  1:03   ` [patch v2 5/5] ntfs: remove dependency on __GFP_NOFAIL David Rientjes
@ 2010-09-02  7:59   ` Jiri Slaby
  2010-09-02 14:51       ` [Cluster-devel] " Jan Kara
  2010-09-05 23:01     ` David Rientjes
  2010-09-06  9:05   ` David Rientjes
  5 siblings, 2 replies; 87+ messages in thread
From: Jiri Slaby @ 2010-09-02  7:59 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jens Axboe, Jan Kara, Frederic Weisbecker,
	linux-raid, linux-btrfs, cluster-devel, linux-ext4,
	reiserfs-devel, linux-kernel

On 09/02/2010 03:02 AM, David Rientjes wrote:
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
>  	return kmalloc_node(size, flags | __GFP_ZERO, node);
>  }
>  
> +/**
> + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> + * @size: how many bytes of memory are required.
> + * @flags: the type of memory to allocate (see kmalloc).
> + *
> + * NOTE: no new callers of this function should be implemented!
> + * All memory allocations should be failable whenever possible.
> + */
> +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> +{
> +	void *ret;
> +
> +	for (;;) {
> +		ret = kmalloc(size, flags);
> +		if (ret)
> +			return ret;
> +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);

This doesn't work as you expect. kmalloc will warn every time it fails.
__GFP_NOFAIL used to disable the warning. Actually what's wrong with
__GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
are needed.

> +	}



-- 
js

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch v2 5/5] ntfs: remove dependency on __GFP_NOFAIL
  2010-09-02  1:03   ` [patch v2 5/5] ntfs: remove dependency on __GFP_NOFAIL David Rientjes
@ 2010-09-02  9:08     ` Anton Altaparmakov
  2010-09-05 22:55       ` David Rientjes
  0 siblings, 1 reply; 87+ messages in thread
From: Anton Altaparmakov @ 2010-09-02  9:08 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Anton Altaparmakov, linux-ntfs-dev, linux-kernel

Hi David,

On 2 Sep 2010, at 02:03, David Rientjes wrote:
> Reimplement ntfs_malloc_nofs_nofail() to loop forever calling
> ntfs_malloc_nofs() until the allocation succeeds.
> 
> If the first allocation attempt fails because the page allocator doesn't
> implicitly loop, a warning will be emitted, including a call trace.
> Subsequent failures will suppress this warning.
> 
> This was added as a helper function for documentation and auditability.
> No future callers should be added.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
> fs/ntfs/malloc.h |   12 ++++++++++--
> 1 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
> --- a/fs/ntfs/malloc.h
> +++ b/fs/ntfs/malloc.h
> @@ -76,11 +76,19 @@ static inline void *ntfs_malloc_nofs(unsigned long size)
>  * This function guarantees that the allocation will succeed.  It will sleep
>  * for as long as it takes to complete the allocation.
>  *
> - * If there was insufficient memory to complete the request, return NULL.
> + * NOTE: no new callers of this function should be implemented!
> + * All memory allocations should be failable whenever possible.
>  */
> static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
> {
> -	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
> +	void *ret;
> +
> +	for (;;) {
> +		ret = ntfs_malloc_nofs(size);
> +		if (ret)
> +			return ret;
> +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);

What is the relevance of the above get_order check?  ntfs_malloc_nofs() does a vmalloc() if size >= PAGE_SIZE so I cannot see how that has anything to do with get_order()?

Other than that patch looks fine.

Note the _nofail function is only called from one place and there will be no more call sites for it.  One day we ought to work something out so it is not needed at all but given it only gets called in very obscure circumstances (definitely not in the general code paths) it shouldn't matter too much...

Best regards,

	Anton


> +	}
> }
> 
> static inline void ntfs_free(void *addr)

Best regards,

	Anton
-- 
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer, http://www.linux-ntfs.org/


^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-09-02  7:59   ` [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc Jiri Slaby
@ 2010-09-02 14:51       ` Jan Kara
  2010-09-05 23:01     ` David Rientjes
  1 sibling, 0 replies; 87+ messages in thread
From: Jan Kara @ 2010-09-02 14:51 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: David Rientjes, Andrew Morton, Neil Brown, Alasdair G Kergon,
	Chris Mason, Steven Whitehouse, Jens Axboe, Jan Kara,
	Frederic Weisbecker, linux-raid, linux-btrfs, cluster-devel,
	linux-ext4, reiserfs-devel, linux-kernel

On Thu 02-09-10 09:59:13, Jiri Slaby wrote:
> On 09/02/2010 03:02 AM, David Rientjes wrote:
> > --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -334,6 +334,57
> > @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> > return kmalloc_node(size, flags | __GFP_ZERO, node); }
> >  
> > +/** + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.  +
> > * @size: how many bytes of memory are required.  + * @flags: the type
> > of memory to allocate (see kmalloc).  + * + * NOTE: no new callers of
> > this function should be implemented!  + * All memory allocations should
> > be failable whenever possible.  + */ +static inline void
> > *kmalloc_nofail(size_t size, gfp_t flags) +{ +	void *ret; + +	for
> > (;;) { +		ret = kmalloc(size, flags); +		if (ret) +
> > return ret; +		WARN_ON_ONCE(get_order(size) >
> > PAGE_ALLOC_COSTLY_ORDER);
> 
> This doesn't work as you expect. kmalloc will warn every time it fails.
> __GFP_NOFAIL used to disable the warning. Actually what's wrong with
> __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> are needed.
  David should probably add the reasoning to the changelogs so that he
doesn't have to explain again and again ;). But if I understood it
correctly, the concern is that the looping checks slightly impact fast path
of the callers which do not need it. Generally, also looping for a long
time inside allocator isn't a nice thing but some callers aren't able to do
better for now to the patch is imperfect in this sence...

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [Cluster-devel] [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
@ 2010-09-02 14:51       ` Jan Kara
  0 siblings, 0 replies; 87+ messages in thread
From: Jan Kara @ 2010-09-02 14:51 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On Thu 02-09-10 09:59:13, Jiri Slaby wrote:
> On 09/02/2010 03:02 AM, David Rientjes wrote:
> > --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -334,6 +334,57
> > @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> > return kmalloc_node(size, flags | __GFP_ZERO, node); }
> >  
> > +/** + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.  +
> > * @size: how many bytes of memory are required.  + * @flags: the type
> > of memory to allocate (see kmalloc).  + * + * NOTE: no new callers of
> > this function should be implemented!  + * All memory allocations should
> > be failable whenever possible.  + */ +static inline void
> > *kmalloc_nofail(size_t size, gfp_t flags) +{ +	void *ret; + +	for
> > (;;) { +		ret = kmalloc(size, flags); +		if (ret) +
> > return ret; +		WARN_ON_ONCE(get_order(size) >
> > PAGE_ALLOC_COSTLY_ORDER);
> 
> This doesn't work as you expect. kmalloc will warn every time it fails.
> __GFP_NOFAIL used to disable the warning. Actually what's wrong with
> __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> are needed.
  David should probably add the reasoning to the changelogs so that he
doesn't have to explain again and again ;). But if I understood it
correctly, the concern is that the looping checks slightly impact fast path
of the callers which do not need it. Generally, also looping for a long
time inside allocator isn't a nice thing but some callers aren't able to do
better for now to the patch is imperfect in this sence...

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR



^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-09-02 14:51       ` [Cluster-devel] " Jan Kara
  (?)
@ 2010-09-02 21:15         ` Neil Brown
  -1 siblings, 0 replies; 87+ messages in thread
From: Neil Brown @ 2010-09-02 21:15 UTC (permalink / raw)
  To: Jan Kara
  Cc: cluster-devel, linux-ext4, Jiri Slaby, Jens Axboe,
	reiserfs-devel, linux-kernel, linux-raid, David Rientjes,
	Andrew Morton, linux-btrfs, Frederic Weisbecker, Chris Mason

On Thu, 2 Sep 2010 16:51:41 +0200
Jan Kara <jack@suse.cz> wrote:

> On Thu 02-09-10 09:59:13, Jiri Slaby wrote:
> > On 09/02/2010 03:02 AM, David Rientjes wrote:
> > > --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -334,6 +334,57
> > > @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> > > return kmalloc_node(size, flags | __GFP_ZERO, node); }
> > >  
> > > +/** + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.  +
> > > * @size: how many bytes of memory are required.  + * @flags: the type
> > > of memory to allocate (see kmalloc).  + * + * NOTE: no new callers of
> > > this function should be implemented!  + * All memory allocations should
> > > be failable whenever possible.  + */ +static inline void
> > > *kmalloc_nofail(size_t size, gfp_t flags) +{ +	void *ret; + +	for
> > > (;;) { +		ret = kmalloc(size, flags); +		if (ret) +
> > > return ret; +		WARN_ON_ONCE(get_order(size) >
> > > PAGE_ALLOC_COSTLY_ORDER);
> > 
> > This doesn't work as you expect. kmalloc will warn every time it fails.
> > __GFP_NOFAIL used to disable the warning. Actually what's wrong with
> > __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> > are needed.
>   David should probably add the reasoning to the changelogs so that he
> doesn't have to explain again and again ;). But if I understood it
> correctly, the concern is that the looping checks slightly impact fast path
> of the callers which do not need it. Generally, also looping for a long
> time inside allocator isn't a nice thing but some callers aren't able to do
> better for now to the patch is imperfect in this sence...
>

I'm actually a bit confused about this too.
I thought David said he was removing a branch on the *slow* path - which make
sense as you wouldn't even test NOFAIL until you had a failure.
Why are branches on the slow-path an issue??
This is an important question to me because I still hope to see the
swap-over-nfs patches merged eventually and they add a branch on the slow
path (if I remember correctly).

NeilBrown

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
@ 2010-09-02 21:15         ` Neil Brown
  0 siblings, 0 replies; 87+ messages in thread
From: Neil Brown @ 2010-09-02 21:15 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jiri Slaby, David Rientjes, Andrew Morton, Alasdair G Kergon,
	Chris Mason, Steven Whitehouse, Jens Axboe, Frederic Weisbecker,
	linux-raid, linux-btrfs, cluster-devel, linux-ext4,
	reiserfs-devel, linux-kernel

On Thu, 2 Sep 2010 16:51:41 +0200
Jan Kara <jack@suse.cz> wrote:

> On Thu 02-09-10 09:59:13, Jiri Slaby wrote:
> > On 09/02/2010 03:02 AM, David Rientjes wrote:
> > > --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -334,6 +334,57
> > > @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> > > return kmalloc_node(size, flags | __GFP_ZERO, node); }
> > >  
> > > +/** + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.  +
> > > * @size: how many bytes of memory are required.  + * @flags: the type
> > > of memory to allocate (see kmalloc).  + * + * NOTE: no new callers of
> > > this function should be implemented!  + * All memory allocations should
> > > be failable whenever possible.  + */ +static inline void
> > > *kmalloc_nofail(size_t size, gfp_t flags) +{ +	void *ret; + +	for
> > > (;;) { +		ret = kmalloc(size, flags); +		if (ret) +
> > > return ret; +		WARN_ON_ONCE(get_order(size) >
> > > PAGE_ALLOC_COSTLY_ORDER);
> > 
> > This doesn't work as you expect. kmalloc will warn every time it fails.
> > __GFP_NOFAIL used to disable the warning. Actually what's wrong with
> > __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> > are needed.
>   David should probably add the reasoning to the changelogs so that he
> doesn't have to explain again and again ;). But if I understood it
> correctly, the concern is that the looping checks slightly impact fast path
> of the callers which do not need it. Generally, also looping for a long
> time inside allocator isn't a nice thing but some callers aren't able to do
> better for now to the patch is imperfect in this sence...
>

I'm actually a bit confused about this too.
I thought David said he was removing a branch on the *slow* path - which make
sense as you wouldn't even test NOFAIL until you had a failure.
Why are branches on the slow-path an issue??
This is an important question to me because I still hope to see the
swap-over-nfs patches merged eventually and they add a branch on the slow
path (if I remember correctly).

NeilBrown

^ permalink raw reply	[flat|nested] 87+ messages in thread

* [Cluster-devel] [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
@ 2010-09-02 21:15         ` Neil Brown
  0 siblings, 0 replies; 87+ messages in thread
From: Neil Brown @ 2010-09-02 21:15 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On Thu, 2 Sep 2010 16:51:41 +0200
Jan Kara <jack@suse.cz> wrote:

> On Thu 02-09-10 09:59:13, Jiri Slaby wrote:
> > On 09/02/2010 03:02 AM, David Rientjes wrote:
> > > --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -334,6 +334,57
> > > @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> > > return kmalloc_node(size, flags | __GFP_ZERO, node); }
> > >  
> > > +/** + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.  +
> > > * @size: how many bytes of memory are required.  + * @flags: the type
> > > of memory to allocate (see kmalloc).  + * + * NOTE: no new callers of
> > > this function should be implemented!  + * All memory allocations should
> > > be failable whenever possible.  + */ +static inline void
> > > *kmalloc_nofail(size_t size, gfp_t flags) +{ +	void *ret; + +	for
> > > (;;) { +		ret = kmalloc(size, flags); +		if (ret) +
> > > return ret; +		WARN_ON_ONCE(get_order(size) >
> > > PAGE_ALLOC_COSTLY_ORDER);
> > 
> > This doesn't work as you expect. kmalloc will warn every time it fails.
> > __GFP_NOFAIL used to disable the warning. Actually what's wrong with
> > __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> > are needed.
>   David should probably add the reasoning to the changelogs so that he
> doesn't have to explain again and again ;). But if I understood it
> correctly, the concern is that the looping checks slightly impact fast path
> of the callers which do not need it. Generally, also looping for a long
> time inside allocator isn't a nice thing but some callers aren't able to do
> better for now to the patch is imperfect in this sence...
>

I'm actually a bit confused about this too.
I thought David said he was removing a branch on the *slow* path - which make
sense as you wouldn't even test NOFAIL until you had a failure.
Why are branches on the slow-path an issue??
This is an important question to me because I still hope to see the
swap-over-nfs patches merged eventually and they add a branch on the slow
path (if I remember correctly).

NeilBrown



^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch v2 5/5] ntfs: remove dependency on __GFP_NOFAIL
  2010-09-02  9:08     ` Anton Altaparmakov
@ 2010-09-05 22:55       ` David Rientjes
  0 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-09-05 22:55 UTC (permalink / raw)
  To: Anton Altaparmakov
  Cc: Andrew Morton, Anton Altaparmakov, linux-ntfs-dev, linux-kernel

On Thu, 2 Sep 2010, Anton Altaparmakov wrote:

> > @@ -76,11 +76,19 @@ static inline void *ntfs_malloc_nofs(unsigned long size)
> >  * This function guarantees that the allocation will succeed.  It will sleep
> >  * for as long as it takes to complete the allocation.
> >  *
> > - * If there was insufficient memory to complete the request, return NULL.
> > + * NOTE: no new callers of this function should be implemented!
> > + * All memory allocations should be failable whenever possible.
> >  */
> > static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
> > {
> > -	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
> > +	void *ret;
> > +
> > +	for (;;) {
> > +		ret = ntfs_malloc_nofs(size);
> > +		if (ret)
> > +			return ret;
> > +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
> 
> What is the relevance of the above get_order check?  ntfs_malloc_nofs() 
> does a vmalloc() if size >= PAGE_SIZE so I cannot see how that has 
> anything to do with get_order()?
> 

It probably shouldn't be doing vmalloc() for size == PAGE_SIZE, but I can 
understand how it would use it for higher order allocations.  I didn't 
look at that specifically because I assumed these were going through the 
page allocator (and no current user should have any order less than 
PAGE_ALLOC_COSTLY_ORDER, correct me if I'm wrong).

I'm pretty sure we want some sort of warning emitted once if this has the 
potential to loop forever, so it should probably be changed to WARN_ONCE() 
in this patch only.  Thanks for pointing that out.

> Other than that patch looks fine.
> 
> Note the _nofail function is only called from one place and there will 
> be no more call sites for it.  One day we ought to work something out so 
> it is not needed at all but given it only gets called in very obscure 
> circumstances (definitely not in the general code paths) it shouldn't 
> matter too much...
> 

That would be great!

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-09-02  7:59   ` [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc Jiri Slaby
  2010-09-02 14:51       ` [Cluster-devel] " Jan Kara
@ 2010-09-05 23:01     ` David Rientjes
  1 sibling, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-09-05 23:01 UTC (permalink / raw)
  To: Jiri Slaby
  Cc: Andrew Morton, Neil Brown, Alasdair G Kergon, Chris Mason,
	Steven Whitehouse, Jens Axboe, Jan Kara, Frederic Weisbecker,
	linux-raid, linux-btrfs, cluster-devel, linux-ext4,
	reiserfs-devel, linux-kernel

On Thu, 2 Sep 2010, Jiri Slaby wrote:

> > @@ -334,6 +334,57 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
> >  	return kmalloc_node(size, flags | __GFP_ZERO, node);
> >  }
> >  
> > +/**
> > + * kmalloc_nofail - infinitely loop until kmalloc() succeeds.
> > + * @size: how many bytes of memory are required.
> > + * @flags: the type of memory to allocate (see kmalloc).
> > + *
> > + * NOTE: no new callers of this function should be implemented!
> > + * All memory allocations should be failable whenever possible.
> > + */
> > +static inline void *kmalloc_nofail(size_t size, gfp_t flags)
> > +{
> > +	void *ret;
> > +
> > +	for (;;) {
> > +		ret = kmalloc(size, flags);
> > +		if (ret)
> > +			return ret;
> > +		WARN_ON_ONCE(get_order(size) > PAGE_ALLOC_COSTLY_ORDER);
> 
> This doesn't work as you expect. kmalloc will warn every time it fails.

It actually does work as I expect since the WARN_ON_ONCE() never even gets 
triggered here for any of kmalloc_nofail()'s callers since they all have 
sizes small enough that the conditional is never true.  The page allocator 
implicitly loops forever if the order <= PAGE_ALLOC_COSTLY_ORDER, so this 
warning is only emitted if the #define implementation of the page 
allocator only changes.  That's intended, we want to know the consequences 
of our change.

> __GFP_NOFAIL used to disable the warning.

No, it didn't, it was unnecessary for all of the kmalloc_nofail() callers 
since they already implicitly loop.  No warning is emitted (these are all 
GFP_NOFS or GFP_NOIO users, there are no order-4 or larger GFP_ATOMIC 
allocators using this interface).

> Actually what's wrong with
> __GFP_NOFAIL? I cannot find a reason in the changelogs why the patches
> are needed.
> 

Couple reasons:

 - it's unnecessary in the page allocator, it can be implemented at a 
   higher level, which allows us to remove these branches from the 
   slowpath,

 - it's mostly unnecessary since all users have orders that will 
   implicitly loop forever anyway (although __GFP_NOFAIL does guarantee 
   that it won't fail even if we change the looping behavior internally,
   these warnings help to isolate cases where it's needed), and

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-09-02 21:15         ` Neil Brown
  (?)
  (?)
@ 2010-09-05 23:03         ` David Rientjes
  -1 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-09-05 23:03 UTC (permalink / raw)
  To: Neil Brown
  Cc: Jan Kara, Jiri Slaby, Andrew Morton, Alasdair G Kergon,
	Chris Mason, Steven Whitehouse, Jens Axboe, Frederic Weisbecker,
	linux-raid, linux-btrfs, cluster-devel, linux-ext4,
	reiserfs-devel, linux-kernel

On Fri, 3 Sep 2010, Neil Brown wrote:

> I'm actually a bit confused about this too.
> I thought David said he was removing a branch on the *slow* path - which make
> sense as you wouldn't even test NOFAIL until you had a failure.
> Why are branches on the slow-path an issue??

They aren't necessarily an issue in the performance sense, this is a 
cleanup series since all converted callers to using these new functions 
(and the eventual removal of __GFP_NOFAIL entirely) are using the bit 
unnecessarily since they all have orders that implicitly loop in the page 
allocator forever already, with or without the flag.

^ permalink raw reply	[flat|nested] 87+ messages in thread

* Re: [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc
  2010-09-02  1:02 ` [patch v2 " David Rientjes
                     ` (4 preceding siblings ...)
  2010-09-02  7:59   ` [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc Jiri Slaby
@ 2010-09-06  9:05   ` David Rientjes
  5 siblings, 0 replies; 87+ messages in thread
From: David Rientjes @ 2010-09-06  9:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Neil Brown, Alasdair G Kergon, Chris Mason, Steven Whitehouse,
	Jens Axboe, Jan Kara, Frederic Weisbecker, linux-raid,
	linux-btrfs, cluster-devel, linux-ext4, reiserfs-devel,
	linux-kernel

On Wed, 1 Sep 2010, David Rientjes wrote:

> Add kmalloc_nofail(), kcalloc_nofail(), and kzalloc_nofail().  These
> functions are equivalent to kmalloc(), kcalloc(), and kzalloc(),
> respectively, except that they will never return NULL and instead loop
> forever trying to allocate memory.
> 
> If the first allocation attempt fails because the page allocator doesn't 
> implicitly loop, a warning will be emitted, including a call trace.
> Subsequent failures will suppress this warning.
> 
> These were added as helper functions for documentation and auditability.
> No future callers should be added.
> 

Are there any objections to merging this series through -mm with the 
exception of the fifth patch for ntfs?  That particular patch needs to 
have its WARN_ON_ONCE() condition rewritten since it fallbacks to 
vmalloc for high order allocs.

Thanks.

^ permalink raw reply	[flat|nested] 87+ messages in thread

end of thread, other threads:[~2010-09-06  9:05 UTC | newest]

Thread overview: 87+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-24 10:50 [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc David Rientjes
2010-08-24 10:50 ` [patch 2/5] mm: add nofail variant of kmem_cache_zalloc David Rientjes
2010-08-24 10:50 ` [patch 3/5] fs: add nofail variant of alloc_buffer_head David Rientjes
2010-08-24 12:17   ` Jan Kara
2010-08-24 12:17     ` [Cluster-devel] " Jan Kara
2010-08-24 10:50 ` [patch 4/5] btrfs: add nofail variant of set_extent_dirty David Rientjes
2010-08-24 13:30   ` Peter Zijlstra
2010-08-24 10:50 ` [patch 5/5] ntfs: remove dependency on __GFP_NOFAIL David Rientjes
2010-08-24 12:15 ` [patch 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc Jan Kara
2010-08-24 12:15   ` [Cluster-devel] " Jan Kara
2010-08-24 13:29 ` Peter Zijlstra
2010-08-24 13:33   ` Jens Axboe
2010-08-24 20:11     ` David Rientjes
2010-08-24 20:11       ` David Rientjes
2010-08-25 11:24       ` Ted Ts'o
2010-08-25 11:35         ` Peter Zijlstra
2010-08-25 11:57           ` Ted Ts'o
2010-08-25 12:48             ` Peter Zijlstra
2010-08-25 12:52               ` Peter Zijlstra
2010-08-25 13:20                 ` Theodore Tso
2010-08-25 13:31                   ` Peter Zijlstra
2010-08-25 20:43                     ` David Rientjes
2010-08-25 20:55                       ` Peter Zijlstra
2010-08-25 21:11                         ` David Rientjes
2010-08-25 21:27                           ` Peter Zijlstra
2010-08-25 23:11                             ` David Rientjes
2010-08-26  0:19                               ` Ted Ts'o
2010-08-26  0:30                                 ` David Rientjes
2010-08-26  0:30                                 ` David Rientjes
2010-08-26  0:30                                 ` David Rientjes
2010-08-26  0:30                                   ` David Rientjes
2010-08-26  1:48                                   ` Ted Ts'o
2010-08-26  3:09                                     ` David Rientjes
2010-08-26  3:09                                     ` David Rientjes
2010-08-26  3:09                                     ` David Rientjes
2010-08-26  3:09                                     ` David Rientjes
2010-08-26  3:09                                       ` David Rientjes
2010-08-26  7:06                                       ` Dave Chinner
2010-08-26  8:29                                       ` Peter Zijlstra
2010-08-26  6:38                                     ` Dave Chinner
2010-08-26  6:38                                     ` Dave Chinner
2010-08-26  6:38                                     ` Dave Chinner
2010-08-26  6:38                                     ` Dave Chinner
2010-08-26  6:38                                       ` Dave Chinner
2010-08-26  0:30                                 ` David Rientjes
2010-08-25 13:34                   ` Peter Zijlstra
2010-08-25 13:24               ` Dave Chinner
2010-08-25 13:35                 ` Peter Zijlstra
2010-08-25 20:53                   ` Ted Ts'o
2010-08-25 20:59                     ` David Rientjes
2010-08-25 20:59                     ` David Rientjes
2010-08-25 20:59                     ` David Rientjes
2010-08-25 20:59                     ` David Rientjes
2010-08-25 20:59                       ` David Rientjes
2010-08-25 21:35                     ` Peter Zijlstra
2010-08-25 20:58                   ` David Rientjes
2010-08-25 21:11                     ` Christoph Lameter
2010-08-25 21:21                       ` Peter Zijlstra
2010-08-25 21:23                       ` David Rientjes
2010-08-25 21:35                         ` Christoph Lameter
2010-08-25 23:05                           ` David Rientjes
2010-08-26  1:30                             ` Christoph Lameter
2010-08-26  3:12                               ` David Rientjes
2010-08-26 14:16                                 ` Christoph Lameter
2010-08-26 22:31                                   ` David Rientjes
2010-08-26  0:09                   ` Dave Chinner
2010-08-25 14:13         ` Peter Zijlstra
2010-08-24 13:55   ` Dave Chinner
2010-08-24 14:03     ` Peter Zijlstra
2010-08-24 20:12     ` David Rientjes
2010-08-24 20:08   ` David Rientjes
2010-09-02  1:02 ` [patch v2 " David Rientjes
2010-09-02  1:03   ` [patch v2 2/5] mm: add nofail variant of kmem_cache_zalloc David Rientjes
2010-09-02  1:03   ` [patch v2 3/5] fs: add nofail variant of alloc_buffer_head David Rientjes
2010-09-02  1:03   ` [patch v2 4/5] btrfs: add nofail variant of set_extent_dirty David Rientjes
2010-09-02  1:03   ` [patch v2 5/5] ntfs: remove dependency on __GFP_NOFAIL David Rientjes
2010-09-02  9:08     ` Anton Altaparmakov
2010-09-05 22:55       ` David Rientjes
2010-09-02  7:59   ` [patch v2 1/5] mm: add nofail variants of kmalloc kcalloc and kzalloc Jiri Slaby
2010-09-02 14:51     ` Jan Kara
2010-09-02 14:51       ` [Cluster-devel] " Jan Kara
2010-09-02 21:15       ` Neil Brown
2010-09-02 21:15         ` [Cluster-devel] " Neil Brown
2010-09-02 21:15         ` Neil Brown
2010-09-05 23:03         ` David Rientjes
2010-09-05 23:01     ` David Rientjes
2010-09-06  9:05   ` David Rientjes

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.