* raid5-cache: avoid GFP_NOFAIL allocation
@ 2015-12-17 22:09 Christoph Hellwig
  2015-12-17 22:09 ` [PATCH 1/3] raid5-cache: use a bio_set Christoph Hellwig
                   ` (3 more replies)
  0 siblings, 4 replies; 15+ messages in thread
From: Christoph Hellwig @ 2015-12-17 22:09 UTC (permalink / raw)
  To: neilb, shli; +Cc: linux-raid

bio_sets, mempools and high level retries are our friends:
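
In a minimal sketch (illustrative names only, not the actual raid5-cache
symbols), the pattern the series builds on looks like this: preallocate
a bio_set and a mempool at setup time, allocate from them in the I/O
path, and rely on completing in-flight units, rather than __GFP_NOFAIL,
to guarantee forward progress.

#include <linux/bio.h>
#include <linux/errno.h>
#include <linux/mempool.h>

#define EXAMPLE_POOL_SIZE	4	/* a handful of in-flight units is enough */

struct example_log {
	struct bio_set	*bs;
	mempool_t	*meta_pool;
};

/* One-time setup: preallocate both pools. */
static int example_init(struct example_log *log)
{
	log->bs = bioset_create(EXAMPLE_POOL_SIZE, 0);
	if (!log->bs)
		return -ENOMEM;

	log->meta_pool = mempool_create_page_pool(EXAMPLE_POOL_SIZE, 0);
	if (!log->meta_pool) {
		bioset_free(log->bs);
		return -ENOMEM;
	}
	return 0;
}

/*
 * I/O path: these calls may block until an in-flight unit completes
 * and refills the pool, but they never fail permanently, so no
 * __GFP_NOFAIL is needed.
 */
static struct bio *example_bio_alloc(struct example_log *log)
{
	return bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
}

static struct page *example_meta_alloc(struct example_log *log)
{
	return mempool_alloc(log->meta_pool, GFP_NOIO);
}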



* [PATCH 1/3] raid5-cache: use a bio_set
  2015-12-17 22:09 raid5-cache: avoid GFP_NOFAIL allocation Christoph Hellwig
@ 2015-12-17 22:09 ` Christoph Hellwig
  2015-12-17 22:09 ` [PATCH 2/3] raid5-cache: use a mempool for the metadata block Christoph Hellwig
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 15+ messages in thread
From: Christoph Hellwig @ 2015-12-17 22:09 UTC (permalink / raw)
  To: neilb, shli; +Cc: linux-raid

This allows us to make guaranteed forward progress.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/raid5-cache.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 668e973..0a64e97 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -34,6 +34,12 @@
 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
 
+/*
+ * We only need 2 bios per I/O unit to make progress, but ensure we
+ * have a few more available to not get too tight.
+ */
+#define R5L_POOL_SIZE	4
+
 struct r5l_log {
 	struct md_rdev *rdev;
 
@@ -70,6 +76,7 @@ struct r5l_log {
 	struct bio flush_bio;
 
 	struct kmem_cache *io_kc;
+	struct bio_set *bs;
 
 	struct md_thread *reclaim_thread;
 	unsigned long reclaim_target;	/* number of space that need to be
@@ -248,7 +255,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
 
 static struct bio *r5l_bio_alloc(struct r5l_log *log)
 {
-	struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
+	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
 
 	bio->bi_rw = WRITE;
 	bio->bi_bdev = log->rdev->bdev;
@@ -1153,6 +1160,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->io_kc)
 		goto io_kc;
 
+	log->bs = bioset_create(R5L_POOL_SIZE, 0);
+	if (!log->bs)
+		goto io_bs;
+
 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
@@ -1170,6 +1181,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 error:
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
+	bioset_free(log->bs);
+io_bs:
 	kmem_cache_destroy(log->io_kc);
 io_kc:
 	kfree(log);
@@ -1179,6 +1192,7 @@ io_kc:
 void r5l_exit_log(struct r5l_log *log)
 {
 	md_unregister_thread(&log->reclaim_thread);
+	bioset_free(log->bs);
 	kmem_cache_destroy(log->io_kc);
 	kfree(log);
 }
-- 
1.9.1



* [PATCH 2/3] raid5-cache: use a mempool for the metadata block
  2015-12-17 22:09 raid5-cache: avoid GFP_NOFAIL allocation Christoph Hellwig
  2015-12-17 22:09 ` [PATCH 1/3] raid5-cache: use a bio_set Christoph Hellwig
@ 2015-12-17 22:09 ` Christoph Hellwig
  2015-12-17 22:09 ` [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail Christoph Hellwig
  2015-12-17 23:31 ` raid5-cache: avoid GFP_NOFAIL allocation NeilBrown
  3 siblings, 0 replies; 15+ messages in thread
From: Christoph Hellwig @ 2015-12-17 22:09 UTC (permalink / raw)
  To: neilb, shli; +Cc: linux-raid

We only have a limited number in flight, so use a page based mempool.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/raid5-cache.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 0a64e97..e0a605f 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -77,6 +77,7 @@ struct r5l_log {
 
 	struct kmem_cache *io_kc;
 	struct bio_set *bs;
+	mempool_t *meta_pool;
 
 	struct md_thread *reclaim_thread;
 	unsigned long reclaim_target;	/* number of space that need to be
@@ -216,7 +217,7 @@ static void r5l_log_endio(struct bio *bio)
 		md_error(log->rdev->mddev, log->rdev);
 
 	bio_put(bio);
-	__free_page(io->meta_page);
+	mempool_free(io->meta_page, log->meta_pool);
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
@@ -293,8 +294,9 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	INIT_LIST_HEAD(&io->stripe_list);
 	io->state = IO_UNIT_RUNNING;
 
-	io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
+	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
 	block = page_address(io->meta_page);
+	clear_page(block);
 	block->magic = cpu_to_le32(R5LOG_MAGIC);
 	block->version = R5LOG_VERSION;
 	block->seq = cpu_to_le64(log->seq);
@@ -1164,6 +1166,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->bs)
 		goto io_bs;
 
+	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
+	if (!log->meta_pool)
+		goto out_mempool;
+
 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
@@ -1178,9 +1184,12 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	conf->log = log;
 	return 0;
+
 error:
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
+	mempool_destroy(log->meta_pool);
+out_mempool:
 	bioset_free(log->bs);
 io_bs:
 	kmem_cache_destroy(log->io_kc);
@@ -1192,6 +1201,7 @@ io_kc:
 void r5l_exit_log(struct r5l_log *log)
 {
 	md_unregister_thread(&log->reclaim_thread);
+	mempool_destroy(log->meta_pool);
 	bioset_free(log->bs);
 	kmem_cache_destroy(log->io_kc);
 	kfree(log);
-- 
1.9.1



* [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-17 22:09 raid5-cache: avoid GFP_NOFAIL allocation Christoph Hellwig
  2015-12-17 22:09 ` [PATCH 1/3] raid5-cache: use a bio_set Christoph Hellwig
  2015-12-17 22:09 ` [PATCH 2/3] raid5-cache: use a mempool for the metadata block Christoph Hellwig
@ 2015-12-17 22:09 ` Christoph Hellwig
  2015-12-17 23:48   ` Shaohua Li
  2015-12-17 23:31 ` raid5-cache: avoid GFP_NOFAIL allocation NeilBrown
  3 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2015-12-17 22:09 UTC (permalink / raw)
  To: neilb, shli; +Cc: linux-raid

And propagate the error up the stack so we can add the stripe
to no_stripes_list and retry our log operation later.  This avoids
blocking raid5d due to reclaim, and it allows us to get rid of the
deadlock-prone GFP_NOFAIL allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/raid5-cache.c | 49 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index e0a605f..ddee884 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -287,8 +287,10 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	struct r5l_io_unit *io;
 	struct r5l_meta_block *block;
 
-	/* We can't handle memory allocate failure so far */
-	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
+	io = kmem_cache_zalloc(log->io_kc, GFP_ATOMIC);
+	if (!io)
+		return NULL;
+
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
@@ -326,8 +328,12 @@ static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
 	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
 		r5l_submit_current_io(log);
 
-	if (!log->current_io)
+	if (!log->current_io) {
 		log->current_io = r5l_new_meta(log);
+		if (!log->current_io)
+			return -ENOMEM;
+	}
+
 	return 0;
 }
 
@@ -372,11 +378,12 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 	r5_reserve_log_entry(log, io);
 }
 
-static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
+static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 			   int data_pages, int parity_pages)
 {
 	int i;
 	int meta_size;
+	int ret;
 	struct r5l_io_unit *io;
 
 	meta_size =
@@ -385,7 +392,10 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 		sizeof(struct r5l_payload_data_parity) +
 		sizeof(__le32) * parity_pages;
 
-	r5l_get_meta(log, meta_size);
+	ret = r5l_get_meta(log, meta_size);
+	if (ret)
+		return ret;
+
 	io = log->current_io;
 
 	for (i = 0; i < sh->disks; i++) {
@@ -415,6 +425,8 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	list_add_tail(&sh->log_list, &io->stripe_list);
 	atomic_inc(&io->pending_stripe);
 	sh->log_io = io;
+
+	return 0;
 }
 
 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
@@ -429,6 +441,7 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	int meta_size;
 	int reserve;
 	int i;
+	int ret = 0;
 
 	if (!log)
 		return -EAGAIN;
@@ -477,18 +490,24 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	mutex_lock(&log->io_mutex);
 	/* meta + data */
 	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
-	if (r5l_has_free_space(log, reserve))
-		r5l_log_stripe(log, sh, data_pages, parity_pages);
-	else {
-		spin_lock(&log->no_space_stripes_lock);
-		list_add_tail(&sh->log_list, &log->no_space_stripes);
-		spin_unlock(&log->no_space_stripes_lock);
-
-		r5l_wake_reclaim(log, reserve);
-	}
-	mutex_unlock(&log->io_mutex);
+	if (!r5l_has_free_space(log, reserve))
+		goto err_retry;
 
+	ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
+	if (ret)
+		goto err_retry;
+
+out_unlock:
+	mutex_unlock(&log->io_mutex);
 	return 0;
+
+err_retry:
+	spin_lock(&log->no_space_stripes_lock);
+	list_add_tail(&sh->log_list, &log->no_space_stripes);
+	spin_unlock(&log->no_space_stripes_lock);
+
+	r5l_wake_reclaim(log, reserve);
+	goto out_unlock;
 }
 
 void r5l_write_stripe_run(struct r5l_log *log)
-- 
1.9.1



* Re: raid5-cache: avoid GFP_NOFAIL allocation
  2015-12-17 22:09 raid5-cache: avoid GFP_NOFAIL allocation Christoph Hellwig
                   ` (2 preceding siblings ...)
  2015-12-17 22:09 ` [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail Christoph Hellwig
@ 2015-12-17 23:31 ` NeilBrown
  3 siblings, 0 replies; 15+ messages in thread
From: NeilBrown @ 2015-12-17 23:31 UTC (permalink / raw)
  To: Christoph Hellwig, shli; +Cc: linux-raid


On Fri, Dec 18 2015, Christoph Hellwig wrote:

> bio_sets, mempools and high level retries are our friends:

Yes they are!

all applied, Thanks.

NeilBrown



* Re: [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-17 22:09 ` [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail Christoph Hellwig
@ 2015-12-17 23:48   ` Shaohua Li
  2015-12-18  1:51     ` NeilBrown
  0 siblings, 1 reply; 15+ messages in thread
From: Shaohua Li @ 2015-12-17 23:48 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: neilb, linux-raid

On Thu, Dec 17, 2015 at 11:09:57PM +0100, Christoph Hellwig wrote:
> And propagate the error up the stack so we can add the stripe
> to no_stripes_list and retry our log operation later.  This avoids
> blocking raid5d due to reclaim, and it allows us to get rid of the
> deadlock-prone GFP_NOFAIL allocation.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> [...]
> @@ -287,8 +287,10 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
>  	struct r5l_io_unit *io;
>  	struct r5l_meta_block *block;
>  
> -	/* We can't handle memory allocate failure so far */
> -	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
> +	io = kmem_cache_zalloc(log->io_kc, GFP_ATOMIC);
> +	if (!io)
> +		return NULL;
> +
>  	io->log = log;
>  	INIT_LIST_HEAD(&io->log_sibling);
>  	INIT_LIST_HEAD(&io->stripe_list);
> [...]
> @@ -477,18 +490,24 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
>  	mutex_lock(&log->io_mutex);
>  	/* meta + data */
>  	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
> -	if (r5l_has_free_space(log, reserve))
> -		r5l_log_stripe(log, sh, data_pages, parity_pages);
> -	else {
> -		spin_lock(&log->no_space_stripes_lock);
> -		list_add_tail(&sh->log_list, &log->no_space_stripes);
> -		spin_unlock(&log->no_space_stripes_lock);
> -
> -		r5l_wake_reclaim(log, reserve);
> -	}
> -	mutex_unlock(&log->io_mutex);
> +	if (!r5l_has_free_space(log, reserve))
> +		goto err_retry;
>  
> +	ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
> +	if (ret)
> +		goto err_retry;
> +
> +out_unlock:
> +	mutex_unlock(&log->io_mutex);
>  	return 0;
> +
> +err_retry:
> +	spin_lock(&log->no_space_stripes_lock);
> +	list_add_tail(&sh->log_list, &log->no_space_stripes);
> +	spin_unlock(&log->no_space_stripes_lock);
> +
> +	r5l_wake_reclaim(log, reserve);
> +	goto out_unlock;
>  }

If the reclaim thread doesn't have anything to reclaim,
r5l_run_no_space_stripes isn't called, and we might miss the retry.

I'm a little worried about the GFP_ATOMIC allocation. For the first try,
GFP_NOWAIT is better. And on the other hand, why is sleeping bad here? We
could use GFP_NOIO | __GFP_NORETRY; there is no deadlock risk.

For the retry, GFP_NOIO looks better. There is no deadlock there either,
since it's not called from raid5d (though if we use GFP_NOIO maybe we
shouldn't call it from the reclaim thread; a workqueue would be better).
Otherwise we could keep retrying but do nothing.
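
(As a sketch of the two-step idea, with an illustrative helper that is
not part of this series: a non-sleeping first attempt from raid5d, and
the sleeping GFP_NOIO attempt only from a context that may block.)

static struct r5l_io_unit *io_unit_try_alloc(struct kmem_cache *kc,
					     bool may_sleep)
{
	/* may return NULL; the caller queues the stripe and retries later */
	return kmem_cache_zalloc(kc, may_sleep ? GFP_NOIO : GFP_NOWAIT);
}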

Thanks,
Shaohua


* Re: [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-17 23:48   ` Shaohua Li
@ 2015-12-18  1:51     ` NeilBrown
  2015-12-18  1:58       ` Shaohua Li
  2015-12-18 11:23       ` Christoph Hellwig
  0 siblings, 2 replies; 15+ messages in thread
From: NeilBrown @ 2015-12-18  1:51 UTC (permalink / raw)
  To: Shaohua Li, Christoph Hellwig; +Cc: linux-raid


On Fri, Dec 18 2015, Shaohua Li wrote:

> On Thu, Dec 17, 2015 at 11:09:57PM +0100, Christoph Hellwig wrote:
>> And propagate the error up the stack so we can add the stripe
>> to no_stripes_list and retry our log operation later.  This avoids
>> blocking raid5d due to reclaim, and it allows us to get rid of the
>> deadlock-prone GFP_NOFAIL allocation.
>> [...]
>
> If the reclaim thread doesn't have anything to reclaim,
> r5l_run_no_space_stripes isn't called, and we might miss the retry.

so something like this:
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 18de1fc4a75b..b63878edf7e9 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -596,7 +596,8 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
 		return;
 	}
 
-	if (r5l_reclaimable_space(log) > log->max_free_space)
+	if (r5l_reclaimable_space(log) > log->max_free_space ||
+	    !list_empty(&log->no_space_stripes))
 		r5l_wake_reclaim(log, 0);
 
 	spin_unlock_irqrestore(&log->io_list_lock, flags);

or is that too simplistic?

>
> I'm a little worried about the GFP_ATOMIC allocation. For the first try,
> GFP_NOWAIT is better. And on the other hand, why is sleeping bad here? We
> could use GFP_NOIO | __GFP_NORETRY; there is no deadlock risk.
>
> For the retry, GFP_NOIO looks better. There is no deadlock there either,
> since it's not called from raid5d (though if we use GFP_NOIO maybe we
> shouldn't call it from the reclaim thread; a workqueue would be better).
> Otherwise we could keep retrying but do nothing.

I did wonder a little bit about that.
GFP_ATOMIC is (__GFP_HIGH)
GFP_NOIO | __GFP_NORETRY is  (__GFP_WAIT | __GFP_NORETRY)

It isn't clear that we need 'HIGH', and WAIT with NORETRY should be OK.
It allows __alloc_pages_direct_reclaim, but only once and never waits
for other IO.

We probably should add __GFP_NOWARN too because we expect occasional
failure.

So

--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -287,7 +287,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	struct r5l_io_unit *io;
 	struct r5l_meta_block *block;
 
-	io = kmem_cache_zalloc(log->io_kc, GFP_ATOMIC);
+	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
 	if (!io)
 		return NULL;
 


Thoughts?

Thanks,
NeilBrown



* Re: [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-18  1:51     ` NeilBrown
@ 2015-12-18  1:58       ` Shaohua Li
  2015-12-18 11:25         ` Christoph Hellwig
  2015-12-18 11:23       ` Christoph Hellwig
  1 sibling, 1 reply; 15+ messages in thread
From: Shaohua Li @ 2015-12-18  1:58 UTC (permalink / raw)
  To: NeilBrown; +Cc: Christoph Hellwig, linux-raid

On Fri, Dec 18, 2015 at 12:51:07PM +1100, NeilBrown wrote:
> On Fri, Dec 18 2015, Shaohua Li wrote:
> 
> > On Thu, Dec 17, 2015 at 11:09:57PM +0100, Christoph Hellwig wrote:
> >> And propagate the error up the stack so we can add the stripe
> >> to no_stripes_list and retry our log operation later.  This avoids
> >> blocking raid5d due to reclaim, and it allows us to get rid of the
> >> deadlock-prone GFP_NOFAIL allocation.
> >> [...]
> >
> > If the reclaim thread doesn't have anything to reclaim,
> > r5l_run_no_space_stripes isn't called, and we might miss the retry.
> 
> so something like this:
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 18de1fc4a75b..b63878edf7e9 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -596,7 +596,8 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
>  		return;
>  	}
>  
> -	if (r5l_reclaimable_space(log) > log->max_free_space)
> +	if (r5l_reclaimable_space(log) > log->max_free_space ||
> +	    !list_empty(&log->no_space_stripes))
>  		r5l_wake_reclaim(log, 0);
>  
>  	spin_unlock_irqrestore(&log->io_list_lock, flags);
> 
> or is that too simplistic?

Maybe add a new list and run the list at the beginning of r5l_do_reclaim().

> >
> > I'm a little worried about the GFP_ATOMIC allocation. For the first try,
> > GFP_NOWAIT is better. And on the other hand, why is sleeping bad here? We
> > could use GFP_NOIO | __GFP_NORETRY; there is no deadlock risk.
> >
> > For the retry, GFP_NOIO looks better. There is no deadlock there either,
> > since it's not called from raid5d (though if we use GFP_NOIO maybe we
> > shouldn't call it from the reclaim thread; a workqueue would be better).
> > Otherwise we could keep retrying but do nothing.
> 
> I did wonder a little bit about that.
> GFP_ATOMIC is (__GFP_HIGH)
> GFP_NOIO | __GFP_NORETRY is  (__GFP_WAIT | __GFP_NORETRY)
> 
> It isn't clear that we need 'HIGH', and WAIT with NORETRY should be OK.
> It allows __alloc_pages_direct_reclaim, but only once and never waits
> for other IO.
> 
> We probably should add __GFP_NOWARN too because we expect occasional
> failure.
> 
> So
> 
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -287,7 +287,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
>  	struct r5l_io_unit *io;
>  	struct r5l_meta_block *block;
>  
> -	io = kmem_cache_zalloc(log->io_kc, GFP_ATOMIC);
> +	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
>  	if (!io)
>  		return NULL;

Looks good.

Thanks,
Shaohua


* Re: [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-18  1:51     ` NeilBrown
  2015-12-18  1:58       ` Shaohua Li
@ 2015-12-18 11:23       ` Christoph Hellwig
  2015-12-20 22:51         ` NeilBrown
  1 sibling, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2015-12-18 11:23 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, Christoph Hellwig, linux-raid

On Fri, Dec 18, 2015 at 12:51:07PM +1100, NeilBrown wrote:
> > If the reclaim thread doesn't have anything to reclaim,
> > r5l_run_no_space_stripes isn't called, and we might miss the retry.
> 
> so something like this:

That looks fine to me.  I'll give it a spin on my QA setup.

> > I'm a little worried about the GFP_ATOMIC allocation. For the first try,
> > GFP_NOWAIT is better. And on the other hand, why is sleeping bad here? We
> > could use GFP_NOIO | __GFP_NORETRY; there is no deadlock risk.
> >
> > For the retry, GFP_NOIO looks better. There is no deadlock there either,
> > since it's not called from raid5d (though if we use GFP_NOIO maybe we
> > shouldn't call it from the reclaim thread; a workqueue would be better).
> > Otherwise we could keep retrying but do nothing.
> 
> I did wonder a little bit about that.
> GFP_ATOMIC is (__GFP_HIGH)
> GFP_NOIO | __GFP_NORETRY is  (__GFP_WAIT | __GFP_NORETRY)
> 
> It isn't clear that we need 'HIGH', and WAIT with NORETRY should be OK.
> It allows __alloc_pages_direct_reclaim, but only once and never waits
> for other IO.

In general we go for HIGH on non-sleeping allocations to avoid having
them stalled.  This is especially important in the I/O path.

WAIT means we will reclaim and wait for it, which looks a little dangerous
to me.  In general I'd prefer not to use obscure gfp flag combinations
unless there is a real need, and they're clearly documented.



* Re: [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-18  1:58       ` Shaohua Li
@ 2015-12-18 11:25         ` Christoph Hellwig
  2015-12-18 23:07           ` Shaohua Li
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2015-12-18 11:25 UTC (permalink / raw)
  To: Shaohua Li; +Cc: NeilBrown, Christoph Hellwig, linux-raid

[can you trim your replies please?  I had to trim almost 150 lines of
 junk before getting to your reply!]

> >  	spin_unlock_irqrestore(&log->io_list_lock, flags);
> > 
> > or is that too simplistic?
> 
> Maybe add a new list and run the list at the beginning of r5l_do_reclaim().

What's the advantage of another list?


* Re: [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-18 11:25         ` Christoph Hellwig
@ 2015-12-18 23:07           ` Shaohua Li
  2015-12-20 22:59             ` NeilBrown
  0 siblings, 1 reply; 15+ messages in thread
From: Shaohua Li @ 2015-12-18 23:07 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: NeilBrown, linux-raid

On Fri, Dec 18, 2015 at 12:25:35PM +0100, Christoph Hellwig wrote:
> [can you trim your replies please?  I had to trim almost 150 lines of
>  junk before getting to your reply!]
> 
> > >  	spin_unlock_irqrestore(&log->io_list_lock, flags);
> > > 
> > > or is that too simplistic?
> > 
> > Maybe add a new list and run the list at the beginning of r5l_do_reclaim().
> 
> What's the advantage of another list?

I mean simply waking up the reclaim thread doesn't work, as r5l_do_reclaim()
will return if reclaimable == 0. There is no guarantee reclaimable space
is available when the allocation fails. So we'd better move the retry to
the beginning of r5l_do_reclaim(). And if we do that, we can't reuse the
no_space_stripes list.

Thanks,
Shaohua


* Re: [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-18 11:23       ` Christoph Hellwig
@ 2015-12-20 22:51         ` NeilBrown
  0 siblings, 0 replies; 15+ messages in thread
From: NeilBrown @ 2015-12-20 22:51 UTC (permalink / raw)
  Cc: Shaohua Li, Christoph Hellwig, linux-raid


On Fri, Dec 18 2015, Christoph Hellwig wrote:

> On Fri, Dec 18, 2015 at 12:51:07PM +1100, NeilBrown wrote:
>> > If the reclaim thread doesn't have anything to reclaim,
>> > r5l_run_no_space_stripes isn't called, and we might miss the retry.
>> 
>> so something like this:
>
> That looks fine to me.  I'll give it a spin on my QA setup.
>
>> > I'm a little worried about the GFP_ATOMIC allocation. For the first try,
>> > GFP_NOWAIT is better. And on the other hand, why is sleeping bad here? We
>> > could use GFP_NOIO | __GFP_NORETRY; there is no deadlock risk.
>> >
>> > For the retry, GFP_NOIO looks better. There is no deadlock there either,
>> > since it's not called from raid5d (though if we use GFP_NOIO maybe we
>> > shouldn't call it from the reclaim thread; a workqueue would be better).
>> > Otherwise we could keep retrying but do nothing.
>> 
>> I did wonder a little bit about that.
>> GFP_ATOMIC is (__GFP_HIGH)
>> GFP_NOIO | __GFP_NORETRY is  (__GFP_WAIT | __GFP_NORETRY)
>> 
>> It isn't clear that we need 'HIGH', and WAIT with NORETRY should be OK.
>> It allows __alloc_pages_direct_reclaim, but only once and never waits
>> for other IO.
>
> In general we go for HIGH on non-sleeping allocations to avoid having
> them stalled.  This is especially important in the I/O path.
>
> WAIT means we will reclaim and wait for it, which looks a little dangerous
> to me.  In general I'd prefer not to use obscure gfp flag combinations
> unless there is a real need, and they're clearly documented.

I don't think "will reclaim and wait for it" is accurate.

Certainly page_alloc will only try direct reclaim when WAIT is set, but
I don't think it actually waits for anything particular.

There are various waits in the direct reclaim path such as
throttle_vm_writeout(), but they seem to be throttling delays rather
than "wait for something particular" delays.
Also direct reclaim (e.g. in shrinkers) is allowed to sleep (as long as
it doesn't violate NOFS or NOIO), so waiting can definitely happen for
non-throttling reasons.

I *think* (not 100% sure) that __GFP_WAIT means that the alloc code is
allowed to schedule().  Without that flag it mustn't.

So it makes sense to use WAIT when it is appropriate for a delay to be
introduced to throttle allocations.  I don't think that is the case in
raid5d.  raid5d is too low level - throttling it will not directly
affect higher level allocations.

So while my reasoning is a bit different, I agree that we don't really
want WAIT in an allocation in raid5d.

.... oh.  I just noticed that __GFP_WAIT was renamed to __GFP_RECLAIM
last month.  And there is a new __GFP_DIRECT_RECLAIM.  I think I'm going
to have to learn how this stuff works again.
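
(For reference, a sketch of the new layout as of v4.4, from memory and
worth double-checking against include/linux/gfp.h:

	/* __GFP_WAIT became __GFP_RECLAIM, covering both reclaim modes: */
	#define __GFP_RECLAIM	(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)

so GFP_NOWAIT may still wake kswapd but never enters direct reclaim,
while GFP_NOIO, GFP_NOFS and GFP_KERNEL all include __GFP_RECLAIM.)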

NeilBrown



* Re: [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-18 23:07           ` Shaohua Li
@ 2015-12-20 22:59             ` NeilBrown
  2015-12-22 15:20               ` Christoph Hellwig
  0 siblings, 1 reply; 15+ messages in thread
From: NeilBrown @ 2015-12-20 22:59 UTC (permalink / raw)
  To: Shaohua Li, Christoph Hellwig; +Cc: linux-raid


On Sat, Dec 19 2015, Shaohua Li wrote:

> On Fri, Dec 18, 2015 at 12:25:35PM +0100, Christoph Hellwig wrote:
>> [can you trim your replies please?  I had to trim almost 150 lines of
>>  junk before getting to your reply!]
>> 
>> > >  	spin_unlock_irqrestore(&log->io_list_lock, flags);
>> > > 
>> > > or is that too simplistic?
>> > 
>> > Maybe add a new list and run the list at the beginning of r5l_do_reclaim().
>> 
>> What's the advantage of another list?
>
> I mean simply waking up the reclaim thread doesn't work, as r5l_do_reclaim()
> will return if reclaimable == 0. There is no guarantee reclaimable space
> is available when the allocation fails. So we'd better move the retry to
> the beginning of r5l_do_reclaim(). And if we do that, we can't reuse the
> no_space_stripes list.

They certainly are conceptually different lists.  Whether the same
linked list can be used for both is an engineering question.

One list is for updates that cannot be handled yet because the log is
full.  Once the head of the log is cleared and the start pointer
updated, those requests can be handled (or some of them can).

The other list is for updates that cannot be handled yet because a
kmalloc failed.  There is no clear trigger for when to handle those
again so we would need to retry quite frequently.  It would be easy to
miss retrying in rare circumstances...

I wonder if we should have a mempool for these io units too.
We would allocate with GFP_ATOMIC (or similar) so the allocation would
fail instead of blocking, but we would then know that an allocation
could only fail if there was another request in flight.  So the place
where we free an io_unit would be the obviously correct place to trigger
a retry of the delayed-due-to-mem-allocation-failure stripes.

So I think I would prefer two lists, another mempool, and very well
defined places to retry the two lists.  Is that over-engineering?

Thanks,
NeilBrown



* Re: [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-20 22:59             ` NeilBrown
@ 2015-12-22 15:20               ` Christoph Hellwig
  2015-12-22 22:29                 ` NeilBrown
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2015-12-22 15:20 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, linux-raid

> I wonder if we should have a mempool for these io units too.
> We would allocate with GFP_ATOMIC (or similar) so the allocation would
> fail instead of blocking, but we would then know that an allocation
> could only fail if there was another request in flight.  So the place
> where we free an io_unit would be the obviously correct place to trigger
> a retry of the delayed-due-to-mem-allocation-failure stripes.
> 
> So I think I would prefer two lists, another mempool, and very well
> defined places to retry the two lists.  Is that over-engineering?

How about the variant below (relative to md/for-next)?  This implements
the above and passes testing fine:

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 18de1fc..4fa9457 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -75,7 +75,10 @@ struct r5l_log {
 	struct list_head finished_ios;	/* io_units which settle down in log disk */
 	struct bio flush_bio;
 
+	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
+
 	struct kmem_cache *io_kc;
+	mempool_t *io_pool;
 	struct bio_set *bs;
 	mempool_t *meta_pool;
 
@@ -287,9 +290,10 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	struct r5l_io_unit *io;
 	struct r5l_meta_block *block;
 
-	io = kmem_cache_zalloc(log->io_kc, GFP_ATOMIC);
+	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
 	if (!io)
 		return NULL;
+	memset(io, 0, sizeof(*io));
 
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
@@ -490,24 +494,25 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	mutex_lock(&log->io_mutex);
 	/* meta + data */
 	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
-	if (!r5l_has_free_space(log, reserve))
-		goto err_retry;
+	if (!r5l_has_free_space(log, reserve)) {
+		spin_lock(&log->no_space_stripes_lock);
+		list_add_tail(&sh->log_list, &log->no_space_stripes);
+		spin_unlock(&log->no_space_stripes_lock);
+
+		r5l_wake_reclaim(log, reserve);
+		goto out_unlock;
+	}
 
 	ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
-	if (ret)
-		goto err_retry;
+	if (ret) {
+		spin_lock_irq(&log->io_list_lock);
+		list_add_tail(&sh->log_list, &log->no_mem_stripes);
+		spin_unlock_irq(&log->io_list_lock);
+	}
 
 out_unlock:
 	mutex_unlock(&log->io_mutex);
 	return 0;
-
-err_retry:
-	spin_lock(&log->no_space_stripes_lock);
-	list_add_tail(&sh->log_list, &log->no_space_stripes);
-	spin_unlock(&log->no_space_stripes_lock);
-
-	r5l_wake_reclaim(log, reserve);
-	goto out_unlock;
 }
 
 void r5l_write_stripe_run(struct r5l_log *log)
@@ -559,6 +564,21 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log)
 				 log->next_checkpoint);
 }
 
+static void r5l_run_no_mem_stripe(struct r5l_log *log)
+{
+	struct stripe_head *sh;
+
+	assert_spin_locked(&log->io_list_lock);
+
+	if (!list_empty(&log->no_mem_stripes)) {
+		sh = list_first_entry(&log->no_mem_stripes,
+				      struct stripe_head, log_list);
+		list_del_init(&sh->log_list);
+		set_bit(STRIPE_HANDLE, &sh->state);
+		raid5_release_stripe(sh);
+	}
+}
+
 static bool r5l_complete_finished_ios(struct r5l_log *log)
 {
 	struct r5l_io_unit *io, *next;
@@ -575,7 +595,8 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
 		log->next_cp_seq = io->seq;
 
 		list_del(&io->log_sibling);
-		kmem_cache_free(log->io_kc, io);
+		mempool_free(io, log->io_pool);
+		r5l_run_no_mem_stripe(log);
 
 		found = true;
 	}
@@ -1189,6 +1210,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->io_kc)
 		goto io_kc;
 
+	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
+	if (!log->io_pool)
+		goto io_pool;
+
 	log->bs = bioset_create(R5L_POOL_SIZE, 0);
 	if (!log->bs)
 		goto io_bs;
@@ -1203,6 +1228,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 		goto reclaim_thread;
 	init_waitqueue_head(&log->iounit_wait);
 
+	INIT_LIST_HEAD(&log->no_mem_stripes);
+
 	INIT_LIST_HEAD(&log->no_space_stripes);
 	spin_lock_init(&log->no_space_stripes_lock);
 
@@ -1219,6 +1246,8 @@ reclaim_thread:
 out_mempool:
 	bioset_free(log->bs);
 io_bs:
+	mempool_destroy(log->io_pool);
+io_pool:
 	kmem_cache_destroy(log->io_kc);
 io_kc:
 	kfree(log);


* Re: [PATCH 3/3] raid5: allow r5l_io_unit allocations to fail
  2015-12-22 15:20               ` Christoph Hellwig
@ 2015-12-22 22:29                 ` NeilBrown
  0 siblings, 0 replies; 15+ messages in thread
From: NeilBrown @ 2015-12-22 22:29 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Shaohua Li, linux-raid


On Wed, Dec 23 2015, Christoph Hellwig wrote:

>> I wonder if we should have a mempool for these io units too.
>> We would allocate with GFP_ATOMIC (or similar) so the allocation would
>> fail instead of blocking, but we would then know that an allocation
>> could only fail if there was another request in flight.  So the place
>> where we free an io_unit would be the obviously correct place to trigger
>> a retry of the delayed-due-to-mem-allocation-failure stripes.
>> 
>> So I think I would prefer two lists, another mempool, and very well
>> defined places to retry the two lists.  Is that over-engineering?
>
> How about the variant below (relative to md/for-next)?  This implements
> the above and passes testing fine:
>
> [...]

Yes, that looks just right - thanks.
I feel a lot more confident that this code won't deadlock.

Thanks,
NeilBrown


