All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] blk-mq: Avoid that I/O hangs in bt_get()
@ 2014-10-06 12:27 Bart Van Assche
  2014-10-06 17:40 ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Bart Van Assche @ 2014-10-06 12:27 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Robert Elliott, Ming Lei, Sagi Grimberg, linux-kernel

Ensure that bt_clear_tag() increments bs->wait_cnt if one or more
threads are waiting for a tag. Remove a superfluous
waitqueue_active() call from blk_mq_tag_wakeup_all(). This patch
avoids that bt_get() hangs as follows if the number of hardware
contexts is below the number of CPU threads:

INFO: task fio:6739 blocked for more than 120 seconds.
      Not tainted 3.17.0-rc7+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
fio             D ffff88085fcd2740     0  6739   6688 0x00000000
 ffff8807fb96f830 0000000000000086 ffff8808350eb000 ffff8807fb96ffd8
 0000000000012740 0000000000012740 ffff88085b885000 ffff8808350eb000
 ffff88085fcd2fe0 ffff8807fb96f920 ffff88080061ddc8 ffffe8ffffcc4600
Call Trace:
 [<ffffffff8142f52d>] io_schedule+0x9d/0x130
 [<ffffffff812016bf>] bt_get+0xef/0x180
 [<ffffffff8107f440>] ? prepare_to_wait_event+0x110/0x110
 [<ffffffff81201a0f>] blk_mq_get_tag+0x9f/0xd0
 [<ffffffff811fdedb>] __blk_mq_alloc_request+0x1b/0x200
 [<ffffffff811ff3bc>] blk_mq_map_request+0x15c/0x1b0
 [<ffffffff8120079e>] blk_mq_make_request+0x6e/0x270
 [<ffffffff8110a99f>] ? mempool_alloc+0x4f/0x130
 [<ffffffff811f3af0>] generic_make_request+0xc0/0x110
 [<ffffffff811f3bab>] submit_bio+0x6b/0x140
 [<ffffffff81110e4b>] ? set_page_dirty_lock+0x2b/0x40
 [<ffffffff811eea57>] ? bio_set_pages_dirty+0x47/0x60
 [<ffffffff811907d0>] do_blockdev_direct_IO+0x2080/0x3220
 [<ffffffff8118bac0>] ? I_BDEV+0x10/0x10
 [<ffffffff811919bc>] __blockdev_direct_IO+0x4c/0x50
 [<ffffffff8118bac0>] ? I_BDEV+0x10/0x10
 [<ffffffff8118c1de>] blkdev_direct_IO+0x4e/0x50
 [<ffffffff8118bac0>] ? I_BDEV+0x10/0x10
 [<ffffffff81109a13>] generic_file_read_iter+0x513/0x5e0
 [<ffffffff8119d8a7>] ? kiocb_free+0x37/0x40
 [<ffffffff8118c660>] ? ioctl_by_bdev+0x40/0x40
 [<ffffffff8118c697>] blkdev_read_iter+0x37/0x40
 [<ffffffff8119e6b4>] aio_run_iocb+0x1e4/0x3c0
 [<ffffffff8114dfa6>] ? kmem_cache_alloc+0xd6/0x3d0
 [<ffffffff8114e045>] ? kmem_cache_alloc+0x175/0x3d0
 [<ffffffff8119f5cc>] do_io_submit+0x11c/0x490
 [<ffffffff8119f950>] SyS_io_submit+0x10/0x20
 [<ffffffff81432fd2>] system_call_fastpath+0x16/0x1b

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@canonical.com>
Cc: Robert Elliott <Elliott@hp.com>
Cc: <stable@vger.kernel.org>
---
 block/blk-mq-tag.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index b5088f0..08d3b1c 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -80,8 +80,7 @@ static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
 	for (i = 0; i < BT_WAIT_QUEUES; i++) {
 		struct bt_wait_state *bs = &bt->bs[wake_index];
 
-		if (waitqueue_active(&bs->wait))
-			wake_up(&bs->wait);
+		wake_up(&bs->wait);
 
 		wake_index = bt_index_inc(wake_index);
 	}
@@ -346,20 +345,20 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
 	 */
 	clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
 
-	bs = bt_wake_ptr(bt);
-	if (!bs)
-		return;
-
-	wait_cnt = atomic_dec_return(&bs->wait_cnt);
-	if (wait_cnt == 0) {
-wake:
-		atomic_add(bt->wake_cnt, &bs->wait_cnt);
-		bt_index_atomic_inc(&bt->wake_index);
-		wake_up(&bs->wait);
-	} else if (wait_cnt < 0) {
-		wait_cnt = atomic_inc_return(&bs->wait_cnt);
-		if (!wait_cnt)
-			goto wake;
+	for (;;) {
+		bs = bt_wake_ptr(bt);
+		if (!bs)
+			return;
+
+		wait_cnt = atomic_dec_return(&bs->wait_cnt);
+		if (unlikely(wait_cnt < 0))
+			wait_cnt = atomic_inc_return(&bs->wait_cnt);
+		if (wait_cnt == 0) {
+			atomic_add(bt->wake_cnt, &bs->wait_cnt);
+			bt_index_atomic_inc(&bt->wake_index);
+			wake_up(&bs->wait);
+			return;
+		}
 	}
 }
 
-- 
1.8.4.5


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-mq: Avoid that I/O hangs in bt_get()
  2014-10-06 12:27 [PATCH] blk-mq: Avoid that I/O hangs in bt_get() Bart Van Assche
@ 2014-10-06 17:40 ` Jens Axboe
  2014-10-06 18:53   ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2014-10-06 17:40 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, Robert Elliott, Ming Lei, Sagi Grimberg, linux-kernel

On 10/06/2014 06:27 AM, Bart Van Assche wrote:
> Ensure that bt_clear_tag() increments bs->wait_cnt if one or more
> threads are waiting for a tag. Remove a superfluous
> waitqueue_active() call from blk_mq_tag_wakeup_all(). This patch
> avoids that bt_get() hangs as follows if the number of hardware
> contexts is below the number of CPU threads:
> 
> INFO: task fio:6739 blocked for more than 120 seconds.
>       Not tainted 3.17.0-rc7+ #1
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> fio             D ffff88085fcd2740     0  6739   6688 0x00000000
>  ffff8807fb96f830 0000000000000086 ffff8808350eb000 ffff8807fb96ffd8
>  0000000000012740 0000000000012740 ffff88085b885000 ffff8808350eb000
>  ffff88085fcd2fe0 ffff8807fb96f920 ffff88080061ddc8 ffffe8ffffcc4600
> Call Trace:
>  [<ffffffff8142f52d>] io_schedule+0x9d/0x130
>  [<ffffffff812016bf>] bt_get+0xef/0x180
>  [<ffffffff8107f440>] ? prepare_to_wait_event+0x110/0x110
>  [<ffffffff81201a0f>] blk_mq_get_tag+0x9f/0xd0
>  [<ffffffff811fdedb>] __blk_mq_alloc_request+0x1b/0x200
>  [<ffffffff811ff3bc>] blk_mq_map_request+0x15c/0x1b0
>  [<ffffffff8120079e>] blk_mq_make_request+0x6e/0x270
>  [<ffffffff8110a99f>] ? mempool_alloc+0x4f/0x130
>  [<ffffffff811f3af0>] generic_make_request+0xc0/0x110
>  [<ffffffff811f3bab>] submit_bio+0x6b/0x140
>  [<ffffffff81110e4b>] ? set_page_dirty_lock+0x2b/0x40
>  [<ffffffff811eea57>] ? bio_set_pages_dirty+0x47/0x60
>  [<ffffffff811907d0>] do_blockdev_direct_IO+0x2080/0x3220
>  [<ffffffff8118bac0>] ? I_BDEV+0x10/0x10
>  [<ffffffff811919bc>] __blockdev_direct_IO+0x4c/0x50
>  [<ffffffff8118bac0>] ? I_BDEV+0x10/0x10
>  [<ffffffff8118c1de>] blkdev_direct_IO+0x4e/0x50
>  [<ffffffff8118bac0>] ? I_BDEV+0x10/0x10
>  [<ffffffff81109a13>] generic_file_read_iter+0x513/0x5e0
>  [<ffffffff8119d8a7>] ? kiocb_free+0x37/0x40
>  [<ffffffff8118c660>] ? ioctl_by_bdev+0x40/0x40
>  [<ffffffff8118c697>] blkdev_read_iter+0x37/0x40
>  [<ffffffff8119e6b4>] aio_run_iocb+0x1e4/0x3c0
>  [<ffffffff8114dfa6>] ? kmem_cache_alloc+0xd6/0x3d0
>  [<ffffffff8114e045>] ? kmem_cache_alloc+0x175/0x3d0
>  [<ffffffff8119f5cc>] do_io_submit+0x11c/0x490
>  [<ffffffff8119f950>] SyS_io_submit+0x10/0x20
>  [<ffffffff81432fd2>] system_call_fastpath+0x16/0x1b
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ming Lei <ming.lei@canonical.com>
> Cc: Robert Elliott <Elliott@hp.com>
> Cc: <stable@vger.kernel.org>
> ---
>  block/blk-mq-tag.c | 31 +++++++++++++++----------------
>  1 file changed, 15 insertions(+), 16 deletions(-)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index b5088f0..08d3b1c 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -80,8 +80,7 @@ static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
>  	for (i = 0; i < BT_WAIT_QUEUES; i++) {
>  		struct bt_wait_state *bs = &bt->bs[wake_index];
>  
> -		if (waitqueue_active(&bs->wait))
> -			wake_up(&bs->wait);
> +		wake_up(&bs->wait);
>  
>  		wake_index = bt_index_inc(wake_index);
>  	}
> @@ -346,20 +345,20 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
>  	 */
>  	clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
>  
> -	bs = bt_wake_ptr(bt);
> -	if (!bs)
> -		return;
> -
> -	wait_cnt = atomic_dec_return(&bs->wait_cnt);
> -	if (wait_cnt == 0) {
> -wake:
> -		atomic_add(bt->wake_cnt, &bs->wait_cnt);
> -		bt_index_atomic_inc(&bt->wake_index);
> -		wake_up(&bs->wait);
> -	} else if (wait_cnt < 0) {
> -		wait_cnt = atomic_inc_return(&bs->wait_cnt);
> -		if (!wait_cnt)
> -			goto wake;
> +	for (;;) {
> +		bs = bt_wake_ptr(bt);
> +		if (!bs)
> +			return;
> +
> +		wait_cnt = atomic_dec_return(&bs->wait_cnt);
> +		if (unlikely(wait_cnt < 0))
> +			wait_cnt = atomic_inc_return(&bs->wait_cnt);
> +		if (wait_cnt == 0) {
> +			atomic_add(bt->wake_cnt, &bs->wait_cnt);
> +			bt_index_atomic_inc(&bt->wake_index);
> +			wake_up(&bs->wait);
> +			return;
> +		}
>  	}
>  }

I've been able to reproduce this this morning, and your patch does seem
to fix it. The inc/add logic is making my head spin a bit. And we now
end up banging a lot more on the waitqueue lock through
prepare_to_wait(), so there's a substantial performance regression to go
with the change.

I'll fiddle with this a bit and see if we can't retain existing
performance properties under tag contention, while still fixing the hang.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-mq: Avoid that I/O hangs in bt_get()
  2014-10-06 17:40 ` Jens Axboe
@ 2014-10-06 18:53   ` Jens Axboe
  2014-10-07  8:46     ` Bart Van Assche
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2014-10-06 18:53 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, Robert Elliott, Ming Lei, Sagi Grimberg, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 4995 bytes --]

On 10/06/2014 11:40 AM, Jens Axboe wrote:
> On 10/06/2014 06:27 AM, Bart Van Assche wrote:
>> Ensure that bt_clear_tag() increments bs->wait_cnt if one or more
>> threads are waiting for a tag. Remove a superfluous
>> waitqueue_active() call from blk_mq_tag_wakeup_all(). This patch
>> avoids that bt_get() hangs as follows if the number of hardware
>> contexts is below the number of CPU threads:
>>
>> INFO: task fio:6739 blocked for more than 120 seconds.
>>       Not tainted 3.17.0-rc7+ #1
>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> fio             D ffff88085fcd2740     0  6739   6688 0x00000000
>>  ffff8807fb96f830 0000000000000086 ffff8808350eb000 ffff8807fb96ffd8
>>  0000000000012740 0000000000012740 ffff88085b885000 ffff8808350eb000
>>  ffff88085fcd2fe0 ffff8807fb96f920 ffff88080061ddc8 ffffe8ffffcc4600
>> Call Trace:
>>  [<ffffffff8142f52d>] io_schedule+0x9d/0x130
>>  [<ffffffff812016bf>] bt_get+0xef/0x180
>>  [<ffffffff8107f440>] ? prepare_to_wait_event+0x110/0x110
>>  [<ffffffff81201a0f>] blk_mq_get_tag+0x9f/0xd0
>>  [<ffffffff811fdedb>] __blk_mq_alloc_request+0x1b/0x200
>>  [<ffffffff811ff3bc>] blk_mq_map_request+0x15c/0x1b0
>>  [<ffffffff8120079e>] blk_mq_make_request+0x6e/0x270
>>  [<ffffffff8110a99f>] ? mempool_alloc+0x4f/0x130
>>  [<ffffffff811f3af0>] generic_make_request+0xc0/0x110
>>  [<ffffffff811f3bab>] submit_bio+0x6b/0x140
>>  [<ffffffff81110e4b>] ? set_page_dirty_lock+0x2b/0x40
>>  [<ffffffff811eea57>] ? bio_set_pages_dirty+0x47/0x60
>>  [<ffffffff811907d0>] do_blockdev_direct_IO+0x2080/0x3220
>>  [<ffffffff8118bac0>] ? I_BDEV+0x10/0x10
>>  [<ffffffff811919bc>] __blockdev_direct_IO+0x4c/0x50
>>  [<ffffffff8118bac0>] ? I_BDEV+0x10/0x10
>>  [<ffffffff8118c1de>] blkdev_direct_IO+0x4e/0x50
>>  [<ffffffff8118bac0>] ? I_BDEV+0x10/0x10
>>  [<ffffffff81109a13>] generic_file_read_iter+0x513/0x5e0
>>  [<ffffffff8119d8a7>] ? kiocb_free+0x37/0x40
>>  [<ffffffff8118c660>] ? ioctl_by_bdev+0x40/0x40
>>  [<ffffffff8118c697>] blkdev_read_iter+0x37/0x40
>>  [<ffffffff8119e6b4>] aio_run_iocb+0x1e4/0x3c0
>>  [<ffffffff8114dfa6>] ? kmem_cache_alloc+0xd6/0x3d0
>>  [<ffffffff8114e045>] ? kmem_cache_alloc+0x175/0x3d0
>>  [<ffffffff8119f5cc>] do_io_submit+0x11c/0x490
>>  [<ffffffff8119f950>] SyS_io_submit+0x10/0x20
>>  [<ffffffff81432fd2>] system_call_fastpath+0x16/0x1b
>>
>> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: Ming Lei <ming.lei@canonical.com>
>> Cc: Robert Elliott <Elliott@hp.com>
>> Cc: <stable@vger.kernel.org>
>> ---
>>  block/blk-mq-tag.c | 31 +++++++++++++++----------------
>>  1 file changed, 15 insertions(+), 16 deletions(-)
>>
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index b5088f0..08d3b1c 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -80,8 +80,7 @@ static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
>>  	for (i = 0; i < BT_WAIT_QUEUES; i++) {
>>  		struct bt_wait_state *bs = &bt->bs[wake_index];
>>  
>> -		if (waitqueue_active(&bs->wait))
>> -			wake_up(&bs->wait);
>> +		wake_up(&bs->wait);
>>  
>>  		wake_index = bt_index_inc(wake_index);
>>  	}
>> @@ -346,20 +345,20 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
>>  	 */
>>  	clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
>>  
>> -	bs = bt_wake_ptr(bt);
>> -	if (!bs)
>> -		return;
>> -
>> -	wait_cnt = atomic_dec_return(&bs->wait_cnt);
>> -	if (wait_cnt == 0) {
>> -wake:
>> -		atomic_add(bt->wake_cnt, &bs->wait_cnt);
>> -		bt_index_atomic_inc(&bt->wake_index);
>> -		wake_up(&bs->wait);
>> -	} else if (wait_cnt < 0) {
>> -		wait_cnt = atomic_inc_return(&bs->wait_cnt);
>> -		if (!wait_cnt)
>> -			goto wake;
>> +	for (;;) {
>> +		bs = bt_wake_ptr(bt);
>> +		if (!bs)
>> +			return;
>> +
>> +		wait_cnt = atomic_dec_return(&bs->wait_cnt);
>> +		if (unlikely(wait_cnt < 0))
>> +			wait_cnt = atomic_inc_return(&bs->wait_cnt);
>> +		if (wait_cnt == 0) {
>> +			atomic_add(bt->wake_cnt, &bs->wait_cnt);
>> +			bt_index_atomic_inc(&bt->wake_index);
>> +			wake_up(&bs->wait);
>> +			return;
>> +		}
>>  	}
>>  }
> 
> I've been able to reproduce this this morning, and your patch does seem
> to fix it. The inc/add logic is making my head spin a bit. And we now
> end up banging a lot more on the waitqueue lock through
> prepare_to_wait(), so there's a substantial performance regression to go
> with the change.
> 
> I'll fiddle with this a bit and see if we can't retain existing
> performance properties under tag contention, while still fixing the hang.

So I think your patch fixes the issue because it just keeps decrementing
the wait counts, hence waking up a lot more than it should. This is also
why I see a huge increase in wait queue spinlock time.

Does this work for you? I think the issue is plainly that we end up
setting the batch counts too high. But tell me more about the number of
queues, the depth (total or per queue?), and the fio job you are running.

-- 
Jens Axboe


[-- Attachment #2: blk-mq-tag-wake-depth.patch --]
[-- Type: text/x-patch, Size: 442 bytes --]

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c1b9242..74a4168 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -463,8 +463,8 @@ static void bt_update_count(struct blk_mq_bitmap_tags *bt,
 	}
 
 	bt->wake_cnt = BT_WAIT_BATCH;
-	if (bt->wake_cnt > depth / 4)
-		bt->wake_cnt = max(1U, depth / 4);
+	if (bt->wake_cnt > depth / BT_WAIT_QUEUES)
+		bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES);
 
 	bt->depth = depth;
 }

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-mq: Avoid that I/O hangs in bt_get()
  2014-10-06 18:53   ` Jens Axboe
@ 2014-10-07  8:46     ` Bart Van Assche
  2014-10-07 14:44       ` Jens Axboe
  2014-11-06 13:41       ` Bart Van Assche
  0 siblings, 2 replies; 9+ messages in thread
From: Bart Van Assche @ 2014-10-07  8:46 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Robert Elliott, Ming Lei, Sagi Grimberg, linux-kernel

On 10/06/14 20:53, Jens Axboe wrote:
> On 10/06/2014 11:40 AM, Jens Axboe wrote:
>> I've been able to reproduce this this morning, and your patch does seem
>> to fix it. The inc/add logic is making my head spin a bit. And we now
>> end up banging a lot more on the waitqueue lock through
>> prepare_to_wait(), so there's a substantial performance regression to go
>> with the change.
>>
>> I'll fiddle with this a bit and see if we can't retain existing
>> performance properties under tag contention, while still fixing the hang.
> 
> So I think your patch fixes the issue because it just keeps decrementing
> the wait counts, hence waking up a lot more than it should. This is also
> why I see a huge increase in wait queue spinlock time.
> 
> Does this work for you? I think the issue is plainly that we end up
> setting the batch counts too high. But tell me more about the number of
> queues, the depth (total or per queue?), and the fio job you are running.

Hello Jens,

Thanks for looking into this. I can't reproduce the I/O lockup after 
having reverted my patch and after having applied your patch. In the 
test I ran fio was started with the following command-line options:
 
fio --bs=512 --ioengine=libaio --rw=randread --buffered=0 --numjobs=12 
--iodepth=128 --iodepth_batch=64 --iodepth_batch_complete=64 --thread 
--norandommap --loops=2147483648 --runtime=3600 --group_reporting 
--gtod_reduce=1 --name=/dev/sdo --filename=/dev/sdo --invalidate=1

This job was run on a system with 12 CPU threads and against a SCSI 
initiator driver for which the number of hardware contexts had been set 
to 6. Queue depth per hardware queue was set to 127:
$ cat /sys/class/scsi_host/host10/can_queue
127

This is what fio reports about the average queue depth:

IOdepths: 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=100.0%
  submit: 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=100.0%, >=64=0.0%
complete: 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=100.0%, >=64=0.0%

While we are at it, how about the patch below ? That patch shouldn't
change any functionality but should make bt_clear_tag() slightly easier
to read.

Thanks,

Bart.

[PATCH] blk-mq: Make bt_clear_tag() easier to read

Eliminate a backwards goto statement from bt_clear_tag().

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-mq-tag.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 3d1a956..2c63a2b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -351,15 +351,12 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
 		return;
 
 	wait_cnt = atomic_dec_return(&bs->wait_cnt);
+	if (unlikely(wait_cnt < 0))
+		wait_cnt = atomic_inc_return(&bs->wait_cnt);
 	if (wait_cnt == 0) {
-wake:
 		atomic_add(bt->wake_cnt, &bs->wait_cnt);
 		bt_index_atomic_inc(&bt->wake_index);
 		wake_up(&bs->wait);
-	} else if (wait_cnt < 0) {
-		wait_cnt = atomic_inc_return(&bs->wait_cnt);
-		if (!wait_cnt)
-			goto wake;
 	}
 }
 
-- 
1.8.4.5



^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-mq: Avoid that I/O hangs in bt_get()
  2014-10-07  8:46     ` Bart Van Assche
@ 2014-10-07 14:44       ` Jens Axboe
  2014-11-06 13:41       ` Bart Van Assche
  1 sibling, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2014-10-07 14:44 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, Robert Elliott, Ming Lei, Sagi Grimberg, linux-kernel

On 10/07/2014 02:46 AM, Bart Van Assche wrote:
> On 10/06/14 20:53, Jens Axboe wrote:
>> On 10/06/2014 11:40 AM, Jens Axboe wrote:
>>> I've been able to reproduce this this morning, and your patch does seem
>>> to fix it. The inc/add logic is making my head spin a bit. And we now
>>> end up banging a lot more on the waitqueue lock through
>>> prepare_to_wait(), so there's a substantial performance regression to go
>>> with the change.
>>>
>>> I'll fiddle with this a bit and see if we can't retain existing
>>> performance properties under tag contention, while still fixing the hang.
>>
>> So I think your patch fixes the issue because it just keeps decrementing
>> the wait counts, hence waking up a lot more than it should. This is also
>> why I see a huge increase in wait queue spinlock time.
>>
>> Does this work for you? I think the issue is plainly that we end up
>> setting the batch counts too high. But tell me more about the number of
>> queues, the depth (total or per queue?), and the fio job you are running.
> 
> Hello Jens,
> 
> Thanks for looking into this. I can't reproduce the I/O lockup after 
> having reverted my patch and after having applied your patch. In the 
> test I ran fio was started with the following command-line options:
>  
> fio --bs=512 --ioengine=libaio --rw=randread --buffered=0 --numjobs=12 
> --iodepth=128 --iodepth_batch=64 --iodepth_batch_complete=64 --thread 
> --norandommap --loops=2147483648 --runtime=3600 --group_reporting 
> --gtod_reduce=1 --name=/dev/sdo --filename=/dev/sdo --invalidate=1
> 
> This job was run on a system with 12 CPU threads and against a SCSI 
> initiator driver for which the number of hardware contexts had been set 
> to 6. Queue depth per hardware queue was set to 127:
> $ cat /sys/class/scsi_host/host10/can_queue
> 127
> 
> This is what fio reports about the average queue depth:
> 
> IOdepths: 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=100.0%
>   submit: 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=100.0%, >=64=0.0%
> complete: 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=100.0%, >=64=0.0%

Great, so that makes sense to me. I'll get the patch applied and marked
for stable. I'll mark it as fixes commit 4bb659b156996.

> 
> While we are at it, how about the patch below ? That patch shouldn't
> change any functionality but should make bt_clear_tag() slightly easier
> to read.

Agree, that looks cleaner and is more readable.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-mq: Avoid that I/O hangs in bt_get()
  2014-10-07  8:46     ` Bart Van Assche
  2014-10-07 14:44       ` Jens Axboe
@ 2014-11-06 13:41       ` Bart Van Assche
  2014-12-08 14:55         ` Bart Van Assche
  1 sibling, 1 reply; 9+ messages in thread
From: Bart Van Assche @ 2014-11-06 13:41 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Christoph Hellwig, Robert Elliott, linux-kernel

On 10/07/14 10:46, Bart Van Assche wrote:
> Thanks for looking into this. I can't reproduce the I/O lockup after
> having reverted my patch and after having applied your patch. In the
> test I ran fio was started with the following command-line options:
>
> fio --bs=512 --ioengine=libaio --rw=randread --buffered=0 --numjobs=12
> --iodepth=128 --iodepth_batch=64 --iodepth_batch_complete=64 --thread
> --norandommap --loops=2147483648 --runtime=3600 --group_reporting
> --gtod_reduce=1 --name=/dev/sdo --filename=/dev/sdo --invalidate=1
>
> This job was run on a system with 12 CPU threads and against a SCSI
> initiator driver for which the number of hardware contexts had been set
> to 6. Queue depth per hardware queue was set to 127:
> $ cat /sys/class/scsi_host/host10/can_queue
> 127

(replying to my own e-mail)

Hello Jens,

With kernel 3.18-rc3 and with can_queue=62 I can trigger a hang in 
bt_get() easily. The four call traces reported by echo w > 
/proc/sysrq-trigger are as follows:

Call Trace:
  [<ffffffff813d60ac>] io_schedule+0x9c/0x130
  [<ffffffff811b41bf>] bt_get+0xef/0x180
  [<ffffffff811b450f>] blk_mq_get_tag+0x9f/0xd0
  [<ffffffff811b09c6>] __blk_mq_alloc_request+0x16/0x1f0
  [<ffffffff811b1da3>] blk_mq_map_request+0x123/0x130
  [<ffffffff811b31c9>] blk_mq_make_request+0x69/0x280
  [<ffffffff811a8420>] generic_make_request+0xc0/0x110
  [<ffffffff811a84d4>] submit_bio+0x64/0x130
  [<ffffffff81147848>] do_blockdev_direct_IO+0x1dc8/0x2da0
  [<ffffffff81148867>] __blockdev_direct_IO+0x47/0x50
  [<ffffffff811435c9>] blkdev_direct_IO+0x49/0x50
  [<ffffffff810c8ce6>] generic_file_read_iter+0x546/0x610
  [<ffffffff81143962>] blkdev_read_iter+0x32/0x40
  [<ffffffff81155358>] aio_run_iocb+0x1f8/0x400
  [<ffffffff811562c1>] do_io_submit+0x121/0x490
  [<ffffffff8115663b>] SyS_io_submit+0xb/0x10
  [<ffffffff813d9612>] system_call_fastpath+0x12/0x17

Please let me know if you need more information.

Bart.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-mq: Avoid that I/O hangs in bt_get()
  2014-11-06 13:41       ` Bart Van Assche
@ 2014-12-08 14:55         ` Bart Van Assche
  2014-12-08 16:49           ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Bart Van Assche @ 2014-12-08 14:55 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Christoph Hellwig, Robert Elliott, linux-kernel

On 11/06/14 14:41, Bart Van Assche wrote:
> With kernel 3.18-rc3 and with can_queue=62 I can trigger a hang in
> bt_get() easily.

(once more replying to my own e-mail)

Hello Jens,

Finally I found the time to look further into this. The patch below 
seems to be sufficient to prevent this hang. However, I'm not a block 
layer expert so it's not clear to me whether the patch below makes sense ?

Thanks,

Bart.

[PATCH] blk-mq: Fix bt_get() hang

Avoid that if there are fewer hardware queues than CPU threads that
bt_get() can hang. The symptoms of the hang were as follows:
* All tags allocated for a particular hardware queue.
* (nr_tags) pending commands for that hardware queue.
* No pending commands for the software queues associated with that
   hardware queue.
---
  block/blk-mq-tag.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 67ab88b..e88af88 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -256,6 +256,8 @@ static int bt_get(struct blk_mq_alloc_data *data,
  			break;
  		}

+		blk_mq_run_hw_queue(hctx, false);
+
  		blk_mq_put_ctx(data->ctx);

  		io_schedule();

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-mq: Avoid that I/O hangs in bt_get()
  2014-12-08 14:55         ` Bart Van Assche
@ 2014-12-08 16:49           ` Jens Axboe
  2014-12-08 17:59             ` Bart Van Assche
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2014-12-08 16:49 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: Christoph Hellwig, Robert Elliott, linux-kernel

On 12/08/2014 07:55 AM, Bart Van Assche wrote:
> On 11/06/14 14:41, Bart Van Assche wrote:
>> With kernel 3.18-rc3 and with can_queue=62 I can trigger a hang in
>> bt_get() easily.
>
> (once more replying to my own e-mail)
>
> Hello Jens,
>
> Finally I found the time to look further into this. The patch below
> seems to be sufficient to prevent this hang. However, I'm not a block
> layer expert so it's not clear to me whether the patch below makes sense ?
>
> Thanks,
>
> Bart.
>
> [PATCH] blk-mq: Fix bt_get() hang
>
> Avoid that if there are fewer hardware queues than CPU threads that
> bt_get() can hang. The symptoms of the hang were as follows:
> * All tags allocated for a particular hardware queue.
> * (nr_tags) pending commands for that hardware queue.
> * No pending commands for the software queues associated with that
>    hardware queue.
> ---
>   block/blk-mq-tag.c | 2 ++
>   1 file changed, 2 insertions(+)
>
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 67ab88b..e88af88 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -256,6 +256,8 @@ static int bt_get(struct blk_mq_alloc_data *data,
>               break;
>           }
>
> +        blk_mq_run_hw_queue(hctx, false);
> +
>           blk_mq_put_ctx(data->ctx);
>
>           io_schedule();

Ah yes, that could be an issue for some cases, we do need to run the 
queue there. For a tag map shared across hardware queues, we might need 
to run more than just the current queue, however. For now we can safely 
assume that we allocate fairly, so it should not be an issue.

It might be worth experimenting with doing a __bt_get() after the queue 
run before going to sleep, in case the queue run found completions as well.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blk-mq: Avoid that I/O hangs in bt_get()
  2014-12-08 16:49           ` Jens Axboe
@ 2014-12-08 17:59             ` Bart Van Assche
  0 siblings, 0 replies; 9+ messages in thread
From: Bart Van Assche @ 2014-12-08 17:59 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Christoph Hellwig, Robert Elliott, linux-kernel

On 12/08/14 17:49, Jens Axboe wrote:
> On 12/08/2014 07:55 AM, Bart Van Assche wrote:
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index 67ab88b..e88af88 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -256,6 +256,8 @@ static int bt_get(struct blk_mq_alloc_data *data,
>>               break;
>>           }
>>
>> +        blk_mq_run_hw_queue(hctx, false);
>> +
>>           blk_mq_put_ctx(data->ctx);
>>
>>           io_schedule();
> 
> Ah yes, that could be an issue for some cases, we do need to run the 
> queue there. For a tag map shared across hardware queues, we might need 
> to run more than just the current queue, however. For now we can safely 
> assume that we allocate fairly, so it should not be an issue.
> 
> It might be worth experimenting with doing a __bt_get() after the queue 
> run before going to sleep, in case the queue run found completions as well.

Unless anyone objects I will start testing the following patch:

[PATCH] blk-mq: Fix bt_get() hang

Avoid that if there are fewer hardware queues than CPU threads that
bt_get() can hang. The symptoms of the hang were as follows:
* All tags allocated for a particular hardware queue.
* (nr_tags) pending commands for that hardware queue.
* No pending commands for the software queues associated with that
  hardware queue.

The call stack that corresponds to the hang is as follows:

io_schedule+0x9c/0x130
bt_get+0xef/0x180
blk_mq_get_tag+0x9f/0xd0
__blk_mq_alloc_request+0x16/0x1f0
blk_mq_map_request+0x123/0x130
blk_mq_make_request+0x69/0x280
generic_make_request+0xc0/0x110
submit_bio+0x64/0x130
do_blockdev_direct_IO+0x1dc8/0x2da0
__blockdev_direct_IO+0x47/0x50
blkdev_direct_IO+0x49/0x50
generic_file_read_iter+0x546/0x610
blkdev_read_iter+0x32/0x40
aio_run_iocb+0x1f8/0x400
do_io_submit+0x121/0x490
SyS_io_submit+0xb/0x10
system_call_fastpath+0x12/0x17
---
 block/blk-mq-tag.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c22491e..14ab120 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -256,6 +256,12 @@ static int bt_get(struct blk_mq_alloc_data *data,
 		if (tag != -1)
 			break;
 
+		blk_mq_run_hw_queue(hctx, false);
+
+		tag = __bt_get(hctx, bt, last_tag);
+		if (tag != -1)
+			break;
+
 		blk_mq_put_ctx(data->ctx);
 
 		io_schedule();
-- 
2.1.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2014-12-08 17:59 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-06 12:27 [PATCH] blk-mq: Avoid that I/O hangs in bt_get() Bart Van Assche
2014-10-06 17:40 ` Jens Axboe
2014-10-06 18:53   ` Jens Axboe
2014-10-07  8:46     ` Bart Van Assche
2014-10-07 14:44       ` Jens Axboe
2014-11-06 13:41       ` Bart Van Assche
2014-12-08 14:55         ` Bart Van Assche
2014-12-08 16:49           ` Jens Axboe
2014-12-08 17:59             ` Bart Van Assche

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.