* [PATCH] blk-mq: Fix races between iterating over requests and freeing requests
From: Bart Van Assche @ 2021-03-19  1:00 UTC
  To: Jens Axboe
  Cc: linux-block, Christoph Hellwig, Bart Van Assche, Ming Lei,
	John Garry, Khazhy Kumykov, Shinichiro Kawasaki

Multiple users have reported use-after-free complaints similar to the
following (see also https://lore.kernel.org/linux-block/1545261885.185366.488.camel@acm.org/):

BUG: KASAN: use-after-free in bt_iter+0x86/0xf0
Read of size 8 at addr ffff88803b335240 by task fio/21412

CPU: 0 PID: 21412 Comm: fio Tainted: G        W         4.20.0-rc6-dbg+ #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Call Trace:
 dump_stack+0x86/0xca
 print_address_description+0x71/0x239
 kasan_report.cold.5+0x242/0x301
 __asan_load8+0x54/0x90
 bt_iter+0x86/0xf0
 blk_mq_queue_tag_busy_iter+0x373/0x5e0
 blk_mq_in_flight+0x96/0xb0
 part_in_flight+0x40/0x140
 part_round_stats+0x18e/0x370
 blk_account_io_start+0x3d7/0x670
 blk_mq_bio_to_request+0x19c/0x3a0
 blk_mq_make_request+0x7a9/0xcb0
 generic_make_request+0x41d/0x960
 submit_bio+0x9b/0x250
 do_blockdev_direct_IO+0x435c/0x4c70
 __blockdev_direct_IO+0x79/0x88
 ext4_direct_IO+0x46c/0xc00
 generic_file_direct_write+0x119/0x210
 __generic_file_write_iter+0x11c/0x280
 ext4_file_write_iter+0x1b8/0x6f0
 aio_write+0x204/0x310
 io_submit_one+0x9d3/0xe80
 __x64_sys_io_submit+0x115/0x340
 do_syscall_64+0x71/0x210

When multiple request queues share a tag set and when switching the I/O
scheduler for one of the request queues that uses this tag set, the
following race can happen:
* blk_mq_tagset_busy_iter() calls bt_tags_iter() and bt_tags_iter() assigns
  a pointer to a scheduler request to the local variable 'rq'.
* blk_mq_sched_free_requests() is called to free hctx->sched_tags.
* blk_mq_tagset_busy_iter() dereferences 'rq' and triggers a use-after-free.

Fix this race as follows:
* Use rcu_assign_pointer() and srcu_dereference() to access hctx->tags->rqs[].
* Protect hctx->tags->rqs[] reads with an SRCU read-side lock.
* Call srcu_barrier() before freeing scheduler requests.
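
In outline, the resulting pattern is as follows (a sketch distilled from
the hunks below, not an additional change):

  /* Reader side, as in bt_iter() / bt_tags_iter(): */
  idx = srcu_read_lock(&blk_sched_srcu);
  rq = srcu_dereference(tags->rqs[bitnr], &blk_sched_srcu);
  if (rq)
          res = iter_data->fn(...);
  srcu_read_unlock(&blk_sched_srcu, idx);

  /* Updater side, as in __blk_mq_free_request(): */
  blk_mq_put_tag(hctx->tags, ctx, rq->tag);
  rcu_assign_pointer(hctx->tags->rqs[rq->tag], NULL);

  /* Before the scheduler requests are freed: */
  srcu_barrier(&blk_sched_srcu);
  blk_mq_sched_free_requests(q);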

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Khazhy Kumykov <khazhy@google.com>
Cc: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-core.c   |  9 ++++++++-
 block/blk-mq-tag.c | 29 +++++++++++++++++++----------
 block/blk-mq-tag.h |  2 +-
 block/blk-mq.c     | 21 +++++++++++++++++----
 block/blk-mq.h     |  1 +
 block/blk.h        |  7 +++++++
 6 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index fc60ff208497..88b65c59e605 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -52,6 +52,7 @@
 #include "blk-pm.h"
 #include "blk-rq-qos.h"
 
+DEFINE_SRCU(blk_sched_srcu);
 struct dentry *blk_debugfs_root;
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -412,8 +413,14 @@ void blk_cleanup_queue(struct request_queue *q)
 	 * it is safe to free requests now.
 	 */
 	mutex_lock(&q->sysfs_lock);
-	if (q->elevator)
+	if (q->elevator) {
+		/*
+		 * Barrier between clearing hctx->tags->rqs[] and freeing
+		 * scheduler requests.
+		 */
+		srcu_barrier(&blk_sched_srcu);
 		blk_mq_sched_free_requests(q);
+	}
 	mutex_unlock(&q->sysfs_lock);
 
 	percpu_ref_exit(&q->q_usage_counter);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9c92053e704d..f3afaf1520cd 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -206,18 +206,24 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	struct blk_mq_tags *tags = hctx->tags;
 	bool reserved = iter_data->reserved;
 	struct request *rq;
+	bool res = true;
+	int idx;
 
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
-	rq = tags->rqs[bitnr];
+
+	idx = srcu_read_lock(&blk_sched_srcu);
+	rq = srcu_dereference(tags->rqs[bitnr], &blk_sched_srcu);
 
 	/*
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assigning ->rqs[].
 	 */
 	if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
-		return iter_data->fn(hctx, rq, iter_data->data, reserved);
-	return true;
+		res = iter_data->fn(hctx, rq, iter_data->data, reserved);
+	srcu_read_unlock(&blk_sched_srcu, idx);
+
+	return res;
 }
 
 /**
@@ -264,10 +270,13 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	struct blk_mq_tags *tags = iter_data->tags;
 	bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
 	struct request *rq;
+	bool res = true;
+	int idx;
 
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
 
+	idx = srcu_read_lock(&blk_sched_srcu);
 	/*
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assigning ->rqs[].
@@ -275,13 +284,13 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
 		rq = tags->static_rqs[bitnr];
 	else
-		rq = tags->rqs[bitnr];
-	if (!rq)
-		return true;
-	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
-	    !blk_mq_request_started(rq))
-		return true;
-	return iter_data->fn(rq, iter_data->data, reserved);
+		rq = srcu_dereference(tags->rqs[bitnr], &blk_sched_srcu);
+	if (rq && (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
+		   blk_mq_request_started(rq)))
+		res = iter_data->fn(rq, iter_data->data, reserved);
+	srcu_read_unlock(&blk_sched_srcu, idx);
+
+	return res;
 }
 
 /**
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 7d3e6b333a4a..7a6d04733261 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -17,7 +17,7 @@ struct blk_mq_tags {
 	struct sbitmap_queue __bitmap_tags;
 	struct sbitmap_queue __breserved_tags;
 
-	struct request **rqs;
+	struct request __rcu **rqs;
 	struct request **static_rqs;
 	struct list_head page_list;
 };
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d4d7c1caa439..88f23a02c7c3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -495,8 +495,10 @@ static void __blk_mq_free_request(struct request *rq)
 	blk_crypto_free_request(rq);
 	blk_pm_mark_last_busy(rq);
 	rq->mq_hctx = NULL;
-	if (rq->tag != BLK_MQ_NO_TAG)
+	if (rq->tag != BLK_MQ_NO_TAG) {
 		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
+		rcu_assign_pointer(hctx->tags->rqs[rq->tag], NULL);
+	}
 	if (sched_tag != BLK_MQ_NO_TAG)
 		blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
 	blk_mq_sched_restart(hctx);
@@ -838,9 +840,20 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
+	struct request *rq;
+
 	if (tag < tags->nr_tags) {
-		prefetch(tags->rqs[tag]);
-		return tags->rqs[tag];
+		/*
+		 * The srcu dereference below is protected by the request
+		 * queue usage count. We can only verify that usage count after
+		 * having read the request pointer.
+		 */
+		rq = srcu_dereference_check(tags->rqs[tag], &blk_sched_srcu,
+					    true);
+		WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && rq &&
+			     percpu_ref_is_zero(&rq->q->q_usage_counter));
+		prefetch(rq);
+		return rq;
 	}
 
 	return NULL;
@@ -1111,7 +1124,7 @@ static bool blk_mq_get_driver_tag(struct request *rq)
 		rq->rq_flags |= RQF_MQ_INFLIGHT;
 		__blk_mq_inc_active_requests(hctx);
 	}
-	hctx->tags->rqs[rq->tag] = rq;
+	rcu_assign_pointer(hctx->tags->rqs[rq->tag], rq);
 	return true;
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3616453ca28c..9ccb1818303b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -226,6 +226,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
 					   struct request *rq)
 {
 	blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
+	rcu_assign_pointer(hctx->tags->rqs[rq->tag], NULL);
 	rq->tag = BLK_MQ_NO_TAG;
 
 	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
diff --git a/block/blk.h b/block/blk.h
index 3b53e44b967e..28c574cabb91 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -14,6 +14,8 @@
 /* Max future timer expiry for timeouts */
 #define BLK_MAX_TIMEOUT		(5 * HZ)
 
+extern struct srcu_struct blk_sched_srcu;
+
 extern struct dentry *blk_debugfs_root;
 
 struct blk_flush_queue {
@@ -203,6 +205,11 @@ static inline void elevator_exit(struct request_queue *q,
 {
 	lockdep_assert_held(&q->sysfs_lock);
 
+	/*
+	 * Barrier between clearing hctx->tags->rqs[] and freeing scheduler
+	 * requests.
+	 */
+	srcu_barrier(&blk_sched_srcu);
 	blk_mq_sched_free_requests(q);
 	__elevator_exit(q, e);
 }


* Re: [PATCH] blk-mq: Fix races between iterating over requests and freeing requests
From: Shinichiro Kawasaki @ 2021-03-22  0:42 UTC
  To: Bart Van Assche
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Ming Lei, John Garry,
	Khazhy Kumykov

On Mar 18, 2021 / 18:00, Bart Van Assche wrote:
> Multiple users have reported use-after-free complaints similar to the
> following (see also https://lore.kernel.org/linux-block/1545261885.185366.488.camel@acm.org/):
> 
> BUG: KASAN: use-after-free in bt_iter+0x86/0xf0
> Read of size 8 at addr ffff88803b335240 by task fio/21412
> 
> CPU: 0 PID: 21412 Comm: fio Tainted: G        W         4.20.0-rc6-dbg+ #3
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
> Call Trace:
>  dump_stack+0x86/0xca
>  print_address_description+0x71/0x239
>  kasan_report.cold.5+0x242/0x301
>  __asan_load8+0x54/0x90
>  bt_iter+0x86/0xf0
>  blk_mq_queue_tag_busy_iter+0x373/0x5e0
>  blk_mq_in_flight+0x96/0xb0
>  part_in_flight+0x40/0x140
>  part_round_stats+0x18e/0x370
>  blk_account_io_start+0x3d7/0x670
>  blk_mq_bio_to_request+0x19c/0x3a0
>  blk_mq_make_request+0x7a9/0xcb0
>  generic_make_request+0x41d/0x960
>  submit_bio+0x9b/0x250
>  do_blockdev_direct_IO+0x435c/0x4c70
>  __blockdev_direct_IO+0x79/0x88
>  ext4_direct_IO+0x46c/0xc00
>  generic_file_direct_write+0x119/0x210
>  __generic_file_write_iter+0x11c/0x280
>  ext4_file_write_iter+0x1b8/0x6f0
>  aio_write+0x204/0x310
>  io_submit_one+0x9d3/0xe80
>  __x64_sys_io_submit+0x115/0x340
>  do_syscall_64+0x71/0x210
> 
> When multiple request queues share a tag set and when switching the I/O
> scheduler for one of the request queues that uses this tag set, the
> following race can happen:
> * blk_mq_tagset_busy_iter() calls bt_tags_iter() and bt_tags_iter() assigns
>   a pointer to a scheduler request to the local variable 'rq'.
> * blk_mq_sched_free_requests() is called to free hctx->sched_tags.
> * blk_mq_tagset_busy_iter() dereferences 'rq' and triggers a use-after-free.
> 
> Fix this race as follows:
> * Use rcu_assign_pointer() and srcu_dereference() to access hctx->tags->rqs[].
> * Protect hctx->tags->rqs[] reads with an SRCU read-side lock.
> * Call srcu_barrier() before freeing scheduler requests.
> 
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ming Lei <ming.lei@redhat.com>
> Cc: John Garry <john.garry@huawei.com>
> Cc: Khazhy Kumykov <khazhy@google.com>
> Cc: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>

I confirmed that this patch avoids the KASAN use-after-free message observed
during a blktests block/005 run on HDDs behind a SAS HBA. It looks good from
a testing point of view.

Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>

-- 
Best Regards,
Shin'ichiro Kawasaki


* Re: [PATCH] blk-mq: Fix races between iterating over requests and freeing requests
From: Khazhy Kumykov @ 2021-03-22 22:39 UTC
  To: Bart Van Assche
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Ming Lei, John Garry,
	Shinichiro Kawasaki


On Thu, Mar 18, 2021 at 6:00 PM Bart Van Assche <bvanassche@acm.org> wrote:
>
> Multiple users have reported use-after-free complaints similar to the
> following (see also https://lore.kernel.org/linux-block/1545261885.185366.488.camel@acm.org/):

This fixes the crashes I was seeing. I also looked over the patch and
the dereferencing rules seem correct, although that q_usage_counter
check in the bad case seems racy itself? Thanks!
Reviewed-by: Khazhismel Kumykov <khazhy@google.com>



* Re: [PATCH] blk-mq: Fix races between iterating over requests and freeing requests
From: Bart Van Assche @ 2021-03-22 23:37 UTC
  To: Khazhy Kumykov
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Ming Lei, John Garry,
	Shinichiro Kawasaki

On 3/22/21 3:39 PM, Khazhy Kumykov wrote:
> On Thu, Mar 18, 2021 at 6:00 PM Bart Van Assche <bvanassche@acm.org> wrote:
>>
>> Multiple users have reported use-after-free complaints similar to the
>> following (see also https://lore.kernel.org/linux-block/1545261885.185366.488.camel@acm.org/):
> 
> This fixes the crashes I was seeing. I also looked over the patch and
> the dereferencing rules seem correct, although that q_usage_counter
> check in the bad case seems racy itself? Thanks!
> Reviewed-By: Khazhismel Kumykov <khazhy@google.com>

Thanks Khazhy for the review and the testing. The purpose of the
q_usage_counter check in blk_mq_tag_to_rq() is to catch calls of
blk_mq_tag_to_rq() from outside .queue_rq() that happen during or after
queue deletion. Maybe I should change that check into a test of
QUEUE_FLAG_DYING.
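
For illustration, such a check might look as follows (an untested sketch,
not part of the posted patch; blk_queue_dying() tests QUEUE_FLAG_DYING):

  rq = srcu_dereference_check(tags->rqs[tag], &blk_sched_srcu, true);
  /* Hypothetical replacement for the q_usage_counter test: */
  WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && rq &&
               blk_queue_dying(rq->q));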

Bart.




* Re: [PATCH] blk-mq: Fix races between iterating over requests and freeing requests
From: John Garry @ 2021-03-23 12:34 UTC
  To: Bart Van Assche, Jens Axboe
  Cc: linux-block, Christoph Hellwig, Ming Lei, Khazhy Kumykov,
	Shinichiro Kawasaki

On 19/03/2021 01:00, Bart Van Assche wrote:
> Multiple users have reported use-after-free complaints similar to the
> following (see also https://lore.kernel.org/linux-block/1545261885.185366.488.camel@acm.org/):
> 
> BUG: KASAN: use-after-free in bt_iter+0x86/0xf0
> Read of size 8 at addr ffff88803b335240 by task fio/21412
> 
> CPU: 0 PID: 21412 Comm: fio Tainted: G        W         4.20.0-rc6-dbg+ #3
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
> Call Trace:
>   dump_stack+0x86/0xca
>   print_address_description+0x71/0x239
>   kasan_report.cold.5+0x242/0x301
>   __asan_load8+0x54/0x90
>   bt_iter+0x86/0xf0
>   blk_mq_queue_tag_busy_iter+0x373/0x5e0
>   blk_mq_in_flight+0x96/0xb0
>   part_in_flight+0x40/0x140
>   part_round_stats+0x18e/0x370
>   blk_account_io_start+0x3d7/0x670
>   blk_mq_bio_to_request+0x19c/0x3a0
>   blk_mq_make_request+0x7a9/0xcb0
>   generic_make_request+0x41d/0x960
>   submit_bio+0x9b/0x250
>   do_blockdev_direct_IO+0x435c/0x4c70
>   __blockdev_direct_IO+0x79/0x88
>   ext4_direct_IO+0x46c/0xc00
>   generic_file_direct_write+0x119/0x210
>   __generic_file_write_iter+0x11c/0x280
>   ext4_file_write_iter+0x1b8/0x6f0
>   aio_write+0x204/0x310
>   io_submit_one+0x9d3/0xe80
>   __x64_sys_io_submit+0x115/0x340
>   do_syscall_64+0x71/0x210
> 

Hi Bart,

Do we have any performance figures to say that the effect is negligible?

FYI, I did some testing on this, and it appears to solve the problems in
all known scenarios, including interactions between
blk_mq_tagset_busy_iter() and blk_mq_queue_tag_busy_iter().

Thanks,
John

> When multiple request queues share a tag set and when switching the I/O
> scheduler for one of the request queues that uses this tag set, the
> following race can happen:
> * blk_mq_tagset_busy_iter() calls bt_tags_iter() and bt_tags_iter() assigns
>    a pointer to a scheduler request to the local variable 'rq'.
> * blk_mq_sched_free_requests() is called to free hctx->sched_tags.
> * blk_mq_tagset_busy_iter() dereferences 'rq' and triggers a use-after-free.
> 
> Fix this race as follows:
> * Use rcu_assign_pointer() and srcu_dereference() to access hctx->tags->rqs[].
> * Protect hctx->tags->rqs[] reads with an SRCU read-side lock.
> * Call srcu_barrier() before freeing scheduler requests.
> 
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Ming Lei <ming.lei@redhat.com>
> Cc: John Garry <john.garry@huawei.com>
> Cc: Khazhy Kumykov <khazhy@google.com>
> Cc: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>   block/blk-core.c   |  9 ++++++++-
>   block/blk-mq-tag.c | 29 +++++++++++++++++++----------
>   block/blk-mq-tag.h |  2 +-
>   block/blk-mq.c     | 21 +++++++++++++++++----
>   block/blk-mq.h     |  1 +
>   block/blk.h        |  7 +++++++
>   6 files changed, 53 insertions(+), 16 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index fc60ff208497..88b65c59e605 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -52,6 +52,7 @@
>   #include "blk-pm.h"
>   #include "blk-rq-qos.h"
>   
> +DEFINE_SRCU(blk_sched_srcu);

out of interest, any reason that this is global and not per tagset?

>   struct dentry *blk_debugfs_root;
>   
>   EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
> @@ -412,8 +413,14 @@ void blk_cleanup_queue(struct request_queue *q)
>   	 * it is safe to free requests now.
>   	 */
>   	mutex_lock(&q->sysfs_lock);
> -	if (q->elevator)
> +	if (q->elevator) {
> +		/*
> +		 * Barrier between clearing hctx->tags->rqs[] and freeing
> +		 * scheduler requests.
> +		 */
> +		srcu_barrier(&blk_sched_srcu);
>   		blk_mq_sched_free_requests(q);
> +	}
>   	mutex_unlock(&q->sysfs_lock);
>   
>   	percpu_ref_exit(&q->q_usage_counter);
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 9c92053e704d..f3afaf1520cd 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -206,18 +206,24 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>   	struct blk_mq_tags *tags = hctx->tags;
>   	bool reserved = iter_data->reserved;
>   	struct request *rq;
> +	bool res = true;
> +	int idx;
>   
>   	if (!reserved)
>   		bitnr += tags->nr_reserved_tags;
> -	rq = tags->rqs[bitnr];
> +
> +	idx = srcu_read_lock(&blk_sched_srcu);
> +	rq = srcu_dereference(tags->rqs[bitnr], &blk_sched_srcu);
>   
>   	/*
>   	 * We can hit rq == NULL here, because the tagging functions
>   	 * test and set the bit before assigning ->rqs[].
>   	 */
>   	if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
> -		return iter_data->fn(hctx, rq, iter_data->data, reserved);
> -	return true;
> +		res = iter_data->fn(hctx, rq, iter_data->data, reserved);
> +	srcu_read_unlock(&blk_sched_srcu, idx);
> +
> +	return res;
>   }
>   
>   /**
> @@ -264,10 +270,13 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>   	struct blk_mq_tags *tags = iter_data->tags;
>   	bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
>   	struct request *rq;
> +	bool res = true;
> +	int idx;
>   
>   	if (!reserved)
>   		bitnr += tags->nr_reserved_tags;
>   
> +	idx = srcu_read_lock(&blk_sched_srcu);
>   	/*
>   	 * We can hit rq == NULL here, because the tagging functions
>   	 * test and set the bit before assigning ->rqs[].
> @@ -275,13 +284,13 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>   	if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
>   		rq = tags->static_rqs[bitnr];
>   	else
> -		rq = tags->rqs[bitnr];
> -	if (!rq)
> -		return true;
> -	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
> -	    !blk_mq_request_started(rq))
> -		return true;
> -	return iter_data->fn(rq, iter_data->data, reserved);
> +		rq = srcu_dereference(tags->rqs[bitnr], &blk_sched_srcu);
> +	if (rq && (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
> +		   blk_mq_request_started(rq)))
> +		res = iter_data->fn(rq, iter_data->data, reserved);
> +	srcu_read_unlock(&blk_sched_srcu, idx);
> +
> +	return res;
>   }
>   
>   /**
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index 7d3e6b333a4a..7a6d04733261 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -17,7 +17,7 @@ struct blk_mq_tags {
>   	struct sbitmap_queue __bitmap_tags;
>   	struct sbitmap_queue __breserved_tags;
>   
> -	struct request **rqs;
> +	struct request __rcu **rqs;
>   	struct request **static_rqs;
>   	struct list_head page_list;
>   };
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d4d7c1caa439..88f23a02c7c3 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -495,8 +495,10 @@ static void __blk_mq_free_request(struct request *rq)
>   	blk_crypto_free_request(rq);
>   	blk_pm_mark_last_busy(rq);
>   	rq->mq_hctx = NULL;
> -	if (rq->tag != BLK_MQ_NO_TAG)
> +	if (rq->tag != BLK_MQ_NO_TAG) {
>   		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
> +		rcu_assign_pointer(hctx->tags->rqs[rq->tag], NULL);
> +	}
>   	if (sched_tag != BLK_MQ_NO_TAG)
>   		blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
>   	blk_mq_sched_restart(hctx);
> @@ -838,9 +840,20 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
>   
>   struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
>   {
> +	struct request *rq;
> +
>   	if (tag < tags->nr_tags) {
> -		prefetch(tags->rqs[tag]);
> -		return tags->rqs[tag];
> +		/*
> +		 * The srcu dereference below is protected by the request
> +		 * queue usage count. We can only verify that usage count after
> +		 * having read the request pointer.
> +		 */
> +		rq = srcu_dereference_check(tags->rqs[tag], &blk_sched_srcu,
> +					    true);
> +		WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && rq &&
> +			     percpu_ref_is_zero(&rq->q->q_usage_counter));
> +		prefetch(rq);
> +		return rq;
>   	}
>   
>   	return NULL;
> @@ -1111,7 +1124,7 @@ static bool blk_mq_get_driver_tag(struct request *rq)
>   		rq->rq_flags |= RQF_MQ_INFLIGHT;
>   		__blk_mq_inc_active_requests(hctx);
>   	}
> -	hctx->tags->rqs[rq->tag] = rq;
> +	rcu_assign_pointer(hctx->tags->rqs[rq->tag], rq);
>   	return true;
>   }
>   
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 3616453ca28c..9ccb1818303b 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -226,6 +226,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
>   					   struct request *rq)
>   {
>   	blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
> +	rcu_assign_pointer(hctx->tags->rqs[rq->tag], NULL);
>   	rq->tag = BLK_MQ_NO_TAG;
>   
>   	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
> diff --git a/block/blk.h b/block/blk.h
> index 3b53e44b967e..28c574cabb91 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -14,6 +14,8 @@
>   /* Max future timer expiry for timeouts */
>   #define BLK_MAX_TIMEOUT		(5 * HZ)
>   
> +extern struct srcu_struct blk_sched_srcu;
> +
>   extern struct dentry *blk_debugfs_root;
>   
>   struct blk_flush_queue {
> @@ -203,6 +205,11 @@ static inline void elevator_exit(struct request_queue *q,
>   {
>   	lockdep_assert_held(&q->sysfs_lock);
>   
> +	/*
> +	 * Barrier between clearing hctx->tags->rqs[] and freeing scheduler
> +	 * requests.
> +	 */
> +	srcu_barrier(&blk_sched_srcu);
>   	blk_mq_sched_free_requests(q);
>   	__elevator_exit(q, e);
>   }
> .
> 



* Re: [PATCH] blk-mq: Fix races between iterating over requests and freeing requests
From: Jens Axboe @ 2021-03-23 15:39 UTC
  To: John Garry, Bart Van Assche
  Cc: linux-block, Christoph Hellwig, Ming Lei, Khazhy Kumykov,
	Shinichiro Kawasaki

On 3/23/21 6:34 AM, John Garry wrote:
> On 19/03/2021 01:00, Bart Van Assche wrote:
>> Multiple users have reported use-after-free complaints similar to the
>> following (see also https://lore.kernel.org/linux-block/1545261885.185366.488.camel@acm.org/):
>>
>> BUG: KASAN: use-after-free in bt_iter+0x86/0xf0
>> Read of size 8 at addr ffff88803b335240 by task fio/21412
>>
>> CPU: 0 PID: 21412 Comm: fio Tainted: G        W         4.20.0-rc6-dbg+ #3
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
>> Call Trace:
>>   dump_stack+0x86/0xca
>>   print_address_description+0x71/0x239
>>   kasan_report.cold.5+0x242/0x301
>>   __asan_load8+0x54/0x90
>>   bt_iter+0x86/0xf0
>>   blk_mq_queue_tag_busy_iter+0x373/0x5e0
>>   blk_mq_in_flight+0x96/0xb0
>>   part_in_flight+0x40/0x140
>>   part_round_stats+0x18e/0x370
>>   blk_account_io_start+0x3d7/0x670
>>   blk_mq_bio_to_request+0x19c/0x3a0
>>   blk_mq_make_request+0x7a9/0xcb0
>>   generic_make_request+0x41d/0x960
>>   submit_bio+0x9b/0x250
>>   do_blockdev_direct_IO+0x435c/0x4c70
>>   __blockdev_direct_IO+0x79/0x88
>>   ext4_direct_IO+0x46c/0xc00
>>   generic_file_direct_write+0x119/0x210
>>   __generic_file_write_iter+0x11c/0x280
>>   ext4_file_write_iter+0x1b8/0x6f0
>>   aio_write+0x204/0x310
>>   io_submit_one+0x9d3/0xe80
>>   __x64_sys_io_submit+0x115/0x340
>>   do_syscall_64+0x71/0x210
>>
> 
> Hi Bart,
> 
> Do we have any performance figures to say that the effect is negligible?

I ran this through my usual peak testing, it's pretty good at finding
any changes in performance related to changes in overhead. The workload
is a pretty simple 512b random read, QD 128, using io_uring and polled
IO.

It seems to cause a slight slowdown for me. Performance before the patch
is around 3.23-3.27M IOPS, and after we're at around 3.20-3.22M. Looking
at perf diff, the most interesting bits seem to be:


2.09%     -1.05%  [kernel.vmlinux]  [k] blk_mq_get_tag
0.48%     +0.98%  [kernel.vmlinux]  [k] __do_sys_io_uring_enter
1.49%     +0.85%  [kernel.vmlinux]  [k] __blk_mq_alloc_request
          +0.71%  [kernel.vmlinux]  [k] __blk_mq_free_request

which seems to show some shifting around of cost (often happens), but
generally up a bit looking at the blk side.

So nothing really major here, and I don't think it's something that
should stand in the way of getting this fixed. John, I can run your
series through the same, let me know.

-- 
Jens Axboe



* Re: [PATCH] blk-mq: Fix races between iterating over requests and freeing requests
From: John Garry @ 2021-03-23 16:26 UTC
  To: Jens Axboe, Bart Van Assche
  Cc: linux-block, Christoph Hellwig, Ming Lei, Khazhy Kumykov,
	Shinichiro Kawasaki


>>
>> Do we have any performance figures to say that the effect is negligible?
> 
> I ran this through my usual peak testing, it's pretty good at finding
> any changes in performance related to changes in overhead. The workload
> is a pretty simple 512b random read, QD 128, using io_uring and polled
> IO.
> 
> It seems to cause a slight slowdown for me. Performance before the patch
> is around 3.23-3.27M IOPS, and after we're at around 3.20-3.22M. Looking
> at perf diff, the most interesting bits seem to be:
> 
> 
> 2.09%     -1.05%  [kernel.vmlinux]  [k] blk_mq_get_tag
> 0.48%     +0.98%  [kernel.vmlinux]  [k] __do_sys_io_uring_enter
> 1.49%     +0.85%  [kernel.vmlinux]  [k] __blk_mq_alloc_request
>            +0.71%  [kernel.vmlinux]  [k] __blk_mq_free_request
> 
> which seems to show some shifting around of cost (often happens), but
> generally up a bit looking at the blk side.
> 
> So nothing really major here, and I don't think it's something that
> should stand in the way of getting this fixed.

> John, I can run your series through the same,
> let me know.
> 

The first patch in my series also solves the issues reported by the
community:
https://lore.kernel.org/linux-block/1614957294-188540-2-git-send-email-john.garry@huawei.com/

And that patch should cause no throughput performance hit.

The other two patches in that series address more theoretical issues of
iterator interactions, but their approach is unacceptable.

Obviously it's your call. IMHO, Bart's solution makes more sense.

Thanks,
John


* Re: [PATCH] blk-mq: Fix races between iterating over requests and freeing requests
From: Bart Van Assche @ 2021-03-23 16:53 UTC
  To: John Garry, Jens Axboe
  Cc: linux-block, Christoph Hellwig, Ming Lei, Khazhy Kumykov,
	Shinichiro Kawasaki

On 3/23/21 5:34 AM, John Garry wrote:
> Do we have any performance figures to say that the effect is negligible?

Jens has been so kind as to run a quick performance test (thanks Jens!).

> FYI, I did some testing on this, and it appears to solve the problems in
> all known scenarios, including interactions between
> blk_mq_tagset_busy_iter() and blk_mq_queue_tag_busy_iter().

Thanks for the testing!

>>   +DEFINE_SRCU(blk_sched_srcu);
> 
> out of interest, any reason that this is global and not per tagset?

That's a great question. Making it global was the easiest approach for
evaluating and testing an SRCU-based solution. I can change the approach
to one SRCU struct per tag set, but that will increase the size of each
tag set. An SRCU structure is significant in size, and in addition to
this structure SRCU allocates memory per CPU:

struct srcu_struct {
	struct srcu_node node[NUM_RCU_NODES];	/* Combining tree. */
	struct srcu_node *level[RCU_NUM_LVLS + 1];
						/* First node at each level. */
	struct mutex srcu_cb_mutex;		/* Serialize CB preparation. */
	spinlock_t __private lock;		/* Protect counters */
	struct mutex srcu_gp_mutex;		/* Serialize GP work. */
	unsigned int srcu_idx;			/* Current rdr array element. */
	unsigned long srcu_gp_seq;		/* Grace-period seq #. */
	unsigned long srcu_gp_seq_needed;	/* Latest gp_seq needed. */
	unsigned long srcu_gp_seq_needed_exp;	/* Furthest future exp GP. */
	unsigned long srcu_last_gp_end;		/* Last GP end timestamp (ns) */
	struct srcu_data __percpu *sda;		/* Per-CPU srcu_data array. */
	unsigned long srcu_barrier_seq;		/* srcu_barrier seq #. */
	struct mutex srcu_barrier_mutex;	/* Serialize barrier ops. */
	struct completion srcu_barrier_completion;
						/* Awaken barrier rq at end. */
	atomic_t srcu_barrier_cpu_cnt;		/* # CPUs not yet posting a */
						/*  callback for the barrier */
						/*  operation. */
	struct delayed_work work;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
};

See also the alloc_percpu() call in init_srcu_struct_fields(). Does
everyone agree with increasing the size of each tag set by a data
structure whose size is proportional to the number of CPUs?
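
For comparison, a per-tag-set variant would look roughly like this (a
sketch; the 'srcu' member name and placement are assumptions, and this
is not what the posted patch does):

  struct blk_mq_tag_set {
          /* ... existing members ... */
          struct srcu_struct srcu;        /* per-CPU data hangs off ->sda */
  };

  /* in blk_mq_alloc_tag_set(): */
  ret = init_srcu_struct(&set->srcu);

  /* in blk_mq_free_tag_set(): */
  cleanup_srcu_struct(&set->srcu);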

Thanks,

Bart.


* Re: [PATCH] blk-mq: Fix races between iterating over requests and freeing requests
From: Bart Van Assche @ 2021-03-26  2:14 UTC
  To: John Garry, Jens Axboe
  Cc: linux-block, Christoph Hellwig, Ming Lei, Khazhy Kumykov,
	Shinichiro Kawasaki

On 3/23/21 9:26 AM, John Garry wrote:
> The first patch in my series solves the issues reported by community also:
> https://lore.kernel.org/linux-block/1614957294-188540-2-git-send-email-john.garry@huawei.com/
> 
> And that patch should give no throughput performance hit.
> 
> The other two patches in that series are to solve more theoretical
> issues of iterator interactions, but their approach is unacceptable.
> 
> Obviously your call. IMHO, Bart's solution would make more sense.

Hi John,

In case it is not clear: I really appreciate your work and the time you
have spent analyzing this issue and preparing a solution. Personally I
prefer the solution that clears request pointers immediately after the
requests have been freed, since that approach results in simpler code
and (hopefully) code that will be easier to maintain.

Bart.

