linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] block: free sched's request pool in blk_cleanup_queue
@ 2019-06-04 13:08 Ming Lei
  2019-06-05  1:27 ` Yi Zhang
  2019-06-06 14:47 ` Benjamin Block
  0 siblings, 2 replies; 5+ messages in thread
From: Ming Lei @ 2019-06-04 13:08 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Ming Lei, Bart Van Assche, Christoph Hellwig,
	kernel test robot

In theory, IO scheduler belongs to request queue, and the request pool
of sched tags belongs to the request queue too.

However, the current tags allocation interfaces are re-used for both
driver tags and sched tags, and driver tags is definitely host wide,
and doesn't belong to any request queue, same with its request pool.
So we need tagset instance for freeing request of sched tags.

Meantime, blk_mq_free_tag_set() often follows blk_cleanup_queue() in case
of non-BLK_MQ_F_TAG_SHARED, this way requires that request pool of sched
tags to be freed before calling blk_mq_free_tag_set().

Commit 47cdee29ef9d94e ("block: move blk_exit_queue into __blk_release_queue")
moves blk_exit_queue into __blk_release_queue for simplifying the fast
path in generic_make_request(), then causes oops during freeing requests
of sched tags in __blk_release_queue().

Fix the above issue by moving the freeing of the request pool of sched tags into
blk_cleanup_queue(); this is safe because the queue has been frozen and there are
no in-queue requests at that time. Freeing sched tags has to be kept in the queue's
release handler because there might be un-completed dispatch activity
which might refer to sched tags.

Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Christoph Hellwig <hch@lst.de>
Fixes: 47cdee29ef9d94e485eb08f962c74943023a5271 ("block: move blk_exit_queue into __blk_release_queue")
Reported-by: kernel test robot <rong.a.chen@intel.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-core.c     | 13 +++++++++++++
 block/blk-mq-sched.c | 30 +++++++++++++++++++++++++++---
 block/blk-mq-sched.h |  1 +
 block/blk-sysfs.c    |  2 +-
 block/blk.h          | 10 +++++++++-
 block/elevator.c     |  2 +-
 6 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ee1b35fe8572..8340f69670d8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -320,6 +320,19 @@ void blk_cleanup_queue(struct request_queue *q)
 	if (queue_is_mq(q))
 		blk_mq_exit_queue(q);
 
+	/*
+	 * In theory, request pool of sched_tags belongs to request queue.
+	 * However, the current implementation requires tag_set for freeing
+	 * requests, so free the pool now.
+	 *
+	 * Queue has become frozen, there can't be any in-queue requests, so
+	 * it is safe to free requests now.
+	 */
+	mutex_lock(&q->sysfs_lock);
+	if (q->elevator)
+		blk_mq_sched_free_requests(q);
+	mutex_unlock(&q->sysfs_lock);
+
 	percpu_ref_exit(&q->q_usage_counter);
 
 	/* @q is and will stay empty, shutdown and put */
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 74c6bb871f7e..500cb04901cc 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -475,14 +475,18 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
 	return ret;
 }
 
+/* called in queue's release handler, tagset has gone away */
 static void blk_mq_sched_tags_teardown(struct request_queue *q)
 {
-	struct blk_mq_tag_set *set = q->tag_set;
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
-	queue_for_each_hw_ctx(q, hctx, i)
-		blk_mq_sched_free_tags(set, hctx, i);
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (hctx->sched_tags) {
+			blk_mq_free_rq_map(hctx->sched_tags);
+			hctx->sched_tags = NULL;
+		}
+	}
 }
 
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
@@ -523,6 +527,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 			ret = e->ops.init_hctx(hctx, i);
 			if (ret) {
 				eq = q->elevator;
+				blk_mq_sched_free_requests(q);
 				blk_mq_exit_sched(q, eq);
 				kobject_put(&eq->kobj);
 				return ret;
@@ -534,11 +539,30 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	return 0;
 
 err:
+	blk_mq_sched_free_requests(q);
 	blk_mq_sched_tags_teardown(q);
 	q->elevator = NULL;
 	return ret;
 }
 
+/*
+ * called in either blk_queue_cleanup or elevator_switch, tagset
+ * is required for freeing requests
+ */
+void blk_mq_sched_free_requests(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	lockdep_assert_held(&q->sysfs_lock);
+	WARN_ON(!q->elevator);
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (hctx->sched_tags)
+			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+	}
+}
+
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 {
 	struct blk_mq_hw_ctx *hctx;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index c7bdb52367ac..3cf92cbbd8ac 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -28,6 +28,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
+void blk_mq_sched_free_requests(struct request_queue *q);
 
 static inline bool
 blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 75b5281cc577..977c659dcd18 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -850,7 +850,7 @@ static void blk_exit_queue(struct request_queue *q)
 	 */
 	if (q->elevator) {
 		ioc_clear_queue(q);
-		elevator_exit(q, q->elevator);
+		__elevator_exit(q, q->elevator);
 		q->elevator = NULL;
 	}
 
diff --git a/block/blk.h b/block/blk.h
index 91b3581b7c7a..7814aa207153 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -6,6 +6,7 @@
 #include <linux/blk-mq.h>
 #include <xen/xen.h>
 #include "blk-mq.h"
+#include "blk-mq-sched.h"
 
 /* Max future timer expiry for timeouts */
 #define BLK_MAX_TIMEOUT		(5 * HZ)
@@ -176,10 +177,17 @@ void blk_insert_flush(struct request *rq);
 int elevator_init_mq(struct request_queue *q);
 int elevator_switch_mq(struct request_queue *q,
 			      struct elevator_type *new_e);
-void elevator_exit(struct request_queue *, struct elevator_queue *);
+void __elevator_exit(struct request_queue *, struct elevator_queue *);
 int elv_register_queue(struct request_queue *q);
 void elv_unregister_queue(struct request_queue *q);
 
+static inline void elevator_exit(struct request_queue *q,
+		struct elevator_queue *e)
+{
+	blk_mq_sched_free_requests(q);
+	__elevator_exit(q, e);
+}
+
 struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
 
 #ifdef CONFIG_FAIL_IO_TIMEOUT
diff --git a/block/elevator.c b/block/elevator.c
index ec55d5fc0b3e..2f17d66d0e61 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -178,7 +178,7 @@ static void elevator_release(struct kobject *kobj)
 	kfree(e);
 }
 
-void elevator_exit(struct request_queue *q, struct elevator_queue *e)
+void __elevator_exit(struct request_queue *q, struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
 	if (e->type->ops.exit_sched)
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] block: free sched's request pool in blk_cleanup_queue
  2019-06-04 13:08 [PATCH] block: free sched's request pool in blk_cleanup_queue Ming Lei
@ 2019-06-05  1:27 ` Yi Zhang
  2019-06-06 14:47 ` Benjamin Block
  1 sibling, 0 replies; 5+ messages in thread
From: Yi Zhang @ 2019-06-05  1:27 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe
  Cc: linux-block, Bart Van Assche, Christoph Hellwig, kernel test robot

Tested-by: Yi Zhang <yi.zhang@redhat.com>

This patch fixed the below issue, which was triggered by blktests block/006, thanks.

Kernel 5.2.0-rc3 on an x86_64

[  567.066143] run blktests block/006 at 2019-06-05 03:18:04
[  567.089457] null: module loaded
[  593.937235] BUG: unable to handle page fault for address: 
ffffffffc0738110
[  593.938442] #PF: supervisor read access in kernel mode
[  593.939230] #PF: error_code(0x0000) - not-present page
[  593.940011] PGD 11760f067 P4D 11760f067 PUD 117611067 PMD 12b61f067 PTE 0
[  593.941048] Oops: 0000 [#1] SMP PTI
[  593.941607] CPU: 0 PID: 1106 Comm: kworker/0:0 Not tainted 5.2.0-rc3 #1
[  593.942607] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[  593.943511] Workqueue: events __blk_release_queue
[  593.944240] RIP: 0010:blk_mq_free_rqs+0x21/0xd0
[  593.944930] Code: c6 e8 83 f0 ff ff eb 9f 90 0f 1f 44 00 00 41 56 41 
55 41 54 55 48 89 f5 53 48 83 be 90 00 00 00 00 74 56 48 8b 47 38 49 89 
fd <48> 83 78 50 00 74 48 8b 06 85 c0 74 42 41 89 d6 31 db 48 8b 85 98
[  593.947773] RSP: 0018:ffffa21300e7fde8 EFLAGS: 00010286
[  593.948562] RAX: ffffffffc07380c0 RBX: 0000000000000000 RCX: 
0000000000002ab8
[  593.949737] RDX: 0000000000000000 RSI: ffff9296971089c0 RDI: 
ffff9296975fb438
[  593.950811] RBP: ffff9296971089c0 R08: 000000000002f0e0 R09: 
ffffffffb6427890
[  593.951878] R10: ffffd41084a3fd80 R11: 0000000000000001 R12: 
ffff9296a91e58b0
[  593.952947] R13: ffff9296975fb438 R14: ffff9296ab5b2600 R15: 
ffff9296a91e6080
[  593.954011] FS:  0000000000000000(0000) GS:ffff9296bba00000(0000) 
knlGS:0000000000000000
[  593.955252] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  593.956147] CR2: ffffffffc0738110 CR3: 000000011760a003 CR4: 
00000000003606f0
[  593.957249] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[  593.958354] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
0000000000000400
[  593.959452] Call Trace:
[  593.959877]  blk_mq_sched_tags_teardown+0x40/0x70
[  593.960628]  blk_mq_exit_sched+0x88/0xa0
[  593.961246]  elevator_exit+0x30/0x50
[  593.961817]  __blk_release_queue+0x5c/0x100
[  593.962497]  process_one_work+0x1a1/0x3a0
[  593.963138]  worker_thread+0x30/0x380
[  593.963714]  ? pwq_unbound_release_workfn+0xd0/0xd0
[  593.964479]  kthread+0x112/0x130
[  593.964992]  ? __kthread_parkme+0x70/0x70
[  593.965641]  ret_from_fork+0x35/0x40
[  593.966215] Modules linked in: sunrpc snd_hda_codec_generic 
ledtrig_audio snd_hda_intel snd_hda_codec nfit snd_hda_core libnvdimm 
snd_hwdep snd_seq crct10dif_pclmul snd_seq_device crc32_pclmul snd_pcm 
ghash_clmulni_intel snd_timer snd pcspkr joydev virtio_balloon i2c_piix4 
soundcore ip_tables xfs libcrc32c qxl drm_kms_helper ata_generic 
syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm ata_piix libata 
virtio_net net_failover crc32c_intel serio_raw virtio_blk virtio_console 
failover dm_mirror dm_region_hash dm_log dm_mod [last unloaded: null_blk]
[  593.973735] CR2: ffffffffc0738110
[  593.974270] ---[ end trace 782d8ae6cfec57e7 ]---
[  593.974983] RIP: 0010:blk_mq_free_rqs+0x21/0xd0
[  593.975691] Code: c6 e8 83 f0 ff ff eb 9f 90 0f 1f 44 00 00 41 56 41 
55 41 54 55 48 89 f5 53 48 83 be 90 00 00 00 00 74 56 48 8b 47 38 49 89 
fd <48> 83 78 50 00 74 48 8b 06 85 c0 74 42 41 89 d6 31 db 48 8b 85 98
[  593.978548] RSP: 0018:ffffa21300e7fde8 EFLAGS: 00010286
[  593.979466] RAX: ffffffffc07380c0 RBX: 0000000000000000 RCX: 
0000000000002ab8
[  593.980578] RDX: 0000000000000000 RSI: ffff9296971089c0 RDI: 
ffff9296975fb438
[  593.981677] RBP: ffff9296971089c0 R08: 000000000002f0e0 R09: 
ffffffffb6427890
[  593.982780] R10: ffffd41084a3fd80 R11: 0000000000000001 R12: 
ffff9296a91e58b0
[  593.983883] R13: ffff9296975fb438 R14: ffff9296ab5b2600 R15: 
ffff9296a91e6080
[  593.984982] FS:  0000000000000000(0000) GS:ffff9296bba00000(0000) 
knlGS:0000000000000000
[  593.986239] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  593.987138] CR2: ffffffffc0738110 CR3: 000000011760a003 CR4: 
00000000003606f0
[  593.988241] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[  593.989344] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
0000000000000400
[  593.990447] Kernel panic - not syncing: Fatal exception
[  593.992292] Kernel Offset: 0x35000000 from 0xffffffff81000000 
(relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[  593.994147] ---[ end Kernel panic - not syncing: Fatal exception ]---

On 6/4/19 9:08 PM, Ming Lei wrote:
> In theory, IO scheduler belongs to request queue, and the request pool
> of sched tags belongs to the request queue too.
>
> However, the current tags allocation interfaces are re-used for both
> driver tags and sched tags, and driver tags is definitely host wide,
> and doesn't belong to any request queue, same with its request pool.
> So we need tagset instance for freeing request of sched tags.
>
> Meantime, blk_mq_free_tag_set() often follows blk_cleanup_queue() in case
> of non-BLK_MQ_F_TAG_SHARED, this way requires that request pool of sched
> tags to be freed before calling blk_mq_free_tag_set().
>
> Commit 47cdee29ef9d94e ("block: move blk_exit_queue into __blk_release_queue")
> moves blk_exit_queue into __blk_release_queue for simplying the fast
> path in generic_make_request(), then causes oops during freeing requests
> of sched tags in __blk_release_queue().
>
> Fix the above issue by move freeing request pool of sched tags into
> blk_cleanup_queue(), this way is safe becasue queue has been frozen and no any
> in-queue requests at that time. Freeing sched tags has to be kept in queue's
> release handler becasue there might be un-completed dispatch activity
> which might refer to sched tags.
>
> Cc: Bart Van Assche <bvanassche@acm.org>
> Cc: Christoph Hellwig <hch@lst.de>
> Fixes: 47cdee29ef9d94e485eb08f962c74943023a5271 ("block: move blk_exit_queue into __blk_release_queue")
> Reported-by: kernel test robot <rong.a.chen@intel.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>   block/blk-core.c     | 13 +++++++++++++
>   block/blk-mq-sched.c | 30 +++++++++++++++++++++++++++---
>   block/blk-mq-sched.h |  1 +
>   block/blk-sysfs.c    |  2 +-
>   block/blk.h          | 10 +++++++++-
>   block/elevator.c     |  2 +-
>   6 files changed, 52 insertions(+), 6 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index ee1b35fe8572..8340f69670d8 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -320,6 +320,19 @@ void blk_cleanup_queue(struct request_queue *q)
>   	if (queue_is_mq(q))
>   		blk_mq_exit_queue(q);
>   
> +	/*
> +	 * In theory, request pool of sched_tags belongs to request queue.
> +	 * However, the current implementation requires tag_set for freeing
> +	 * requests, so free the pool now.
> +	 *
> +	 * Queue has become frozen, there can't be any in-queue requests, so
> +	 * it is safe to free requests now.
> +	 */
> +	mutex_lock(&q->sysfs_lock);
> +	if (q->elevator)
> +		blk_mq_sched_free_requests(q);
> +	mutex_unlock(&q->sysfs_lock);
> +
>   	percpu_ref_exit(&q->q_usage_counter);
>   
>   	/* @q is and will stay empty, shutdown and put */
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 74c6bb871f7e..500cb04901cc 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -475,14 +475,18 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
>   	return ret;
>   }
>   
> +/* called in queue's release handler, tagset has gone away */
>   static void blk_mq_sched_tags_teardown(struct request_queue *q)
>   {
> -	struct blk_mq_tag_set *set = q->tag_set;
>   	struct blk_mq_hw_ctx *hctx;
>   	int i;
>   
> -	queue_for_each_hw_ctx(q, hctx, i)
> -		blk_mq_sched_free_tags(set, hctx, i);
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		if (hctx->sched_tags) {
> +			blk_mq_free_rq_map(hctx->sched_tags);
> +			hctx->sched_tags = NULL;
> +		}
> +	}
>   }
>   
>   int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> @@ -523,6 +527,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>   			ret = e->ops.init_hctx(hctx, i);
>   			if (ret) {
>   				eq = q->elevator;
> +				blk_mq_sched_free_requests(q);
>   				blk_mq_exit_sched(q, eq);
>   				kobject_put(&eq->kobj);
>   				return ret;
> @@ -534,11 +539,30 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>   	return 0;
>   
>   err:
> +	blk_mq_sched_free_requests(q);
>   	blk_mq_sched_tags_teardown(q);
>   	q->elevator = NULL;
>   	return ret;
>   }
>   
> +/*
> + * called in either blk_queue_cleanup or elevator_switch, tagset
> + * is required for freeing requests
> + */
> +void blk_mq_sched_free_requests(struct request_queue *q)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	int i;
> +
> +	lockdep_assert_held(&q->sysfs_lock);
> +	WARN_ON(!q->elevator);
> +
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		if (hctx->sched_tags)
> +			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
> +	}
> +}
> +
>   void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
>   {
>   	struct blk_mq_hw_ctx *hctx;
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index c7bdb52367ac..3cf92cbbd8ac 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -28,6 +28,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
>   
>   int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
>   void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
> +void blk_mq_sched_free_requests(struct request_queue *q);
>   
>   static inline bool
>   blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 75b5281cc577..977c659dcd18 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -850,7 +850,7 @@ static void blk_exit_queue(struct request_queue *q)
>   	 */
>   	if (q->elevator) {
>   		ioc_clear_queue(q);
> -		elevator_exit(q, q->elevator);
> +		__elevator_exit(q, q->elevator);
>   		q->elevator = NULL;
>   	}
>   
> diff --git a/block/blk.h b/block/blk.h
> index 91b3581b7c7a..7814aa207153 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -6,6 +6,7 @@
>   #include <linux/blk-mq.h>
>   #include <xen/xen.h>
>   #include "blk-mq.h"
> +#include "blk-mq-sched.h"
>   
>   /* Max future timer expiry for timeouts */
>   #define BLK_MAX_TIMEOUT		(5 * HZ)
> @@ -176,10 +177,17 @@ void blk_insert_flush(struct request *rq);
>   int elevator_init_mq(struct request_queue *q);
>   int elevator_switch_mq(struct request_queue *q,
>   			      struct elevator_type *new_e);
> -void elevator_exit(struct request_queue *, struct elevator_queue *);
> +void __elevator_exit(struct request_queue *, struct elevator_queue *);
>   int elv_register_queue(struct request_queue *q);
>   void elv_unregister_queue(struct request_queue *q);
>   
> +static inline void elevator_exit(struct request_queue *q,
> +		struct elevator_queue *e)
> +{
> +	blk_mq_sched_free_requests(q);
> +	__elevator_exit(q, e);
> +}
> +
>   struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
>   
>   #ifdef CONFIG_FAIL_IO_TIMEOUT
> diff --git a/block/elevator.c b/block/elevator.c
> index ec55d5fc0b3e..2f17d66d0e61 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -178,7 +178,7 @@ static void elevator_release(struct kobject *kobj)
>   	kfree(e);
>   }
>   
> -void elevator_exit(struct request_queue *q, struct elevator_queue *e)
> +void __elevator_exit(struct request_queue *q, struct elevator_queue *e)
>   {
>   	mutex_lock(&e->sysfs_lock);
>   	if (e->type->ops.exit_sched)

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] block: free sched's request pool in blk_cleanup_queue
  2019-06-04 13:08 [PATCH] block: free sched's request pool in blk_cleanup_queue Ming Lei
  2019-06-05  1:27 ` Yi Zhang
@ 2019-06-06 14:47 ` Benjamin Block
  2019-06-06 22:43   ` Ming Lei
  1 sibling, 1 reply; 5+ messages in thread
From: Benjamin Block @ 2019-06-06 14:47 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Bart Van Assche, Christoph Hellwig,
	kernel test robot, Jens Remus

On Tue, Jun 04, 2019 at 09:08:02PM +0800, Ming Lei wrote:
> In theory, IO scheduler belongs to request queue, and the request pool
> of sched tags belongs to the request queue too.
> 
> However, the current tags allocation interfaces are re-used for both
> driver tags and sched tags, and driver tags is definitely host wide,
> and doesn't belong to any request queue, same with its request pool.
> So we need tagset instance for freeing request of sched tags.
> 
> Meantime, blk_mq_free_tag_set() often follows blk_cleanup_queue() in case
> of non-BLK_MQ_F_TAG_SHARED, this way requires that request pool of sched
> tags to be freed before calling blk_mq_free_tag_set().
> 
> Commit 47cdee29ef9d94e ("block: move blk_exit_queue into __blk_release_queue")
> moves blk_exit_queue into __blk_release_queue for simplying the fast
> path in generic_make_request(), then causes oops during freeing requests
> of sched tags in __blk_release_queue().
> 
> Fix the above issue by move freeing request pool of sched tags into
> blk_cleanup_queue(), this way is safe becasue queue has been frozen and no any
> in-queue requests at that time. Freeing sched tags has to be kept in queue's
> release handler becasue there might be un-completed dispatch activity
> which might refer to sched tags.
> 
> Cc: Bart Van Assche <bvanassche@acm.org>
> Cc: Christoph Hellwig <hch@lst.de>
> Fixes: 47cdee29ef9d94e485eb08f962c74943023a5271 ("block: move blk_exit_queue into __blk_release_queue")
> Reported-by: kernel test robot <rong.a.chen@intel.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>

Our CI meanwhile also crashes regularly because of this:

  run blktests block/002 at 2019-06-06 14:44:55
  Unable to handle kernel pointer dereference in virtual kernel address space, Failing address: 6b6b6b6b6b6b6000 TEID: 6b6b6b6b6b6b6803
  Fault in home space mode while using kernel ASCE.
  AS:0000000057290007 R3:0000000000000024
  Oops: 0038 ilc:3 [#1] PREEMPT SMP
  Modules linked in: ...
  CPU: 4 PID: 139 Comm: kworker/4:2 Kdump: loaded Not tainted 5.2.0-rc3-master-05489-g55f909514069 #3
  Hardware name: IBM 3906 M03 703 (LPAR)
  Workqueue: events __blk_release_queue
  Krnl PSW : 0704e00180000000 000000005657db18 (blk_mq_free_rqs+0x48/0x128)
             R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
  Krnl GPRS: 00000000a8309db5 6b6b6b6b6b6b6b6b 000000008beb3858 00000000a2befbc8
             0000000000000000 0000000000000001 0000000056bb16c8 00000000b4070aa8
             000000008beb3858 000000008bc46b38 00000000a2befbc8 0000000000000000
             00000000bafb8100 00000000568e8040 000003e0092b3c30 000003e0092b3be0
  Krnl Code: 000000005657db0a: a7f4006e            brc     15,5657dbe6
             000000005657db0e: e31020380004       lg      %r1,56(%r2)
            #000000005657db14: b9040082           lgr     %r8,%r2
            >000000005657db18: e31010500002       ltg     %r1,80(%r1)
             000000005657db1e: a784ffee           brc     8,5657dafa
             000000005657db22: e32030000012       lt      %r2,0(%r3)
             000000005657db28: a784ffe9           brc     8,5657dafa
             000000005657db2c: b9040074           lgr     %r7,%r4
  Call Trace:
  ([<000000008ff8ed00>] 0x8ff8ed00)
   [<0000000056582958>] blk_mq_sched_tags_teardown+0x68/0x98
   [<0000000056583396>] blk_mq_exit_sched+0xc6/0xd8
   [<0000000056569324>] elevator_exit+0x54/0x70
   [<0000000056570644>] __blk_release_queue+0x84/0x110
   [<0000000055f416c6>] process_one_work+0x3a6/0x6b8
   [<0000000055f41c50>] worker_thread+0x278/0x478
   [<0000000055f49e08>] kthread+0x160/0x178
   [<00000000568d83e8>] ret_from_fork+0x34/0x38
  INFO: lockdep is turned off.
  Last Breaking-Event-Address:
   [<000000005657daf6>] blk_mq_free_rqs+0x26/0x128
  Kernel panic - not syncing: Fatal exception: panic_on_oops
  run blktests block/003 at 2019-06-06 14:44:56

When I tried to reproduced this with this patch, it went away (at least all of
blktest/block ran w/o crash).

I don't feel competent enough to review this patch right now, but it would be
good if we get something upstream for this.

-- 
With Best Regards, Benjamin Block      /      Linux on IBM Z Kernel Development
IBM Systems & Technology Group   /  IBM Deutschland Research & Development GmbH
Vorsitz. AufsR.: Matthias Hartmann       /      Geschäftsführung: Dirk Wittkopp
Sitz der Gesellschaft: Böblingen / Registergericht: AmtsG Stuttgart, HRB 243294


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] block: free sched's request pool in blk_cleanup_queue
  2019-06-06 14:47 ` Benjamin Block
@ 2019-06-06 22:43   ` Ming Lei
  2019-06-07  4:39     ` Jens Axboe
  0 siblings, 1 reply; 5+ messages in thread
From: Ming Lei @ 2019-06-06 22:43 UTC (permalink / raw)
  To: Benjamin Block
  Cc: Jens Axboe, linux-block, Bart Van Assche, Christoph Hellwig,
	kernel test robot, Jens Remus

On Thu, Jun 06, 2019 at 04:47:14PM +0200, Benjamin Block wrote:
> On Tue, Jun 04, 2019 at 09:08:02PM +0800, Ming Lei wrote:
> > In theory, IO scheduler belongs to request queue, and the request pool
> > of sched tags belongs to the request queue too.
> > 
> > However, the current tags allocation interfaces are re-used for both
> > driver tags and sched tags, and driver tags is definitely host wide,
> > and doesn't belong to any request queue, same with its request pool.
> > So we need tagset instance for freeing request of sched tags.
> > 
> > Meantime, blk_mq_free_tag_set() often follows blk_cleanup_queue() in case
> > of non-BLK_MQ_F_TAG_SHARED, this way requires that request pool of sched
> > tags to be freed before calling blk_mq_free_tag_set().
> > 
> > Commit 47cdee29ef9d94e ("block: move blk_exit_queue into __blk_release_queue")
> > moves blk_exit_queue into __blk_release_queue for simplying the fast
> > path in generic_make_request(), then causes oops during freeing requests
> > of sched tags in __blk_release_queue().
> > 
> > Fix the above issue by move freeing request pool of sched tags into
> > blk_cleanup_queue(), this way is safe becasue queue has been frozen and no any
> > in-queue requests at that time. Freeing sched tags has to be kept in queue's
> > release handler becasue there might be un-completed dispatch activity
> > which might refer to sched tags.
> > 
> > Cc: Bart Van Assche <bvanassche@acm.org>
> > Cc: Christoph Hellwig <hch@lst.de>
> > Fixes: 47cdee29ef9d94e485eb08f962c74943023a5271 ("block: move blk_exit_queue into __blk_release_queue")
> > Reported-by: kernel test robot <rong.a.chen@intel.com>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> 
> Our CI meanwhile also crashes regularly because of this:
> 
>   run blktests block/002 at 2019-06-06 14:44:55
>   Unable to handle kernel pointer dereference in virtual kernel address space, Failing address: 6b6b6b6b6b6b6000 TEID: 6b6b6b6b6b6b6803
>   Fault in home space mode while using kernel ASCE.
>   AS:0000000057290007 R3:0000000000000024
>   Oops: 0038 ilc:3 [#1] PREEMPT SMP
>   Modules linked in: ...
>   CPU: 4 PID: 139 Comm: kworker/4:2 Kdump: loaded Not tainted 5.2.0-rc3-master-05489-g55f909514069 #3
>   Hardware name: IBM 3906 M03 703 (LPAR)
>   Workqueue: events __blk_release_queue
>   Krnl PSW : 0704e00180000000 000000005657db18 (blk_mq_free_rqs+0x48/0x128)
>              R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
>   Krnl GPRS: 00000000a8309db5 6b6b6b6b6b6b6b6b 000000008beb3858 00000000a2befbc8
>              0000000000000000 0000000000000001 0000000056bb16c8 00000000b4070aa8
>              000000008beb3858 000000008bc46b38 00000000a2befbc8 0000000000000000
>              00000000bafb8100 00000000568e8040 000003e0092b3c30 000003e0092b3be0
>   Krnl Code: 000000005657db0a: a7f4006e            brc     15,5657dbe6
>              000000005657db0e: e31020380004       lg      %r1,56(%r2)
>             #000000005657db14: b9040082           lgr     %r8,%r2
>             >000000005657db18: e31010500002       ltg     %r1,80(%r1)
>              000000005657db1e: a784ffee           brc     8,5657dafa
>              000000005657db22: e32030000012       lt      %r2,0(%r3)
>              000000005657db28: a784ffe9           brc     8,5657dafa
>              000000005657db2c: b9040074           lgr     %r7,%r4
>   Call Trace:
>   ([<000000008ff8ed00>] 0x8ff8ed00)
>    [<0000000056582958>] blk_mq_sched_tags_teardown+0x68/0x98
>    [<0000000056583396>] blk_mq_exit_sched+0xc6/0xd8
>    [<0000000056569324>] elevator_exit+0x54/0x70
>    [<0000000056570644>] __blk_release_queue+0x84/0x110
>    [<0000000055f416c6>] process_one_work+0x3a6/0x6b8
>    [<0000000055f41c50>] worker_thread+0x278/0x478
>    [<0000000055f49e08>] kthread+0x160/0x178
>    [<00000000568d83e8>] ret_from_fork+0x34/0x38
>   INFO: lockdep is turned off.
>   Last Breaking-Event-Address:
>    [<000000005657daf6>] blk_mq_free_rqs+0x26/0x128
>   Kernel panic - not syncing: Fatal exception: panic_on_oops
>   run blktests block/003 at 2019-06-06 14:44:56
> 
> When I tried to reproduced this with this patch, it went away (at least all of
> blktest/block ran w/o crash).
> 
> I don't feel competent enough to review this patch right now, but it would be
> good if we get something upstream for this.

Hi Jens, Christoph and Guys,

Could you take a look at this patch? We have at least 3 reports on this
issue, and I believe more will come if it isn't fixed.

Jens, sorry for interrupting your vacation.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] block: free sched's request pool in blk_cleanup_queue
  2019-06-06 22:43   ` Ming Lei
@ 2019-06-07  4:39     ` Jens Axboe
  0 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2019-06-07  4:39 UTC (permalink / raw)
  To: Ming Lei, Benjamin Block
  Cc: linux-block, Bart Van Assche, Christoph Hellwig,
	kernel test robot, Jens Remus

On 6/6/19 4:43 PM, Ming Lei wrote:
> On Thu, Jun 06, 2019 at 04:47:14PM +0200, Benjamin Block wrote:
>> On Tue, Jun 04, 2019 at 09:08:02PM +0800, Ming Lei wrote:
>>> In theory, IO scheduler belongs to request queue, and the request pool
>>> of sched tags belongs to the request queue too.
>>>
>>> However, the current tags allocation interfaces are re-used for both
>>> driver tags and sched tags, and driver tags is definitely host wide,
>>> and doesn't belong to any request queue, same with its request pool.
>>> So we need tagset instance for freeing request of sched tags.
>>>
>>> Meantime, blk_mq_free_tag_set() often follows blk_cleanup_queue() in case
>>> of non-BLK_MQ_F_TAG_SHARED, this way requires that request pool of sched
>>> tags to be freed before calling blk_mq_free_tag_set().
>>>
>>> Commit 47cdee29ef9d94e ("block: move blk_exit_queue into __blk_release_queue")
>>> moves blk_exit_queue into __blk_release_queue for simplying the fast
>>> path in generic_make_request(), then causes oops during freeing requests
>>> of sched tags in __blk_release_queue().
>>>
>>> Fix the above issue by move freeing request pool of sched tags into
>>> blk_cleanup_queue(), this way is safe becasue queue has been frozen and no any
>>> in-queue requests at that time. Freeing sched tags has to be kept in queue's
>>> release handler becasue there might be un-completed dispatch activity
>>> which might refer to sched tags.
>>>
>>> Cc: Bart Van Assche <bvanassche@acm.org>
>>> Cc: Christoph Hellwig <hch@lst.de>
>>> Fixes: 47cdee29ef9d94e485eb08f962c74943023a5271 ("block: move blk_exit_queue into __blk_release_queue")
>>> Reported-by: kernel test robot <rong.a.chen@intel.com>
>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>
>> Our CI meanwhile also crashes regularly because of this:
>>
>>    run blktests block/002 at 2019-06-06 14:44:55
>>    Unable to handle kernel pointer dereference in virtual kernel address space, Failing address: 6b6b6b6b6b6b6000 TEID: 6b6b6b6b6b6b6803
>>    Fault in home space mode while using kernel ASCE.
>>    AS:0000000057290007 R3:0000000000000024
>>    Oops: 0038 ilc:3 [#1] PREEMPT SMP
>>    Modules linked in: ...
>>    CPU: 4 PID: 139 Comm: kworker/4:2 Kdump: loaded Not tainted 5.2.0-rc3-master-05489-g55f909514069 #3
>>    Hardware name: IBM 3906 M03 703 (LPAR)
>>    Workqueue: events __blk_release_queue
>>    Krnl PSW : 0704e00180000000 000000005657db18 (blk_mq_free_rqs+0x48/0x128)
>>               R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
>>    Krnl GPRS: 00000000a8309db5 6b6b6b6b6b6b6b6b 000000008beb3858 00000000a2befbc8
>>               0000000000000000 0000000000000001 0000000056bb16c8 00000000b4070aa8
>>               000000008beb3858 000000008bc46b38 00000000a2befbc8 0000000000000000
>>               00000000bafb8100 00000000568e8040 000003e0092b3c30 000003e0092b3be0
>>    Krnl Code: 000000005657db0a: a7f4006e            brc     15,5657dbe6
>>               000000005657db0e: e31020380004       lg      %r1,56(%r2)
>>              #000000005657db14: b9040082           lgr     %r8,%r2
>>              >000000005657db18: e31010500002       ltg     %r1,80(%r1)
>>               000000005657db1e: a784ffee           brc     8,5657dafa
>>               000000005657db22: e32030000012       lt      %r2,0(%r3)
>>               000000005657db28: a784ffe9           brc     8,5657dafa
>>               000000005657db2c: b9040074           lgr     %r7,%r4
>>    Call Trace:
>>    ([<000000008ff8ed00>] 0x8ff8ed00)
>>     [<0000000056582958>] blk_mq_sched_tags_teardown+0x68/0x98
>>     [<0000000056583396>] blk_mq_exit_sched+0xc6/0xd8
>>     [<0000000056569324>] elevator_exit+0x54/0x70
>>     [<0000000056570644>] __blk_release_queue+0x84/0x110
>>     [<0000000055f416c6>] process_one_work+0x3a6/0x6b8
>>     [<0000000055f41c50>] worker_thread+0x278/0x478
>>     [<0000000055f49e08>] kthread+0x160/0x178
>>     [<00000000568d83e8>] ret_from_fork+0x34/0x38
>>    INFO: lockdep is turned off.
>>    Last Breaking-Event-Address:
>>     [<000000005657daf6>] blk_mq_free_rqs+0x26/0x128
>>    Kernel panic - not syncing: Fatal exception: panic_on_oops
>>    run blktests block/003 at 2019-06-06 14:44:56
>>
>> When I tried to reproduced this with this patch, it went away (at least all of
>> blktest/block ran w/o crash).
>>
>> I don't feel competent enough to review this patch right now, but it would be
>> good if we get something upstream for this.
> 
> Hi Jens, Christoph and Guys,
> 
> Could you take a look at this patch? We have at least 3 reports on this
> issue, and I believe more will come if it isn't fixed.

I have queued it up.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-06-07  4:39 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-04 13:08 [PATCH] block: free sched's request pool in blk_cleanup_queue Ming Lei
2019-06-05  1:27 ` Yi Zhang
2019-06-06 14:47 ` Benjamin Block
2019-06-06 22:43   ` Ming Lei
2019-06-07  4:39     ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).