* [PATCH] block: use static bio_set for bio_split() calls
@ 2019-04-18 14:06 ` Hannes Reinecke
  0 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2019-04-18 14:06 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Bart van Assche, Ming Lei, linux-nvme,
	linux-block, Hannes Reinecke, Hannes Reinecke

When calling blk_queue_split() it will be using the per-queue
bioset to allocate the split bio from. However, blk_steal_bios()
might move the bio to another queue, _and_ the original queue
might be removed completely (nvme is especially prone to do so).
That leaves the bvecs of the split bio with a missing / destroyed
mempool, and a really fun crash in bio_endio().

Signed-off-by: Hannes Reinecke <hare@suse.com>
---
 block/blk-core.c       |  9 +--------
 block/blk-merge.c      | 36 ++++++++++++++++++++++++------------
 block/blk-sysfs.c      |  2 --
 include/linux/blkdev.h |  1 -
 4 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 4673ebe42255..9317e3de2337 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -474,7 +474,6 @@ static void blk_timeout_work(struct work_struct *work)
 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
 	struct request_queue *q;
-	int ret;
 
 	q = kmem_cache_alloc_node(blk_requestq_cachep,
 				gfp_mask | __GFP_ZERO, node_id);
@@ -488,13 +487,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (q->id < 0)
 		goto fail_q;
 
-	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-	if (ret)
-		goto fail_id;
-
 	q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
 	if (!q->backing_dev_info)
-		goto fail_split;
+		goto fail_id;
 
 	q->stats = blk_alloc_queue_stats();
 	if (!q->stats)
@@ -544,8 +539,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	blk_free_queue_stats(q->stats);
 fail_stats:
 	bdi_put(q->backing_dev_info);
-fail_split:
-	bioset_exit(&q->bio_split);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
 fail_q:
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8f96d683b577..9c8636794944 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -12,6 +12,12 @@
 
 #include "blk.h"
 
+/*
+ * bio_split_bio_set is the bio-set containing bio and iovec memory pools used
+ * by bio_split.
+ */
+struct bio_set bio_split_bio_set;
+
 /*
  * Check if the two bvecs from two bios can be merged to one segment.  If yes,
  * no need to check gap between the two bios since the 1st bio and the 1st bvec
@@ -77,7 +83,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
 
 static struct bio *blk_bio_discard_split(struct request_queue *q,
 					 struct bio *bio,
-					 struct bio_set *bs,
 					 unsigned *nsegs)
 {
 	unsigned int max_discard_sectors, granularity;
@@ -116,11 +121,11 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 	if (split_sectors > tmp)
 		split_sectors -= tmp;
 
-	return bio_split(bio, split_sectors, GFP_NOIO, bs);
+	return bio_split(bio, split_sectors, GFP_NOIO, &bio_split_bio_set);
 }
 
 static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
-		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+		struct bio *bio, unsigned *nsegs)
 {
 	*nsegs = 1;
 
@@ -130,12 +135,12 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
 	if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
 		return NULL;
 
-	return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
+	return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO,
+			 &bio_split_bio_set);
 }
 
 static struct bio *blk_bio_write_same_split(struct request_queue *q,
 					    struct bio *bio,
-					    struct bio_set *bs,
 					    unsigned *nsegs)
 {
 	*nsegs = 1;
@@ -146,7 +151,8 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
 	if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
 		return NULL;
 
-	return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
+	return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO,
+			 &bio_split_bio_set);
 }
 
 static inline unsigned get_max_io_size(struct request_queue *q,
@@ -230,7 +236,6 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
 
 static struct bio *blk_bio_segment_split(struct request_queue *q,
 					 struct bio *bio,
-					 struct bio_set *bs,
 					 unsigned *segs)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
@@ -290,7 +295,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 	*segs = nsegs;
 
 	if (do_split) {
-		new = bio_split(bio, sectors, GFP_NOIO, bs);
+		new = bio_split(bio, sectors, GFP_NOIO, &bio_split_bio_set);
 		if (new)
 			bio = new;
 	}
@@ -310,16 +315,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
 	switch (bio_op(*bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs);
+		split = blk_bio_discard_split(q, *bio, &nsegs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs);
+		split = blk_bio_write_zeroes_split(q, *bio, &nsegs);
 		break;
 	case REQ_OP_WRITE_SAME:
-		split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs);
+		split = blk_bio_write_same_split(q, *bio, &nsegs);
 		break;
 	default:
-		split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs);
+		split = blk_bio_segment_split(q, *bio, &nsegs);
 		break;
 	}
 
@@ -979,3 +984,10 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
 		return ELEVATOR_FRONT_MERGE;
 	return ELEVATOR_NO_MERGE;
 }
+
+static int __init blk_bio_split_init(void)
+{
+	return bioset_init(&bio_split_bio_set, BIO_POOL_SIZE, 0,
+			   BIOSET_NEED_BVECS);
+}
+subsys_initcall(blk_bio_split_init);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 422327089e0f..e72785751f1a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -868,8 +868,6 @@ static void __blk_release_queue(struct work_struct *work)
 	if (queue_is_mq(q))
 		blk_mq_debugfs_unregister(q);
 
-	bioset_exit(&q->bio_split);
-
 	ida_simple_remove(&blk_queue_ida, q->id);
 	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4b85dc066264..31e9b37e71d4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -552,7 +552,6 @@ struct request_queue {
 
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
-	struct bio_set		bio_split;
 
 #ifdef CONFIG_BLK_DEBUG_FS
 	struct dentry		*debugfs_dir;
-- 
2.16.4


* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-18 14:06 ` Hannes Reinecke
@ 2019-04-18 14:34   ` Ming Lei
  -1 siblings, 0 replies; 28+ messages in thread
From: Ming Lei @ 2019-04-18 14:34 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Jens Axboe, Hannes Reinecke, Bart van Assche, Ming Lei,
	linux-nvme, linux-block, Christoph Hellwig, neilb

Hi Hannes,

On Thu, Apr 18, 2019 at 04:06:32PM +0200, Hannes Reinecke wrote:
> When calling blk_queue_split() it will be using the per-queue
> bioset to allocate the split bio from. However, blk_steal_bios()
> might move the bio to another queue, _and_ the original queue
> might be removed completely (nvme is especially prone to do so).

Could you explain a bit how the original queue is removed in case
that blk_steal_bios() is involved?

> That leaves the bvecs of the split bio with a missing / destroyed
> mempool, and a really fun crash in bio_endio().

per-queue bioset is used originally for avoiding deadlock, are you
sure the static bioset is safe?


Thanks,
Ming

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-18 14:34   ` Ming Lei
@ 2019-04-18 15:02     ` Hannes Reinecke
  -1 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2019-04-18 15:02 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Hannes Reinecke, Bart van Assche, Ming Lei,
	linux-nvme, linux-block, Christoph Hellwig, neilb

On 4/18/19 4:34 PM, Ming Lei wrote:
> Hi Hannes,
> 
> On Thu, Apr 18, 2019 at 04:06:32PM +0200, Hannes Reinecke wrote:
>> When calling blk_queue_split() it will be using the per-queue
>> bioset to allocate the split bio from. However, blk_steal_bios()
>> might move the bio to another queue, _and_ the original queue
>> might be removed completely (nvme is especially prone to do so).
> 
> Could you explain a bit how the original queue is removed in case
> that blk_steal_bios() is involved?
> 
It's not blk_steal_bios() which removes the queue. What happens is:

- bio returns with error
- blk_steal_bios() moves bio over to a different queue
- nvme_reset_ctrl() is called
- Error detection finds that the original device is gone
- nvme calls nvme_remove_ns()
- nvme_remove_ns() removes the original request queue alongside the 
bio_set from which the bvecs have been allocated.
- 'stolen' bio is completed
- 'stolen' bio calls bio_endio()
- bio_endio() calls mempool_free() on the bvecs, referencing the mempool 
from the original queue
- crash
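
To make the last two steps concrete, here is a rough sketch of the bio
release path, paraphrased from memory rather than quoted from the tree:
bio_split() records the bio_set it allocated from in bio->bi_pool, and the
final bio_put() triggered by bio_endio() returns the bvecs and the bio to
that bio_set's mempools - which, before this patch, live in the request
queue that has just been freed.

#include <linux/bio.h>
#include <linux/slab.h>

/*
 * Paraphrased sketch of the bio release path (simplified; front_pad
 * handling is omitted), not verbatim kernel code.
 */
static void bio_release_sketch(struct bio *bio)
{
        struct bio_set *bs = bio->bi_pool;      /* &q->bio_split for a split bio */

        if (bs) {
                /*
                 * Use-after-free once bioset_exit(&q->bio_split) has run as
                 * part of releasing the original request queue.
                 */
                bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
                mempool_free(bio, &bs->bio_pool);
        } else {
                kfree(bio);             /* bio not backed by any bio_set */
        }
}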

>> That leaves the bvecs of the split bio with a missing / destroyed
>> mempool, and a really fun crash in bio_endio().
> 
> per-queue bioset is used originally for avoiding deadlock, are you
> sure the static bioset is safe?
> 
If that turns out to be an issue we could have a per 'ns_head' 
bio_set to allocate the split bio from.
But the main point is that we cannot use the bioset from the request 
queue as the queue (and the bioset) might be removed during the lifetime 
of the bio.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah
HRB 21284 (AG Nürnberg)

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-18 15:02     ` Hannes Reinecke
@ 2019-04-18 18:26       ` Edmund Nadolski (Microsoft)
  -1 siblings, 0 replies; 28+ messages in thread
From: Edmund Nadolski (Microsoft) @ 2019-04-18 18:26 UTC (permalink / raw)
  To: Hannes Reinecke, Ming Lei
  Cc: Jens Axboe, Hannes Reinecke, Bart van Assche, Ming Lei, neilb,
	linux-nvme, linux-block, Christoph Hellwig

On 4/18/19 8:02 AM, Hannes Reinecke wrote:
> On 4/18/19 4:34 PM, Ming Lei wrote:
>> Hi Hannes,
>> 
>> On Thu, Apr 18, 2019 at 04:06:32PM +0200, Hannes Reinecke wrote:
>>> When calling blk_queue_split() it will be using the per-queue
>>> bioset to allocate the split bio from. However, blk_steal_bios()
>>> might move the bio to another queue, _and_ the original queue
>>> might be removed completely (nvme is especially prone to do so).
>> 
>> Could you explain a bit how the original queue is removed in case
>> that blk_steal_bios() is involved?
>> 
> It's not blk_steal_bios() which removes the queue. What happens is:
> 
> - bio returns with error
> - blk_steal_bios() moves bio over to a different queue
> - nvme_reset_ctrl() is called
> - Error detection finds that the original device is gone
> - nvme calls nvme_remove_ns()
> - nvme_remove_ns() removes the original request queue alongside the
> bio_set from which the bvecs have been allocated from.
> - 'stolen' bio is completed
> - 'stolen' bio calls bio_endio()
> - bio_endio() calls mempool_free() on the bvecs, referencing the mempool
> from the original queue
> - crash
> 
>>> That leaves the bvecs of the split bio with a missing / destroyed
>>> mempool, and a really fun crash in bio_endio().
>> 
>> per-queue bioset is used originally for avoiding deadlock, are you
>> sure the static bioset is safe?
>> 
> If that turns out be be an issue we could be having a per 'ns_head'
> bio_set for allocating the split bio from.
> But the main point is that we cannot use the bioset from the request
> queue as the queue (and the bioset) might be removed during the lifetime
> of the bio.

Does this mean that as-is, this is safe because nvme (currently) is the 
only consumer of blk_steal_bios()?

(Sorry if this is a lame question, I'm in the process of learning this 
code.)
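
For context, the failover path that does the stealing looks roughly like
this (paraphrased from memory of drivers/nvme/host/multipath.c, not quoted
verbatim): the bios end up on the shared ns_head requeue list and are
resubmitted later, by which time the original namespace queue may be gone.

#include "nvme.h"       /* assumes nvme driver context */

/* Paraphrased sketch of nvme_failover_req(), not the exact upstream code. */
static void nvme_failover_req_sketch(struct request *req)
{
        struct nvme_ns *ns = req->q->queuedata;
        unsigned long flags;

        /* move the bios off the failed request onto the ns_head */
        spin_lock_irqsave(&ns->head->requeue_lock, flags);
        blk_steal_bios(&ns->head->requeue_list, req);
        spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

        /* complete the request; the stolen bios stay alive */
        blk_mq_end_request(req, 0);

        /* the requeue work resubmits the bios on another path later on */
        kblockd_schedule_work(&ns->head->requeue_work);
}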

TIA,
Ed


* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-18 14:34   ` Ming Lei
@ 2019-04-24 17:20     ` Sagi Grimberg
  -1 siblings, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2019-04-24 17:20 UTC (permalink / raw)
  To: Ming Lei, Hannes Reinecke
  Cc: Jens Axboe, Hannes Reinecke, Bart van Assche, Ming Lei, neilb,
	linux-nvme, linux-block, Christoph Hellwig


> per-queue bioset is used originally for avoiding deadlock, are you
> sure the static bioset is safe?

Can you explain this? I didn't find any indication of that in the change
log history...

Originally introduced by Kent:
--
commit 54efd50bfd873e2dbf784e0b21a8027ba4299a3e
Author: Kent Overstreet <kent.overstreet@gmail.com>
Date:   Thu Apr 23 22:37:18 2015 -0700

     block: make generic_make_request handle arbitrarily sized bios

     The way the block layer is currently written, it goes to great lengths
     to avoid having to split bios; upper layer code (such as bio_add_page())
     checks what the underlying device can handle and tries to always create
     bios that don't need to be split.

     But this approach becomes unwieldy and eventually breaks down with
     stacked devices and devices with dynamic limits, and it adds a lot of
     complexity. If the block layer could split bios as needed, we could
     eliminate a lot of complexity elsewhere - particularly in stacked
     drivers. Code that creates bios can then create whatever size bios are
     convenient, and more importantly stacked drivers don't have to deal with
     both their own bio size limitations and the limitations of the
     (potentially multiple) devices underneath them.  In the future this will
     let us delete merge_bvec_fn and a bunch of other code.

     We do this by adding calls to blk_queue_split() to the various
     make_request functions that need it - a few can already handle arbitrary
     size bios. Note that we add the call _after_ any call to
     blk_queue_bounce(); this means that blk_queue_split() and
     blk_recalc_rq_segments() don't need to be concerned with bouncing
     affecting segment merging.

     Some make_request_fn() callbacks were simple enough to audit and verify
     they don't need blk_queue_split() calls. The skipped ones are:

      * nfhd_make_request (arch/m68k/emu/nfblock.c)
      * axon_ram_make_request (arch/powerpc/sysdev/axonram.c)
      * simdisk_make_request (arch/xtensa/platforms/iss/simdisk.c)
      * brd_make_request (ramdisk - drivers/block/brd.c)
      * mtip_submit_request (drivers/block/mtip32xx/mtip32xx.c)
      * loop_make_request
      * null_queue_bio
      * bcache's make_request fns

     Some others are almost certainly safe to remove now, but will be left
     for future patches.
--

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-24 17:20     ` Sagi Grimberg
@ 2019-04-24 18:56       ` Hannes Reinecke
  -1 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2019-04-24 18:56 UTC (permalink / raw)
  To: Sagi Grimberg, Ming Lei
  Cc: Jens Axboe, Hannes Reinecke, Bart van Assche, Ming Lei, neilb,
	linux-nvme, linux-block, Christoph Hellwig

On 4/24/19 7:20 PM, Sagi Grimberg wrote:
> 
>> per-queue bioset is used originally for avoiding deadlock, are you
>> sure the static bioset is safe?
> 
> Can you explain this? I didn't find any indication of that in the change
> log history...
> 
> Originally introduced by Kent:
> -- 
> commit 54efd50bfd873e2dbf784e0b21a8027ba4299a3e
> Author: Kent Overstreet <kent.overstreet@gmail.com>
> Date:   Thu Apr 23 22:37:18 2015 -0700
> 
Phew. I was wondering whether I'd been too stupid to find it.
And having looked at it a bit closer, moving the bioset into the holding 
structure would induce more churn, as we'd need to pass the bioset into
blk_queue_split(), which would mean updating all drivers (11 in total) 
for no real gain IMO; bio splitting shouldn't really be a regular 
occurrence and hence the bioset should only be used infrequently.
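
(Just to illustrate what that churn would look like: the extra parameter
below is hypothetical, such an interface does not exist in the tree.)

/*
 * Hypothetical interface sketch only: every caller of blk_queue_split()
 * would have to own and pass a bio_set that is guaranteed to outlive any
 * bio split from it.
 */
void blk_queue_split(struct request_queue *q, struct bio **bio,
                     struct bio_set *bs);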

However, I really would like to get feedback on the actual patch, as 
it solves a real customer issue we've had (or still have, depending 
on the view :-)

Cheers,

Hannes
-- 
Dr. Hannes Reinecke            Teamlead Storage & Networking
hare@suse.de                              +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah
HRB 21284 (AG Nürnberg)

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-24 18:56       ` Hannes Reinecke
@ 2019-04-24 18:58         ` Sagi Grimberg
  -1 siblings, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2019-04-24 18:58 UTC (permalink / raw)
  To: Hannes Reinecke, Ming Lei
  Cc: Jens Axboe, Hannes Reinecke, Bart van Assche, Ming Lei, neilb,
	linux-nvme, linux-block, Christoph Hellwig


> Phew. I was wondering whether I'd been too stupid to find it.
> And having looked at it a bit closer, moving the bioset into the holding 
> structure would induce more churn, as we'd need to pass the bioset into
> blk_queue_split(), which would mean to update all drivers (11 in total) 
> for no real gain IMO; bio splitting shouldn't really be a regular 
> occurrence and hence the bioset should only be used infrequently.
> 
> However, I really would like to get feedback to the actual patch, as 
> this solves a real customer issues we've had (or still have, depending 
> on the view :-)

I think that the patch is good, which is why I asked Ming to provide
more info...

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-18 14:06 ` Hannes Reinecke
@ 2019-04-24 19:49   ` Bart Van Assche
  -1 siblings, 0 replies; 28+ messages in thread
From: Bart Van Assche @ 2019-04-24 19:49 UTC (permalink / raw)
  To: Hannes Reinecke, Jens Axboe
  Cc: Christoph Hellwig, Ming Lei, linux-nvme, linux-block, Hannes Reinecke

On Thu, 2019-04-18 at 16:06 +0200, Hannes Reinecke wrote:
> +static int __init blk_bio_split_init(void)
> +{
> +       return bioset_init(&bio_split_bio_set, BIO_POOL_SIZE, 0,
> +                          BIOSET_NEED_BVECS);
> +}

The slab allocator uses __init for some of its initialization. Can it happen
that blk_bio_split_init() is called before slab initialization has finished?

Otherwise this patch looks fine to me.

Bart.

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-24 17:20     ` Sagi Grimberg
@ 2019-04-24 22:14       ` Ming Lei
  -1 siblings, 0 replies; 28+ messages in thread
From: Ming Lei @ 2019-04-24 22:14 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Hannes Reinecke, Jens Axboe, Hannes Reinecke, Bart van Assche,
	Ming Lei, neilb, linux-nvme, linux-block, Christoph Hellwig,
	Kent Overstreet

On Wed, Apr 24, 2019 at 10:20:46AM -0700, Sagi Grimberg wrote:
> 
> > per-queue bioset is used originally for avoiding deadlock, are you
> > sure the static bioset is safe?
> 
> Can you explain this? I didn't find any indication of that in the change
> log history...
> 
> Originally introduced by Kent:

bio split can be run from stacking drivers, for example, MD over NVMe,
if the global reserved mempool is consumed by MD bio splitting, then
no progress can be made when splitting a bio submitted to NVMe.

Kent may have more details...
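
To sketch the hazard (the two make_request functions and the helpers below
are hypothetical, only the shape of the nesting matters): if both levels
draw their split bios from one fixed-size pool, the inner allocation may
wait for completions that can only happen after the inner allocation has
already succeeded.

#include <linux/blkdev.h>
#include <linux/bio.h>

blk_qc_t submit_to_lower(struct bio *bio);      /* hypothetical helper */
blk_qc_t do_device_io(struct bio *bio);         /* hypothetical helper */

/* Upper (stacking) level: may take the last reserved entries of the pool. */
static blk_qc_t upper_make_request(struct request_queue *q, struct bio *bio)
{
        blk_queue_split(q, &bio);
        return submit_to_lower(bio);
}

/*
 * Lowest level: allocates from the same pool.  If the pool is empty, this
 * GFP_NOIO mempool allocation waits for a free - but the frees come from
 * completing bios that first have to pass through this very split.
 */
static blk_qc_t lower_make_request(struct request_queue *q, struct bio *bio)
{
        blk_queue_split(q, &bio);
        return do_device_io(bio);
}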

Thanks,
Ming

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-24 22:14       ` Ming Lei
@ 2019-04-25  0:41         ` Ming Lei
  -1 siblings, 0 replies; 28+ messages in thread
From: Ming Lei @ 2019-04-25  0:41 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Hannes Reinecke, Jens Axboe, Hannes Reinecke, Bart van Assche,
	Ming Lei, neilb, linux-nvme, linux-block, Christoph Hellwig,
	Kent Overstreet

On Thu, Apr 25, 2019 at 06:14:22AM +0800, Ming Lei wrote:
> On Wed, Apr 24, 2019 at 10:20:46AM -0700, Sagi Grimberg wrote:
> > 
> > > per-queue bioset is used originally for avoiding deadlock, are you
> > > sure the static bioset is safe?
> > 
> > Can you explain this? I didn't find any indication of that in the change
> > log history...
> > 
> > Originally introduced by Kent:
> 
> bio split can be run from stacking drivers, for example, MD over NVMe,
> if the global reserved mempool is consumed by MD bio splitting, then
> no any progress can be made when splitting on bio submitted to NVMe.
> 
> Kent may have more details...

I guess it might be fine to use one shared global bio_set for all
lowest underlying queues, could be all queues except for loop, dm, md
, drbd, bcache, ...

Thanks,
Ming

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-24 19:49   ` Bart Van Assche
@ 2019-04-25  6:06     ` Hannes Reinecke
  -1 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2019-04-25  6:06 UTC (permalink / raw)
  To: Bart Van Assche, Jens Axboe
  Cc: Christoph Hellwig, Ming Lei, linux-nvme, linux-block, Hannes Reinecke

On 4/24/19 9:49 PM, Bart Van Assche wrote:
> On Thu, 2019-04-18 at 16:06 +0200, Hannes Reinecke wrote:
>> +static int __init blk_bio_split_init(void)
>> +{
>> +       return bioset_init(&bio_split_bio_set, BIO_POOL_SIZE, 0,
>> +                          BIOSET_NEED_BVECS);
>> +}
> 
> The slab allocator uses __init for some of its initialization. Can it happen
> that blk_bio_split_init() is called before slab initialization has finished?
> 
> Otherwise this patch looks fine to me.
> 
Most unlikely.
We have another static bioset (fs_bio_set in block/bio.c) which is 
initialised in the same manner.
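
For reference, fs_bio_set is brought up roughly like this in block/bio.c
(quoted from memory; the real init_bio() also sets up the bvec slabs and
the integrity pool). The slab allocator is initialised from mm_init()
during start_kernel(), long before any initcall level runs, so a
subsys_initcall is late enough:

#include <linux/bio.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>

struct bio_set fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);

static int __init init_bio(void)
{
        if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
                panic("bio: can't allocate bios\n");
        return 0;
}
subsys_initcall(init_bio);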

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah
HRB 21284 (AG Nürnberg)

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-25  0:41         ` Ming Lei
@ 2019-04-25 14:32           ` Hannes Reinecke
  -1 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2019-04-25 14:32 UTC (permalink / raw)
  To: Ming Lei, Sagi Grimberg
  Cc: Jens Axboe, Hannes Reinecke, Bart van Assche, Ming Lei, neilb,
	linux-nvme, linux-block, Christoph Hellwig, Kent Overstreet

On 4/25/19 2:41 AM, Ming Lei wrote:
> On Thu, Apr 25, 2019 at 06:14:22AM +0800, Ming Lei wrote:
>> On Wed, Apr 24, 2019 at 10:20:46AM -0700, Sagi Grimberg wrote:
>>>
>>>> per-queue bioset is used originally for avoiding deadlock, are you
>>>> sure the static bioset is safe?
>>>
>>> Can you explain this? I didn't find any indication of that in the change
>>> log history...
>>>
>>> Originally introduced by Kent:
>>
>> bio split can be run from stacking drivers, for example, MD over NVMe,
>> if the global reserved mempool is consumed by MD bio splitting, then
>> no any progress can be made when splitting on bio submitted to NVMe.
>>
>> Kent may have more details...
> 
> I guess it might be fine to use one shared global bio_set for all
> lowest underlying queues, could be all queues except for loop, dm, md
> , drbd, bcache, ...
> 
But wasn't the overall idea of stacking drivers that we propagate the 
queue limits up to the uppermost drivers, so that we have to do a split 
only at the upper layers?
Furthermore, it's not every bio which needs to be split, only those 
which straddle some device limitations.
The only one not able to propagate the queue limits is MD, and that is 
already using a private bio_set here.
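
(For reference, that propagation is the blk_stack_limits() machinery; a
rough sketch, with stack_one_lower() being a made-up helper name:)

#include <linux/blkdev.h>
#include <linux/printk.h>

/* Fold one lower device's limits into the top-level queue_limits. */
static void stack_one_lower(struct queue_limits *top,
                            struct block_device *lower, sector_t start)
{
        struct request_queue *lq = bdev_get_queue(lower);

        if (blk_stack_limits(top, &lq->limits, start))
                pr_warn("lower device has misaligned limits\n");
}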

I've worked on a patchset to provide a separate bio set for each of the 
stacking drivers, but the interface I've been able to come up with is 
not very nice, and I really doubt it's worth it.

So I'd advocate testing with this patch first, and only provide 
individual biosets once we find it's an issue.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah
HRB 21284 (AG Nürnberg)

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-25 14:32           ` Hannes Reinecke
@ 2019-04-25 15:36             ` Ming Lei
  -1 siblings, 0 replies; 28+ messages in thread
From: Ming Lei @ 2019-04-25 15:36 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Sagi Grimberg, Jens Axboe, Hannes Reinecke, Bart van Assche,
	Ming Lei, neilb, linux-nvme, linux-block, Christoph Hellwig,
	Kent Overstreet

On Thu, Apr 25, 2019 at 04:32:42PM +0200, Hannes Reinecke wrote:
> On 4/25/19 2:41 AM, Ming Lei wrote:
> > On Thu, Apr 25, 2019 at 06:14:22AM +0800, Ming Lei wrote:
> > > On Wed, Apr 24, 2019 at 10:20:46AM -0700, Sagi Grimberg wrote:
> > > > 
> > > > > per-queue bioset is used originally for avoiding deadlock, are you
> > > > > sure the static bioset is safe?
> > > > 
> > > > Can you explain this? I didn't find any indication of that in the change
> > > > log history...
> > > > 
> > > > Originally introduced by Kent:
> > > 
> > > bio split can be run from stacking drivers, for example, MD over NVMe,
> > > if the global reserved mempool is consumed by MD bio splitting, then
> > > no any progress can be made when splitting on bio submitted to NVMe.
> > > 
> > > Kent may have more details...
> > 
> > I guess it might be fine to use one shared global bio_set for all
> > lowest underlying queues, could be all queues except for loop, dm, md
> > , drbd, bcache, ...
> > 
> But wasn't the overall idea of stacking drivers that we propagate the queue
> limits up to the uppermost drivers, so that we have to do a split only at
> the upper layers?

For example, with LVM over RAID, the limits of the LVM queue are figured out
and fixed when the LVM volume is created. However, a new device may later be
added to the RAID, and then the underlying queue's limits may not be
propagated to the LVM queue's limits.

And we did discuss the topic of 'block: dm: restack queue_limits' before,
but it doesn't look like any progress was made.

Also, loop doesn't consider stacking limits at all.

> Furthermore, it's not every bio which needs to be split, only those which
> straddle some device limitations.
> The only ones not being able to propagate the queue limits is MD, and that
> is already using a private bio_set here.

If DM and the lowest queue share the same bio_set (mempool), it isn't
enough for MD to use a private bio_set.

> 
> I've worked on a patchset to provide a separate bio set for each of the
> stacking drivers, but the interface I've been able to come up with is not
> very nice, and I really doubt it's worth it.
> 
> So I'd advocate to test with this patch first, and only provide individual
> biosets once we find it's an issue.

The above issue of LVM over RAID was a real one reported inside Red Hat.


Thanks,
Ming

* Re: [PATCH] block: use static bio_set for bio_split() calls
  2019-04-25 15:36             ` Ming Lei
@ 2019-04-30 11:48               ` Hannes Reinecke
  -1 siblings, 0 replies; 28+ messages in thread
From: Hannes Reinecke @ 2019-04-30 11:48 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Hannes Reinecke, Sagi Grimberg, Ming Lei, neilb,
	linux-nvme, Kent Overstreet, linux-block, Christoph Hellwig,
	Bart van Assche

On 4/25/19 5:36 PM, Ming Lei wrote:
> On Thu, Apr 25, 2019 at 04:32:42PM +0200, Hannes Reinecke wrote:
>> On 4/25/19 2:41 AM, Ming Lei wrote:
>>> On Thu, Apr 25, 2019 at 06:14:22AM +0800, Ming Lei wrote:
>>>> On Wed, Apr 24, 2019 at 10:20:46AM -0700, Sagi Grimberg wrote:
>>>>>
>>>>>> per-queue bioset is used originally for avoiding deadlock, are you
>>>>>> sure the static bioset is safe?
>>>>>
>>>>> Can you explain this? I didn't find any indication of that in the change
>>>>> log history...
>>>>>
>>>>> Originally introduced by Kent:
>>>>
>>>> bio split can be run from stacking drivers, for example, MD over NVMe,
>>>> if the global reserved mempool is consumed by MD bio splitting, then
>>>> no any progress can be made when splitting on bio submitted to NVMe.
>>>>
>>>> Kent may have more details...
>>>
>>> I guess it might be fine to use one shared global bio_set for all
>>> lowest underlying queues, could be all queues except for loop, dm, md
>>> , drbd, bcache, ...
>>>
>> But wasn't the overall idea of stacking drivers that we propagate the queue
>> limits up to the uppermost drivers, so that we have to do a split only at
>> the upper layers?
> 
> For example, LVM over RAID, the limits of LVM queue is figured out and fixed
> during creating LVM. However, new device may be added to the RAID. Then the
> underlying queue's limit may not be propagated to LVM's queue's limit.
> 
> And we did discuss the topic of 'block: dm: restack queue_limits'
> before, looks not see any progress made.
> 
> Also loop doesn't consider stack limits at all.
> 
>> Furthermore, it's not every bio which needs to be split, only those which
>> straddle some device limitations.
>> The only ones not being able to propagate the queue limits is MD, and that
>> is already using a private bio_set here.
> 
> If DM and the lowest queue share one same bio_set(mem_pool), it isn't
> enough for MD to use private bio_set.
> 
Ah, right. I've reviewed the patches implementing the per-queue biosets, 
and indeed we'll need to use them.
But meanwhile I've found another way of circumventing this issue, so 
this patch can be dropped.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah
HRB 21284 (AG Nürnberg)
