* [PATCH] md/md0: optimize raid0 discard handling
@ 2017-05-08  0:36 Shaohua Li
  2017-05-08  7:31 ` Coly Li
  2017-05-09  1:03 ` NeilBrown
  0 siblings, 2 replies; 6+ messages in thread
From: Shaohua Li @ 2017-05-08  0:36 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, NeilBrown, Coly Li

There are complaints that raid0 discard handling is slow. Currently we
divide a discard request into chunks and dispatch them to the underlying
disks, and the block layer then merges them back into big requests. This
causes a lot of request splitting/merging and uses significant CPU time.

A simple idea is to calculate the range on each raid disk for an IO
request and send one discard request per raid disk, which avoids the
split/merge completely. Previously Coly tried this approach, but the
implementation was too complex because of raid0 zones. This patch always
splits the bio at zone boundaries and handles each bio within one zone,
which simplifies the implementation a lot.

Cc: NeilBrown <neilb@suse.com>
Cc: Coly Li <colyli@suse.de>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid0.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 102 insertions(+), 14 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 84e5859..d6c0bc7 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
-		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
 
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
 		blk_queue_io_opt(mddev->queue,
@@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 	}
 }
 
+static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+	struct r0conf *conf = mddev->private;
+	struct strip_zone *zone;
+	sector_t start = bio->bi_iter.bi_sector;
+	sector_t end;
+	unsigned int stripe_size;
+	sector_t first_stripe_index, last_stripe_index;
+	sector_t start_disk_offset;
+	unsigned int start_disk_index;
+	sector_t end_disk_offset;
+	unsigned int end_disk_index;
+	unsigned int disk;
+
+	zone = find_zone(conf, &start);
+
+	if (bio_end_sector(bio) > zone->zone_end) {
+		struct bio *split = bio_split(bio,
+			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
+			mddev->bio_set);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		end = zone->zone_end;
+	} else
+		end = bio_end_sector(bio);
+
+	if (zone != conf->strip_zone)
+		end = end - zone[-1].zone_end;
+
+	/* Now start and end is the offset in zone */
+	stripe_size = zone->nb_dev * mddev->chunk_sectors;
+
+	first_stripe_index = start;
+	sector_div(first_stripe_index, stripe_size);
+	last_stripe_index = end;
+	sector_div(last_stripe_index, stripe_size);
+
+	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		first_stripe_index * mddev->chunk_sectors;
+	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		last_stripe_index * mddev->chunk_sectors;
+
+	for (disk = 0; disk < zone->nb_dev; disk++) {
+		sector_t dev_start, dev_end;
+		struct bio *discard_bio = NULL;
+		struct md_rdev *rdev;
+
+		if (disk < start_disk_index)
+			dev_start = (first_stripe_index + 1) *
+				mddev->chunk_sectors;
+		else if (disk > start_disk_index)
+			dev_start = first_stripe_index * mddev->chunk_sectors;
+		else
+			dev_start = start_disk_offset;
+
+		if (disk < end_disk_index)
+			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > end_disk_index)
+			dev_end = last_stripe_index * mddev->chunk_sectors;
+		else
+			dev_end = end_disk_offset;
+
+		if (dev_end <= dev_start)
+			continue;
+
+		rdev = conf->devlist[(zone - conf->strip_zone) *
+			conf->strip_zone[0].nb_dev + disk];
+		if (__blkdev_issue_discard(rdev->bdev,
+			dev_start + zone->dev_start + rdev->data_offset,
+			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
+		    !discard_bio)
+			continue;
+		bio_chain(discard_bio, bio);
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
+				discard_bio, disk_devt(mddev->gendisk),
+				bio->bi_iter.bi_sector);
+		generic_make_request(discard_bio);
+	}
+	bio_endio(bio);
+}
+
 static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct strip_zone *zone;
@@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 		return;
 	}
 
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+		raid0_handle_discard(mddev, bio);
+		return;
+	}
+
 	bio_sector = bio->bi_iter.bi_sector;
 	sector = bio_sector;
 	chunk_sects = mddev->chunk_sectors;
@@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 	bio->bi_iter.bi_sector = sector + zone->dev_start +
 		tmp_dev->data_offset;
 
-	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
-		/* Just ignore it */
-		bio_endio(bio);
-	} else {
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
-					      bio, disk_devt(mddev->gendisk),
-					      bio_sector);
-		mddev_check_writesame(mddev, bio);
-		mddev_check_write_zeroes(mddev, bio);
-		generic_make_request(bio);
-	}
+	if (mddev->gendisk)
+		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+				      bio, disk_devt(mddev->gendisk),
+				      bio_sector);
+	mddev_check_writesame(mddev, bio);
+	mddev_check_write_zeroes(mddev, bio);
+	generic_make_request(bio);
 }
 
 static void raid0_status(struct seq_file *seq, struct mddev *mddev)
-- 
2.9.3
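
To make the offset arithmetic in raid0_handle_discard() easier to follow,
here is a small userspace model of the per-disk range calculation. It
mirrors the patch's math but is only an illustration: the disk count, chunk
size and discard range are made-up example values, and sector_t is modelled
as a plain unsigned 64-bit integer rather than the kernel type.

/*
 * Userspace model of the per-disk range calculation in
 * raid0_handle_discard().  Example values only: 4 disks, 128-sector
 * chunks, and a discard covering zone offsets [300, 2000).
 */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	unsigned int nb_dev = 4;
	sector_t chunk_sectors = 128;
	sector_t stripe_size = nb_dev * chunk_sectors;
	sector_t start = 300, end = 2000;	/* offsets within the zone */

	sector_t first_stripe_index = start / stripe_size;
	sector_t last_stripe_index = end / stripe_size;

	unsigned int start_disk_index =
		(start - first_stripe_index * stripe_size) / chunk_sectors;
	sector_t start_disk_offset =
		(start - first_stripe_index * stripe_size) % chunk_sectors +
		first_stripe_index * chunk_sectors;
	unsigned int end_disk_index =
		(end - last_stripe_index * stripe_size) / chunk_sectors;
	sector_t end_disk_offset =
		(end - last_stripe_index * stripe_size) % chunk_sectors +
		last_stripe_index * chunk_sectors;

	for (unsigned int disk = 0; disk < nb_dev; disk++) {
		sector_t dev_start, dev_end;

		/* same three-way comparison as in the patch */
		if (disk < start_disk_index)
			dev_start = (first_stripe_index + 1) * chunk_sectors;
		else if (disk > start_disk_index)
			dev_start = first_stripe_index * chunk_sectors;
		else
			dev_start = start_disk_offset;

		if (disk < end_disk_index)
			dev_end = (last_stripe_index + 1) * chunk_sectors;
		else if (disk > end_disk_index)
			dev_end = last_stripe_index * chunk_sectors;
		else
			dev_end = end_disk_offset;

		if (dev_end <= dev_start)
			printf("disk %u: nothing to discard\n", disk);
		else
			printf("disk %u: sectors [%llu, %llu)\n",
			       disk, dev_start, dev_end);
	}
	return 0;
}

With these example numbers each member disk receives exactly one contiguous
discard: disks 0 and 1 get [128, 512), disk 2 gets [44, 512) and disk 3 gets
[0, 464); together that is 384 + 384 + 468 + 464 = 1700 sectors, exactly
end - start, instead of one request per chunk.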



* Re: [PATCH] md/md0: optimize raid0 discard handling
  2017-05-08  0:36 [PATCH] md/md0: optimize raid0 discard handling Shaohua Li
@ 2017-05-08  7:31 ` Coly Li
  2017-05-08 16:37   ` Shaohua Li
  2017-05-09  1:03 ` NeilBrown
  1 sibling, 1 reply; 6+ messages in thread
From: Coly Li @ 2017-05-08  7:31 UTC (permalink / raw)
  To: Shaohua Li, linux-raid; +Cc: Kernel-team, NeilBrown

On 2017/5/8 8:36 AM, Shaohua Li wrote:
> There are complaints that raid0 discard handling is slow. Currently we
> divide discard request into chunks and dispatch to underlayer disks. The
> block layer will do merge to form big requests. This causes a lot of
> request split/merge and uses significant CPU time.
> 
> A simple idea is to calculate the range for each raid disk for an IO
> request and send a discard request to raid disks, which will avoid the
> split/merge completely. Previously Coly tried the approach, but the
> implementation was too complex because of raid0 zones. This patch always
> split bio in zone boundary and handle bio within one zone. It simplifies
> the implementation a lot.
> 
> Cc: NeilBrown <neilb@suse.com>
> Cc: Coly Li <colyli@suse.de>
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  drivers/md/raid0.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 102 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
> index 84e5859..d6c0bc7 100644
> --- a/drivers/md/raid0.c
> +++ b/drivers/md/raid0.c
> @@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
>  		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
>  		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
>  		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
> -		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
> +		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
>  
>  		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
>  		blk_queue_io_opt(mddev->queue,
> @@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
>  	}
>  }
>  
> +static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
> +{
> +	struct r0conf *conf = mddev->private;
> +	struct strip_zone *zone;
> +	sector_t start = bio->bi_iter.bi_sector;
> +	sector_t end;
> +	unsigned int stripe_size;
> +	sector_t first_stripe_index, last_stripe_index;
> +	sector_t start_disk_offset;
> +	unsigned int start_disk_index;
> +	sector_t end_disk_offset;
> +	unsigned int end_disk_index;
> +	unsigned int disk;
> +
> +	zone = find_zone(conf, &start);
> +
> +	if (bio_end_sector(bio) > zone->zone_end) {
> +		struct bio *split = bio_split(bio,
> +			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
> +			mddev->bio_set);
> +		bio_chain(split, bio);
> +		generic_make_request(bio);
> +		bio = split;
> +		end = zone->zone_end;
> +	} else
> +		end = bio_end_sector(bio);
> +

Nice, good idea. Indeed, I have a WIP patch which tries to follow your
idea, but it is not as clean as yours.


> +	if (zone != conf->strip_zone)
> +		end = end - zone[-1].zone_end;
> +
> +	/* Now start and end is the offset in zone */
> +	stripe_size = zone->nb_dev * mddev->chunk_sectors;
> +
> +	first_stripe_index = start;
> +	sector_div(first_stripe_index, stripe_size);
> +	last_stripe_index = end;
> +	sector_div(last_stripe_index, stripe_size);
> +
> +	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
> +		mddev->chunk_sectors;
> +	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
> +		mddev->chunk_sectors) +
> +		first_stripe_index * mddev->chunk_sectors;
> +	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
> +		mddev->chunk_sectors;
> +	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
> +		mddev->chunk_sectors) +
> +		last_stripe_index * mddev->chunk_sectors;
> +

Nice code, again! Much simpler. It is an enjoyable experience reading
these calculations :-)


> +	for (disk = 0; disk < zone->nb_dev; disk++) {
> +		sector_t dev_start, dev_end;
> +		struct bio *discard_bio = NULL;
> +		struct md_rdev *rdev;
> +
> +		if (disk < start_disk_index)
> +			dev_start = (first_stripe_index + 1) *
> +				mddev->chunk_sectors;
> +		else if (disk > start_disk_index)
> +			dev_start = first_stripe_index * mddev->chunk_sectors;
> +		else
> +			dev_start = start_disk_offset;
> +
> +		if (disk < end_disk_index)
> +			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
> +		else if (disk > end_disk_index)
> +			dev_end = last_stripe_index * mddev->chunk_sectors;
> +		else
> +			dev_end = end_disk_offset;
> +
> +		if (dev_end <= dev_start)
> +			continue;
> +
> +		rdev = conf->devlist[(zone - conf->strip_zone) *
> +			conf->strip_zone[0].nb_dev + disk];
> +		if (__blkdev_issue_discard(rdev->bdev,
> +			dev_start + zone->dev_start + rdev->data_offset,
> +			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
> +		    !discard_bio)
> +			continue;

Could you please give me a hint? Why is __blkdev_issue_discard() called
here? The following generic_make_request() will issue the bio to the
underlying device and call __blkdev_issue_discard() in its code path
eventually, so why call this function explicitly here?

> +		bio_chain(discard_bio, bio);
> +		if (mddev->gendisk)
> +			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
> +				discard_bio, disk_devt(mddev->gendisk),
> +				bio->bi_iter.bi_sector);
> +		generic_make_request(discard_bio);
> +	}
> +	bio_endio(bio);
> +}
> +
>  static void raid0_make_request(struct mddev *mddev, struct bio *bio)
>  {
>  	struct strip_zone *zone;
> @@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
>  		return;
>  	}
>  
> +	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
> +		raid0_handle_discard(mddev, bio);
> +		return;
> +	}
> +
>  	bio_sector = bio->bi_iter.bi_sector;
>  	sector = bio_sector;
>  	chunk_sects = mddev->chunk_sectors;
> @@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
>  	bio->bi_iter.bi_sector = sector + zone->dev_start +
>  		tmp_dev->data_offset;
>  
> -	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
> -		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
> -		/* Just ignore it */
> -		bio_endio(bio);
> -	} else {
> -		if (mddev->gendisk)
> -			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
> -					      bio, disk_devt(mddev->gendisk),
> -					      bio_sector);
> -		mddev_check_writesame(mddev, bio);
> -		mddev_check_write_zeroes(mddev, bio);
> -		generic_make_request(bio);
> -	}
> +	if (mddev->gendisk)
> +		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
> +				      bio, disk_devt(mddev->gendisk),
> +				      bio_sector);
> +	mddev_check_writesame(mddev, bio);
> +	mddev_check_write_zeroes(mddev, bio);
> +	generic_make_request(bio);
>  }
>  
>  static void raid0_status(struct seq_file *seq, struct mddev *mddev)
> 

In general I enjoyed reading this patch and learned a nice idea from it.
I'd like to add an "Acked-by: Coly Li <colyli@suse.de>", and I am waiting
for your response to my question.

Thanks.

Coly




* Re: [PATCH] md/md0: optimize raid0 discard handling
  2017-05-08  7:31 ` Coly Li
@ 2017-05-08 16:37   ` Shaohua Li
  2017-05-08 16:52     ` Coly Li
  0 siblings, 1 reply; 6+ messages in thread
From: Shaohua Li @ 2017-05-08 16:37 UTC (permalink / raw)
  To: Coly Li; +Cc: Shaohua Li, linux-raid, Kernel-team, NeilBrown

On Mon, May 08, 2017 at 03:31:47PM +0800, Coly Li wrote:
> On 2017/5/8 8:36 AM, Shaohua Li wrote:
> > There are complaints that raid0 discard handling is slow. Currently we
> > divide discard request into chunks and dispatch to underlayer disks. The
> > block layer will do merge to form big requests. This causes a lot of
> > request split/merge and uses significant CPU time.
> > 
> > A simple idea is to calculate the range for each raid disk for an IO
> > request and send a discard request to raid disks, which will avoid the
> > split/merge completely. Previously Coly tried the approach, but the
> > implementation was too complex because of raid0 zones. This patch always
> > split bio in zone boundary and handle bio within one zone. It simplifies
> > the implementation a lot.
> > 
> > Cc: NeilBrown <neilb@suse.com>
> > Cc: Coly Li <colyli@suse.de>
> > Signed-off-by: Shaohua Li <shli@fb.com>
> > ---
> >  drivers/md/raid0.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++-------
> >  1 file changed, 102 insertions(+), 14 deletions(-)
> > 
> > diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
> > index 84e5859..d6c0bc7 100644
> > --- a/drivers/md/raid0.c
> > +++ b/drivers/md/raid0.c
> > @@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
> >  		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
> >  		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
> >  		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
> > -		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
> > +		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
> >  
> >  		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
> >  		blk_queue_io_opt(mddev->queue,
> > @@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
> >  	}
> >  }
> >  
> > +static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
> > +{
> > +	struct r0conf *conf = mddev->private;
> > +	struct strip_zone *zone;
> > +	sector_t start = bio->bi_iter.bi_sector;
> > +	sector_t end;
> > +	unsigned int stripe_size;
> > +	sector_t first_stripe_index, last_stripe_index;
> > +	sector_t start_disk_offset;
> > +	unsigned int start_disk_index;
> > +	sector_t end_disk_offset;
> > +	unsigned int end_disk_index;
> > +	unsigned int disk;
> > +
> > +	zone = find_zone(conf, &start);
> > +
> > +	if (bio_end_sector(bio) > zone->zone_end) {
> > +		struct bio *split = bio_split(bio,
> > +			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
> > +			mddev->bio_set);
> > +		bio_chain(split, bio);
> > +		generic_make_request(bio);
> > +		bio = split;
> > +		end = zone->zone_end;
> > +	} else
> > +		end = bio_end_sector(bio);
> > +
> 
> Nice, good idea. Indeed, I have a WIP patch which trys to follow your
> idea, but no clean as yours.
> 
> 
> > +	if (zone != conf->strip_zone)
> > +		end = end - zone[-1].zone_end;
> > +
> > +	/* Now start and end is the offset in zone */
> > +	stripe_size = zone->nb_dev * mddev->chunk_sectors;
> > +
> > +	first_stripe_index = start;
> > +	sector_div(first_stripe_index, stripe_size);
> > +	last_stripe_index = end;
> > +	sector_div(last_stripe_index, stripe_size);
> > +
> > +	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
> > +		mddev->chunk_sectors;
> > +	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
> > +		mddev->chunk_sectors) +
> > +		first_stripe_index * mddev->chunk_sectors;
> > +	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
> > +		mddev->chunk_sectors;
> > +	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
> > +		mddev->chunk_sectors) +
> > +		last_stripe_index * mddev->chunk_sectors;
> > +
> 
> Nice code, again! Much simpler. It is enjoyed experience when reading
> these calculation :-)
> 
> 
> > +	for (disk = 0; disk < zone->nb_dev; disk++) {
> > +		sector_t dev_start, dev_end;
> > +		struct bio *discard_bio = NULL;
> > +		struct md_rdev *rdev;
> > +
> > +		if (disk < start_disk_index)
> > +			dev_start = (first_stripe_index + 1) *
> > +				mddev->chunk_sectors;
> > +		else if (disk > start_disk_index)
> > +			dev_start = first_stripe_index * mddev->chunk_sectors;
> > +		else
> > +			dev_start = start_disk_offset;
> > +
> > +		if (disk < end_disk_index)
> > +			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
> > +		else if (disk > end_disk_index)
> > +			dev_end = last_stripe_index * mddev->chunk_sectors;
> > +		else
> > +			dev_end = end_disk_offset;
> > +
> > +		if (dev_end <= dev_start)
> > +			continue;
> > +
> > +		rdev = conf->devlist[(zone - conf->strip_zone) *
> > +			conf->strip_zone[0].nb_dev + disk];
> > +		if (__blkdev_issue_discard(rdev->bdev,
> > +			dev_start + zone->dev_start + rdev->data_offset,
> > +			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
> > +		    !discard_bio)
> > +			continue;
> 
> Could you please give me any hint ? Why __blkdev_issue_discard() is
> called here. The following generic_make_request() will issue the bio to
> underlying device and call __blkdev_issue_discard() in its code path
> eventually, why call this function explicitly here?

I was going to allocate a bio, init it and dispatch it to the raid disks, but
found that __blkdev_issue_discard() essentially does all of this, so I reused
it. Did you see any problem?

Thanks,
Shaohua
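
For anyone else wondering about the same thing: the calling convention the
patch relies on, as can be seen from its usage above, is roughly the sketch
below (bdev, sector, nr_sects and parent_bio are placeholders here, and this
is not a complete function):

	struct bio *discard_bio = NULL;

	/*
	 * Build the discard bio(s) for the range; the last bio is handed
	 * back through the final argument for the caller to chain and
	 * submit, rather than being submitted by the helper itself.
	 */
	if (__blkdev_issue_discard(bdev, sector, nr_sects,
				   GFP_NOIO, 0, &discard_bio) ||
	    !discard_bio)
		return;				/* error, or nothing to discard */

	bio_chain(discard_bio, parent_bio);	/* completion folds into parent_bio */
	generic_make_request(discard_bio);	/* caller submits the last bio */

That is exactly the per-disk pattern in raid0_handle_discard(), so reusing
the helper avoids open-coding the bio allocation and setup.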


* Re: [PATCH] md/md0: optimize raid0 discard handling
  2017-05-08 16:37   ` Shaohua Li
@ 2017-05-08 16:52     ` Coly Li
  0 siblings, 0 replies; 6+ messages in thread
From: Coly Li @ 2017-05-08 16:52 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, NeilBrown

On 2017/5/9 12:37 AM, Shaohua Li wrote:
> On Mon, May 08, 2017 at 03:31:47PM +0800, Coly Li wrote:
>> On 2017/5/8 8:36 AM, Shaohua Li wrote:
>>> There are complaints that raid0 discard handling is slow. Currently we
>>> divide discard request into chunks and dispatch to underlayer disks. The
>>> block layer will do merge to form big requests. This causes a lot of
>>> request split/merge and uses significant CPU time.
>>>
>>> A simple idea is to calculate the range for each raid disk for an IO
>>> request and send a discard request to raid disks, which will avoid the
>>> split/merge completely. Previously Coly tried the approach, but the
>>> implementation was too complex because of raid0 zones. This patch always
>>> split bio in zone boundary and handle bio within one zone. It simplifies
>>> the implementation a lot.
>>>
>>> Cc: NeilBrown <neilb@suse.com>
>>> Cc: Coly Li <colyli@suse.de>
>>> Signed-off-by: Shaohua Li <shli@fb.com>
>>> ---
>>>  drivers/md/raid0.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++-------
>>>  1 file changed, 102 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
>>> index 84e5859..d6c0bc7 100644
>>> --- a/drivers/md/raid0.c
>>> +++ b/drivers/md/raid0.c
>>> @@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
>>>  		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
>>>  		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
>>>  		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
>>> -		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
>>> +		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
>>>  
>>>  		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
>>>  		blk_queue_io_opt(mddev->queue,
>>> @@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
>>>  	}
>>>  }
>>>  
>>> +static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
>>> +{
>>> +	struct r0conf *conf = mddev->private;
>>> +	struct strip_zone *zone;
>>> +	sector_t start = bio->bi_iter.bi_sector;
>>> +	sector_t end;
>>> +	unsigned int stripe_size;
>>> +	sector_t first_stripe_index, last_stripe_index;
>>> +	sector_t start_disk_offset;
>>> +	unsigned int start_disk_index;
>>> +	sector_t end_disk_offset;
>>> +	unsigned int end_disk_index;
>>> +	unsigned int disk;
>>> +
>>> +	zone = find_zone(conf, &start);
>>> +
>>> +	if (bio_end_sector(bio) > zone->zone_end) {
>>> +		struct bio *split = bio_split(bio,
>>> +			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
>>> +			mddev->bio_set);
>>> +		bio_chain(split, bio);
>>> +		generic_make_request(bio);
>>> +		bio = split;
>>> +		end = zone->zone_end;
>>> +	} else
>>> +		end = bio_end_sector(bio);
>>> +
>>
>> Nice, good idea. Indeed, I have a WIP patch which trys to follow your
>> idea, but no clean as yours.
>>
>>
>>> +	if (zone != conf->strip_zone)
>>> +		end = end - zone[-1].zone_end;
>>> +
>>> +	/* Now start and end is the offset in zone */
>>> +	stripe_size = zone->nb_dev * mddev->chunk_sectors;
>>> +
>>> +	first_stripe_index = start;
>>> +	sector_div(first_stripe_index, stripe_size);
>>> +	last_stripe_index = end;
>>> +	sector_div(last_stripe_index, stripe_size);
>>> +
>>> +	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
>>> +		mddev->chunk_sectors;
>>> +	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
>>> +		mddev->chunk_sectors) +
>>> +		first_stripe_index * mddev->chunk_sectors;
>>> +	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
>>> +		mddev->chunk_sectors;
>>> +	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
>>> +		mddev->chunk_sectors) +
>>> +		last_stripe_index * mddev->chunk_sectors;
>>> +
>>
>> Nice code, again! Much simpler. It is enjoyed experience when reading
>> these calculation :-)
>>
>>
>>> +	for (disk = 0; disk < zone->nb_dev; disk++) {
>>> +		sector_t dev_start, dev_end;
>>> +		struct bio *discard_bio = NULL;
>>> +		struct md_rdev *rdev;
>>> +
>>> +		if (disk < start_disk_index)
>>> +			dev_start = (first_stripe_index + 1) *
>>> +				mddev->chunk_sectors;
>>> +		else if (disk > start_disk_index)
>>> +			dev_start = first_stripe_index * mddev->chunk_sectors;
>>> +		else
>>> +			dev_start = start_disk_offset;
>>> +
>>> +		if (disk < end_disk_index)
>>> +			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
>>> +		else if (disk > end_disk_index)
>>> +			dev_end = last_stripe_index * mddev->chunk_sectors;
>>> +		else
>>> +			dev_end = end_disk_offset;
>>> +
>>> +		if (dev_end <= dev_start)
>>> +			continue;
>>> +
>>> +		rdev = conf->devlist[(zone - conf->strip_zone) *
>>> +			conf->strip_zone[0].nb_dev + disk];
>>> +		if (__blkdev_issue_discard(rdev->bdev,
>>> +			dev_start + zone->dev_start + rdev->data_offset,
>>> +			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
>>> +		    !discard_bio)
>>> +			continue;
>>
>> Could you please give me any hint ? Why __blkdev_issue_discard() is
>> called here. The following generic_make_request() will issue the bio to
>> underlying device and call __blkdev_issue_discard() in its code path
>> eventually, why call this function explicitly here?
> 
> I was going to allocate a bio, init it and dispatch it to raid disks, but found
> __blkdev_issue_discard essentially does these, so reuse it. did you see any
> problem?
> 

I don't see anything suspicious here. I was just wondering whether it is
mandatory to call __blkdev_issue_discard() here. It's clear to me now,
thanks for your explanation.

Coly


* Re: [PATCH] md/md0: optimize raid0 discard handling
  2017-05-08  0:36 [PATCH] md/md0: optimize raid0 discard handling Shaohua Li
  2017-05-08  7:31 ` Coly Li
@ 2017-05-09  1:03 ` NeilBrown
  2017-05-10 16:09   ` Shaohua Li
  1 sibling, 1 reply; 6+ messages in thread
From: NeilBrown @ 2017-05-09  1:03 UTC (permalink / raw)
  To: Shaohua Li, linux-raid; +Cc: Kernel-team, Coly Li

On Sun, May 07 2017, Shaohua Li wrote:

> There are complaints that raid0 discard handling is slow. Currently we
> divide discard request into chunks and dispatch to underlayer disks. The
> block layer will do merge to form big requests. This causes a lot of
> request split/merge and uses significant CPU time.
>
> A simple idea is to calculate the range for each raid disk for an IO
> request and send a discard request to raid disks, which will avoid the
> split/merge completely. Previously Coly tried the approach, but the
> implementation was too complex because of raid0 zones. This patch always
> split bio in zone boundary and handle bio within one zone. It simplifies
> the implementation a lot.
>
> Cc: NeilBrown <neilb@suse.com>
> Cc: Coly Li <colyli@suse.de>
> Signed-off-by: Shaohua Li <shli@fb.com>

Reviewed-by: NeilBrown <neilb@suse.com>

I'm a little bit bothered by the use of __blkdev_issue_discard(), which
uses bio_alloc(), which allocates from fs_bio_set.  This code isn't in a
filesystem, so it feels wrong.

fs_bio_set has 4 entries.  If 4 different threads call
__blkdev_issue_discard() on a raid0 device, they could get allocated all
of the entries from the pool.  Then when raid0 calls
__blkdev_issue_discard(), the pool is empty.

I don't think this is actually a problem as discard (presumably) doesn't
happen on the write-out-for-memory-reclaim path, so the bio_alloc() will
eventually be able to get memory from kmalloc() rather than from the
pool.

Maybe next_bio() should use the bio_split pool from
bio->bi_bdev->queue.  But it probably doesn't really matter.

Thanks,
NeilBrown


> ---
>  drivers/md/raid0.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 102 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
> index 84e5859..d6c0bc7 100644
> --- a/drivers/md/raid0.c
> +++ b/drivers/md/raid0.c
> @@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
>  		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
>  		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
>  		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
> -		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
> +		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
>  
>  		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
>  		blk_queue_io_opt(mddev->queue,
> @@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
>  	}
>  }
>  
> +static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
> +{
> +	struct r0conf *conf = mddev->private;
> +	struct strip_zone *zone;
> +	sector_t start = bio->bi_iter.bi_sector;
> +	sector_t end;
> +	unsigned int stripe_size;
> +	sector_t first_stripe_index, last_stripe_index;
> +	sector_t start_disk_offset;
> +	unsigned int start_disk_index;
> +	sector_t end_disk_offset;
> +	unsigned int end_disk_index;
> +	unsigned int disk;
> +
> +	zone = find_zone(conf, &start);
> +
> +	if (bio_end_sector(bio) > zone->zone_end) {
> +		struct bio *split = bio_split(bio,
> +			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
> +			mddev->bio_set);
> +		bio_chain(split, bio);
> +		generic_make_request(bio);
> +		bio = split;
> +		end = zone->zone_end;
> +	} else
> +		end = bio_end_sector(bio);
> +
> +	if (zone != conf->strip_zone)
> +		end = end - zone[-1].zone_end;
> +
> +	/* Now start and end is the offset in zone */
> +	stripe_size = zone->nb_dev * mddev->chunk_sectors;
> +
> +	first_stripe_index = start;
> +	sector_div(first_stripe_index, stripe_size);
> +	last_stripe_index = end;
> +	sector_div(last_stripe_index, stripe_size);
> +
> +	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
> +		mddev->chunk_sectors;
> +	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
> +		mddev->chunk_sectors) +
> +		first_stripe_index * mddev->chunk_sectors;
> +	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
> +		mddev->chunk_sectors;
> +	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
> +		mddev->chunk_sectors) +
> +		last_stripe_index * mddev->chunk_sectors;
> +
> +	for (disk = 0; disk < zone->nb_dev; disk++) {
> +		sector_t dev_start, dev_end;
> +		struct bio *discard_bio = NULL;
> +		struct md_rdev *rdev;
> +
> +		if (disk < start_disk_index)
> +			dev_start = (first_stripe_index + 1) *
> +				mddev->chunk_sectors;
> +		else if (disk > start_disk_index)
> +			dev_start = first_stripe_index * mddev->chunk_sectors;
> +		else
> +			dev_start = start_disk_offset;
> +
> +		if (disk < end_disk_index)
> +			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
> +		else if (disk > end_disk_index)
> +			dev_end = last_stripe_index * mddev->chunk_sectors;
> +		else
> +			dev_end = end_disk_offset;
> +
> +		if (dev_end <= dev_start)
> +			continue;
> +
> +		rdev = conf->devlist[(zone - conf->strip_zone) *
> +			conf->strip_zone[0].nb_dev + disk];
> +		if (__blkdev_issue_discard(rdev->bdev,
> +			dev_start + zone->dev_start + rdev->data_offset,
> +			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
> +		    !discard_bio)
> +			continue;
> +		bio_chain(discard_bio, bio);
> +		if (mddev->gendisk)
> +			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
> +				discard_bio, disk_devt(mddev->gendisk),
> +				bio->bi_iter.bi_sector);
> +		generic_make_request(discard_bio);
> +	}
> +	bio_endio(bio);
> +}
> +
>  static void raid0_make_request(struct mddev *mddev, struct bio *bio)
>  {
>  	struct strip_zone *zone;
> @@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
>  		return;
>  	}
>  
> +	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
> +		raid0_handle_discard(mddev, bio);
> +		return;
> +	}
> +
>  	bio_sector = bio->bi_iter.bi_sector;
>  	sector = bio_sector;
>  	chunk_sects = mddev->chunk_sectors;
> @@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
>  	bio->bi_iter.bi_sector = sector + zone->dev_start +
>  		tmp_dev->data_offset;
>  
> -	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
> -		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
> -		/* Just ignore it */
> -		bio_endio(bio);
> -	} else {
> -		if (mddev->gendisk)
> -			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
> -					      bio, disk_devt(mddev->gendisk),
> -					      bio_sector);
> -		mddev_check_writesame(mddev, bio);
> -		mddev_check_write_zeroes(mddev, bio);
> -		generic_make_request(bio);
> -	}
> +	if (mddev->gendisk)
> +		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
> +				      bio, disk_devt(mddev->gendisk),
> +				      bio_sector);
> +	mddev_check_writesame(mddev, bio);
> +	mddev_check_write_zeroes(mddev, bio);
> +	generic_make_request(bio);
>  }
>  
>  static void raid0_status(struct seq_file *seq, struct mddev *mddev)
> -- 
> 2.9.3
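
As a footnote to the fs_bio_set concern above: the "use the bio_split pool
from the queue" idea would roughly mean changing the bio allocation in
next_bio() along the lines below. This is only a hypothetical sketch of that
suggestion, not a patch anyone posted; 'bdev' is a placeholder for the target
block device, which would still need to be plumbed into next_bio():

	/* today: draws from the shared fs_bio_set */
	new = bio_alloc(gfp, nr_pages);

	/* suggested direction: draw from the queue's own bio_split pool */
	new = bio_alloc_bioset(gfp, nr_pages,
			       bdev_get_queue(bdev)->bio_split);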


* Re: [PATCH] md/md0: optimize raid0 discard handling
  2017-05-09  1:03 ` NeilBrown
@ 2017-05-10 16:09   ` Shaohua Li
  0 siblings, 0 replies; 6+ messages in thread
From: Shaohua Li @ 2017-05-10 16:09 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, linux-raid, Kernel-team, Coly Li

On Tue, May 09, 2017 at 11:03:02AM +1000, Neil Brown wrote:
> On Sun, May 07 2017, Shaohua Li wrote:
> 
> > There are complaints that raid0 discard handling is slow. Currently we
> > divide discard request into chunks and dispatch to underlayer disks. The
> > block layer will do merge to form big requests. This causes a lot of
> > request split/merge and uses significant CPU time.
> >
> > A simple idea is to calculate the range for each raid disk for an IO
> > request and send a discard request to raid disks, which will avoid the
> > split/merge completely. Previously Coly tried the approach, but the
> > implementation was too complex because of raid0 zones. This patch always
> > split bio in zone boundary and handle bio within one zone. It simplifies
> > the implementation a lot.
> >
> > Cc: NeilBrown <neilb@suse.com>
> > Cc: Coly Li <colyli@suse.de>
> > Signed-off-by: Shaohua Li <shli@fb.com>
> 
> Reviewed-by: NeilBrown <neilb@suse.com>
> 
> I'm a little bit bothered by the use for __blkdev_issue_discard() which
> uses bio_alloc(), which allocates from fs_bio_set.  This code isn't in a
> filesystem, so it feels wrong.
> 
> fs_bio_set has 4 entries.  If 4 different threads call
> __blkdev_issue_discard() on a raid0 device, they could get allocated all
> of the entries from the pool.  Then when raid0 calls
> __blkdev_issue_discard(), the pool is empty.
> 
> I don't think this is actually a problem as discard (presumably) doesn't
> happen on the write-out-for-memory-reclaim path, so the bio_alloc() will
> eventually be able to get memory from kmalloc() rather than from the
> pool.
> 
> Maybe next_bio() should use the bio_split pool from
> bio->bi_bdev->queue.  But it probably doesn't really matter.

Indeed, this part isn't comfortable. I'd suppose discard is only called from
the fs transaction thread, so only one thread is calling into discard.
Probably not a big deal.

Thanks,
Shaohua

