All of lore.kernel.org
 help / color / mirror / Atom feed
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
To: linux-block@vger.kernel.org
Cc: tytso@mit.edu, Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Subject: [PATCH] block: add split_alignment for request queue
Date: Mon, 15 Jun 2020 17:56:33 -0700	[thread overview]
Message-ID: <20200616005633.172804-1-harshadshirwadkar@gmail.com> (raw)

This feature allows the user to control the alignment at which request
queue is allowed to split bios. Google CloudSQL's 16k user space
application expects that direct io writes aligned at 16k boundary in
the user-space are not split by kernel at non-16k boundaries. More
details about this feature can be found in CloudSQL's Cloud Next 2018
presentation[1]. The underlying block device is capable of performing
16k aligned writes atomically. Thus, this allows the user-space SQL
application to avoid double-writes (to protect against partial
failures) which are very costly provided that these writes are not
split at non-16k boundary by any underlying layers.

We make use of Ext4's bigalloc feature to ensure that writes issued by
Ext4 are 16k aligned. But, 16K aligned data writes may get merged with
contiguous non-16k aligned Ext4 metadata writes. Such a write request
would be broken by the kernel only guaranteeing that the individually
split requests are physical block size aligned.

We started observing a significant increase in 16k unaligned splits in
5.4. Bisect points to commit 07173c3ec276cbb18dc0e0687d37d310e98a1480
("block: enable multipage bvecs"). This patch enables multipage bvecs
resulting in multiple 16k aligned writes issued by the user-space to
be merged into one big IO at first. Later, __blk_queue_split() splits
these IOs while trying to align individual split IOs to be physical
block size.

Newly added split_alignment parameter is the alignment at which
requeust queue is allowed to split IO request. By default this
alignment is turned off and current behavior is unchanged.

[1] CloudNext'18 "Optimizaing performance for Cloud SQL for MySQL"
    https://www.youtube.com/watch?v=gIeuiGg-_iw

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 Documentation/block/queue-sysfs.rst |  8 ++++
 block/blk-merge.c                   | 64 ++++++++++++++++++++++-------
 block/blk-sysfs.c                   | 30 ++++++++++++++
 include/linux/blkdev.h              |  1 +
 4 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
index 6a8513af9201..c3eaba149415 100644
--- a/Documentation/block/queue-sysfs.rst
+++ b/Documentation/block/queue-sysfs.rst
@@ -251,4 +251,12 @@ devices are described in the ZBC (Zoned Block Commands) and ZAC
 do not support zone commands, they will be treated as regular block devices
 and zoned will report "none".
 
+split_alignment (RW)
+----------------------
+This is the alignment in bytes at which the requeust queue is allowed
+to split IO requests. Once this value is set, the requeust queue
+splits IOs such that the individual IOs are aligned to
+split_alignment. The value of 0 indicates that an IO request can be
+split anywhere. This value must be a power of 2.
+
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1534ed736363..cdf337c74b83 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -105,15 +105,18 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
 		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
 {
+	sector_t split;
+
 	*nsegs = 0;
 
-	if (!q->limits.max_write_zeroes_sectors)
-		return NULL;
+	split = q->limits.max_write_zeroes_sectors;
+	if (split && q->split_alignment >> 9)
+		split = round_down(split, q->split_alignment >> 9);
 
-	if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
+	if (!split || bio_sectors(bio) <= split)
 		return NULL;
 
-	return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
+	return bio_split(bio, split, GFP_NOIO, bs);
 }
 
 static struct bio *blk_bio_write_same_split(struct request_queue *q,
@@ -121,15 +124,18 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
 					    struct bio_set *bs,
 					    unsigned *nsegs)
 {
+	sector_t split;
+
 	*nsegs = 1;
 
-	if (!q->limits.max_write_same_sectors)
-		return NULL;
+	split = q->limits.max_write_same_sectors;
+	if (split && q->split_alignment >> 9)
+		split = round_down(split, q->split_alignment >> 9);
 
-	if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
+	if (!split || bio_sectors(bio) <= split)
 		return NULL;
 
-	return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
+	return bio_split(bio, split, GFP_NOIO, bs);
 }
 
 /*
@@ -248,7 +254,10 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
-	unsigned nsegs = 0, sectors = 0;
+	unsigned int nsegs = 0, nsegs_aligned = 0;
+	unsigned int sectors = 0, sectors_aligned = 0, before = 0, after = 0;
+	unsigned int sector_alignment =
+		q->split_alignment ? (q->split_alignment >> 9) : 0;
 	const unsigned max_sectors = get_max_io_size(q, bio);
 	const unsigned max_segs = queue_max_segments(q);
 
@@ -264,12 +273,31 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 		    sectors + (bv.bv_len >> 9) <= max_sectors &&
 		    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
 			nsegs++;
-			sectors += bv.bv_len >> 9;
-		} else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
-					 max_sectors)) {
-			goto split;
+			before = round_down(sectors, sector_alignment);
+			sectors += (bv.bv_len >> 9);
+			after = round_down(sectors, sector_alignment);
+			if (sector_alignment && before != after) {
+				/* This is a valid split point */
+				nsegs_aligned = nsegs;
+				sectors_aligned = after;
+			}
+			goto next;
 		}
-
+		if (sector_alignment) {
+			before = round_down(sectors, sector_alignment);
+			after = round_down(sectors + (bv.bv_len >> 9),
+					  sector_alignment);
+			if ((nsegs < max_segs) && before != after &&
+			    ((after - before) << 9) + bv.bv_offset <=  PAGE_SIZE
+			    && after <= max_sectors) {
+				sectors_aligned = after;
+				nsegs_aligned = nsegs + 1;
+			}
+		}
+		if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
+				    max_sectors))
+			goto split;
+next:
 		bvprv = bv;
 		bvprvp = &bvprv;
 	}
@@ -278,7 +306,13 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 	return NULL;
 split:
 	*segs = nsegs;
-	return bio_split(bio, sectors, GFP_NOIO, bs);
+	if (sector_alignment && sectors_aligned == 0)
+		return NULL;
+
+	*segs = sector_alignment ? nsegs_aligned : nsegs;
+
+	return bio_split(bio, sector_alignment ? sectors_aligned : sectors,
+			 GFP_NOIO, bs);
 }
 
 /**
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index fca9b158f4a0..f045c7a79a74 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -529,6 +529,29 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
 	return queue_var_show(blk_queue_dax(q), page);
 }
 
+static ssize_t queue_split_alignment_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->split_alignment, page);
+}
+
+static ssize_t queue_split_alignment_store(struct request_queue *q, const char *page,
+						size_t count)
+{
+	unsigned long split_alignment;
+	int ret;
+
+	ret = queue_var_store(&split_alignment, page, count);
+	if (ret < 0)
+		return ret;
+
+	/* split_alignment can only be a power of 2 */
+	if (split_alignment & (split_alignment - 1))
+		return -EINVAL;
+
+	q->split_alignment = split_alignment;
+	return count;
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = 0644 },
 	.show = queue_requests_show,
@@ -727,6 +750,12 @@ static struct queue_sysfs_entry throtl_sample_time_entry = {
 };
 #endif
 
+static struct queue_sysfs_entry queue_split_alignment = {
+	.attr = {.name = "split_alignment", .mode = 0644 },
+	.show = queue_split_alignment_show,
+	.store = queue_split_alignment_store,
+};
+
 static struct attribute *queue_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -766,6 +795,7 @@ static struct attribute *queue_attrs[] = {
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	&throtl_sample_time_entry.attr,
 #endif
+	&queue_split_alignment.attr,
 	NULL,
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 32868fbedc9e..e8feb43f6fdd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -473,6 +473,7 @@ struct request_queue {
 	void			*dma_drain_buffer;
 	unsigned int		dma_pad_mask;
 	unsigned int		dma_alignment;
+	unsigned int		split_alignment;
 
 	unsigned int		rq_timeout;
 	int			poll_nsec;
-- 
2.27.0.290.gba653c62da-goog


             reply	other threads:[~2020-06-16  0:56 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-06-16  0:56 Harshad Shirwadkar [this message]
2020-06-16  2:14 ` [PATCH] block: add split_alignment for request queue John Dorminy
2020-06-16  2:25   ` Martin K. Petersen
2020-06-16  2:40 ` Ming Lei
2020-06-16  2:50   ` harshad shirwadkar
2020-06-16  3:24     ` harshad shirwadkar
2020-06-16  4:16       ` Ming Lei
2020-06-16  7:29 ` Christoph Hellwig
2020-06-16 11:12   ` Sweet Tea Dorminy
2020-06-16 14:38     ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200616005633.172804-1-harshadshirwadkar@gmail.com \
    --to=harshadshirwadkar@gmail.com \
    --cc=linux-block@vger.kernel.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.