[PATCH RFC 08/16] block: Add support for atomic_write_unit

From: John Garry <john.g.garry@oracle.com>
To: axboe@kernel.dk, kbusch@kernel.org, hch@lst.de, sagi@grimberg.me,
	martin.petersen@oracle.com, djwong@kernel.org,
	viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com,
	jejb@linux.ibm.com
Cc: linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-nvme@lists.infradead.org, linux-scsi@vger.kernel.org,
	linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-security-module@vger.kernel.org, paul@paul-moore.com,
	jmorris@namei.org, serge@hallyn.com,
	John Garry <john.g.garry@oracle.com>
Subject: [PATCH RFC 08/16] block: Add support for atomic_write_unit
Date: Wed,  3 May 2023 18:38:13 +0000	[thread overview]
Message-ID: <20230503183821.1473305-9-john.g.garry@oracle.com> (raw)
In-Reply-To: <20230503183821.1473305-1-john.g.garry@oracle.com>

Add bio.atomic_write_unit, which is the minimum size into which a bio may
be split. Any split of a bio must be a multiple of this size and must also
be aligned to this size.

In __bio_iov_iter_get_pages(), use atomic_write_unit to trim a bio to
be a multiple of atomic_write_unit.

In bio_split_rw(), we need to consider splitting as follows:
- For a regular split which does not cross an atomic write boundary, same
  as in __bio_iov_iter_get_pages(), trim to be a multiple of
  atomic_write_unit
- We also need to check for when a bio straddles an atomic write boundary.
  In this case, split to be start/end-aligned with the boundary.

We need to ignore lim->max_sectors since it may be less than
bio->atomic_write_unit, which we cannot tolerate.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 block/bio.c               |  7 +++-
 block/blk-merge.c         | 84 ++++++++++++++++++++++++++++++++++-----
 include/linux/blk_types.h |  2 +
 3 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index fd11614bba4d..fc2f29e1c14c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -247,6 +247,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 	      unsigned short max_vecs, blk_opf_t opf)
 {
 	bio->bi_next = NULL;
+	bio->atomic_write_unit = 0;
 	bio->bi_bdev = bdev;
 	bio->bi_opf = opf;
 	bio->bi_flags = 0;
@@ -815,6 +816,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
 	bio->bi_ioprio = bio_src->bi_ioprio;
 	bio->bi_iter = bio_src->bi_iter;
 
+	bio->atomic_write_unit = bio_src->atomic_write_unit;
 	if (bio->bi_bdev) {
 		if (bio->bi_bdev == bio_src->bi_bdev &&
 		    bio_flagged(bio_src, BIO_REMAPPED))
@@ -1273,7 +1275,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 
 	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
 
-	trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
+	if (bio->atomic_write_unit)
+		trim = size & (bio->atomic_write_unit - 1);
+	else
+		trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
 	iov_iter_revert(iter, trim);
 
 	size -= trim;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6460abdb2426..95ab6b644955 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -171,7 +171,17 @@ static inline unsigned get_max_io_size(struct bio *bio,
 {
 	unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
 	unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
-	unsigned max_sectors = lim->max_sectors, start, end;
+	unsigned max_sectors, start, end;
+
+	/*
+	 * We ignore lim->max_sectors for atomic writes simply because
+	 * it may be less than bio->atomic_write_unit, which we cannot
+	 * tolerate.
+	 */
+	if (bio->bi_opf & REQ_ATOMIC)
+		max_sectors = lim->atomic_write_max_bytes >> SECTOR_SHIFT;
+	else
+		max_sectors = lim->max_sectors;
 
 	if (lim->chunk_sectors) {
 		max_sectors = min(max_sectors,
@@ -256,6 +266,22 @@ static bool bvec_split_segs(const struct queue_limits *lim,
 	return len > 0 || bv->bv_len > max_len;
 }
 
+/* True if the first @bytes of @bio cross a multiple of @boundary (in bytes). */
+static bool bio_straddles_boundary(struct bio *bio, unsigned int bytes,
+				   unsigned int boundary)
+{
+	loff_t start = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	loff_t start_mod = start % boundary;
+
+	/*
+	 * The next boundary lies (boundary - start_mod) bytes past the start;
+	 * the range straddles it only when it extends beyond that point. Ending
+	 * exactly on the boundary does not count. This also covers an unaligned
+	 * range of exactly @boundary bytes, whose start/end remainders are equal.
+	 */
+	return start_mod + bytes > boundary;
+}
+
 /**
  * bio_split_rw - split a bio in two bios
  * @bio:  [in] bio to be split
@@ -276,10 +302,15 @@ static bool bvec_split_segs(const struct queue_limits *lim,
  * responsible for ensuring that @bs is only destroyed after processing of the
  * split bio has finished.
  */
+
+
 struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 		unsigned *segs, struct bio_set *bs, unsigned max_bytes)
 {
+	unsigned int atomic_write_boundary = lim->atomic_write_boundary;
+	bool atomic_write = bio->bi_opf & REQ_ATOMIC;
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
+	bool straddles_boundary = false;
 	struct bvec_iter iter;
 	unsigned nsegs = 0, bytes = 0;
 
@@ -291,14 +322,31 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
 			goto split;
 
+		if (atomic_write && atomic_write_boundary) {
+			straddles_boundary = bio_straddles_boundary(bio,
+					bytes + bv.bv_len, atomic_write_boundary);
+		}
 		if (nsegs < lim->max_segments &&
 		    bytes + bv.bv_len <= max_bytes &&
-		    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
+		    bv.bv_offset + bv.bv_len <= PAGE_SIZE &&
+		    !straddles_boundary) {
 			nsegs++;
 			bytes += bv.bv_len;
 		} else {
-			if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
-					lim->max_segments, max_bytes))
+			bool split_the_segs =
+				bvec_split_segs(lim, &bv, &nsegs, &bytes,
+						lim->max_segments, max_bytes);
+
+			/*
+			 * We may not actually straddle the boundary as we may
+			 * have added fewer bytes than anticipated
+			 */
+			if (straddles_boundary) {
+				straddles_boundary = bio_straddles_boundary(bio,
+						bytes, atomic_write_boundary);
+			}
+
+			if (split_the_segs || straddles_boundary)
 				goto split;
 		}
 
@@ -321,12 +369,25 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 
 	*segs = nsegs;
 
-	/*
-	 * Individual bvecs might not be logical block aligned. Round down the
-	 * split size so that each bio is properly block size aligned, even if
-	 * we do not use the full hardware limits.
-	 */
-	bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
+	if (straddles_boundary) {
+		loff_t new_end = (bio->bi_iter.bi_sector << SECTOR_SHIFT) + bytes;
+		unsigned int trim = new_end & (atomic_write_boundary - 1);
+		bytes -= trim;
+		new_end = (bio->bi_iter.bi_sector << SECTOR_SHIFT) + bytes;
+		BUG_ON(new_end % atomic_write_boundary);
+	} else if (bio->atomic_write_unit) {
+		unsigned int atomic_write_unit = bio->atomic_write_unit;
+		unsigned int trim = bytes % atomic_write_unit;
+
+		bytes -= trim;
+	} else {
+		/*
+		 * Individual bvecs might not be logical block aligned. Round down the
+		 * split size so that each bio is properly block size aligned, even if
+		 * we do not use the full hardware limits.
+		 */
+		bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
+	}
 
 	/*
 	 * Bio splitting may cause subtle trouble such as hang when doing sync
@@ -355,7 +416,8 @@ struct bio *__bio_split_to_limits(struct bio *bio,
 				  const struct queue_limits *lim,
 				  unsigned int *nr_segs)
 {
-	struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
+	struct block_device *bi_bdev = bio->bi_bdev;
+	struct bio_set *bs = &bi_bdev->bd_disk->bio_split;
 	struct bio *split;
 
 	switch (bio_op(bio)) {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 347b52e00322..daa44eac9f14 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -303,6 +303,8 @@ struct bio {
 
 	struct bio_set		*bi_pool;
 
+	unsigned int atomic_write_unit;
+
 	/*
 	 * We can inline a number of vecs at the end of the bio, to avoid
 	 * double allocations for a small number of bio_vecs. This member
-- 
2.31.1