All of lore.kernel.org
 help / color / mirror / Atom feed
From: John Garry <john.g.garry@oracle.com>
To: axboe@kernel.dk, kbusch@kernel.org, hch@lst.de, sagi@grimberg.me,
	martin.petersen@oracle.com, djwong@kernel.org,
	viro@zeniv.linux.org.uk, brauner@kernel.org, dchinner@redhat.com,
	jejb@linux.ibm.com
Cc: linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-nvme@lists.infradead.org, linux-scsi@vger.kernel.org,
	linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-security-module@vger.kernel.org, paul@paul-moore.com,
	jmorris@namei.org, serge@hallyn.com,
	John Garry <john.g.garry@oracle.com>
Subject: [PATCH RFC 08/16] block: Add support for atomic_write_unit
Date: Wed,  3 May 2023 18:38:13 +0000	[thread overview]
Message-ID: <20230503183821.1473305-9-john.g.garry@oracle.com> (raw)
In-Reply-To: <20230503183821.1473305-1-john.g.garry@oracle.com>

Add bio.atomic_write_unit, which is the minimum size into which we can split
a bio. Any bio needs to be split at a multiple of this size and also be
aligned to this size.

In __bio_iov_iter_get_pages(), use atomic_write_unit to trim a bio to
be a multiple of atomic_write_unit.

In bio_split_rw(), we need to consider splitting as follows:
- For a regular split which does not cross an atomic write boundary, same
  as in __bio_iov_iter_get_pages(), trim to be a multiple of
  atomic_write_unit
- We also need to check for when a bio straddles an atomic write boundary.
  In this case, split to be start/end-aligned with the boundary.

We need to ignore lim->max_sectors since it may be less than
bio->atomic_write_unit, which we cannot tolerate.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 block/bio.c               |  7 +++-
 block/blk-merge.c         | 84 ++++++++++++++++++++++++++++++++++-----
 include/linux/blk_types.h |  2 +
 3 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index fd11614bba4d..fc2f29e1c14c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -247,6 +247,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 	      unsigned short max_vecs, blk_opf_t opf)
 {
 	bio->bi_next = NULL;
+	bio->atomic_write_unit = 0;
 	bio->bi_bdev = bdev;
 	bio->bi_opf = opf;
 	bio->bi_flags = 0;
@@ -815,6 +816,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
 	bio->bi_ioprio = bio_src->bi_ioprio;
 	bio->bi_iter = bio_src->bi_iter;
 
+	bio->atomic_write_unit = bio_src->atomic_write_unit;
 	if (bio->bi_bdev) {
 		if (bio->bi_bdev == bio_src->bi_bdev &&
 		    bio_flagged(bio_src, BIO_REMAPPED))
@@ -1273,7 +1275,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 
 	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
 
-	trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
+	if (bio->atomic_write_unit)
+		trim = size & (bio->atomic_write_unit - 1);
+	else
+		trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
 	iov_iter_revert(iter, trim);
 
 	size -= trim;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6460abdb2426..95ab6b644955 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -171,7 +171,17 @@ static inline unsigned get_max_io_size(struct bio *bio,
 {
 	unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
 	unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
-	unsigned max_sectors = lim->max_sectors, start, end;
+	unsigned max_sectors, start, end;
+
+	/*
+	 * We ignore lim->max_sectors for atomic writes simply because
+	 * it may be less than bio->atomic_write_unit, which we cannot
+	 * tolerate.
+	 */
+	if (bio->bi_opf & REQ_ATOMIC)
+		max_sectors = lim->atomic_write_max_bytes >> SECTOR_SHIFT;
+	else
+		max_sectors = lim->max_sectors;
 
 	if (lim->chunk_sectors) {
 		max_sectors = min(max_sectors,
@@ -256,6 +266,22 @@ static bool bvec_split_segs(const struct queue_limits *lim,
 	return len > 0 || bv->bv_len > max_len;
 }
 
+static bool bio_straddles_boundary(struct bio *bio, unsigned int bytes,
+				   unsigned int boundary)
+{
+	loff_t start = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	loff_t end = start + bytes;
+	loff_t start_mod = start % boundary;
+	loff_t end_mod = end % boundary;
+
+	if (end - start > boundary)
+		return true;
+	if ((start_mod > end_mod) && (start_mod && end_mod))
+		return true;
+
+	return false;
+}
+
 /**
  * bio_split_rw - split a bio in two bios
  * @bio:  [in] bio to be split
@@ -276,10 +302,15 @@ static bool bvec_split_segs(const struct queue_limits *lim,
  * responsible for ensuring that @bs is only destroyed after processing of the
  * split bio has finished.
  */
+
+
 struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 		unsigned *segs, struct bio_set *bs, unsigned max_bytes)
 {
+	unsigned int atomic_write_boundary = lim->atomic_write_boundary;
+	bool atomic_write = bio->bi_opf & REQ_ATOMIC;
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
+	bool straddles_boundary = false;
 	struct bvec_iter iter;
 	unsigned nsegs = 0, bytes = 0;
 
@@ -291,14 +322,31 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
 			goto split;
 
+		if (atomic_write && atomic_write_boundary) {
+			straddles_boundary = bio_straddles_boundary(bio,
+					bytes + bv.bv_len, atomic_write_boundary);
+		}
 		if (nsegs < lim->max_segments &&
 		    bytes + bv.bv_len <= max_bytes &&
-		    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
+		    bv.bv_offset + bv.bv_len <= PAGE_SIZE &&
+		    !straddles_boundary) {
 			nsegs++;
 			bytes += bv.bv_len;
 		} else {
-			if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
-					lim->max_segments, max_bytes))
+			bool split_the_segs =
+				bvec_split_segs(lim, &bv, &nsegs, &bytes,
+						lim->max_segments, max_bytes);
+
+			/*
+			 * We may not actually straddle the boundary as we may
+			 * have added fewer bytes than anticipated
+			 */
+			if (straddles_boundary) {
+				straddles_boundary = bio_straddles_boundary(bio,
+						bytes, atomic_write_boundary);
+			}
+
+			if (split_the_segs || straddles_boundary)
 				goto split;
 		}
 
@@ -321,12 +369,25 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 
 	*segs = nsegs;
 
-	/*
-	 * Individual bvecs might not be logical block aligned. Round down the
-	 * split size so that each bio is properly block size aligned, even if
-	 * we do not use the full hardware limits.
-	 */
-	bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
+	if (straddles_boundary) {
+		loff_t new_end = (bio->bi_iter.bi_sector << SECTOR_SHIFT) + bytes;
+		unsigned int trim = new_end & (atomic_write_boundary - 1);
+		bytes -= trim;
+		new_end = (bio->bi_iter.bi_sector << SECTOR_SHIFT) + bytes;
+		BUG_ON(new_end % atomic_write_boundary);
+	} else if (bio->atomic_write_unit) {
+		unsigned int atomic_write_unit = bio->atomic_write_unit;
+		unsigned int trim = bytes % atomic_write_unit;
+
+		bytes -= trim;
+	} else {
+		/*
+		 * Individual bvecs might not be logical block aligned. Round down the
+		 * split size so that each bio is properly block size aligned, even if
+		 * we do not use the full hardware limits.
+		 */
+		bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
+	}
 
 	/*
 	 * Bio splitting may cause subtle trouble such as hang when doing sync
@@ -355,7 +416,8 @@ struct bio *__bio_split_to_limits(struct bio *bio,
 				  const struct queue_limits *lim,
 				  unsigned int *nr_segs)
 {
-	struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
+	struct block_device *bi_bdev = bio->bi_bdev;
+	struct bio_set *bs = &bi_bdev->bd_disk->bio_split;
 	struct bio *split;
 
 	switch (bio_op(bio)) {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 347b52e00322..daa44eac9f14 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -303,6 +303,8 @@ struct bio {
 
 	struct bio_set		*bi_pool;
 
+	unsigned int atomic_write_unit;
+
 	/*
 	 * We can inline a number of vecs at the end of the bio, to avoid
 	 * double allocations for a small number of bio_vecs. This member
-- 
2.31.1


  parent reply	other threads:[~2023-05-03 18:40 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-05-03 18:38 [PATCH RFC 00/16] block atomic writes John Garry
2023-05-03 18:38 ` [PATCH RFC 01/16] block: Add atomic write operations to request_queue limits John Garry
2023-05-03 21:39   ` Dave Chinner
2023-05-04 18:14     ` John Garry
2023-05-04 22:26       ` Dave Chinner
2023-05-05  7:54         ` John Garry
2023-05-05 22:00           ` Darrick J. Wong
2023-05-07  1:59             ` Martin K. Petersen
2023-05-05 23:18           ` Dave Chinner
2023-05-06  9:38             ` John Garry
2023-05-07  2:35             ` Martin K. Petersen
2023-05-05 22:47         ` Eric Biggers
2023-05-05 23:31           ` Dave Chinner
2023-05-06  0:08             ` Eric Biggers
2023-05-09  0:19   ` Mike Snitzer
2023-05-09  0:19     ` [dm-devel] " Mike Snitzer
2023-05-17 17:02     ` John Garry
2023-05-17 17:02       ` [dm-devel] " John Garry
2023-05-03 18:38 ` [PATCH RFC 02/16] fs/bdev: Add atomic write support info to statx John Garry
2023-05-03 21:58   ` Dave Chinner
2023-05-04  8:45     ` John Garry
2023-05-04 22:40       ` Dave Chinner
2023-05-05  8:01         ` John Garry
2023-05-05 22:04           ` Darrick J. Wong
2023-05-03 18:38 ` [PATCH RFC 03/16] xfs: Support atomic write for statx John Garry
2023-05-03 22:17   ` Dave Chinner
2023-05-05 22:10     ` Darrick J. Wong
2023-05-03 18:38 ` [PATCH RFC 04/16] fs: Add RWF_ATOMIC and IOCB_ATOMIC flags for atomic write support John Garry
2023-05-03 18:38 ` [PATCH RFC 05/16] block: Add REQ_ATOMIC flag John Garry
2023-05-03 18:38 ` [PATCH RFC 06/16] block: Limit atomic writes according to bio and queue limits John Garry
2023-05-03 18:53   ` Keith Busch
2023-05-04  8:24     ` John Garry
2023-05-03 18:38 ` [PATCH RFC 07/16] block: Add bdev_find_max_atomic_write_alignment() John Garry
2023-05-04  1:57   ` kernel test robot
2023-05-03 18:38 ` John Garry [this message]
2023-05-04  4:00   ` [PATCH RFC 08/16] block: Add support for atomic_write_unit kernel test robot
2023-05-03 18:38 ` [PATCH RFC 09/16] block: Add blk_validate_atomic_write_op() John Garry
2023-05-03 18:38 ` [PATCH RFC 10/16] block: Add fops atomic write support John Garry
2023-05-03 18:38 ` [PATCH RFC 11/16] fs: iomap: Atomic " John Garry
2023-05-04  5:00   ` Dave Chinner
2023-05-05 21:19     ` Darrick J. Wong
2023-05-05 23:56       ` Dave Chinner
2023-05-03 18:38 ` [PATCH RFC 12/16] xfs: Add support for fallocate2 John Garry
2023-05-03 23:21   ` kernel test robot
2023-05-03 23:21   ` kernel test robot
2023-05-03 23:26   ` Dave Chinner
2023-05-05 22:23     ` Darrick J. Wong
2023-05-05 23:42       ` Dave Chinner
2023-05-04  7:28   ` kernel test robot
2023-05-03 18:38 ` [PATCH RFC 13/16] scsi: sd: Support reading atomic properties from block limits VPD John Garry
2023-05-03 18:38 ` [PATCH RFC 14/16] scsi: sd: Add WRITE_ATOMIC_16 support John Garry
2023-05-03 18:48   ` Bart Van Assche
2023-05-04  8:17     ` John Garry
2023-05-03 18:38 ` [PATCH RFC 15/16] scsi: scsi_debug: Atomic write support John Garry
2023-05-04  2:17   ` kernel test robot
2023-05-03 18:38 ` [PATCH RFC 16/16] nvme: Support atomic writes John Garry
2023-05-03 18:49   ` Bart Van Assche
2023-05-04  8:19     ` John Garry

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230503183821.1473305-9-john.g.garry@oracle.com \
    --to=john.g.garry@oracle.com \
    --cc=axboe@kernel.dk \
    --cc=brauner@kernel.org \
    --cc=dchinner@redhat.com \
    --cc=djwong@kernel.org \
    --cc=hch@lst.de \
    --cc=jejb@linux.ibm.com \
    --cc=jmorris@namei.org \
    --cc=kbusch@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=linux-security-module@vger.kernel.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=paul@paul-moore.com \
    --cc=sagi@grimberg.me \
    --cc=serge@hallyn.com \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.