* [PATCHv3 0/6] direct io dma alignment
From: Keith Busch @ 2022-05-23 21:01 UTC
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, ebiggers,
	Keith Busch

From: Keith Busch <kbusch@kernel.org>

Here is the third version of the series enabling non-block-aligned user
space addresses for O_DIRECT.

Changes since v2:

  Folded in improvements cleaning up the zone append pages code (Christoph)

  Added documentation for the exported attribute (Bart)

  Split bdev_dma_alignment() into its own patch (Christoph)

  Removed the fs/ changes implementing support for these address
  offsets for now

  Fixed up a couple of places assuming bv_len is a SECTOR_SIZE multiple

On that last point, I searched through much of the code and found a few
other places that also make this bv_len assumption, but they don't apply
to this change since those drivers don't set 'dma_alignment'. This got
me thinking, though: should this be a new attribute, 'dio_alignment',
instead of reusing the pre-existing 'dma_alignment' for this purpose? Or
is it clear enough that drivers setting the existing attribute ought to
know what they're getting into?

Keith Busch (6):
  block/bio: remove duplicate append pages code
  block: export dma_alignment attribute
  block: introduce bdev_dma_alignment helper
  block/merge: count bytes instead of sectors
  block/bounce: count bytes instead of sectors
  block: relax direct io memory alignment

 Documentation/ABI/stable/sysfs-block |   9 ++
 block/bio.c                          | 118 +++++++++++++--------------
 block/blk-merge.c                    |  35 ++++----
 block/blk-sysfs.c                    |   7 ++
 block/bounce.c                       |   5 +-
 block/fops.c                         |  41 +++++++---
 include/linux/blkdev.h               |   5 ++
 7 files changed, 127 insertions(+), 93 deletions(-)

-- 
2.30.2


* [PATCHv3 1/6] block/bio: remove duplicate append pages code
From: Keith Busch @ 2022-05-23 21:01 UTC
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, ebiggers,
	Keith Busch

From: Keith Busch <kbusch@kernel.org>

The get-pages setup for zone append and normal IO is identical. Use
common code for both.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
v2->v3:

  Folded in improvement suggestions from Christoph

 block/bio.c | 105 +++++++++++++++++++++-------------------------------
 1 file changed, 42 insertions(+), 63 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index a3893d80dccc..55d2a9c4e312 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1158,6 +1158,37 @@ static void bio_put_pages(struct page **pages, size_t size, size_t off)
 		put_page(pages[i]);
 }
 
+static int bio_iov_add_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int offset)
+{
+	bool same_page = false;
+
+	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
+		if (WARN_ON_ONCE(bio_full(bio, len)))
+			return -EINVAL;
+		__bio_add_page(bio, page, len, offset);
+		return 0;
+	}
+
+	if (same_page)
+		put_page(page);
+	return 0;
+}
+
+static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int offset)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	bool same_page = false;
+
+	if (bio_add_hw_page(q, bio, page, len, offset,
+			queue_max_zone_append_sectors(q), &same_page) != len)
+		return -EINVAL;
+	if (same_page)
+		put_page(page);
+	return 0;
+}
+
 #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
 
 /**
@@ -1176,7 +1207,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
-	bool same_page = false;
 	ssize_t size, left;
 	unsigned len, i;
 	size_t offset;
@@ -1185,7 +1215,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	 * Move page array up in the allocated memory for the bio vecs as far as
 	 * possible so that we can start filling biovecs from the beginning
 	 * without overwriting the temporary page array.
-	*/
+	 */
 	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
@@ -1195,18 +1225,18 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 
 	for (left = size, i = 0; left > 0; left -= len, i++) {
 		struct page *page = pages[i];
+		int ret;
 
 		len = min_t(size_t, PAGE_SIZE - offset, left);
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+			ret = bio_iov_add_zone_append_page(bio, page, len,
+					offset);
+		else
+			ret = bio_iov_add_page(bio, page, len, offset);
 
-		if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
-			if (same_page)
-				put_page(page);
-		} else {
-			if (WARN_ON_ONCE(bio_full(bio, len))) {
-				bio_put_pages(pages + i, left, offset);
-				return -EINVAL;
-			}
-			__bio_add_page(bio, page, len, offset);
+		if (ret) {
+			bio_put_pages(pages + i, left, offset);
+			return ret;
 		}
 		offset = 0;
 	}
@@ -1215,54 +1245,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	return 0;
 }
 
-static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
-{
-	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
-	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
-	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
-	struct page **pages = (struct page **)bv;
-	ssize_t size, left;
-	unsigned len, i;
-	size_t offset;
-	int ret = 0;
-
-	if (WARN_ON_ONCE(!max_append_sectors))
-		return 0;
-
-	/*
-	 * Move page array up in the allocated memory for the bio vecs as far as
-	 * possible so that we can start filling biovecs from the beginning
-	 * without overwriting the temporary page array.
-	 */
-	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
-	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
-	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
-	if (unlikely(size <= 0))
-		return size ? size : -EFAULT;
-
-	for (left = size, i = 0; left > 0; left -= len, i++) {
-		struct page *page = pages[i];
-		bool same_page = false;
-
-		len = min_t(size_t, PAGE_SIZE - offset, left);
-		if (bio_add_hw_page(q, bio, page, len, offset,
-				max_append_sectors, &same_page) != len) {
-			bio_put_pages(pages + i, left, offset);
-			ret = -EINVAL;
-			break;
-		}
-		if (same_page)
-			put_page(page);
-		offset = 0;
-	}
-
-	iov_iter_advance(iter, size - left);
-	return ret;
-}
-
 /**
  * bio_iov_iter_get_pages - add user or kernel pages to a bio
  * @bio: bio to add pages to
@@ -1297,10 +1279,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	}
 
 	do {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
-			ret = __bio_iov_append_get_pages(bio, iter);
-		else
-			ret = __bio_iov_iter_get_pages(bio, iter);
+		ret = __bio_iov_iter_get_pages(bio, iter);
 	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
 
 	/* don't account direct I/O as memory stall */
-- 
2.30.2


* [PATCHv3 2/6] block: export dma_alignment attribute
From: Keith Busch @ 2022-05-23 21:01 UTC
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, ebiggers,
	Keith Busch

From: Keith Busch <kbusch@kernel.org>

User space may want to know how to align its buffers to avoid
bouncing. Export the queue attribute.
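
For illustration, a hypothetical user space helper for querying the new
attribute (disk name handling and error checking kept minimal; the value
read is the alignment mask, so buffers must satisfy (addr & mask) == 0):

#include <stdio.h>

static unsigned int dma_alignment_mask(const char *disk)
{
	char path[256];
	unsigned int mask = 511;	/* historical default */
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/block/%s/queue/dma_alignment", disk);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%u", &mask) != 1)
			mask = 511;
		fclose(f);
	}
	return mask;
}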

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
v2->v3:

  Added Documentation (Bart)

 Documentation/ABI/stable/sysfs-block | 9 +++++++++
 block/blk-sysfs.c                    | 7 +++++++
 2 files changed, 16 insertions(+)

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index e8797cd09aff..08f6d00df138 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -142,6 +142,15 @@ Description:
 		Default value of this file is '1'(on).
 
 
+What:		/sys/block/<disk>/queue/dma_alignment
+Date:		May 2022
+Contact:	linux-block@vger.kernel.org
+Description:
+		Reports the alignment that user space addresses must have to be
+		used for raw block device access with O_DIRECT and other driver
+		specific passthrough mechanisms.
+
+
 What:		/sys/block/<disk>/queue/chunk_sectors
 Date:		September 2016
 Contact:	Hannes Reinecke <hare@suse.com>
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 88bd41d4cb59..14607565d781 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -274,6 +274,11 @@ static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page
 	return queue_var_show(q->limits.virt_boundary_mask, page);
 }
 
+static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_dma_alignment(q), page);
+}
+
 #define QUEUE_SYSFS_BIT_FNS(name, flag, neg)				\
 static ssize_t								\
 queue_##name##_show(struct request_queue *q, char *page)		\
@@ -606,6 +611,7 @@ QUEUE_RO_ENTRY(queue_dax, "dax");
 QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
 QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
 QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
+QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
@@ -667,6 +673,7 @@ static struct attribute *queue_attrs[] = {
 	&blk_throtl_sample_time_entry.attr,
 #endif
 	&queue_virt_boundary_mask_entry.attr,
+	&queue_dma_alignment_entry.attr,
 	NULL,
 };
 
-- 
2.30.2


* [PATCHv3 3/6] block: introduce bdev_dma_alignment helper
From: Keith Busch @ 2022-05-23 21:01 UTC
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, ebiggers,
	Keith Busch

From: Keith Busch <kbusch@kernel.org>

Preparing for upcoming dma_alignment users that have a block_device, but
don't need the request_queue.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/blkdev.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5bdf2ac9142c..834b981ef01b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1365,6 +1365,11 @@ static inline int queue_dma_alignment(const struct request_queue *q)
 	return q ? q->dma_alignment : 511;
 }
 
+static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
+{
+	return queue_dma_alignment(bdev_get_queue(bdev));
+}
+
 static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
 				 unsigned int len)
 {
-- 
2.30.2


* [PATCHv3 4/6] block/merge: count bytes instead of sectors
From: Keith Busch @ 2022-05-23 21:01 UTC
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, ebiggers,
	Keith Busch

From: Keith Busch <kbusch@kernel.org>

Individual bv_len's may not be a sector size multiple.
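
To make the problem concrete, a small standalone illustration (values
hypothetical): two 768-byte bvecs total 1536 bytes, but per-bvec sector
counting rounds each one down and only sees 1024 of them.

#include <stdio.h>

int main(void)
{
	unsigned int bv_len[2] = { 768, 768 };
	unsigned int bytes = 0, sectors = 0;
	int i;

	for (i = 0; i < 2; i++) {
		sectors += bv_len[i] >> 9;	/* rounds each bvec down */
		bytes += bv_len[i];		/* exact */
	}
	printf("sectors: %u (%u bytes), bytes: %u\n",
	       sectors, sectors << 9, bytes);	/* 2 (1024) vs 1536 */
	return 0;
}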

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-merge.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7771dacc99cb..46cbbc10d68d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -201,11 +201,11 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
  * @nsegs:    [in,out] Number of segments in the bio being built. Incremented
  *            by the number of segments from @bv that may be appended to that
  *            bio without exceeding @max_segs
- * @sectors:  [in,out] Number of sectors in the bio being built. Incremented
- *            by the number of sectors from @bv that may be appended to that
- *            bio without exceeding @max_sectors
+ * @bytes:    [in,out] Number of bytes in the bio being built. Incremented
+ *            by the number of bytes from @bv that may be appended to that
+ *            bio without exceeding @max_bytes
  * @max_segs: [in] upper bound for *@nsegs
- * @max_sectors: [in] upper bound for *@sectors
+ * @max_bytes: [in] upper bound for *@bytes
  *
  * When splitting a bio, it can happen that a bvec is encountered that is too
  * big to fit in a single segment and hence that it has to be split in the
@@ -216,10 +216,10 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
  */
 static bool bvec_split_segs(const struct request_queue *q,
 			    const struct bio_vec *bv, unsigned *nsegs,
-			    unsigned *sectors, unsigned max_segs,
-			    unsigned max_sectors)
+			    unsigned *bytes, unsigned max_segs,
+			    unsigned max_bytes)
 {
-	unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
+	unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
 	unsigned len = min(bv->bv_len, max_len);
 	unsigned total_len = 0;
 	unsigned seg_size = 0;
@@ -237,7 +237,7 @@ static bool bvec_split_segs(const struct request_queue *q,
 			break;
 	}
 
-	*sectors += total_len >> 9;
+	*bytes += total_len;
 
 	/* tell the caller to split the bvec if it is too big to fit */
 	return len > 0 || bv->bv_len > max_len;
@@ -269,8 +269,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
-	unsigned nsegs = 0, sectors = 0;
-	const unsigned max_sectors = get_max_io_size(q, bio);
+	unsigned nsegs = 0, bytes = 0;
+	const unsigned max_bytes = get_max_io_size(q, bio) << 9;
 	const unsigned max_segs = queue_max_segments(q);
 
 	bio_for_each_bvec(bv, bio, iter) {
@@ -282,12 +282,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 			goto split;
 
 		if (nsegs < max_segs &&
-		    sectors + (bv.bv_len >> 9) <= max_sectors &&
+		    bytes + bv.bv_len <= max_bytes &&
 		    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
 			nsegs++;
-			sectors += bv.bv_len >> 9;
-		} else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
-					 max_sectors)) {
+			bytes += bv.bv_len;
+		} else if (bvec_split_segs(q, &bv, &nsegs, &bytes, max_segs,
+					   max_bytes)) {
 			goto split;
 		}
 
@@ -299,6 +299,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 	return NULL;
 split:
 	*segs = nsegs;
+	bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q));
 
 	/*
 	 * Bio splitting may cause subtle trouble such as hang when doing sync
@@ -306,7 +307,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 	 * big IO can be trival, disable iopoll when split needed.
 	 */
 	bio_clear_polled(bio);
-	return bio_split(bio, sectors, GFP_NOIO, bs);
+	return bio_split(bio, bytes >> 9, GFP_NOIO, bs);
 }
 
 /**
@@ -375,7 +376,7 @@ EXPORT_SYMBOL(blk_queue_split);
 unsigned int blk_recalc_rq_segments(struct request *rq)
 {
 	unsigned int nr_phys_segs = 0;
-	unsigned int nr_sectors = 0;
+	unsigned int bytes = 0;
 	struct req_iterator iter;
 	struct bio_vec bv;
 
@@ -398,7 +399,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
 	}
 
 	rq_for_each_bvec(bv, rq, iter)
-		bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
+		bvec_split_segs(rq->q, &bv, &nr_phys_segs, &bytes,
 				UINT_MAX, UINT_MAX);
 	return nr_phys_segs;
 }
-- 
2.30.2


* [PATCHv3 5/6] block/bounce: count bytes instead of sectors
From: Keith Busch @ 2022-05-23 21:01 UTC
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, ebiggers,
	Keith Busch

From: Keith Busch <kbusch@kernel.org>

Individual bv_len's may not be a sector size multiple.
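
As a standalone sketch of the split arithmetic below (values hypothetical):
the byte count only covers the first BIO_MAX_VECS segments and may stop
mid-block, so it is rounded down to a logical block boundary before being
converted back to a sector count for bio_split().

#include <stdio.h>

int main(void)
{
	unsigned int bytes = 1500;	/* may end mid-block */
	unsigned int lbs = 512;		/* queue_logical_block_size() */
	unsigned int sectors = (bytes & ~(lbs - 1)) >> 9;	/* ALIGN_DOWN */

	printf("split at %u sectors (%u bytes)\n", sectors, sectors << 9);
	return 0;
}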

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bounce.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/bounce.c b/block/bounce.c
index 8f7b6fe3b4db..20a43c4dbdda 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -207,17 +207,18 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	struct bvec_iter iter;
 	unsigned i = 0;
 	bool bounce = false;
-	int sectors = 0;
+	int sectors = 0, bytes = 0;
 
 	bio_for_each_segment(from, *bio_orig, iter) {
 		if (i++ < BIO_MAX_VECS)
-			sectors += from.bv_len >> 9;
+			bytes += from.bv_len;
 		if (PageHighMem(from.bv_page))
 			bounce = true;
 	}
 	if (!bounce)
 		return;
 
+	sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> 9;
 	if (sectors < bio_sectors(*bio_orig)) {
 		bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
 		bio_chain(bio, *bio_orig);
-- 
2.30.2


* [PATCHv3 6/6] block: relax direct io memory alignment
From: Keith Busch @ 2022-05-23 21:01 UTC
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, ebiggers,
	Keith Busch

From: Keith Busch <kbusch@kernel.org>

Use the address alignment requirements from the hardware for direct io
instead of requiring addresses be aligned to the block size. User space
can discover the alignment requirements from the dma_alignment queue
attribute.

User space can specify any hardware compatible DMA offset for each
segment, but every segment length is still required to be a multiple of
the block size.
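
For illustration, a hypothetical user space sketch of the relaxed rule
(device name and mask value are assumptions; a real program would read the
mask from the queue/dma_alignment attribute, and error handling is trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	unsigned int mask = 3;	/* assumed 4-byte DMA alignment */
	char *raw, *buf;
	int fd;

	if (posix_memalign((void **)&raw, 4096, 2 * 4096))
		return 1;
	buf = raw + mask + 1;	/* page + 4: allowed now, -EINVAL before */

	fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);
	if (fd >= 0) {
		read(fd, buf, 4096);	/* length: still a block size multiple */
		close(fd);
	}
	free(raw);
	return 0;
}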

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
v2->v3:

  Removed iomap support for now

  Added an alignment helper function instead of duplicating it (Christoph)

  Added comment explaining ALIGN_DOWN

  Added check for iov alignment in _async case

 block/bio.c  | 13 +++++++++++++
 block/fops.c | 41 ++++++++++++++++++++++++++++++-----------
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 55d2a9c4e312..c8ea14ad87f6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1205,6 +1205,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
 	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
 	ssize_t size, left;
@@ -1219,7 +1220,19 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
+	/*
+	 * Each segment in the iov is required to be a block size multiple.
+	 * However, we may not be able to get the entire segment if it spans
+	 * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
+	 * result to ensure the bio's total size is correct. The remainder of
+	 * the iov data will be picked up in the next bio iteration.
+	 *
+	 * If the result is ever 0, that indicates the iov fails the segment
+	 * size requirement and is an error.
+	 */
 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+	if (size > 0)
+		size = ALIGN_DOWN(size, queue_logical_block_size(q));
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
 
diff --git a/block/fops.c b/block/fops.c
index b9b83030e0df..218e4a8b92aa 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -42,6 +42,16 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
 	return op;
 }
 
+static int blkdev_dio_aligned(struct block_device *bdev, loff_t pos,
+			      struct iov_iter *iter)
+{
+	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
+		return -EINVAL;
+	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
+		return -EINVAL;
+	return 0;
+}
+
 #define DIO_INLINE_BIO_VECS 4
 
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
@@ -54,9 +64,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	struct bio bio;
 	ssize_t ret;
 
-	if ((pos | iov_iter_alignment(iter)) &
-	    (bdev_logical_block_size(bdev) - 1))
-		return -EINVAL;
+	ret = blkdev_dio_aligned(bdev, pos, iter);
+	if (ret)
+		return ret;
 
 	if (nr_pages <= DIO_INLINE_BIO_VECS)
 		vecs = inline_vecs;
@@ -80,6 +90,11 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	ret = bio_iov_iter_get_pages(&bio, iter);
 	if (unlikely(ret))
 		goto out;
+	/* check if iov is not aligned */
+	if (unlikely(iov_iter_count(iter))) {
+		ret = -EINVAL;
+		goto out;
+	}
 	ret = bio.bi_iter.bi_size;
 
 	if (iov_iter_rw(iter) == WRITE)
@@ -171,11 +186,11 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
 	unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
 	loff_t pos = iocb->ki_pos;
-	int ret = 0;
+	int ret;
 
-	if ((pos | iov_iter_alignment(iter)) &
-	    (bdev_logical_block_size(bdev) - 1))
-		return -EINVAL;
+	ret = blkdev_dio_aligned(bdev, pos, iter);
+	if (ret)
+		return ret;
 
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
 		opf |= REQ_ALLOC_CACHE;
@@ -296,11 +311,11 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	struct blkdev_dio *dio;
 	struct bio *bio;
 	loff_t pos = iocb->ki_pos;
-	int ret = 0;
+	int ret;
 
-	if ((pos | iov_iter_alignment(iter)) &
-	    (bdev_logical_block_size(bdev) - 1))
-		return -EINVAL;
+	ret = blkdev_dio_aligned(bdev, pos, iter);
+	if (ret)
+		return ret;
 
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
 		opf |= REQ_ALLOC_CACHE;
@@ -323,6 +338,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		bio_iov_bvec_set(bio, iter);
 	} else {
 		ret = bio_iov_iter_get_pages(bio, iter);
+
+		/* check if iov is not aligned */
+		if (unlikely(iov_iter_count(iter)))
+			ret = -EINVAL;
 		if (unlikely(ret)) {
 			bio_put(bio);
 			return ret;
-- 
2.30.2


* Re: [PATCHv3 1/6] block/bio: remove duplicate append pages code
From: Christoph Hellwig @ 2022-05-24  6:00 UTC
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, ebiggers, Keith Busch

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

* Re: [PATCHv3 2/6] block: export dma_alignment attribute
From: Christoph Hellwig @ 2022-05-24  6:02 UTC
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, ebiggers, Keith Busch

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

* Re: [PATCHv3 3/6] block: introduce bdev_dma_alignment helper
From: Christoph Hellwig @ 2022-05-24  6:02 UTC
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, ebiggers, Keith Busch

On Mon, May 23, 2022 at 02:01:16PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Preparing for upcoming dma_alignment users that have a block_device, but
> don't need the request_queue.

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

* Re: [PATCHv3 4/6] block/merge: count bytes instead of sectors
From: Christoph Hellwig @ 2022-05-24  6:07 UTC
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, ebiggers, Keith Busch

On Mon, May 23, 2022 at 02:01:17PM -0700, Keith Busch wrote:
> +	const unsigned max_bytes = get_max_io_size(q, bio) << 9;

> +	return bio_split(bio, bytes >> 9, GFP_NOIO, bs);

These should use SECTOR_SHIFT.

>  split:
>  	*segs = nsegs;
> +	bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q));

Please add a comment explaining the added alignment here.

* Re: [PATCHv3 5/6] block/bounce: count bytes instead of sectors
From: Christoph Hellwig @ 2022-05-24  6:09 UTC
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, ebiggers, Keith Busch

On Mon, May 23, 2022 at 02:01:18PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Individual bv_len's may not be a sector size multiple.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  block/bounce.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/block/bounce.c b/block/bounce.c
> index 8f7b6fe3b4db..20a43c4dbdda 100644
> --- a/block/bounce.c
> +++ b/block/bounce.c
> @@ -207,17 +207,18 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
>  	struct bvec_iter iter;
>  	unsigned i = 0;
>  	bool bounce = false;
> -	int sectors = 0;
> +	int sectors = 0, bytes = 0;
>  
>  	bio_for_each_segment(from, *bio_orig, iter) {
>  		if (i++ < BIO_MAX_VECS)
> -			sectors += from.bv_len >> 9;
> +			bytes += from.bv_len;
>  		if (PageHighMem(from.bv_page))
>  			bounce = true;
>  	}
>  	if (!bounce)
>  		return;
>  
> +	sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> 9;

Same comment about SECTOR_SHIFT and a comment here.  That being said,
why do we even align here?  Shouldn't bytes always be sector aligned here
and this should be a WARN_ON or other sanity check?  Probably the same
for the previous patch.

* Re: [PATCHv3 6/6] block: relax direct io memory alignment
From: Christoph Hellwig @ 2022-05-24  6:12 UTC
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, ebiggers, Keith Busch

On Mon, May 23, 2022 at 02:01:19PM -0700, Keith Busch wrote:
>   Removed iomap support for now

Do you plan to add a separate patch for it?  It would be a shame to
miss it, especially as you said you already tested xfs.

> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>  	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
>  	struct page **pages = (struct page **)bv;
>  	ssize_t size, left;
> @@ -1219,7 +1220,19 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
>  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
>  
> +	/*
> +	 * Each segment in the iov is required to be a block size multiple.
> +	 * However, we may not be able to get the entire segment if it spans
> +	 * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
> +	 * result to ensure the bio's total size is correct. The remainder of
> +	 * the iov data will be picked up in the next bio iteration.
> +	 *
> +	 * If the result is ever 0, that indicates the iov fails the segment
> +	 * size requirement and is an error.
> +	 */
>  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> +	if (size > 0)
> +		size = ALIGN_DOWN(size, queue_logical_block_size(q));

I think we can simply use bdev_logical_block_size here and remove the need
for the q local variable.
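
A sketch of that suggestion (the helper resolves the queue internally, so
the local variable goes away):

	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
	if (size > 0)
		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));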

Given that bio_iov_iter_get_pages is used by more than the block device
direct I/O code, maybe split this up further?

> @@ -42,6 +42,16 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>  	return op;
>  }
>  
> +static int blkdev_dio_aligned(struct block_device *bdev, loff_t pos,
> +			      struct iov_iter *iter)
> +{
> +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> +		return -EINVAL;
> +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> +		return -EINVAL;
> +	return 0;
> +}

I'd also split adding this helper into another prep patch to see
the actual change in behavior more easily.

* Re: [PATCHv3 1/6] block/bio: remove duplicate append pages code
From: Johannes Thumshirn @ 2022-05-24  6:24 UTC
  To: Keith Busch, linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, ebiggers,
	Keith Busch

Looks good,
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

* Re: [PATCHv3 2/6] block: export dma_alignment attribute
From: Johannes Thumshirn @ 2022-05-24  6:24 UTC
  To: Keith Busch, linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, ebiggers,
	Keith Busch

Looks good,
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

* Re: [PATCHv3 3/6] block: introduce bdev_dma_alignment helper
From: Johannes Thumshirn @ 2022-05-24  6:25 UTC
  To: Keith Busch, linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, ebiggers,
	Keith Busch

Looks good,
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

* Re: [PATCHv3 1/6] block/bio: remove duplicate append pages code
From: Pankaj Raghav @ 2022-05-24 14:17 UTC
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, ebiggers, Keith Busch

On Mon, May 23, 2022 at 02:01:14PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The get-pages setup for zone append and normal IO is identical. Use
> common code for both.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> +
> +static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
> +		unsigned int len, unsigned int offset)
> +{
> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +	bool same_page = false;
> +
> +	if (bio_add_hw_page(q, bio, page, len, offset,
> +			queue_max_zone_append_sectors(q), &same_page) != len)
> +		return -EINVAL;
> +	if (same_page)
> +		put_page(page);
> +	return 0;
> +}
> +
>  
> -static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
> -{
> -	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
> -	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
> -	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> -	unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
> -	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
> -	struct page **pages = (struct page **)bv;
> -	ssize_t size, left;
> -	unsigned len, i;
> -	size_t offset;
> -	int ret = 0;
> -

> -	if (WARN_ON_ONCE(!max_append_sectors))
> -		return 0;
I don't see this check in the append path. Should it be added to the
bio_iov_add_zone_append_page() function?
> -
> -	/*
> -	 * Move page array up in the allocated memory for the bio vecs as far as
> -	 * possible so that we can start filling biovecs from the beginning
> -	 * without overwriting the temporary page array.
> -	 */
> -	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
> -	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
> -
> -	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> -	if (unlikely(size <= 0))

-- 
Pankaj Raghav


* Re: [PATCHv3 5/6] block/bounce: count bytes instead of sectors
From: Pankaj Raghav @ 2022-05-24 14:32 UTC
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, ebiggers, Keith Busch

On Mon, May 23, 2022 at 02:01:18PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Individual bv_len's may not be a sector size multiple.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  block/bounce.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/block/bounce.c b/block/bounce.c
> index 8f7b6fe3b4db..20a43c4dbdda 100644
> --- a/block/bounce.c
> +++ b/block/bounce.c
> @@ -207,17 +207,18 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
>  	struct bvec_iter iter;
>  	unsigned i = 0;
>  	bool bounce = false;
> -	int sectors = 0;
> +	int sectors = 0, bytes = 0;
>  
>  	bio_for_each_segment(from, *bio_orig, iter) {
>  		if (i++ < BIO_MAX_VECS)
> -			sectors += from.bv_len >> 9;
> +			bytes += from.bv_len;
bv_len is an unsigned int. The bytes variable should also be unsigned int
to be on the safe side.

>  		if (PageHighMem(from.bv_page))
>  			bounce = true;
>  	}
>  	if (!bounce)
>  		return;
>  
> +	sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> 9;
>  	if (sectors < bio_sectors(*bio_orig)) {
>  		bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
>  		bio_chain(bio, *bio_orig);
> -- 
> 2.30.2
> 

-- 
Pankaj Raghav

* Re: [PATCHv3 6/6] block: relax direct io memory alignment
From: Pankaj Raghav @ 2022-05-24 15:19 UTC
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, ebiggers, Keith Busch, Pankaj Raghav

On Mon, May 23, 2022 at 02:01:19PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> diff --git a/block/fops.c b/block/fops.c
> index b9b83030e0df..218e4a8b92aa 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -42,6 +42,16 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
>  	return op;
>  }
>  
> +static int blkdev_dio_aligned(struct block_device *bdev, loff_t pos,
> +			      struct iov_iter *iter)
> +{
> +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
Minor nit: as ALIGN* macros have been used in other places, this
alignment check could also be changed to use the IS_ALIGNED macro:
if (!IS_ALIGNED(pos | iov_iter_count(iter), bdev_logical_block_size(bdev)))
> +		return -EINVAL;
> +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> +		return -EINVAL;
> +	return 0;
> +}
> +
>  #define DIO_INLINE_BIO_VECS 4
>  
>  static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
> @@ -54,9 +64,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
>  	struct bio bio;
>  	ssize_t ret;
>  
> -	if ((pos | iov_iter_alignment(iter)) &
> -	    (bdev_logical_block_size(bdev) - 1))
> -		return -EINVAL;
> +	ret = blkdev_dio_aligned(bdev, pos, iter);
> +	if (ret)
> +		return ret;
>  

-- 
Pankaj Raghav

* Re: [PATCHv3 1/6] block/bio: remove duplicate append pages code
From: Keith Busch @ 2022-05-24 15:38 UTC
  To: Pankaj Raghav
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal, ebiggers

On Tue, May 24, 2022 at 04:17:54PM +0200, Pankaj Raghav wrote:
> On Mon, May 23, 2022 at 02:01:14PM -0700, Keith Busch wrote:
> > -	if (WARN_ON_ONCE(!max_append_sectors))
> > -		return 0;
> I don't see this check in the append path. Should it be added in
> bio_iov_add_zone_append_page() function?

I'm not sure this check makes a lot of sense. If it just returns 0 here, then
won't that get bio_iov_iter_get_pages() stuck in an infinite loop? The bio
isn't filling, the iov isn't advancing, and 0 indicates keep-going.
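
For reference, the loop in question, as it reads after patch 1:

	do {
		ret = __bio_iov_iter_get_pages(bio, iter);
	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));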

* Re: [PATCHv3 1/6] block/bio: remove duplicate append pages code
From: Pankaj Raghav @ 2022-05-25  7:49 UTC
  To: Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal, ebiggers, Pankaj Raghav

On Tue, May 24, 2022 at 09:38:32AM -0600, Keith Busch wrote:
> On Tue, May 24, 2022 at 04:17:54PM +0200, Pankaj Raghav wrote:
> > On Mon, May 23, 2022 at 02:01:14PM -0700, Keith Busch wrote:
> > > -	if (WARN_ON_ONCE(!max_append_sectors))
> > > -		return 0;
> > I don't see this check in the append path. Should it be added in
> > bio_iov_add_zone_append_page() function?
> 
> I'm not sure this check makes a lot of sense. If it just returns 0 here, then
> won't that get bio_iov_iter_get_pages() stuck in an infinite loop? The bio
> isn't filling, the iov isn't advancing, and 0 indicates keep-going.
Yeah but if max_append_sectors is zero, then bio_add_hw_page() also
returns 0 as follows:
....
	if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
		return 0;
....
With WARN_ON_ONCE, we at least get a warning message if it gets stuck in an
infinite loop because of max_append_sectors being zero, right?

-- 
Pankaj Raghav

* Re: [PATCHv3 1/6] block/bio: remove duplicate append pages code
From: Damien Le Moal @ 2022-05-25  8:30 UTC
  To: Pankaj Raghav, Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, ebiggers, Pankaj Raghav

On 5/25/22 16:49, Pankaj Raghav wrote:
> On Tue, May 24, 2022 at 09:38:32AM -0600, Keith Busch wrote:
>> On Tue, May 24, 2022 at 04:17:54PM +0200, Pankaj Raghav wrote:
>>> On Mon, May 23, 2022 at 02:01:14PM -0700, Keith Busch wrote:
>>>> -	if (WARN_ON_ONCE(!max_append_sectors))
>>>> -		return 0;
>>> I don't see this check in the append path. Should it be added in
>>> bio_iov_add_zone_append_page() function?
>>
>> I'm not sure this check makes a lot of sense. If it just returns 0 here, then
>> won't that get bio_iov_iter_get_pages() stuck in an infinite loop? The bio
>> isn't filling, the iov isn't advancing, and 0 indicates keep-going.
> Yeah but if max_append_sectors is zero, then bio_add_hw_page() also
> returns 0 as follows:
> ....
> 	if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
> 		return 0;
> ....
> With WARN_ON_ONCE, we at least get a warning message if it gets stuck in an
> infinite loop because of max_append_sectors being zero right?
> 

Warning about an infinite loop that can be recovered from only by
rebooting the machine is not very useful...
If max_append_sectors is zero and bio_iov_add_zone_append_page() is
called, this is an error (stupid user) and everything should be failed
with -ENOSUPP or -EIO.
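
A sketch of that approach at the top of bio_iov_add_zone_append_page()
(the exact error value is a judgment call; -EOPNOTSUPP shown here):

	if (!queue_max_zone_append_sectors(q))
		return -EOPNOTSUPP;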

-- 
Damien Le Moal
Western Digital Research

* Re: [PATCHv3 1/6] block/bio: remove duplicate append pages code
From: Keith Busch @ 2022-05-25 13:37 UTC
  To: Pankaj Raghav
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal, ebiggers, Pankaj Raghav

On Wed, May 25, 2022 at 09:49:41AM +0200, Pankaj Raghav wrote:
> On Tue, May 24, 2022 at 09:38:32AM -0600, Keith Busch wrote:
> > On Tue, May 24, 2022 at 04:17:54PM +0200, Pankaj Raghav wrote:
> > > On Mon, May 23, 2022 at 02:01:14PM -0700, Keith Busch wrote:
> > > > -	if (WARN_ON_ONCE(!max_append_sectors))
> > > > -		return 0;
> > > I don't see this check in the append path. Should it be added in
> > > bio_iov_add_zone_append_page() function?
> > 
> > I'm not sure this check makes a lot of sense. If it just returns 0 here, then
> > won't that get bio_iov_iter_get_pages() stuck in an infinite loop? The bio
> > isn't filling, the iov isn't advancing, and 0 indicates keep-going.
> Yeah but if max_append_sectors is zero, then bio_add_hw_page() also
> returns 0 as follows:
> ....
> 	if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
> 		return 0;
> ....
> With WARN_ON_ONCE, we at least get a warning message if it gets stuck in an
> infinite loop because of max_append_sectors being zero right?

The return for this function is the added length, not an indicator of success.
And we already handle '0' as an error from bio_iov_add_zone_append_page():

	if (bio_add_hw_page(q, bio, page, len, offset,
			queue_max_zone_append_sectors(q), &same_page) != len)
		return -EINVAL;

* Re: [PATCHv3 5/6] block/bounce: count bytes instead of sectors
From: Keith Busch @ 2022-05-25 14:08 UTC
  To: Christoph Hellwig
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team,
	bvanassche, damien.lemoal, ebiggers

On Tue, May 24, 2022 at 08:09:21AM +0200, Christoph Hellwig wrote:
> On Mon, May 23, 2022 at 02:01:18PM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > Individual bv_len's may not be a sector size multiple.
> > 
> > Signed-off-by: Keith Busch <kbusch@kernel.org>
> > ---
> >  block/bounce.c | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/bounce.c b/block/bounce.c
> > index 8f7b6fe3b4db..20a43c4dbdda 100644
> > --- a/block/bounce.c
> > +++ b/block/bounce.c
> > @@ -207,17 +207,18 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
> >  	struct bvec_iter iter;
> >  	unsigned i = 0;
> >  	bool bounce = false;
> > -	int sectors = 0;
> > +	int sectors = 0, bytes = 0;
> >  
> >  	bio_for_each_segment(from, *bio_orig, iter) {
> >  		if (i++ < BIO_MAX_VECS)
> > -			sectors += from.bv_len >> 9;
> > +			bytes += from.bv_len;
> >  		if (PageHighMem(from.bv_page))
> >  			bounce = true;
> >  	}
> >  	if (!bounce)
> >  		return;
> >  
> > +	sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> 9;
> 
> Same comment about SECTOR_SHIFT and a comment here.  That being said,
> > why do we even align here?  Shouldn't bytes always be sector aligned here
> and this should be a WARN_ON or other sanity check?  Probably the same
> for the previous patch.

Yes, the total bytes should be sector aligned.

I'm not exactly sure why we're counting it this way, though. We could just skip
the iterative addition and get the total from bio->bi_iter.bi_size. Unless
bio_orig has more segments than BIO_MAX_VECS, which doesn't seem possible.

* Re: [PATCHv3 5/6] block/bounce: count bytes instead of sectors
From: Keith Busch @ 2022-05-25 14:17 UTC
  To: Christoph Hellwig
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team,
	bvanassche, damien.lemoal, ebiggers

On Wed, May 25, 2022 at 08:08:07AM -0600, Keith Busch wrote:
> On Tue, May 24, 2022 at 08:09:21AM +0200, Christoph Hellwig wrote:
> > >  	bio_for_each_segment(from, *bio_orig, iter) {
> > >  		if (i++ < BIO_MAX_VECS)
> > > -			sectors += from.bv_len >> 9;
> > > +			bytes += from.bv_len;
> > >  		if (PageHighMem(from.bv_page))
> > >  			bounce = true;
> > >  	}
> > >  	if (!bounce)
> > >  		return;
> > >  
> > > +	sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> 9;
> > 
> > Same comment about SECTOR_SHIFT and a comment here.  That being said,
> > why do we even align here?  Shouldn't bytes always be sector aligned here
> > and this should be a WARN_ON or other sanity check?  Probably the same
> > for the previous patch.
> 
> Yes, the total bytes should be sector aligned.
> 
> I'm not exactly sure why we're counting it this way, though. We could just skip
> the iterative addition and get the total from bio->bi_iter.bi_size. Unless
> bio_orig has more segments that BIO_MAX_VECS, which doesn't seem possible.

Oh, there's a comment explaining the original can have more than
BIO_MAX_VECS segments, so the ALIGN_DOWN is necessary to ensure a
logical-block-sized split.

* Re: [PATCHv3 1/6] block/bio: remove duplicate append pages code
From: Pankaj Raghav @ 2022-05-25 14:25 UTC
  To: Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal, ebiggers, Pankaj Raghav

On Wed, May 25, 2022 at 07:37:45AM -0600, Keith Busch wrote:
> On Wed, May 25, 2022 at 09:49:41AM +0200, Pankaj Raghav wrote:
> > On Tue, May 24, 2022 at 09:38:32AM -0600, Keith Busch wrote:
> > > On Tue, May 24, 2022 at 04:17:54PM +0200, Pankaj Raghav wrote:
> > > > On Mon, May 23, 2022 at 02:01:14PM -0700, Keith Busch wrote:
> > > > > -	if (WARN_ON_ONCE(!max_append_sectors))
> > > > > -		return 0;
> > > > I don't see this check in the append path. Should it be added in
> > > > bio_iov_add_zone_append_page() function?
> > > 
> > > I'm not sure this check makes a lot of sense. If it just returns 0 here, then
> > > won't that get bio_iov_iter_get_pages() stuck in an infinite loop? The bio
> > > isn't filling, the iov isn't advancing, and 0 indicates keep-going.
> > Yeah but if max_append_sectors is zero, then bio_add_hw_page() also
> > returns 0 as follows:
> > ....
> > 	if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
> > 		return 0;
> > ....
> > With WARN_ON_ONCE, we at least get a warning message if it gets stuck in an
> > infinite loop because of max_append_sectors being zero right?
> 
> The return for this function is the added length, not an indicator of success.
> And we already handle '0' as an error from bio_iov_add_zone_append_page():
> 
> 	if (bio_add_hw_page(q, bio, page, len, offset,
> 			queue_max_zone_append_sectors(q), &same_page) != len)
Ah, I didn't see the `!= len` part. Sorry for the noise; please ignore this
comment.
> 		return -EINVAL;

-- 
Pankaj Raghav
