All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHv2 0/3] direct io alignment relax
@ 2022-05-18 17:11 Keith Busch
  2022-05-18 17:11 ` [PATCHv2 1/3] block/bio: remove duplicate append pages code Keith Busch
                   ` (4 more replies)
  0 siblings, 5 replies; 42+ messages in thread
From: Keith Busch @ 2022-05-18 17:11 UTC (permalink / raw)
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Including the fs list this time.

I am still working on a better interface to report the dio alignment to
an application. The most recent suggestion of using statx is proving to
be less straight forward than I thought, but I don't want to hold this
series up for that.

Changes from v1:

  Included a prep patch to unify rw with zone append (Damien)

  Using ALIGN macro instead of reimplementing it (Bart)

  Squashed the segment size alignment patch into the "relax" patch since
  the check is only needed because of that patch.

  Fixed a check for short r/w in the _simple case. 

Keith Busch (3):
  block/bio: remove duplicate append pages code
  block: export dma_alignment attribute
  block: relax direct io memory alignment

 block/bio.c            | 93 +++++++++++++++++++-----------------------
 block/blk-sysfs.c      |  7 ++++
 block/fops.c           | 20 ++++++---
 fs/direct-io.c         | 11 +++--
 fs/iomap/direct-io.c   |  3 +-
 include/linux/blkdev.h |  5 +++
 6 files changed, 76 insertions(+), 63 deletions(-)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCHv2 1/3] block/bio: remove duplicate append pages code
  2022-05-18 17:11 [PATCHv2 0/3] direct io alignment relax Keith Busch
@ 2022-05-18 17:11 ` Keith Busch
  2022-05-18 20:21   ` Chaitanya Kulkarni
                     ` (2 more replies)
  2022-05-18 17:11 ` [PATCHv2 2/3] block: export dma_alignment attribute Keith Busch
                   ` (3 subsequent siblings)
  4 siblings, 3 replies; 42+ messages in thread
From: Keith Busch @ 2022-05-18 17:11 UTC (permalink / raw)
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, Keith Busch

From: Keith Busch <kbusch@kernel.org>

The setup for getting pages is identical for zone append and normal IO.
Use common code for each.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bio.c | 90 ++++++++++++++++++++++-------------------------------
 1 file changed, 38 insertions(+), 52 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index a3893d80dccc..320514a47527 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1158,6 +1158,39 @@ static void bio_put_pages(struct page **pages, size_t size, size_t off)
 		put_page(pages[i]);
 }
 
+static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter,
+				      struct page **pages, ssize_t size,
+				      size_t offset)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
+	unsigned len, i;
+	ssize_t left;
+	int ret = 0;
+
+	if (WARN_ON_ONCE(!max_append_sectors))
+		return 0;
+
+	for (left = size, i = 0; left > 0; left -= len, i++) {
+		struct page *page = pages[i];
+		bool same_page = false;
+
+		len = min_t(size_t, PAGE_SIZE - offset, left);
+		if (bio_add_hw_page(q, bio, page, len, offset,
+				max_append_sectors, &same_page) != len) {
+			bio_put_pages(pages + i, left, offset);
+			ret = -EINVAL;
+			break;
+		}
+		if (same_page)
+			put_page(page);
+		offset = 0;
+	}
+
+	iov_iter_advance(iter, size - left);
+	return ret;
+}
+
 #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
 
 /**
@@ -1193,6 +1226,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
 
+	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+		return __bio_iov_append_get_pages(bio, iter, pages, size,
+						  offset);
+
 	for (left = size, i = 0; left > 0; left -= len, i++) {
 		struct page *page = pages[i];
 
@@ -1215,54 +1252,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	return 0;
 }
 
-static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
-{
-	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
-	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
-	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
-	struct page **pages = (struct page **)bv;
-	ssize_t size, left;
-	unsigned len, i;
-	size_t offset;
-	int ret = 0;
-
-	if (WARN_ON_ONCE(!max_append_sectors))
-		return 0;
-
-	/*
-	 * Move page array up in the allocated memory for the bio vecs as far as
-	 * possible so that we can start filling biovecs from the beginning
-	 * without overwriting the temporary page array.
-	 */
-	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
-	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
-	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
-	if (unlikely(size <= 0))
-		return size ? size : -EFAULT;
-
-	for (left = size, i = 0; left > 0; left -= len, i++) {
-		struct page *page = pages[i];
-		bool same_page = false;
-
-		len = min_t(size_t, PAGE_SIZE - offset, left);
-		if (bio_add_hw_page(q, bio, page, len, offset,
-				max_append_sectors, &same_page) != len) {
-			bio_put_pages(pages + i, left, offset);
-			ret = -EINVAL;
-			break;
-		}
-		if (same_page)
-			put_page(page);
-		offset = 0;
-	}
-
-	iov_iter_advance(iter, size - left);
-	return ret;
-}
-
 /**
  * bio_iov_iter_get_pages - add user or kernel pages to a bio
  * @bio: bio to add pages to
@@ -1297,10 +1286,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	}
 
 	do {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
-			ret = __bio_iov_append_get_pages(bio, iter);
-		else
-			ret = __bio_iov_iter_get_pages(bio, iter);
+		ret = __bio_iov_iter_get_pages(bio, iter);
 	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
 
 	/* don't account direct I/O as memory stall */
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCHv2 2/3] block: export dma_alignment attribute
  2022-05-18 17:11 [PATCHv2 0/3] direct io alignment relax Keith Busch
  2022-05-18 17:11 ` [PATCHv2 1/3] block/bio: remove duplicate append pages code Keith Busch
@ 2022-05-18 17:11 ` Keith Busch
  2022-05-18 20:22   ` Chaitanya Kulkarni
                     ` (2 more replies)
  2022-05-18 17:11 ` [PATCHv2 3/3] block: relax direct io memory alignment Keith Busch
                   ` (2 subsequent siblings)
  4 siblings, 3 replies; 42+ messages in thread
From: Keith Busch @ 2022-05-18 17:11 UTC (permalink / raw)
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, Keith Busch

From: Keith Busch <kbusch@kernel.org>

User space may want to know how to align their buffers to avoid
bouncing. Export the queue attribute.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-sysfs.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 88bd41d4cb59..14607565d781 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -274,6 +274,11 @@ static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page
 	return queue_var_show(q->limits.virt_boundary_mask, page);
 }
 
+static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_dma_alignment(q), page);
+}
+
 #define QUEUE_SYSFS_BIT_FNS(name, flag, neg)				\
 static ssize_t								\
 queue_##name##_show(struct request_queue *q, char *page)		\
@@ -606,6 +611,7 @@ QUEUE_RO_ENTRY(queue_dax, "dax");
 QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
 QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
 QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
+QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
@@ -667,6 +673,7 @@ static struct attribute *queue_attrs[] = {
 	&blk_throtl_sample_time_entry.attr,
 #endif
 	&queue_virt_boundary_mask_entry.attr,
+	&queue_dma_alignment_entry.attr,
 	NULL,
 };
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-18 17:11 [PATCHv2 0/3] direct io alignment relax Keith Busch
  2022-05-18 17:11 ` [PATCHv2 1/3] block/bio: remove duplicate append pages code Keith Busch
  2022-05-18 17:11 ` [PATCHv2 2/3] block: export dma_alignment attribute Keith Busch
@ 2022-05-18 17:11 ` Keith Busch
  2022-05-19  0:14   ` Eric Biggers
  2022-05-19  7:38   ` Christoph Hellwig
  2022-05-18 22:45 ` [PATCHv2 0/3] direct io alignment relax Jens Axboe
  2022-05-18 23:26 ` Eric Biggers
  4 siblings, 2 replies; 42+ messages in thread
From: Keith Busch @ 2022-05-18 17:11 UTC (permalink / raw)
  To: linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Use the address alignment requirements from the hardware for direct io
instead of requiring addresses be aligned to the block size. User space
can discover the alignment requirements from the dma_alignment queue
attribute.

User space can specify any hardware compatible DMA offset for each
segment, but every segment length is still required to be a multiple of
the block size.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
v1->v2:

  Squashed the alignment patch into this one

  Use ALIGN_DOWN macro instead of reimplementing it

  Check for unalignment in _simple case

 block/bio.c            |  3 +++
 block/fops.c           | 20 ++++++++++++++------
 fs/direct-io.c         | 11 +++++++----
 fs/iomap/direct-io.c   |  3 ++-
 include/linux/blkdev.h |  5 +++++
 5 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 320514a47527..bde9b475a4d8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1207,6 +1207,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
 	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
 	bool same_page = false;
@@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+	if (size > 0)
+		size = ALIGN_DOWN(size, queue_logical_block_size(q));
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
 
diff --git a/block/fops.c b/block/fops.c
index b9b83030e0df..d8537c29602f 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -54,8 +54,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	struct bio bio;
 	ssize_t ret;
 
-	if ((pos | iov_iter_alignment(iter)) &
-	    (bdev_logical_block_size(bdev) - 1))
+	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
+		return -EINVAL;
+	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
 		return -EINVAL;
 
 	if (nr_pages <= DIO_INLINE_BIO_VECS)
@@ -80,6 +81,11 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	ret = bio_iov_iter_get_pages(&bio, iter);
 	if (unlikely(ret))
 		goto out;
+	if (unlikely(iov_iter_count(iter))) {
+		/* iov is not aligned for a single bio */
+		ret = -EINVAL;
+		goto out;
+	}
 	ret = bio.bi_iter.bi_size;
 
 	if (iov_iter_rw(iter) == WRITE)
@@ -173,8 +179,9 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
 
-	if ((pos | iov_iter_alignment(iter)) &
-	    (bdev_logical_block_size(bdev) - 1))
+	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
+		return -EINVAL;
+	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
 		return -EINVAL;
 
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
@@ -298,8 +305,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
 
-	if ((pos | iov_iter_alignment(iter)) &
-	    (bdev_logical_block_size(bdev) - 1))
+	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
+		return -EINVAL;
+	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
 		return -EINVAL;
 
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 840752006f60..64cc176be60c 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1131,7 +1131,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	struct dio_submit sdio = { 0, };
 	struct buffer_head map_bh = { 0, };
 	struct blk_plug plug;
-	unsigned long align = offset | iov_iter_alignment(iter);
+	unsigned long align = iov_iter_alignment(iter);
 
 	/*
 	 * Avoid references to bdev if not absolutely needed to give
@@ -1165,11 +1165,14 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		goto fail_dio;
 	}
 
-	if (align & blocksize_mask) {
-		if (bdev)
+	if ((offset | align) & blocksize_mask) {
+		if (bdev) {
 			blkbits = blksize_bits(bdev_logical_block_size(bdev));
+			if (align & bdev_dma_alignment(bdev))
+				goto fail_dio;
+		}
 		blocksize_mask = (1 << blkbits) - 1;
-		if (align & blocksize_mask)
+		if ((offset | count) & blocksize_mask)
 			goto fail_dio;
 	}
 
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 80f9b047aa1b..0256d28baa8e 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -244,7 +244,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	size_t copied = 0;
 	size_t orig_count;
 
-	if ((pos | length | align) & ((1 << blkbits) - 1))
+	if ((pos | length) & ((1 << blkbits) - 1) ||
+	    align & bdev_dma_alignment(iomap->bdev))
 		return -EINVAL;
 
 	if (iomap->type == IOMAP_UNWRITTEN) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5bdf2ac9142c..834b981ef01b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1365,6 +1365,11 @@ static inline int queue_dma_alignment(const struct request_queue *q)
 	return q ? q->dma_alignment : 511;
 }
 
+static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
+{
+	return queue_dma_alignment(bdev_get_queue(bdev));
+}
+
 static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
 				 unsigned int len)
 {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 1/3] block/bio: remove duplicate append pages code
  2022-05-18 17:11 ` [PATCHv2 1/3] block/bio: remove duplicate append pages code Keith Busch
@ 2022-05-18 20:21   ` Chaitanya Kulkarni
  2022-05-19  4:28   ` Bart Van Assche
  2022-05-19  7:32   ` Christoph Hellwig
  2 siblings, 0 replies; 42+ messages in thread
From: Chaitanya Kulkarni @ 2022-05-18 20:21 UTC (permalink / raw)
  To: Keith Busch, linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, Keith Busch

On 5/18/22 10:11, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The setup for getting pages is identical for zone append and normal IO.
> Use common code for each.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>

Looks good.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 2/3] block: export dma_alignment attribute
  2022-05-18 17:11 ` [PATCHv2 2/3] block: export dma_alignment attribute Keith Busch
@ 2022-05-18 20:22   ` Chaitanya Kulkarni
  2022-05-19  4:30   ` Bart Van Assche
  2022-05-19  7:33   ` Christoph Hellwig
  2 siblings, 0 replies; 42+ messages in thread
From: Chaitanya Kulkarni @ 2022-05-18 20:22 UTC (permalink / raw)
  To: Keith Busch, linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, bvanassche, damien.lemoal, Keith Busch

On 5/18/22 10:11, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> User space may want to know how to align their buffers to avoid
> bouncing. Export the queue attribute.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---


Looks good.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 0/3] direct io alignment relax
  2022-05-18 17:11 [PATCHv2 0/3] direct io alignment relax Keith Busch
                   ` (2 preceding siblings ...)
  2022-05-18 17:11 ` [PATCHv2 3/3] block: relax direct io memory alignment Keith Busch
@ 2022-05-18 22:45 ` Jens Axboe
  2022-05-19  7:42   ` Christoph Hellwig
  2022-05-18 23:26 ` Eric Biggers
  4 siblings, 1 reply; 42+ messages in thread
From: Jens Axboe @ 2022-05-18 22:45 UTC (permalink / raw)
  To: Keith Busch, linux-fsdevel, linux-block
  Cc: Kernel Team, hch, bvanassche, damien.lemoal, Keith Busch

On 5/18/22 11:11 AM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Including the fs list this time.
> 
> I am still working on a better interface to report the dio alignment to
> an application. The most recent suggestion of using statx is proving to
> be less straight forward than I thought, but I don't want to hold this
> series up for that.

This looks good to me. Anyone object to queueing this one up?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 0/3] direct io alignment relax
  2022-05-18 17:11 [PATCHv2 0/3] direct io alignment relax Keith Busch
                   ` (3 preceding siblings ...)
  2022-05-18 22:45 ` [PATCHv2 0/3] direct io alignment relax Jens Axboe
@ 2022-05-18 23:26 ` Eric Biggers
  2022-05-19  0:51   ` Keith Busch
  4 siblings, 1 reply; 42+ messages in thread
From: Eric Biggers @ 2022-05-18 23:26 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, Keith Busch

On Wed, May 18, 2022 at 10:11:28AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Including the fs list this time.
> 
> I am still working on a better interface to report the dio alignment to
> an application. The most recent suggestion of using statx is proving to
> be less straight forward than I thought, but I don't want to hold this
> series up for that.
> 

Note that I already implemented the statx support and sent it out for review:
https://lore.kernel.org/linux-fsdevel/20220211061158.227688-1-ebiggers@kernel.org/T/#u
However, the patch series only received one comment.  I can send it out again if
people have become interested in it again...

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-18 17:11 ` [PATCHv2 3/3] block: relax direct io memory alignment Keith Busch
@ 2022-05-19  0:14   ` Eric Biggers
  2022-05-19  1:00     ` Keith Busch
  2022-05-19  7:38   ` Christoph Hellwig
  1 sibling, 1 reply; 42+ messages in thread
From: Eric Biggers @ 2022-05-19  0:14 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, Keith Busch

On Wed, May 18, 2022 at 10:11:31AM -0700, Keith Busch wrote:
> diff --git a/block/fops.c b/block/fops.c
> index b9b83030e0df..d8537c29602f 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -54,8 +54,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
>  	struct bio bio;
>  	ssize_t ret;
>  
> -	if ((pos | iov_iter_alignment(iter)) &
> -	    (bdev_logical_block_size(bdev) - 1))
> +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> +		return -EINVAL;
> +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
>  		return -EINVAL;

The block layer makes a lot of assumptions that bios can be split at any bvec
boundary.  With this patch, bios whose length isn't a multiple of the logical
block size can be generated by splitting, which isn't valid.

Also some devices aren't compatible with logical blocks spanning bdevs at all.
dm-crypt errors out in this case, for example.

What am I missing?

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 0/3] direct io alignment relax
  2022-05-18 23:26 ` Eric Biggers
@ 2022-05-19  0:51   ` Keith Busch
  2022-05-19  1:02     ` Chaitanya Kulkarni
  0 siblings, 1 reply; 42+ messages in thread
From: Keith Busch @ 2022-05-19  0:51 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Wed, May 18, 2022 at 11:26:20PM +0000, Eric Biggers wrote:
> On Wed, May 18, 2022 at 10:11:28AM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > Including the fs list this time.
> > 
> > I am still working on a better interface to report the dio alignment to
> > an application. The most recent suggestion of using statx is proving to
> > be less straight forward than I thought, but I don't want to hold this
> > series up for that.
> > 
> 
> Note that I already implemented the statx support and sent it out for review:
> https://lore.kernel.org/linux-fsdevel/20220211061158.227688-1-ebiggers@kernel.org/T/#u
> However, the patch series only received one comment.  I can send it out again if
> people have become interested in it again...

Thanks, I didn't see that the first time around, but I'll be sure to look at
your new version. It sounds like you encountered the same problem I did
regarding block device handles: the devtmpfs inodes for the raw block device
handles are not the bdev inodes. I do think it's useful the alignment
attributes are accessible through the block device files, though.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  0:14   ` Eric Biggers
@ 2022-05-19  1:00     ` Keith Busch
  2022-05-19  1:53       ` Eric Biggers
  2022-05-19  7:39       ` Christoph Hellwig
  0 siblings, 2 replies; 42+ messages in thread
From: Keith Busch @ 2022-05-19  1:00 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Wed, May 18, 2022 at 05:14:49PM -0700, Eric Biggers wrote:
> On Wed, May 18, 2022 at 10:11:31AM -0700, Keith Busch wrote:
> > diff --git a/block/fops.c b/block/fops.c
> > index b9b83030e0df..d8537c29602f 100644
> > --- a/block/fops.c
> > +++ b/block/fops.c
> > @@ -54,8 +54,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
> >  	struct bio bio;
> >  	ssize_t ret;
> >  
> > -	if ((pos | iov_iter_alignment(iter)) &
> > -	    (bdev_logical_block_size(bdev) - 1))
> > +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> > +		return -EINVAL;
> > +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> >  		return -EINVAL;
> 
> The block layer makes a lot of assumptions that bios can be split at any bvec
> boundary.  With this patch, bios whose length isn't a multiple of the logical
> block size can be generated by splitting, which isn't valid.

How? This patch ensures every segment is block size aligned. We can always
split a bio in the middle of a bvec for any lower level hardware constraints,
and I'm not finding any splitting criteria that would try to break a bio on a
non-block aligned boundary.

> Also some devices aren't compatible with logical blocks spanning bdevs at all.
> dm-crypt errors out in this case, for example.

I'm sorry, but I am not understanding this.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 0/3] direct io alignment relax
  2022-05-19  0:51   ` Keith Busch
@ 2022-05-19  1:02     ` Chaitanya Kulkarni
  2022-05-19  2:02       ` Eric Biggers
  0 siblings, 1 reply; 42+ messages in thread
From: Chaitanya Kulkarni @ 2022-05-19  1:02 UTC (permalink / raw)
  To: Keith Busch, Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On 5/18/22 17:51, Keith Busch wrote:
> On Wed, May 18, 2022 at 11:26:20PM +0000, Eric Biggers wrote:
>> On Wed, May 18, 2022 at 10:11:28AM -0700, Keith Busch wrote:
>>> From: Keith Busch <kbusch@kernel.org>
>>>
>>> Including the fs list this time.
>>>
>>> I am still working on a better interface to report the dio alignment to
>>> an application. The most recent suggestion of using statx is proving to
>>> be less straight forward than I thought, but I don't want to hold this
>>> series up for that.
>>>
>>
>> Note that I already implemented the statx support and sent it out for review:
>> https://lore.kernel.org/linux-fsdevel/20220211061158.227688-1-ebiggers@kernel.org/T/#u
>> However, the patch series only received one comment.  I can send it out again if
>> people have become interested in it again...
> 
> Thanks, I didn't see that the first time around, but I'll be sure to look at
> your new version. It sounds like you encountered the same problem I did
> regarding block device handles: the devtmpfs inodes for the raw block device
> handles are not the bdev inodes. I do think it's useful the alignment
> attributes are accessible through the block device files, though.

Irrespective of above problem, as per my review comment [1] on the
initial version of Eric's series I really want to see the generic
interface that can accommodate exposing optimal values for different
operations REQ_OP_DISCARD/REQ_OP_WRITE_ZEROES/REQ_OP_VERIFY etc.
and not only for read/write.

-ck

https://lore.kernel.org/linux-fsdevel/20220211061158.227688-1-ebiggers@kernel.org/T/#r3ffe9183c372fb97a9753e286f9cf6400e8ec272



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  1:00     ` Keith Busch
@ 2022-05-19  1:53       ` Eric Biggers
  2022-05-19  1:59         ` Keith Busch
  2022-05-19  7:39       ` Christoph Hellwig
  1 sibling, 1 reply; 42+ messages in thread
From: Eric Biggers @ 2022-05-19  1:53 UTC (permalink / raw)
  To: Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Wed, May 18, 2022 at 07:00:39PM -0600, Keith Busch wrote:
> On Wed, May 18, 2022 at 05:14:49PM -0700, Eric Biggers wrote:
> > On Wed, May 18, 2022 at 10:11:31AM -0700, Keith Busch wrote:
> > > diff --git a/block/fops.c b/block/fops.c
> > > index b9b83030e0df..d8537c29602f 100644
> > > --- a/block/fops.c
> > > +++ b/block/fops.c
> > > @@ -54,8 +54,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
> > >  	struct bio bio;
> > >  	ssize_t ret;
> > >  
> > > -	if ((pos | iov_iter_alignment(iter)) &
> > > -	    (bdev_logical_block_size(bdev) - 1))
> > > +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> > > +		return -EINVAL;
> > > +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> > >  		return -EINVAL;
> > 
> > The block layer makes a lot of assumptions that bios can be split at any bvec
> > boundary.  With this patch, bios whose length isn't a multiple of the logical
> > block size can be generated by splitting, which isn't valid.
> 
> How? This patch ensures every segment is block size aligned.

No, it doesn't.  It ensures that the *total* length of each bio is logical block
size aligned.  It doesn't ensure that for the individual bvecs.  By decreasing
the required memory alignment to below the logical block size, you're allowing
logical blocks to span a page boundary.  Whenever the two pages involved aren't
physically contiguous, the data of the block will be split across two bvecs.

> > Also some devices aren't compatible with logical blocks spanning bdevs at all.
> > dm-crypt errors out in this case, for example.
> 
> I'm sorry, but I am not understanding this.

I meant to write bvecs, not bdevs.

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  1:53       ` Eric Biggers
@ 2022-05-19  1:59         ` Keith Busch
  2022-05-19  2:08           ` Eric Biggers
  0 siblings, 1 reply; 42+ messages in thread
From: Keith Busch @ 2022-05-19  1:59 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Wed, May 18, 2022 at 06:53:11PM -0700, Eric Biggers wrote:
> On Wed, May 18, 2022 at 07:00:39PM -0600, Keith Busch wrote:
> > On Wed, May 18, 2022 at 05:14:49PM -0700, Eric Biggers wrote:
> > > On Wed, May 18, 2022 at 10:11:31AM -0700, Keith Busch wrote:
> > > > diff --git a/block/fops.c b/block/fops.c
> > > > index b9b83030e0df..d8537c29602f 100644
> > > > --- a/block/fops.c
> > > > +++ b/block/fops.c
> > > > @@ -54,8 +54,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
> > > >  	struct bio bio;
> > > >  	ssize_t ret;
> > > >  
> > > > -	if ((pos | iov_iter_alignment(iter)) &
> > > > -	    (bdev_logical_block_size(bdev) - 1))
> > > > +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> > > > +		return -EINVAL;
> > > > +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> > > >  		return -EINVAL;
> > > 
> > > The block layer makes a lot of assumptions that bios can be split at any bvec
> > > boundary.  With this patch, bios whose length isn't a multiple of the logical
> > > block size can be generated by splitting, which isn't valid.
> > 
> > How? This patch ensures every segment is block size aligned.
> 
> No, it doesn't.  It ensures that the *total* length of each bio is logical block
> size aligned.  It doesn't ensure that for the individual bvecs.  By decreasing
> the required memory alignment to below the logical block size, you're allowing
> logical blocks to span a page boundary.  Whenever the two pages involved aren't
> physically contiguous, the data of the block will be split across two bvecs.

I'm aware that spanning pages can cause bad splits on the bi_max_vecs
condition, but I believe it's well handled here. Unless I'm terribly confused,
which is certainly possible, I think you may have missed this part of the
patch:

@@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+	if (size > 0)
+		size = ALIGN_DOWN(size, queue_logical_block_size(q));
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 0/3] direct io alignment relax
  2022-05-19  1:02     ` Chaitanya Kulkarni
@ 2022-05-19  2:02       ` Eric Biggers
  2022-05-19  7:43         ` hch
  0 siblings, 1 reply; 42+ messages in thread
From: Eric Biggers @ 2022-05-19  2:02 UTC (permalink / raw)
  To: Chaitanya Kulkarni
  Cc: Keith Busch, Keith Busch, linux-fsdevel, linux-block, axboe,
	Kernel Team, hch, bvanassche, damien.lemoal

On Thu, May 19, 2022 at 01:02:42AM +0000, Chaitanya Kulkarni wrote:
> On 5/18/22 17:51, Keith Busch wrote:
> > On Wed, May 18, 2022 at 11:26:20PM +0000, Eric Biggers wrote:
> >> On Wed, May 18, 2022 at 10:11:28AM -0700, Keith Busch wrote:
> >>> From: Keith Busch <kbusch@kernel.org>
> >>>
> >>> Including the fs list this time.
> >>>
> >>> I am still working on a better interface to report the dio alignment to
> >>> an application. The most recent suggestion of using statx is proving to
> >>> be less straight forward than I thought, but I don't want to hold this
> >>> series up for that.
> >>>
> >>
> >> Note that I already implemented the statx support and sent it out for review:
> >> https://lore.kernel.org/linux-fsdevel/20220211061158.227688-1-ebiggers@kernel.org/T/#u
> >> However, the patch series only received one comment.  I can send it out again if
> >> people have become interested in it again...
> > 
> > Thanks, I didn't see that the first time around, but I'll be sure to look at
> > your new version. It sounds like you encountered the same problem I did
> > regarding block device handles: the devtmpfs inodes for the raw block device
> > handles are not the bdev inodes. I do think it's useful the alignment
> > attributes are accessible through the block device files, though.
> 
> Irrespective of above problem, as per my review comment [1] on the
> initial version of Eric's series I really want to see the generic
> interface that can accommodate exposing optimal values for different
> operations REQ_OP_DISCARD/REQ_OP_WRITE_ZEROES/REQ_OP_VERIFY etc.
> and not only for read/write.
> 
> -ck
> 
> https://lore.kernel.org/linux-fsdevel/20220211061158.227688-1-ebiggers@kernel.org/T/#r3ffe9183c372fb97a9753e286f9cf6400e8ec272

Userspace doesn't know what REQ_OP_* are; they are internal implementation
details of the block layer.  What UAPIs, specifically, do you have in mind for
wanting to know the alignment requirements of?

If you're thinking about BLKDISCARD and BLKZEROOUT, those are block device
ioctls, so statx() doesn't seem like a great fit for them.  There is already a
place to expose block device properties to userspace (/sys/block/$dev/).  Direct
I/O is different because direct I/O is not just a feature of block devices but
also of regular files, and it is affected by filesystem-level constraints.

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  1:59         ` Keith Busch
@ 2022-05-19  2:08           ` Eric Biggers
  2022-05-19  2:25             ` Keith Busch
  0 siblings, 1 reply; 42+ messages in thread
From: Eric Biggers @ 2022-05-19  2:08 UTC (permalink / raw)
  To: Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Wed, May 18, 2022 at 07:59:36PM -0600, Keith Busch wrote:
> On Wed, May 18, 2022 at 06:53:11PM -0700, Eric Biggers wrote:
> > On Wed, May 18, 2022 at 07:00:39PM -0600, Keith Busch wrote:
> > > On Wed, May 18, 2022 at 05:14:49PM -0700, Eric Biggers wrote:
> > > > On Wed, May 18, 2022 at 10:11:31AM -0700, Keith Busch wrote:
> > > > > diff --git a/block/fops.c b/block/fops.c
> > > > > index b9b83030e0df..d8537c29602f 100644
> > > > > --- a/block/fops.c
> > > > > +++ b/block/fops.c
> > > > > @@ -54,8 +54,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
> > > > >  	struct bio bio;
> > > > >  	ssize_t ret;
> > > > >  
> > > > > -	if ((pos | iov_iter_alignment(iter)) &
> > > > > -	    (bdev_logical_block_size(bdev) - 1))
> > > > > +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> > > > > +		return -EINVAL;
> > > > > +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> > > > >  		return -EINVAL;
> > > > 
> > > > The block layer makes a lot of assumptions that bios can be split at any bvec
> > > > boundary.  With this patch, bios whose length isn't a multiple of the logical
> > > > block size can be generated by splitting, which isn't valid.
> > > 
> > > How? This patch ensures every segment is block size aligned.
> > 
> > No, it doesn't.  It ensures that the *total* length of each bio is logical block
> > size aligned.  It doesn't ensure that for the individual bvecs.  By decreasing
> > the required memory alignment to below the logical block size, you're allowing
> > logical blocks to span a page boundary.  Whenever the two pages involved aren't
> > physically contiguous, the data of the block will be split across two bvecs.
> 
> I'm aware that spanning pages can cause bad splits on the bi_max_vecs
> condition, but I believe it's well handled here. Unless I'm terribly confused,
> which is certainly possible, I think you may have missed this part of the
> patch:
> 
> @@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
> 
>  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> +	if (size > 0)
> +		size = ALIGN_DOWN(size, queue_logical_block_size(q));
>  	if (unlikely(size <= 0))
>  		return size ? size : -EFAULT;
> 

That makes the total length of each "batch" of pages be a multiple of the
logical block size, but individual logical blocks within that batch can still be
divided into multiple bvecs in the loop just below it:

	for (left = size, i = 0; left > 0; left -= len, i++) {
		struct page *page = pages[i];

		len = min_t(size_t, PAGE_SIZE - offset, left);

		if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
			if (same_page)
				put_page(page);
		} else {
			if (WARN_ON_ONCE(bio_full(bio, len))) {
				bio_put_pages(pages + i, left, offset);
				return -EINVAL;
			}
			__bio_add_page(bio, page, len, offset);
		}
		offset = 0;
	}

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  2:08           ` Eric Biggers
@ 2022-05-19  2:25             ` Keith Busch
  2022-05-19  3:27               ` Eric Biggers
  0 siblings, 1 reply; 42+ messages in thread
From: Keith Busch @ 2022-05-19  2:25 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Wed, May 18, 2022 at 07:08:11PM -0700, Eric Biggers wrote:
> On Wed, May 18, 2022 at 07:59:36PM -0600, Keith Busch wrote:
> > I'm aware that spanning pages can cause bad splits on the bi_max_vecs
> > condition, but I believe it's well handled here. Unless I'm terribly confused,
> > which is certainly possible, I think you may have missed this part of the
> > patch:
> > 
> > @@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> >  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
> > 
> >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > +	if (size > 0)
> > +		size = ALIGN_DOWN(size, queue_logical_block_size(q));
> >  	if (unlikely(size <= 0))
> >  		return size ? size : -EFAULT;
> > 
> 
> That makes the total length of each "batch" of pages be a multiple of the
> logical block size, but individual logical blocks within that batch can still be
> divided into multiple bvecs in the loop just below it:

I understand that, but the existing code conservatively assumes all pages are
physically discontiguous and wouldn't have requested more pages if it didn't
have enough bvecs for each of them:

	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;

So with the segment alignment guarantee, and ensured available bvec space, the
created bio will always be a logical block size multiple.

If we need to split it later due to some other constraint, we'll only split on
a logical block size, even if it's in the middle of a bvec.

> 	for (left = size, i = 0; left > 0; left -= len, i++) {
> 		struct page *page = pages[i];
> 
> 		len = min_t(size_t, PAGE_SIZE - offset, left);
> 
> 		if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
> 			if (same_page)
> 				put_page(page);
> 		} else {
> 			if (WARN_ON_ONCE(bio_full(bio, len))) {
> 				bio_put_pages(pages + i, left, offset);
> 				return -EINVAL;
> 			}
> 			__bio_add_page(bio, page, len, offset);
> 		}
> 		offset = 0;
> 	}
> 
> - Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  2:25             ` Keith Busch
@ 2022-05-19  3:27               ` Eric Biggers
  2022-05-19  4:40                 ` Bart Van Assche
  2022-05-19  4:56                 ` Keith Busch
  0 siblings, 2 replies; 42+ messages in thread
From: Eric Biggers @ 2022-05-19  3:27 UTC (permalink / raw)
  To: Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Wed, May 18, 2022 at 08:25:26PM -0600, Keith Busch wrote:
> On Wed, May 18, 2022 at 07:08:11PM -0700, Eric Biggers wrote:
> > On Wed, May 18, 2022 at 07:59:36PM -0600, Keith Busch wrote:
> > > I'm aware that spanning pages can cause bad splits on the bi_max_vecs
> > > condition, but I believe it's well handled here. Unless I'm terribly confused,
> > > which is certainly possible, I think you may have missed this part of the
> > > patch:
> > > 
> > > @@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> > >  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
> > > 
> > >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > > +	if (size > 0)
> > > +		size = ALIGN_DOWN(size, queue_logical_block_size(q));
> > >  	if (unlikely(size <= 0))
> > >  		return size ? size : -EFAULT;
> > > 
> > 
> > That makes the total length of each "batch" of pages be a multiple of the
> > logical block size, but individual logical blocks within that batch can still be
> > divided into multiple bvecs in the loop just below it:
> 
> I understand that, but the existing code conservatively assumes all pages are
> physically discontiguous and wouldn't have requested more pages if it didn't
> have enough bvecs for each of them:
> 
> 	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
> 
> So with the segment alignment guarantee, and ensured available bvec space, the
> created bio will always be a logical block size multiple.
> 
> If we need to split it later due to some other constraint, we'll only split on
> a logical block size, even if it's in the middle of a bvec.
> 

So the bio ends up with a total length that is a multiple of the logical block
size, but the lengths of the individual bvecs in the bio are *not* necessarily
multiples of the logical block size.  That's the problem.

Note, there's also lots of code that assumes that bio_vec::bv_len is a multiple
of 512.  That was implied by it being a multiple of the logical block size.  But
the DMA alignment can be much lower, like 8 bytes (see nvme_set_queue_limits()).

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 1/3] block/bio: remove duplicate append pages code
  2022-05-18 17:11 ` [PATCHv2 1/3] block/bio: remove duplicate append pages code Keith Busch
  2022-05-18 20:21   ` Chaitanya Kulkarni
@ 2022-05-19  4:28   ` Bart Van Assche
  2022-05-19  7:32   ` Christoph Hellwig
  2 siblings, 0 replies; 42+ messages in thread
From: Bart Van Assche @ 2022-05-19  4:28 UTC (permalink / raw)
  To: Keith Busch, linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, damien.lemoal, Keith Busch

On 5/18/22 19:11, Keith Busch wrote:
> +	for (left = size, i = 0; left > 0; left -= len, i++) {
> +		struct page *page = pages[i];
> +		bool same_page = false;
> +
> +		len = min_t(size_t, PAGE_SIZE - offset, left);
> +		if (bio_add_hw_page(q, bio, page, len, offset,
> +				max_append_sectors, &same_page) != len) {
> +			bio_put_pages(pages + i, left, offset);
> +			ret = -EINVAL;
> +			break;
> +		}
> +		if (same_page)
> +			put_page(page);
> +		offset = 0;
> +	}

Consider renaming 'same_page' into 'merged'. I think that name reflects 
much better the purpose of that variable.

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 2/3] block: export dma_alignment attribute
  2022-05-18 17:11 ` [PATCHv2 2/3] block: export dma_alignment attribute Keith Busch
  2022-05-18 20:22   ` Chaitanya Kulkarni
@ 2022-05-19  4:30   ` Bart Van Assche
  2022-05-19  7:33   ` Christoph Hellwig
  2 siblings, 0 replies; 42+ messages in thread
From: Bart Van Assche @ 2022-05-19  4:30 UTC (permalink / raw)
  To: Keith Busch, linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, damien.lemoal, Keith Busch

On 5/18/22 19:11, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> User space may want to know how to align their buffers to avoid
> bouncing. Export the queue attribute.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   block/blk-sysfs.c | 7 +++++++
>   1 file changed, 7 insertions(+)
> 
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 88bd41d4cb59..14607565d781 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -274,6 +274,11 @@ static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page
>   	return queue_var_show(q->limits.virt_boundary_mask, page);
>   }
>   
> +static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page)
> +{
> +	return queue_var_show(queue_dma_alignment(q), page);
> +}
> +
>   #define QUEUE_SYSFS_BIT_FNS(name, flag, neg)				\
>   static ssize_t								\
>   queue_##name##_show(struct request_queue *q, char *page)		\
> @@ -606,6 +611,7 @@ QUEUE_RO_ENTRY(queue_dax, "dax");
>   QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
>   QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
>   QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
> +QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
>   
>   #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
>   QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
> @@ -667,6 +673,7 @@ static struct attribute *queue_attrs[] = {
>   	&blk_throtl_sample_time_entry.attr,
>   #endif
>   	&queue_virt_boundary_mask_entry.attr,
> +	&queue_dma_alignment_entry.attr,
>   	NULL,
>   };

Please add an entry for the new sysfs attribute in 
Documentation/ABI/stable/sysfs-block.

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  3:27               ` Eric Biggers
@ 2022-05-19  4:40                 ` Bart Van Assche
  2022-05-19  4:56                 ` Keith Busch
  1 sibling, 0 replies; 42+ messages in thread
From: Bart Van Assche @ 2022-05-19  4:40 UTC (permalink / raw)
  To: Eric Biggers, Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	damien.lemoal

On 5/19/22 05:27, Eric Biggers wrote:
> But the DMA alignment can be much lower, like 8 bytes (see nvme_set_queue_limits()).

There are block drivers that support byte alignment of block layer data 
buffers. From the iSCSI over TCP driver:

	blk_queue_dma_alignment(sdev->request_queue, 0);

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  3:27               ` Eric Biggers
  2022-05-19  4:40                 ` Bart Van Assche
@ 2022-05-19  4:56                 ` Keith Busch
  2022-05-19  6:45                   ` Damien Le Moal
                                     ` (2 more replies)
  1 sibling, 3 replies; 42+ messages in thread
From: Keith Busch @ 2022-05-19  4:56 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Wed, May 18, 2022 at 08:27:31PM -0700, Eric Biggers wrote:
> 
> So the bio ends up with a total length that is a multiple of the logical block
> size, but the lengths of the individual bvecs in the bio are *not* necessarily
> multiples of the logical block size.  That's the problem.

I'm surely missing something here. I know the bvecs are not necessarily lbs
aligned, but why does that matter? Is there some driver that can only take
exactly 1 bvec, but allows it to be unaligned? If so, we could take the segment
queue limit into account, but I am not sure that we need to.
 
> Note, there's also lots of code that assumes that bio_vec::bv_len is a multiple
> of 512.  

Could you point me to some examples?

> That was implied by it being a multiple of the logical block size.  But
> the DMA alignment can be much lower, like 8 bytes (see nvme_set_queue_limits()).

That's the driver this was tested on, though I just changed it to 4 bytes for
5.19.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  4:56                 ` Keith Busch
@ 2022-05-19  6:45                   ` Damien Le Moal
  2022-05-19 17:19                     ` Eric Biggers
  2022-05-19  7:41                   ` Christoph Hellwig
  2022-05-19 17:01                   ` Keith Busch
  2 siblings, 1 reply; 42+ messages in thread
From: Damien Le Moal @ 2022-05-19  6:45 UTC (permalink / raw)
  To: Keith Busch, Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche

On 2022/05/19 6:56, Keith Busch wrote:
> On Wed, May 18, 2022 at 08:27:31PM -0700, Eric Biggers wrote:
>>
>> So the bio ends up with a total length that is a multiple of the logical block
>> size, but the lengths of the individual bvecs in the bio are *not* necessarily
>> multiples of the logical block size.  That's the problem.
> 
> I'm surely missing something here. I know the bvecs are not necessarily lbs
> aligned, but why does that matter? Is there some driver that can only take
> exactly 1 bvec, but allows it to be unaligned? If so, we could take the segment
> queue limit into account, but I am not sure that we need to.

For direct IO, the first bvec will always be aligned to a logical block size.
See __blkdev_direct_IO() and __blkdev_direct_IO_simple():

        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

And given that, all bvecs should also be LBA aligned since the LBA size is
always a divisor of the page size. Since splitting is always done on an LBA
boundary, I do not see how we can ever get bvecs that are not LBA aligned.
Or I am missing something too...

>  
>> Note, there's also lots of code that assumes that bio_vec::bv_len is a multiple
>> of 512.  
> 
> Could you point me to some examples?
> 
>> That was implied by it being a multiple of the logical block size.  But
>> the DMA alignment can be much lower, like 8 bytes (see nvme_set_queue_limits()).
> 
> That's the driver this was tested on, though I just changed it to 4 bytes for
> 5.19.


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 1/3] block/bio: remove duplicate append pages code
  2022-05-18 17:11 ` [PATCHv2 1/3] block/bio: remove duplicate append pages code Keith Busch
  2022-05-18 20:21   ` Chaitanya Kulkarni
  2022-05-19  4:28   ` Bart Van Assche
@ 2022-05-19  7:32   ` Christoph Hellwig
  2022-05-19 14:19     ` Keith Busch
  2 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2022-05-19  7:32 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, Keith Busch

On Wed, May 18, 2022 at 10:11:29AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The setup for getting pages are identical for zone append and normal IO.
> Use common code for each.

How about using even more common code and avoiding churn at the same
time?  Something like:

diff --git a/block/bio.c b/block/bio.c
index a3893d80dccc9..15da722ed26d1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1158,6 +1158,37 @@ static void bio_put_pages(struct page **pages, size_t size, size_t off)
 		put_page(pages[i]);
 }
 
+static int bio_iov_add_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int offset)
+{
+	bool same_page = false;
+
+	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
+		if (WARN_ON_ONCE(bio_full(bio, len)))
+			return -EINVAL;
+		__bio_add_page(bio, page, len, offset);
+		return 0;
+	}
+
+	if (same_page)
+		put_page(page);
+	return 0;
+}
+
+static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int offset)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	bool same_page = false;
+
+	if (bio_add_hw_page(q, bio, page, len, offset,
+			queue_max_zone_append_sectors(q), &same_page) != len)
+		return -EINVAL;
+	if (same_page)
+		put_page(page);
+	return 0;
+}
+
 #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
 
 /**
@@ -1176,7 +1207,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
-	bool same_page = false;
 	ssize_t size, left;
 	unsigned len, i;
 	size_t offset;
@@ -1195,18 +1225,18 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 
 	for (left = size, i = 0; left > 0; left -= len, i++) {
 		struct page *page = pages[i];
+		int ret;
 
 		len = min_t(size_t, PAGE_SIZE - offset, left);
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND)	
+			ret = bio_iov_add_zone_append_page(bio, page, len,
+					offset);
+		else
+			ret = bio_iov_add_page(bio, page, len, offset);
 
-		if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
-			if (same_page)
-				put_page(page);
-		} else {
-			if (WARN_ON_ONCE(bio_full(bio, len))) {
-				bio_put_pages(pages + i, left, offset);
-				return -EINVAL;
-			}
-			__bio_add_page(bio, page, len, offset);
+		if (ret) {
+			bio_put_pages(pages + i, left, offset);
+			return ret;
 		}
 		offset = 0;
 	}
@@ -1215,54 +1245,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	return 0;
 }
 
-static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
-{
-	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
-	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
-	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
-	struct page **pages = (struct page **)bv;
-	ssize_t size, left;
-	unsigned len, i;
-	size_t offset;
-	int ret = 0;
-
-	if (WARN_ON_ONCE(!max_append_sectors))
-		return 0;
-
-	/*
-	 * Move page array up in the allocated memory for the bio vecs as far as
-	 * possible so that we can start filling biovecs from the beginning
-	 * without overwriting the temporary page array.
-	 */
-	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
-	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
-	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
-	if (unlikely(size <= 0))
-		return size ? size : -EFAULT;
-
-	for (left = size, i = 0; left > 0; left -= len, i++) {
-		struct page *page = pages[i];
-		bool same_page = false;
-
-		len = min_t(size_t, PAGE_SIZE - offset, left);
-		if (bio_add_hw_page(q, bio, page, len, offset,
-				max_append_sectors, &same_page) != len) {
-			bio_put_pages(pages + i, left, offset);
-			ret = -EINVAL;
-			break;
-		}
-		if (same_page)
-			put_page(page);
-		offset = 0;
-	}
-
-	iov_iter_advance(iter, size - left);
-	return ret;
-}
-
 /**
  * bio_iov_iter_get_pages - add user or kernel pages to a bio
  * @bio: bio to add pages to
@@ -1297,10 +1279,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	}
 
 	do {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
-			ret = __bio_iov_append_get_pages(bio, iter);
-		else
-			ret = __bio_iov_iter_get_pages(bio, iter);
+		ret = __bio_iov_iter_get_pages(bio, iter);
 	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
 
 	/* don't account direct I/O as memory stall */

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 2/3] block: export dma_alignment attribute
  2022-05-18 17:11 ` [PATCHv2 2/3] block: export dma_alignment attribute Keith Busch
  2022-05-18 20:22   ` Chaitanya Kulkarni
  2022-05-19  4:30   ` Bart Van Assche
@ 2022-05-19  7:33   ` Christoph Hellwig
  2 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2022-05-19  7:33 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, Keith Busch

On Wed, May 18, 2022 at 10:11:30AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> User space may want to know how to align their buffers to avoid
> bouncing. Export the queue attribute.

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-18 17:11 ` [PATCHv2 3/3] block: relax direct io memory alignment Keith Busch
  2022-05-19  0:14   ` Eric Biggers
@ 2022-05-19  7:38   ` Christoph Hellwig
  2022-05-19 14:08     ` Keith Busch
  1 sibling, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2022-05-19  7:38 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, Keith Busch

> @@ -1207,6 +1207,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  {
>  	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
>  	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>  	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
>  	struct page **pages = (struct page **)bv;
>  	bool same_page = false;
> @@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
>  
>  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> +	if (size > 0)
> +		size = ALIGN_DOWN(size, queue_logical_block_size(q));

So if we do get a size that is not logical block size aligned here,
we reduce it to the block size aligned one below.  Why do we do that?

> +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> +		return -EINVAL;
> +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
>  		return -EINVAL;

Can we have a little inline helper for these checks instead of
duplicating them three times?

> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 840752006f60..64cc176be60c 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -1131,7 +1131,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	struct dio_submit sdio = { 0, };
>  	struct buffer_head map_bh = { 0, };
>  	struct blk_plug plug;
> -	unsigned long align = offset | iov_iter_alignment(iter);
> +	unsigned long align = iov_iter_alignment(iter);

I'd much prefer to not just relax this for random file systems,
and especially not the legacy direct I/O code.  I think we can eventually
do iomap, but only after an audit and test of each file system, which
might require a new IOMAP_DIO_* flag at least initially.

> +static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
> +{
> +	return queue_dma_alignment(bdev_get_queue(bdev));
> +}

Plase do this in a separate patch.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  1:00     ` Keith Busch
  2022-05-19  1:53       ` Eric Biggers
@ 2022-05-19  7:39       ` Christoph Hellwig
  2022-05-19 22:31         ` Keith Busch
  1 sibling, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2022-05-19  7:39 UTC (permalink / raw)
  To: Keith Busch
  Cc: Eric Biggers, Keith Busch, linux-fsdevel, linux-block, axboe,
	Kernel Team, hch, bvanassche, damien.lemoal

On Wed, May 18, 2022 at 07:00:39PM -0600, Keith Busch wrote:
> How? This patch ensures every segment is block size aligned. We can always
> split a bio in the middle of a bvec for any lower level hardware constraints,
> and I'm not finding any splitting criteria that would try to break a bio on a
> non-block aligned boundary.

Do you mean bio_vec with segment here?  How do we ensure that?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  4:56                 ` Keith Busch
  2022-05-19  6:45                   ` Damien Le Moal
@ 2022-05-19  7:41                   ` Christoph Hellwig
  2022-05-19 16:35                     ` Keith Busch
  2022-05-19 17:01                   ` Keith Busch
  2 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2022-05-19  7:41 UTC (permalink / raw)
  To: Keith Busch
  Cc: Eric Biggers, Keith Busch, linux-fsdevel, linux-block, axboe,
	Kernel Team, hch, bvanassche, damien.lemoal

On Wed, May 18, 2022 at 10:56:04PM -0600, Keith Busch wrote:
> I'm surely missing something here. I know the bvecs are not necessarily lbs
> aligned, but why does that matter? Is there some driver that can only take
> exactly 1 bvec, but allows it to be unaligned? If so, we could take the segment
> queue limit into account, but I am not sure that we need to.

At least stacking drivers had all kinds of interesting limitations in
this area.  How much testing did this series get with all kinds of
interesting dm targets and md personalities?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 0/3] direct io alignment relax
  2022-05-18 22:45 ` [PATCHv2 0/3] direct io alignment relax Jens Axboe
@ 2022-05-19  7:42   ` Christoph Hellwig
  2022-05-19 12:46     ` Jens Axboe
  0 siblings, 1 reply; 42+ messages in thread
From: Christoph Hellwig @ 2022-05-19  7:42 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Keith Busch, linux-fsdevel, linux-block, Kernel Team, hch,
	bvanassche, damien.lemoal, Keith Busch

On Wed, May 18, 2022 at 04:45:10PM -0600, Jens Axboe wrote:
> On 5/18/22 11:11 AM, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > Including the fs list this time.
> > 
> > I am still working on a better interface to report the dio alignment to
> > an application. The most recent suggestion of using statx is proving to
> > be less straight forward than I thought, but I don't want to hold this
> > series up for that.
> 
> This looks good to me. Anyone object to queueing this one up?

Yes.  I really do like this feature, but I don't think it is ready to
rush it in.  In addition to the ongoing discussions in this thread
we absolutely need proper statx support for the alignments to avoid
userspace growing all kinds of sysfs grovelling crap to make use of it.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 0/3] direct io alignment relax
  2022-05-19  2:02       ` Eric Biggers
@ 2022-05-19  7:43         ` hch
  0 siblings, 0 replies; 42+ messages in thread
From: hch @ 2022-05-19  7:43 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Chaitanya Kulkarni, Keith Busch, Keith Busch, linux-fsdevel,
	linux-block, axboe, Kernel Team, hch, bvanassche, damien.lemoal

On Wed, May 18, 2022 at 07:02:43PM -0700, Eric Biggers wrote:
> If you're thinking about BLKDISCARD and BLKZEROOUT, those are block device
> ioctls, so statx() doesn't seem like a great fit for them.  There is already a
> place to expose block device properties to userspace (/sys/block/$dev/).  Direct
> I/O is different because direct I/O is not just a feature of block devices but
> also of regular files, and it is affected by filesystem-level constraints.

Agreed.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 0/3] direct io alignment relax
  2022-05-19  7:42   ` Christoph Hellwig
@ 2022-05-19 12:46     ` Jens Axboe
  0 siblings, 0 replies; 42+ messages in thread
From: Jens Axboe @ 2022-05-19 12:46 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, linux-fsdevel, linux-block, Kernel Team, bvanassche,
	damien.lemoal, Keith Busch

On 5/19/22 1:42 AM, Christoph Hellwig wrote:
> On Wed, May 18, 2022 at 04:45:10PM -0600, Jens Axboe wrote:
>> On 5/18/22 11:11 AM, Keith Busch wrote:
>>> From: Keith Busch <kbusch@kernel.org>
>>>
>>> Including the fs list this time.
>>>
>>> I am still working on a better interface to report the dio alignment to
>>> an application. The most recent suggestion of using statx is proving to
>>> be less straight forward than I thought, but I don't want to hold this
>>> series up for that.
>>
>> This looks good to me. Anyone object to queueing this one up?
> 
> Yes.  I really do like this feature, but I don't think it is ready to
> rush it in.  In addition to the ongoing discussions in this thread
> we absolutely need proper statx support for the alignments to avoid
> userspace growing all kinds of sysfs grovelling crap to make use of it.

OK fair enough, I do agree that we need a better story for exposing this
data, and in fact for a whole bunch of other stuff that is currently
hard to get programmatically. We can defer to 5.20 and get the statx side
hashed out at the same time.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  7:38   ` Christoph Hellwig
@ 2022-05-19 14:08     ` Keith Busch
  2022-05-20  6:10       ` Christoph Hellwig
  0 siblings, 1 reply; 42+ messages in thread
From: Keith Busch @ 2022-05-19 14:08 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team,
	bvanassche, damien.lemoal

On Thu, May 19, 2022 at 09:38:11AM +0200, Christoph Hellwig wrote:
> > @@ -1207,6 +1207,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> >  {
> >  	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
> >  	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
> > +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> >  	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
> >  	struct page **pages = (struct page **)bv;
> >  	bool same_page = false;
> > @@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> >  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
> >  
> >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > +	if (size > 0)
> > +		size = ALIGN_DOWN(size, queue_logical_block_size(q));
> 
> So if we do get a size that is not logical block size alignment here,
> we reduce it to the block size aligned one below.  Why do we do that?

There are two possibilities:

In the first case, the number of pages in this iteration exceeds bi_max_vecs.
Rounding down completes the bio with a block aligned size, and the remainder
will be picked up for the next bio, or possibly even the current bio if the
pages are sufficiently physically contiguous.

The other case is a bad iov. If we're doing __blkdev_direct_IO(), it will error
out immediately if the rounded size is 0, or the next iteration when the next
size is rounded to 0. If we're doing the __blkdev_direct_IO_simple(), it will
error out when it sees the iov hasn't advanced to the end.

And ... I just noticed I missed the size check __blkdev_direct_IO_async().
 
> > +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> > +		return -EINVAL;
> > +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> >  		return -EINVAL;
> 
> Can we have a little inline helper for these checks instead of
> duplicating them three times?

Absolutely.

> > diff --git a/fs/direct-io.c b/fs/direct-io.c
> > index 840752006f60..64cc176be60c 100644
> > --- a/fs/direct-io.c
> > +++ b/fs/direct-io.c
> > @@ -1131,7 +1131,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
> >  	struct dio_submit sdio = { 0, };
> >  	struct buffer_head map_bh = { 0, };
> >  	struct blk_plug plug;
> > -	unsigned long align = offset | iov_iter_alignment(iter);
> > +	unsigned long align = iov_iter_alignment(iter);
> 
> I'd much prefer to not just relax this for random file systems,
> and especially not the legacy direct I/O code.  I think we can eventually
> do iomap, but only after an audit and test of each file system, which
> might require a new IOMAP_DIO_* flag at least initially.

I did some testing with xfs, but I can certainly run more a lot more tests. I
do think filesystem support for this capability is important, so I hope we
eventually get there.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 1/3] block/bio: remove duplicate append pages code
  2022-05-19  7:32   ` Christoph Hellwig
@ 2022-05-19 14:19     ` Keith Busch
  0 siblings, 0 replies; 42+ messages in thread
From: Keith Busch @ 2022-05-19 14:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team,
	bvanassche, damien.lemoal

On Thu, May 19, 2022 at 09:32:56AM +0200, Christoph Hellwig wrote:
> On Wed, May 18, 2022 at 10:11:29AM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > The setup for getting pages are identical for zone append and normal IO.
> > Use common code for each.
> 
> How about using even more common code and avoiding churn at the same
> time?  Something like:

Yes, I'll fold this in.
 
> diff --git a/block/bio.c b/block/bio.c
> index a3893d80dccc9..15da722ed26d1 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1158,6 +1158,37 @@ static void bio_put_pages(struct page **pages, size_t size, size_t off)
>  		put_page(pages[i]);
>  }
>  
> +static int bio_iov_add_page(struct bio *bio, struct page *page,
> +		unsigned int len, unsigned int offset)
> +{
> +	bool same_page = false;
> +
> +	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
> +		if (WARN_ON_ONCE(bio_full(bio, len)))
> +			return -EINVAL;
> +		__bio_add_page(bio, page, len, offset);
> +		return 0;
> +	}
> +
> +	if (same_page)
> +		put_page(page);
> +	return 0;
> +}
> +
> +static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
> +		unsigned int len, unsigned int offset)
> +{
> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +	bool same_page = false;
> +
> +	if (bio_add_hw_page(q, bio, page, len, offset,
> +			queue_max_zone_append_sectors(q), &same_page) != len)
> +		return -EINVAL;
> +	if (same_page)
> +		put_page(page);
> +	return 0;
> +}
> +
>  #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
>  
>  /**
> @@ -1176,7 +1207,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
>  	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
>  	struct page **pages = (struct page **)bv;
> -	bool same_page = false;
>  	ssize_t size, left;
>  	unsigned len, i;
>  	size_t offset;
> @@ -1195,18 +1225,18 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  
>  	for (left = size, i = 0; left > 0; left -= len, i++) {
>  		struct page *page = pages[i];
> +		int ret;
>  
>  		len = min_t(size_t, PAGE_SIZE - offset, left);
> +		if (bio_op(bio) == REQ_OP_ZONE_APPEND)	
> +			ret = bio_iov_add_zone_append_page(bio, page, len,
> +					offset);
> +		else
> +			ret = bio_iov_add_page(bio, page, len, offset);
>  
> -		if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
> -			if (same_page)
> -				put_page(page);
> -		} else {
> -			if (WARN_ON_ONCE(bio_full(bio, len))) {
> -				bio_put_pages(pages + i, left, offset);
> -				return -EINVAL;
> -			}
> -			__bio_add_page(bio, page, len, offset);
> +		if (ret) {
> +			bio_put_pages(pages + i, left, offset);
> +			return ret;
>  		}
>  		offset = 0;
>  	}
> @@ -1215,54 +1245,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	return 0;
>  }
>  
> -static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
> -{
> -	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
> -	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
> -	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> -	unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
> -	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
> -	struct page **pages = (struct page **)bv;
> -	ssize_t size, left;
> -	unsigned len, i;
> -	size_t offset;
> -	int ret = 0;
> -
> -	if (WARN_ON_ONCE(!max_append_sectors))
> -		return 0;
> -
> -	/*
> -	 * Move page array up in the allocated memory for the bio vecs as far as
> -	 * possible so that we can start filling biovecs from the beginning
> -	 * without overwriting the temporary page array.
> -	 */
> -	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
> -	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
> -
> -	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> -	if (unlikely(size <= 0))
> -		return size ? size : -EFAULT;
> -
> -	for (left = size, i = 0; left > 0; left -= len, i++) {
> -		struct page *page = pages[i];
> -		bool same_page = false;
> -
> -		len = min_t(size_t, PAGE_SIZE - offset, left);
> -		if (bio_add_hw_page(q, bio, page, len, offset,
> -				max_append_sectors, &same_page) != len) {
> -			bio_put_pages(pages + i, left, offset);
> -			ret = -EINVAL;
> -			break;
> -		}
> -		if (same_page)
> -			put_page(page);
> -		offset = 0;
> -	}
> -
> -	iov_iter_advance(iter, size - left);
> -	return ret;
> -}
> -
>  /**
>   * bio_iov_iter_get_pages - add user or kernel pages to a bio
>   * @bio: bio to add pages to
> @@ -1297,10 +1279,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	}
>  
>  	do {
> -		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
> -			ret = __bio_iov_append_get_pages(bio, iter);
> -		else
> -			ret = __bio_iov_iter_get_pages(bio, iter);
> +		ret = __bio_iov_iter_get_pages(bio, iter);
>  	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
>  
>  	/* don't account direct I/O as memory stall */

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  7:41                   ` Christoph Hellwig
@ 2022-05-19 16:35                     ` Keith Busch
  2022-05-20  6:07                       ` Christoph Hellwig
  0 siblings, 1 reply; 42+ messages in thread
From: Keith Busch @ 2022-05-19 16:35 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Eric Biggers, Keith Busch, linux-fsdevel, linux-block, axboe,
	Kernel Team, bvanassche, damien.lemoal

On Thu, May 19, 2022 at 09:41:14AM +0200, Christoph Hellwig wrote:
> On Wed, May 18, 2022 at 10:56:04PM -0600, Keith Busch wrote:
> > I'm surely missing something here. I know the bvecs are not necessarily lbs
> > aligned, but why does that matter? Is there some driver that can only take
> > exactly 1 bvec, but allows it to be unaligned? If so, we could take the segment
> > queue limit into account, but I am not sure that we need to.
> 
> At least stacking drivers had all kinds of interesting limitations in
> this area.  How much testing did this series get with all kinds of
> interesting dm targets and md personalities?

The dma limit doesn't stack (should it?). It gets the default, sector size - 1,
so their constraints are effectively unchanged.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  4:56                 ` Keith Busch
  2022-05-19  6:45                   ` Damien Le Moal
  2022-05-19  7:41                   ` Christoph Hellwig
@ 2022-05-19 17:01                   ` Keith Busch
  2022-05-19 17:27                     ` Eric Biggers
  2 siblings, 1 reply; 42+ messages in thread
From: Keith Busch @ 2022-05-19 17:01 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Wed, May 18, 2022 at 10:56:04PM -0600, Keith Busch wrote:
> On Wed, May 18, 2022 at 08:27:31PM -0700, Eric Biggers wrote:
> > Note, there's also lots of code that assumes that bio_vec::bv_len is a multiple
> > of 512.  
> 
> Could you point me to some examples?

Just to answer my own question, blk-crypto and blk-merge appear to have these
512 byte bv_len assumptions.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  6:45                   ` Damien Le Moal
@ 2022-05-19 17:19                     ` Eric Biggers
  2022-05-20  3:41                       ` Damien Le Moal
  0 siblings, 1 reply; 42+ messages in thread
From: Eric Biggers @ 2022-05-19 17:19 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Keith Busch, Keith Busch, linux-fsdevel, linux-block, axboe,
	Kernel Team, hch, bvanassche

On Thu, May 19, 2022 at 08:45:55AM +0200, Damien Le Moal wrote:
> On 2022/05/19 6:56, Keith Busch wrote:
> > On Wed, May 18, 2022 at 08:27:31PM -0700, Eric Biggers wrote:
> >>
> >> So the bio ends up with a total length that is a multiple of the logical block
> >> size, but the lengths of the individual bvecs in the bio are *not* necessarily
> >> multiples of the logical block size.  That's the problem.
> > 
> > I'm surely missing something here. I know the bvecs are not necessarily lbs
> > aligned, but why does that matter? Is there some driver that can only take
> > exactly 1 bvec, but allows it to be unaligned? If so, we could take the segment
> > queue limit into account, but I am not sure that we need to.
> 
> For direct IO, the first bvec will always be aligned to a logical block size.
> See __blkdev_direct_IO() and __blkdev_direct_IO_simple():
> 
>         if ((pos | iov_iter_alignment(iter)) &
>             (bdev_logical_block_size(bdev) - 1))
>                 return -EINVAL;
> 
> And given that, all bvecs should also be LBA aligned since the LBA size is
> always a divisor of the page size. Since splitting is always done on an LBA
> boundary, I do not see how we can ever get bvecs that are not LBA aligned.
> Or I am missing something too...
> 

You're looking at the current upstream code.  This patch changes that to:

	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
		return -EINVAL;
	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
		return -EINVAL;

So, if this patch is accepted, then the file position and total I/O length will
need to be logical block aligned (as before), but the memory address and length
of each individual iovec will no longer need to be logical block aligned.  How
the iovecs get turned into bios (and thus bvecs) is a little complicated, but
the result is that logical blocks will be able to span bvecs.

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19 17:01                   ` Keith Busch
@ 2022-05-19 17:27                     ` Eric Biggers
  2022-05-19 17:43                       ` Keith Busch
  0 siblings, 1 reply; 42+ messages in thread
From: Eric Biggers @ 2022-05-19 17:27 UTC (permalink / raw)
  To: Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Thu, May 19, 2022 at 11:01:05AM -0600, Keith Busch wrote:
> On Wed, May 18, 2022 at 10:56:04PM -0600, Keith Busch wrote:
> > On Wed, May 18, 2022 at 08:27:31PM -0700, Eric Biggers wrote:
> > > Note, there's also lots of code that assumes that bio_vec::bv_len is a multiple
> > > of 512.  
> > 
> > Could you point me to some examples?
> 
> Just to answer my own question, blk-crypto and blk-merge appear to have these
> 512 byte bv_len assumptions.

blk_bio_segment_split() and bio_advance_iter() are two more examples.

- Eric

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19 17:27                     ` Eric Biggers
@ 2022-05-19 17:43                       ` Keith Busch
  0 siblings, 0 replies; 42+ messages in thread
From: Keith Busch @ 2022-05-19 17:43 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal

On Thu, May 19, 2022 at 10:27:04AM -0700, Eric Biggers wrote:
> On Thu, May 19, 2022 at 11:01:05AM -0600, Keith Busch wrote:
> > On Wed, May 18, 2022 at 10:56:04PM -0600, Keith Busch wrote:
> > > On Wed, May 18, 2022 at 08:27:31PM -0700, Eric Biggers wrote:
> > > > Note, there's also lots of code that assumes that bio_vec::bv_len is a multiple
> > > > of 512.  
> > > 
> > > Could you point me to some examples?
> > 
> > Just to answer my own question, blk-crypto and blk-merge appear to have these
> > 512 byte bv_len assumptions.
> 
> blk_bio_segment_split() and bio_advance_iter() are two more examples.

I agree about blk_bio_segment_split(), but bio_advance_iter() looks fine. That
just assumes the entire length is a multiple of 512, not any particular bvec.

Anyway, I accept your point that some existing code has this assumption, and
will address these before the next revision.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19  7:39       ` Christoph Hellwig
@ 2022-05-19 22:31         ` Keith Busch
  0 siblings, 0 replies; 42+ messages in thread
From: Keith Busch @ 2022-05-19 22:31 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Eric Biggers, Keith Busch, linux-fsdevel, linux-block, axboe,
	Kernel Team, bvanassche, damien.lemoal

On Thu, May 19, 2022 at 09:39:12AM +0200, Christoph Hellwig wrote:
> On Wed, May 18, 2022 at 07:00:39PM -0600, Keith Busch wrote:
> > How? This patch ensures every segment is block size aligned. We can always
> > split a bio in the middle of a bvec for any lower level hardware constraints,
> > and I'm not finding any splitting criteria that would try to break a bio on a
> > non-block aligned boundary.
> 
> Do you mean bio_vec with segment here?  How do we ensure that?

I may not be understanding what you're asking here, but I'll try to answer.

In this path, we are dealing with ITER_IOVEC type. This patch ensures each
virtually contiguous segment is a block size multiple via the ALIGN_DOWN part
of the patch, and builds the bvec accordingly. An error will occur if the
ALIGN_DOWN ever results in a 0 size, so we will know each segment the user
provided is appropriately sized.

That's not to say each resulting bvec is block sized. Only the sum of all the
bvecs is guaranteed that property.

Consider something like the below userspace snippet reading a 512b sector
crossing a page with an unusual offset. If the two pages are not contiguous:

  bvec[0].bv_offset = 4092
  bvec[0].bv_len = 4

  bvec[1].bv_offset = 0
  bvec[1].bv_len = 508

If they are contiguous, you'd just get 1 bvec with the same bv_offset, but a
512b len.

---
	int ret, fd;
	char *buf;
...
	ret = posix_memalign((void **)&buf, 4096, 8192);
...
	ret = pread(fd, buf + 4092, 512, 0);
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19 17:19                     ` Eric Biggers
@ 2022-05-20  3:41                       ` Damien Le Moal
  0 siblings, 0 replies; 42+ messages in thread
From: Damien Le Moal @ 2022-05-20  3:41 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, Keith Busch, linux-fsdevel, linux-block, axboe,
	Kernel Team, hch, bvanassche

On 5/20/22 02:19, Eric Biggers wrote:
> On Thu, May 19, 2022 at 08:45:55AM +0200, Damien Le Moal wrote:
>> On 2022/05/19 6:56, Keith Busch wrote:
>>> On Wed, May 18, 2022 at 08:27:31PM -0700, Eric Biggers wrote:
>>>>
>>>> So the bio ends up with a total length that is a multiple of the logical block
>>>> size, but the lengths of the individual bvecs in the bio are *not* necessarily
>>>> multiples of the logical block size.  That's the problem.
>>>
>>> I'm surely missing something here. I know the bvecs are not necessarily lbs
>>> aligned, but why does that matter? Is there some driver that can only take
>>> exactly 1 bvec, but allows it to be unaligned? If so, we could take the segment
>>> queue limit into account, but I am not sure that we need to.
>>
>> For direct IO, the first bvec will always be aligned to a logical block size.
>> See __blkdev_direct_IO() and __blkdev_direct_IO_simple():
>>
>>         if ((pos | iov_iter_alignment(iter)) &
>>             (bdev_logical_block_size(bdev) - 1))
>>                 return -EINVAL;
>>
>> And given that, all bvecs should also be LBA aligned since the LBA size is
>> always a divisor of the page size. Since splitting is always done on an LBA
>> boundary, I do not see how we can ever get bvecs that are not LBA aligned.
>> Or I am missing something too...
>>
> 
> You're looking at the current upstream code.  This patch changes that to:
> 
> 	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> 		return -EINVAL;
> 	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> 		return -EINVAL;
> 
> So, if this patch is accepted, then the file position and total I/O length will
> need to be logical block aligned (as before), but the memory address and length
> of each individual iovec will no longer need to be logical block aligned.  How
> the iovecs get turned into bios (and thus bvecs) is a little complicated, but
> the result is that logical blocks will be able to span bvecs.

Indeed. I missed that change in patch 3. Got it.

> 
> - Eric


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19 16:35                     ` Keith Busch
@ 2022-05-20  6:07                       ` Christoph Hellwig
  0 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2022-05-20  6:07 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Eric Biggers, Keith Busch, linux-fsdevel,
	linux-block, axboe, Kernel Team, bvanassche, damien.lemoal

On Thu, May 19, 2022 at 10:35:23AM -0600, Keith Busch wrote:
> > At least stacking drivers had all kinds of interesting limitations in
> > this area.  How much testing did this series get with all kinds of
> > interesting dm targets and md personalities?
> 
> The dma limit doesn't stack (should it?). It gets the default, sector size - 1,
> so their constraints are effectively unchanged.

In general it should stack, as modulo implementation issues stacking
drivers should not care about the alignment.  However since it doesn't
there might be some of those.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
  2022-05-19 14:08     ` Keith Busch
@ 2022-05-20  6:10       ` Christoph Hellwig
  0 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2022-05-20  6:10 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Keith Busch, linux-fsdevel, linux-block,
	axboe, Kernel Team, bvanassche, damien.lemoal

On Thu, May 19, 2022 at 08:08:50AM -0600, Keith Busch wrote:
> > >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > > +	if (size > 0)
> > > +		size = ALIGN_DOWN(size, queue_logical_block_size(q));
> > 
> > So if we do get a size that is not logical block size alignment here,
> > we reduce it to the block size aligned one below.  Why do we do that?
> 
> There are two possibilities:
> 
> In the first case, the number of pages in this iteration exceeds bi_max_vecs.
> Rounding down completes the bio with a block aligned size, and the remainder
> will be picked up for the next bio, or possibly even the current bio if the
> pages are sufficiently physically contiguous.
> 
> The other case is a bad iov. If we're doing __blkdev_direct_IO(), it will error
> out immediately if the rounded size is 0, or the next iteration when the next
> size is rounded to 0. If we're doing the __blkdev_direct_IO_simple(), it will
> error out when it sees the iov hasn't advanced to the end.

Can you please document this with a comment in the code?

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2022-05-20  6:11 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-18 17:11 [PATCHv2 0/3] direct io alignment relax Keith Busch
2022-05-18 17:11 ` [PATCHv2 1/3] block/bio: remove duplicate append pages code Keith Busch
2022-05-18 20:21   ` Chaitanya Kulkarni
2022-05-19  4:28   ` Bart Van Assche
2022-05-19  7:32   ` Christoph Hellwig
2022-05-19 14:19     ` Keith Busch
2022-05-18 17:11 ` [PATCHv2 2/3] block: export dma_alignment attribute Keith Busch
2022-05-18 20:22   ` Chaitanya Kulkarni
2022-05-19  4:30   ` Bart Van Assche
2022-05-19  7:33   ` Christoph Hellwig
2022-05-18 17:11 ` [PATCHv2 3/3] block: relax direct io memory alignment Keith Busch
2022-05-19  0:14   ` Eric Biggers
2022-05-19  1:00     ` Keith Busch
2022-05-19  1:53       ` Eric Biggers
2022-05-19  1:59         ` Keith Busch
2022-05-19  2:08           ` Eric Biggers
2022-05-19  2:25             ` Keith Busch
2022-05-19  3:27               ` Eric Biggers
2022-05-19  4:40                 ` Bart Van Assche
2022-05-19  4:56                 ` Keith Busch
2022-05-19  6:45                   ` Damien Le Moal
2022-05-19 17:19                     ` Eric Biggers
2022-05-20  3:41                       ` Damien Le Moal
2022-05-19  7:41                   ` Christoph Hellwig
2022-05-19 16:35                     ` Keith Busch
2022-05-20  6:07                       ` Christoph Hellwig
2022-05-19 17:01                   ` Keith Busch
2022-05-19 17:27                     ` Eric Biggers
2022-05-19 17:43                       ` Keith Busch
2022-05-19  7:39       ` Christoph Hellwig
2022-05-19 22:31         ` Keith Busch
2022-05-19  7:38   ` Christoph Hellwig
2022-05-19 14:08     ` Keith Busch
2022-05-20  6:10       ` Christoph Hellwig
2022-05-18 22:45 ` [PATCHv2 0/3] direct io alignment relax Jens Axboe
2022-05-19  7:42   ` Christoph Hellwig
2022-05-19 12:46     ` Jens Axboe
2022-05-18 23:26 ` Eric Biggers
2022-05-19  0:51   ` Keith Busch
2022-05-19  1:02     ` Chaitanya Kulkarni
2022-05-19  2:02       ` Eric Biggers
2022-05-19  7:43         ` hch

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.