linux-block.vger.kernel.org archive mirror
* [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
@ 2020-12-01 12:06 Ming Lei
  2020-12-01 12:52 ` Matthew Wilcox
                   ` (4 more replies)
  0 siblings, 5 replies; 23+ messages in thread
From: Ming Lei @ 2020-12-01 12:06 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Ming Lei, Matthew Wilcox, Pavel Begunkov,
	Christoph Hellwig, linux-fsdevel

Pavel reported that iov_iter_npages() is a bit heavy in the case of a
bvec iter.

It turns out it isn't necessary to iterate over every page in the bvec
iter: iov_iter_npages() is only called to figure out how many bio vecs
need to be allocated, and each vector of a bvec iter can simply be mapped
to one bio vec, so just return iter->nr_segs from bio_iov_iter_nvecs()
for a bvec iter.

Also rename the local variable 'nr_pages' to 'nr_vecs', which matches its
real usage.

This patch is based on Matthew's post:

https://lore.kernel.org/linux-block/20201120123931.GN29991@casper.infradead.org/

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Pavel Begunkov <asml.silence@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 fs/block_dev.c       | 30 +++++++++++++++---------------
 fs/iomap/direct-io.c | 14 +++++++-------
 include/linux/bio.h  | 10 ++++++++++
 3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index d8664f5c1ff6..4fd9bb4306db 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -218,7 +218,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
 
 static ssize_t
 __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
-		int nr_pages)
+		int nr_vecs)
 {
 	struct file *file = iocb->ki_filp;
 	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
@@ -233,16 +233,16 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	    (bdev_logical_block_size(bdev) - 1))
 		return -EINVAL;
 
-	if (nr_pages <= DIO_INLINE_BIO_VECS)
+	if (nr_vecs <= DIO_INLINE_BIO_VECS)
 		vecs = inline_vecs;
 	else {
-		vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+		vecs = kmalloc_array(nr_vecs, sizeof(struct bio_vec),
 				     GFP_KERNEL);
 		if (!vecs)
 			return -ENOMEM;
 	}
 
-	bio_init(&bio, vecs, nr_pages);
+	bio_init(&bio, vecs, nr_vecs);
 	bio_set_dev(&bio, bdev);
 	bio.bi_iter.bi_sector = pos >> 9;
 	bio.bi_write_hint = iocb->ki_hint;
@@ -353,7 +353,7 @@ static void blkdev_bio_end_io(struct bio *bio)
 }
 
 static ssize_t
-__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
+__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_vecs)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
@@ -371,7 +371,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	    (bdev_logical_block_size(bdev) - 1))
 		return -EINVAL;
 
-	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
+	bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, &blkdev_dio_pool);
 
 	dio = container_of(bio, struct blkdev_dio, bio);
 	dio->is_sync = is_sync = is_sync_kiocb(iocb);
@@ -420,8 +420,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		dio->size += bio->bi_iter.bi_size;
 		pos += bio->bi_iter.bi_size;
 
-		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
-		if (!nr_pages) {
+		nr_vecs = bio_iov_iter_nvecs(iter, BIO_MAX_PAGES);
+		if (!nr_vecs) {
 			bool polled = false;
 
 			if (iocb->ki_flags & IOCB_HIPRI) {
@@ -451,7 +451,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		}
 
 		submit_bio(bio);
-		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		bio = bio_alloc(GFP_KERNEL, nr_vecs);
 	}
 
 	if (!is_poll)
@@ -483,15 +483,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 static ssize_t
 blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
-	int nr_pages;
+	int nr_vecs;
 
-	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
-	if (!nr_pages)
+	nr_vecs = bio_iov_iter_nvecs(iter, BIO_MAX_PAGES + 1);
+	if (!nr_vecs)
 		return 0;
-	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
-		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
+	if (is_sync_kiocb(iocb) && nr_vecs <= BIO_MAX_PAGES)
+		return __blkdev_direct_IO_simple(iocb, iter, nr_vecs);
 
-	return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
+	return __blkdev_direct_IO(iocb, iter, min(nr_vecs, BIO_MAX_PAGES));
 }
 
 static __init int blkdev_init(void)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 933f234d5bec..cc779ecc8144 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -211,7 +211,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 	struct bio *bio;
 	bool need_zeroout = false;
 	bool use_fua = false;
-	int nr_pages, ret = 0;
+	int nr_vecs, ret = 0;
 	size_t copied = 0;
 	size_t orig_count;
 
@@ -250,9 +250,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 	orig_count = iov_iter_count(dio->submit.iter);
 	iov_iter_truncate(dio->submit.iter, length);
 
-	nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
-	if (nr_pages <= 0) {
-		ret = nr_pages;
+	nr_vecs = bio_iov_iter_nvecs(dio->submit.iter, BIO_MAX_PAGES);
+	if (nr_vecs <= 0) {
+		ret = nr_vecs;
 		goto out;
 	}
 
@@ -271,7 +271,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 			goto out;
 		}
 
-		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		bio = bio_alloc(GFP_KERNEL, nr_vecs);
 		bio_set_dev(bio, iomap->bdev);
 		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
 		bio->bi_write_hint = dio->iocb->ki_hint;
@@ -308,10 +308,10 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		dio->size += n;
 		copied += n;
 
-		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
+		nr_vecs = bio_iov_iter_nvecs(dio->submit.iter, BIO_MAX_PAGES);
 		iomap_dio_submit_bio(dio, iomap, bio, pos);
 		pos += n;
-	} while (nr_pages);
+	} while (nr_vecs);
 
 	/*
 	 * We need to zeroout the tail of a sub-block write if the extent type
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ecf67108f091..b985857ce9d1 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -10,6 +10,7 @@
 #include <linux/ioprio.h>
 /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
 #include <linux/blk_types.h>
+#include <linux/uio.h>
 
 #define BIO_DEBUG
 
@@ -807,4 +808,13 @@ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
 		bio->bi_opf |= REQ_NOWAIT;
 }
 
+static inline int bio_iov_iter_nvecs(const struct iov_iter *i, int maxvecs)
+{
+	if (!iov_iter_count(i))
+		return 0;
+	if (iov_iter_is_bvec(i))
+               return min_t(int, maxvecs, i->nr_segs);
+	return iov_iter_npages(i, maxvecs);
+}
+
 #endif /* __LINUX_BIO_H */
-- 
2.28.0



* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 12:06 [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs Ming Lei
@ 2020-12-01 12:52 ` Matthew Wilcox
  2020-12-01 12:59   ` Christoph Hellwig
  2020-12-02  1:46   ` Ming Lei
  2020-12-02 14:06 ` Pavel Begunkov
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 23+ messages in thread
From: Matthew Wilcox @ 2020-12-01 12:52 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Pavel Begunkov, Christoph Hellwig,
	linux-fsdevel

On Tue, Dec 01, 2020 at 08:06:52PM +0800, Ming Lei wrote:
> Pavel reported that iov_iter_npages is a bit heavy in case of bvec
> iter.
> 
> Turns out it isn't necessary to iterate every page in the bvec iter,
> and we call iov_iter_npages() just for figuring out how many bio
> vecs need to be allocated. And we can simply map each vector in bvec iter
> to bio's vec, so just return iter->nr_segs from bio_iov_iter_nvecs() for
> bvec iter.
> 
> Also rename local variable 'nr_pages' as 'nr_vecs' which exactly matches its
> real usage.
> 
> This patch is based on Mathew's post:
> 
> https://lore.kernel.org/linux-block/20201120123931.GN29991@casper.infradead.org/

But the only reason we want to know 'nr_vecs' is so we can allocate a
BIO which has that many vecs, right?  But we then don't actually use the
vecs in the bio because we use the ones already present in the iter.
That was why I had it return 1, not nr_vecs.

Did I miss something?

> +static inline int bio_iov_iter_nvecs(const struct iov_iter *i, int maxvecs)
> +{
> +	if (!iov_iter_count(i))
> +		return 0;
> +	if (iov_iter_is_bvec(i))
> +               return min_t(int, maxvecs, i->nr_segs);
> +	return iov_iter_npages(i, maxvecs);
> +}


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 12:52 ` Matthew Wilcox
@ 2020-12-01 12:59   ` Christoph Hellwig
  2020-12-01 13:17     ` Pavel Begunkov
  2020-12-02  1:46   ` Ming Lei
  1 sibling, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2020-12-01 12:59 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Ming Lei, Jens Axboe, linux-block, Pavel Begunkov,
	Christoph Hellwig, linux-fsdevel

On Tue, Dec 01, 2020 at 12:52:51PM +0000, Matthew Wilcox wrote:
> But the only reason we want to know 'nr_vecs' is so we can allocate a
> BIO which has that many vecs, right?  But we then don't actually use the
> vecs in the bio because we use the ones already present in the iter.
> That was why I had it return 1, not nr_vecs.
> 
> Did I miss something?

Right now __bio_iov_bvec_add_pages does not reuse the bvecs in the
iter.  That being said, while we are optimizing this path we might as
well look into reusing them.


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 12:59   ` Christoph Hellwig
@ 2020-12-01 13:17     ` Pavel Begunkov
  2020-12-01 13:32       ` Christoph Hellwig
  0 siblings, 1 reply; 23+ messages in thread
From: Pavel Begunkov @ 2020-12-01 13:17 UTC (permalink / raw)
  To: Christoph Hellwig, Matthew Wilcox
  Cc: Ming Lei, Jens Axboe, linux-block, linux-fsdevel

On 01/12/2020 12:59, Christoph Hellwig wrote:
> On Tue, Dec 01, 2020 at 12:52:51PM +0000, Matthew Wilcox wrote:
>> But the only reason we want to know 'nr_vecs' is so we can allocate a
>> BIO which has that many vecs, right?  But we then don't actually use the
>> vecs in the bio because we use the ones already present in the iter.
>> That was why I had it return 1, not nr_vecs.
>>
>> Did I miss something?
> 
> Right now __bio_iov_bvec_add_pages does not reuse the bvecs in the
> iter.  That being said while we are optmizing this path we might a well
> look into reusing them..

I was thinking about memcpy'ing the bvec instead of iterating as a first
step, and then trying to reuse the passed-in bvec.
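
A very rough sketch of that first step (illustration only; it assumes the
bio was allocated with at least iter->nr_segs vecs and it ignores a
partially consumed first segment, i.e. iter->iov_offset):

	/* copy the caller's segments wholesale instead of walking pages */
	memcpy(bio->bi_io_vec, iter->bvec,
	       iter->nr_segs * sizeof(struct bio_vec));
	bio->bi_vcnt = iter->nr_segs;
	bio->bi_iter.bi_size = iov_iter_count(iter);
	iov_iter_advance(iter, bio->bi_iter.bi_size);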

A thing that doesn't play nice with that is setting BIO_WORKINGSET in
__bio_add_page(), which requires iterating over all pages anyway. I have
no clue what it is, so I'd rather ask whether we can optimise it out
somehow? Apart from pre-computing for specific cases...

E.g. can pages of a single bvec segment be both in and out of a working
set? (i.e. PageWorkingset(page)).

-- 
Pavel Begunkov


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 13:17     ` Pavel Begunkov
@ 2020-12-01 13:32       ` Christoph Hellwig
  2020-12-01 13:36         ` Pavel Begunkov
  2020-12-03 22:36         ` Johannes Weiner
  0 siblings, 2 replies; 23+ messages in thread
From: Christoph Hellwig @ 2020-12-01 13:32 UTC (permalink / raw)
  To: Pavel Begunkov
  Cc: Christoph Hellwig, Matthew Wilcox, Ming Lei, Jens Axboe,
	linux-block, linux-fsdevel, Johannes Weiner

On Tue, Dec 01, 2020 at 01:17:49PM +0000, Pavel Begunkov wrote:
> I was thinking about memcpy bvec instead of iterating as a first step,
> and then try to reuse passed in bvec.
> 
> A thing that doesn't play nice with that is setting BIO_WORKINGSET in
> __bio_add_page(), which requires to iterate all pages anyway. I have no
> clue what it is, so rather to ask if we can optimise it out somehow?
> Apart from pre-computing for specific cases...
> 
> E.g. can pages of a single bvec segment be both in and out of a working
> set? (i.e. PageWorkingset(page)).

Adding Johannes for the PageWorkingset logic, which keeps confusing me
every time I look at it.  I think it is intended to deal with pages
being swapped out and in, and doesn't make much sense to look at in
any form for direct I/O, but as said I'm rather confused by this code.

If PageWorkingset is a non-issue we should be able to just point the
bio at the biovec array.  I think that can be done by allocating the bio
with nr_iovecs == 0, and then just updating ->bi_io_vec and ->bi_vcnt
using a little helper like this:

static inline void bio_assign_bvec(struct bio *bio, struct bio_vec *bvecs,
		unsigned short nr_bvecs)
{
	WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0);
	bio->bi_io_vec = bvecs;
	bio->bi_vcnt = nr_bvecs;
}
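
For illustration, a bvec fast path in bio_iov_iter_get_pages() built on
such a helper might then look roughly like this (sketch only, assuming the
caller keeps the bvec array alive for the duration of the I/O, which is
exactly the question raised below):

	if (iov_iter_is_bvec(iter)) {
		bio_assign_bvec(bio, (struct bio_vec *)iter->bvec,
				iter->nr_segs);
		bio->bi_iter.bi_bvec_done = iter->iov_offset;
		bio->bi_iter.bi_size = iov_iter_count(iter);
		iov_iter_advance(iter, bio->bi_iter.bi_size);
		return 0;
	}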


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 13:32       ` Christoph Hellwig
@ 2020-12-01 13:36         ` Pavel Begunkov
  2020-12-01 13:45           ` Christoph Hellwig
  2020-12-03 22:36         ` Johannes Weiner
  1 sibling, 1 reply; 23+ messages in thread
From: Pavel Begunkov @ 2020-12-01 13:36 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Matthew Wilcox, Ming Lei, Jens Axboe, linux-block, linux-fsdevel,
	Johannes Weiner

On 01/12/2020 13:32, Christoph Hellwig wrote:
> On Tue, Dec 01, 2020 at 01:17:49PM +0000, Pavel Begunkov wrote:
>> I was thinking about memcpy bvec instead of iterating as a first step,
>> and then try to reuse passed in bvec.
>>
>> A thing that doesn't play nice with that is setting BIO_WORKINGSET in
>> __bio_add_page(), which requires to iterate all pages anyway. I have no
>> clue what it is, so rather to ask if we can optimise it out somehow?
>> Apart from pre-computing for specific cases...
>>
>> E.g. can pages of a single bvec segment be both in and out of a working
>> set? (i.e. PageWorkingset(page)).
> 
> Adding Johannes for the PageWorkingset logic, which keeps confusing me
> everytime I look at it.  I think it is intended to deal with pages
> being swapped out and in, and doesn't make much sense to look at in
> any form for direct I/O, but as said I'm rather confused by this code.
> 
> If PageWorkingset is a non-issue we should be able to just point the
> bio at the biovec array.  I think that be done by allocating the bio
> with nr_iovecs == 0, and then just updating >bi_io_vec and ->bi_vcnt
> using a little helper like this:

Yeah, that's the idea, but I also wanted to verify that callers don't
free it while it's in use, or, if that's not the case, to make it
conditional by adding a flag in iov_iter.

Can anybody vouch right off the bat that all callers behave well?

> 
> static inline void bio_assign_bvec(struct bio *bio, struct bio_vec *bvecs,
> 		unsigned short nr_bvecs)
> {
> 	WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0);
> 	bio->bi_io_vec = bvecs;
> 	bio->bi_vcnt = nr_bvecs;
> }
> 

-- 
Pavel Begunkov


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 13:36         ` Pavel Begunkov
@ 2020-12-01 13:45           ` Christoph Hellwig
  2020-12-01 13:48             ` Pavel Begunkov
  2020-12-02  2:10             ` Ming Lei
  0 siblings, 2 replies; 23+ messages in thread
From: Christoph Hellwig @ 2020-12-01 13:45 UTC (permalink / raw)
  To: Pavel Begunkov
  Cc: Christoph Hellwig, Matthew Wilcox, Ming Lei, Jens Axboe,
	linux-block, linux-fsdevel, Johannes Weiner

On Tue, Dec 01, 2020 at 01:36:22PM +0000, Pavel Begunkov wrote:
> Yeah, that's the idea, but also wanted to verify that callers don't
> free it while in use, or if that's not the case to make it conditional
> by adding a flag in iov_iter.
> 
> Can anybody vow right off the bat that all callers behave well?

Yes, this will need a careful audit; I'm not too sure offhand.  For the
io_uring case, which is sort of the fast path, the caller won't free them
unless we allow the buffer unregistration to race with I/O.


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 13:45           ` Christoph Hellwig
@ 2020-12-01 13:48             ` Pavel Begunkov
  2020-12-02  2:10             ` Ming Lei
  1 sibling, 0 replies; 23+ messages in thread
From: Pavel Begunkov @ 2020-12-01 13:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Matthew Wilcox, Ming Lei, Jens Axboe, linux-block, linux-fsdevel,
	Johannes Weiner

On 01/12/2020 13:45, Christoph Hellwig wrote:
> On Tue, Dec 01, 2020 at 01:36:22PM +0000, Pavel Begunkov wrote:
>> Yeah, that's the idea, but also wanted to verify that callers don't
>> free it while in use, or if that's not the case to make it conditional
>> by adding a flag in iov_iter.
>>
>> Can anybody vow right off the bat that all callers behave well?
> 
> Yes, this will need a careful audit, I'm not too sure offhand.  For the
> io_uring case which is sortof the fast path the caller won't free them
> unless we allow the buffer unregistration to race with I/O.

For registered bufs io_uring waits for such requests to complete first,
so it's fine.

-- 
Pavel Begunkov


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 12:52 ` Matthew Wilcox
  2020-12-01 12:59   ` Christoph Hellwig
@ 2020-12-02  1:46   ` Ming Lei
  1 sibling, 0 replies; 23+ messages in thread
From: Ming Lei @ 2020-12-02  1:46 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Jens Axboe, linux-block, Pavel Begunkov, Christoph Hellwig,
	linux-fsdevel

On Tue, Dec 01, 2020 at 12:52:51PM +0000, Matthew Wilcox wrote:
> On Tue, Dec 01, 2020 at 08:06:52PM +0800, Ming Lei wrote:
> > Pavel reported that iov_iter_npages is a bit heavy in case of bvec
> > iter.
> > 
> > Turns out it isn't necessary to iterate every page in the bvec iter,
> > and we call iov_iter_npages() just for figuring out how many bio
> > vecs need to be allocated. And we can simply map each vector in bvec iter
> > to bio's vec, so just return iter->nr_segs from bio_iov_iter_nvecs() for
> > bvec iter.
> > 
> > Also rename local variable 'nr_pages' as 'nr_vecs' which exactly matches its
> > real usage.
> > 
> > This patch is based on Mathew's post:
> > 
> > https://lore.kernel.org/linux-block/20201120123931.GN29991@casper.infradead.org/
> 
> But the only reason we want to know 'nr_vecs' is so we can allocate a
> BIO which has that many vecs, right?  But we then don't actually use the
> vecs in the bio because we use the ones already present in the iter.
> That was why I had it return 1, not nr_vecs.
> 
> Did I miss something?

Please see bio_iov_iter_get_pages():

	do {
		...
		if (is_bvec)
			ret = __bio_iov_bvec_add_pages(bio, iter);
		...
	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));


So all bvecs in the bvec iter will be added to the bio, and it isn't
correct or efficient to just return 1.
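
For reference, each __bio_iov_bvec_add_pages() call boils down to roughly
the following (simplified sketch, not the exact kernel code):

	const struct bio_vec *bv = iter->bvec;
	size_t len = min_t(size_t, bv->bv_len - iter->iov_offset,
			   iov_iter_count(iter));

	/* one segment of the iter becomes one vec of the bio */
	if (bio_add_page(bio, bv->bv_page, len,
			 bv->bv_offset + iter->iov_offset) != len)
		return -EINVAL;
	iov_iter_advance(iter, len);
	return 0;

That is, one segment is consumed per loop iteration, so the bio needs room
for one vec per remaining segment, not just one vec in total.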

thanks,
Ming



* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 13:45           ` Christoph Hellwig
  2020-12-01 13:48             ` Pavel Begunkov
@ 2020-12-02  2:10             ` Ming Lei
  2020-12-02  8:02               ` Christoph Hellwig
  1 sibling, 1 reply; 23+ messages in thread
From: Ming Lei @ 2020-12-02  2:10 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Pavel Begunkov, Matthew Wilcox, Jens Axboe, linux-block,
	linux-fsdevel, Johannes Weiner

On Tue, Dec 01, 2020 at 01:45:42PM +0000, Christoph Hellwig wrote:
> On Tue, Dec 01, 2020 at 01:36:22PM +0000, Pavel Begunkov wrote:
> > Yeah, that's the idea, but also wanted to verify that callers don't
> > free it while in use, or if that's not the case to make it conditional
> > by adding a flag in iov_iter.
> > 
> > Can anybody vow right off the bat that all callers behave well?
> 
> Yes, this will need a careful audit, I'm not too sure offhand.  For the
> io_uring case which is sortof the fast path the caller won't free them
> unless we allow the buffer unregistration to race with I/O.

The loop driver's aio usage is fine; I just found that fd_execute_rw_aio()
isn't good.


Thanks,
Ming



* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-02  2:10             ` Ming Lei
@ 2020-12-02  8:02               ` Christoph Hellwig
  0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2020-12-02  8:02 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Pavel Begunkov, Matthew Wilcox, Jens Axboe,
	linux-block, linux-fsdevel, Johannes Weiner

On Wed, Dec 02, 2020 at 10:10:21AM +0800, Ming Lei wrote:
> > Yes, this will need a careful audit, I'm not too sure offhand.  For the
> > io_uring case which is sortof the fast path the caller won't free them
> > unless we allow the buffer unregistration to race with I/O.
> 
> Loop's aio usage is fine, just found fd_execute_rw_aio() isn't good.

Yes.  But that one is trivial to fix.


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 12:06 [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs Ming Lei
  2020-12-01 12:52 ` Matthew Wilcox
@ 2020-12-02 14:06 ` Pavel Begunkov
  2020-12-02 15:02 ` Christoph Hellwig
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 23+ messages in thread
From: Pavel Begunkov @ 2020-12-02 14:06 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe
  Cc: linux-block, Matthew Wilcox, Christoph Hellwig, linux-fsdevel

On 01/12/2020 12:06, Ming Lei wrote:
> Pavel reported that iov_iter_npages is a bit heavy in case of bvec
> iter.
> 
> Turns out it isn't necessary to iterate every page in the bvec iter,
> and we call iov_iter_npages() just for figuring out how many bio
> vecs need to be allocated. And we can simply map each vector in bvec iter
> to bio's vec, so just return iter->nr_segs from bio_iov_iter_nvecs() for
> bvec iter.

Looks good to me. Except for using BIO_MAX_PAGES for the number of vecs,
it's even cleaner, because it adds pages constrained by the number of
vecs, not pages, with all the side effects, like skipping
__blkdev_direct_IO_simple() for many-page 1-segment bvecs.

One nit below.

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>

> 
> Also rename local variable 'nr_pages' as 'nr_vecs' which exactly matches its
> real usage.
> 
> This patch is based on Mathew's post:
> 
> https://lore.kernel.org/linux-block/20201120123931.GN29991@casper.infradead.org/
> 
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Pavel Begunkov <asml.silence@gmail.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: linux-fsdevel@vger.kernel.org
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  fs/block_dev.c       | 30 +++++++++++++++---------------
>  fs/iomap/direct-io.c | 14 +++++++-------
>  include/linux/bio.h  | 10 ++++++++++
>  3 files changed, 32 insertions(+), 22 deletions(-)
> 
[...]
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index ecf67108f091..b985857ce9d1 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -10,6 +10,7 @@
[...]
>  
> +static inline int bio_iov_iter_nvecs(const struct iov_iter *i, int maxvecs)
> +{
> +	if (!iov_iter_count(i))
> +		return 0;
> +	if (iov_iter_is_bvec(i))
> +               return min_t(int, maxvecs, i->nr_segs);

spaces instead of tabs

> +	return iov_iter_npages(i, maxvecs);
> +}
> +
>  #endif /* __LINUX_BIO_H */
> 

-- 
Pavel Begunkov


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 12:06 [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs Ming Lei
  2020-12-01 12:52 ` Matthew Wilcox
  2020-12-02 14:06 ` Pavel Begunkov
@ 2020-12-02 15:02 ` Christoph Hellwig
  2020-12-02 16:56 ` Jens Axboe
  2020-12-07 18:07 ` Pavel Begunkov
  4 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2020-12-02 15:02 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Matthew Wilcox, Pavel Begunkov,
	Christoph Hellwig, linux-fsdevel

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 12:06 [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs Ming Lei
                   ` (2 preceding siblings ...)
  2020-12-02 15:02 ` Christoph Hellwig
@ 2020-12-02 16:56 ` Jens Axboe
  2020-12-07 18:07 ` Pavel Begunkov
  4 siblings, 0 replies; 23+ messages in thread
From: Jens Axboe @ 2020-12-02 16:56 UTC (permalink / raw)
  To: Ming Lei
  Cc: linux-block, Matthew Wilcox, Pavel Begunkov, Christoph Hellwig,
	linux-fsdevel

On 12/1/20 5:06 AM, Ming Lei wrote:
> Pavel reported that iov_iter_npages is a bit heavy in case of bvec
> iter.
> 
> Turns out it isn't necessary to iterate every page in the bvec iter,
> and we call iov_iter_npages() just for figuring out how many bio
> vecs need to be allocated. And we can simply map each vector in bvec iter
> to bio's vec, so just return iter->nr_segs from bio_iov_iter_nvecs() for
> bvec iter.
> 
> Also rename local variable 'nr_pages' as 'nr_vecs' which exactly matches its
> real usage.

I'd really prefer to keep the renaming separate from the actual change.

-- 
Jens Axboe



* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 13:32       ` Christoph Hellwig
  2020-12-01 13:36         ` Pavel Begunkov
@ 2020-12-03 22:36         ` Johannes Weiner
  2020-12-03 23:43           ` Pavel Begunkov
  2020-12-04 12:48           ` Christoph Hellwig
  1 sibling, 2 replies; 23+ messages in thread
From: Johannes Weiner @ 2020-12-03 22:36 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Pavel Begunkov, Matthew Wilcox, Ming Lei, Jens Axboe,
	linux-block, linux-fsdevel

On Tue, Dec 01, 2020 at 01:32:26PM +0000, Christoph Hellwig wrote:
> On Tue, Dec 01, 2020 at 01:17:49PM +0000, Pavel Begunkov wrote:
> > I was thinking about memcpy bvec instead of iterating as a first step,
> > and then try to reuse passed in bvec.
> > 
> > A thing that doesn't play nice with that is setting BIO_WORKINGSET in
> > __bio_add_page(), which requires to iterate all pages anyway. I have no
> > clue what it is, so rather to ask if we can optimise it out somehow?
> > Apart from pre-computing for specific cases...
> > 
> > E.g. can pages of a single bvec segment be both in and out of a working
> > set? (i.e. PageWorkingset(page)).
> 
> Adding Johannes for the PageWorkingset logic, which keeps confusing me
> everytime I look at it.  I think it is intended to deal with pages
> being swapped out and in, and doesn't make much sense to look at in
> any form for direct I/O, but as said I'm rather confused by this code.

Correct, it's only interesting for pages under LRU management - page
cache and swap pages. It should not matter for direct IO.

The VM uses the page flag to tell the difference between cold faults
(empty cache startup e.g.), and thrashing pages which are being read
back not long after they have been reclaimed. This influences reclaim
behavior, but can also indicate a general lack of memory.

The BIO_WORKINGSET flag is for the latter. To calculate the time
wasted by a lack of memory (memory pressure), we measure the total
time processes wait for thrashing pages. Usually that time is
dominated by waiting for in-flight io to complete and pages to become
uptodate. These waits are annotated on the page cache side.

However, in some cases, the IO submission path itself can block for
extended periods - if the device is congested or submissions are
throttled due to cgroup policy. To capture those waits, the bio is
flagged when it's for thrashing pages, and then submit_bio() will
report submission time of that bio as a thrashing-related delay.
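
Roughly, the two halves of that mechanism look like this (a simplified
sketch from memory; the exact code in __bio_add_page()/submit_bio()
differs in detail):

	/* when a thrashing page is added, mark the bio */
	if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
		bio_set_flag(bio, BIO_WORKINGSET);

	/* at submission, account the submission time as a memory stall */
	if (unlikely(bio_op(bio) == REQ_OP_READ &&
		     bio_flagged(bio, BIO_WORKINGSET))) {
		unsigned long pflags;
		blk_qc_t ret;

		psi_memstall_enter(&pflags);
		ret = submit_bio_noacct(bio);
		psi_memstall_leave(&pflags);
		return ret;
	}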

[ Obviously, in theory bios could have a mix of thrashing and
  non-thrashing pages, and the submission stall could have occurred
  even without the thrashing pages. But in practice we have locality,
  where groups of pages tend to be accessed/reclaimed/refaulted
  together. The assumption that the whole bio is due to thrashing when
  we see the first thrashing page is a workable simplification. ]

HTH


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-03 22:36         ` Johannes Weiner
@ 2020-12-03 23:43           ` Pavel Begunkov
  2020-12-04 12:48           ` Christoph Hellwig
  1 sibling, 0 replies; 23+ messages in thread
From: Pavel Begunkov @ 2020-12-03 23:43 UTC (permalink / raw)
  To: Johannes Weiner, Christoph Hellwig
  Cc: Matthew Wilcox, Ming Lei, Jens Axboe, linux-block, linux-fsdevel

On 03/12/2020 22:36, Johannes Weiner wrote:
> On Tue, Dec 01, 2020 at 01:32:26PM +0000, Christoph Hellwig wrote:
>> On Tue, Dec 01, 2020 at 01:17:49PM +0000, Pavel Begunkov wrote:
>>> I was thinking about memcpy bvec instead of iterating as a first step,
>>> and then try to reuse passed in bvec.
>>>
>>> A thing that doesn't play nice with that is setting BIO_WORKINGSET in
>>> __bio_add_page(), which requires to iterate all pages anyway. I have no
>>> clue what it is, so rather to ask if we can optimise it out somehow?
>>> Apart from pre-computing for specific cases...
>>>
>>> E.g. can pages of a single bvec segment be both in and out of a working
>>> set? (i.e. PageWorkingset(page)).
>>
>> Adding Johannes for the PageWorkingset logic, which keeps confusing me
>> everytime I look at it.  I think it is intended to deal with pages
>> being swapped out and in, and doesn't make much sense to look at in
>> any form for direct I/O, but as said I'm rather confused by this code.
> 
> Correct, it's only interesting for pages under LRU management - page
> cache and swap pages. It should not matter for direct IO.
> 
> The VM uses the page flag to tell the difference between cold faults
> (empty cache startup e.g.), and thrashing pages which are being read
> back not long after they have been reclaimed. This influences reclaim
> behavior, but can also indicate a general lack of memory.
> 
> The BIO_WORKINGSET flag is for the latter. To calculate the time
> wasted by a lack of memory (memory pressure), we measure the total
> time processes wait for thrashing pages. Usually that time is
> dominated by waiting for in-flight io to complete and pages to become
> uptodate. These waits are annotated on the page cache side.
> 
> However, in some cases, the IO submission path itself can block for
> extended periods - if the device is congested or submissions are
> throttled due to cgroup policy. To capture those waits, the bio is
> flagged when it's for thrashing pages, and then submit_bio() will
> report submission time of that bio as a thrashing-related delay.

TIL, thanks Johannes

> 
> [ Obviously, in theory bios could have a mix of thrashing and
>   non-thrashing pages, and the submission stall could have occurred
>   even without the thrashing pages. But in practice we have locality,
>   where groups of pages tend to be accessed/reclaimed/refaulted
>   together. The assumption that the whole bio is due to thrashing when
>   we see the first thrashing page is a workable simplification. ]

Great, then the last piece left before hacking this up is killing off the
mutating bio_for_each_segment_all(). But I don't think anyone will be sad
to see it go.

-- 
Pavel Begunkov


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-03 22:36         ` Johannes Weiner
  2020-12-03 23:43           ` Pavel Begunkov
@ 2020-12-04 12:48           ` Christoph Hellwig
  2020-12-10 13:18             ` Johannes Weiner
  1 sibling, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2020-12-04 12:48 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Christoph Hellwig, Pavel Begunkov, Matthew Wilcox, Ming Lei,
	Jens Axboe, linux-block, linux-fsdevel

On Thu, Dec 03, 2020 at 05:36:07PM -0500, Johannes Weiner wrote:
> Correct, it's only interesting for pages under LRU management - page
> cache and swap pages. It should not matter for direct IO.
> 
> The VM uses the page flag to tell the difference between cold faults
> (empty cache startup e.g.), and thrashing pages which are being read
> back not long after they have been reclaimed. This influences reclaim
> behavior, but can also indicate a general lack of memory.

I really wonder if we should move setting the flag out of bio_add_page
and into the writeback code, as it will do the wrong things for
non-writeback I/O, that is direct I/O or its in-kernel equivalents.


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-01 12:06 [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs Ming Lei
                   ` (3 preceding siblings ...)
  2020-12-02 16:56 ` Jens Axboe
@ 2020-12-07 18:07 ` Pavel Begunkov
  2020-12-08  1:21   ` Ming Lei
  2020-12-08  1:50   ` Ming Lei
  4 siblings, 2 replies; 23+ messages in thread
From: Pavel Begunkov @ 2020-12-07 18:07 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe
  Cc: linux-block, Matthew Wilcox, Christoph Hellwig, linux-fsdevel

On 01/12/2020 12:06, Ming Lei wrote:
> Pavel reported that iov_iter_npages is a bit heavy in case of bvec
> iter.
> 
> Turns out it isn't necessary to iterate every page in the bvec iter,
> and we call iov_iter_npages() just for figuring out how many bio
> vecs need to be allocated. And we can simply map each vector in bvec iter
> to bio's vec, so just return iter->nr_segs from bio_iov_iter_nvecs() for
> bvec iter.
> 
> Also rename local variable 'nr_pages' as 'nr_vecs' which exactly matches its
> real usage.
> 
> This patch is based on Mathew's post:

Tried this: the system didn't boot, plus I discovered a blown filesystem
after booting with a stable kernel. That's on top of 4498a8536c816 ("block:
use an xarray for disk->part_tbl"), which works fine. Ideas?

> https://lore.kernel.org/linux-block/20201120123931.GN29991@casper.infradead.org/
> 
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Pavel Begunkov <asml.silence@gmail.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: linux-fsdevel@vger.kernel.org
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  fs/block_dev.c       | 30 +++++++++++++++---------------
>  fs/iomap/direct-io.c | 14 +++++++-------
>  include/linux/bio.h  | 10 ++++++++++
>  3 files changed, 32 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index d8664f5c1ff6..4fd9bb4306db 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -218,7 +218,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
>  
>  static ssize_t
>  __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
> -		int nr_pages)
> +		int nr_vecs)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
> @@ -233,16 +233,16 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
>  	    (bdev_logical_block_size(bdev) - 1))
>  		return -EINVAL;
>  
> -	if (nr_pages <= DIO_INLINE_BIO_VECS)
> +	if (nr_vecs <= DIO_INLINE_BIO_VECS)
>  		vecs = inline_vecs;
>  	else {
> -		vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
> +		vecs = kmalloc_array(nr_vecs, sizeof(struct bio_vec),
>  				     GFP_KERNEL);
>  		if (!vecs)
>  			return -ENOMEM;
>  	}
>  
> -	bio_init(&bio, vecs, nr_pages);
> +	bio_init(&bio, vecs, nr_vecs);
>  	bio_set_dev(&bio, bdev);
>  	bio.bi_iter.bi_sector = pos >> 9;
>  	bio.bi_write_hint = iocb->ki_hint;
> @@ -353,7 +353,7 @@ static void blkdev_bio_end_io(struct bio *bio)
>  }
>  
>  static ssize_t
> -__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
> +__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_vecs)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = bdev_file_inode(file);
> @@ -371,7 +371,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  	    (bdev_logical_block_size(bdev) - 1))
>  		return -EINVAL;
>  
> -	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
> +	bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, &blkdev_dio_pool);
>  
>  	dio = container_of(bio, struct blkdev_dio, bio);
>  	dio->is_sync = is_sync = is_sync_kiocb(iocb);
> @@ -420,8 +420,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  		dio->size += bio->bi_iter.bi_size;
>  		pos += bio->bi_iter.bi_size;
>  
> -		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
> -		if (!nr_pages) {
> +		nr_vecs = bio_iov_iter_nvecs(iter, BIO_MAX_PAGES);
> +		if (!nr_vecs) {
>  			bool polled = false;
>  
>  			if (iocb->ki_flags & IOCB_HIPRI) {
> @@ -451,7 +451,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  		}
>  
>  		submit_bio(bio);
> -		bio = bio_alloc(GFP_KERNEL, nr_pages);
> +		bio = bio_alloc(GFP_KERNEL, nr_vecs);
>  	}
>  
>  	if (!is_poll)
> @@ -483,15 +483,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  static ssize_t
>  blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>  {
> -	int nr_pages;
> +	int nr_vecs;
>  
> -	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
> -	if (!nr_pages)
> +	nr_vecs = bio_iov_iter_nvecs(iter, BIO_MAX_PAGES + 1);
> +	if (!nr_vecs)
>  		return 0;
> -	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
> -		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
> +	if (is_sync_kiocb(iocb) && nr_vecs <= BIO_MAX_PAGES)
> +		return __blkdev_direct_IO_simple(iocb, iter, nr_vecs);
>  
> -	return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
> +	return __blkdev_direct_IO(iocb, iter, min(nr_vecs, BIO_MAX_PAGES));
>  }
>  
>  static __init int blkdev_init(void)
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 933f234d5bec..cc779ecc8144 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -211,7 +211,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  	struct bio *bio;
>  	bool need_zeroout = false;
>  	bool use_fua = false;
> -	int nr_pages, ret = 0;
> +	int nr_vecs, ret = 0;
>  	size_t copied = 0;
>  	size_t orig_count;
>  
> @@ -250,9 +250,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  	orig_count = iov_iter_count(dio->submit.iter);
>  	iov_iter_truncate(dio->submit.iter, length);
>  
> -	nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
> -	if (nr_pages <= 0) {
> -		ret = nr_pages;
> +	nr_vecs = bio_iov_iter_nvecs(dio->submit.iter, BIO_MAX_PAGES);
> +	if (nr_vecs <= 0) {
> +		ret = nr_vecs;
>  		goto out;
>  	}
>  
> @@ -271,7 +271,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  			goto out;
>  		}
>  
> -		bio = bio_alloc(GFP_KERNEL, nr_pages);
> +		bio = bio_alloc(GFP_KERNEL, nr_vecs);
>  		bio_set_dev(bio, iomap->bdev);
>  		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>  		bio->bi_write_hint = dio->iocb->ki_hint;
> @@ -308,10 +308,10 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  		dio->size += n;
>  		copied += n;
>  
> -		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
> +		nr_vecs = bio_iov_iter_nvecs(dio->submit.iter, BIO_MAX_PAGES);
>  		iomap_dio_submit_bio(dio, iomap, bio, pos);
>  		pos += n;
> -	} while (nr_pages);
> +	} while (nr_vecs);
>  
>  	/*
>  	 * We need to zeroout the tail of a sub-block write if the extent type
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index ecf67108f091..b985857ce9d1 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -10,6 +10,7 @@
>  #include <linux/ioprio.h>
>  /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
>  #include <linux/blk_types.h>
> +#include <linux/uio.h>
>  
>  #define BIO_DEBUG
>  
> @@ -807,4 +808,13 @@ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
>  		bio->bi_opf |= REQ_NOWAIT;
>  }
>  
> +static inline int bio_iov_iter_nvecs(const struct iov_iter *i, int maxvecs)
> +{
> +	if (!iov_iter_count(i))
> +		return 0;
> +	if (iov_iter_is_bvec(i))
> +               return min_t(int, maxvecs, i->nr_segs);
> +	return iov_iter_npages(i, maxvecs);
> +}
> +
>  #endif /* __LINUX_BIO_H */
> 

-- 
Pavel Begunkov


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-07 18:07 ` Pavel Begunkov
@ 2020-12-08  1:21   ` Ming Lei
  2020-12-08  1:50   ` Ming Lei
  1 sibling, 0 replies; 23+ messages in thread
From: Ming Lei @ 2020-12-08  1:21 UTC (permalink / raw)
  To: Pavel Begunkov
  Cc: Jens Axboe, linux-block, Matthew Wilcox, Christoph Hellwig,
	linux-fsdevel

On Mon, Dec 07, 2020 at 06:07:39PM +0000, Pavel Begunkov wrote:
> On 01/12/2020 12:06, Ming Lei wrote:
> > Pavel reported that iov_iter_npages is a bit heavy in case of bvec
> > iter.
> > 
> > Turns out it isn't necessary to iterate every page in the bvec iter,
> > and we call iov_iter_npages() just for figuring out how many bio
> > vecs need to be allocated. And we can simply map each vector in bvec iter
> > to bio's vec, so just return iter->nr_segs from bio_iov_iter_nvecs() for
> > bvec iter.
> > 
> > Also rename local variable 'nr_pages' as 'nr_vecs' which exactly matches its
> > real usage.
> > 
> > This patch is based on Mathew's post:
> 
> Tried this, the system didn't boot + discovered a filesystem blowned after
> booting with a stable kernel. That's on top of 4498a8536c816 ("block: use
> an xarray for disk->part_tbl"), which works fine. Ideas?

Can you share any log that shows the issue?

Can you share the kernel .config with us? And what is your root disk? I
don't see any issue with this kernel in my KVM test.


Thanks,
Ming



* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-07 18:07 ` Pavel Begunkov
  2020-12-08  1:21   ` Ming Lei
@ 2020-12-08  1:50   ` Ming Lei
  2020-12-08  2:54     ` Pavel Begunkov
  1 sibling, 1 reply; 23+ messages in thread
From: Ming Lei @ 2020-12-08  1:50 UTC (permalink / raw)
  To: Pavel Begunkov
  Cc: Jens Axboe, linux-block, Matthew Wilcox, Christoph Hellwig,
	linux-fsdevel

On Mon, Dec 07, 2020 at 06:07:39PM +0000, Pavel Begunkov wrote:
> On 01/12/2020 12:06, Ming Lei wrote:
> > Pavel reported that iov_iter_npages is a bit heavy in case of bvec
> > iter.
> > 
> > Turns out it isn't necessary to iterate every page in the bvec iter,
> > and we call iov_iter_npages() just for figuring out how many bio
> > vecs need to be allocated. And we can simply map each vector in bvec iter
> > to bio's vec, so just return iter->nr_segs from bio_iov_iter_nvecs() for
> > bvec iter.
> > 
> > Also rename local variable 'nr_pages' as 'nr_vecs' which exactly matches its
> > real usage.
> > 
> > This patch is based on Mathew's post:
> 
> Tried this, the system didn't boot + discovered a filesystem blowned after
> booting with a stable kernel. That's on top of 4498a8536c816 ("block: use
> an xarray for disk->part_tbl"), which works fine. Ideas?

I guess it is caused by Christoph's "store a pointer to the block_device in struct bio (again)",
which has been reverted in for-5.11/block.

I'd suggest running your test against the latest for-5.11/block one more time.

thanks,
Ming



* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-08  1:50   ` Ming Lei
@ 2020-12-08  2:54     ` Pavel Begunkov
  0 siblings, 0 replies; 23+ messages in thread
From: Pavel Begunkov @ 2020-12-08  2:54 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Matthew Wilcox, Christoph Hellwig,
	linux-fsdevel

On 08/12/2020 01:50, Ming Lei wrote:
> On Mon, Dec 07, 2020 at 06:07:39PM +0000, Pavel Begunkov wrote:
>> On 01/12/2020 12:06, Ming Lei wrote:
>>> Pavel reported that iov_iter_npages is a bit heavy in case of bvec
>>> iter.
>>>
>>> Turns out it isn't necessary to iterate every page in the bvec iter,
>>> and we call iov_iter_npages() just for figuring out how many bio
>>> vecs need to be allocated. And we can simply map each vector in bvec iter
>>> to bio's vec, so just return iter->nr_segs from bio_iov_iter_nvecs() for
>>> bvec iter.
>>>
>>> Also rename local variable 'nr_pages' as 'nr_vecs' which exactly matches its
>>> real usage.
>>>
>>> This patch is based on Mathew's post:
>>
>> Tried this, the system didn't boot + discovered a filesystem blowned after
>> booting with a stable kernel. That's on top of 4498a8536c816 ("block: use
>> an xarray for disk->part_tbl"), which works fine. Ideas?
> 
> I guess it is caused by Christoph's "store a pointer to the block_device in struct bio (again)"
> which has been reverted in for-5.11/block.
> 
> I'd suggest to run your test against the latest for-5.11/block one more time.

Ah, now it works, thanks for the idea.
Unfortunately, I haven't got any logs; it died pretty early.

-- 
Pavel Begunkov


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-04 12:48           ` Christoph Hellwig
@ 2020-12-10 13:18             ` Johannes Weiner
  2020-12-11 13:22               ` Pavel Begunkov
  0 siblings, 1 reply; 23+ messages in thread
From: Johannes Weiner @ 2020-12-10 13:18 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Pavel Begunkov, Matthew Wilcox, Ming Lei, Jens Axboe,
	linux-block, linux-fsdevel

Sorry, I'm only now getting back to this.

On Fri, Dec 04, 2020 at 12:48:49PM +0000, Christoph Hellwig wrote:
> On Thu, Dec 03, 2020 at 05:36:07PM -0500, Johannes Weiner wrote:
> > Correct, it's only interesting for pages under LRU management - page
> > cache and swap pages. It should not matter for direct IO.
> > 
> > The VM uses the page flag to tell the difference between cold faults
> > (empty cache startup e.g.), and thrashing pages which are being read
> > back not long after they have been reclaimed. This influences reclaim
> > behavior, but can also indicate a general lack of memory.
> 
> I really wonder if we should move setting the flag out of bio_add_page
> and into the writeback code, as it will do the wrong things for
> non-writeback I/O, that is direct I/O or its in-kernel equivalents.

Good point. When somebody does direct IO reads into a user page that
happens to have the flag set, we misattribute submission delays.

There is some background discussion from when I first submitted the
patch, which did the annotations on the writeback/page cache side:

https://lore.kernel.org/lkml/20190722201337.19180-1-hannes@cmpxchg.org/

Fragility is a concern, as this is part of the writeback code that is
spread out over several fs-specific implementations, and it's somewhat
easy to get the annotation wrong.

Some possible options I can think of:

1 open-coding the submit_bio() annotations in writeback code, like the original patch
  pros: no bio layer involvement at all - no BIO_WORKINGSET flag
  cons: lots of copy-paste code & comments

2 open-coding if (PageWorkingset()) bio_set_flag(BIO_WORKINGSET) in writeback code
  pros: slightly less complex callsite code, eliminates read check in submit_bio()
  cons: still somewhat copy-pasty (but the surrounding code is as well)

3 adding a bio_add_page_memstall() as per Dave in the original patch thread
  pros: minimal churn and self-documenting (may need a better name)
  cons: easy to incorrectly use raw bio_add_page() in writeback code

4 writeback & direct-io versions for bio_add_page()
  pros: hard to misuse
  cons: awkward interface/layering

5 flag bio itself as writeback or direct-io (BIO_BUFFERED?)
  pros: single version of bio_add_page()
  cons: easy to miss setting the flag, similar to 3

Personally, I'm torn between 2 and 5. What do you think?
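
For reference, option 2 at a writeback call site would be something like
this (sketch only):

	bio_add_page(bio, page, len, offset);
	if (PageWorkingset(page))
		bio_set_flag(bio, BIO_WORKINGSET);

with the PageWorkingset() check dropped from __bio_add_page() itself.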


* Re: [PATCH] block: add bio_iov_iter_nvecs for figuring out nr_vecs
  2020-12-10 13:18             ` Johannes Weiner
@ 2020-12-11 13:22               ` Pavel Begunkov
  0 siblings, 0 replies; 23+ messages in thread
From: Pavel Begunkov @ 2020-12-11 13:22 UTC (permalink / raw)
  To: Johannes Weiner, Christoph Hellwig
  Cc: Matthew Wilcox, Ming Lei, Jens Axboe, linux-block, linux-fsdevel

On 10/12/2020 13:18, Johannes Weiner wrote:
> Sorry, I'm only now getting back to this.
> 
> On Fri, Dec 04, 2020 at 12:48:49PM +0000, Christoph Hellwig wrote:
>> On Thu, Dec 03, 2020 at 05:36:07PM -0500, Johannes Weiner wrote:
>>> Correct, it's only interesting for pages under LRU management - page
>>> cache and swap pages. It should not matter for direct IO.
>>>
>>> The VM uses the page flag to tell the difference between cold faults
>>> (empty cache startup e.g.), and thrashing pages which are being read
>>> back not long after they have been reclaimed. This influences reclaim
>>> behavior, but can also indicate a general lack of memory.
>>
>> I really wonder if we should move setting the flag out of bio_add_page
>> and into the writeback code, as it will do the wrong things for
>> non-writeback I/O, that is direct I/O or its in-kernel equivalents.
> 
> Good point. When somebody does direct IO reads into a user page that
> happens to have the flag set, we misattribute submission delays.
> 
> There is some background discussion from when I first submitted the
> patch, which did the annotations on the writeback/page cache side:
> 
> https://lore.kernel.org/lkml/20190722201337.19180-1-hannes@cmpxchg.org/
> 
> Fragility is a concern, as this is part of the writeback code that is
> spread out over several fs-specific implementations, and it's somewhat
> easy to get the annotation wrong.
> 
> Some possible options I can think of:
> 
> 1 open-coding the submit_bio() annotations in writeback code, like the original patch
>   pros: no bio layer involvement at all - no BIO_WORKINGSET flag
>   cons: lots of copy-paste code & comments
> 
> 2 open-coding if (PageWorkingset()) bio_set_flag(BIO_WORKINGSET) in writeback code
>   pros: slightly less complex callsite code, eliminates read check in submit_bio()
>   cons: still somewhat copy-pasty (but the surrounding code is as well)
> 
> 3 adding a bio_add_page_memstall() as per Dave in the original patch thread
>   pros: minimal churn and self-documenting (may need a better name)
>   cons: easy to incorrectly use raw bio_add_page() in writeback code
> 
> 4 writeback & direct-io versions for bio_add_page()
>   pros: hard to misuse
>   cons: awkward interface/layering
> 
> 5 flag bio itself as writeback or direct-io (BIO_BUFFERED?)
>   pros: single version of bio_add_page()
>   cons: easy to miss setting the flag, similar to 3
> 
> Personally, I'm torn between 2 and 5. What do you think?

I was thinking that an easier option would be an inverted 3, i.e. leaving
bio_add_page() with the annotation as it is and using a special version of
it for direct IO. IIRC we'd only need to change bio_iov_iter_get_pages() +
its helpers for that.
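
A sketch of that inverted option 3 (the name below is hypothetical, not an
existing kernel API; bounds and merge checks are omitted for brevity):

/* like __bio_add_page(), but without the workingset annotation */
static void bio_add_page_noworkingset(struct bio *bio, struct page *page,
				      unsigned int len, unsigned int off)
{
	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];

	bv->bv_page = page;
	bv->bv_offset = off;
	bv->bv_len = len;
	bio->bi_iter.bi_size += len;
	bio->bi_vcnt++;
}

used only from bio_iov_iter_get_pages() and its helpers, leaving
bio_add_page() itself as-is for writeback.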

-- 
Pavel Begunkov

