Another split off from the "block optimisation round" series focusing
on fops.c. It addresses Christoph's review, the main change is
setting bvec directly instead of passing a "hint"
to bio_iov_iter_get_pages() in 3/5, and a prep patch 2/5.

I don't find a good way to deduplicate __blkdev_direct_IO_async(),
there are small differences in implementation. If that's fine I'd
suggest doing it afterwards, anyway I want to brush up
__blkdev_direct_IO(), e.g. remove effectively unnecessary DIO_MULTI_BIO
flag.

Performance stays the same as before; 1/5 alone gives ~4.5 -> 4.7 MIOPS,
plus 1-2% from 3/5.

Pavel Begunkov (5):
  block: add single bio async direct IO helper
  block: refactor bio_iov_bvec_set()
  block: avoid extra iter advance with async iocb
  block: kill unused polling bits in __blkdev_direct_IO()
  block: add async version of bio_set_polled

 block/bio.c         |  37 ++++++--------
 block/fops.c        | 116 ++++++++++++++++++++++++++++++++++++--------
 include/linux/bio.h |   6 +++
 3 files changed, 116 insertions(+), 43 deletions(-)

--
2.33.1
As with __blkdev_direct_IO_simple(), we can implement direct IO more
efficiently if there is only one bio. Add __blkdev_direct_IO_async()
and blkdev_bio_end_io_async(). This patch brings me from 4.45-4.5 MIOPS
with nullblk to 4.7+.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/fops.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 3 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index 396537598e3e..a7b328296912 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -305,6 +305,85 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	return ret;
 }

+static void blkdev_bio_end_io_async(struct bio *bio)
+{
+	struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
+	struct kiocb *iocb = dio->iocb;
+	ssize_t ret;
+
+	if (likely(!bio->bi_status)) {
+		ret = dio->size;
+		iocb->ki_pos += ret;
+	} else {
+		ret = blk_status_to_errno(bio->bi_status);
+	}
+
+	iocb->ki_complete(iocb, ret, 0);
+
+	if (dio->flags & DIO_SHOULD_DIRTY) {
+		bio_check_pages_dirty(bio);
+	} else {
+		bio_release_pages(bio, false);
+		bio_put(bio);
+	}
+}
+
+static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
+					struct iov_iter *iter,
+					unsigned int nr_pages)
+{
+	struct block_device *bdev = iocb->ki_filp->private_data;
+	struct blkdev_dio *dio;
+	struct bio *bio;
+	loff_t pos = iocb->ki_pos;
+	int ret = 0;
+
+	if ((pos | iov_iter_alignment(iter)) &
+	    (bdev_logical_block_size(bdev) - 1))
+		return -EINVAL;
+
+	bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+	dio = container_of(bio, struct blkdev_dio, bio);
+	dio->flags = 0;
+	dio->iocb = iocb;
+	bio_set_dev(bio, bdev);
+	bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+	bio->bi_write_hint = iocb->ki_hint;
+	bio->bi_end_io = blkdev_bio_end_io_async;
+	bio->bi_ioprio = iocb->ki_ioprio;
+
+	ret = bio_iov_iter_get_pages(bio, iter);
+	if (unlikely(ret)) {
+		bio->bi_status = BLK_STS_IOERR;
+		bio_endio(bio);
+		return ret;
+	}
+	dio->size = bio->bi_iter.bi_size;
+
+	if (iov_iter_rw(iter) == READ) {
+		bio->bi_opf = REQ_OP_READ;
+		if (iter_is_iovec(iter)) {
+			dio->flags |= DIO_SHOULD_DIRTY;
+			bio_set_pages_dirty(bio);
+		}
+	} else {
+		bio->bi_opf = dio_bio_write_op(iocb);
+		task_io_account_write(bio->bi_iter.bi_size);
+	}
+
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		bio->bi_opf |= REQ_NOWAIT;
+
+	if (iocb->ki_flags & IOCB_HIPRI) {
+		bio_set_polled(bio, iocb);
+		submit_bio(bio);
+		WRITE_ONCE(iocb->private, bio);
+	} else {
+		submit_bio(bio);
+	}
+	return -EIOCBQUEUED;
+}
+
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	unsigned int nr_pages;
@@ -313,9 +392,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		return 0;

 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
-	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
-		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-
+	if (likely(nr_pages <= BIO_MAX_VECS)) {
+		if (is_sync_kiocb(iocb))
+			return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
+		return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+	}
 	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
 }

--
2.33.1
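For context, the fast path added above is taken by async O_DIRECT requests
that fit in a single bio. A minimal userspace sketch that exercises it via
io_uring could look like the following (device path, request size and the
liburing-based setup are illustrative only, not part of the patch):

/*
 * Illustrative only: one async O_DIRECT read on a block device submitted
 * through io_uring. A request this small maps to a single bio, so it goes
 * through __blkdev_direct_IO_async(). Link with -luring.
 */
#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	void *buf;
	int fd;

	/* /dev/nullb0 is the null_blk device used for the numbers above;
	 * any block device works. O_DIRECT needs an aligned buffer. */
	fd = open("/dev/nullb0", O_RDONLY | O_DIRECT);
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;

	io_uring_queue_init(8, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, 4096, 0);
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);
	printf("read returned %d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	close(fd);
	return 0;
}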
Combine bio_iov_bvec_set() and bio_iov_bvec_set_append() and let the
caller do iov_iter_advance(). Also get rid of __bio_iov_bvec_set(),
which was duplicated in the final binary, and replace a weird
iov_iter_truncate() of a temporary iter copy with min(), better reflecting
the intention.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/bio.c | 37 ++++++++++++++-----------------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 46a87c72d2b4..ead1f8a9ff5e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1046,36 +1046,27 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
 }
 EXPORT_SYMBOL_GPL(__bio_release_pages);

-static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
+static void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 {
+	size_t size = iov_iter_count(iter);
+
 	WARN_ON_ONCE(bio->bi_max_vecs);

+	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+		size_t max_sectors = queue_max_zone_append_sectors(q);
+
+		size = min(size, max_sectors << SECTOR_SHIFT);
+	}
+
 	bio->bi_vcnt = iter->nr_segs;
 	bio->bi_io_vec = (struct bio_vec *)iter->bvec;
 	bio->bi_iter.bi_bvec_done = iter->iov_offset;
-	bio->bi_iter.bi_size = iter->count;
+	bio->bi_iter.bi_size = size;
 	bio_set_flag(bio, BIO_NO_PAGE_REF);
 	bio_set_flag(bio, BIO_CLONED);
 }

-static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
-{
-	__bio_iov_bvec_set(bio, iter);
-	iov_iter_advance(iter, iter->count);
-	return 0;
-}
-
-static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
-{
-	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	struct iov_iter i = *iter;
-
-	iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9);
-	__bio_iov_bvec_set(bio, &i);
-	iov_iter_advance(iter, i.count);
-	return 0;
-}
-
 static void bio_put_pages(struct page **pages, size_t size, size_t off)
 {
 	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
@@ -1217,9 +1208,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	int ret = 0;

 	if (iov_iter_is_bvec(iter)) {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
-			return bio_iov_bvec_set_append(bio, iter);
-		return bio_iov_bvec_set(bio, iter);
+		bio_iov_bvec_set(bio, iter);
+		iov_iter_advance(iter, bio->bi_iter.bi_size);
+		return 0;
 	}

 	do {
--
2.33.1
Nobody cares about the iov iterator's state if we return -EIOCBQUEUED,
so now that we have __blkdev_direct_IO_async(), which gets pages only
once, we can skip the expensive iov_iter_advance(). It's around 1-2% of
all CPU spent.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/bio.c         |  2 +-
 block/fops.c        | 20 +++++++++++++++-----
 include/linux/bio.h |  1 +
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index ead1f8a9ff5e..15ab0d6d1c06 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1046,7 +1046,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
 }
 EXPORT_SYMBOL_GPL(__bio_release_pages);

-static void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 {
 	size_t size = iov_iter_count(iter);

diff --git a/block/fops.c b/block/fops.c
index a7b328296912..8800b0ad5c29 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -352,11 +352,21 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	bio->bi_end_io = blkdev_bio_end_io_async;
 	bio->bi_ioprio = iocb->ki_ioprio;

-	ret = bio_iov_iter_get_pages(bio, iter);
-	if (unlikely(ret)) {
-		bio->bi_status = BLK_STS_IOERR;
-		bio_endio(bio);
-		return ret;
+	if (!iov_iter_is_bvec(iter)) {
+		ret = bio_iov_iter_get_pages(bio, iter);
+		if (unlikely(ret)) {
+			bio->bi_status = BLK_STS_IOERR;
+			bio_endio(bio);
+			return ret;
+		}
+	} else {
+		/*
+		 * Users don't rely on the iterator being in any particular
+		 * state for async I/O returning -EIOCBQUEUED, hence we can
+		 * avoid expensive iov_iter_advance(). Bypass
+		 * bio_iov_iter_get_pages() and set the bvec directly.
+		 */
+		bio_iov_bvec_set(bio, iter);
 	}
 	dio->size = bio->bi_iter.bi_size;

diff --git a/include/linux/bio.h b/include/linux/bio.h
index c88700d1bdc3..fe6bdfbbef66 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -417,6 +417,7 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
 void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
--
2.33.1
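For context, the bvec branch above is what io_uring registered (fixed)
buffers produce: the pages are pinned once at registration time and each
request then reaches the block layer as a bvec iterator. A rough userspace
sketch of an I/O that would take this branch (device path and sizes are
illustrative, not part of the patch):

/*
 * Illustrative only: an async O_DIRECT read from an io_uring registered
 * buffer. Fixed buffers are handed to the block layer as ITER_BVEC, i.e.
 * the iov_iter_is_bvec() case that now bypasses bio_iov_iter_get_pages().
 */
#include <liburing.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;
	int fd;

	fd = open("/dev/nullb0", O_RDONLY | O_DIRECT);
	if (fd < 0 || posix_memalign(&iov.iov_base, 4096, 4096))
		return 1;
	iov.iov_len = 4096;

	io_uring_queue_init(8, &ring, 0);
	/* Pin the buffer once; subsequent fixed reads/writes referencing
	 * it skip per-I/O page lookup and arrive as a bvec iter. */
	io_uring_register_buffers(&ring, &iov, 1);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_fixed(sqe, fd, iov.iov_base, 4096, 0, 0);
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	close(fd);
	return 0;
}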
With addition of __blkdev_direct_IO_async(), __blkdev_direct_IO() now
serves only multi-bio I/O, which we don't poll. Now we can remove
anything related to I/O polling from it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/fops.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index 8800b0ad5c29..997904963a9d 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -190,7 +190,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	struct blk_plug plug;
 	struct blkdev_dio *dio;
 	struct bio *bio;
-	bool do_poll = (iocb->ki_flags & IOCB_HIPRI);
 	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
@@ -216,12 +215,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	if (is_read && iter_is_iovec(iter))
 		dio->flags |= DIO_SHOULD_DIRTY;

-	/*
-	 * Don't plug for HIPRI/polled IO, as those should go straight
-	 * to issue
-	 */
-	if (!(iocb->ki_flags & IOCB_HIPRI))
-		blk_start_plug(&plug);
+	blk_start_plug(&plug);

 	for (;;) {
 		bio_set_dev(bio, bdev);
@@ -254,11 +248,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,

 		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
 		if (!nr_pages) {
-			if (do_poll)
-				bio_set_polled(bio, iocb);
 			submit_bio(bio);
-			if (do_poll)
-				WRITE_ONCE(iocb->private, bio);
 			break;
 		}
 		if (!(dio->flags & DIO_MULTI_BIO)) {
@@ -271,7 +261,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 			bio_get(bio);
 			dio->flags |= DIO_MULTI_BIO;
 			atomic_set(&dio->ref, 2);
-			do_poll = false;
 		} else {
 			atomic_inc(&dio->ref);
 		}
@@ -280,8 +269,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		bio = bio_alloc(GFP_KERNEL, nr_pages);
 	}

-	if (!(iocb->ki_flags & IOCB_HIPRI))
-		blk_finish_plug(&plug);
+	blk_finish_plug(&plug);

 	if (!is_sync)
 		return -EIOCBQUEUED;
@@ -290,9 +278,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (!READ_ONCE(dio->waiter))
 			break;
-
-		if (!do_poll || !bio_poll(bio, NULL, 0))
-			blk_io_schedule();
+		blk_io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);

--
2.33.1
If we know that an iocb is async we can optimise bio_set_polled() a bit;
add a new helper bio_set_polled_async().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 block/fops.c        | 7 +++----
 include/linux/bio.h | 5 +++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index 997904963a9d..9cb436de92bb 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -367,14 +367,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		task_io_account_write(bio->bi_iter.bi_size);
 	}

-	if (iocb->ki_flags & IOCB_NOWAIT)
-		bio->bi_opf |= REQ_NOWAIT;
-
 	if (iocb->ki_flags & IOCB_HIPRI) {
-		bio_set_polled(bio, iocb);
+		bio_set_polled_async(bio, iocb);
 		submit_bio(bio);
 		WRITE_ONCE(iocb->private, bio);
 	} else {
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			bio->bi_opf |= REQ_NOWAIT;
 		submit_bio(bio);
 	}
 	return -EIOCBQUEUED;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index fe6bdfbbef66..b64161473f3e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -738,6 +738,11 @@ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
 		bio->bi_opf |= REQ_NOWAIT;
 }

+static inline void bio_set_polled_async(struct bio *bio, struct kiocb *kiocb)
+{
+	bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
+}
+
 struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);

 #endif /* __LINUX_BIO_H */
--
2.33.1
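For context, IOCB_HIPRI is what an io_uring ring created with
IORING_SETUP_IOPOLL passes down for O_DIRECT block I/O, i.e. the branch
that now uses bio_set_polled_async(). A sketch of the only setup
difference versus the earlier example (assumes the device actually has
poll queues configured, e.g. nvme loaded with poll_queues > 0):

/*
 * Illustrative only: the same single read as in the earlier sketch, but
 * on a polled ring. With IORING_SETUP_IOPOLL the kernel submits the bio
 * with IOCB_HIPRI/REQ_POLLED and reaps it by polling rather than by IRQ.
 */
#include <liburing.h>

static int setup_polled_ring(struct io_uring *ring)
{
	/* Completions must then be harvested via io_uring_wait_cqe() or
	 * io_uring_peek_cqe(), which drive the device poll loop. */
	return io_uring_queue_init(8, ring, IORING_SETUP_IOPOLL);
}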
On 10/23/21 17:21, Pavel Begunkov wrote:
> With addition of __blkdev_direct_IO_async(), __blkdev_direct_IO() now
> serves only multi-bio I/O, which we don't poll. Now we can remove
> anything related to I/O polling from it.
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
> block/fops.c | 20 +++-----------------
> 1 file changed, 3 insertions(+), 17 deletions(-)
>
> diff --git a/block/fops.c b/block/fops.c
> index 8800b0ad5c29..997904963a9d 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -190,7 +190,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
> struct blk_plug plug;
> struct blkdev_dio *dio;
> struct bio *bio;
> - bool do_poll = (iocb->ki_flags & IOCB_HIPRI);
> bool is_read = (iov_iter_rw(iter) == READ), is_sync;
> loff_t pos = iocb->ki_pos;
> int ret = 0;
> @@ -216,12 +215,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
> if (is_read && iter_is_iovec(iter))
> dio->flags |= DIO_SHOULD_DIRTY;
>
> - /*
> - * Don't plug for HIPRI/polled IO, as those should go straight
> - * to issue
> - */
> - if (!(iocb->ki_flags & IOCB_HIPRI))
> - blk_start_plug(&plug);
I'm not sure, do we want to leave it conditional here?
--
Pavel Begunkov
On 10/23/21 10:46 AM, Pavel Begunkov wrote:
> On 10/23/21 17:21, Pavel Begunkov wrote:
>> With addition of __blkdev_direct_IO_async(), __blkdev_direct_IO() now
>> serves only multi-bio I/O, which we don't poll. Now we can remove
>> anything related to I/O polling from it.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> ---
>> block/fops.c | 20 +++-----------------
>> 1 file changed, 3 insertions(+), 17 deletions(-)
>>
>> diff --git a/block/fops.c b/block/fops.c
>> index 8800b0ad5c29..997904963a9d 100644
>> --- a/block/fops.c
>> +++ b/block/fops.c
>> @@ -190,7 +190,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>> struct blk_plug plug;
>> struct blkdev_dio *dio;
>> struct bio *bio;
>> - bool do_poll = (iocb->ki_flags & IOCB_HIPRI);
>> bool is_read = (iov_iter_rw(iter) == READ), is_sync;
>> loff_t pos = iocb->ki_pos;
>> int ret = 0;
>> @@ -216,12 +215,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>> if (is_read && iter_is_iovec(iter))
>> dio->flags |= DIO_SHOULD_DIRTY;
>>
>> - /*
>> - * Don't plug for HIPRI/polled IO, as those should go straight
>> - * to issue
>> - */
>> - if (!(iocb->ki_flags & IOCB_HIPRI))
>> - blk_start_plug(&plug);
>
> I'm not sure, do we want to leave it conditional here?
For async polled there's only one user and that user plugs already...
--
Jens Axboe
On 10/24/21 16:09, Jens Axboe wrote:
> On 10/23/21 10:46 AM, Pavel Begunkov wrote:
>> On 10/23/21 17:21, Pavel Begunkov wrote:
>>> With addition of __blkdev_direct_IO_async(), __blkdev_direct_IO() now
>>> serves only multi-bio I/O, which we don't poll. Now we can remove
>>> anything related to I/O polling from it.
>>>
>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>> ---
>>> block/fops.c | 20 +++-----------------
>>> 1 file changed, 3 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/block/fops.c b/block/fops.c
>>> index 8800b0ad5c29..997904963a9d 100644
>>> --- a/block/fops.c
>>> +++ b/block/fops.c
>>> @@ -190,7 +190,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>>> struct blk_plug plug;
>>> struct blkdev_dio *dio;
>>> struct bio *bio;
>>> - bool do_poll = (iocb->ki_flags & IOCB_HIPRI);
>>> bool is_read = (iov_iter_rw(iter) == READ), is_sync;
>>> loff_t pos = iocb->ki_pos;
>>> int ret = 0;
>>> @@ -216,12 +215,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>>> if (is_read && iter_is_iovec(iter))
>>> dio->flags |= DIO_SHOULD_DIRTY;
>>>
>>> - /*
>>> - * Don't plug for HIPRI/polled IO, as those should go straight
>>> - * to issue
>>> - */
>>> - if (!(iocb->ki_flags & IOCB_HIPRI))
>>> - blk_start_plug(&plug);
>>
>> I'm not sure, do we want to leave it conditional here?
>
> For async polled there's only one user and that user plug already...
It's __blkdev_direct_IO() though, covers both sync and async
--
Pavel Begunkov
On Oct 24, 2021, at 12:18 PM, Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 10/24/21 16:09, Jens Axboe wrote:
>>> On 10/23/21 10:46 AM, Pavel Begunkov wrote:
>>> On 10/23/21 17:21, Pavel Begunkov wrote:
>>>> With addition of __blkdev_direct_IO_async(), __blkdev_direct_IO() now
>>>> serves only multi-bio I/O, which we don't poll. Now we can remove
>>>> anything related to I/O polling from it.
>>>>
>>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>>> ---
>>>> block/fops.c | 20 +++-----------------
>>>> 1 file changed, 3 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/block/fops.c b/block/fops.c
>>>> index 8800b0ad5c29..997904963a9d 100644
>>>> --- a/block/fops.c
>>>> +++ b/block/fops.c
>>>> @@ -190,7 +190,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>>>> struct blk_plug plug;
>>>> struct blkdev_dio *dio;
>>>> struct bio *bio;
>>>> - bool do_poll = (iocb->ki_flags & IOCB_HIPRI);
>>>> bool is_read = (iov_iter_rw(iter) == READ), is_sync;
>>>> loff_t pos = iocb->ki_pos;
>>>> int ret = 0;
>>>> @@ -216,12 +215,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>>>> if (is_read && iter_is_iovec(iter))
>>>> dio->flags |= DIO_SHOULD_DIRTY;
>>>> - /*
>>>> - * Don't plug for HIPRI/polled IO, as those should go straight
>>>> - * to issue
>>>> - */
>>>> - if (!(iocb->ki_flags & IOCB_HIPRI))
>>>> - blk_start_plug(&plug);
>>>
>>> I'm not sure, do we want to leave it conditional here?
>> For async polled there's only one user and that user plugs already...
>
> It's __blkdev_direct_IO() though, covers both sync and async
Pointless to plug for sync, though.
On Sat, Oct 23, 2021 at 05:21:31PM +0100, Pavel Begunkov wrote:
> I don't find a good way to deduplicate __blkdev_direct_IO_async(),
> there are small differences in implementation. If that's fine I'd
> suggest doing it afterwards, anyway I want to brush up
> __blkdev_direct_IO(), e.g. remove effectively unnecessary DIO_MULTI_BIO
> flag.
If we add another code path we should at least drop the DIO_MULTI_BIO
code to keep the complexity down in that function.
On Sat, Oct 23, 2021 at 05:21:33PM +0100, Pavel Begunkov wrote:
> Combine bio_iov_bvec_set() and bio_iov_bvec_set_append() and let the
> caller do iov_iter_advance(). Also get rid of __bio_iov_bvec_set(),
> which was duplicated in the final binary, and replace a weird
> iov_iter_truncate() of a temporary iter copy with min(), better reflecting
> the intention.
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
On Sat, Oct 23, 2021 at 05:21:34PM +0100, Pavel Begunkov wrote:
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -352,11 +352,21 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
>  	bio->bi_end_io = blkdev_bio_end_io_async;
>  	bio->bi_ioprio = iocb->ki_ioprio;
>
> -	ret = bio_iov_iter_get_pages(bio, iter);
> -	if (unlikely(ret)) {
> -		bio->bi_status = BLK_STS_IOERR;
> -		bio_endio(bio);
> -		return ret;
> +	if (!iov_iter_is_bvec(iter)) {
> +		ret = bio_iov_iter_get_pages(bio, iter);
> +		if (unlikely(ret)) {
> +			bio->bi_status = BLK_STS_IOERR;
> +			bio_endio(bio);
> +			return ret;
> +		}

Nit: I generally find it much nicer to read if simple if statements
don't use pointless negations.

> +	} else {
> +		/*
> +		 * Users don't rely on the iterator being in any particular
> +		 * state for async I/O returning -EIOCBQUEUED, hence we can
> +		 * avoid expensive iov_iter_advance(). Bypass
> +		 * bio_iov_iter_get_pages() and set the bvec directly.
> +		 */
> +		bio_iov_bvec_set(bio, iter);

So if this optimization is so useful, please also do it for
non-bvec iov_iters, which is what 99% of the applications actually
use.
On Sat, Oct 23, 2021 at 05:21:35PM +0100, Pavel Begunkov wrote:
> With addition of __blkdev_direct_IO_async(), __blkdev_direct_IO() now
> serves only multi-bio I/O, which we don't poll. Now we can remove
> anything related to I/O polling from it.
Looks good, but please also remove the entire non-multi bio support
while you're at it.
On Sat, Oct 23, 2021 at 05:21:36PM +0100, Pavel Begunkov wrote:
> +static inline void bio_set_polled_async(struct bio *bio, struct kiocb *kiocb)
> +{
> + bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
> +}
As mentioned last time I'm a little skeptical of this optimization. But
> if you and Jens think it is worth it, just drop this helper and assign
the flags directly.
On 10/25/21 08:35, Christoph Hellwig wrote:
> On Sat, Oct 23, 2021 at 05:21:35PM +0100, Pavel Begunkov wrote:
>> With addition of __blkdev_direct_IO_async(), __blkdev_direct_IO() now
>> serves only multi-bio I/O, which we don't poll. Now we can remove
>> anything related to I/O polling from it.
>
> Looks good, but please also remove the entire non-multi bio support
> while you're at it.
ok, will include into the series
--
Pavel Begunkov
On 10/25/21 08:36, Christoph Hellwig wrote:
> On Sat, Oct 23, 2021 at 05:21:36PM +0100, Pavel Begunkov wrote:
>> +static inline void bio_set_polled_async(struct bio *bio, struct kiocb *kiocb)
>> +{
>> + bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
>> +}
>
> As mentioned last time I'm a little skeptical of this optimization. But
> if you and Jens think it is worth it, just drop this helper and assign
> the flags directly.
Ok. I'll keep it last in the series; it looks like Jens doesn't mind
applying patchsets partially.
--
Pavel Begunkov
On 10/25/21 11:12, Pavel Begunkov wrote:
> On 10/25/21 08:35, Christoph Hellwig wrote:
>> On Sat, Oct 23, 2021 at 05:21:35PM +0100, Pavel Begunkov wrote:
>>> With addition of __blkdev_direct_IO_async(), __blkdev_direct_IO() now
>>> serves only multi-bio I/O, which we don't poll. Now we can remove
>>> anything related to I/O polling from it.
>>
>> Looks good, but please also remove the entire non-multi bio support
>> while you're at it.
>
> ok, will include into the series
You mean simplifying __blkdev_direct_IO(), right? E.g. as mentioned
removing DIO_MULTI_BIO.
There is also some potential for doing bio_iov_vecs_to_alloc()
only once and doing some refactoring on top of it, but that would
need extra legwork and is better done separately.
--
Pavel Begunkov
On 10/25/21 08:33, Christoph Hellwig wrote:
> On Sat, Oct 23, 2021 at 05:21:34PM +0100, Pavel Begunkov wrote:
>> --- a/block/fops.c
>> +++ b/block/fops.c
>> @@ -352,11 +352,21 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
>> bio->bi_end_io = blkdev_bio_end_io_async;
>> bio->bi_ioprio = iocb->ki_ioprio;
>>
>> - ret = bio_iov_iter_get_pages(bio, iter);
>> - if (unlikely(ret)) {
>> - bio->bi_status = BLK_STS_IOERR;
>> - bio_endio(bio);
>> - return ret;
>> + if (!iov_iter_is_bvec(iter)) {
>> + ret = bio_iov_iter_get_pages(bio, iter);
>> + if (unlikely(ret)) {
>> + bio->bi_status = BLK_STS_IOERR;
>> + bio_endio(bio);
>> + return ret;
>> + }
>
> Nit: I generally find it much nicer to read if simple if statements
> don't use pointless negations.
>
>> + } else {
>> + /*
>> + * Users don't rely on the iterator being in any particular
>> + * state for async I/O returning -EIOCBQUEUED, hence we can
>> + * avoid expensive iov_iter_advance(). Bypass
>> + * bio_iov_iter_get_pages() and set the bvec directly.
>> + */
>> + bio_iov_bvec_set(bio, iter);
>
> So if this optimization is so useful, please also do it for
> non-bvec iov_iters, which is what 99% of the applications actually
> use.
It's an async path, so mainly io_uring or aio. I don't think there
is much profit in doing that for iovecs, especially since it would be
hidden behind the iov -> bvec translation with page referencing.
It could be done nonetheless, but everything I come up with looks too
ugly because of the loop inside bio_iov_iter_get_pages(). I don't
think the sketch below is viable; any better ideas?
ssize_t __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
	...
	/* not advancing */
	return size;
}
do {
	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
		ret = __bio_iov_append_get_pages(bio, iter);
	else
		ret = __bio_iov_iter_get_pages(bio, iter);
	if (ret < 0)
		break;
	iov_iter_advance(iter, ret);
} while (iov_iter_count(iter) && !bio_full(bio, 0));
and copy-paste into fops.c:
do {
	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
		ret = __bio_iov_append_get_pages(bio, iter);
	else
		ret = __bio_iov_iter_get_pages(bio, iter);
	if (ret < 0 || iov_iter_count(iter) == ret || bio_full(bio, 0))
		break;
	iov_iter_advance(iter, ret);
} while (1);
--
Pavel Begunkov
On Sat, 23 Oct 2021 17:21:31 +0100, Pavel Begunkov wrote:
> Another split off from the "block optimisation round" series focusing
> on fops.c. It addresses Christoph's review, the main change is
> setting bvec directly instead of passing a "hint"
> to bio_iov_iter_get_pages() in 3/5, and a prep patch 2/5.
>
> I don't find a good way to deduplicate __blkdev_direct_IO_async(),
> there are small differences in implementation. If that's fine I'd
> suggest doing it afterwards, anyway I want to brush up
> __blkdev_direct_IO(), e.g. remove effectively unnecessary DIO_MULTI_BIO
> flag.
>
> [...]
Applied, thanks!
[1/5] block: add single bio async direct IO helper
(no commit info)
[2/5] block: refactor bio_iov_bvec_set()
(no commit info)
Best regards,
--
Jens Axboe
On Mon, Oct 25, 2021 at 11:27:12AM +0100, Pavel Begunkov wrote:
> On 10/25/21 11:12, Pavel Begunkov wrote:
> > On 10/25/21 08:35, Christoph Hellwig wrote:
> > > On Sat, Oct 23, 2021 at 05:21:35PM +0100, Pavel Begunkov wrote:
> > > > With addition of __blkdev_direct_IO_async(), __blkdev_direct_IO() now
> > > > serves only multi-bio I/O, which we don't poll. Now we can remove
> > > > anything related to I/O polling from it.
> > >
> > > Looks good, but please also remove the entire non-multi bio support
> > > while you're at it.
> >
> > ok, will include into the series
>
> You mean simplifying __blkdev_direct_IO(), right? E.g. as mentioned
> removing DIO_MULTI_BIO.
Yes.