* [PATCH 1/3] fs: pass iocb to do_generic_file_read
2017-06-29 21:25 non-blocking buffered reads Christoph Hellwig
@ 2017-06-29 21:25 ` Christoph Hellwig
2017-06-29 21:25 ` [PATCH 2/3] fs: support IOCB_NOWAIT in generic_file_buffered_read Christoph Hellwig
` (4 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2017-06-29 21:25 UTC (permalink / raw)
To: viro, axboe
Cc: Milosz Tanski, Goldwyn Rodrigues, mgorman, Volker.Lendecke,
linux-fsdevel, linux-block
And rename it to the more descriptive generic_file_buffered_read while
at it.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
mm/filemap.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 742034e56100..3df0a57cd48e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1792,9 +1792,8 @@ static void shrink_readahead_size_eio(struct file *filp,
}
/**
- * do_generic_file_read - generic file read routine
- * @filp: the file to read
- * @ppos: current file position
+ * generic_file_buffered_read - generic file read routine
+ * @iocb: the iocb to read
* @iter: data destination
* @written: already copied
*
@@ -1804,12 +1803,14 @@ static void shrink_readahead_size_eio(struct file *filp,
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
-static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
+static ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
{
+ struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &filp->f_ra;
+ loff_t *ppos = &iocb->ki_pos;
pgoff_t index;
pgoff_t last_index;
pgoff_t prev_index;
@@ -2057,14 +2058,14 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct file *file = iocb->ki_filp;
- ssize_t retval = 0;
size_t count = iov_iter_count(iter);
+ ssize_t retval = 0;
if (!count)
goto out; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
+ struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
loff_t size;
@@ -2105,7 +2106,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out;
}
- retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);
+ retval = generic_file_buffered_read(iocb, iter, retval);
out:
return retval;
}
--
2.11.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 2/3] fs: support IOCB_NOWAIT in generic_file_buffered_read
2017-06-29 21:25 non-blocking buffered reads Christoph Hellwig
2017-06-29 21:25 ` [PATCH 1/3] fs: pass iocb to do_generic_file_read Christoph Hellwig
@ 2017-06-29 21:25 ` Christoph Hellwig
2017-06-29 21:25 ` [PATCH 3/3] fs: support RWF_NOWAIT for buffered reads Christoph Hellwig
` (3 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2017-06-29 21:25 UTC (permalink / raw)
To: viro, axboe
Cc: Milosz Tanski, Goldwyn Rodrigues, mgorman, Volker.Lendecke,
linux-fsdevel, linux-block
From: Milosz Tanski <milosz@adfin.com>
Allow generic_file_buffered_read to bail out early instead of waiting for
the page lock or reading a page if IOCB_NOWAIT is specified.
Signed-off-by: Milosz Tanski <milosz@adfin.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Sage Weil <sage@redhat.com>
---
mm/filemap.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/mm/filemap.c b/mm/filemap.c
index 3df0a57cd48e..d021931c8e74 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1843,6 +1843,8 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
page = find_get_page(mapping, index);
if (!page) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
@@ -1948,6 +1950,11 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
continue;
page_not_up_to_date:
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ put_page(page);
+ goto would_block;
+ }
+
/* Get exclusive access to the page ... */
error = lock_page_killable(page);
if (unlikely(error))
@@ -1967,6 +1974,12 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
goto page_ok;
}
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ unlock_page(page);
+ put_page(page);
+ goto would_block;
+ }
+
readpage:
/*
* A previous I/O error may have been due to temporary
@@ -2037,6 +2050,8 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
goto readpage;
}
+would_block:
+ error = -EAGAIN;
out:
ra->prev_pos = prev_index;
ra->prev_pos <<= PAGE_SHIFT;
--
2.11.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 3/3] fs: support RWF_NOWAIT for buffered reads
2017-06-29 21:25 non-blocking buffered reads Christoph Hellwig
2017-06-29 21:25 ` [PATCH 1/3] fs: pass iocb to do_generic_file_read Christoph Hellwig
2017-06-29 21:25 ` [PATCH 2/3] fs: support IOCB_NOWAIT in generic_file_buffered_read Christoph Hellwig
@ 2017-06-29 21:25 ` Christoph Hellwig
2017-06-30 3:43 ` Goldwyn Rodrigues
2017-06-29 23:46 ` non-blocking " Al Viro
` (2 subsequent siblings)
5 siblings, 1 reply; 9+ messages in thread
From: Christoph Hellwig @ 2017-06-29 21:25 UTC (permalink / raw)
To: viro, axboe
Cc: Milosz Tanski, Goldwyn Rodrigues, mgorman, Volker.Lendecke,
linux-fsdevel, linux-block
This is based on the old idea and code from Milosz Tanski. With the
aio nowait code it becomes mostly trivial now.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/aio.c | 6 ------
fs/btrfs/file.c | 2 +-
fs/ext4/file.c | 6 +++---
fs/xfs/xfs_file.c | 11 +++++++++--
include/linux/fs.h | 6 +++---
5 files changed, 16 insertions(+), 15 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index dcad3a66748c..d93daa076726 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1593,12 +1593,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
goto out_put_req;
}
- if ((req->common.ki_flags & IOCB_NOWAIT) &&
- !(req->common.ki_flags & IOCB_DIRECT)) {
- ret = -EOPNOTSUPP;
- goto out_put_req;
- }
-
ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
if (unlikely(ret)) {
pr_debug("EFAULT: aio_key\n");
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 59e2dccdf75b..f82dad241b39 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3088,7 +3088,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
- filp->f_mode |= FMODE_AIO_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT;
return generic_file_open(inode, filp);
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 58e2eeaa0bc4..5330ea99681a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -223,6 +223,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
+ if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
+ return -EAGAIN;
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -455,9 +457,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
return ret;
}
- /* Set the flags to support nowait AIO */
- filp->f_mode |= FMODE_AIO_NOWAIT;
-
+ filp->f_mode |= FMODE_NOWAIT;
return dquot_file_open(inode, filp);
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 17f27a2fb5e2..1068f0b01090 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -259,7 +259,11 @@ xfs_file_buffered_aio_read(
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ }
ret = generic_file_read_iter(iocb, to);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -636,6 +640,9 @@ xfs_file_buffered_aio_write(
int enospc = 0;
int iolock;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
write_retry:
iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
@@ -911,7 +918,7 @@ xfs_file_open(
return -EFBIG;
if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_AIO_NOWAIT;
+ file->f_mode |= FMODE_NOWAIT;
return 0;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 65adbddb3163..a78834bedb61 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -144,8 +144,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
-/* File is capable of returning -EAGAIN if AIO will block */
-#define FMODE_AIO_NOWAIT ((__force fmode_t)0x8000000)
+/* File is capable of returning -EAGAIN if I/O will block */
+#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
/*
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
@@ -3092,7 +3092,7 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
return -EOPNOTSUPP;
if (flags & RWF_NOWAIT) {
- if (!(ki->ki_filp->f_mode & FMODE_AIO_NOWAIT))
+ if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
return -EOPNOTSUPP;
ki->ki_flags |= IOCB_NOWAIT;
}
--
2.11.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 3/3] fs: support RWF_NOWAIT for buffered reads
2017-06-29 21:25 ` [PATCH 3/3] fs: support RWF_NOWAIT for buffered reads Christoph Hellwig
@ 2017-06-30 3:43 ` Goldwyn Rodrigues
0 siblings, 0 replies; 9+ messages in thread
From: Goldwyn Rodrigues @ 2017-06-30 3:43 UTC (permalink / raw)
To: Christoph Hellwig, viro, axboe
Cc: Milosz Tanski, mgorman, Volker.Lendecke, linux-fsdevel, linux-block
On 06/29/2017 04:25 PM, Christoph Hellwig wrote:
> This is based on the old idea and code from Milosz Tanski. With the
> aio nowait code it becomes mostly trivial now.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/aio.c | 6 ------
> fs/btrfs/file.c | 2 +-
> fs/ext4/file.c | 6 +++---
> fs/xfs/xfs_file.c | 11 +++++++++--
> include/linux/fs.h | 6 +++---
> 5 files changed, 16 insertions(+), 15 deletions(-)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index dcad3a66748c..d93daa076726 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1593,12 +1593,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
> goto out_put_req;
> }
>
> - if ((req->common.ki_flags & IOCB_NOWAIT) &&
> - !(req->common.ki_flags & IOCB_DIRECT)) {
> - ret = -EOPNOTSUPP;
> - goto out_put_req;
> - }
> -
> ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
> if (unlikely(ret)) {
> pr_debug("EFAULT: aio_key\n");
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 59e2dccdf75b..f82dad241b39 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -3088,7 +3088,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
>
> static int btrfs_file_open(struct inode *inode, struct file *filp)
> {
> - filp->f_mode |= FMODE_AIO_NOWAIT;
> + filp->f_mode |= FMODE_NOWAIT;
> return generic_file_open(inode, filp);
> }
>
We should modify the NOWAIT check in btrfs_file_write_iter() to return
-EAGAIN for buffered writes.
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 58e2eeaa0bc4..5330ea99681a 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -223,6 +223,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
> if (IS_DAX(inode))
> return ext4_dax_write_iter(iocb, from);
> #endif
> + if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
> + return -EAGAIN;
>
> if (!inode_trylock(inode)) {
> if (iocb->ki_flags & IOCB_NOWAIT)
> @@ -455,9 +457,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
> return ret;
> }
>
> - /* Set the flags to support nowait AIO */
> - filp->f_mode |= FMODE_AIO_NOWAIT;
> -
> + filp->f_mode |= FMODE_NOWAIT;
> return dquot_file_open(inode, filp);
> }
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 17f27a2fb5e2..1068f0b01090 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -259,7 +259,11 @@ xfs_file_buffered_aio_read(
>
> trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
>
> - xfs_ilock(ip, XFS_IOLOCK_SHARED);
> + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + return -EAGAIN;
> + xfs_ilock(ip, XFS_IOLOCK_SHARED);
> + }
> ret = generic_file_read_iter(iocb, to);
> xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>
> @@ -636,6 +640,9 @@ xfs_file_buffered_aio_write(
> int enospc = 0;
> int iolock;
>
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + return -EAGAIN;
> +
> write_retry:
> iolock = XFS_IOLOCK_EXCL;
> xfs_ilock(ip, iolock);
> @@ -911,7 +918,7 @@ xfs_file_open(
> return -EFBIG;
> if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
> return -EIO;
> - file->f_mode |= FMODE_AIO_NOWAIT;
> + file->f_mode |= FMODE_NOWAIT;
> return 0;
> }
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 65adbddb3163..a78834bedb61 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -144,8 +144,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> /* File was opened by fanotify and shouldn't generate fanotify events */
> #define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
>
> -/* File is capable of returning -EAGAIN if AIO will block */
> -#define FMODE_AIO_NOWAIT ((__force fmode_t)0x8000000)
> +/* File is capable of returning -EAGAIN if I/O will block */
> +#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
>
> /*
> * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> @@ -3092,7 +3092,7 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
> return -EOPNOTSUPP;
>
> if (flags & RWF_NOWAIT) {
> - if (!(ki->ki_filp->f_mode & FMODE_AIO_NOWAIT))
> + if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
> return -EOPNOTSUPP;
> ki->ki_flags |= IOCB_NOWAIT;
> }
>
--
Goldwyn
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: non-blocking buffered reads
2017-06-29 21:25 non-blocking buffered reads Christoph Hellwig
` (2 preceding siblings ...)
2017-06-29 21:25 ` [PATCH 3/3] fs: support RWF_NOWAIT for buffered reads Christoph Hellwig
@ 2017-06-29 23:46 ` Al Viro
2017-06-30 0:34 ` Christoph Hellwig
2017-06-30 1:11 ` Milosz Tanski
2017-06-30 1:12 ` Milosz Tanski
5 siblings, 1 reply; 9+ messages in thread
From: Al Viro @ 2017-06-29 23:46 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe, Milosz Tanski, Goldwyn Rodrigues, mgorman,
Volker.Lendecke, linux-fsdevel, linux-block
On Thu, Jun 29, 2017 at 02:25:00PM -0700, Christoph Hellwig wrote:
> This series resurrects the old patches from Milosz to implement
> non-blocking buffered reads. Thanks to the non-blocking AIO code from
> Goldwyn the implementation becomes pretty much trivial. As that
> implementation is in the block tree I would suggest that we merge
> these patches through the block tree as well. I've also forward
> ported the test Milosz sent for recent xfsprogs to verify it works
> properly, but I'll still have to address the review comments for it.
> I'll also volunteer to work with Goldwyn to properly document the
> RWF_NOWAIT flag in the man page including this change.
Hmm... It's not quite non-blocking, though - copy_page_to_iter() can
bloody well block, as any copying to userland. I'm not saying that
it's a significant problem (if you are reading into something that
had been mmaped from NFS server, etc., you are getting what you'd
asked for), but it's worth documenting.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: non-blocking buffered reads
2017-06-29 23:46 ` non-blocking " Al Viro
@ 2017-06-30 0:34 ` Christoph Hellwig
0 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2017-06-30 0:34 UTC (permalink / raw)
To: Al Viro
Cc: Christoph Hellwig, axboe, Milosz Tanski, Goldwyn Rodrigues,
mgorman, Volker.Lendecke, linux-fsdevel, linux-block
On Fri, Jun 30, 2017 at 12:46:57AM +0100, Al Viro wrote:
> Hmm... It's not quite non-blocking, though - copy_page_to_iter() can
> bloody well block, as any copying to userland. I'm not saying that
> it's a significant problem (if you are reading into something that
> had been mmaped from NFS server, etc., you are getting what you'd
> asked for), but it's worth documenting.
True. I'll ensure this is documented in the man page.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: non-blocking buffered reads
2017-06-29 21:25 non-blocking buffered reads Christoph Hellwig
` (3 preceding siblings ...)
2017-06-29 23:46 ` non-blocking " Al Viro
@ 2017-06-30 1:11 ` Milosz Tanski
2017-06-30 1:12 ` Milosz Tanski
5 siblings, 0 replies; 9+ messages in thread
From: Milosz Tanski @ 2017-06-30 1:11 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Al Viro, Jens Axboe, Goldwyn Rodrigues, Mel Gorman,
Volker Lendecke, linux-fsdevel, linux-block
[-- Attachment #1: Type: text/plain, Size: 5333 bytes --]
On Thu, Jun 29, 2017 at 5:25 PM, Christoph Hellwig <hch@lst.de> wrote:
> This series resurrects the old patches from Milosz to implement
> non-blocking buffered reads. Thanks to the non-blocking AIO code from
> Goldwyn the implementation becomes pretty much trivial. As that
> implementation is in the block tree I would suggest that we merge
> these patches through the block tree as well. I've also forward
> ported the test Milosz sent for recent xfsprogs to verify it works
> properly, but I'll still have to address the review comments for it.
> I'll also volunteer to work with Goldwyn to properly document the
> RWF_NOWAIT flag in the man page including this change.
>
I had updated patches for the man pages; so I'll check tomorrow if I can dig
up the changes and I'll forward them on.
>
> Here are additional details from the original cover letter from Milosz,
> where the flag was still called RWF_NONBLOCK:
>
>
> Background:
>
> Using a threadpool to emulate non-blocking operations on regular buffered
> files is a common pattern today (samba, libuv, etc...) Applications split
> the
> work between network bound threads (epoll) and IO threadpool. Not every
> application can use sendfile syscall (TLS / post-processing).
>
> This common pattern leads to increased request latency. Latency can be
> due to
> additional synchronization between the threads or fast (cached data)
> request
> stuck behind slow request (large / uncached data).
>
> The preadv2 syscall with RWF_NONBLOCK lets userspace applications bypass
> enqueuing operation in the threadpool if it's already available in the
> pagecache.
>
>
> Performance numbers (newer Samba):
>
> https://drive.google.com/file/d/0B3maCn0jCvYncndGbXJKbGlhejQ/
> view?usp=sharing
> https://docs.google.com/spreadsheets/d/1GGTivi-
> MfZU0doMzomG4XUo9ioWtRvOGQ5FId042L6s/edit?usp=sharing
>
>
> Performance number (older):
>
> Some perf data generated using fio comparing the posix aio engine to a
> version
> of the posix AIO engine that attempts to perform "fast" reads before
> submitting the operations to the queue. This workflow is on ext4
> partition on
> raid0 (test / build-rig.) Simulating our database access pattern workload
> using
> 16kb read accesses. Our database uses a home-spun posix aio like queue
> (samba
> does the same thing.)
>
> f1: ~73% rand read over mostly cached data (zipf med-size dataset)
> f2: ~18% rand read over mostly un-cached data (uniform large-dataset)
> f3: ~9% seq-read over large dataset
>
> before:
>
> f1:
> bw (KB /s): min= 11, max= 9088, per=0.56%, avg=969.54, stdev=827.99
> lat (msec) : 50=0.01%, 100=1.06%, 250=5.88%, 500=4.08%, 750=12.48%
> lat (msec) : 1000=17.27%, 2000=49.86%, >=2000=9.42%
> f2:
> bw (KB /s): min= 2, max= 1882, per=0.16%, avg=273.28, stdev=220.26
> lat (msec) : 250=5.65%, 500=3.31%, 750=15.64%, 1000=24.59%,
> 2000=46.56%
> lat (msec) : >=2000=4.33%
> f3:
> bw (KB /s): min= 0, max=265568, per=99.95%, avg=174575.10,
> stdev=34526.89
> lat (usec) : 2=0.01%, 4=0.01%, 10=0.02%, 20=0.27%, 50=10.82%
> lat (usec) : 100=50.34%, 250=5.05%, 500=7.12%, 750=6.60%, 1000=4.55%
> lat (msec) : 2=8.73%, 4=3.49%, 10=1.83%, 20=0.89%, 50=0.22%
> lat (msec) : 100=0.05%, 250=0.02%, 500=0.01%
> total:
> READ: io=102365MB, aggrb=174669KB/s, minb=240KB/s, maxb=173599KB/s,
> mint=600001msec, maxt=600113msec
>
> after (with fast read using preadv2 before submit):
>
> f1:
> bw (KB /s): min= 3, max=14897, per=1.28%, avg=2276.69,
> stdev=2930.39
> lat (usec) : 2=70.63%, 4=0.01%
> lat (msec) : 250=0.20%, 500=2.26%, 750=1.18%, 2000=0.22%,
> >=2000=25.53%
> f2:
> bw (KB /s): min= 2, max= 2362, per=0.14%, avg=249.83, stdev=222.00
> lat (msec) : 250=6.35%, 500=1.78%, 750=9.29%, 1000=20.49%, 2000=52.18%
> lat (msec) : >=2000=9.99%
> f3:
> bw (KB /s): min= 1, max=245448, per=100.00%, avg=177366.50,
> stdev=35995.60
> lat (usec) : 2=64.04%, 4=0.01%, 10=0.01%, 20=0.06%, 50=0.43%
> lat (usec) : 100=0.20%, 250=1.27%, 500=2.93%, 750=3.93%, 1000=7.35%
> lat (msec) : 2=14.27%, 4=2.88%, 10=1.54%, 20=0.81%, 50=0.22%
> lat (msec) : 100=0.05%, 250=0.02%
> total:
> READ: io=103941MB, aggrb=177339KB/s, minb=213KB/s, maxb=176375KB/s,
> mint=600020msec, maxt=600178msec
>
> Interpreting the results you can see total bandwidth stays the same but
> overall
> request latency is decreased in f1 (random, mostly cached) and f3
> (sequential)
> workloads. There is a slight bump in latency for since it's random data
> that's
> unlikely to be cached but we're always trying "fast read".
>
> In our application we have started keeping track of "fast read"
> hits/misses
> and for files / requests that have a lot hit ratio we don't do "fast
> reads"
> mostly getting rid of extra latency in the uncached cases. In our real
> world
> work load we were able to reduce average response time by 20 to 30%
> (depends
> on amount of IO done by request).
>
> I've performed other benchmarks and I have not observed any perf
> regressions in
> any of the normal (old) code paths.
>
--
Milosz Tanski
CTO
16 East 34th Street, 15th floor
New York, NY 10016
p: 646-253-9055
e: milosz@adfin.com
[-- Attachment #2: Type: text/html, Size: 6822 bytes --]
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: non-blocking buffered reads
2017-06-29 21:25 non-blocking buffered reads Christoph Hellwig
` (4 preceding siblings ...)
2017-06-30 1:11 ` Milosz Tanski
@ 2017-06-30 1:12 ` Milosz Tanski
5 siblings, 0 replies; 9+ messages in thread
From: Milosz Tanski @ 2017-06-30 1:12 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Al Viro, Jens Axboe, Goldwyn Rodrigues, Mel Gorman,
Volker Lendecke, linux-fsdevel, linux-block
On Thu, Jun 29, 2017 at 5:25 PM, Christoph Hellwig <hch@lst.de> wrote:
>
> This series resurrects the old patches from Milosz to implement
> non-blocking buffered reads. Thanks to the non-blocking AIO code from
> Goldwyn the implementation becomes pretty much trivial. As that
> implementation is in the block tree I would suggest that we merge
> these patches through the block tree as well. I've also forward
> ported the test Milosz sent for recent xfsprogs to verify it works
> properly, but I'll still have to address the review comments for it.
> I'll also volunteer to work with Goldwyn to properly document the
> RWF_NOWAIT flag in the man page including this change.
I had updated patches for the man pages; so I'll check tomorrow if I
can dig up the changes and I'll forward them on.
>
> Here are additional details from the original cover letter from Milosz,
> where the flag was still called RWF_NONBLOCK:
>
>
> Background:
>
> Using a threadpool to emulate non-blocking operations on regular buffered
> files is a common pattern today (samba, libuv, etc...) Applications split the
> work between network bound threads (epoll) and IO threadpool. Not every
> application can use sendfile syscall (TLS / post-processing).
>
> This common pattern leads to increased request latency. Latency can be due to
> additional synchronization between the threads or fast (cached data) request
> stuck behind slow request (large / uncached data).
>
> The preadv2 syscall with RWF_NONBLOCK lets userspace applications bypass
> enqueuing operation in the threadpool if it's already available in the
> pagecache.
>
>
> Performance numbers (newer Samba):
>
> https://drive.google.com/file/d/0B3maCn0jCvYncndGbXJKbGlhejQ/view?usp=sharing
> https://docs.google.com/spreadsheets/d/1GGTivi-MfZU0doMzomG4XUo9ioWtRvOGQ5FId042L6s/edit?usp=sharing
>
>
> Performance number (older):
>
> Some perf data generated using fio comparing the posix aio engine to a version
> of the posix AIO engine that attempts to perform "fast" reads before
> submitting the operations to the queue. This workflow is on ext4 partition on
> raid0 (test / build-rig.) Simulating our database access pattern workload using
> 16kb read accesses. Our database uses a home-spun posix aio like queue (samba
> does the same thing.)
>
> f1: ~73% rand read over mostly cached data (zipf med-size dataset)
> f2: ~18% rand read over mostly un-cached data (uniform large-dataset)
> f3: ~9% seq-read over large dataset
>
> before:
>
> f1:
> bw (KB /s): min= 11, max= 9088, per=0.56%, avg=969.54, stdev=827.99
> lat (msec) : 50=0.01%, 100=1.06%, 250=5.88%, 500=4.08%, 750=12.48%
> lat (msec) : 1000=17.27%, 2000=49.86%, >=2000=9.42%
> f2:
> bw (KB /s): min= 2, max= 1882, per=0.16%, avg=273.28, stdev=220.26
> lat (msec) : 250=5.65%, 500=3.31%, 750=15.64%, 1000=24.59%, 2000=46.56%
> lat (msec) : >=2000=4.33%
> f3:
> bw (KB /s): min= 0, max=265568, per=99.95%, avg=174575.10,
> stdev=34526.89
> lat (usec) : 2=0.01%, 4=0.01%, 10=0.02%, 20=0.27%, 50=10.82%
> lat (usec) : 100=50.34%, 250=5.05%, 500=7.12%, 750=6.60%, 1000=4.55%
> lat (msec) : 2=8.73%, 4=3.49%, 10=1.83%, 20=0.89%, 50=0.22%
> lat (msec) : 100=0.05%, 250=0.02%, 500=0.01%
> total:
> READ: io=102365MB, aggrb=174669KB/s, minb=240KB/s, maxb=173599KB/s,
> mint=600001msec, maxt=600113msec
>
> after (with fast read using preadv2 before submit):
>
> f1:
> bw (KB /s): min= 3, max=14897, per=1.28%, avg=2276.69, stdev=2930.39
> lat (usec) : 2=70.63%, 4=0.01%
> lat (msec) : 250=0.20%, 500=2.26%, 750=1.18%, 2000=0.22%, >=2000=25.53%
> f2:
> bw (KB /s): min= 2, max= 2362, per=0.14%, avg=249.83, stdev=222.00
> lat (msec) : 250=6.35%, 500=1.78%, 750=9.29%, 1000=20.49%, 2000=52.18%
> lat (msec) : >=2000=9.99%
> f3:
> bw (KB /s): min= 1, max=245448, per=100.00%, avg=177366.50,
> stdev=35995.60
> lat (usec) : 2=64.04%, 4=0.01%, 10=0.01%, 20=0.06%, 50=0.43%
> lat (usec) : 100=0.20%, 250=1.27%, 500=2.93%, 750=3.93%, 1000=7.35%
> lat (msec) : 2=14.27%, 4=2.88%, 10=1.54%, 20=0.81%, 50=0.22%
> lat (msec) : 100=0.05%, 250=0.02%
> total:
> READ: io=103941MB, aggrb=177339KB/s, minb=213KB/s, maxb=176375KB/s,
> mint=600020msec, maxt=600178msec
>
> Interpreting the results you can see total bandwidth stays the same but overall
> request latency is decreased in f1 (random, mostly cached) and f3 (sequential)
> workloads. There is a slight bump in latency for since it's random data that's
> unlikely to be cached but we're always trying "fast read".
>
> In our application we have started keeping track of "fast read" hits/misses
> and for files / requests that have a lot hit ratio we don't do "fast reads"
> mostly getting rid of extra latency in the uncached cases. In our real world
> work load we were able to reduce average response time by 20 to 30% (depends
> on amount of IO done by request).
>
> I've performed other benchmarks and I have not observed any perf regressions in
> any of the normal (old) code paths.
--
Milosz Tanski
CTO
16 East 34th Street, 15th floor
New York, NY 10016
p: 646-253-9055
e: milosz@adfin.com
^ permalink raw reply [flat|nested] 9+ messages in thread