All of lore.kernel.org
 help / color / mirror / Atom feed
From: Damien Le Moal <Damien.LeMoal@wdc.com>
To: Christoph Hellwig <hch@lst.de>,
	Dave Chinner <david@fromorbit.com>,
	Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Naohiro Aota <Naohiro.Aota@wdc.com>,
	Johannes Thumshirn <jth@kernel.org>,
	Matthew Wilcox <willy@infradead.org>,
	"linux-btrfs@vger.kernel.org" <linux-btrfs@vger.kernel.org>,
	"linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>,
	"cluster-devel@redhat.com" <cluster-devel@redhat.com>,
	"linux-ext4@vger.kernel.org" <linux-ext4@vger.kernel.org>,
	"linux-xfs@vger.kernel.org" <linux-xfs@vger.kernel.org>
Subject: Re: [PATCH 2/2] iomap: fall back to buffered writes for invalidation failures
Date: Tue, 14 Jul 2020 01:41:34 +0000	[thread overview]
Message-ID: <CY4PR04MB37512AD7FD85DEB3014D01D2E7610@CY4PR04MB3751.namprd04.prod.outlook.com> (raw)
In-Reply-To: 20200713074633.875946-3-hch@lst.de

On 2020/07/13 16:51, Christoph Hellwig wrote:
> Failing to invalid the page cache means data in incoherent, which is

s/in incoherent/is incoherent

> a very bad state for the system.  Always fall back to buffered I/O
> through the page cache if we can't invalidate mappings.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/ext4/file.c       |  2 ++
>  fs/gfs2/file.c       |  3 ++-
>  fs/iomap/direct-io.c | 13 ++++++++-----
>  fs/iomap/trace.h     |  1 +
>  fs/xfs/xfs_file.c    |  4 ++--
>  fs/zonefs/super.c    |  7 +++++--
>  6 files changed, 20 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 2a01e31a032c4c..0da6c2a2c32c1e 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -544,6 +544,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		iomap_ops = &ext4_iomap_overwrite_ops;
>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>  			   is_sync_kiocb(iocb) || unaligned_io || extend);
> +	if (ret == -EREMCHG)
> +		ret = 0;
>  
>  	if (extend)
>  		ret = ext4_handle_inode_extension(inode, offset, ret, count);
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index fe305e4bfd3734..c7907d40c61d17 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -814,7 +814,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
>  
>  	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
>  			   is_sync_kiocb(iocb));
> -
> +	if (ret == -EREMCHG)
> +		ret = 0;
>  out:
>  	gfs2_glock_dq(&gh);
>  out_uninit:
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 190967e87b69e4..62626235cdbe8d 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -10,6 +10,7 @@
>  #include <linux/backing-dev.h>
>  #include <linux/uio.h>
>  #include <linux/task_io_accounting_ops.h>
> +#include "trace.h"
>  
>  #include "../internal.h"
>  
> @@ -478,13 +479,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	if (iov_iter_rw(iter) == WRITE) {
>  		/*
>  		 * Try to invalidate cache pages for the range we are writing.
> -		 * If this invalidation fails, tough, the write will still work,
> -		 * but racing two incompatible write paths is a pretty crazy
> -		 * thing to do, so we don't support it 100%.
> +		 * If this invalidation fails, let the caller fall back to
> +		 * buffered I/O.
>  		 */
>  		if (invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
> -				end >> PAGE_SHIFT))
> -			dio_warn_stale_pagecache(iocb->ki_filp);
> +				end >> PAGE_SHIFT)) {
> +			trace_iomap_dio_invalidate_fail(inode, pos, count);
> +			ret = -EREMCHG;

I am wondering if it is OK to unconditionally return -EREMCHG here.
Shouldn't this depend on the return code of invalidate_inode_pages2_range()?
Its return value may be the error returned by
mapping->a_ops->launder_page(page) instead of the -EBUSY that
invalidate_inode_pages2_range() otherwise returns for a failed invalidation.
Isn't there any error condition that would be better served by not forcing the
fallback to buffered write? E.g. -ENOMEM?



> +			goto out_free_dio;
> +		}
>  
>  		if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
>  			ret = sb_init_dio_done_wq(inode->i_sb);
> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
> index 5693a39d52fb63..fdc7ae388476f5 100644
> --- a/fs/iomap/trace.h
> +++ b/fs/iomap/trace.h
> @@ -74,6 +74,7 @@ DEFINE_EVENT(iomap_range_class, name,	\
>  DEFINE_RANGE_EVENT(iomap_writepage);
>  DEFINE_RANGE_EVENT(iomap_releasepage);
>  DEFINE_RANGE_EVENT(iomap_invalidatepage);
> +DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
>  
>  #define IOMAP_TYPE_STRINGS \
>  	{ IOMAP_HOLE,		"HOLE" }, \
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 00db81eac80d6c..551cca39fa3ba6 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -553,8 +553,8 @@ xfs_file_dio_aio_write(
>  	xfs_iunlock(ip, iolock);
>  
>  	/*
> -	 * No fallback to buffered IO on errors for XFS, direct IO will either
> -	 * complete fully or fail.
> +	 * No partial fallback to buffered IO on errors for XFS, direct IO will
> +	 * either complete fully or fail.
>  	 */
>  	ASSERT(ret < 0 || ret == count);
>  	return ret;
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index 07bc42d62673ce..793850454b752f 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -786,8 +786,11 @@ static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
>  		return -EFBIG;
>  
> -	if (iocb->ki_flags & IOCB_DIRECT)
> -		return zonefs_file_dio_write(iocb, from);
> +	if (iocb->ki_flags & IOCB_DIRECT) {
> +		ret = zonefs_file_dio_write(iocb, from);
> +		if (ret != -EREMCHG)
> +			return ret;
> +	}

This looks fine. This would happen only for conventional zone writes since
sequential zone writes cannot ever issue direct IOs on top of cached data as
that would be a forbidden "overwrite" operation.

>  
>  	return zonefs_file_buffered_write(iocb, from);
>  }
> 


-- 
Damien Le Moal
Western Digital Research

WARNING: multiple messages have this Message-ID (diff)
From: Damien Le Moal <Damien.LeMoal@wdc.com>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] [PATCH 2/2] iomap: fall back to buffered writes for invalidation failures
Date: Tue, 14 Jul 2020 01:41:34 +0000	[thread overview]
Message-ID: <CY4PR04MB37512AD7FD85DEB3014D01D2E7610@CY4PR04MB3751.namprd04.prod.outlook.com> (raw)
In-Reply-To: 20200713074633.875946-3-hch@lst.de

On 2020/07/13 16:51, Christoph Hellwig wrote:
> Failing to invalid the page cache means data in incoherent, which is

s/in incoherent/is incoherent

> a very bad state for the system.  Always fall back to buffered I/O
> through the page cache if we can't invalidate mappings.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/ext4/file.c       |  2 ++
>  fs/gfs2/file.c       |  3 ++-
>  fs/iomap/direct-io.c | 13 ++++++++-----
>  fs/iomap/trace.h     |  1 +
>  fs/xfs/xfs_file.c    |  4 ++--
>  fs/zonefs/super.c    |  7 +++++--
>  6 files changed, 20 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 2a01e31a032c4c..0da6c2a2c32c1e 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -544,6 +544,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		iomap_ops = &ext4_iomap_overwrite_ops;
>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>  			   is_sync_kiocb(iocb) || unaligned_io || extend);
> +	if (ret == -EREMCHG)
> +		ret = 0;
>  
>  	if (extend)
>  		ret = ext4_handle_inode_extension(inode, offset, ret, count);
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index fe305e4bfd3734..c7907d40c61d17 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -814,7 +814,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
>  
>  	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
>  			   is_sync_kiocb(iocb));
> -
> +	if (ret == -EREMCHG)
> +		ret = 0;
>  out:
>  	gfs2_glock_dq(&gh);
>  out_uninit:
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 190967e87b69e4..62626235cdbe8d 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -10,6 +10,7 @@
>  #include <linux/backing-dev.h>
>  #include <linux/uio.h>
>  #include <linux/task_io_accounting_ops.h>
> +#include "trace.h"
>  
>  #include "../internal.h"
>  
> @@ -478,13 +479,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  	if (iov_iter_rw(iter) == WRITE) {
>  		/*
>  		 * Try to invalidate cache pages for the range we are writing.
> -		 * If this invalidation fails, tough, the write will still work,
> -		 * but racing two incompatible write paths is a pretty crazy
> -		 * thing to do, so we don't support it 100%.
> +		 * If this invalidation fails, let the caller fall back to
> +		 * buffered I/O.
>  		 */
>  		if (invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
> -				end >> PAGE_SHIFT))
> -			dio_warn_stale_pagecache(iocb->ki_filp);
> +				end >> PAGE_SHIFT)) {
> +			trace_iomap_dio_invalidate_fail(inode, pos, count);
> +			ret = -EREMCHG;

I am wondering if it is OK to unconditionally return -EREMCHG here.
Shouldn't this depend on the return code of invalidate_inode_pages2_range()?
Its return value may be the error returned by
mapping->a_ops->launder_page(page) instead of the -EBUSY that
invalidate_inode_pages2_range() otherwise returns for a failed invalidation.
Isn't there any error condition that would be better served by not forcing the
fallback to buffered write? E.g. -ENOMEM?



> +			goto out_free_dio;
> +		}
>  
>  		if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
>  			ret = sb_init_dio_done_wq(inode->i_sb);
> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
> index 5693a39d52fb63..fdc7ae388476f5 100644
> --- a/fs/iomap/trace.h
> +++ b/fs/iomap/trace.h
> @@ -74,6 +74,7 @@ DEFINE_EVENT(iomap_range_class, name,	\
>  DEFINE_RANGE_EVENT(iomap_writepage);
>  DEFINE_RANGE_EVENT(iomap_releasepage);
>  DEFINE_RANGE_EVENT(iomap_invalidatepage);
> +DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
>  
>  #define IOMAP_TYPE_STRINGS \
>  	{ IOMAP_HOLE,		"HOLE" }, \
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 00db81eac80d6c..551cca39fa3ba6 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -553,8 +553,8 @@ xfs_file_dio_aio_write(
>  	xfs_iunlock(ip, iolock);
>  
>  	/*
> -	 * No fallback to buffered IO on errors for XFS, direct IO will either
> -	 * complete fully or fail.
> +	 * No partial fallback to buffered IO on errors for XFS, direct IO will
> +	 * either complete fully or fail.
>  	 */
>  	ASSERT(ret < 0 || ret == count);
>  	return ret;
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index 07bc42d62673ce..793850454b752f 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -786,8 +786,11 @@ static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
>  		return -EFBIG;
>  
> -	if (iocb->ki_flags & IOCB_DIRECT)
> -		return zonefs_file_dio_write(iocb, from);
> +	if (iocb->ki_flags & IOCB_DIRECT) {
> +		ret = zonefs_file_dio_write(iocb, from);
> +		if (ret != -EREMCHG)
> +			return ret;
> +	}

This looks fine. This would happen only for conventional zone writes since
sequential zone writes cannot ever issue direct IOs on top of cached data as
that would be a forbidden "overwrite" operation.

>  
>  	return zonefs_file_buffered_write(iocb, from);
>  }
> 


-- 
Damien Le Moal
Western Digital Research





  parent reply	other threads:[~2020-07-14  1:41 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-13  7:46 RFC: iomap write invalidation Christoph Hellwig
2020-07-13  7:46 ` [Cluster-devel] " Christoph Hellwig
2020-07-13  7:46 ` [PATCH 1/2] iomap: Only invalidate page cache pages on direct IO writes Christoph Hellwig
2020-07-13  7:46   ` [Cluster-devel] " Christoph Hellwig
2020-07-13  7:46 ` [PATCH 2/2] iomap: fall back to buffered writes for invalidation failures Christoph Hellwig
2020-07-13  7:46   ` [Cluster-devel] " Christoph Hellwig
2020-07-13 11:55   ` Matthew Wilcox
2020-07-13 11:55     ` [Cluster-devel] " Matthew Wilcox
2020-07-14 11:00     ` Christoph Hellwig
2020-07-14 11:00       ` [Cluster-devel] " Christoph Hellwig
2020-07-13 12:20   ` Goldwyn Rodrigues
2020-07-13 12:20     ` [Cluster-devel] " Goldwyn Rodrigues
2020-07-13 16:09     ` David Sterba
2020-07-13 16:09       ` [Cluster-devel] " David Sterba
2020-07-13 15:39   ` Darrick J. Wong
2020-07-13 15:39     ` [Cluster-devel] " Darrick J. Wong
2020-07-14 11:00     ` Christoph Hellwig
2020-07-14 11:00       ` [Cluster-devel] " Christoph Hellwig
2020-07-14  1:41   ` Damien Le Moal [this message]
2020-07-14  1:41     ` Damien Le Moal
2020-07-15  1:47 ` RFC: iomap write invalidation Dave Chinner
2020-07-15  1:47   ` [Cluster-devel] " Dave Chinner
2020-07-20 21:51 ` Goldwyn Rodrigues
2020-07-20 21:51   ` [Cluster-devel] " Goldwyn Rodrigues
2020-07-21 14:53   ` Christoph Hellwig
2020-07-21 14:53     ` [Cluster-devel] " Christoph Hellwig
2020-07-21 14:59     ` Darrick J. Wong
2020-07-21 14:59       ` [Cluster-devel] " Darrick J. Wong
2020-07-21 15:04     ` Matthew Wilcox
2020-07-21 15:04       ` [Cluster-devel] " Matthew Wilcox
2020-07-21 15:06       ` Christoph Hellwig
2020-07-21 15:06         ` [Cluster-devel] " Christoph Hellwig
2020-07-21 15:14         ` Matthew Wilcox
2020-07-21 15:14           ` [Cluster-devel] " Matthew Wilcox
2020-07-21 15:16           ` Christoph Hellwig
2020-07-21 15:16             ` [Cluster-devel] " Christoph Hellwig
2020-07-21 15:27             ` Darrick J. Wong
2020-07-21 15:27               ` [Cluster-devel] " Darrick J. Wong
2020-07-21 15:41               ` Christoph Hellwig
2020-07-21 15:41                 ` [Cluster-devel] " Christoph Hellwig
2020-07-21 15:59                 ` Darrick J. Wong
2020-07-21 15:59                   ` [Cluster-devel] " Darrick J. Wong
2020-07-21 16:01                   ` Christoph Hellwig
2020-07-21 16:01                     ` [Cluster-devel] " Christoph Hellwig
2020-07-21 16:05                     ` Darrick J. Wong
2020-07-21 16:05                       ` [Cluster-devel] " Darrick J. Wong
2020-07-21 15:31             ` Matthew Wilcox
2020-07-21 15:31               ` [Cluster-devel] " Matthew Wilcox
2020-07-21 15:42               ` Christoph Hellwig
2020-07-21 15:42                 ` [Cluster-devel] " Christoph Hellwig
2020-07-21 15:52                 ` Matthew Wilcox
2020-07-21 15:52                   ` [Cluster-devel] " Matthew Wilcox
2020-07-21 16:03                   ` Darrick J. Wong
2020-07-21 16:03                     ` [Cluster-devel] " Darrick J. Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CY4PR04MB37512AD7FD85DEB3014D01D2E7610@CY4PR04MB3751.namprd04.prod.outlook.com \
    --to=damien.lemoal@wdc.com \
    --cc=Naohiro.Aota@wdc.com \
    --cc=cluster-devel@redhat.com \
    --cc=david@fromorbit.com \
    --cc=hch@lst.de \
    --cc=jth@kernel.org \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=rgoldwyn@suse.de \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.