From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <linux-fsdevel-owner@vger.kernel.org>
Received: from mx2.suse.de ([195.135.220.15]:48670 "EHLO mx1.suse.de"
        rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP
        id S1752969AbdHJN4n (ORCPT <rfc822;linux-fsdevel@vger.kernel.org>);
        Thu, 10 Aug 2017 09:56:43 -0400
Date: Thu, 10 Aug 2017 15:56:40 +0200
From: Jan Kara <jack@suse.cz>
To: Lukas Czerner <lczerner@redhat.com>
Cc: linux-fsdevel@vger.kernel.org, viro@zeniv.linux.org.uk,
        jack@suse.cz, jmoyer@redhat.com, david@fromorbit.com
Subject: Re: [PATCH v6] fs: Fix page cache inconsistency when mixing buffered
 and AIO DIO
Message-ID: <20170810135640.GB14925@quack2.suse.cz>
References: <1500463692-4982-1-git-send-email-lczerner@redhat.com>
 <1502369997-15665-1-git-send-email-lczerner@redhat.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <1502369997-15665-1-git-send-email-lczerner@redhat.com>
Sender: linux-fsdevel-owner@vger.kernel.org
List-ID: <linux-fsdevel.vger.kernel.org>

On Thu 10-08-17 14:59:57, Lukas Czerner wrote:
> Currently when mixing buffered reads and asynchronous direct writes it
> is possible to end up with the situation where we have stale data in the
> page cache while the new data is already written to disk. This is
> permanent until the affected pages are flushed away. Despite the fact
> that mixing buffered and direct IO is ill-advised, it does pose a threat
> to data integrity, is unexpected and should be fixed.
> 
> Fix this by deferring completion of asynchronous direct writes to a
> process context in the case that there are mapped pages to be found in
> the inode. Later before the completion in dio_complete() invalidate
> the pages in question. This ensures that after the completion the pages
> in the written area are either unmapped, or populated with up-to-date
> data. Also do the same for the iomap case which uses
> iomap_dio_complete() instead.
> 
> This has a side effect of deferring the completion to a process context
> for every AIO DIO that happens on an inode that has pages mapped. However,
> since the consensus is that this is an ill-advised practice, the performance
> implication should not be a problem.
> 
> This was based on a proposal from Jeff Moyer, thanks!

It seems the invalidation can also be removed from
generic_file_direct_write(), can't it? It is redundant there in the same way
as it was in the iomap code...

								Honza

> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Cc: Jeff Moyer <jmoyer@redhat.com>
> ---
> v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
> v3: Do not invalidate in case of error. Add some comments
> v4: Remove unnecessary variable, remove unnecessary inner braces
> v5: Style changes
> v6: Remove redundant invalidatepage, add warning and comment
> 
>  fs/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
>  fs/iomap.c     | 29 ++++++++++++++++-------------
>  2 files changed, 59 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 08cf278..ffb9e19 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  {
>  	loff_t offset = dio->iocb->ki_pos;
>  	ssize_t transferred = 0;
> +	int err;
>  
>  	/*
>  	 * AIO submission can race with bio completion to get here while
> @@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (ret == 0)
>  		ret = transferred;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	    dio->inode->i_mapping->nrpages) {
> +		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(err);
> +	}
> +
>  	if (dio->end_io) {
> -		int err;
>  
>  		// XXX: ki_pos??
>  		err = dio->end_io(dio->iocb, offset, ret, dio->private);
> @@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  	struct dio *dio = bio->bi_private;
>  	unsigned long remaining;
>  	unsigned long flags;
> +	bool defer_completion = false;
>  
>  	/* cleanup the bio */
>  	dio_bio_complete(dio, bio);
> @@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
>  	spin_unlock_irqrestore(&dio->bio_lock, flags);
>  
>  	if (remaining == 0) {
> -		if (dio->result && dio->defer_completion) {
> +		/*
> +		 * Defer completion when defer_completion is set or
> +		 * when the inode has pages mapped and this is AIO write.
> +		 * We need to invalidate those pages because there is a
> +		 * chance they contain stale data in the case buffered IO
> +		 * went in between AIO submission and completion into the
> +		 * same region.
> +		 */
> +		if (dio->result)
> +			defer_completion = dio->defer_completion ||
> +					   (dio->op == REQ_OP_WRITE &&
> +					    dio->inode->i_mapping->nrpages);
> +		if (defer_completion) {
>  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
> @@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
>  	 * so that we can call ->fsync.
>  	 */
> -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> -		retval = dio_set_defer_completion(dio);
> +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> +		retval = 0;
> +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> +			retval = dio_set_defer_completion(dio);
> +		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +			/*
> +			 * In case of AIO write racing with buffered read we
> +			 * need to defer completion. We can't decide this now,
> +			 * however the workqueue needs to be initialized here.
> +			 */
> +			retval = sb_init_dio_done_wq(dio->inode->i_sb);
> +		}
>  		if (retval) {
>  			/*
>  			 * We grab i_mutex only for reads so we don't have
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 0392661..c3e299a 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -713,8 +713,24 @@ struct iomap_dio {
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  {
>  	struct kiocb *iocb = dio->iocb;
> +	struct inode *inode = file_inode(iocb->ki_filp);
>  	ssize_t ret;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (!dio->error &&
> +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
> +		ret = invalidate_inode_pages2_range(inode->i_mapping,
> +				iocb->ki_pos >> PAGE_SHIFT,
> +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(ret);
> +	}
> +
>  	if (dio->end_io) {
>  		ret = dio->end_io(iocb,
>  				dio->error ? dio->error : dio->size,
> @@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  
>  	ret = iomap_dio_complete(dio);
>  
> -	/*
> -	 * Try again to invalidate clean pages which might have been cached by
> -	 * non-direct readahead, or faulted in by get_user_pages() if the source
> -	 * of the write was an mmap'ed region of the file we're writing.  Either
> -	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> -	 * this invalidation fails, tough, the write still worked...
> -	 */
> -	if (iov_iter_rw(iter) == WRITE) {
> -		int err = invalidate_inode_pages2_range(mapping,
> -				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> -		WARN_ON_ONCE(err);
> -	}
> -
>  	return ret;
>  
>  out_free_dio:
> -- 
> 2.7.5
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR