Re: [PATCH] fs: allow calling kiocb_end_write() unmatched with kiocb_start_write()

From: Josef Bacik <josef@toxicpanda.com>
To: Amir Goldstein <amir73il@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>,
	Christian Brauner <brauner@kernel.org>,
	David Howells <dhowells@redhat.com>,
	Al Viro <viro@zeniv.linux.org.uk>, Jan Kara <jack@suse.cz>,
	Miklos Szeredi <miklos@szeredi.hu>,
	linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH] fs: allow calling kiocb_end_write() unmatched with kiocb_start_write()
Date: Tue, 21 Nov 2023 16:00:32 -0500	[thread overview]
Message-ID: <20231121210032.GA1675377@perftesting> (raw)
In-Reply-To: <20231121132551.2337431-1-amir73il@gmail.com>

On Tue, Nov 21, 2023 at 03:25:51PM +0200, Amir Goldstein wrote:
> We want to move kiocb_start_write() into vfs_iocb_iter_write(), after
> the permission hook, but leave kiocb_end_write() in the write completion
> handler of the callers of vfs_iocb_iter_write().
> 
> After this change, there will be no way of knowing in completion handler,
> if write has failed before or after calling kiocb_start_write().
> 
> Add a flag IOCB_WRITE_STARTED, which is set and cleared internally by
> kiocb_{start,end}_write(), so that kiocb_end_write() could be called for
> cleanup of async write, whether it was successful or whether it failed
> before or after calling kiocb_start_write().
> 
> This flag must not be copied by stacked filesystems (e.g. overlayfs)
> that clone the iocb to another iocb for io request on a backing file.
> 
> Link: https://lore.kernel.org/r/CAOQ4uxihfJJRxxUhAmOwtD97Lg8PL8RgXw88rH1UfEeP8AtP+w@mail.gmail.com/
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>

This is only a problem for cachefiles and overlayfs, and really just for
cachefiles because of the way it handles errors.

What if instead we made vfs_iocb_iter_write() call kiocb_end_write() in the ret
!= -EIOCBQUEUED case?  That way it is in charge of both the start and the end,
and the only case the file system has to worry about is the actual io
completion path when the kiocb is completed.

The result is something like what I've pasted below, completely uncompiled and
untested.  Thanks,

Josef

diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 009d23cd435b..5857241c5918 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -259,7 +259,8 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
 
 	_enter("%ld", ret);
 
-	kiocb_end_write(iocb);
+	if (ki->was_async)
+		kiocb_end_write(iocb);
 
 	if (ret < 0)
 		trace_cachefiles_io_error(object, inode, ret,
@@ -319,8 +320,6 @@ int __cachefiles_write(struct cachefiles_object *object,
 		ki->iocb.ki_complete = cachefiles_write_complete;
 	atomic_long_add(ki->b_writing, &cache->b_writing);
 
-	kiocb_start_write(&ki->iocb);
-
 	get_file(ki->iocb.ki_filp);
 	cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
 
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 131621daeb13..19d2682d28f9 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -295,11 +295,6 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
 	struct kiocb *iocb = &aio_req->iocb;
 	struct kiocb *orig_iocb = aio_req->orig_iocb;
 
-	if (iocb->ki_flags & IOCB_WRITE) {
-		kiocb_end_write(iocb);
-		ovl_file_modified(orig_iocb->ki_filp);
-	}
-
 	orig_iocb->ki_pos = iocb->ki_pos;
 	ovl_aio_put(aio_req);
 }
@@ -310,6 +305,11 @@ static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
 						   struct ovl_aio_req, iocb);
 	struct kiocb *orig_iocb = aio_req->orig_iocb;
 
+	if (iocb->ki_flags & IOCB_WRITE) {
+		kiocb_end_write(iocb);
+		ovl_file_modified(orig_iocb->ki_filp);
+	}
+
 	ovl_aio_cleanup_handler(aio_req);
 	orig_iocb->ki_complete(orig_iocb, res);
 }
@@ -458,7 +458,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 		aio_req->iocb.ki_flags = ifl;
 		aio_req->iocb.ki_complete = ovl_aio_queue_completion;
 		refcount_set(&aio_req->ref, 2);
-		kiocb_start_write(&aio_req->iocb);
 		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
 		ovl_aio_put(aio_req);
 		if (ret != -EIOCBQUEUED)
diff --git a/fs/read_write.c b/fs/read_write.c
index 4771701c896b..6ec3abed43dc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -821,7 +821,10 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
 	if (ret < 0)
 		return ret;
 
+	kiocb_start_write(iocb);
 	ret = call_read_iter(file, iocb, iter);
+	if (ret != -EIOCBQUEUED)
+		kiocb_end_write(iocb);
 out:
 	if (ret >= 0)
 		fsnotify_access(file);