All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] Avert possible deadlock with splice() and fanotify
@ 2023-11-29 20:07 Amir Goldstein
  2023-11-29 20:07 ` [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct() Amir Goldstein
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Amir Goldstein @ 2023-11-29 20:07 UTC (permalink / raw)
  To: Christian Brauner, Jeff Layton
  Cc: Josef Bacik, Christoph Hellwig, Jan Kara, David Howells,
	Jens Axboe, Miklos Szeredi, Al Viro, linux-fsdevel

Christian,

Josef has helped me see the light and figure out how to avoid the
possible deadlock, which involves:
- splice() from source file in a loop mounted fs to dest file in
  a host fs, where the loop image file is
- fsfreeze on host fs
- write to host fs in context of fanotify permission event handler
  (FAN_ACCESS_PERM) on the splice source file

The first patch should not be changing any logic.
I only build tested the ceph patch, so hoping to get an
Acked-by/Tested-by from Jeff.

The second patch rids us of the deadlock by not holding
file_start_write() while reading from splice source file.

The patches apply and tested on top of vfs.rw branch.

Thanks,
Amir.

Amir Goldstein (2):
  fs: fork do_splice_copy_file_range() from do_splice_direct()
  fs: move file_start_write() into direct_splice_actor()

 fs/ceph/file.c         |  9 +++--
 fs/overlayfs/copy_up.c |  2 -
 fs/read_write.c        |  8 +---
 fs/splice.c            | 88 +++++++++++++++++++++++++++++++-----------
 include/linux/fs.h     |  2 -
 include/linux/splice.h | 13 ++++---
 6 files changed, 81 insertions(+), 41 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct()
  2023-11-29 20:07 [PATCH 0/2] Avert possible deadlock with splice() and fanotify Amir Goldstein
@ 2023-11-29 20:07 ` Amir Goldstein
  2023-11-30 10:09   ` Amir Goldstein
  2023-11-30 13:18   ` Christian Brauner
  2023-11-29 20:07 ` [PATCH 2/2] fs: move file_start_write() into direct_splice_actor() Amir Goldstein
  2023-11-30  8:32 ` [PATCH 0/2] Avert possible deadlock with splice() and fanotify Amir Goldstein
  2 siblings, 2 replies; 12+ messages in thread
From: Amir Goldstein @ 2023-11-29 20:07 UTC (permalink / raw)
  To: Christian Brauner, Jeff Layton
  Cc: Josef Bacik, Christoph Hellwig, Jan Kara, David Howells,
	Jens Axboe, Miklos Szeredi, Al Viro, linux-fsdevel

The new helper is meant to be called from context of ->copy_file_range()
methods instead of do_splice_direct().

Currently, the only difference is that do_splice_copy_file_range() does
not take a splice flags argument and it asserts that file_start_write()
was called.

Soon, do_splice_direct() will be called without file_start_write() held.

Use the new helper from __ceph_copy_file_range(), that was incorrectly
passing the copy_file_range() flags argument as splice flags argument
to do_splice_direct(). the value of flags was 0, so no actual bug fix.

Move the definition of both helpers to linux/splice.h.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/ceph/file.c         |  9 ++---
 fs/read_write.c        |  6 ++--
 fs/splice.c            | 82 ++++++++++++++++++++++++++++++------------
 include/linux/fs.h     |  2 --
 include/linux/splice.h | 13 ++++---
 5 files changed, 75 insertions(+), 37 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b5aae29e944..7c2db78e2c6e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -12,6 +12,7 @@
 #include <linux/falloc.h>
 #include <linux/iversion.h>
 #include <linux/ktime.h>
+#include <linux/splice.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -3010,8 +3011,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
 		 * {read,write}_iter, which will get caps again.
 		 */
 		put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
-		ret = do_splice_direct(src_file, &src_off, dst_file,
-				       &dst_off, src_objlen, flags);
+		ret = do_splice_copy_file_range(src_file, &src_off, dst_file,
+						&dst_off, src_objlen);
 		/* Abort on short copies or on error */
 		if (ret < (long)src_objlen) {
 			doutc(cl, "Failed partial copy (%zd)\n", ret);
@@ -3065,8 +3066,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
 	 */
 	if (len && (len < src_ci->i_layout.object_size)) {
 		doutc(cl, "Final partial copy of %zu bytes\n", len);
-		bytes = do_splice_direct(src_file, &src_off, dst_file,
-					 &dst_off, len, flags);
+		bytes = do_splice_copy_file_range(src_file, &src_off, dst_file,
+						  &dst_off, len);
 		if (bytes > 0)
 			ret += bytes;
 		else
diff --git a/fs/read_write.c b/fs/read_write.c
index f791555fa246..555514cdad53 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1423,10 +1423,8 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				size_t len, unsigned int flags)
 {
-	lockdep_assert(file_write_started(file_out));
-
-	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
-				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+	return do_splice_copy_file_range(file_in, &pos_in, file_out, &pos_out,
+				len > MAX_RW_COUNT ? MAX_RW_COUNT : len);
 }
 EXPORT_SYMBOL(generic_copy_file_range);
 
diff --git a/fs/splice.c b/fs/splice.c
index 3fce5f6072dd..3bb4936f8b70 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1158,8 +1158,15 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
 {
 	struct file *file = sd->u.file;
 
-	return do_splice_from(pipe, file, sd->opos, sd->total_len,
-			      sd->flags);
+	return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
+}
+
+static int copy_file_range_splice_actor(struct pipe_inode_info *pipe,
+					struct splice_desc *sd)
+{
+	struct file *file = sd->u.file;
+
+	return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
 }
 
 static void direct_file_splice_eof(struct splice_desc *sd)
@@ -1170,25 +1177,10 @@ static void direct_file_splice_eof(struct splice_desc *sd)
 		file->f_op->splice_eof(file);
 }
 
-/**
- * do_splice_direct - splices data directly between two files
- * @in:		file to splice from
- * @ppos:	input file offset
- * @out:	file to splice to
- * @opos:	output file offset
- * @len:	number of bytes to splice
- * @flags:	splice modifier flags
- *
- * Description:
- *    For use by do_sendfile(). splice can easily emulate sendfile, but
- *    doing it in the application would incur an extra system call
- *    (splice in + splice out, as compared to just sendfile()). So this helper
- *    can splice directly through a process-private pipe.
- *
- * Callers already called rw_verify_area() on the entire range.
- */
-long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
-		      loff_t *opos, size_t len, unsigned int flags)
+static long do_splice_direct_actor(struct file *in, loff_t *ppos,
+				   struct file *out, loff_t *opos,
+				   size_t len, unsigned int flags,
+				   splice_direct_actor *actor)
 {
 	struct splice_desc sd = {
 		.len		= len,
@@ -1207,14 +1199,60 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 	if (unlikely(out->f_flags & O_APPEND))
 		return -EINVAL;
 
-	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
+	ret = splice_direct_to_actor(in, &sd, actor);
 	if (ret > 0)
 		*ppos = sd.pos;
 
 	return ret;
 }
+/**
+ * do_splice_direct - splices data directly between two files
+ * @in:		file to splice from
+ * @ppos:	input file offset
+ * @out:	file to splice to
+ * @opos:	output file offset
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    For use by do_sendfile(). splice can easily emulate sendfile, but
+ *    doing it in the application would incur an extra system call
+ *    (splice in + splice out, as compared to just sendfile()). So this helper
+ *    can splice directly through a process-private pipe.
+ *
+ * Callers already called rw_verify_area() on the entire range.
+ */
+long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+		      loff_t *opos, size_t len, unsigned int flags)
+{
+	return do_splice_direct_actor(in, ppos, out, opos, len, flags,
+				      direct_splice_actor);
+}
 EXPORT_SYMBOL(do_splice_direct);
 
+/**
+ * do_splice_copy_file_range - splices data for copy_file_range()
+ * @in:		file to splice from
+ * @ppos:	input file offset
+ * @out:	file to splice to
+ * @opos:	output file offset
+ * @len:	number of bytes to splice
+ *
+ * Description:
+ *    For use by generic_copy_file_range() and ->copy_file_range() methods.
+ *
+ * Callers already called rw_verify_area() on the entire range.
+ */
+long do_splice_copy_file_range(struct file *in, loff_t *ppos, struct file *out,
+			       loff_t *opos, size_t len)
+{
+	lockdep_assert(file_write_started(out));
+
+	return do_splice_direct_actor(in, ppos, out, opos, len, 0,
+				      copy_file_range_splice_actor);
+}
+EXPORT_SYMBOL(do_splice_copy_file_range);
+
 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
 {
 	for (;;) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae0e2fb7bcea..04422a0eccdd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3052,8 +3052,6 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
 			 size_t len, unsigned int flags);
 extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
 		struct file *, loff_t *, size_t, unsigned int);
-extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
-		loff_t *opos, size_t len, unsigned int flags);
 
 
 extern void
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 6c461573434d..11e62b641d69 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -80,11 +80,14 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *,
 long vfs_splice_read(struct file *in, loff_t *ppos,
 		     struct pipe_inode_info *pipe, size_t len,
 		     unsigned int flags);
-extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
-				      splice_direct_actor *);
-extern long do_splice(struct file *in, loff_t *off_in,
-		      struct file *out, loff_t *off_out,
-		      size_t len, unsigned int flags);
+ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd,
+			       splice_direct_actor *actor);
+long do_splice(struct file *in, loff_t *off_in, struct file *out,
+	       loff_t *off_out, size_t len, unsigned int flags);
+long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+		      loff_t *opos, size_t len, unsigned int flags);
+long do_splice_copy_file_range(struct file *in, loff_t *ppos, struct file *out,
+			       loff_t *opos, size_t len);
 
 extern long do_tee(struct file *in, struct file *out, size_t len,
 		   unsigned int flags);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/2] fs: move file_start_write() into direct_splice_actor()
  2023-11-29 20:07 [PATCH 0/2] Avert possible deadlock with splice() and fanotify Amir Goldstein
  2023-11-29 20:07 ` [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct() Amir Goldstein
@ 2023-11-29 20:07 ` Amir Goldstein
  2023-11-30 13:32   ` Jan Kara
  2023-11-30  8:32 ` [PATCH 0/2] Avert possible deadlock with splice() and fanotify Amir Goldstein
  2 siblings, 1 reply; 12+ messages in thread
From: Amir Goldstein @ 2023-11-29 20:07 UTC (permalink / raw)
  To: Christian Brauner, Jeff Layton
  Cc: Josef Bacik, Christoph Hellwig, Jan Kara, David Howells,
	Jens Axboe, Miklos Szeredi, Al Viro, linux-fsdevel

The two callers of do_splice_direct() hold file_start_write() on the
output file.

This may cause file permission hooks to be called indirectly on an
overlayfs lower layer, which is on the same filesystem of the output
file and could lead to deadlock with fanotify permission events.

To fix this potential deadlock, move file_start_write() from the callers
into the direct_splice_actor(), so file_start_write() will not be held
while reading the input file.

Suggested-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/20231128214258.GA2398475@perftesting/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/overlayfs/copy_up.c | 2 --
 fs/read_write.c        | 2 --
 fs/splice.c            | 8 +++++++-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 7a44c8212331..294b330aba9f 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -333,11 +333,9 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
 		if (error)
 			break;
 
-		ovl_start_write(dentry);
 		bytes = do_splice_direct(old_file, &old_pos,
 					 new_file, &new_pos,
 					 this_len, SPLICE_F_MOVE);
-		ovl_end_write(dentry);
 		if (bytes <= 0) {
 			error = bytes;
 			break;
diff --git a/fs/read_write.c b/fs/read_write.c
index 555514cdad53..b7110ee77c1c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1286,10 +1286,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		retval = rw_verify_area(WRITE, out.file, &out_pos, count);
 		if (retval < 0)
 			goto fput_out;
-		file_start_write(out.file);
 		retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
 					  count, fl);
-		file_end_write(out.file);
 	} else {
 		if (out.file->f_flags & O_NONBLOCK)
 			fl |= SPLICE_F_NONBLOCK;
diff --git a/fs/splice.c b/fs/splice.c
index 3bb4936f8b70..261adfdfed9d 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1157,8 +1157,12 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
 			       struct splice_desc *sd)
 {
 	struct file *file = sd->u.file;
+	long ret;
 
-	return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
+	file_start_write(file);
+	ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
+	file_end_write(file);
+	return ret;
 }
 
 static int copy_file_range_splice_actor(struct pipe_inode_info *pipe,
@@ -1240,6 +1244,8 @@ EXPORT_SYMBOL(do_splice_direct);
  *
  * Description:
  *    For use by generic_copy_file_range() and ->copy_file_range() methods.
+ *    Like do_splice_direct(), but vfs_copy_file_range() already called
+ *    start_file_write() on @out file.
  *
  * Callers already called rw_verify_area() on the entire range.
  */
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/2] Avert possible deadlock with splice() and fanotify
  2023-11-29 20:07 [PATCH 0/2] Avert possible deadlock with splice() and fanotify Amir Goldstein
  2023-11-29 20:07 ` [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct() Amir Goldstein
  2023-11-29 20:07 ` [PATCH 2/2] fs: move file_start_write() into direct_splice_actor() Amir Goldstein
@ 2023-11-30  8:32 ` Amir Goldstein
  2023-11-30 10:07   ` Amir Goldstein
  2 siblings, 1 reply; 12+ messages in thread
From: Amir Goldstein @ 2023-11-30  8:32 UTC (permalink / raw)
  To: Christian Brauner, Jeff Layton
  Cc: Josef Bacik, Christoph Hellwig, Jan Kara, David Howells,
	Jens Axboe, Miklos Szeredi, Al Viro, linux-fsdevel

On Wed, Nov 29, 2023 at 10:07 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> Christian,
>
> Josef has helped me see the light and figure out how to avoid the
> possible deadlock, which involves:
> - splice() from source file in a loop mounted fs to dest file in
>   a host fs, where the loop image file is
> - fsfreeze on host fs
> - write to host fs in context of fanotify permission event handler
>   (FAN_ACCESS_PERM) on the splice source file
>
> The first patch should not be changing any logic.
> I only build tested the ceph patch, so hoping to get an
> Acked-by/Tested-by from Jeff.
>
> The second patch rids us of the deadlock by not holding
> file_start_write() while reading from splice source file.
>

OOPS, I missed another corner case:
The COPY_FILE_SPLICE fallback of server-side-copy in nfsd/ksmbd
needs to use the start-write-safe variant of do_splice_direct(),
because in this case src and dst can be on any two fs.
Expect an extra patch in v2.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/2] Avert possible deadlock with splice() and fanotify
  2023-11-30  8:32 ` [PATCH 0/2] Avert possible deadlock with splice() and fanotify Amir Goldstein
@ 2023-11-30 10:07   ` Amir Goldstein
  2023-11-30 13:37     ` Jan Kara
  0 siblings, 1 reply; 12+ messages in thread
From: Amir Goldstein @ 2023-11-30 10:07 UTC (permalink / raw)
  To: Christian Brauner, Jeff Layton
  Cc: Josef Bacik, Christoph Hellwig, Jan Kara, David Howells,
	Jens Axboe, Miklos Szeredi, Al Viro, linux-fsdevel

On Thu, Nov 30, 2023 at 10:32 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Wed, Nov 29, 2023 at 10:07 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > Christian,
> >
> > Josef has helped me see the light and figure out how to avoid the
> > possible deadlock, which involves:
> > - splice() from source file in a loop mounted fs to dest file in
> >   a host fs, where the loop image file is
> > - fsfreeze on host fs
> > - write to host fs in context of fanotify permission event handler
> >   (FAN_ACCESS_PERM) on the splice source file
> >
> > The first patch should not be changing any logic.
> > I only build tested the ceph patch, so hoping to get an
> > Acked-by/Tested-by from Jeff.
> >
> > The second patch rids us of the deadlock by not holding
> > file_start_write() while reading from splice source file.
> >
>
> OOPS, I missed another corner case:
> The COPY_FILE_SPLICE fallback of server-side-copy in nfsd/ksmbd
> needs to use the start-write-safe variant of do_splice_direct(),
> because in this case src and dst can be on any two fs.
> Expect an extra patch in v2.
>

For the interested, see server-side-copy patch below.
Pushed to branch start-write-safe [1], but will wait with v2 until
I get comments on v1.

Thanks,
Amir.

[1] https://github.com/amir73il/linux/commits/start-write-safe

Author: Amir Goldstein <amir73il@gmail.com>
Date:   Thu Nov 30 11:42:50 2023 +0200

    fs: use do_splice_direct() for nfsd/ksmbd server-side-copy

    nfsd/ksmbd call vfs_copy_file_range() with flag COPY_FILE_SPLICE to
    perform kernel copy between two files on any two filesystems.

    Splicing input file, while holding file_start_write() on the output file
    which is on a different sb, posses a risk for fanotify related deadlocks.

    We only need to call splice_file_range() from within the context of
    ->copy_file_range() filesystem methods with file_start_write() held.

    To avoid the possible deadlocks, always use do_splice_direct() instead of
    splice_file_range() for the kernel copy fallback in vfs_copy_file_range()
    without holding file_start_write().

    Signed-off-by: Amir Goldstein <amir73il@gmail.com>

diff --git a/fs/read_write.c b/fs/read_write.c
index 0bc99f38e623..12583e32aa6d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1565,11 +1565,18 @@ ssize_t vfs_copy_file_range(struct file
*file_in, loff_t pos_in,
         * and which filesystems do not, that will allow userspace tools to
         * make consistent desicions w.r.t using copy_file_range().
         *
-        * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
+        * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
+        * for server-side-copy between any two sb.
+        *
+        * In any case, we call do_splice_direct() and not splice_file_range(),
+        * without file_start_write() held, to avoid possible deadlocks related
+        * to splicing from input file, while file_start_write() is held on
+        * the output file on a different sb.
         */
-       ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-                                     flags);
+       file_end_write(file_out);

+       ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+                              min_t(size_t, len, MAX_RW_COUNT), 0);
 done:
        if (ret > 0) {
                fsnotify_access(file_in);
@@ -1581,8 +1588,6 @@ ssize_t vfs_copy_file_range(struct file
*file_in, loff_t pos_in,
        inc_syscr(current);
        inc_syscw(current);

-       file_end_write(file_out);
-
        return ret;
 }
 EXPORT_SYMBOL(vfs_copy_file_range);

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct()
  2023-11-29 20:07 ` [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct() Amir Goldstein
@ 2023-11-30 10:09   ` Amir Goldstein
  2023-11-30 13:30     ` Jan Kara
  2023-11-30 13:18   ` Christian Brauner
  1 sibling, 1 reply; 12+ messages in thread
From: Amir Goldstein @ 2023-11-30 10:09 UTC (permalink / raw)
  To: Christian Brauner, Jeff Layton
  Cc: Josef Bacik, Christoph Hellwig, Jan Kara, David Howells,
	Jens Axboe, Miklos Szeredi, Al Viro, linux-fsdevel

On Wed, Nov 29, 2023 at 10:07 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> The new helper is meant to be called from context of ->copy_file_range()
> methods instead of do_splice_direct().
>
> Currently, the only difference is that do_splice_copy_file_range() does
> not take a splice flags argument and it asserts that file_start_write()
> was called.
>
> Soon, do_splice_direct() will be called without file_start_write() held.
>
> Use the new helper from __ceph_copy_file_range(), that was incorrectly
> passing the copy_file_range() flags argument as splice flags argument
> to do_splice_direct(). the value of flags was 0, so no actual bug fix.
>
> Move the definition of both helpers to linux/splice.h.
>
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
>  fs/ceph/file.c         |  9 ++---
>  fs/read_write.c        |  6 ++--
>  fs/splice.c            | 82 ++++++++++++++++++++++++++++++------------
>  include/linux/fs.h     |  2 --
>  include/linux/splice.h | 13 ++++---
>  5 files changed, 75 insertions(+), 37 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 3b5aae29e944..7c2db78e2c6e 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -12,6 +12,7 @@
>  #include <linux/falloc.h>
>  #include <linux/iversion.h>
>  #include <linux/ktime.h>
> +#include <linux/splice.h>
>
>  #include "super.h"
>  #include "mds_client.h"
> @@ -3010,8 +3011,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>                  * {read,write}_iter, which will get caps again.
>                  */
>                 put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
> -               ret = do_splice_direct(src_file, &src_off, dst_file,
> -                                      &dst_off, src_objlen, flags);
> +               ret = do_splice_copy_file_range(src_file, &src_off, dst_file,
> +                                               &dst_off, src_objlen);
>                 /* Abort on short copies or on error */
>                 if (ret < (long)src_objlen) {
>                         doutc(cl, "Failed partial copy (%zd)\n", ret);
> @@ -3065,8 +3066,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>          */
>         if (len && (len < src_ci->i_layout.object_size)) {
>                 doutc(cl, "Final partial copy of %zu bytes\n", len);
> -               bytes = do_splice_direct(src_file, &src_off, dst_file,
> -                                        &dst_off, len, flags);
> +               bytes = do_splice_copy_file_range(src_file, &src_off, dst_file,
> +                                                 &dst_off, len);
>                 if (bytes > 0)
>                         ret += bytes;
>                 else
> diff --git a/fs/read_write.c b/fs/read_write.c
> index f791555fa246..555514cdad53 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1423,10 +1423,8 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>                                 struct file *file_out, loff_t pos_out,
>                                 size_t len, unsigned int flags)
>  {
> -       lockdep_assert(file_write_started(file_out));
> -
> -       return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> -                               len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +       return do_splice_copy_file_range(file_in, &pos_in, file_out, &pos_out,
> +                               len > MAX_RW_COUNT ? MAX_RW_COUNT : len);
>  }
>  EXPORT_SYMBOL(generic_copy_file_range);
>
> diff --git a/fs/splice.c b/fs/splice.c
> index 3fce5f6072dd..3bb4936f8b70 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -1158,8 +1158,15 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
>  {
>         struct file *file = sd->u.file;
>
> -       return do_splice_from(pipe, file, sd->opos, sd->total_len,
> -                             sd->flags);
> +       return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
> +}
> +
> +static int copy_file_range_splice_actor(struct pipe_inode_info *pipe,
> +                                       struct splice_desc *sd)
> +{
> +       struct file *file = sd->u.file;
> +
> +       return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
>  }
>
>  static void direct_file_splice_eof(struct splice_desc *sd)
> @@ -1170,25 +1177,10 @@ static void direct_file_splice_eof(struct splice_desc *sd)
>                 file->f_op->splice_eof(file);
>  }
>
> -/**
> - * do_splice_direct - splices data directly between two files
> - * @in:                file to splice from
> - * @ppos:      input file offset
> - * @out:       file to splice to
> - * @opos:      output file offset
> - * @len:       number of bytes to splice
> - * @flags:     splice modifier flags
> - *
> - * Description:
> - *    For use by do_sendfile(). splice can easily emulate sendfile, but
> - *    doing it in the application would incur an extra system call
> - *    (splice in + splice out, as compared to just sendfile()). So this helper
> - *    can splice directly through a process-private pipe.
> - *
> - * Callers already called rw_verify_area() on the entire range.
> - */
> -long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
> -                     loff_t *opos, size_t len, unsigned int flags)
> +static long do_splice_direct_actor(struct file *in, loff_t *ppos,
> +                                  struct file *out, loff_t *opos,
> +                                  size_t len, unsigned int flags,
> +                                  splice_direct_actor *actor)
>  {
>         struct splice_desc sd = {
>                 .len            = len,
> @@ -1207,14 +1199,60 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
>         if (unlikely(out->f_flags & O_APPEND))
>                 return -EINVAL;
>
> -       ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
> +       ret = splice_direct_to_actor(in, &sd, actor);
>         if (ret > 0)
>                 *ppos = sd.pos;
>
>         return ret;
>  }
> +/**
> + * do_splice_direct - splices data directly between two files
> + * @in:                file to splice from
> + * @ppos:      input file offset
> + * @out:       file to splice to
> + * @opos:      output file offset
> + * @len:       number of bytes to splice
> + * @flags:     splice modifier flags
> + *
> + * Description:
> + *    For use by do_sendfile(). splice can easily emulate sendfile, but
> + *    doing it in the application would incur an extra system call
> + *    (splice in + splice out, as compared to just sendfile()). So this helper
> + *    can splice directly through a process-private pipe.
> + *
> + * Callers already called rw_verify_area() on the entire range.
> + */
> +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
> +                     loff_t *opos, size_t len, unsigned int flags)
> +{
> +       return do_splice_direct_actor(in, ppos, out, opos, len, flags,
> +                                     direct_splice_actor);
> +}
>  EXPORT_SYMBOL(do_splice_direct);
>
> +/**
> + * do_splice_copy_file_range - splices data for copy_file_range()
> + * @in:                file to splice from
> + * @ppos:      input file offset
> + * @out:       file to splice to
> + * @opos:      output file offset
> + * @len:       number of bytes to splice
> + *
> + * Description:
> + *    For use by generic_copy_file_range() and ->copy_file_range() methods.
> + *
> + * Callers already called rw_verify_area() on the entire range.
> + */
> +long do_splice_copy_file_range(struct file *in, loff_t *ppos, struct file *out,
> +                              loff_t *opos, size_t len)

FYI, I renamed do_splice_vfs_copy_file_range => splice_file_range in v2
for brevity.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct()
  2023-11-29 20:07 ` [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct() Amir Goldstein
  2023-11-30 10:09   ` Amir Goldstein
@ 2023-11-30 13:18   ` Christian Brauner
  2023-11-30 13:37     ` Amir Goldstein
  1 sibling, 1 reply; 12+ messages in thread
From: Christian Brauner @ 2023-11-30 13:18 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Jeff Layton, Josef Bacik, Christoph Hellwig, Jan Kara,
	David Howells, Jens Axboe, Miklos Szeredi, Al Viro,
	linux-fsdevel

On Wed, Nov 29, 2023 at 10:07:08PM +0200, Amir Goldstein wrote:
> The new helper is meant to be called from context of ->copy_file_range()
> methods instead of do_splice_direct().
> 
> Currently, the only difference is that do_splice_copy_file_range() does
> not take a splice flags argument and it asserts that file_start_write()
> was called.
> 
> Soon, do_splice_direct() will be called without file_start_write() held.
> 
> Use the new helper from __ceph_copy_file_range(), that was incorrectly
> passing the copy_file_range() flags argument as splice flags argument
> to do_splice_direct(). the value of flags was 0, so no actual bug fix.
> 
> Move the definition of both helpers to linux/splice.h.
> 
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
>  fs/ceph/file.c         |  9 ++---
>  fs/read_write.c        |  6 ++--
>  fs/splice.c            | 82 ++++++++++++++++++++++++++++++------------
>  include/linux/fs.h     |  2 --
>  include/linux/splice.h | 13 ++++---
>  5 files changed, 75 insertions(+), 37 deletions(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 3b5aae29e944..7c2db78e2c6e 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -12,6 +12,7 @@
>  #include <linux/falloc.h>
>  #include <linux/iversion.h>
>  #include <linux/ktime.h>
> +#include <linux/splice.h>
>  
>  #include "super.h"
>  #include "mds_client.h"
> @@ -3010,8 +3011,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  		 * {read,write}_iter, which will get caps again.
>  		 */
>  		put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
> -		ret = do_splice_direct(src_file, &src_off, dst_file,
> -				       &dst_off, src_objlen, flags);
> +		ret = do_splice_copy_file_range(src_file, &src_off, dst_file,
> +						&dst_off, src_objlen);
>  		/* Abort on short copies or on error */
>  		if (ret < (long)src_objlen) {
>  			doutc(cl, "Failed partial copy (%zd)\n", ret);
> @@ -3065,8 +3066,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  	 */
>  	if (len && (len < src_ci->i_layout.object_size)) {
>  		doutc(cl, "Final partial copy of %zu bytes\n", len);
> -		bytes = do_splice_direct(src_file, &src_off, dst_file,
> -					 &dst_off, len, flags);
> +		bytes = do_splice_copy_file_range(src_file, &src_off, dst_file,
> +						  &dst_off, len);
>  		if (bytes > 0)
>  			ret += bytes;
>  		else
> diff --git a/fs/read_write.c b/fs/read_write.c
> index f791555fa246..555514cdad53 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1423,10 +1423,8 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>  				struct file *file_out, loff_t pos_out,
>  				size_t len, unsigned int flags)
>  {

Hm, the low-level helper takes a @flags argument but it's completely
ignored. I think that helper should remove it or it should check:

if (flags)
	return -EINVAL;

in case it's ever called from codepaths where @flags hasn't been
sanitized imho.

> -	lockdep_assert(file_write_started(file_out));
> -
> -	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> -				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +	return do_splice_copy_file_range(file_in, &pos_in, file_out, &pos_out,
> +				len > MAX_RW_COUNT ? MAX_RW_COUNT : len);

clamp(len, 0, MAX_RW_COUNT)

?

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct()
  2023-11-30 10:09   ` Amir Goldstein
@ 2023-11-30 13:30     ` Jan Kara
  0 siblings, 0 replies; 12+ messages in thread
From: Jan Kara @ 2023-11-30 13:30 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Christian Brauner, Jeff Layton, Josef Bacik, Christoph Hellwig,
	Jan Kara, David Howells, Jens Axboe, Miklos Szeredi, Al Viro,
	linux-fsdevel

On Thu 30-11-23 12:09:09, Amir Goldstein wrote:
> On Wed, Nov 29, 2023 at 10:07 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > The new helper is meant to be called from context of ->copy_file_range()
> > methods instead of do_splice_direct().
> >
> > Currently, the only difference is that do_splice_copy_file_range() does
> > not take a splice flags argument and it asserts that file_start_write()
> > was called.
> >
> > Soon, do_splice_direct() will be called without file_start_write() held.
> >
> > Use the new helper from __ceph_copy_file_range(), that was incorrectly
> > passing the copy_file_range() flags argument as splice flags argument
> > to do_splice_direct(). the value of flags was 0, so no actual bug fix.
> >
> > Move the definition of both helpers to linux/splice.h.
> >
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
...
> > +/**
> > + * do_splice_copy_file_range - splices data for copy_file_range()
> > + * @in:                file to splice from
> > + * @ppos:      input file offset
> > + * @out:       file to splice to
> > + * @opos:      output file offset
> > + * @len:       number of bytes to splice
> > + *
> > + * Description:
> > + *    For use by generic_copy_file_range() and ->copy_file_range() methods.
> > + *
> > + * Callers already called rw_verify_area() on the entire range.
> > + */
> > +long do_splice_copy_file_range(struct file *in, loff_t *ppos, struct file *out,
> > +                              loff_t *opos, size_t len)
> 
> FYI, I renamed do_splice_vfs_copy_file_range => splice_file_range in v2
> for brevity.

Yeah, after the rename things look better :). Otherwise I didn't find any
problem so feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] fs: move file_start_write() into direct_splice_actor()
  2023-11-29 20:07 ` [PATCH 2/2] fs: move file_start_write() into direct_splice_actor() Amir Goldstein
@ 2023-11-30 13:32   ` Jan Kara
  0 siblings, 0 replies; 12+ messages in thread
From: Jan Kara @ 2023-11-30 13:32 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Christian Brauner, Jeff Layton, Josef Bacik, Christoph Hellwig,
	Jan Kara, David Howells, Jens Axboe, Miklos Szeredi, Al Viro,
	linux-fsdevel

On Wed 29-11-23 22:07:09, Amir Goldstein wrote:
> The two callers of do_splice_direct() hold file_start_write() on the
> output file.
> 
> This may cause file permission hooks to be called indirectly on an
> overlayfs lower layer, which is on the same filesystem of the output
> file and could lead to deadlock with fanotify permission events.
> 
> To fix this potential deadlock, move file_start_write() from the callers
> into the direct_splice_actor(), so file_start_write() will not be held
> while reading the input file.
> 
> Suggested-by: Josef Bacik <josef@toxicpanda.com>
> Link: https://lore.kernel.org/r/20231128214258.GA2398475@perftesting/
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>

Yeah, very nice solution! It all looks good to me so feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/2] Avert possible deadlock with splice() and fanotify
  2023-11-30 10:07   ` Amir Goldstein
@ 2023-11-30 13:37     ` Jan Kara
  2023-11-30 13:46       ` Amir Goldstein
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Kara @ 2023-11-30 13:37 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Christian Brauner, Jeff Layton, Josef Bacik, Christoph Hellwig,
	Jan Kara, David Howells, Jens Axboe, Miklos Szeredi, Al Viro,
	linux-fsdevel

On Thu 30-11-23 12:07:23, Amir Goldstein wrote:
> On Thu, Nov 30, 2023 at 10:32 AM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Wed, Nov 29, 2023 at 10:07 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > >
> > > Christian,
> > >
> > > Josef has helped me see the light and figure out how to avoid the
> > > possible deadlock, which involves:
> > > - splice() from source file in a loop mounted fs to dest file in
> > >   a host fs, where the loop image file is
> > > - fsfreeze on host fs
> > > - write to host fs in context of fanotify permission event handler
> > >   (FAN_ACCESS_PERM) on the splice source file
> > >
> > > The first patch should not be changing any logic.
> > > I only build tested the ceph patch, so hoping to get an
> > > Acked-by/Tested-by from Jeff.
> > >
> > > The second patch rids us of the deadlock by not holding
> > > file_start_write() while reading from splice source file.
> > >
> >
> > OOPS, I missed another corner case:
> > The COPY_FILE_SPLICE fallback of server-side-copy in nfsd/ksmbd
> > needs to use the start-write-safe variant of do_splice_direct(),
> > because in this case src and dst can be on any two fs.
> > Expect an extra patch in v2.
> >
> 
> For the interested, see server-side-copy patch below.
> Pushed to branch start-write-safe [1], but will wait with v2 until
> I get comments on v1.
> 
> Thanks,
> Amir.
> 
> [1] https://github.com/amir73il/linux/commits/start-write-safe
> 
> Author: Amir Goldstein <amir73il@gmail.com>
> Date:   Thu Nov 30 11:42:50 2023 +0200
> 
>     fs: use do_splice_direct() for nfsd/ksmbd server-side-copy
> 
>     nfsd/ksmbd call vfs_copy_file_range() with flag COPY_FILE_SPLICE to
>     perform kernel copy between two files on any two filesystems.
> 
>     Splicing input file, while holding file_start_write() on the output file
>     which is on a different sb, posses a risk for fanotify related deadlocks.
> 
>     We only need to call splice_file_range() from within the context of
>     ->copy_file_range() filesystem methods with file_start_write() held.
> 
>     To avoid the possible deadlocks, always use do_splice_direct() instead of
>     splice_file_range() for the kernel copy fallback in vfs_copy_file_range()
>     without holding file_start_write().
> 
>     Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 0bc99f38e623..12583e32aa6d 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1565,11 +1565,18 @@ ssize_t vfs_copy_file_range(struct file
> *file_in, loff_t pos_in,
>          * and which filesystems do not, that will allow userspace tools to
>          * make consistent desicions w.r.t using copy_file_range().
>          *
> -        * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
> +        * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
> +        * for server-side-copy between any two sb.
> +        *
> +        * In any case, we call do_splice_direct() and not splice_file_range(),
> +        * without file_start_write() held, to avoid possible deadlocks related
> +        * to splicing from input file, while file_start_write() is held on
> +        * the output file on a different sb.
>          */
> -       ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -                                     flags);
> +       file_end_write(file_out);
> 
> +       ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> +                              min_t(size_t, len, MAX_RW_COUNT), 0);
>  done:
>         if (ret > 0) {
>                 fsnotify_access(file_in);
> @@ -1581,8 +1588,6 @@ ssize_t vfs_copy_file_range(struct file
> *file_in, loff_t pos_in,
>         inc_syscr(current);
>         inc_syscw(current);
> 
> -       file_end_write(file_out);
> -

This file_end_write() is also used by the paths using ->copy_file_range()
and ->remap_file_range() so you need to balance those...

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct()
  2023-11-30 13:18   ` Christian Brauner
@ 2023-11-30 13:37     ` Amir Goldstein
  0 siblings, 0 replies; 12+ messages in thread
From: Amir Goldstein @ 2023-11-30 13:37 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jeff Layton, Josef Bacik, Christoph Hellwig, Jan Kara,
	David Howells, Jens Axboe, Miklos Szeredi, Al Viro,
	linux-fsdevel

On Thu, Nov 30, 2023 at 3:18 PM Christian Brauner <brauner@kernel.org> wrote:
>
> On Wed, Nov 29, 2023 at 10:07:08PM +0200, Amir Goldstein wrote:
> > The new helper is meant to be called from context of ->copy_file_range()
> > methods instead of do_splice_direct().
> >
> > Currently, the only difference is that do_splice_copy_file_range() does
> > not take a splice flags argument and it asserts that file_start_write()
> > was called.
> >
> > Soon, do_splice_direct() will be called without file_start_write() held.
> >
> > Use the new helper from __ceph_copy_file_range(), that was incorrectly
> > passing the copy_file_range() flags argument as splice flags argument
> > to do_splice_direct(). the value of flags was 0, so no actual bug fix.
> >
> > Move the definition of both helpers to linux/splice.h.
> >
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > ---
> >  fs/ceph/file.c         |  9 ++---
> >  fs/read_write.c        |  6 ++--
> >  fs/splice.c            | 82 ++++++++++++++++++++++++++++++------------
> >  include/linux/fs.h     |  2 --
> >  include/linux/splice.h | 13 ++++---
> >  5 files changed, 75 insertions(+), 37 deletions(-)
> >
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 3b5aae29e944..7c2db78e2c6e 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -12,6 +12,7 @@
> >  #include <linux/falloc.h>
> >  #include <linux/iversion.h>
> >  #include <linux/ktime.h>
> > +#include <linux/splice.h>
> >
> >  #include "super.h"
> >  #include "mds_client.h"
> > @@ -3010,8 +3011,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
> >                * {read,write}_iter, which will get caps again.
> >                */
> >               put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
> > -             ret = do_splice_direct(src_file, &src_off, dst_file,
> > -                                    &dst_off, src_objlen, flags);
> > +             ret = do_splice_copy_file_range(src_file, &src_off, dst_file,
> > +                                             &dst_off, src_objlen);
> >               /* Abort on short copies or on error */
> >               if (ret < (long)src_objlen) {
> >                       doutc(cl, "Failed partial copy (%zd)\n", ret);
> > @@ -3065,8 +3066,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
> >        */
> >       if (len && (len < src_ci->i_layout.object_size)) {
> >               doutc(cl, "Final partial copy of %zu bytes\n", len);
> > -             bytes = do_splice_direct(src_file, &src_off, dst_file,
> > -                                      &dst_off, len, flags);
> > +             bytes = do_splice_copy_file_range(src_file, &src_off, dst_file,
> > +                                               &dst_off, len);
> >               if (bytes > 0)
> >                       ret += bytes;
> >               else
> > diff --git a/fs/read_write.c b/fs/read_write.c
> > index f791555fa246..555514cdad53 100644
> > --- a/fs/read_write.c
> > +++ b/fs/read_write.c
> > @@ -1423,10 +1423,8 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> >                               struct file *file_out, loff_t pos_out,
> >                               size_t len, unsigned int flags)
> >  {
>
> Hm, the low-level helper takes a @flags argument but it's completely
> ignored. I think that helper should remove it or it should check:
>
> if (flags)
>         return -EINVAL;
>

It's a good point.
The upstream code and in this v1, generic_copy_file_range() can actually
be called with flag COPY_FILE_SPLICE, but it is a mistake that
I fixed it in my branch for v2, so in v2 I can add this check.

> in case it's ever called from codepaths where @flags hasn't been
> sanitized imho.
>
> > -     lockdep_assert(file_write_started(file_out));
> > -
> > -     return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > -                             len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > +     return do_splice_copy_file_range(file_in, &pos_in, file_out, &pos_out,
> > +                             len > MAX_RW_COUNT ? MAX_RW_COUNT : len);
>
> clamp(len, 0, MAX_RW_COUNT)
>

It is a low level helper, so I don't want to worry about negative len value.
Already changed to min_t(size_t, len, MAX_RW_COUNT) in my branch.

Thanks!
Amir.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/2] Avert possible deadlock with splice() and fanotify
  2023-11-30 13:37     ` Jan Kara
@ 2023-11-30 13:46       ` Amir Goldstein
  0 siblings, 0 replies; 12+ messages in thread
From: Amir Goldstein @ 2023-11-30 13:46 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christian Brauner, Jeff Layton, Josef Bacik, Christoph Hellwig,
	David Howells, Jens Axboe, Miklos Szeredi, Al Viro,
	linux-fsdevel

On Thu, Nov 30, 2023 at 3:37 PM Jan Kara <jack@suse.cz> wrote:
>
> On Thu 30-11-23 12:07:23, Amir Goldstein wrote:
> > On Thu, Nov 30, 2023 at 10:32 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > >
> > > On Wed, Nov 29, 2023 at 10:07 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > > >
> > > > Christian,
> > > >
> > > > Josef has helped me see the light and figure out how to avoid the
> > > > possible deadlock, which involves:
> > > > - splice() from source file in a loop mounted fs to dest file in
> > > >   a host fs, where the loop image file is
> > > > - fsfreeze on host fs
> > > > - write to host fs in context of fanotify permission event handler
> > > >   (FAN_ACCESS_PERM) on the splice source file
> > > >
> > > > The first patch should not be changing any logic.
> > > > I only build tested the ceph patch, so hoping to get an
> > > > Acked-by/Tested-by from Jeff.
> > > >
> > > > The second patch rids us of the deadlock by not holding
> > > > file_start_write() while reading from splice source file.
> > > >
> > >
> > > OOPS, I missed another corner case:
> > > The COPY_FILE_SPLICE fallback of server-side-copy in nfsd/ksmbd
> > > needs to use the start-write-safe variant of do_splice_direct(),
> > > because in this case src and dst can be on any two fs.
> > > Expect an extra patch in v2.
> > >
> >
> > For the interested, see server-side-copy patch below.
> > Pushed to branch start-write-safe [1], but will wait with v2 until
> > I get comments on v1.
> >
> > Thanks,
> > Amir.
> >
> > [1] https://github.com/amir73il/linux/commits/start-write-safe
> >
> > Author: Amir Goldstein <amir73il@gmail.com>
> > Date:   Thu Nov 30 11:42:50 2023 +0200
> >
> >     fs: use do_splice_direct() for nfsd/ksmbd server-side-copy
> >
> >     nfsd/ksmbd call vfs_copy_file_range() with flag COPY_FILE_SPLICE to
> >     perform kernel copy between two files on any two filesystems.
> >
> >     Splicing input file, while holding file_start_write() on the output file
> >     which is on a different sb, posses a risk for fanotify related deadlocks.
> >
> >     We only need to call splice_file_range() from within the context of
> >     ->copy_file_range() filesystem methods with file_start_write() held.
> >
> >     To avoid the possible deadlocks, always use do_splice_direct() instead of
> >     splice_file_range() for the kernel copy fallback in vfs_copy_file_range()
> >     without holding file_start_write().
> >
> >     Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> >
> > diff --git a/fs/read_write.c b/fs/read_write.c
> > index 0bc99f38e623..12583e32aa6d 100644
> > --- a/fs/read_write.c
> > +++ b/fs/read_write.c
> > @@ -1565,11 +1565,18 @@ ssize_t vfs_copy_file_range(struct file
> > *file_in, loff_t pos_in,
> >          * and which filesystems do not, that will allow userspace tools to
> >          * make consistent desicions w.r.t using copy_file_range().
> >          *
> > -        * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
> > +        * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
> > +        * for server-side-copy between any two sb.
> > +        *
> > +        * In any case, we call do_splice_direct() and not splice_file_range(),
> > +        * without file_start_write() held, to avoid possible deadlocks related
> > +        * to splicing from input file, while file_start_write() is held on
> > +        * the output file on a different sb.
> >          */
> > -       ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> > -                                     flags);
> > +       file_end_write(file_out);
> >
> > +       ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > +                              min_t(size_t, len, MAX_RW_COUNT), 0);
> >  done:
> >         if (ret > 0) {
> >                 fsnotify_access(file_in);
> > @@ -1581,8 +1588,6 @@ ssize_t vfs_copy_file_range(struct file
> > *file_in, loff_t pos_in,
> >         inc_syscr(current);
> >         inc_syscw(current);
> >
> > -       file_end_write(file_out);
> > -
>
> This file_end_write() is also used by the paths using ->copy_file_range()
> and ->remap_file_range() so you need to balance those...

You're right! already found that bug and fixed in my branch:

diff --git a/fs/read_write.c b/fs/read_write.c
index 0bc99f38e623..e0c2c1b5962b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1421,6 +1421,10 @@ ssize_t generic_copy_file_range(struct file
*file_in, loff_t pos_in,
                                struct file *file_out, loff_t pos_out,
                                size_t len, unsigned int flags)
 {
+       /* May only be called from within ->copy_file_range() methods */
+       if (WARN_ON_ONCE(flags))
+               return -EINVAL;
+
        return splice_file_range(file_in, &pos_in, file_out, &pos_out,
                                 min_t(size_t, len, MAX_RW_COUNT));
 }
@@ -1541,19 +1545,22 @@ ssize_t vfs_copy_file_range(struct file
*file_in, loff_t pos_in,
                ret = file_out->f_op->copy_file_range(file_in, pos_in,
                                                      file_out, pos_out,
                                                      len, flags);
-               goto done;
-       }
-
-       if (!splice && file_in->f_op->remap_file_range &&
-           file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
+       } else if (!splice && file_in->f_op->remap_file_range &&
+                  file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
                ret = file_in->f_op->remap_file_range(file_in, pos_in,
                                file_out, pos_out,
                                min_t(loff_t, MAX_RW_COUNT, len),
                                REMAP_FILE_CAN_SHORTEN);
-               if (ret > 0)
-                       goto done;
+               /* fallback to splice */
+               if (ret <= 0)
+                       splice = true;
        }

+       file_end_write(file_out);
+
+       if (!splice)
+               goto done;
+
        /*
         * We can get here for same sb copy of filesystems that do not implement
         * ->copy_file_range() in case filesystem does not support clone or in
@@ -1565,11 +1572,16 @@ ssize_t vfs_copy_file_range(struct file
*file_in, loff_t pos_in,
         * and which filesystems do not, that will allow userspace tools to
         * make consistent desicions w.r.t using copy_file_range().
         *
-        * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
+        * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
+        * for server-side-copy between any two sb.
+        *
+        * In any case, we call do_splice_direct() and not splice_file_range(),
+        * without file_start_write() held, to avoid possible deadlocks related
+        * to splicing from input file, while file_start_write() is held on
+        * the output file on a different sb.
         */
-       ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-                                     flags);
-
+       ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+                              min_t(size_t, len, MAX_RW_COUNT), 0);
 done:
        if (ret > 0) {
                fsnotify_access(file_in);
@@ -1581,8 +1593,6 @@ ssize_t vfs_copy_file_range(struct file
*file_in, loff_t pos_in,
        inc_syscr(current);
        inc_syscw(current);

-       file_end_write(file_out);
-
        return ret;
 }
 EXPORT_SYMBOL(vfs_copy_file_range);

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2023-11-30 13:47 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-29 20:07 [PATCH 0/2] Avert possible deadlock with splice() and fanotify Amir Goldstein
2023-11-29 20:07 ` [PATCH 1/2] fs: fork do_splice_copy_file_range() from do_splice_direct() Amir Goldstein
2023-11-30 10:09   ` Amir Goldstein
2023-11-30 13:30     ` Jan Kara
2023-11-30 13:18   ` Christian Brauner
2023-11-30 13:37     ` Amir Goldstein
2023-11-29 20:07 ` [PATCH 2/2] fs: move file_start_write() into direct_splice_actor() Amir Goldstein
2023-11-30 13:32   ` Jan Kara
2023-11-30  8:32 ` [PATCH 0/2] Avert possible deadlock with splice() and fanotify Amir Goldstein
2023-11-30 10:07   ` Amir Goldstein
2023-11-30 13:37     ` Jan Kara
2023-11-30 13:46       ` Amir Goldstein

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.