ceph-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] vfs: prevent copy_file_range to copy across devices
       [not found] <CAOQ4uxiFGjdvX2-zh5o46pn7RZhvbGHH0wpzLPuPOom91FwWeQ@mail.gmail.com>
@ 2021-02-15 15:43 ` Luis Henriques
  2021-02-15 16:02   ` Trond Myklebust
                     ` (3 more replies)
  0 siblings, 4 replies; 93+ messages in thread
From: Luis Henriques @ 2021-02-15 15:43 UTC (permalink / raw)
  To: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano
  Cc: ceph-devel, linux-kernel, linux-cifs, samba-technical,
	linux-fsdevel, linux-nfs, Luis Henriques

Nicolas Boichat reported an issue when trying to use the copy_file_range
syscall on a tracefs file.  It failed silently because the file content is
generated on-the-fly (reporting a size of zero) and copy_file_range needs
to know in advance how much data is present.

This commit restores the cross-fs restrictions that existed prior to
5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") and
removes generic_copy_file_range() calls from ceph, cifs, fuse, and nfs.

Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
Cc: Nicolas Boichat <drinkcat@chromium.org>
Signed-off-by: Luis Henriques <lhenriques@suse.de>
---
Changes since v1 (after Amir's review)
- restored do_copy_file_range() helper
- return -EOPNOTSUPP if fs doesn't implement CFR
- updated commit description

 fs/ceph/file.c     | 21 +++-----------------
 fs/cifs/cifsfs.c   |  3 ---
 fs/fuse/file.c     | 21 +++-----------------
 fs/nfs/nfs4file.c  | 20 +++----------------
 fs/read_write.c    | 49 ++++++++++------------------------------------
 include/linux/fs.h |  3 ---
 6 files changed, 19 insertions(+), 98 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 209535d5b8d3..639bd7bfaea9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2261,9 +2261,9 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
 	return bytes;
 }
 
-static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
-				      struct file *dst_file, loff_t dst_off,
-				      size_t len, unsigned int flags)
+static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
+				    struct file *dst_file, loff_t dst_off,
+				    size_t len, unsigned int flags)
 {
 	struct inode *src_inode = file_inode(src_file);
 	struct inode *dst_inode = file_inode(dst_file);
@@ -2456,21 +2456,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
 	return ret;
 }
 
-static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
-				    struct file *dst_file, loff_t dst_off,
-				    size_t len, unsigned int flags)
-{
-	ssize_t ret;
-
-	ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
-				     len, flags);
-
-	if (ret == -EOPNOTSUPP || ret == -EXDEV)
-		ret = generic_copy_file_range(src_file, src_off, dst_file,
-					      dst_off, len, flags);
-	return ret;
-}
-
 const struct file_operations ceph_file_fops = {
 	.open = ceph_open,
 	.release = ceph_release,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ab883e84e116..7aa3d20f21c0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1229,9 +1229,6 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
 					len, flags);
 	free_xid(xid);
 
-	if (rc == -EOPNOTSUPP || rc == -EXDEV)
-		rc = generic_copy_file_range(src_file, off, dst_file,
-					     destoff, len, flags);
 	return rc;
 }
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8cccecb55fb8..0dd703278e49 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3330,9 +3330,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	return err;
 }
 
-static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
-				      struct file *file_out, loff_t pos_out,
-				      size_t len, unsigned int flags)
+static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
+				    struct file *file_out, loff_t pos_out,
+				    size_t len, unsigned int flags)
 {
 	struct fuse_file *ff_in = file_in->private_data;
 	struct fuse_file *ff_out = file_out->private_data;
@@ -3439,21 +3439,6 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 	return err;
 }
 
-static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
-				    struct file *dst_file, loff_t dst_off,
-				    size_t len, unsigned int flags)
-{
-	ssize_t ret;
-
-	ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
-				     len, flags);
-
-	if (ret == -EOPNOTSUPP || ret == -EXDEV)
-		ret = generic_copy_file_range(src_file, src_off, dst_file,
-					      dst_off, len, flags);
-	return ret;
-}
-
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read_iter	= fuse_file_read_iter,
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 57b3821d975a..60998209e310 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -133,9 +133,9 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
 }
 
 #ifdef CONFIG_NFS_V4_2
-static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
-				      struct file *file_out, loff_t pos_out,
-				      size_t count, unsigned int flags)
+static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+				    struct file *file_out, loff_t pos_out,
+				    size_t count, unsigned int flags)
 {
 	struct nfs42_copy_notify_res *cn_resp = NULL;
 	struct nl4_server *nss = NULL;
@@ -189,20 +189,6 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
 	return ret;
 }
 
-static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
-				    struct file *file_out, loff_t pos_out,
-				    size_t count, unsigned int flags)
-{
-	ssize_t ret;
-
-	ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count,
-				     flags);
-	if (ret == -EOPNOTSUPP || ret == -EXDEV)
-		ret = generic_copy_file_range(file_in, pos_in, file_out,
-					      pos_out, count, flags);
-	return ret;
-}
-
 static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
 {
 	loff_t ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..b217cd62ae0d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1358,40 +1358,12 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
 }
 #endif
 
-/**
- * generic_copy_file_range - copy data between two files
- * @file_in:	file structure to read from
- * @pos_in:	file offset to read from
- * @file_out:	file structure to write data to
- * @pos_out:	file offset to write data to
- * @len:	amount of data to copy
- * @flags:	copy flags
- *
- * This is a generic filesystem helper to copy data from one file to another.
- * It has no constraints on the source or destination file owners - the files
- * can belong to different superblocks and different filesystem types. Short
- * copies are allowed.
- *
- * This should be called from the @file_out filesystem, as per the
- * ->copy_file_range() method.
- *
- * Returns the number of bytes copied or a negative error indicating the
- * failure.
- */
-
-ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
-				struct file *file_out, loff_t pos_out,
-				size_t len, unsigned int flags)
-{
-	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
-				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
-}
-EXPORT_SYMBOL(generic_copy_file_range);
-
 static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
 				  struct file *file_out, loff_t pos_out,
 				  size_t len, unsigned int flags)
 {
+	ssize_t ret = -EXDEV;
+
 	/*
 	 * Although we now allow filesystems to handle cross sb copy, passing
 	 * a file of the wrong filesystem type to filesystem driver can result
@@ -1400,14 +1372,14 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
 	 * several different file_system_type structures, but they all end up
 	 * using the same ->copy_file_range() function pointer.
 	 */
-	if (file_out->f_op->copy_file_range &&
-	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
-		return file_out->f_op->copy_file_range(file_in, pos_in,
-						       file_out, pos_out,
-						       len, flags);
+	if (!file_out->f_op->copy_file_range)
+		ret = -EOPNOTSUPP;
+	else if (file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
+		ret = file_out->f_op->copy_file_range(file_in, pos_in,
+						      file_out, pos_out,
+						      len, flags);
 
-	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				       flags);
+	return ret;
 }
 
 /*
@@ -1514,8 +1486,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	}
 
 	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				flags);
-	WARN_ON_ONCE(ret == -EOPNOTSUPP);
+				 flags);
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..3aaf627be409 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1910,9 +1910,6 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
 				   loff_t, size_t, unsigned int);
-extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
-				       struct file *file_out, loff_t pos_out,
-				       size_t len, unsigned int flags);
 extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 					 struct file *file_out, loff_t pos_out,
 					 loff_t *count,

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-15 15:43 ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Luis Henriques
@ 2021-02-15 16:02   ` Trond Myklebust
  2021-02-16  0:25     ` Steve French
  2021-02-15 16:34   ` Amir Goldstein
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 93+ messages in thread
From: Trond Myklebust @ 2021-02-15 16:02 UTC (permalink / raw)
  To: drinkcat, anna.schumaker, iant, gregkh, dchinner, llozano,
	lhenriques, sfrench, darrick.wong, jlayton, amir73il, viro,
	miklos
  Cc: samba-technical, ceph-devel, linux-fsdevel, linux-kernel,
	linux-nfs, linux-cifs

On Mon, 2021-02-15 at 15:43 +0000, Luis Henriques wrote:
> Nicolas Boichat reported an issue when trying to use the
> copy_file_range
> syscall on a tracefs file.  It failed silently because the file
> content is
> generated on-the-fly (reporting a size of zero) and copy_file_range
> needs
> to know in advance how much data is present.

That explanation makes no sense whatsoever. copy_file_range is a non-
atomic operation and so the file can change while being copied. Any
determination of 'how much data is present' that is made in advance
would therefore be a flaw in the copy process being used (i.e.
do_splice_direct()). Does sendfile() also exhibit the same issue?


-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-15 15:43 ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Luis Henriques
  2021-02-15 16:02   ` Trond Myklebust
@ 2021-02-15 16:34   ` Amir Goldstein
  2021-02-15 16:53     ` Trond Myklebust
  2021-02-17  4:45   ` Nicolas Boichat
  2021-02-18  7:42   ` Christoph Hellwig
  3 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-15 16:34 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Jeff Layton, Steve French, Miklos Szeredi, Trond Myklebust,
	Anna Schumaker, Alexander Viro, Darrick J. Wong, Dave Chinner,
	Greg KH, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	ceph-devel, linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List

On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> Nicolas Boichat reported an issue when trying to use the copy_file_range
> syscall on a tracefs file.  It failed silently because the file content is
> generated on-the-fly (reporting a size of zero) and copy_file_range needs
> to know in advance how much data is present.
>
> This commit restores the cross-fs restrictions that existed prior to
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") and
> removes generic_copy_file_range() calls from ceph, cifs, fuse, and nfs.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Cc: Nicolas Boichat <drinkcat@chromium.org>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>

Code looks ok.
You may add:

Reviewed-by: Amir Goldstein <amir73il@gmail.com>

I agree with Trond that the first paragraph of the commit message could
be improved.
The purpose of this change is to fix the change of behavior that
caused the regression.

Before v5.3, behavior was -EXDEV and userspace could fall back to read.
After v5.3, behavior is zero size copy.

It does not matter so much what makes sense for CFR to do in this
case (generic cross-fs copy).  What matters is that nobody asked for
this change and that it caused problems.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-15 16:34   ` Amir Goldstein
@ 2021-02-15 16:53     ` Trond Myklebust
  2021-02-15 17:24       ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Trond Myklebust @ 2021-02-15 16:53 UTC (permalink / raw)
  To: lhenriques, amir73il
  Cc: samba-technical, drinkcat, iant, linux-cifs, darrick.wong,
	linux-kernel, jlayton, anna.schumaker, llozano, miklos,
	linux-nfs, viro, dchinner, linux-fsdevel, gregkh, sfrench,
	ceph-devel

On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
> On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <lhenriques@suse.de>
> wrote:
> > 
> > Nicolas Boichat reported an issue when trying to use the
> > copy_file_range
> > syscall on a tracefs file.  It failed silently because the file
> > content is
> > generated on-the-fly (reporting a size of zero) and copy_file_range
> > needs
> > to know in advance how much data is present.
> > 
> > This commit restores the cross-fs restrictions that existed prior
> > to
> > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > and
> > removes generic_copy_file_range() calls from ceph, cifs, fuse, and
> > nfs.
> > 
> > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > devices")
> > Link: 
> > https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > Cc: Nicolas Boichat <drinkcat@chromium.org>
> > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> 
> Code looks ok.
> You may add:
> 
> Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> 
> I agree with Trond that the first paragraph of the commit message
> could
> be improved.
> The purpose of this change is to fix the change of behavior that
> caused the regression.
> 
> Before v5.3, behavior was -EXDEV and userspace could fallback to
> read.
> After v5.3, behavior is zero size copy.
> 
> It does not matter so much what makes sense for CFR to do in this
> case (generic cross-fs copy).  What matters is that nobody asked for
> this change and that it caused problems.
> 

No. I'm saying that this patch should be NACKed unless there is a real
explanation for why we give a crap about this tracefs corner case and why
it can't be fixed.

There are plenty of reasons why copy offload across filesystems makes
sense, and particularly when you're doing NAS. Clone just doesn't cut
it when it comes to disaster recovery (whereas backup to a different
storage unit does). If the client has to do the copy, then you're
effectively doubling the load on the server, and you're adding
potentially unnecessary network traffic (or at the very least you are
doubling that traffic).

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-15 16:53     ` Trond Myklebust
@ 2021-02-15 17:24       ` Amir Goldstein
  2021-02-15 18:57         ` Trond Myklebust
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-15 17:24 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: lhenriques, samba-technical, drinkcat, iant, linux-cifs,
	darrick.wong, linux-kernel, jlayton, anna.schumaker, llozano,
	miklos, linux-nfs, viro, dchinner, linux-fsdevel, gregkh,
	sfrench, ceph-devel

On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>
> On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
> > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <lhenriques@suse.de>
> > wrote:
> > >
> > > Nicolas Boichat reported an issue when trying to use the
> > > copy_file_range
> > > syscall on a tracefs file.  It failed silently because the file
> > > content is
> > > generated on-the-fly (reporting a size of zero) and copy_file_range
> > > needs
> > > to know in advance how much data is present.
> > >
> > > This commit restores the cross-fs restrictions that existed prior
> > > to
> > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > > and
> > > removes generic_copy_file_range() calls from ceph, cifs, fuse, and
> > > nfs.
> > >
> > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > devices")
> > > Link:
> > > https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > > Cc: Nicolas Boichat <drinkcat@chromium.org>
> > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> >
> > Code looks ok.
> > You may add:
> >
> > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> >
> > I agree with Trond that the first paragraph of the commit message
> > could
> > be improved.
> > The purpose of this change is to fix the change of behavior that
> > caused the regression.
> >
> > Before v5.3, behavior was -EXDEV and userspace could fallback to
> > read.
> > After v5.3, behavior is zero size copy.
> >
> > It does not matter so much what makes sense for CFR to do in this
> > case (generic cross-fs copy).  What matters is that nobody asked for
> > this change and that it caused problems.
> >
>
> No. I'm saying that this patch should be NACKed unless there is a real
> explanation for why we give crap about this tracefs corner case and why
> it can't be fixed.
>
> There are plenty of reasons why copy offload across filesystems makes
> sense, and particularly when you're doing NAS. Clone just doesn't cut
> it when it comes to disaster recovery (whereas backup to a different
> storage unit does). If the client has to do the copy, then you're
> effectively doubling the load on the server, and you're adding
> potentially unnecessary network traffic (or at the very least you are
> doubling that traffic).
>

I don't understand the use case you are describing.

Which filesystem types are you talking about for source and target
of copy_file_range()?

To be clear, the original change was done to support NFS/CIFS server-side
copy and those should not be affected by this change.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-15 17:24       ` Amir Goldstein
@ 2021-02-15 18:57         ` Trond Myklebust
  2021-02-15 19:43           ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Trond Myklebust @ 2021-02-15 18:57 UTC (permalink / raw)
  To: amir73il
  Cc: samba-technical, drinkcat, iant, linux-cifs, darrick.wong,
	lhenriques, linux-kernel, jlayton, anna.schumaker, llozano,
	linux-nfs, miklos, viro, dchinner, linux-fsdevel, gregkh,
	sfrench, ceph-devel

On Mon, 2021-02-15 at 19:24 +0200, Amir Goldstein wrote:
> On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <
> trondmy@hammerspace.com> wrote:
> > 
> > On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
> > > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <
> > > lhenriques@suse.de>
> > > wrote:
> > > > 
> > > > Nicolas Boichat reported an issue when trying to use the
> > > > copy_file_range
> > > > syscall on a tracefs file.  It failed silently because the file
> > > > content is
> > > > generated on-the-fly (reporting a size of zero) and
> > > > copy_file_range
> > > > needs
> > > > to know in advance how much data is present.
> > > > 
> > > > This commit restores the cross-fs restrictions that existed
> > > > prior
> > > > to
> > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > > devices")
> > > > and
> > > > removes generic_copy_file_range() calls from ceph, cifs, fuse,
> > > > and
> > > > nfs.
> > > > 
> > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > > devices")
> > > > Link:
> > > > https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > > > Cc: Nicolas Boichat <drinkcat@chromium.org>
> > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > > 
> > > Code looks ok.
> > > You may add:
> > > 
> > > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> > > 
> > > I agree with Trond that the first paragraph of the commit message
> > > could
> > > be improved.
> > > The purpose of this change is to fix the change of behavior that
> > > caused the regression.
> > > 
> > > Before v5.3, behavior was -EXDEV and userspace could fallback to
> > > read.
> > > After v5.3, behavior is zero size copy.
> > > 
> > > It does not matter so much what makes sense for CFR to do in this
> > > case (generic cross-fs copy).  What matters is that nobody asked
> > > for
> > > this change and that it caused problems.
> > > 
> > 
> > No. I'm saying that this patch should be NACKed unless there is a
> > real
> > explanation for why we give crap about this tracefs corner case and
> > why
> > it can't be fixed.
> > 
> > There are plenty of reasons why copy offload across filesystems
> > makes
> > sense, and particularly when you're doing NAS. Clone just doesn't
> > cut
> > it when it comes to disaster recovery (whereas backup to a
> > different
> > storage unit does). If the client has to do the copy, then you're
> > effectively doubling the load on the server, and you're adding
> > potentially unnecessary network traffic (or at the very least you
> > are
> > doubling that traffic).
> > 
> 
> I don't understand the use case you are describing.
> 
> Which filesystem types are you talking about for source and target
> of copy_file_range()?
> 
> To be clear, the original change was done to support NFS/CIFS server-
> side
> copy and those should not be affected by this change.
> 

That is incorrect: 

ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file
*dst,
 u64 dst_pos, u64 count)
{

 /*
 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
 * thread and client rpc slot. The choice of 4MB is somewhat
 * arbitrary. We might instead base this on r/wsize, or make it
 * tunable, or use a time instead of a byte limit, or implement
 * asynchronous copy. In theory a client could also recognize a
 * limit like this and pipeline multiple COPY requests.
 */
 count = min_t(u64, count, 1 << 22);
 return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
}

You are now explicitly changing the behaviour of knfsd when the source
and destination filesystem differ.

For one thing, you are disallowing the NFSv4.2 copy offload use case of
copying from a local filesystem to a remote NFS server. However you are
also disallowing the copy from, say, an XFS formatted partition to an
ext4 partition.

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-15 18:57         ` Trond Myklebust
@ 2021-02-15 19:43           ` Amir Goldstein
  2021-02-16 11:17             ` Luis Henriques
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-15 19:43 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: samba-technical, drinkcat, iant, linux-cifs, darrick.wong,
	lhenriques, linux-kernel, jlayton, anna.schumaker, llozano,
	linux-nfs, miklos, viro, dchinner, linux-fsdevel, gregkh,
	sfrench, ceph-devel

On Mon, Feb 15, 2021 at 8:57 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>
> On Mon, 2021-02-15 at 19:24 +0200, Amir Goldstein wrote:
> > On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <
> > trondmy@hammerspace.com> wrote:
> > >
> > > On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
> > > > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <
> > > > lhenriques@suse.de>
> > > > wrote:
> > > > >
> > > > > Nicolas Boichat reported an issue when trying to use the
> > > > > copy_file_range
> > > > > syscall on a tracefs file.  It failed silently because the file
> > > > > content is
> > > > > generated on-the-fly (reporting a size of zero) and
> > > > > copy_file_range
> > > > > needs
> > > > > to know in advance how much data is present.
> > > > >
> > > > > This commit restores the cross-fs restrictions that existed
> > > > > prior
> > > > > to
> > > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > > > devices")
> > > > > and
> > > > > removes generic_copy_file_range() calls from ceph, cifs, fuse,
> > > > > and
> > > > > nfs.
> > > > >
> > > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > > > devices")
> > > > > Link:
> > > > > https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > > > > Cc: Nicolas Boichat <drinkcat@chromium.org>
> > > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > > >
> > > > Code looks ok.
> > > > You may add:
> > > >
> > > > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> > > >
> > > > I agree with Trond that the first paragraph of the commit message
> > > > could
> > > > be improved.
> > > > The purpose of this change is to fix the change of behavior that
> > > > caused the regression.
> > > >
> > > > Before v5.3, behavior was -EXDEV and userspace could fallback to
> > > > read.
> > > > After v5.3, behavior is zero size copy.
> > > >
> > > > It does not matter so much what makes sense for CFR to do in this
> > > > case (generic cross-fs copy).  What matters is that nobody asked
> > > > for
> > > > this change and that it caused problems.
> > > >
> > >
> > > No. I'm saying that this patch should be NACKed unless there is a
> > > real
> > > explanation for why we give crap about this tracefs corner case and
> > > why
> > > it can't be fixed.
> > >
> > > There are plenty of reasons why copy offload across filesystems
> > > makes
> > > sense, and particularly when you're doing NAS. Clone just doesn't
> > > cut
> > > it when it comes to disaster recovery (whereas backup to a
> > > different
> > > storage unit does). If the client has to do the copy, then you're
> > > effectively doubling the load on the server, and you're adding
> > > potentially unnecessary network traffic (or at the very least you
> > > are
> > > doubling that traffic).
> > >
> >
> > I don't understand the use case you are describing.
> >
> > Which filesystem types are you talking about for source and target
> > of copy_file_range()?
> >
> > To be clear, the original change was done to support NFS/CIFS server-
> > side
> > copy and those should not be affected by this change.
> >
>
> That is incorrect:
>
> ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file
> *dst,
>  u64 dst_pos, u64 count)
> {
>
>  /*
>  * Limit copy to 4MB to prevent indefinitely blocking an nfsd
>  * thread and client rpc slot. The choice of 4MB is somewhat
>  * arbitrary. We might instead base this on r/wsize, or make it
>  * tunable, or use a time instead of a byte limit, or implement
>  * asynchronous copy. In theory a client could also recognize a
>  * limit like this and pipeline multiple COPY requests.
>  */
>  count = min_t(u64, count, 1 << 22);
>  return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> }
>
> You are now explicitly changing the behaviour of knfsd when the source
> and destination filesystem differ.
>
> For one thing, you are disallowing the NFSv4.2 copy offload use case of
> copying from a local filesystem to a remote NFS server. However you are
> also disallowing the copy from, say, an XFS formatted partition to an
> ext4 partition.
>

Got it.
This is easy to solve with a flag COPY_FILE_SPLICE (or something)
that is internal to kernel users.

FWIW, you may want to look at the loop in ovl_copy_up_data()
for improvements to nfsd_copy_file_range().

We can move the check out to copy_file_range syscall:

        if (flags != 0)
                return -EINVAL;

Leave the fallback from all filesystems and check for the
COPY_FILE_SPLICE flag inside generic_copy_file_range().

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-15 16:02   ` Trond Myklebust
@ 2021-02-16  0:25     ` Steve French
  0 siblings, 0 replies; 93+ messages in thread
From: Steve French @ 2021-02-16  0:25 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: drinkcat, anna.schumaker, iant, gregkh, dchinner, llozano,
	lhenriques, sfrench, darrick.wong, jlayton, amir73il, viro,
	miklos, samba-technical, ceph-devel, linux-fsdevel, linux-kernel,
	linux-nfs, linux-cifs

On Mon, Feb 15, 2021 at 10:11 AM Trond Myklebust
<trondmy@hammerspace.com> wrote:
>
> On Mon, 2021-02-15 at 15:43 +0000, Luis Henriques wrote:
> > Nicolas Boichat reported an issue when trying to use the
> > copy_file_range
> > syscall on a tracefs file.  It failed silently because the file
> > content is
> > generated on-the-fly (reporting a size of zero) and copy_file_range
> > needs
> > to know in advance how much data is present.
>
> That explanation makes no sense whatsoever. copy_file_range is a non-
> atomic operation and so the file can change while being copied. Any
> determination of 'how much data is present' that is made in advance
> would therefore be a flaw in the copy process being used (i.e.
> do_splice_direct()). Does sendfile() also 'issue' in the same way?

I agree that the explanation of the tracefs problem motivating this
patch doesn't make sense.


-- 
Thanks,

Steve

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-15 19:43           ` Amir Goldstein
@ 2021-02-16 11:17             ` Luis Henriques
  2021-02-16 11:28               ` gregkh
  2021-02-16 13:51               ` Amir Goldstein
  0 siblings, 2 replies; 93+ messages in thread
From: Luis Henriques @ 2021-02-16 11:17 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Trond Myklebust, samba-technical, drinkcat, iant, linux-cifs,
	darrick.wong, linux-kernel, jlayton, anna.schumaker, llozano,
	linux-nfs, miklos, viro, dchinner, linux-fsdevel, gregkh,
	sfrench, ceph-devel

Amir Goldstein <amir73il@gmail.com> writes:

> On Mon, Feb 15, 2021 at 8:57 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>>
>> On Mon, 2021-02-15 at 19:24 +0200, Amir Goldstein wrote:
>> > On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <
>> > trondmy@hammerspace.com> wrote:
>> > >
>> > > On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
>> > > > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <
>> > > > lhenriques@suse.de>
>> > > > wrote:
>> > > > >
>> > > > > Nicolas Boichat reported an issue when trying to use the
>> > > > > copy_file_range
>> > > > > syscall on a tracefs file.  It failed silently because the file
>> > > > > content is
>> > > > > generated on-the-fly (reporting a size of zero) and
>> > > > > copy_file_range
>> > > > > needs
>> > > > > to know in advance how much data is present.
>> > > > >
>> > > > > This commit restores the cross-fs restrictions that existed
>> > > > > prior
>> > > > > to
>> > > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>> > > > > devices")
>> > > > > and
>> > > > > removes generic_copy_file_range() calls from ceph, cifs, fuse,
>> > > > > and
>> > > > > nfs.
>> > > > >
>> > > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>> > > > > devices")
>> > > > > Link:
>> > > > > https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
>> > > > > Cc: Nicolas Boichat <drinkcat@chromium.org>
>> > > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
>> > > >
>> > > > Code looks ok.
>> > > > You may add:
>> > > >
>> > > > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
>> > > >
>> > > > I agree with Trond that the first paragraph of the commit message
>> > > > could
>> > > > be improved.
>> > > > The purpose of this change is to fix the change of behavior that
>> > > > caused the regression.
>> > > >
>> > > > Before v5.3, behavior was -EXDEV and userspace could fallback to
>> > > > read.
>> > > > After v5.3, behavior is zero size copy.
>> > > >
>> > > > It does not matter so much what makes sense for CFR to do in this
>> > > > case (generic cross-fs copy).  What matters is that nobody asked
>> > > > for
>> > > > this change and that it caused problems.
>> > > >
>> > >
>> > > No. I'm saying that this patch should be NACKed unless there is a
>> > > real
>> > > explanation for why we give crap about this tracefs corner case and
>> > > why
>> > > it can't be fixed.
>> > >
>> > > There are plenty of reasons why copy offload across filesystems
>> > > makes
>> > > sense, and particularly when you're doing NAS. Clone just doesn't
>> > > cut
>> > > it when it comes to disaster recovery (whereas backup to a
>> > > different
>> > > storage unit does). If the client has to do the copy, then you're
>> > > effectively doubling the load on the server, and you're adding
>> > > potentially unnecessary network traffic (or at the very least you
>> > > are
>> > > doubling that traffic).
>> > >
>> >
>> > I don't understand the use case you are describing.
>> >
>> > Which filesystem types are you talking about for source and target
>> > of copy_file_range()?
>> >
>> > To be clear, the original change was done to support NFS/CIFS server-
>> > side
>> > copy and those should not be affected by this change.
>> >
>>
>> That is incorrect:
>>
>> ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file
>> *dst,
>>  u64 dst_pos, u64 count)
>> {
>>
>>  /*
>>  * Limit copy to 4MB to prevent indefinitely blocking an nfsd
>>  * thread and client rpc slot. The choice of 4MB is somewhat
>>  * arbitrary. We might instead base this on r/wsize, or make it
>>  * tunable, or use a time instead of a byte limit, or implement
>>  * asynchronous copy. In theory a client could also recognize a
>>  * limit like this and pipeline multiple COPY requests.
>>  */
>>  count = min_t(u64, count, 1 << 22);
>>  return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>> }
>>
>> You are now explicitly changing the behaviour of knfsd when the source
>> and destination filesystem differ.
>>
>> For one thing, you are disallowing the NFSv4.2 copy offload use case of
>> copying from a local filesystem to a remote NFS server. However you are
>> also disallowing the copy from, say, an XFS formatted partition to an
>> ext4 partition.
>>
>
> Got it.

Ugh.  And I guess overlayfs may have a similar problem.

> This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> is internal to kernel users.
>
> FWIW, you may want to look at the loop in ovl_copy_up_data()
> for improvements to nfsd_copy_file_range().
>
> We can move the check out to copy_file_range syscall:
>
>         if (flags != 0)
>                 return -EINVAL;
>
> Leave the fallback from all filesystems and check for the
> COPY_FILE_SPLICE flag inside generic_copy_file_range().

Ok, the diff below is just to make sure I understood your suggestion.

The patch will also need to:

 - change nfs and overlayfs calls to vfs_copy_file_range() so that they
   use the new flag.

 - check flags in generic_copy_file_checks() to make sure only valid flags
   are used (COPY_FILE_SPLICE at the moment).

Also, where should this flag be defined?  include/uapi/linux/fs.h?

Cheers,
-- 
Luis

diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..341d315d2a96 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				size_t len, unsigned int flags)
 {
+	if (!(flags & COPY_FILE_SPLICE)) {
+		if (!file_out->f_op->copy_file_range)
+			return -EOPNOTSUPP;
+		else if (file_out->f_op->copy_file_range !=
+			 file_in->f_op->copy_file_range)
+			return -EXDEV;
+	}
 	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
 				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
 }
@@ -1474,9 +1481,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 {
 	ssize_t ret;
 
-	if (flags != 0)
-		return -EINVAL;
-
 	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
 				       flags);
 	if (unlikely(ret))

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 11:17             ` Luis Henriques
@ 2021-02-16 11:28               ` gregkh
  2021-02-16 12:01                 ` Luis Henriques
  2021-02-16 13:51               ` Amir Goldstein
  1 sibling, 1 reply; 93+ messages in thread
From: gregkh @ 2021-02-16 11:28 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Trond Myklebust, samba-technical, drinkcat, iant,
	linux-cifs, darrick.wong, linux-kernel, jlayton, anna.schumaker,
	llozano, linux-nfs, miklos, viro, dchinner, linux-fsdevel,
	sfrench, ceph-devel

On Tue, Feb 16, 2021 at 11:17:34AM +0000, Luis Henriques wrote:
> Amir Goldstein <amir73il@gmail.com> writes:
> 
> > On Mon, Feb 15, 2021 at 8:57 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
> >>
> >> On Mon, 2021-02-15 at 19:24 +0200, Amir Goldstein wrote:
> >> > On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <
> >> > trondmy@hammerspace.com> wrote:
> >> > >
> >> > > On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
> >> > > > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <
> >> > > > lhenriques@suse.de>
> >> > > > wrote:
> >> > > > >
> >> > > > > Nicolas Boichat reported an issue when trying to use the
> >> > > > > copy_file_range
> >> > > > > syscall on a tracefs file.  It failed silently because the file
> >> > > > > content is
> >> > > > > generated on-the-fly (reporting a size of zero) and
> >> > > > > copy_file_range
> >> > > > > needs
> >> > > > > to know in advance how much data is present.
> >> > > > >
> >> > > > > This commit restores the cross-fs restrictions that existed
> >> > > > > prior
> >> > > > > to
> >> > > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> >> > > > > devices")
> >> > > > > and
> >> > > > > removes generic_copy_file_range() calls from ceph, cifs, fuse,
> >> > > > > and
> >> > > > > nfs.
> >> > > > >
> >> > > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> >> > > > > devices")
> >> > > > > Link:
> >> > > > > https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> >> > > > > Cc: Nicolas Boichat <drinkcat@chromium.org>
> >> > > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> >> > > >
> >> > > > Code looks ok.
> >> > > > You may add:
> >> > > >
> >> > > > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> >> > > >
> >> > > > I agree with Trond that the first paragraph of the commit message
> >> > > > could
> >> > > > be improved.
> >> > > > The purpose of this change is to fix the change of behavior that
> >> > > > caused the regression.
> >> > > >
> >> > > > Before v5.3, behavior was -EXDEV and userspace could fallback to
> >> > > > read.
> >> > > > After v5.3, behavior is zero size copy.
> >> > > >
> >> > > > It does not matter so much what makes sense for CFR to do in this
> >> > > > case (generic cross-fs copy).  What matters is that nobody asked
> >> > > > for
> >> > > > this change and that it caused problems.
> >> > > >
> >> > >
> >> > > No. I'm saying that this patch should be NACKed unless there is a
> >> > > real
> >> > > explanation for why we give crap about this tracefs corner case and
> >> > > why
> >> > > it can't be fixed.
> >> > >
> >> > > There are plenty of reasons why copy offload across filesystems
> >> > > makes
> >> > > sense, and particularly when you're doing NAS. Clone just doesn't
> >> > > cut
> >> > > it when it comes to disaster recovery (whereas backup to a
> >> > > different
> >> > > storage unit does). If the client has to do the copy, then you're
> >> > > effectively doubling the load on the server, and you're adding
> >> > > potentially unnecessary network traffic (or at the very least you
> >> > > are
> >> > > doubling that traffic).
> >> > >
> >> >
> >> > I don't understand the use case you are describing.
> >> >
> >> > Which filesystem types are you talking about for source and target
> >> > of copy_file_range()?
> >> >
> >> > To be clear, the original change was done to support NFS/CIFS server-
> >> > side
> >> > copy and those should not be affected by this change.
> >> >
> >>
> >> That is incorrect:
> >>
> >> ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file
> >> *dst,
> >>  u64 dst_pos, u64 count)
> >> {
> >>
> >>  /*
> >>  * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> >>  * thread and client rpc slot. The choice of 4MB is somewhat
> >>  * arbitrary. We might instead base this on r/wsize, or make it
> >>  * tunable, or use a time instead of a byte limit, or implement
> >>  * asynchronous copy. In theory a client could also recognize a
> >>  * limit like this and pipeline multiple COPY requests.
> >>  */
> >>  count = min_t(u64, count, 1 << 22);
> >>  return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> >> }
> >>
> >> You are now explicitly changing the behaviour of knfsd when the source
> >> and destination filesystem differ.
> >>
> >> For one thing, you are disallowing the NFSv4.2 copy offload use case of
> >> copying from a local filesystem to a remote NFS server. However you are
> >> also disallowing the copy from, say, an XFS formatted partition to an
> >> ext4 partition.
> >>
> >
> > Got it.
> 
> Ugh.  And I guess overlayfs may have a similar problem.
> 
> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > is internal to kernel users.
> >
> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > for improvements to nfsd_copy_file_range().
> >
> > We can move the check out to copy_file_range syscall:
> >
> >         if (flags != 0)
> >                 return -EINVAL;
> >
> > Leave the fallback from all filesystems and check for the
> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> 
> Ok, the diff below is just to make sure I understood your suggestion.
> 
> The patch will also need to:
> 
>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
>    use the new flag.
> 
>  - check flags in generic_copy_file_checks() to make sure only valid flags
>    are used (COPY_FILE_SPLICE at the moment).
> 
> Also, where should this flag be defined?  include/uapi/linux/fs.h?

Why would userspace want/need this flag?


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 11:28               ` gregkh
@ 2021-02-16 12:01                 ` Luis Henriques
  2021-02-16 12:08                   ` Greg KH
  0 siblings, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-16 12:01 UTC (permalink / raw)
  To: gregkh
  Cc: Amir Goldstein, Trond Myklebust, samba-technical, drinkcat, iant,
	linux-cifs, darrick.wong, linux-kernel, jlayton, anna.schumaker,
	llozano, linux-nfs, miklos, viro, dchinner, linux-fsdevel,
	sfrench, ceph-devel

"gregkh@linuxfoundation.org" <gregkh@linuxfoundation.org> writes:

> On Tue, Feb 16, 2021 at 11:17:34AM +0000, Luis Henriques wrote:
>> Amir Goldstein <amir73il@gmail.com> writes:
>> 
>> > On Mon, Feb 15, 2021 at 8:57 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>> >>
>> >> On Mon, 2021-02-15 at 19:24 +0200, Amir Goldstein wrote:
>> >> > On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <
>> >> > trondmy@hammerspace.com> wrote:
>> >> > >
>> >> > > On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
>> >> > > > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <
>> >> > > > lhenriques@suse.de>
>> >> > > > wrote:
>> >> > > > >
>> >> > > > > Nicolas Boichat reported an issue when trying to use the
>> >> > > > > copy_file_range
>> >> > > > > syscall on a tracefs file.  It failed silently because the file
>> >> > > > > content is
>> >> > > > > generated on-the-fly (reporting a size of zero) and
>> >> > > > > copy_file_range
>> >> > > > > needs
>> >> > > > > to know in advance how much data is present.
>> >> > > > >
>> >> > > > > This commit restores the cross-fs restrictions that existed
>> >> > > > > prior
>> >> > > > > to
>> >> > > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>> >> > > > > devices")
>> >> > > > > and
>> >> > > > > removes generic_copy_file_range() calls from ceph, cifs, fuse,
>> >> > > > > and
>> >> > > > > nfs.
>> >> > > > >
>> >> > > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>> >> > > > > devices")
>> >> > > > > Link:
>> >> > > > > https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
>> >> > > > > Cc: Nicolas Boichat <drinkcat@chromium.org>
>> >> > > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
>> >> > > >
>> >> > > > Code looks ok.
>> >> > > > You may add:
>> >> > > >
>> >> > > > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
>> >> > > >
>> >> > > > I agree with Trond that the first paragraph of the commit message
>> >> > > > could
>> >> > > > be improved.
>> >> > > > The purpose of this change is to fix the change of behavior that
>> >> > > > caused the regression.
>> >> > > >
>> >> > > > Before v5.3, behavior was -EXDEV and userspace could fallback to
>> >> > > > read.
>> >> > > > After v5.3, behavior is zero size copy.
>> >> > > >
>> >> > > > It does not matter so much what makes sense for CFR to do in this
>> >> > > > case (generic cross-fs copy).  What matters is that nobody asked
>> >> > > > for
>> >> > > > this change and that it caused problems.
>> >> > > >
>> >> > >
>> >> > > No. I'm saying that this patch should be NACKed unless there is a
>> >> > > real
>> >> > > explanation for why we give crap about this tracefs corner case and
>> >> > > why
>> >> > > it can't be fixed.
>> >> > >
>> >> > > There are plenty of reasons why copy offload across filesystems
>> >> > > makes
>> >> > > sense, and particularly when you're doing NAS. Clone just doesn't
>> >> > > cut
>> >> > > it when it comes to disaster recovery (whereas backup to a
>> >> > > different
>> >> > > storage unit does). If the client has to do the copy, then you're
>> >> > > effectively doubling the load on the server, and you're adding
>> >> > > potentially unnecessary network traffic (or at the very least you
>> >> > > are
>> >> > > doubling that traffic).
>> >> > >
>> >> >
>> >> > I don't understand the use case you are describing.
>> >> >
>> >> > Which filesystem types are you talking about for source and target
>> >> > of copy_file_range()?
>> >> >
>> >> > To be clear, the original change was done to support NFS/CIFS server-
>> >> > side
>> >> > copy and those should not be affected by this change.
>> >> >
>> >>
>> >> That is incorrect:
>> >>
>> >> ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file
>> >> *dst,
>> >>  u64 dst_pos, u64 count)
>> >> {
>> >>
>> >>  /*
>> >>  * Limit copy to 4MB to prevent indefinitely blocking an nfsd
>> >>  * thread and client rpc slot. The choice of 4MB is somewhat
>> >>  * arbitrary. We might instead base this on r/wsize, or make it
>> >>  * tunable, or use a time instead of a byte limit, or implement
>> >>  * asynchronous copy. In theory a client could also recognize a
>> >>  * limit like this and pipeline multiple COPY requests.
>> >>  */
>> >>  count = min_t(u64, count, 1 << 22);
>> >>  return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>> >> }
>> >>
>> >> You are now explicitly changing the behaviour of knfsd when the source
>> >> and destination filesystem differ.
>> >>
>> >> For one thing, you are disallowing the NFSv4.2 copy offload use case of
>> >> copying from a local filesystem to a remote NFS server. However you are
>> >> also disallowing the copy from, say, an XFS formatted partition to an
>> >> ext4 partition.
>> >>
>> >
>> > Got it.
>> 
>> Ugh.  And I guess overlayfs may have a similar problem.
>> 
>> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
>> > is internal to kernel users.
>> >
>> > FWIW, you may want to look at the loop in ovl_copy_up_data()
>> > for improvements to nfsd_copy_file_range().
>> >
>> > We can move the check out to copy_file_range syscall:
>> >
>> >         if (flags != 0)
>> >                 return -EINVAL;
>> >
>> > Leave the fallback from all filesystems and check for the
>> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
>> 
>> Ok, the diff below is just to make sure I understood your suggestion.
>> 
>> The patch will also need to:
>> 
>>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
>>    use the new flag.
>> 
>>  - check flags in generic_copy_file_checks() to make sure only valid flags
>>    are used (COPY_FILE_SPLICE at the moment).
>> 
>> Also, where should this flag be defined?  include/uapi/linux/fs.h?
>
> Why would userspace want/need this flag?

In fact, my question sort of implied yours :-)

What I wanted to know was whether we would like to allow userspace to
_explicitly_ revert to the current behaviour (i.e. use the flag to allow
cross-fs copies) or to continue to return -EINVAL to userspace if flags
are != 0 (in which case this check would need to move to the syscall
definition).

Cheers,
-- 
Luis

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 12:01                 ` Luis Henriques
@ 2021-02-16 12:08                   ` Greg KH
  0 siblings, 0 replies; 93+ messages in thread
From: Greg KH @ 2021-02-16 12:08 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Trond Myklebust, samba-technical, drinkcat, iant,
	linux-cifs, darrick.wong, linux-kernel, jlayton, anna.schumaker,
	llozano, linux-nfs, miklos, viro, dchinner, linux-fsdevel,
	sfrench, ceph-devel

On Tue, Feb 16, 2021 at 12:01:16PM +0000, Luis Henriques wrote:
> "gregkh@linuxfoundation.org" <gregkh@linuxfoundation.org> writes:
> 
> > On Tue, Feb 16, 2021 at 11:17:34AM +0000, Luis Henriques wrote:
> >> Amir Goldstein <amir73il@gmail.com> writes:
> >> 
> >> > On Mon, Feb 15, 2021 at 8:57 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
> >> >>
> >> >> On Mon, 2021-02-15 at 19:24 +0200, Amir Goldstein wrote:
> >> >> > On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <
> >> >> > trondmy@hammerspace.com> wrote:
> >> >> > >
> >> >> > > On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
> >> >> > > > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <
> >> >> > > > lhenriques@suse.de>
> >> >> > > > wrote:
> >> >> > > > >
> >> >> > > > > Nicolas Boichat reported an issue when trying to use the
> >> >> > > > > copy_file_range
> >> >> > > > > syscall on a tracefs file.  It failed silently because the file
> >> >> > > > > content is
> >> >> > > > > generated on-the-fly (reporting a size of zero) and
> >> >> > > > > copy_file_range
> >> >> > > > > needs
> >> >> > > > > to know in advance how much data is present.
> >> >> > > > >
> >> >> > > > > This commit restores the cross-fs restrictions that existed
> >> >> > > > > prior
> >> >> > > > > to
> >> >> > > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> >> >> > > > > devices")
> >> >> > > > > and
> >> >> > > > > removes generic_copy_file_range() calls from ceph, cifs, fuse,
> >> >> > > > > and
> >> >> > > > > nfs.
> >> >> > > > >
> >> >> > > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> >> >> > > > > devices")
> >> >> > > > > Link:
> >> >> > > > > https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> >> >> > > > > Cc: Nicolas Boichat <drinkcat@chromium.org>
> >> >> > > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> >> >> > > >
> >> >> > > > Code looks ok.
> >> >> > > > You may add:
> >> >> > > >
> >> >> > > > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> >> >> > > >
> >> >> > > > I agree with Trond that the first paragraph of the commit message
> >> >> > > > could
> >> >> > > > be improved.
> >> >> > > > The purpose of this change is to fix the change of behavior that
> >> >> > > > caused the regression.
> >> >> > > >
> >> >> > > > Before v5.3, behavior was -EXDEV and userspace could fallback to
> >> >> > > > read.
> >> >> > > > After v5.3, behavior is zero size copy.
> >> >> > > >
> >> >> > > > It does not matter so much what makes sense for CFR to do in this
> >> >> > > > case (generic cross-fs copy).  What matters is that nobody asked
> >> >> > > > for
> >> >> > > > this change and that it caused problems.
> >> >> > > >
> >> >> > >
> >> >> > > No. I'm saying that this patch should be NACKed unless there is a
> >> >> > > real
> >> >> > > explanation for why we give crap about this tracefs corner case and
> >> >> > > why
> >> >> > > it can't be fixed.
> >> >> > >
> >> >> > > There are plenty of reasons why copy offload across filesystems
> >> >> > > makes
> >> >> > > sense, and particularly when you're doing NAS. Clone just doesn't
> >> >> > > cut
> >> >> > > it when it comes to disaster recovery (whereas backup to a
> >> >> > > different
> >> >> > > storage unit does). If the client has to do the copy, then you're
> >> >> > > effectively doubling the load on the server, and you're adding
> >> >> > > potentially unnecessary network traffic (or at the very least you
> >> >> > > are
> >> >> > > doubling that traffic).
> >> >> > >
> >> >> >
> >> >> > I don't understand the use case you are describing.
> >> >> >
> >> >> > Which filesystem types are you talking about for source and target
> >> >> > of copy_file_range()?
> >> >> >
> >> >> > To be clear, the original change was done to support NFS/CIFS server-
> >> >> > side
> >> >> > copy and those should not be affected by this change.
> >> >> >
> >> >>
> >> >> That is incorrect:
> >> >>
> >> >> ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file
> >> >> *dst,
> >> >>  u64 dst_pos, u64 count)
> >> >> {
> >> >>
> >> >>  /*
> >> >>  * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> >> >>  * thread and client rpc slot. The choice of 4MB is somewhat
> >> >>  * arbitrary. We might instead base this on r/wsize, or make it
> >> >>  * tunable, or use a time instead of a byte limit, or implement
> >> >>  * asynchronous copy. In theory a client could also recognize a
> >> >>  * limit like this and pipeline multiple COPY requests.
> >> >>  */
> >> >>  count = min_t(u64, count, 1 << 22);
> >> >>  return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> >> >> }
> >> >>
> >> >> You are now explicitly changing the behaviour of knfsd when the source
> >> >> and destination filesystem differ.
> >> >>
> >> >> For one thing, you are disallowing the NFSv4.2 copy offload use case of
> >> >> copying from a local filesystem to a remote NFS server. However you are
> >> >> also disallowing the copy from, say, an XFS formatted partition to an
> >> >> ext4 partition.
> >> >>
> >> >
> >> > Got it.
> >> 
> >> Ugh.  And I guess overlayfs may have a similar problem.
> >> 
> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> >> > is internal to kernel users.
> >> >
> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> >> > for improvements to nfsd_copy_file_range().
> >> >
> >> > We can move the check out to copy_file_range syscall:
> >> >
> >> >         if (flags != 0)
> >> >                 return -EINVAL;
> >> >
> >> > Leave the fallback from all filesystems and check for the
> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> >> 
> >> Ok, the diff below is just to make sure I understood your suggestion.
> >> 
> >> The patch will also need to:
> >> 
> >>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> >>    use the new flag.
> >> 
> >>  - check flags in generic_copy_file_checks() to make sure only valid flags
> >>    are used (COPY_FILE_SPLICE at the moment).
> >> 
> >> Also, where should this flag be defined?  include/uapi/linux/fs.h?
> >
> > Why would userspace want/need this flag?
> 
> In fact, my question sort of implied yours :-)
> 
> What I wanted to know was whether we would like to allow userspace to
> _explicitly_ revert to the current behaviour (i.e. use the flag to allow
> cross-fs copies) or to continue to return -EINVAL to userspace if flags
> are != 0 (in which case this check would need to move to the syscall
> definition).

No, don't try to mess with userspace that way, the kernel should "just
work".  Well, in this case "work as best as it can, not always
successful...", it's an odd syscall.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 11:17             ` Luis Henriques
  2021-02-16 11:28               ` gregkh
@ 2021-02-16 13:51               ` Amir Goldstein
  2021-02-16 16:42                 ` Luis Henriques
  2021-02-16 18:54                 ` Andreas Dilger
  1 sibling, 2 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-02-16 13:51 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Trond Myklebust, samba-technical, drinkcat, iant, linux-cifs,
	darrick.wong, linux-kernel, jlayton, anna.schumaker, llozano,
	linux-nfs, miklos, viro, dchinner, linux-fsdevel, gregkh,
	sfrench, ceph-devel

> Ugh.  And I guess overlayfs may have a similar problem.

Not exactly.
Generally speaking, overlayfs should call vfs_copy_file_range()
with the flags it got from layer above, so if called from nfsd it
will allow cross fs copy and when called from syscall it won't.

There are some corner cases where overlayfs could benefit from
COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
let's leave those for now. Just leave overlayfs code as is.

>
> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > is internal to kernel users.
> >
> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > for improvements to nfsd_copy_file_range().
> >
> > We can move the check out to copy_file_range syscall:
> >
> >         if (flags != 0)
> >                 return -EINVAL;
> >
> > Leave the fallback from all filesystems and check for the
> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
>
> Ok, the diff below is just to make sure I understood your suggestion.
>
> The patch will also need to:
>
>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
>    use the new flag.
>
>  - check flags in generic_copy_file_checks() to make sure only valid flags
>    are used (COPY_FILE_SPLICE at the moment).
>
> Also, where should this flag be defined?  include/uapi/linux/fs.h?

Grep for REMAP_FILE_
Same header file, same Documentation rst file.

>
> Cheers,
> --
> Luis
>
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..341d315d2a96 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>                                 struct file *file_out, loff_t pos_out,
>                                 size_t len, unsigned int flags)
>  {
> +       if (!(flags & COPY_FILE_SPLICE)) {
> +               if (!file_out->f_op->copy_file_range)
> +                       return -EOPNOTSUPP;
> +               else if (file_out->f_op->copy_file_range !=
> +                        file_in->f_op->copy_file_range)
> +                       return -EXDEV;
> +       }

That looks strange, because you are duplicating the logic in
do_copy_file_range(). Maybe better:

if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
        return -EINVAL;
if (flags & COPY_FILE_SPLICE)
       return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
                                 len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
if (!file_out->f_op->copy_file_range)
        return -EOPNOTSUPP;
return -EXDEV;

>  }
> @@ -1474,9 +1481,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  {
>         ssize_t ret;
>
> -       if (flags != 0)
> -               return -EINVAL;
> -

This needs to move to the beginning of SYSCALL_DEFINE6(copy_file_range,...

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 13:51               ` Amir Goldstein
@ 2021-02-16 16:42                 ` Luis Henriques
  2021-02-16 17:44                   ` Amir Goldstein
  2021-02-16 18:54                 ` Andreas Dilger
  1 sibling, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-16 16:42 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Trond Myklebust, samba-technical, drinkcat, iant, linux-cifs,
	darrick.wong, linux-kernel, jlayton, anna.schumaker, llozano,
	linux-nfs, miklos, viro, dchinner, linux-fsdevel, gregkh,
	sfrench, ceph-devel

Amir Goldstein <amir73il@gmail.com> writes:

>> Ugh.  And I guess overlayfs may have a similar problem.
>
> Not exactly.
> Generally speaking, overlayfs should call vfs_copy_file_range()
> with the flags it got from layer above, so if called from nfsd it
> will allow cross fs copy and when called from syscall it won't.
>
> There are some corner cases where overlayfs could benefit from
> COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> let's leave those for now. Just leave overlayfs code as is.

Got it, thanks for clarifying.

>> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
>> > is internal to kernel users.
>> >
>> > FWIW, you may want to look at the loop in ovl_copy_up_data()
>> > for improvements to nfsd_copy_file_range().
>> >
>> > We can move the check out to copy_file_range syscall:
>> >
>> >         if (flags != 0)
>> >                 return -EINVAL;
>> >
>> > Leave the fallback from all filesystems and check for the
>> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
>>
>> Ok, the diff below is just to make sure I understood your suggestion.
>>
>> The patch will also need to:
>>
>>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
>>    use the new flag.
>>
>>  - check flags in generic_copy_file_checks() to make sure only valid flags
>>    are used (COPY_FILE_SPLICE at the moment).
>>
>> Also, where should this flag be defined?  include/uapi/linux/fs.h?
>
> Grep for REMAP_FILE_
> Same header file, same Documentation rst file.
>
>>
>> Cheers,
>> --
>> Luis
>>
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index 75f764b43418..341d315d2a96 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>>                                 struct file *file_out, loff_t pos_out,
>>                                 size_t len, unsigned int flags)
>>  {
>> +       if (!(flags & COPY_FILE_SPLICE)) {
>> +               if (!file_out->f_op->copy_file_range)
>> +                       return -EOPNOTSUPP;
>> +               else if (file_out->f_op->copy_file_range !=
>> +                        file_in->f_op->copy_file_range)
>> +                       return -EXDEV;
>> +       }
>
> That looks strange, because you are duplicating the logic in
> do_copy_file_range(). Maybe better:
>
> if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
>         return -EINVAL;
> if (flags & COPY_FILE_SPLICE)
>        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>                                  len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);

My initial reasoning for duplicating the logic in do_copy_file_range() was
to allow the generic_copy_file_range() callers to be left unmodified and
allow the filesystems to default to this implementation.

With this change, I guess that the calls to generic_copy_file_range() from
the different filesystems can be dropped, as in my initial patch, as they
will always get -EINVAL.  The other option would be to set the
COPY_FILE_SPLICE flag in those calls, but that would get us back to the
problem we're trying to solve.

> if (!file_out->f_op->copy_file_range)
>         return -EOPNOTSUPP;
> return -EXDEV;
>
>>  }
>> @@ -1474,9 +1481,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>>  {
>>         ssize_t ret;
>>
>> -       if (flags != 0)
>> -               return -EINVAL;
>> -
>
> This needs to move to the beginning of SYSCALL_DEFINE6(copy_file_range,...

Yep, I didn't include that change in my diff as I wasn't sure if you'd
like to have the flag visible in userspace.

Anyway, thanks for your patience!

Cheers,
-- 
Luis

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 16:42                 ` Luis Henriques
@ 2021-02-16 17:44                   ` Amir Goldstein
  2021-02-16 18:55                     ` Luis Henriques
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-16 17:44 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Trond Myklebust, samba-technical, drinkcat, iant, linux-cifs,
	darrick.wong, linux-kernel, jlayton, anna.schumaker, llozano,
	linux-nfs, miklos, viro, dchinner, linux-fsdevel, gregkh,
	sfrench, ceph-devel

On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> Amir Goldstein <amir73il@gmail.com> writes:
>
> >> Ugh.  And I guess overlayfs may have a similar problem.
> >
> > Not exactly.
> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > with the flags it got from layer above, so if called from nfsd it
> > will allow cross fs copy and when called from syscall it won't.
> >
> > There are some corner cases where overlayfs could benefit from
> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > let's leave those for now. Just leave overlayfs code as is.
>
> Got it, thanks for clarifying.
>
> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> >> > is internal to kernel users.
> >> >
> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> >> > for improvements to nfsd_copy_file_range().
> >> >
> >> > We can move the check out to copy_file_range syscall:
> >> >
> >> >         if (flags != 0)
> >> >                 return -EINVAL;
> >> >
> >> > Leave the fallback from all filesystems and check for the
> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> >>
> >> Ok, the diff below is just to make sure I understood your suggestion.
> >>
> >> The patch will also need to:
> >>
> >>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> >>    use the new flag.
> >>
> >>  - check flags in generic_copy_file_checks() to make sure only valid flags
> >>    are used (COPY_FILE_SPLICE at the moment).
> >>
> >> Also, where should this flag be defined?  include/uapi/linux/fs.h?
> >
> > Grep for REMAP_FILE_
> > Same header file, same Documentation rst file.
> >
> >>
> >> Cheers,
> >> --
> >> Luis
> >>
> >> diff --git a/fs/read_write.c b/fs/read_write.c
> >> index 75f764b43418..341d315d2a96 100644
> >> --- a/fs/read_write.c
> >> +++ b/fs/read_write.c
> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> >>                                 struct file *file_out, loff_t pos_out,
> >>                                 size_t len, unsigned int flags)
> >>  {
> >> +       if (!(flags & COPY_FILE_SPLICE)) {
> >> +               if (!file_out->f_op->copy_file_range)
> >> +                       return -EOPNOTSUPP;
> >> +               else if (file_out->f_op->copy_file_range !=
> >> +                        file_in->f_op->copy_file_range)
> >> +                       return -EXDEV;
> >> +       }
> >
> > That looks strange, because you are duplicating the logic in
> > do_copy_file_range(). Maybe better:
> >
> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> >         return -EINVAL;
> > if (flags & COPY_FILE_SPLICE)
> >        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> >                                  len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>
> My initial reasoning for duplicating the logic in do_copy_file_range() was
> to allow the generic_copy_file_range() callers to be left unmodified and
> allow the filesystems to default to this implementation.
>
> With this change, I guess that the calls to generic_copy_file_range() from
> the different filesystems can be dropped, as in my initial patch, as they
> will always get -EINVAL.  The other option would be to set the
> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> problem we're trying to solve.

I don't understand the problem.

What exactly is wrong with the code I suggested?
Why should any filesystem be changed?

Maybe I am missing something.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 13:51               ` Amir Goldstein
  2021-02-16 16:42                 ` Luis Henriques
@ 2021-02-16 18:54                 ` Andreas Dilger
  1 sibling, 0 replies; 93+ messages in thread
From: Andreas Dilger @ 2021-02-16 18:54 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Luis Henriques, Trond Myklebust, samba-technical, drinkcat, iant,
	linux-cifs, darrick.wong, linux-kernel, jlayton, anna.schumaker,
	llozano, linux-nfs, miklos, viro, dchinner, linux-fsdevel,
	gregkh, sfrench, James Simmons, ceph-devel

[-- Attachment #1: Type: text/plain, Size: 2844 bytes --]

On Feb 16, 2021, at 6:51 AM, Amir Goldstein <amir73il@gmail.com> wrote:
>> 
>>> This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
>>> is internal to kernel users.
>>> 
>>> FWIW, you may want to look at the loop in ovl_copy_up_data()
>>> for improvements to nfsd_copy_file_range().
>>> 
>>> We can move the check out to copy_file_range syscall:
>>> 
>>>        if (flags != 0)
>>>                return -EINVAL;
>>> 
>>> Leave the fallback from all filesystems and check for the
>>> COPY_FILE_SPLICE flag inside generic_copy_file_range().
>> 
>> Ok, the diff below is just to make sure I understood your suggestion.
>> 
>> The patch will also need to:
>> 
>> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
>>   use the new flag.
>> 
>> - check flags in generic_copy_file_checks() to make sure only valid flags
>>   are used (COPY_FILE_SPLICE at the moment).
>> 
>> Also, where should this flag be defined?  include/uapi/linux/fs.h?
>> 
>> Cheers,
>> --
>> Luis
>> 
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index 75f764b43418..341d315d2a96 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>>                                struct file *file_out, loff_t pos_out,
>>                                size_t len, unsigned int flags)
>> {
>> +       if (!(flags & COPY_FILE_SPLICE)) {
>> +               if (!file_out->f_op->copy_file_range)
>> +                       return -EOPNOTSUPP;
>> +               else if (file_out->f_op->copy_file_range !=
>> +                        file_in->f_op->copy_file_range)
>> +                       return -EXDEV;
>> +       }
> 
> That looks strange, because you are duplicating the logic in
> do_copy_file_range(). Maybe better:
> 
> if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
>        return -EINVAL;
> if (flags & COPY_FILE_SPLICE)
>       return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>                                 len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> if (!file_out->f_op->copy_file_range)
>        return -EOPNOTSUPP;
> return -EXDEV;

This shouldn't return -EINVAL to userspace if the flag is not set.

That implies there *is* some valid way for userspace to call this
function, which is AFAICS not possible if COPY_FILE_SPLICE is only
available to in-kernel callers.  Instead, it should continue
to return -EOPNOTSUPP to userspace if copy_file_range() is not valid
for this combination of file descriptors, so that applications will
fall back to the non-CFR implementation.

The WARN_ON_ONCE(ret == -EOPNOTSUPP) in vfs_copy_file_range() would
also need to be removed if this will be triggered from userspace.


Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 17:44                   ` Amir Goldstein
@ 2021-02-16 18:55                     ` Luis Henriques
  2021-02-16 19:20                       ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-16 18:55 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Trond Myklebust, samba-technical, drinkcat, iant, linux-cifs,
	darrick.wong, linux-kernel, jlayton, anna.schumaker, llozano,
	linux-nfs, miklos, viro, dchinner, linux-fsdevel, gregkh,
	sfrench, ceph-devel

Amir Goldstein <amir73il@gmail.com> writes:

> On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <lhenriques@suse.de> wrote:
>>
>> Amir Goldstein <amir73il@gmail.com> writes:
>>
>> >> Ugh.  And I guess overlayfs may have a similar problem.
>> >
>> > Not exactly.
>> > Generally speaking, overlayfs should call vfs_copy_file_range()
>> > with the flags it got from layer above, so if called from nfsd it
>> > will allow cross fs copy and when called from syscall it won't.
>> >
>> > There are some corner cases where overlayfs could benefit from
>> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
>> > let's leave those for now. Just leave overlayfs code as is.
>>
>> Got it, thanks for clarifying.
>>
>> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
>> >> > is internal to kernel users.
>> >> >
>> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
>> >> > for improvements to nfsd_copy_file_range().
>> >> >
>> >> > We can move the check out to copy_file_range syscall:
>> >> >
>> >> >         if (flags != 0)
>> >> >                 return -EINVAL;
>> >> >
>> >> > Leave the fallback from all filesystems and check for the
>> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
>> >>
>> >> Ok, the diff below is just to make sure I understood your suggestion.
>> >>
>> >> The patch will also need to:
>> >>
>> >>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
>> >>    use the new flag.
>> >>
>> >>  - check flags in generic_copy_file_checks() to make sure only valid flags
>> >>    are used (COPY_FILE_SPLICE at the moment).
>> >>
>> >> Also, where should this flag be defined?  include/uapi/linux/fs.h?
>> >
>> > Grep for REMAP_FILE_
>> > Same header file, same Documentation rst file.
>> >
>> >>
>> >> Cheers,
>> >> --
>> >> Luis
>> >>
>> >> diff --git a/fs/read_write.c b/fs/read_write.c
>> >> index 75f764b43418..341d315d2a96 100644
>> >> --- a/fs/read_write.c
>> >> +++ b/fs/read_write.c
>> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>> >>                                 struct file *file_out, loff_t pos_out,
>> >>                                 size_t len, unsigned int flags)
>> >>  {
>> >> +       if (!(flags & COPY_FILE_SPLICE)) {
>> >> +               if (!file_out->f_op->copy_file_range)
>> >> +                       return -EOPNOTSUPP;
>> >> +               else if (file_out->f_op->copy_file_range !=
>> >> +                        file_in->f_op->copy_file_range)
>> >> +                       return -EXDEV;
>> >> +       }
>> >
>> > That looks strange, because you are duplicating the logic in
>> > do_copy_file_range(). Maybe better:
>> >
>> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
>> >         return -EINVAL;
>> > if (flags & COPY_FILE_SPLICE)
>> >        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> >                                  len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>>
>> My initial reasoning for duplicating the logic in do_copy_file_range() was
>> to allow the generic_copy_file_range() callers to be left unmodified and
>> allow the filesystems to default to this implementation.
>>
>> With this change, I guess that the calls to generic_copy_file_range() from
>> the different filesystems can be dropped, as in my initial patch, as they
>> will always get -EINVAL.  The other option would be to set the
>> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
>> problem we're trying to solve.
>
> I don't understand the problem.
>
> What exactly is wrong with the code I suggested?
> Why should any filesystem be changed?
>
> Maybe I am missing something.

Ok, I have to do a full brain reboot and start all over.

Before that, I picked the code you suggested and tested it.  I've mounted
a cephfs filesystem and used xfs_io to execute a 'copy_range' command
using /sys/kernel/debug/sched_features as source.  The result was a
0-sized file in cephfs.  And the reason is the vfs_copy_file_range()
early exit in:

	if (len == 0)
		return 0;

'len' is set in generic_copy_file_checks().

This means that we're not solving the original problem anymore (probably
since v1 of this patch, haven't checked).

Also, re-reading Trond's emails, I read: "... also disallowing the copy
from, say, an XFS formatted partition to an ext4 partition".  Isn't that
*exactly* what we're trying to do here?  I.e. _prevent_ these copies from
happening so that tracefs files can't be CFR'ed?

/me stops now and waits to see if the morning brings some sun :-)

Cheers,
-- 
Luis

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 18:55                     ` Luis Henriques
@ 2021-02-16 19:20                       ` Amir Goldstein
  2021-02-16 19:27                         ` Anna Schumaker
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-16 19:20 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Trond Myklebust, samba-technical, drinkcat, iant, linux-cifs,
	darrick.wong, linux-kernel, jlayton, anna.schumaker, llozano,
	linux-nfs, miklos, viro, dchinner, linux-fsdevel, gregkh,
	sfrench, ceph-devel

On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> Amir Goldstein <amir73il@gmail.com> writes:
>
> > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <lhenriques@suse.de> wrote:
> >>
> >> Amir Goldstein <amir73il@gmail.com> writes:
> >>
> >> >> Ugh.  And I guess overlayfs may have a similar problem.
> >> >
> >> > Not exactly.
> >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> >> > with the flags it got from layer above, so if called from nfsd it
> >> > will allow cross fs copy and when called from syscall it won't.
> >> >
> >> > There are some corner cases where overlayfs could benefit from
> >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> >> > let's leave those for now. Just leave overlayfs code as is.
> >>
> >> Got it, thanks for clarifying.
> >>
> >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> >> >> > is internal to kernel users.
> >> >> >
> >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> >> >> > for improvements to nfsd_copy_file_range().
> >> >> >
> >> >> > We can move the check out to copy_file_range syscall:
> >> >> >
> >> >> >         if (flags != 0)
> >> >> >                 return -EINVAL;
> >> >> >
> >> >> > Leave the fallback from all filesystems and check for the
> >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> >> >>
> >> >> Ok, the diff below is just to make sure I understood your suggestion.
> >> >>
> >> >> The patch will also need to:
> >> >>
> >> >>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> >> >>    use the new flag.
> >> >>
> >> >>  - check flags in generic_copy_file_checks() to make sure only valid flags
> >> >>    are used (COPY_FILE_SPLICE at the moment).
> >> >>
> >> >> Also, where should this flag be defined?  include/uapi/linux/fs.h?
> >> >
> >> > Grep for REMAP_FILE_
> >> > Same header file, same Documentation rst file.
> >> >
> >> >>
> >> >> Cheers,
> >> >> --
> >> >> Luis
> >> >>
> >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> >> >> index 75f764b43418..341d315d2a96 100644
> >> >> --- a/fs/read_write.c
> >> >> +++ b/fs/read_write.c
> >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> >> >>                                 struct file *file_out, loff_t pos_out,
> >> >>                                 size_t len, unsigned int flags)
> >> >>  {
> >> >> +       if (!(flags & COPY_FILE_SPLICE)) {
> >> >> +               if (!file_out->f_op->copy_file_range)
> >> >> +                       return -EOPNOTSUPP;
> >> >> +               else if (file_out->f_op->copy_file_range !=
> >> >> +                        file_in->f_op->copy_file_range)
> >> >> +                       return -EXDEV;
> >> >> +       }
> >> >
> >> > That looks strange, because you are duplicating the logic in
> >> > do_copy_file_range(). Maybe better:
> >> >
> >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> >> >         return -EINVAL;
> >> > if (flags & COPY_FILE_SPLICE)
> >> >        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> >> >                                  len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> >>
> >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> >> to allow the generic_copy_file_range() callers to be left unmodified and
> >> allow the filesystems to default to this implementation.
> >>
> >> With this change, I guess that the calls to generic_copy_file_range() from
> >> the different filesystems can be dropped, as in my initial patch, as they
> >> will always get -EINVAL.  The other option would be to set the
> >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> >> problem we're trying to solve.
> >
> > I don't understand the problem.
> >
> > What exactly is wrong with the code I suggested?
> > Why should any filesystem be changed?
> >
> > Maybe I am missing something.
>
> Ok, I have to do a full brain reboot and start all over.
>
> Before that, I picked the code you suggested and tested it.  I've mounted
> a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> using /sys/kernel/debug/sched_features as source.  The result was a
> > 0-sized file in cephfs.  And the reason is the vfs_copy_file_range()
> early exit in:
>
>         if (len == 0)
>                 return 0;
>
> 'len' is set in generic_copy_file_checks().

Good point. I guess we will need to do all the checks earlier in
generic_copy_file_checks() including the logic of:

        if (file_in->f_op->remap_file_range &&
            file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)


>
> This means that we're not solving the original problem anymore (probably
> since v1 of this patch, haven't checked).
>
> Also, re-reading Trond's emails, I read: "... also disallowing the copy
> from, say, an XFS formatted partition to an ext4 partition".  Isn't that
> *exactly* what we're trying to do here?  I.e. _prevent_ these copies from
> happening so that tracefs files can't be CFR'ed?
>

We want to address the report which means calls coming from
copy_file_range() syscall.

Trond's use case is vfs_copy_file_range() coming from nfsd.
When he writes about copy from XFS to ext4, he means an
NFS client is issuing server side copy (on same or different NFS mounts)
and the NFS server is executing nfsd_copy_file_range() on a source
file that happens to be on XFS and destination happens to be on ext4.

We can undo the copy_file_range() syscall change of behavior from
v5.3 without regressing the NFS use case.

We just need to be careful and look at all the affected code paths.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 19:20                       ` Amir Goldstein
@ 2021-02-16 19:27                         ` Anna Schumaker
  2021-02-16 19:31                           ` Steve French
  0 siblings, 1 reply; 93+ messages in thread
From: Anna Schumaker @ 2021-02-16 19:27 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Luis Henriques, Trond Myklebust, samba-technical, drinkcat, iant,
	linux-cifs, darrick.wong, linux-kernel, jlayton, llozano,
	linux-nfs, miklos, viro, dchinner, linux-fsdevel, gregkh,
	sfrench, ceph-devel

On Tue, Feb 16, 2021 at 2:22 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <lhenriques@suse.de> wrote:
> >
> > Amir Goldstein <amir73il@gmail.com> writes:
> >
> > > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <lhenriques@suse.de> wrote:
> > >>
> > >> Amir Goldstein <amir73il@gmail.com> writes:
> > >>
> > >> >> Ugh.  And I guess overlayfs may have a similar problem.
> > >> >
> > >> > Not exactly.
> > >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > >> > with the flags it got from layer above, so if called from nfsd it
> > >> > will allow cross fs copy and when called from syscall it won't.
> > >> >
> > >> > There are some corner cases where overlayfs could benefit from
> > >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > >> > let's leave those for now. Just leave overlayfs code as is.
> > >>
> > >> Got it, thanks for clarifying.
> > >>
> > >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > >> >> > is internal to kernel users.
> > >> >> >
> > >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > >> >> > for improvements to nfsd_copy_file_range().
> > >> >> >
> > >> >> > We can move the check out to copy_file_range syscall:
> > >> >> >
> > >> >> >         if (flags != 0)
> > >> >> >                 return -EINVAL;
> > >> >> >
> > >> >> > Leave the fallback from all filesystems and check for the
> > >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> > >> >>
> > >> >> Ok, the diff below is just to make sure I understood your suggestion.
> > >> >>
> > >> >> The patch will also need to:
> > >> >>
> > >> >>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> > >> >>    use the new flag.
> > >> >>
> > >> >>  - check flags in generic_copy_file_checks() to make sure only valid flags
> > >> >>    are used (COPY_FILE_SPLICE at the moment).
> > >> >>
> > >> >> Also, where should this flag be defined?  include/uapi/linux/fs.h?
> > >> >
> > >> > Grep for REMAP_FILE_
> > >> > Same header file, same Documentation rst file.
> > >> >
> > >> >>
> > >> >> Cheers,
> > >> >> --
> > >> >> Luis
> > >> >>
> > >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> > >> >> index 75f764b43418..341d315d2a96 100644
> > >> >> --- a/fs/read_write.c
> > >> >> +++ b/fs/read_write.c
> > >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > >> >>                                 struct file *file_out, loff_t pos_out,
> > >> >>                                 size_t len, unsigned int flags)
> > >> >>  {
> > >> >> +       if (!(flags & COPY_FILE_SPLICE)) {
> > >> >> +               if (!file_out->f_op->copy_file_range)
> > >> >> +                       return -EOPNOTSUPP;
> > >> >> +               else if (file_out->f_op->copy_file_range !=
> > >> >> +                        file_in->f_op->copy_file_range)
> > >> >> +                       return -EXDEV;
> > >> >> +       }
> > >> >
> > >> > That looks strange, because you are duplicating the logic in
> > >> > do_copy_file_range(). Maybe better:
> > >> >
> > >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > >> >         return -EINVAL;
> > >> > if (flags & COPY_FILE_SPLICE)
> > >> >        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > >> >                                  len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > >>
> > >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> > >> to allow the generic_copy_file_range() callers to be left unmodified and
> > >> allow the filesystems to default to this implementation.
> > >>
> > >> With this change, I guess that the calls to generic_copy_file_range() from
> > >> the different filesystems can be dropped, as in my initial patch, as they
> > >> will always get -EINVAL.  The other option would be to set the
> > >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> > >> problem we're trying to solve.
> > >
> > > I don't understand the problem.
> > >
> > > What exactly is wrong with the code I suggested?
> > > Why should any filesystem be changed?
> > >
> > > Maybe I am missing something.
> >
> > Ok, I have to do a full brain reboot and start all over.
> >
> > Before that, I picked the code you suggested and tested it.  I've mounted
> > a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> > using /sys/kernel/debug/sched_features as source.  The result was a
> > > 0-sized file in cephfs.  And the reason is the vfs_copy_file_range()
> > early exit in:
> >
> >         if (len == 0)
> >                 return 0;
> >
> > 'len' is set in generic_copy_file_checks().
>
> Good point. I guess we will need to do all the checks earlier in
> generic_copy_file_checks() including the logic of:
>
>         if (file_in->f_op->remap_file_range &&
>             file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)
>
>
> >
> > This means that we're not solving the original problem anymore (probably
> > since v1 of this patch, haven't checked).
> >
> > Also, re-reading Trond's emails, I read: "... also disallowing the copy
> > from, say, an XFS formatted partition to an ext4 partition".  Isn't that
> > *exactly* what we're trying to do here?  I.e. _prevent_ these copies from
> > happening so that tracefs files can't be CFR'ed?
> >
>
> We want to address the report which means calls coming from
> copy_file_range() syscall.
>
> Trond's use case is vfs_copy_file_range() coming from nfsd.
> When he writes about copy from XFS to ext4, he means an
> NFS client is issuing server side copy (on same or different NFS mounts)
> and the NFS server is executing nfsd_copy_file_range() on a source
> file that happens to be on XFS and destination happens to be on ext4.

NFS also supports a server-to-server copy where the destination server
mounts the source server and reads the data to be copied. Please don't
break that either :)

Anna

>
> We can undo the copy_file_range() syscall change of behavior from
> v5.3 without regressing the NFS use case.
>
> We just need to be careful and look at all the affected code paths.
>
> Thanks,
> Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 19:27                         ` Anna Schumaker
@ 2021-02-16 19:31                           ` Steve French
  2021-02-16 19:40                             ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Steve French @ 2021-02-16 19:31 UTC (permalink / raw)
  To: Anna Schumaker
  Cc: Amir Goldstein, Luis Henriques, Trond Myklebust, samba-technical,
	drinkcat, iant, linux-cifs, darrick.wong, linux-kernel, jlayton,
	llozano, linux-nfs, miklos, viro, dchinner, linux-fsdevel,
	gregkh, sfrench, ceph-devel

On Tue, Feb 16, 2021 at 1:29 PM Anna Schumaker
<anna.schumaker@netapp.com> wrote:
>
> On Tue, Feb 16, 2021 at 2:22 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <lhenriques@suse.de> wrote:
> > >
> > > Amir Goldstein <amir73il@gmail.com> writes:
> > >
> > > > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <lhenriques@suse.de> wrote:
> > > >>
> > > >> Amir Goldstein <amir73il@gmail.com> writes:
> > > >>
> > > >> >> Ugh.  And I guess overlayfs may have a similar problem.
> > > >> >
> > > >> > Not exactly.
> > > >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > > >> > with the flags it got from layer above, so if called from nfsd it
> > > >> > will allow cross fs copy and when called from syscall it won't.
> > > >> >
> > > >> > There are some corner cases where overlayfs could benefit from
> > > >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > > >> > let's leave those for now. Just leave overlayfs code as is.
> > > >>
> > > >> Got it, thanks for clarifying.
> > > >>
> > > >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > > >> >> > is internal to kernel users.
> > > >> >> >
> > > >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > > >> >> > for improvements to nfsd_copy_file_range().
> > > >> >> >
> > > >> >> > We can move the check out to copy_file_range syscall:
> > > >> >> >
> > > >> >> >         if (flags != 0)
> > > >> >> >                 return -EINVAL;
> > > >> >> >
> > > >> >> > Leave the fallback from all filesystems and check for the
> > > >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> > > >> >>
> > > >> >> Ok, the diff below is just to make sure I understood your suggestion.
> > > >> >>
> > > >> >> The patch will also need to:
> > > >> >>
> > > >> >>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> > > >> >>    use the new flag.
> > > >> >>
> > > >> >>  - check flags in generic_copy_file_checks() to make sure only valid flags
> > > >> >>    are used (COPY_FILE_SPLICE at the moment).
> > > >> >>
> > > >> >> Also, where should this flag be defined?  include/uapi/linux/fs.h?
> > > >> >
> > > >> > Grep for REMAP_FILE_
> > > >> > Same header file, same Documentation rst file.
> > > >> >
> > > >> >>
> > > >> >> Cheers,
> > > >> >> --
> > > >> >> Luis
> > > >> >>
> > > >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> > > >> >> index 75f764b43418..341d315d2a96 100644
> > > >> >> --- a/fs/read_write.c
> > > >> >> +++ b/fs/read_write.c
> > > >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > > >> >>                                 struct file *file_out, loff_t pos_out,
> > > >> >>                                 size_t len, unsigned int flags)
> > > >> >>  {
> > > >> >> +       if (!(flags & COPY_FILE_SPLICE)) {
> > > >> >> +               if (!file_out->f_op->copy_file_range)
> > > >> >> +                       return -EOPNOTSUPP;
> > > >> >> +               else if (file_out->f_op->copy_file_range !=
> > > >> >> +                        file_in->f_op->copy_file_range)
> > > >> >> +                       return -EXDEV;
> > > >> >> +       }
> > > >> >
> > > >> > That looks strange, because you are duplicating the logic in
> > > >> > do_copy_file_range(). Maybe better:
> > > >> >
> > > >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > > >> >         return -EINVAL;
> > > >> > if (flags & COPY_FILE_SPLICE)
> > > >> >        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > >> >                                  len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > >>
> > > >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> > > >> to allow the generic_copy_file_range() callers to be left unmodified and
> > > >> allow the filesystems to default to this implementation.
> > > >>
> > > >> With this change, I guess that the calls to generic_copy_file_range() from
> > > >> the different filesystems can be dropped, as in my initial patch, as they
> > > >> will always get -EINVAL.  The other option would be to set the
> > > >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> > > >> problem we're trying to solve.
> > > >
> > > > I don't understand the problem.
> > > >
> > > > What exactly is wrong with the code I suggested?
> > > > Why should any filesystem be changed?
> > > >
> > > > Maybe I am missing something.
> > >
> > > Ok, I have to do a full brain reboot and start all over.
> > >
> > > Before that, I picked the code you suggested and tested it.  I've mounted
> > > a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> > > using /sys/kernel/debug/sched_features as source.  The result was a
> > > 0-sized file in cephfs.  And the reason is the vfs_copy_file_range()
> > > early exit in:
> > >
> > >         if (len == 0)
> > >                 return 0;
> > >
> > > 'len' is set in generic_copy_file_checks().
> >
> > Good point.. I guess we will need to do all the checks earlier in
> > generic_copy_file_checks() including the logic of:
> >
> >         if (file_in->f_op->remap_file_range &&
> >             file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)
> >
> >
> > >
> > > This means that we're not solving the original problem anymore (probably
> > > since v1 of this patch, haven't checked).
> > >
> > > Also, re-reading Trond's emails, I read: "... also disallowing the copy
> > > from, say, an XFS formatted partition to an ext4 partition".  Isn't that
> > > *exactly* what we're trying to do here?  I.e. _prevent_ these copies from
> > > happening so that tracefs files can't be CFR'ed?
> > >
> >
> > We want to address the report which means calls coming from
> > copy_file_range() syscall.
> >
> > Trond's use case is vfs_copy_file_range() coming from nfsd.
> > When he writes about copy from XFS to ext4, he means an
> > NFS client is issuing server side copy (on same or different NFS mounts)
> > and the NFS server is executing nfsd_copy_file_range() on a source
> > file that happens to be on XFS and destination happens to be on ext4.
>
> NFS also supports a server-to-server copy where the destination server
> mounts the source server and reads the data to be copied. Please don't
> break that either :)

This is a case we will eventually need to support for cifs (SMB3) as well.


-- 
Thanks,

Steve

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 19:31                           ` Steve French
@ 2021-02-16 19:40                             ` Amir Goldstein
  2021-02-16 21:15                               ` Steve French
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-16 19:40 UTC (permalink / raw)
  To: Steve French
  Cc: Anna Schumaker, Luis Henriques, Trond Myklebust, samba-technical,
	drinkcat, iant, linux-cifs, darrick.wong, linux-kernel, jlayton,
	llozano, linux-nfs, miklos, viro, dchinner, linux-fsdevel,
	gregkh, sfrench, ceph-devel

On Tue, Feb 16, 2021 at 9:31 PM Steve French <smfrench@gmail.com> wrote:
>
> On Tue, Feb 16, 2021 at 1:29 PM Anna Schumaker
> <anna.schumaker@netapp.com> wrote:
> >
> > On Tue, Feb 16, 2021 at 2:22 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > >
> > > On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <lhenriques@suse.de> wrote:
> > > >
> > > > Amir Goldstein <amir73il@gmail.com> writes:
> > > >
> > > > > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <lhenriques@suse.de> wrote:
> > > > >>
> > > > >> Amir Goldstein <amir73il@gmail.com> writes:
> > > > >>
> > > > >> >> Ugh.  And I guess overlayfs may have a similar problem.
> > > > >> >
> > > > >> > Not exactly.
> > > > >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > > > >> > with the flags it got from layer above, so if called from nfsd it
> > > > >> > will allow cross fs copy and when called from syscall it won't.
> > > > >> >
> > > > >> > There are some corner cases where overlayfs could benefit from
> > > > >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > > > >> > let's leave those for now. Just leave overlayfs code as is.
> > > > >>
> > > > >> Got it, thanks for clarifying.
> > > > >>
> > > > >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > > > >> >> > is internal to kernel users.
> > > > >> >> >
> > > > >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > > > >> >> > for improvements to nfsd_copy_file_range().
> > > > >> >> >
> > > > >> >> > We can move the check out to copy_file_range syscall:
> > > > >> >> >
> > > > >> >> >         if (flags != 0)
> > > > >> >> >                 return -EINVAL;
> > > > >> >> >
> > > > >> >> > Leave the fallback from all filesystems and check for the
> > > > >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> > > > >> >>
> > > > >> >> Ok, the diff below is just to make sure I understood your suggestion.
> > > > >> >>
> > > > >> >> The patch will also need to:
> > > > >> >>
> > > > >> >>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> > > > >> >>    use the new flag.
> > > > >> >>
> > > > >> >>  - check flags in generic_copy_file_checks() to make sure only valid flags
> > > > >> >>    are used (COPY_FILE_SPLICE at the moment).
> > > > >> >>
> > > > >> >> Also, where should this flag be defined?  include/uapi/linux/fs.h?
> > > > >> >
> > > > >> > Grep for REMAP_FILE_
> > > > >> > Same header file, same Documentation rst file.
> > > > >> >
> > > > >> >>
> > > > >> >> Cheers,
> > > > >> >> --
> > > > >> >> Luis
> > > > >> >>
> > > > >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> > > > >> >> index 75f764b43418..341d315d2a96 100644
> > > > >> >> --- a/fs/read_write.c
> > > > >> >> +++ b/fs/read_write.c
> > > > >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > > > >> >>                                 struct file *file_out, loff_t pos_out,
> > > > >> >>                                 size_t len, unsigned int flags)
> > > > >> >>  {
> > > > >> >> +       if (!(flags & COPY_FILE_SPLICE)) {
> > > > >> >> +               if (!file_out->f_op->copy_file_range)
> > > > >> >> +                       return -EOPNOTSUPP;
> > > > >> >> +               else if (file_out->f_op->copy_file_range !=
> > > > >> >> +                        file_in->f_op->copy_file_range)
> > > > >> >> +                       return -EXDEV;
> > > > >> >> +       }
> > > > >> >
> > > > >> > That looks strange, because you are duplicating the logic in
> > > > >> > do_copy_file_range(). Maybe better:
> > > > >> >
> > > > >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > > > >> >         return -EINVAL;
> > > > >> > if (flags & COPY_FILE_SPLICE)
> > > > >> >        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > > >> >                                  len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > > >>
> > > > >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> > > > >> to allow the generic_copy_file_range() callers to be left unmodified and
> > > > >> allow the filesystems to default to this implementation.
> > > > >>
> > > > >> With this change, I guess that the calls to generic_copy_file_range() from
> > > > >> the different filesystems can be dropped, as in my initial patch, as they
> > > > >> will always get -EINVAL.  The other option would be to set the
> > > > >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> > > > >> problem we're trying to solve.
> > > > >
> > > > > I don't understand the problem.
> > > > >
> > > > > What exactly is wrong with the code I suggested?
> > > > > Why should any filesystem be changed?
> > > > >
> > > > > Maybe I am missing something.
> > > >
> > > > Ok, I have to do a full brain reboot and start all over.
> > > >
> > > > Before that, I picked the code you suggested and tested it.  I've mounted
> > > > a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> > > > using /sys/kernel/debug/sched_features as source.  The result was a
> > > > 0-sized file in cephfs.  And the reason is the vfs_copy_file_range()
> > > > early exit in:
> > > >
> > > >         if (len == 0)
> > > >                 return 0;
> > > >
> > > > 'len' is set in generic_copy_file_checks().
> > >
> > > Good point.. I guess we will need to do all the checks earlier in
> > > generic_copy_file_checks() including the logic of:
> > >
> > >         if (file_in->f_op->remap_file_range &&
> > >             file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)
> > >
> > >
> > > >
> > > > This means that we're not solving the original problem anymore (probably
> > > > since v1 of this patch, haven't checked).
> > > >
> > > > Also, re-reading Trond's emails, I read: "... also disallowing the copy
> > > > from, say, an XFS formatted partition to an ext4 partition".  Isn't that
> > > > *exactly* what we're trying to do here?  I.e. _prevent_ these copies from
> > > > happening so that tracefs files can't be CFR'ed?
> > > >
> > >
> > > We want to address the report which means calls coming from
> > > copy_file_range() syscall.
> > >
> > > Trond's use case is vfs_copy_file_range() coming from nfsd.
> > > When he writes about copy from XFS to ext4, he means an
> > > NFS client is issuing server side copy (on same or different NFS mounts)
> > > and the NFS server is executing nfsd_copy_file_range() on a source
> > > file that happens to be on XFS and destination happens to be on ext4.
> >
> > NFS also supports a server-to-server copy where the destination server
> > mounts the source server and reads the data to be copied. Please don't
> > break that either :)
>

As long as the copy is via nfsd_copy_file_range() and not from the syscall
it should not regress.

> This is a case we will eventually need to support for cifs (SMB3) as well.
>

samba already does server side copy very well without needing any support
from the kernel.

nfsd also doesn't *need* to use vfs_copy_file_range(); it can use kernel APIs
like the loop in ovl_copy_up_data(). But it does, so we should not regress it.

samba/nfsd can try to use copy_file_range() and it will work if the
source/target fs support it. Otherwise, the server can perfectly well do the
copy via other available interfaces, just like userspace copy tools.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 19:40                             ` Amir Goldstein
@ 2021-02-16 21:15                               ` Steve French
  2021-02-17  8:08                                 ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Steve French @ 2021-02-16 21:15 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Anna Schumaker, Luis Henriques, Trond Myklebust, samba-technical,
	drinkcat, iant, linux-cifs, darrick.wong, linux-kernel, jlayton,
	llozano, linux-nfs, miklos, viro, dchinner, linux-fsdevel,
	gregkh, sfrench, ceph-devel

On Tue, Feb 16, 2021 at 1:40 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Tue, Feb 16, 2021 at 9:31 PM Steve French <smfrench@gmail.com> wrote:
> >
> > On Tue, Feb 16, 2021 at 1:29 PM Anna Schumaker
> > <anna.schumaker@netapp.com> wrote:
> > >
> > > On Tue, Feb 16, 2021 at 2:22 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > > >
> > > > On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <lhenriques@suse.de> wrote:
> > > > >
> > > > > Amir Goldstein <amir73il@gmail.com> writes:
> > > > >
> > > > > > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <lhenriques@suse.de> wrote:
> > > > > >>
> > > > > >> Amir Goldstein <amir73il@gmail.com> writes:
> > > > > >>
> > > > > >> >> Ugh.  And I guess overlayfs may have a similar problem.
> > > > > >> >
> > > > > >> > Not exactly.
> > > > > >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > > > > >> > with the flags it got from layer above, so if called from nfsd it
> > > > > >> > will allow cross fs copy and when called from syscall it won't.
> > > > > >> >
> > > > > >> > There are some corner cases where overlayfs could benefit from
> > > > > >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > > > > >> > let's leave those for now. Just leave overlayfs code as is.
> > > > > >>
> > > > > >> Got it, thanks for clarifying.
> > > > > >>
> > > > > >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > > > > >> >> > is internal to kernel users.
> > > > > >> >> >
> > > > > >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > > > > >> >> > for improvements to nfsd_copy_file_range().
> > > > > >> >> >
> > > > > >> >> > We can move the check out to copy_file_range syscall:
> > > > > >> >> >
> > > > > >> >> >         if (flags != 0)
> > > > > >> >> >                 return -EINVAL;
> > > > > >> >> >
> > > > > >> >> > Leave the fallback from all filesystems and check for the
> > > > > >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> > > > > >> >>
> > > > > >> >> Ok, the diff below is just to make sure I understood your suggestion.
> > > > > >> >>
> > > > > >> >> The patch will also need to:
> > > > > >> >>
> > > > > >> >>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> > > > > >> >>    use the new flag.
> > > > > >> >>
> > > > > >> >>  - check flags in generic_copy_file_checks() to make sure only valid flags
> > > > > >> >>    are used (COPY_FILE_SPLICE at the moment).
> > > > > >> >>
> > > > > >> >> Also, where should this flag be defined?  include/uapi/linux/fs.h?
> > > > > >> >
> > > > > >> > Grep for REMAP_FILE_
> > > > > >> > Same header file, same Documentation rst file.
> > > > > >> >
> > > > > >> >>
> > > > > >> >> Cheers,
> > > > > >> >> --
> > > > > >> >> Luis
> > > > > >> >>
> > > > > >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> > > > > >> >> index 75f764b43418..341d315d2a96 100644
> > > > > >> >> --- a/fs/read_write.c
> > > > > >> >> +++ b/fs/read_write.c
> > > > > >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > > > > >> >>                                 struct file *file_out, loff_t pos_out,
> > > > > >> >>                                 size_t len, unsigned int flags)
> > > > > >> >>  {
> > > > > >> >> +       if (!(flags & COPY_FILE_SPLICE)) {
> > > > > >> >> +               if (!file_out->f_op->copy_file_range)
> > > > > >> >> +                       return -EOPNOTSUPP;
> > > > > >> >> +               else if (file_out->f_op->copy_file_range !=
> > > > > >> >> +                        file_in->f_op->copy_file_range)
> > > > > >> >> +                       return -EXDEV;
> > > > > >> >> +       }
> > > > > >> >
> > > > > >> > That looks strange, because you are duplicating the logic in
> > > > > >> > do_copy_file_range(). Maybe better:
> > > > > >> >
> > > > > >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > > > > >> >         return -EINVAL;
> > > > > >> > if (flags & COPY_FILE_SPLICE)
> > > > > >> >        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > > > >> >                                  len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > > > >>
> > > > > >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> > > > > >> to allow the generic_copy_file_range() callers to be left unmodified and
> > > > > >> allow the filesystems to default to this implementation.
> > > > > >>
> > > > > >> With this change, I guess that the calls to generic_copy_file_range() from
> > > > > >> the different filesystems can be dropped, as in my initial patch, as they
> > > > > >> will always get -EINVAL.  The other option would be to set the
> > > > > >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> > > > > >> problem we're trying to solve.
> > > > > >
> > > > > > I don't understand the problem.
> > > > > >
> > > > > > What exactly is wrong with the code I suggested?
> > > > > > Why should any filesystem be changed?
> > > > > >
> > > > > > Maybe I am missing something.
> > > > >
> > > > > Ok, I have to do a full brain reboot and start all over.
> > > > >
> > > > > Before that, I picked the code you suggested and tested it.  I've mounted
> > > > > a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> > > > > using /sys/kernel/debug/sched_features as source.  The result was a
> > > > > 0-sized file in cephfs.  And the reason is the vfs_copy_file_range()
> > > > > early exit in:
> > > > >
> > > > >         if (len == 0)
> > > > >                 return 0;
> > > > >
> > > > > 'len' is set in generic_copy_file_checks().
> > > >
> > > > Good point.. I guess we will need to do all the checks earlier in
> > > > generic_copy_file_checks() including the logic of:
> > > >
> > > >         if (file_in->f_op->remap_file_range &&
> > > >             file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)
> > > >
> > > >
> > > > >
> > > > > This means that we're not solving the original problem anymore (probably
> > > > > since v1 of this patch, haven't checked).
> > > > >
> > > > > Also, re-reading Trond's emails, I read: "... also disallowing the copy
> > > > > from, say, an XFS formatted partition to an ext4 partition".  Isn't that
> > > > > *exactly* what we're trying to do here?  I.e. _prevent_ these copies from
> > > > > happening so that tracefs files can't be CFR'ed?
> > > > >
> > > >
> > > > We want to address the report which means calls coming from
> > > > copy_file_range() syscall.
> > > >
> > > > Trond's use case is vfs_copy_file_range() coming from nfsd.
> > > > When he writes about copy from XFS to ext4, he means an
> > > > NFS client is issuing server side copy (on same or different NFS mounts)
> > > > and the NFS server is executing nfsd_copy_file_range() on a source
> > > > file that happens to be on XFS and destination happens to be on ext4.
> > >
> > > NFS also supports a server-to-server copy where the destination server
> > > mounts the source server and reads the data to be copied. Please don't
> > > break that either :)
> >
>
> As long as the copy is via nfsd_copy_file_range() and not from the syscall
> it should not regress.
>
> > This is a case we will eventually need to support for cifs (SMB3) as well.
> >
>
> samba already does server side copy very well without needing any support
> from the kernel.
>
> nfsd also doesn't *need* to use vfs_copy_file_range() it can use kernel APIs
> like the loop in ovl_copy_up_data(). But it does, so we should not regress it.
>
> samba/nfsd can try to use copy_file_range() and it will work if the
> source/target fs support it. Otherwise, the server can perfectly well do the
> copy via other available interfaces, just like userspace copy tools.

I was thinking about cifsd ("ksmbd") the kernel server from
Namjae/Sergey etc. which is making excellent progress.

-- 
Thanks,

Steve

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-15 15:43 ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Luis Henriques
  2021-02-15 16:02   ` Trond Myklebust
  2021-02-15 16:34   ` Amir Goldstein
@ 2021-02-17  4:45   ` Nicolas Boichat
  2021-02-18  7:42   ` Christoph Hellwig
  3 siblings, 0 replies; 93+ messages in thread
From: Nicolas Boichat @ 2021-02-17  4:45 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Ian Lance Taylor, Luis Lozano, ceph-devel,
	lkml, linux-cifs, samba-technical, linux-fsdevel, linux-nfs

On Mon, Feb 15, 2021 at 11:42 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> Nicolas Boichat reported an issue when trying to use the copy_file_range
> syscall on a tracefs file.  It failed silently because the file content is
> generated on-the-fly (reporting a size of zero) and copy_file_range needs
> to know in advance how much data is present.

Not sure if you have the whole history; these links and discussions can
help if you want to expand on the commit message:
[1] http://issuetracker.google.com/issues/178332739
[2] https://lkml.org/lkml/2021/1/25/64
[3] https://lkml.org/lkml/2021/1/26/1736
[4] https://patchwork.kernel.org/project/linux-fsdevel/cover/20210212044405.4120619-1-drinkcat@chromium.org/

> This commit restores the cross-fs restrictions that existed prior to
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") and
> removes generic_copy_file_range() calls from ceph, cifs, fuse, and nfs.

It goes beyond that: I think this also prevents copies within the same
FS if copy_file_range is not implemented, which is IMHO a good thing
since this has been broken on procfs and friends ever since
copy_file_range was implemented (but I assume that nobody ever hit
that before cross-fs became available).

>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Cc: Nicolas Boichat <drinkcat@chromium.org>

You could replace that with Reported-by: Nicolas Boichat <drinkcat@chromium.org>

> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
>  fs/ceph/file.c     | 21 +++-----------------
>  fs/cifs/cifsfs.c   |  3 ---
>  fs/fuse/file.c     | 21 +++-----------------
>  fs/nfs/nfs4file.c  | 20 +++----------------
>  fs/read_write.c    | 49 ++++++++++------------------------------------
>  include/linux/fs.h |  3 ---
>  6 files changed, 19 insertions(+), 98 deletions(-)
>
[snip]
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..b217cd62ae0d 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1358,40 +1358,12 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
>  }
>  #endif
>
> -/**
> - * generic_copy_file_range - copy data between two files
> - * @file_in:   file structure to read from
> - * @pos_in:    file offset to read from
> - * @file_out:  file structure to write data to
> - * @pos_out:   file offset to write data to
> - * @len:       amount of data to copy
> - * @flags:     copy flags
> - *
> - * This is a generic filesystem helper to copy data from one file to another.
> - * It has no constraints on the source or destination file owners - the files
> - * can belong to different superblocks and different filesystem types. Short
> - * copies are allowed.
> - *
> - * This should be called from the @file_out filesystem, as per the
> - * ->copy_file_range() method.
> - *
> - * Returns the number of bytes copied or a negative error indicating the
> - * failure.
> - */
> -
> -ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> -                               struct file *file_out, loff_t pos_out,
> -                               size_t len, unsigned int flags)
> -{
> -       return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> -                               len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> -}
> -EXPORT_SYMBOL(generic_copy_file_range);
> -
>  static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
>                                   struct file *file_out, loff_t pos_out,
>                                   size_t len, unsigned int flags)
>  {
> +       ssize_t ret = -EXDEV;
> +
>         /*
>          * Although we now allow filesystems to handle cross sb copy, passing
>          * a file of the wrong filesystem type to filesystem driver can result
> @@ -1400,14 +1372,14 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
>          * several different file_system_type structures, but they all end up
>          * using the same ->copy_file_range() function pointer.
>          */
> -       if (file_out->f_op->copy_file_range &&
> -           file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> -               return file_out->f_op->copy_file_range(file_in, pos_in,
> -                                                      file_out, pos_out,
> -                                                      len, flags);
> +       if (!file_out->f_op->copy_file_range)
> +               ret = -EOPNOTSUPP;

This doesn't work, as the 0-filesize check is done before that in
vfs_copy_file_range() (so the syscall still returns 0; it works fine if
you comment out `if (len == 0)`).

Also, you need to check for file_in->f_op->copy_file_range instead:
the problem is when the _input_ filesystem doesn't report sizes or can't
seek properly.

> +       else if (file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> +               ret = file_out->f_op->copy_file_range(file_in, pos_in,
> +                                                     file_out, pos_out,
> +                                                     len, flags);
>
> -       return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -                                      flags);
> +       return ret;
>  }
>
>  /*
> @@ -1514,8 +1486,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>         }
>
>         ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -                               flags);
> -       WARN_ON_ONCE(ret == -EOPNOTSUPP);
> +                                flags);
>  done:
>         if (ret > 0) {
>                 fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-16 21:15                               ` Steve French
@ 2021-02-17  8:08                                 ` Amir Goldstein
  2021-02-17 17:26                                   ` [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies Luis Henriques
  2021-02-18  0:50                                   ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Andreas Dilger
  0 siblings, 2 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-02-17  8:08 UTC (permalink / raw)
  To: Steve French
  Cc: Anna Schumaker, Luis Henriques, Trond Myklebust, samba-technical,
	drinkcat, iant, linux-cifs, darrick.wong, linux-kernel, jlayton,
	llozano, linux-nfs, miklos, viro, dchinner, linux-fsdevel,
	gregkh, sfrench, ceph-devel

On Tue, Feb 16, 2021 at 11:15 PM Steve French <smfrench@gmail.com> wrote:
>
> On Tue, Feb 16, 2021 at 1:40 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Tue, Feb 16, 2021 at 9:31 PM Steve French <smfrench@gmail.com> wrote:
> > >
> > > On Tue, Feb 16, 2021 at 1:29 PM Anna Schumaker
> > > <anna.schumaker@netapp.com> wrote:
> > > >
> > > > On Tue, Feb 16, 2021 at 2:22 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > > > >
> > > > > On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <lhenriques@suse.de> wrote:
> > > > > >
> > > > > > Amir Goldstein <amir73il@gmail.com> writes:
> > > > > >
> > > > > > > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <lhenriques@suse.de> wrote:
> > > > > > >>
> > > > > > >> Amir Goldstein <amir73il@gmail.com> writes:
> > > > > > >>
> > > > > > >> >> Ugh.  And I guess overlayfs may have a similar problem.
> > > > > > >> >
> > > > > > >> > Not exactly.
> > > > > > >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > > > > > >> > with the flags it got from layer above, so if called from nfsd it
> > > > > > >> > will allow cross fs copy and when called from syscall it won't.
> > > > > > >> >
> > > > > > >> > There are some corner cases where overlayfs could benefit from
> > > > > > >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > > > > > >> > let's leave those for now. Just leave overlayfs code as is.
> > > > > > >>
> > > > > > >> Got it, thanks for clarifying.
> > > > > > >>
> > > > > > >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > > > > > >> >> > is internal to kernel users.
> > > > > > >> >> >
> > > > > > >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > > > > > >> >> > for improvements to nfsd_copy_file_range().
> > > > > > >> >> >
> > > > > > >> >> > We can move the check out to copy_file_range syscall:
> > > > > > >> >> >
> > > > > > >> >> >         if (flags != 0)
> > > > > > >> >> >                 return -EINVAL;
> > > > > > >> >> >
> > > > > > >> >> > Leave the fallback from all filesystems and check for the
> > > > > > >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> > > > > > >> >>
> > > > > > >> >> Ok, the diff bellow is just to make sure I understood your suggestion.
> > > > > > >> >>
> > > > > > >> >> The patch will also need to:
> > > > > > >> >>
> > > > > > >> >>  - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> > > > > > >> >>    use the new flag.
> > > > > > >> >>
> > > > > > >> >>  - check flags in generic_copy_file_checks() to make sure only valid flags
> > > > > > >> >>    are used (COPY_FILE_SPLICE at the moment).
> > > > > > >> >>
> > > > > > >> >> Also, where should this flag be defined?  include/uapi/linux/fs.h?
> > > > > > >> >
> > > > > > >> > Grep for REMAP_FILE_
> > > > > > >> > Same header file, same Documentation rst file.
> > > > > > >> >
> > > > > > >> >>
> > > > > > >> >> Cheers,
> > > > > > >> >> --
> > > > > > >> >> Luis
> > > > > > >> >>
> > > > > > >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> > > > > > >> >> index 75f764b43418..341d315d2a96 100644
> > > > > > >> >> --- a/fs/read_write.c
> > > > > > >> >> +++ b/fs/read_write.c
> > > > > > >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > > > > > >> >>                                 struct file *file_out, loff_t pos_out,
> > > > > > >> >>                                 size_t len, unsigned int flags)
> > > > > > >> >>  {
> > > > > > >> >> +       if (!(flags & COPY_FILE_SPLICE)) {
> > > > > > >> >> +               if (!file_out->f_op->copy_file_range)
> > > > > > >> >> +                       return -EOPNOTSUPP;
> > > > > > >> >> +               else if (file_out->f_op->copy_file_range !=
> > > > > > >> >> +                        file_in->f_op->copy_file_range)
> > > > > > >> >> +                       return -EXDEV;
> > > > > > >> >> +       }
> > > > > > >> >
> > > > > > >> > That looks strange, because you are duplicating the logic in
> > > > > > >> > do_copy_file_range(). Maybe better:
> > > > > > >> >
> > > > > > >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > > > > > >> >         return -EINVAL;
> > > > > > >> > if (flags & COPY_FILE_SPLICE)
> > > > > > >> >        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > > > > >> >                                  len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > > > > >>
> > > > > > >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> > > > > > >> to allow the generic_copy_file_range() callers to be left unmodified and
> > > > > > >> allow the filesystems to default to this implementation.
> > > > > > >>
> > > > > > >> With this change, I guess that the calls to generic_copy_file_range() from
> > > > > > >> the different filesystems can be dropped, as in my initial patch, as they
> > > > > > >> will always get -EINVAL.  The other option would be to set the
> > > > > > >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> > > > > > >> problem we're trying to solve.
> > > > > > >
> > > > > > > I don't understand the problem.
> > > > > > >
> > > > > > > What exactly is wrong with the code I suggested?
> > > > > > > Why should any filesystem be changed?
> > > > > > >
> > > > > > > Maybe I am missing something.
> > > > > >
> > > > > > Ok, I have to do a full brain reboot and start all over.
> > > > > >
> > > > > > Before that, I picked the code you suggested and tested it.  I've mounted
> > > > > > a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> > > > > > using /sys/kernel/debug/sched_features as source.  The result was a
> > > > > > 0-sized file in cephfs.  And the reason is thevfs_copy_file_range()
> > > > > > early exit in:
> > > > > >
> > > > > >         if (len == 0)
> > > > > >                 return 0;
> > > > > >
> > > > > > 'len' is set in generic_copy_file_checks().
> > > > >
> > > > > Good point.. I guess we will need to do all the checks earlier in
> > > > > generic_copy_file_checks() including the logic of:
> > > > >
> > > > >         if (file_in->f_op->remap_file_range &&
> > > > >             file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)
> > > > >
> > > > >
> > > > > >
> > > > > > This means that we're not solving the original problem anymore (probably
> > > > > > since v1 of this patch, haven't checked).
> > > > > >
> > > > > > Also, re-reading Trond's emails, I read: "... also disallowing the copy
> > > > > > from, say, an XFS formatted partition to an ext4 partition".  Isn't that
> > > > > > *exactly* what we're trying to do here?  I.e. _prevent_ these copies from
> > > > > > happening so that tracefs files can't be CFR'ed?
> > > > > >
> > > > >
> > > > > We want to address the report which means calls coming from
> > > > > copy_file_range() syscall.
> > > > >
> > > > > Trond's use case is vfs_copy_file_range() coming from nfsd.
> > > > > When he writes about copy from XFS to ext4, he means an
> > > > > NFS client is issuing server side copy (on same or different NFS mounts)
> > > > > and the NFS server is executing nfsd_copy_file_range() on a source
> > > > > file that happens to be on XFS and destination happens to be on ext4.
> > > >
> > > > NFS also supports a server-to-server copy where the destination server
> > > > mounts the source server and reads the data to be copied. Please don't
> > > > break that either :)
> > >
> >
> > As long as the copy is via nfsd_copy_file_range() and not from the syscall
> > it should not regress.
> >
> > > This is a case we will eventually need to support for cifs (SMB3) as well.
> > >
> >
> > samba already does server side copy very well without needing any support
> > from the kernel.
> >
> > nfsd also doesn't *need* to use vfs_copy_file_range() it can use kernel APIs
> > like the loop in ovl_copy_up_data(). But it does, so we should not regress it.
> >
> > samba/nfsd can try to use copy_file_range() and it will work if the
> > source/target
> > fs support it. Otherwise, the server can perfectly well do the copy via other
> > available interfaces, just like userspace copy tools.
>
> I was thinking about cifsd ("ksmbd") the kernel server from
> Namjae/Sergey etc. which is making excellent progress.
>

You are missing my point.
Never mind which server. The server does not *need* to rely on
vfs_copy_file_range() to copy files from XFS to ext4.
The server is very capable of implementing the fallback generic copy
in case source/target fs do not support native {copy,remap}_file_range().

w.r.t semantics of copy_file_range() syscall vs. the fallback to userspace
'cp' tool (check source file size before copy or not), please note that the
semantics of CIFS_IOC_COPYCHUNK_FILE are that of the former:

        rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
                                        src_inode->i_size, 0);

It will copy zero bytes if the advertised source file size is zero.

NFS server side copy semantics are currently de-facto the same
because both the client and the server will have to pass through this
line in vfs_copy_file_range():

        if (len == 0)
                return 0;

IMO, and this opinion was voiced by several other filesystem developers,
the shortened copy semantics are the correct semantics for copy_file_range()
syscall as well as for vfs_copy_file_range() for internal kernel users.

I guess what this means is that if the 'cp' tool ever tries an opportunistic
copy_file_range() syscall (e.g. --cfr=auto), it may result in zero size copy.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-17  8:08                                 ` Amir Goldstein
@ 2021-02-17 17:26                                   ` Luis Henriques
  2021-02-17 20:47                                     ` Amir Goldstein
                                                       ` (3 more replies)
  2021-02-18  0:50                                   ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Andreas Dilger
  1 sibling, 4 replies; 93+ messages in thread
From: Luis Henriques @ 2021-02-17 17:26 UTC (permalink / raw)
  To: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano
  Cc: ceph-devel, linux-kernel, linux-cifs, samba-technical,
	linux-fsdevel, linux-nfs, Luis Henriques

A regression has been reported by Nicolas Boichat, found while using the
copy_file_range syscall to copy a tracefs file.  Before commit
5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
kernel would return -EXDEV to userspace when trying to copy a file across
different filesystems.  After this commit, the syscall doesn't fail anymore
and instead returns zero (zero bytes copied), as this file's content is
generated on-the-fly and thus reports a size of zero.

This patch restores some cross-filesystems copy restrictions that existed
prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
devices").  It also introduces a flag (COPY_FILE_SPLICE) that can be used
by filesystems calling directly into the vfs copy_file_range to override
these restrictions.  Right now, only NFS needs to set this flag.

Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
Reported-by: Nicolas Boichat <drinkcat@chromium.org>
Signed-off-by: Luis Henriques <lhenriques@suse.de>
---
Ok, I've tried to address all the issues and comments.  Hopefully this v3
is a bit closer to the final fix.

Changes since v2
- do all the required checks earlier, in generic_copy_file_checks(),
  adding new checks for ->remap_file_range
- new COPY_FILE_SPLICE flag
- don't remove filesystem's fallback to generic_copy_file_range()
- updated commit changelog (and subject)
Changes since v1 (after Amir review)
- restored do_copy_file_range() helper
- return -EOPNOTSUPP if fs doesn't implement CFR
- updated commit description

 fs/nfsd/vfs.c      |  3 ++-
 fs/read_write.c    | 44 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/fs.h |  7 +++++++
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 04937e51de56..14e55822c223 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -578,7 +578,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
 	 * limit like this and pipeline multiple COPY requests.
 	 */
 	count = min_t(u64, count, 1 << 22);
-	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count,
+				   COPY_FILE_SPLICE);
 }
 
 __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..40a16003fb05 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1410,6 +1410,33 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
 				       flags);
 }
 
+/*
+ * This helper function checks whether copy_file_range can actually be used,
+ * depending on the source and destination filesystems being the same.
+ *
+ * In-kernel callers may set COPY_FILE_SPLICE to override these checks.
+ */
+static int fops_copy_file_checks(struct file *file_in, struct file *file_out,
+				 unsigned int flags)
+{
+	if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
+		return -EINVAL;
+
+	if (flags & COPY_FILE_SPLICE)
+		return 0;
+	/*
+	 * We got here from userspace, so forbid copies if copy_file_range isn't
+	 * implemented or if we're doing a cross-fs copy.
+	 */
+	if (!file_out->f_op->copy_file_range)
+		return -EOPNOTSUPP;
+	else if (file_out->f_op->copy_file_range !=
+		 file_in->f_op->copy_file_range)
+		return -EXDEV;
+
+	return 0;
+}
+
 /*
  * Performs necessary checks before doing a file copy
  *
@@ -1427,6 +1454,14 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
 	loff_t size_in;
 	int ret;
 
+	/* Only check f_ops if we're not trying to clone */
+	if (!file_in->f_op->remap_file_range ||
+	    (file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)) {
+		ret = fops_copy_file_checks(file_in, file_out, flags);
+		if (ret)
+			return ret;
+	}
+
 	ret = generic_file_rw_checks(file_in, file_out);
 	if (ret)
 		return ret;
@@ -1474,9 +1509,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 {
 	ssize_t ret;
 
-	if (flags != 0)
-		return -EINVAL;
-
 	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
 				       flags);
 	if (unlikely(ret))
@@ -1511,6 +1543,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 			ret = cloned;
 			goto done;
 		}
+		ret = fops_copy_file_checks(file_in, file_out, flags);
+		if (ret)
+			return ret;
 	}
 
 	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
@@ -1543,6 +1578,9 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
 	struct fd f_out;
 	ssize_t ret = -EBADF;
 
+	if (flags != 0)
+		return -EINVAL;
+
 	f_in = fdget(fd_in);
 	if (!f_in.file)
 		goto out2;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..6f604926d955 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1815,6 +1815,13 @@ struct dir_context {
  */
 #define REMAP_FILE_ADVISORY		(REMAP_FILE_CAN_SHORTEN)
 
+/*
+ * This flag controls the behavior of copy_file_range from internal (kernel)
+ * users.  It can be used to override the policy of forbidding copies when
+ * source and destination filesystems are different.
+ */
+#define COPY_FILE_SPLICE		(1 << 0)
+
 struct iov_iter;
 
 struct file_operations {

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-17 17:26                                   ` [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies Luis Henriques
@ 2021-02-17 20:47                                     ` Amir Goldstein
  2021-02-18  0:56                                     ` Nicolas Boichat
                                                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-02-17 20:47 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Jeff Layton, Steve French, Miklos Szeredi, Trond Myklebust,
	Anna Schumaker, Alexander Viro, Darrick J. Wong, Dave Chinner,
	Greg KH, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	ceph-devel, linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List

On Wed, Feb 17, 2021 at 7:25 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystems copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").  It also introduces a flag (COPY_FILE_SPLICE) that can be used
> by filesystems calling directly into the vfs copy_file_range to override
> these restrictions.  Right now, only NFS needs to set this flag.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> Ok, I've tried to address all the issues and comments.  Hopefully this v3
> is a bit closer to the final fix.
>
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
>   adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
>  fs/nfsd/vfs.c      |  3 ++-
>  fs/read_write.c    | 44 +++++++++++++++++++++++++++++++++++++++++---
>  include/linux/fs.h |  7 +++++++
>  3 files changed, 50 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..14e55822c223 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -578,7 +578,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>          * limit like this and pipeline multiple COPY requests.
>          */
>         count = min_t(u64, count, 1 << 22);
> -       return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +       return vfs_copy_file_range(src, src_pos, dst, dst_pos, count,
> +                                  COPY_FILE_SPLICE);
>  }
>
>  __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..40a16003fb05 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1410,6 +1410,33 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
>                                        flags);
>  }
>
> +/*
> + * This helper function checks whether copy_file_range can actually be used,
> + * depending on the source and destination filesystems being the same.
> + *
> + * In-kernel callers may set COPY_FILE_SPLICE to override these checks.
> + */
> +static int fops_copy_file_checks(struct file *file_in, struct file *file_out,
> +                                unsigned int flags)
> +{
> +       if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> +               return -EINVAL;
> +
> +       if (flags & COPY_FILE_SPLICE)
> +               return 0;
> +       /*
> +        * We got here from userspace, so forbid copies if copy_file_range isn't
> +        * implemented or if we're doing a cross-fs copy.
> +        */

Suggest:

       if (!file_in->f_op->copy_file_range) {
               if (file_in->f_op->copy_file_range !=
                   file_out->f_op->copy_file_range)
                   return -EXDEV;
       } else if (file_in->f_op->remap_file_range) {
               if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
                    return -EXDEV;
       } else {
                return -EOPNOTSUPP;
       }

       return 0;
}

> +
>  /*
>   * Performs necessary checks before doing a file copy
>   *
> @@ -1427,6 +1454,14 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>         loff_t size_in;
>         int ret;
>
> +       /* Only check f_ops if we're not trying to clone */
> +       if (!file_in->f_op->remap_file_range ||
> +           (file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)) {
> +               ret = fops_copy_file_checks(file_in, file_out, flags);
> +               if (ret)
> +                       return ret;
> +       }
> +

and then you don't need this special casing of clone here.

>         ret = generic_file_rw_checks(file_in, file_out);
>         if (ret)
>                 return ret;
> @@ -1474,9 +1509,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  {
>         ssize_t ret;
>
> -       if (flags != 0)
> -               return -EINVAL;
> -
>         ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
>                                        flags);
>         if (unlikely(ret))
> @@ -1511,6 +1543,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>                         ret = cloned;
>                         goto done;
>                 }
> +               ret = fops_copy_file_checks(file_in, file_out, flags);
> +               if (ret)
> +                       return ret;

and you don't need this here (right?)

and you can remove the checks for same i_sb and same copy_file_range
op that were already tested from vfs_copy_file_range().

Hope I am not missing anything.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-17  8:08                                 ` Amir Goldstein
  2021-02-17 17:26                                   ` [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies Luis Henriques
@ 2021-02-18  0:50                                   ` Andreas Dilger
  2021-02-18  7:34                                     ` gregkh
  1 sibling, 1 reply; 93+ messages in thread
From: Andreas Dilger @ 2021-02-18  0:50 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Steve French, Anna Schumaker, Luis Henriques, Trond Myklebust,
	samba-technical, drinkcat, iant, linux-cifs, darrick.wong,
	linux-kernel, jlayton, llozano, linux-nfs, miklos, viro,
	dchinner, linux-fsdevel, gregkh, sfrench, ceph-devel

[-- Attachment #1: Type: text/plain, Size: 2162 bytes --]

On Feb 17, 2021, at 1:08 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> 
> You are missing my point.
> Never mind which server. The server does not *need* to rely on
> vfs_copy_file_range() to copy files from XFS to ext4.
> The server is very capable of implementing the fallback generic copy
> in case source/target fs do not support native {copy,remap}_file_range().
> 
> w.r.t semantics of copy_file_range() syscall vs. the fallback to userespace
> 'cp' tool (check source file size before copy or not), please note that the
> semantics of CIFS_IOC_COPYCHUNK_FILE are that of the former:
> 
>        rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
>                                        src_inode->i_size, 0);
> 
> It will copy zero bytes if advertised source file size if zero.
> 
> NFS server side copy semantics are currently de-facto the same
> because both the client and the server will have to pass through this
> line in vfs_copy_file_range():
> 
>        if (len == 0)
>                return 0;
> 
> IMO, and this opinion was voiced by several other filesystem developers,
> the shortend copy semantics are the correct semantics for copy_file_range()
> syscall as well as for vfs_copy_file_range() for internal kernel users.
> 
> I guess what this means is that if the 'cp' tool ever tries an opportunistic
> copy_file_range() syscall (e.g. --cfr=auto), it may result in zero size copy.

Having a syscall that does the "wrong thing" when called on two files
doesn't make sense.  Expecting userspace to check whether source/target
files supports CFR is also not practical.  This is trivial for the
kernel to determine and return -EOPNOTSUPP to the caller if the source
file (procfs/sysfs/etc) does not work with CFR properly.

Applications must already handle -EOPNOTSUPP with a fallback, but
expecting all applications that may call copy_file_range() to be
properly coded to handle corner cases is just asking for trouble.
That is doubly true given that an existing widely-used tool like
cp and mv are using this syscall if it is available in the kernel.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-17 17:26                                   ` [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies Luis Henriques
  2021-02-17 20:47                                     ` Amir Goldstein
@ 2021-02-18  0:56                                     ` Nicolas Boichat
  2021-02-18  5:32                                     ` Olga Kornievskaia
  2021-02-18  7:43                                     ` Christoph Hellwig
  3 siblings, 0 replies; 93+ messages in thread
From: Nicolas Boichat @ 2021-02-18  0:56 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Ian Lance Taylor, Luis Lozano, ceph-devel,
	lkml, linux-cifs, samba-technical, linux-fsdevel, linux-nfs

On Thu, Feb 18, 2021 at 1:25 AM Luis Henriques <lhenriques@suse.de> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystems copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").

Note that you also fix intra-filesystem copy_file_range on these
generated filesystems. This is IMHO great, but needs to be mentioned
in the commit message.

>  It also introduces a flag (COPY_FILE_SPLICE) that can be used
> by filesystems calling directly into the vfs copy_file_range to override
> these restrictions.  Right now, only NFS needs to set this flag.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")

So technically this fixes something much older, presumably ever since
copy_file_range was introduced.

> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>

Tested-by: Nicolas Boichat <drinkcat@chromium.org>
but I guess you should not add it to the next revision; I'll keep testing
further revisions ;-)

> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> Ok, I've tried to address all the issues and comments.  Hopefully this v3
> is a bit closer to the final fix.
>
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
>   adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
>  fs/nfsd/vfs.c      |  3 ++-
>  fs/read_write.c    | 44 +++++++++++++++++++++++++++++++++++++++++---
>  include/linux/fs.h |  7 +++++++
>  3 files changed, 50 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..14e55822c223 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -578,7 +578,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>          * limit like this and pipeline multiple COPY requests.
>          */
>         count = min_t(u64, count, 1 << 22);
> -       return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +       return vfs_copy_file_range(src, src_pos, dst, dst_pos, count,
> +                                  COPY_FILE_SPLICE);
>  }
>
>  __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..40a16003fb05 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1410,6 +1410,33 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
>                                        flags);
>  }
>
> +/*
> + * This helper function checks whether copy_file_range can actually be used,
> + * depending on the source and destination filesystems being the same.
> + *
> + * In-kernel callers may set COPY_FILE_SPLICE to override these checks.
> + */
> +static int fops_copy_file_checks(struct file *file_in, struct file *file_out,

fops_copy_file_range_checks ?

> +                                unsigned int flags)
> +{
> +       if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> +               return -EINVAL;
> +
> +       if (flags & COPY_FILE_SPLICE)
> +               return 0;
> +       /*
> +        * We got here from userspace, so forbid copies if copy_file_range isn't
> +        * implemented or if we're doing a cross-fs copy.
> +        */
> +       if (!file_out->f_op->copy_file_range)
> +               return -EOPNOTSUPP;

After this is merged, should this be added as an error code to the man page?

> +       else if (file_out->f_op->copy_file_range !=
> +                file_in->f_op->copy_file_range)

Just note, this could be a cross-fs copy (just not a cross-fs_type copy).

> +               return -EXDEV;
> +
> +       return 0;
> +}
> +
>  /*
>   * Performs necessary checks before doing a file copy
>   *
> @@ -1427,6 +1454,14 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>         loff_t size_in;
>         int ret;
>
> +       /* Only check f_ops if we're not trying to clone */
> +       if (!file_in->f_op->remap_file_range ||
> +           (file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)) {
> +               ret = fops_copy_file_checks(file_in, file_out, flags);
> +               if (ret)
> +                       return ret;
> +       }
> +
>         ret = generic_file_rw_checks(file_in, file_out);
>         if (ret)
>                 return ret;
> @@ -1474,9 +1509,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  {
>         ssize_t ret;
>
> -       if (flags != 0)
> -               return -EINVAL;
> -
>         ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
>                                        flags);
>         if (unlikely(ret))
> @@ -1511,6 +1543,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>                         ret = cloned;
>                         goto done;
>                 }
> +               ret = fops_copy_file_checks(file_in, file_out, flags);
> +               if (ret)
> +                       return ret;
>         }
>
>         ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> @@ -1543,6 +1578,9 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
>         struct fd f_out;
>         ssize_t ret = -EBADF;
>
> +       if (flags != 0)
> +               return -EINVAL;
> +
>         f_in = fdget(fd_in);
>         if (!f_in.file)
>                 goto out2;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index fd47deea7c17..6f604926d955 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1815,6 +1815,13 @@ struct dir_context {
>   */
>  #define REMAP_FILE_ADVISORY            (REMAP_FILE_CAN_SHORTEN)
>
> +/*
> + * This flag control the behavior of copy_file_range from internal (kernel)
> + * users.  It can be used to override the policy of forbidding copies when
> + * source and destination filesystems are different.
> + */
> +#define COPY_FILE_SPLICE               (1 << 0)

nit: BIT(0) ?


> +
>  struct iov_iter;
>
>  struct file_operations {

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-17 17:26                                   ` [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies Luis Henriques
  2021-02-17 20:47                                     ` Amir Goldstein
  2021-02-18  0:56                                     ` Nicolas Boichat
@ 2021-02-18  5:32                                     ` Olga Kornievskaia
  2021-02-18  6:47                                       ` Amir Goldstein
  2021-02-18  7:43                                     ` Christoph Hellwig
  3 siblings, 1 reply; 93+ messages in thread
From: Olga Kornievskaia @ 2021-02-18  5:32 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, ceph-devel, linux-kernel, CIFS, samba-technical,
	linux-fsdevel, linux-nfs

On Wed, Feb 17, 2021 at 3:30 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystems copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").  It also introduces a flag (COPY_FILE_SPLICE) that can be used
> by filesystems calling directly into the vfs copy_file_range to override
> these restrictions.  Right now, only NFS needs to set this flag.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> Ok, I've tried to address all the issues and comments.  Hopefully this v3
> is a bit closer to the final fix.
>
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
>   adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description

In my testing, this patch breaks NFS server-to-server copy file.

>
>  fs/nfsd/vfs.c      |  3 ++-
>  fs/read_write.c    | 44 +++++++++++++++++++++++++++++++++++++++++---
>  include/linux/fs.h |  7 +++++++
>  3 files changed, 50 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..14e55822c223 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -578,7 +578,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>          * limit like this and pipeline multiple COPY requests.
>          */
>         count = min_t(u64, count, 1 << 22);
> -       return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +       return vfs_copy_file_range(src, src_pos, dst, dst_pos, count,
> +                                  COPY_FILE_SPLICE);
>  }
>
>  __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..40a16003fb05 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1410,6 +1410,33 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
>                                        flags);
>  }
>
> +/*
> + * This helper function checks whether copy_file_range can actually be used,
> + * depending on the source and destination filesystems being the same.
> + *
> + * In-kernel callers may set COPY_FILE_SPLICE to override these checks.
> + */
> +static int fops_copy_file_checks(struct file *file_in, struct file *file_out,
> +                                unsigned int flags)
> +{
> +       if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> +               return -EINVAL;
> +
> +       if (flags & COPY_FILE_SPLICE)
> +               return 0;
> +       /*
> +        * We got here from userspace, so forbid copies if copy_file_range isn't
> +        * implemented or if we're doing a cross-fs copy.
> +        */
> +       if (!file_out->f_op->copy_file_range)
> +               return -EOPNOTSUPP;
> +       else if (file_out->f_op->copy_file_range !=
> +                file_in->f_op->copy_file_range)
> +               return -EXDEV;
> +
> +       return 0;
> +}
> +
>  /*
>   * Performs necessary checks before doing a file copy
>   *
> @@ -1427,6 +1454,14 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>         loff_t size_in;
>         int ret;
>
> +       /* Only check f_ops if we're not trying to clone */
> +       if (!file_in->f_op->remap_file_range ||
> +           (file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)) {
> +               ret = fops_copy_file_checks(file_in, file_out, flags);
> +               if (ret)
> +                       return ret;
> +       }
> +
>         ret = generic_file_rw_checks(file_in, file_out);
>         if (ret)
>                 return ret;
> @@ -1474,9 +1509,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>  {
>         ssize_t ret;
>
> -       if (flags != 0)
> -               return -EINVAL;
> -
>         ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
>                                        flags);
>         if (unlikely(ret))
> @@ -1511,6 +1543,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>                         ret = cloned;
>                         goto done;
>                 }
> +               ret = fops_copy_file_checks(file_in, file_out, flags);
> +               if (ret)
> +                       return ret;
>         }
>
>         ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> @@ -1543,6 +1578,9 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
>         struct fd f_out;
>         ssize_t ret = -EBADF;
>
> +       if (flags != 0)
> +               return -EINVAL;
> +
>         f_in = fdget(fd_in);
>         if (!f_in.file)
>                 goto out2;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index fd47deea7c17..6f604926d955 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1815,6 +1815,13 @@ struct dir_context {
>   */
>  #define REMAP_FILE_ADVISORY            (REMAP_FILE_CAN_SHORTEN)
>
> +/*
> + * This flag control the behavior of copy_file_range from internal (kernel)
> + * users.  It can be used to override the policy of forbidding copies when
> + * source and destination filesystems are different.
> + */
> +#define COPY_FILE_SPLICE               (1 << 0)
> +
>  struct iov_iter;
>
>  struct file_operations {

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-18  5:32                                     ` Olga Kornievskaia
@ 2021-02-18  6:47                                       ` Amir Goldstein
  2021-02-18 16:28                                         ` Olga Kornievskaia
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-18  6:47 UTC (permalink / raw)
  To: Olga Kornievskaia
  Cc: Luis Henriques, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, ceph-devel, linux-kernel, CIFS, samba-technical,
	linux-fsdevel, linux-nfs

On Thu, Feb 18, 2021 at 7:33 AM Olga Kornievskaia <aglo@umich.edu> wrote:
>
> On Wed, Feb 17, 2021 at 3:30 PM Luis Henriques <lhenriques@suse.de> wrote:
> >
> > A regression has been reported by Nicolas Boichat, found while using the
> > copy_file_range syscall to copy a tracefs file.  Before commit
> > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > kernel would return -EXDEV to userspace when trying to copy a file across
> > different filesystems.  After this commit, the syscall doesn't fail anymore
> > and instead returns zero (zero bytes copied), as this file's content is
> > generated on-the-fly and thus reports a size of zero.
> >
> > This patch restores some cross-filesystems copy restrictions that existed
> > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > devices").  It also introduces a flag (COPY_FILE_SPLICE) that can be used
> > by filesystems calling directly into the vfs copy_file_range to override
> > these restrictions.  Right now, only NFS needs to set this flag.
> >
> > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > ---
> > Ok, I've tried to address all the issues and comments.  Hopefully this v3
> > is a bit closer to the final fix.
> >
> > Changes since v2
> > - do all the required checks earlier, in generic_copy_file_checks(),
> >   adding new checks for ->remap_file_range
> > - new COPY_FILE_SPLICE flag
> > - don't remove filesystem's fallback to generic_copy_file_range()
> > - updated commit changelog (and subject)
> > Changes since v1 (after Amir review)
> > - restored do_copy_file_range() helper
> > - return -EOPNOTSUPP if fs doesn't implement CFR
> > - updated commit description
>
> In my testing, this patch breaks NFS server-to-server copy file.

Hi Olga,

Can you please provide more details on the failed tests?

Does it fail on the client between two nfs mounts or does it fail
on the server? If the latter, between which two filesystems on the server?

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-18  0:50                                   ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Andreas Dilger
@ 2021-02-18  7:34                                     ` gregkh
  0 siblings, 0 replies; 93+ messages in thread
From: gregkh @ 2021-02-18  7:34 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Amir Goldstein, Steve French, Anna Schumaker, Luis Henriques,
	Trond Myklebust, samba-technical, drinkcat, iant, linux-cifs,
	darrick.wong, linux-kernel, jlayton, llozano, linux-nfs, miklos,
	viro, dchinner, linux-fsdevel, sfrench, ceph-devel

On Wed, Feb 17, 2021 at 05:50:35PM -0700, Andreas Dilger wrote:
> On Feb 17, 2021, at 1:08 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> > 
> > You are missing my point.
> > Never mind which server. The server does not *need* to rely on
> > vfs_copy_file_range() to copy files from XFS to ext4.
> > The server is very capable of implementing the fallback generic copy
> > in case source/target fs do not support native {copy,remap}_file_range().
> > 
> > w.r.t semantics of copy_file_range() syscall vs. the fallback to userspace
> > 'cp' tool (check source file size before copy or not), please note that the
> > semantics of CIFS_IOC_COPYCHUNK_FILE are that of the former:
> > 
> >        rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
> >                                        src_inode->i_size, 0);
> > 
> > It will copy zero bytes if the advertised source file size is zero.
> > 
> > NFS server side copy semantics are currently de-facto the same
> > because both the client and the server will have to pass through this
> > line in vfs_copy_file_range():
> > 
> >        if (len == 0)
> >                return 0;
> > 
> > IMO, and this opinion was voiced by several other filesystem developers,
> > the shortened copy semantics are the correct semantics for copy_file_range()
> > syscall as well as for vfs_copy_file_range() for internal kernel users.
> > 
> > I guess what this means is that if the 'cp' tool ever tries an opportunistic
> > copy_file_range() syscall (e.g. --cfr=auto), it may result in zero size copy.
> 
> Having a syscall that does the "wrong thing" when called on two files
> doesn't make sense.  Expecting userspace to check whether source/target
> files support CFR is also not practical.  This is trivial for the
> kernel to determine and return -EOPNOTSUPP to the caller if the source
> file (procfs/sysfs/etc) does not work with CFR properly.

How does the kernel "know" that a specific file in a specific filesystem
will not work with CFR "properly"?  That goes back to the original patch
which tried to label each and every filesystem type with a
"supported/not supported" type of flag, which was going to be a mess,
especially as it seems that this might be a file-specific thing, not a
filesystem-specific thing.

The goal of the patch _should_ be that the kernel figure it out itself,
but so far no one seems to be able to explain how that can be done :(

So, any hints?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-15 15:43 ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Luis Henriques
                     ` (2 preceding siblings ...)
  2021-02-17  4:45   ` Nicolas Boichat
@ 2021-02-18  7:42   ` Christoph Hellwig
  2021-02-18  9:10     ` Amir Goldstein
  3 siblings, 1 reply; 93+ messages in thread
From: Christoph Hellwig @ 2021-02-18  7:42 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, ceph-devel, linux-kernel, linux-cifs,
	samba-technical, linux-fsdevel, linux-nfs

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

This whole idea of cross-device copies has always been a horrible idea,
and I've been arguing against it since the patches were posted.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-17 17:26                                   ` [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies Luis Henriques
                                                       ` (2 preceding siblings ...)
  2021-02-18  5:32                                     ` Olga Kornievskaia
@ 2021-02-18  7:43                                     ` Christoph Hellwig
  3 siblings, 0 replies; 93+ messages in thread
From: Christoph Hellwig @ 2021-02-18  7:43 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, ceph-devel, linux-kernel, linux-cifs,
	samba-technical, linux-fsdevel, linux-nfs

On Wed, Feb 17, 2021 at 05:26:54PM +0000, Luis Henriques wrote:
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
> 
> This patch restores some cross-filesystems copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").  It also introduces a flag (COPY_FILE_SPLICE) that can be used
> by filesystems calling directly into the vfs copy_file_range to override
> these restrictions.  Right now, only NFS needs to set this flag.

No need for the flag.  Just fall back to splicing in the only caller
that wants it.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-18  7:42   ` Christoph Hellwig
@ 2021-02-18  9:10     ` Amir Goldstein
  2021-02-18 10:29       ` Luis Henriques
  2021-02-18 20:41       ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Steve French
  0 siblings, 2 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-02-18  9:10 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Luis Henriques, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, ceph-devel, linux-kernel, CIFS, samba-technical,
	linux-fsdevel, Linux NFS Mailing List

On Thu, Feb 18, 2021 at 9:42 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> Looks good:
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
>
> This whole idea of cross-device copies has always been a horrible idea,
> and I've been arguing against it since the patches were posted.

Ok. I'm good with this v2 as well, but need to add the fallback to
do_splice_direct()
in nfsd_copy_file_range(), because this patch breaks it.

And the commit message of v3 is better in describing the reported issue.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-18  9:10     ` Amir Goldstein
@ 2021-02-18 10:29       ` Luis Henriques
  2021-02-18 12:15         ` Luis Henriques
  2021-02-18 20:41       ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Steve French
  1 sibling, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-18 10:29 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Christoph Hellwig, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, ceph-devel, linux-kernel, CIFS, samba-technical,
	linux-fsdevel, Linux NFS Mailing List

Amir Goldstein <amir73il@gmail.com> writes:

> On Thu, Feb 18, 2021 at 9:42 AM Christoph Hellwig <hch@infradead.org> wrote:
>>
>> Looks good:
>>
>> Reviewed-by: Christoph Hellwig <hch@lst.de>
>>
>> This whole idea of cross-device copies has always been a horrible idea,
>> and I've been arguing against it since the patches were posted.
>
> Ok. I'm good with this v2 as well, but need to add the fallback to
> do_splice_direct()
> in nfsd_copy_file_range(), because this patch breaks it.
>
> And the commit message of v3 is better in describing the reported issue.

Except that, as I said in a previous email, v2 doesn't really fix the
issue: all the checks need to be done earlier in generic_copy_file_checks().

I'll work on getting v4, based on v2 but moving the checks and
implementing your review suggestions to v3 (plus this nfs change).

Cheers,
-- 
Luis

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-18 10:29       ` Luis Henriques
@ 2021-02-18 12:15         ` Luis Henriques
  2021-02-18 12:49           ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-18 12:15 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Christoph Hellwig, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, ceph-devel, linux-kernel, CIFS, samba-technical,
	linux-fsdevel, Linux NFS Mailing List

Luis Henriques <lhenriques@suse.de> writes:

> Amir Goldstein <amir73il@gmail.com> writes:
>
>> On Thu, Feb 18, 2021 at 9:42 AM Christoph Hellwig <hch@infradead.org> wrote:
>>>
>>> Looks good:
>>>
>>> Reviewed-by: Christoph Hellwig <hch@lst.de>
>>>
>>> This whole idea of cross-device copies has always been a horrible idea,
>>> and I've been arguing against it since the patches were posted.
>>
>> Ok. I'm good with this v2 as well, but need to add the fallback to
>> do_splice_direct()
>> in nfsd_copy_file_range(), because this patch breaks it.
>>
>> And the commit message of v3 is better in describing the reported issue.
>
> Except that, as I said in a previous email, v2 doesn't really fix the
> issue: all the checks need to be done earlier in generic_copy_file_checks().
>
> I'll work on getting v4, based on v2 but moving the checks and
> implementing your review suggestions to v3 (plus this nfs change).

There's something else:

The filesystems (nfs, ceph, cifs, fuse) rely on the fallback to
generic_copy_file_range() if something's wrong.  And this "something's
wrong" is fs specific.  For example: in ceph it is possible to offload the
file copy to the OSDs even if the files are in different filesystems as
long as these filesystems are on the *same* ceph cluster.  If the copy
being done is across two different clusters, then the copy reverts to
splice.  This means that the boilerplate code being removed in v2 of this
patch needs to be restored and replaced by:

	ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
				     len, flags);

	if (ret == -EOPNOTSUPP || ret == -EXDEV)
		ret = do_splice_direct(src_file, &src_off, dst_file, &dst_off,
				       len > MAX_RW_COUNT ? MAX_RW_COUNT : len,
				       flags);
	return ret;

A quick look at the other filesystems' code indicates similar patterns.
Since at this point we've gone through all the syscall checks already,
calling do_splice_direct() shouldn't be a huge change.  But I may be
missing something.  Again.  Which is quite likely :-)

Cheers,
-- 
Luis

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-18 12:15         ` Luis Henriques
@ 2021-02-18 12:49           ` Amir Goldstein
  2021-02-18 14:36             ` [PATCH v4] vfs: fix copy_file_range regression in cross-fs copies Luis Henriques
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-18 12:49 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Christoph Hellwig, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, ceph-devel, linux-kernel, CIFS, samba-technical,
	linux-fsdevel, Linux NFS Mailing List

On Thu, Feb 18, 2021 at 2:14 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> Luis Henriques <lhenriques@suse.de> writes:
>
> > Amir Goldstein <amir73il@gmail.com> writes:
> >
> >> On Thu, Feb 18, 2021 at 9:42 AM Christoph Hellwig <hch@infradead.org> wrote:
> >>>
> >>> Looks good:
> >>>
> >>> Reviewed-by: Christoph Hellwig <hch@lst.de>
> >>>
> >>> This whole idea of cross-device copies has always been a horrible idea,
> >>> and I've been arguing against it since the patches were posted.
> >>
> >> Ok. I'm good with this v2 as well, but need to add the fallback to
> >> do_splice_direct()
> >> in nfsd_copy_file_range(), because this patch breaks it.
> >>
> >> And the commit message of v3 is better in describing the reported issue.
> >
> > Except that, as I said in a previous email, v2 doesn't really fix the
> > issue: all the checks need to be done earlier in generic_copy_file_checks().
> >
> > I'll work on getting v4, based on v2 but moving the checks and
> > implementing your review suggestions to v3 (plus this nfs change).
>
> There's something else:
>
> The filesystems (nfs, ceph, cifs, fuse) rely on the fallback to
> generic_copy_file_range() if something's wrong.  And this "something's
> wrong" is fs specific.  For example: in ceph it is possible to offload the
> file copy to the OSDs even if the files are in different filesystems as
> long as these filesystems are on the *same* ceph cluster.  If the copy
> being done is across two different clusters, then the copy reverts to
> splice.  This means that the boilerplate code being removed in v2 of this
> patch needs to be restored and replaced by:
>
>         ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
>                                      len, flags);
>
>         if (ret == -EOPNOTSUPP || ret == -EXDEV)
>                 ret = do_splice_direct(src_file, &src_off, dst_file, &dst_off,
>                                        len > MAX_RW_COUNT ? MAX_RW_COUNT : len,
>                                        flags);
>         return ret;
>

Why not leave the filesystem code as is and leave the
generic_copy_file_range() helper? Less churn.

Then nfsd_copy_file_range() can also fallback to generic_copy_file_range().

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [PATCH v4] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-18 12:49           ` Amir Goldstein
@ 2021-02-18 14:36             ` Luis Henriques
  2021-02-18 14:58               ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-18 14:36 UTC (permalink / raw)
  To: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig
  Cc: ceph-devel, linux-kernel, linux-cifs, samba-technical,
	linux-fsdevel, linux-nfs, Luis Henriques

A regression has been reported by Nicolas Boichat, found while using the
copy_file_range syscall to copy a tracefs file.  Before commit
5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
kernel would return -EXDEV to userspace when trying to copy a file across
different filesystems.  After this commit, the syscall doesn't fail anymore
and instead returns zero (zero bytes copied), as this file's content is
generated on-the-fly and thus reports a size of zero.

This patch restores some cross-filesystem copy restrictions that existed
prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
devices").  Filesystems are still allowed to fall-back to the VFS
generic_copy_file_range() implementation, but that has now to be done
explicitly.

nfsd is also modified to use generic_copy_file_range() instead of
vfs_copy_file_range() so that it can still fall-back to splice without going
through all the checks.

Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
Reported-by: Nicolas Boichat <drinkcat@chromium.org>
Signed-off-by: Luis Henriques <lhenriques@suse.de>
---
And here's v4.  I'd like to request help for testing.  I know Nicolas is
doing that (thanks!  and thanks for the reviews).  But it would be great to
get at least the nfs code tested.  Olga, can you help here?

Changes since v3
- dropped the COPY_FILE_SPLICE flag
- kept the f_op's checks early in generic_copy_file_checks, implementing
  Amir's suggestions
- modified nfsd to use generic_copy_file_range()
Changes since v2
- do all the required checks earlier, in generic_copy_file_checks(),
  adding new checks for ->remap_file_range
- new COPY_FILE_SPLICE flag
- don't remove filesystem's fallback to generic_copy_file_range()
- updated commit changelog (and subject)
Changes since v1 (after Amir review)
- restored do_copy_file_range() helper
- return -EOPNOTSUPP if fs doesn't implement CFR
- updated commit description

 fs/nfsd/vfs.c   |  2 +-
 fs/read_write.c | 50 +++++++++++++++++++++++--------------------------
 2 files changed, 24 insertions(+), 28 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 04937e51de56..49dd28ee2602 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -578,7 +578,7 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
 	 * limit like this and pipeline multiple COPY requests.
 	 */
 	count = min_t(u64, count, 1 << 22);
-	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+	return generic_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
 }
 
 __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..214d44f7cbfa 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 }
 EXPORT_SYMBOL(generic_copy_file_range);
 
-static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
-				  struct file *file_out, loff_t pos_out,
-				  size_t len, unsigned int flags)
-{
-	/*
-	 * Although we now allow filesystems to handle cross sb copy, passing
-	 * a file of the wrong filesystem type to filesystem driver can result
-	 * in an attempt to dereference the wrong type of ->private_data, so
-	 * avoid doing that until we really have a good reason.  NFS defines
-	 * several different file_system_type structures, but they all end up
-	 * using the same ->copy_file_range() function pointer.
-	 */
-	if (file_out->f_op->copy_file_range &&
-	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
-		return file_out->f_op->copy_file_range(file_in, pos_in,
-						       file_out, pos_out,
-						       len, flags);
-
-	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				       flags);
-}
-
 /*
  * Performs necessary checks before doing a file copy
  *
@@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
 	loff_t size_in;
 	int ret;
 
+	/*
+	 * Although we now allow filesystems to handle cross sb copy, passing
+	 * a file of the wrong filesystem type to filesystem driver can result
+	 * in an attempt to dereference the wrong type of ->private_data, so
+	 * avoid doing that until we really have a good reason.  NFS defines
+	 * several different file_system_type structures, but they all end up
+	 * using the same ->copy_file_range() function pointer.
+	 */
+	if (file_out->f_op->copy_file_range) {
+		if (file_in->f_op->copy_file_range !=
+		    file_out->f_op->copy_file_range)
+			return -EXDEV;
+	} else if (file_in->f_op->remap_file_range) {
+		if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+			return -EXDEV;
+	} else {
+                return -EOPNOTSUPP;
+	}
+
 	ret = generic_file_rw_checks(file_in, file_out);
 	if (ret)
 		return ret;
@@ -1499,8 +1496,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	 * Try cloning first, this is supported by more file systems, and
 	 * more efficient if both clone and copy are supported (e.g. NFS).
 	 */
-	if (file_in->f_op->remap_file_range &&
-	    file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
+	if (file_in->f_op->remap_file_range) {
 		loff_t cloned;
 
 		cloned = file_in->f_op->remap_file_range(file_in, pos_in,
@@ -1513,9 +1509,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 		}
 	}
 
-	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				flags);
-	WARN_ON_ONCE(ret == -EOPNOTSUPP);
+	ret = file_out->f_op->copy_file_range(file_in, pos_in,
+					      file_out, pos_out,
+					      len, flags);
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v4] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-18 14:36             ` [PATCH v4] vfs: fix copy_file_range regression in cross-fs copies Luis Henriques
@ 2021-02-18 14:58               ` Amir Goldstein
  2021-02-18 15:17                 ` [PATCH v5] " Luis Henriques
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-18 14:58 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Jeff Layton, Steve French, Miklos Szeredi, Trond Myklebust,
	Anna Schumaker, Alexander Viro, Darrick J. Wong, Dave Chinner,
	Greg KH, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List

On Thu, Feb 18, 2021 at 4:35 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystem copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").  Filesystems are still allowed to fall-back to the VFS
> generic_copy_file_range() implementation, but that has now to be done
> explicitly.
>
> nfsd is also modified to use generic_copy_file_range() instead of
> vfs_copy_file_range() so that it can still fall-back to splice without going
> through all the checks.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> And here's v4.  I'd like to request help for testing.  I know Nicolas is
> doing that (thanks!  and thanks for the reviews).  But it would be great to
> get at least the nfs code tested.  Olga, can you help here?
>
> Changes since v3
> - dropped the COPY_FILE_SPLICE flag
> - kept the f_op's checks early in generic_copy_file_checks, implementing
>   Amir's suggestions
> - modified nfsd to use generic_copy_file_range()
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
>   adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
>  fs/nfsd/vfs.c   |  2 +-
>  fs/read_write.c | 50 +++++++++++++++++++++++--------------------------
>  2 files changed, 24 insertions(+), 28 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..49dd28ee2602 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -578,7 +578,7 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>          * limit like this and pipeline multiple COPY requests.
>          */
>         count = min_t(u64, count, 1 << 22);
> -       return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +       return generic_copy_file_range(src, src_pos, dst, dst_pos, count, 0);

That is not the desired change.
It should try vfs_copy_file_range() and fall back to generic_copy_file_range()
for EXDEV and EOPNOTSUPP.
I will explain why.
This code runs on nfs server.
The nfs client requested remote server side copy offload using
nfs4_copy_file_range() and remote request is handled here.
It is not enough to call generic_copy_file_range() on the server because
the source and destination themselves can be on yet another remote
location (cifs/ceph/nfs), so this is why calling vfs_copy_file_range()
here is important.
At least that is my understanding.
Unlike userspace copy fallback, if the server returns -EXDEV the client
will need to transfer the data over the network.
That is why the generic_copy_file_range() fallback is important.


>  }
>
>  __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..214d44f7cbfa 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>  }
>  EXPORT_SYMBOL(generic_copy_file_range);
>
> -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> -                                 struct file *file_out, loff_t pos_out,
> -                                 size_t len, unsigned int flags)
> -{
> -       /*
> -        * Although we now allow filesystems to handle cross sb copy, passing
> -        * a file of the wrong filesystem type to filesystem driver can result
> -        * in an attempt to dereference the wrong type of ->private_data, so
> -        * avoid doing that until we really have a good reason.  NFS defines
> -        * several different file_system_type structures, but they all end up
> -        * using the same ->copy_file_range() function pointer.
> -        */
> -       if (file_out->f_op->copy_file_range &&
> -           file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> -               return file_out->f_op->copy_file_range(file_in, pos_in,
> -                                                      file_out, pos_out,
> -                                                      len, flags);
> -
> -       return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -                                      flags);
> -}
> -
>  /*
>   * Performs necessary checks before doing a file copy
>   *
> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>         loff_t size_in;
>         int ret;
>
> +       /*
> +        * Although we now allow filesystems to handle cross sb copy, passing
> +        * a file of the wrong filesystem type to filesystem driver can result
> +        * in an attempt to dereference the wrong type of ->private_data, so
> +        * avoid doing that until we really have a good reason.  NFS defines
> +        * several different file_system_type structures, but they all end up
> +        * using the same ->copy_file_range() function pointer.
> +        */
> +       if (file_out->f_op->copy_file_range) {
> +               if (file_in->f_op->copy_file_range !=
> +                   file_out->f_op->copy_file_range)
> +                       return -EXDEV;
> +       } else if (file_in->f_op->remap_file_range) {
> +               if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> +                       return -EXDEV;
> +       } else {
> +                return -EOPNOTSUPP;
> +       }
> +
>         ret = generic_file_rw_checks(file_in, file_out);
>         if (ret)
>                 return ret;
> @@ -1499,8 +1496,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>          * Try cloning first, this is supported by more file systems, and
>          * more efficient if both clone and copy are supported (e.g. NFS).
>          */
> -       if (file_in->f_op->remap_file_range &&
> -           file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
> +       if (file_in->f_op->remap_file_range) {
>                 loff_t cloned;
>
>                 cloned = file_in->f_op->remap_file_range(file_in, pos_in,
> @@ -1513,9 +1509,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>                 }
>         }
>
> -       ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -                               flags);
> -       WARN_ON_ONCE(ret == -EOPNOTSUPP);
> +       ret = file_out->f_op->copy_file_range(file_in, pos_in,
> +                                             file_out, pos_out,
> +                                             len, flags);

I see you have made an assumption here that if we did not clone then
file_out->f_op->copy_file_range must be valid.
It is not true.
file_out->f_op->copy_file_range could be NULL and we got here because
remap_file_range was attempted and failed.
So you still need to check for non-NULL file_out->f_op->copy_file_range
here just like it was before the regressing commit.

Otherwise, looks ok to me, but without NFS testing we won't know for sure
It's a tricky one...

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [PATCH v5] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-18 14:58               ` Amir Goldstein
@ 2021-02-18 15:17                 ` Luis Henriques
  2021-02-18 15:53                   ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-18 15:17 UTC (permalink / raw)
  To: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig
  Cc: ceph-devel, linux-kernel, linux-cifs, samba-technical,
	linux-fsdevel, linux-nfs, Luis Henriques

A regression has been reported by Nicolas Boichat, found while using the
copy_file_range syscall to copy a tracefs file.  Before commit
5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
kernel would return -EXDEV to userspace when trying to copy a file across
different filesystems.  After this commit, the syscall doesn't fail anymore
and instead returns zero (zero bytes copied), as this file's content is
generated on-the-fly and thus reports a size of zero.

This patch restores some cross-filesystem copy restrictions that existed
prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
devices").  Filesystems are still allowed to fall-back to the VFS
generic_copy_file_range() implementation, but that has now to be done
explicitly.

nfsd is also modified to fall-back into generic_copy_file_range() in case
vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.

Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
Reported-by: Nicolas Boichat <drinkcat@chromium.org>
Signed-off-by: Luis Henriques <lhenriques@suse.de>
---
And v5!  Sorry.  Sure, it makes sense to go through all the vfs_cfr()
checks first.

Again, here's my request for testing.

Changes since v4
- nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
  or -EXDEV.
Changes since v3
- dropped the COPY_FILE_SPLICE flag
- kept the f_op's checks early in generic_copy_file_checks, implementing
  Amir's suggestions
- modified nfsd to use generic_copy_file_range()
Changes since v2
- do all the required checks earlier, in generic_copy_file_checks(),
  adding new checks for ->remap_file_range
- new COPY_FILE_SPLICE flag
- don't remove filesystem's fallback to generic_copy_file_range()
- updated commit changelog (and subject)
Changes since v1 (after Amir review)
- restored do_copy_file_range() helper
- return -EOPNOTSUPP if fs doesn't implement CFR
- updated commit description
 fs/nfsd/vfs.c   |  8 +++++++-
 fs/read_write.c | 50 +++++++++++++++++++++++--------------------------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 04937e51de56..23dab0fa9087 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
 ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
 			     u64 dst_pos, u64 count)
 {
+	ssize_t ret;
 
 	/*
 	 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
@@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
 	 * limit like this and pipeline multiple COPY requests.
 	 */
 	count = min_t(u64, count, 1 << 22);
-	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+	ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+
+	if (ret == -EOPNOTSUPP || ret == -EXDEV)
+		ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
+					      count, 0);
+	return ret;
 }
 
 __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..214d44f7cbfa 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 }
 EXPORT_SYMBOL(generic_copy_file_range);
 
-static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
-				  struct file *file_out, loff_t pos_out,
-				  size_t len, unsigned int flags)
-{
-	/*
-	 * Although we now allow filesystems to handle cross sb copy, passing
-	 * a file of the wrong filesystem type to filesystem driver can result
-	 * in an attempt to dereference the wrong type of ->private_data, so
-	 * avoid doing that until we really have a good reason.  NFS defines
-	 * several different file_system_type structures, but they all end up
-	 * using the same ->copy_file_range() function pointer.
-	 */
-	if (file_out->f_op->copy_file_range &&
-	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
-		return file_out->f_op->copy_file_range(file_in, pos_in,
-						       file_out, pos_out,
-						       len, flags);
-
-	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				       flags);
-}
-
 /*
  * Performs necessary checks before doing a file copy
  *
@@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
 	loff_t size_in;
 	int ret;
 
+	/*
+	 * Although we now allow filesystems to handle cross sb copy, passing
+	 * a file of the wrong filesystem type to filesystem driver can result
+	 * in an attempt to dereference the wrong type of ->private_data, so
+	 * avoid doing that until we really have a good reason.  NFS defines
+	 * several different file_system_type structures, but they all end up
+	 * using the same ->copy_file_range() function pointer.
+	 */
+	if (file_out->f_op->copy_file_range) {
+		if (file_in->f_op->copy_file_range !=
+		    file_out->f_op->copy_file_range)
+			return -EXDEV;
+	} else if (file_in->f_op->remap_file_range) {
+		if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+			return -EXDEV;
+	} else {
+                return -EOPNOTSUPP;
+	}
+
 	ret = generic_file_rw_checks(file_in, file_out);
 	if (ret)
 		return ret;
@@ -1499,8 +1496,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	 * Try cloning first, this is supported by more file systems, and
 	 * more efficient if both clone and copy are supported (e.g. NFS).
 	 */
-	if (file_in->f_op->remap_file_range &&
-	    file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
+	if (file_in->f_op->remap_file_range) {
 		loff_t cloned;
 
 		cloned = file_in->f_op->remap_file_range(file_in, pos_in,
@@ -1513,9 +1509,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 		}
 	}
 
-	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				flags);
-	WARN_ON_ONCE(ret == -EOPNOTSUPP);
+	ret = file_out->f_op->copy_file_range(file_in, pos_in,
+					      file_out, pos_out,
+					      len, flags);
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v5] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-18 15:17                 ` [PATCH v5] " Luis Henriques
@ 2021-02-18 15:53                   ` Amir Goldstein
  2021-02-18 16:35                     ` Luis Henriques
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-18 15:53 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Jeff Layton, Steve French, Miklos Szeredi, Trond Myklebust,
	Anna Schumaker, Alexander Viro, Darrick J. Wong, Dave Chinner,
	Greg KH, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List

On Thu, Feb 18, 2021 at 5:16 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystem copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").  Filesystems are still allowed to fall-back to the VFS
> generic_copy_file_range() implementation, but that has now to be done
> explicitly.
>
> nfsd is also modified to fall-back into generic_copy_file_range() in case
> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> And v5!  Sorry.  Sure, it makes sense to go through all the vfs_cfr()
> checks first.

You missed my other comment on v4...

not checking NULL copy_file_range case.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-18  6:47                                       ` Amir Goldstein
@ 2021-02-18 16:28                                         ` Olga Kornievskaia
  0 siblings, 0 replies; 93+ messages in thread
From: Olga Kornievskaia @ 2021-02-18 16:28 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Luis Henriques, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, ceph-devel, linux-kernel, CIFS, samba-technical,
	linux-fsdevel, linux-nfs

On Thu, Feb 18, 2021 at 1:48 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Thu, Feb 18, 2021 at 7:33 AM Olga Kornievskaia <aglo@umich.edu> wrote:
> >
> > On Wed, Feb 17, 2021 at 3:30 PM Luis Henriques <lhenriques@suse.de> wrote:
> > >
> > > A regression has been reported by Nicolas Boichat, found while using the
> > > copy_file_range syscall to copy a tracefs file.  Before commit
> > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > > kernel would return -EXDEV to userspace when trying to copy a file across
> > > different filesystems.  After this commit, the syscall doesn't fail anymore
> > > and instead returns zero (zero bytes copied), as this file's content is
> > > generated on-the-fly and thus reports a size of zero.
> > >
> > > This patch restores some cross-filesystems copy restrictions that existed
> > > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > devices").  It also introduces a flag (COPY_FILE_SPLICE) that can be used
> > > by filesystems calling directly into the vfs copy_file_range to override
> > > these restrictions.  Right now, only NFS needs to set this flag.
> > >
> > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > > Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> > > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> > > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > > ---
> > > Ok, I've tried to address all the issues and comments.  Hopefully this v3
> > > is a bit closer to the final fix.
> > >
> > > Changes since v2
> > > - do all the required checks earlier, in generic_copy_file_checks(),
> > >   adding new checks for ->remap_file_range
> > > - new COPY_FILE_SPLICE flag
> > > - don't remove filesystem's fallback to generic_copy_file_range()
> > > - updated commit changelog (and subject)
> > > Changes since v1 (after Amir review)
> > > - restored do_copy_file_range() helper
> > > - return -EOPNOTSUPP if fs doesn't implement CFR
> > > - updated commit description
> >
> > In my testing, this patch breaks NFS server-to-server copy file.
>
> Hi Olga,
>
> Can you please provide more details on the failed tests.
>
> Does it fail on the client between two nfs mounts or does it fail
> on the server? If the latter, between which two filesystems on the server?
>

It was a pilot error. V3 worked. I'm having some other issues with
server to server copy code but they seem to be unrelated to this. I
will test the new v6 version when it comes out.

> Thanks,
> Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v5] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-18 15:53                   ` Amir Goldstein
@ 2021-02-18 16:35                     ` Luis Henriques
  2021-02-18 17:18                       ` [PATCH v6] " Luis Henriques
  0 siblings, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-18 16:35 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Jeff Layton, Steve French, Miklos Szeredi, Trond Myklebust,
	Anna Schumaker, Alexander Viro, Darrick J. Wong, Dave Chinner,
	Greg KH, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List

Amir Goldstein <amir73il@gmail.com> writes:

> On Thu, Feb 18, 2021 at 5:16 PM Luis Henriques <lhenriques@suse.de> wrote:
>>
>> A regression has been reported by Nicolas Boichat, found while using the
>> copy_file_range syscall to copy a tracefs file.  Before commit
>> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
>> kernel would return -EXDEV to userspace when trying to copy a file across
>> different filesystems.  After this commit, the syscall doesn't fail anymore
>> and instead returns zero (zero bytes copied), as this file's content is
>> generated on-the-fly and thus reports a size of zero.
>>
>> This patch restores some cross-filesystem copy restrictions that existed
>> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>> devices").  Filesystems are still allowed to fall-back to the VFS
>> generic_copy_file_range() implementation, but that has now to be done
>> explicitly.
>>
>> nfsd is also modified to fall-back into generic_copy_file_range() in case
>> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>>
>> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
>> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
>> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
>> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
>> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
>> Signed-off-by: Luis Henriques <lhenriques@suse.de>
>> ---
>> And v5!  Sorry.  Sure, it makes sense to go through all the vfs_cfr()
>> checks first.
>
> You missed my other comment on v4...
>
> not checking NULL copy_file_range case.

Ah, yeah, I did miss it.  I'll follow up with yet another revision.

Cheers,
-- 
Luis

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [PATCH v6] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-18 16:35                     ` Luis Henriques
@ 2021-02-18 17:18                       ` Luis Henriques
  2021-02-19 21:18                         ` Olga Kornievskaia
  0 siblings, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-18 17:18 UTC (permalink / raw)
  To: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig
  Cc: ceph-devel, linux-kernel, linux-cifs, samba-technical,
	linux-fsdevel, linux-nfs, Luis Henriques

A regression has been reported by Nicolas Boichat, found while using the
copy_file_range syscall to copy a tracefs file.  Before commit
5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
kernel would return -EXDEV to userspace when trying to copy a file across
different filesystems.  After this commit, the syscall doesn't fail anymore
and instead returns zero (zero bytes copied), as this file's content is
generated on-the-fly and thus reports a size of zero.

This patch restores some cross-filesystem copy restrictions that existed
prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
devices").  Filesystems are still allowed to fall-back to the VFS
generic_copy_file_range() implementation, but that has now to be done
explicitly.

nfsd is also modified to fall-back into generic_copy_file_range() in case
vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.

Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
Reported-by: Nicolas Boichat <drinkcat@chromium.org>
Signed-off-by: Luis Henriques <lhenriques@suse.de>
---
And v6 is upon us.  Behold!

Changes since v5
- check if ->copy_file_range is NULL before calling it
Changes since v4
- nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
  or -EXDEV.
Changes since v3
- dropped the COPY_FILE_SPLICE flag
- kept the f_op's checks early in generic_copy_file_checks, implementing
  Amir's suggestions
- modified nfsd to use generic_copy_file_range()
Changes since v2
- do all the required checks earlier, in generic_copy_file_checks(),
  adding new checks for ->remap_file_range
- new COPY_FILE_SPLICE flag
- don't remove filesystem's fallback to generic_copy_file_range()
- updated commit changelog (and subject)
Changes since v1 (after Amir review)
- restored do_copy_file_range() helper
- return -EOPNOTSUPP if fs doesn't implement CFR
- updated commit description

 fs/nfsd/vfs.c   |  8 +++++++-
 fs/read_write.c | 53 ++++++++++++++++++++++++-------------------------
 2 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 04937e51de56..23dab0fa9087 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
 ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
 			     u64 dst_pos, u64 count)
 {
+	ssize_t ret;
 
 	/*
 	 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
@@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
 	 * limit like this and pipeline multiple COPY requests.
 	 */
 	count = min_t(u64, count, 1 << 22);
-	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+	ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+
+	if (ret == -EOPNOTSUPP || ret == -EXDEV)
+		ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
+					      count, 0);
+	return ret;
 }
 
 __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..0348aaa9e237 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 }
 EXPORT_SYMBOL(generic_copy_file_range);
 
-static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
-				  struct file *file_out, loff_t pos_out,
-				  size_t len, unsigned int flags)
-{
-	/*
-	 * Although we now allow filesystems to handle cross sb copy, passing
-	 * a file of the wrong filesystem type to filesystem driver can result
-	 * in an attempt to dereference the wrong type of ->private_data, so
-	 * avoid doing that until we really have a good reason.  NFS defines
-	 * several different file_system_type structures, but they all end up
-	 * using the same ->copy_file_range() function pointer.
-	 */
-	if (file_out->f_op->copy_file_range &&
-	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
-		return file_out->f_op->copy_file_range(file_in, pos_in,
-						       file_out, pos_out,
-						       len, flags);
-
-	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				       flags);
-}
-
 /*
  * Performs necessary checks before doing a file copy
  *
@@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
 	loff_t size_in;
 	int ret;
 
+	/*
+	 * Although we now allow filesystems to handle cross sb copy, passing
+	 * a file of the wrong filesystem type to filesystem driver can result
+	 * in an attempt to dereference the wrong type of ->private_data, so
+	 * avoid doing that until we really have a good reason.  NFS defines
+	 * several different file_system_type structures, but they all end up
+	 * using the same ->copy_file_range() function pointer.
+	 */
+	if (file_out->f_op->copy_file_range) {
+		if (file_in->f_op->copy_file_range !=
+		    file_out->f_op->copy_file_range)
+			return -EXDEV;
+	} else if (file_in->f_op->remap_file_range) {
+		if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+			return -EXDEV;
+	} else {
+                return -EOPNOTSUPP;
+	}
+
 	ret = generic_file_rw_checks(file_in, file_out);
 	if (ret)
 		return ret;
@@ -1499,8 +1496,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 	 * Try cloning first, this is supported by more file systems, and
 	 * more efficient if both clone and copy are supported (e.g. NFS).
 	 */
-	if (file_in->f_op->remap_file_range &&
-	    file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
+	if (file_in->f_op->remap_file_range) {
 		loff_t cloned;
 
 		cloned = file_in->f_op->remap_file_range(file_in, pos_in,
@@ -1511,11 +1507,14 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 			ret = cloned;
 			goto done;
 		}
+		/* Resort to copy_file_range if implemented. */
+		ret = -EOPNOTSUPP;
 	}
 
-	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				flags);
-	WARN_ON_ONCE(ret == -EOPNOTSUPP);
+	if (file_out->f_op->copy_file_range)
+		ret = file_out->f_op->copy_file_range(file_in, pos_in,
+						      file_out, pos_out,
+						      len, flags);
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices
  2021-02-18  9:10     ` Amir Goldstein
  2021-02-18 10:29       ` Luis Henriques
@ 2021-02-18 20:41       ` Steve French
  1 sibling, 0 replies; 93+ messages in thread
From: Steve French @ 2021-02-18 20:41 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Christoph Hellwig, Luis Henriques, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List

On Thu, Feb 18, 2021 at 4:03 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Thu, Feb 18, 2021 at 9:42 AM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > Looks good:
> >
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> >
> > This whole idea of cross-device copies has always been a horrible idea,
> > and I've been arguing against it since the patches were posted.
>
> Ok. I'm good with this v2 as well, but need to add the fallback to
> do_splice_direct()
> in nfsd_copy_file_range(), because this patch breaks it.

Interestingly, ksmbd (cifsd) looks like it already uses splice rather
than copy_file_range.


-- 
Thanks,

Steve

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v6] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-18 17:18                       ` [PATCH v6] " Luis Henriques
@ 2021-02-19 21:18                         ` Olga Kornievskaia
  2021-02-19 21:52                           ` Amir Goldstein
  2021-02-21 19:58                           ` [PATCH v7] " Luis Henriques
  0 siblings, 2 replies; 93+ messages in thread
From: Olga Kornievskaia @ 2021-02-19 21:18 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel, linux-nfs

On Thu, Feb 18, 2021 at 12:33 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystem copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").  Filesystems are still allowed to fall-back to the VFS
> generic_copy_file_range() implementation, but that has now to be done
> explicitly.
>
> nfsd is also modified to fall-back into generic_copy_file_range() in case
> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> And v6 is upon us.  Behold!


> Changes since v5
> - check if ->copy_file_range is NULL before calling it
> Changes since v4
> - nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
>   or -EXDEV.
> Changes since v3
> - dropped the COPY_FILE_SPLICE flag
> - kept the f_op's checks early in generic_copy_file_checks, implementing
>   Amir's suggestions
> - modified nfsd to use generic_copy_file_range()
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
>   adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
>  fs/nfsd/vfs.c   |  8 +++++++-
>  fs/read_write.c | 53 ++++++++++++++++++++++++-------------------------
>  2 files changed, 33 insertions(+), 28 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..23dab0fa9087 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
>  ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>                              u64 dst_pos, u64 count)
>  {
> +       ssize_t ret;
>
>         /*
>          * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>          * limit like this and pipeline multiple COPY requests.
>          */
>         count = min_t(u64, count, 1 << 22);
> -       return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +       ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +
> +       if (ret == -EOPNOTSUPP || ret == -EXDEV)
> +               ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
> +                                             count, 0);
> +       return ret;
>  }
>
>  __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..0348aaa9e237 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>  }
>  EXPORT_SYMBOL(generic_copy_file_range);
>
> -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> -                                 struct file *file_out, loff_t pos_out,
> -                                 size_t len, unsigned int flags)
> -{
> -       /*
> -        * Although we now allow filesystems to handle cross sb copy, passing
> -        * a file of the wrong filesystem type to filesystem driver can result
> -        * in an attempt to dereference the wrong type of ->private_data, so
> -        * avoid doing that until we really have a good reason.  NFS defines
> -        * several different file_system_type structures, but they all end up
> -        * using the same ->copy_file_range() function pointer.
> -        */
> -       if (file_out->f_op->copy_file_range &&
> -           file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> -               return file_out->f_op->copy_file_range(file_in, pos_in,
> -                                                      file_out, pos_out,
> -                                                      len, flags);
> -
> -       return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -                                      flags);
> -}
> -
>  /*
>   * Performs necessary checks before doing a file copy
>   *
> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>         loff_t size_in;
>         int ret;
>
> +       /*
> +        * Although we now allow filesystems to handle cross sb copy, passing
> +        * a file of the wrong filesystem type to filesystem driver can result
> +        * in an attempt to dereference the wrong type of ->private_data, so
> +        * avoid doing that until we really have a good reason.  NFS defines
> +        * several different file_system_type structures, but they all end up
> +        * using the same ->copy_file_range() function pointer.
> +        */
> +       if (file_out->f_op->copy_file_range) {
> +               if (file_in->f_op->copy_file_range !=
> +                   file_out->f_op->copy_file_range)
> +                       return -EXDEV;
> +       } else if (file_in->f_op->remap_file_range) {
> +               if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> +                       return -EXDEV;
> +       } else {
> +                return -EOPNOTSUPP;
> +       }
> +
>         ret = generic_file_rw_checks(file_in, file_out);
>         if (ret)
>                 return ret;
> @@ -1499,8 +1496,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>          * Try cloning first, this is supported by more file systems, and
>          * more efficient if both clone and copy are supported (e.g. NFS).
>          */
> -       if (file_in->f_op->remap_file_range &&
> -           file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
> +       if (file_in->f_op->remap_file_range) {
>                 loff_t cloned;

This chunk breaks NFS. You are removing the check that the source and
destination of the CLONE operation are on the same superblock. As a
result, when NFS does a copy between two different NFS servers, it will
try CLONE first, which is not allowed. NFS relied on the VFS layer to
do this check. Either don't remove it or, otherwise, fix the NFS clone
code so that it does not send the CLONE and returns an error
accordingly, so that the COPY is done as it should have been.

>                 cloned = file_in->f_op->remap_file_range(file_in, pos_in,
> @@ -1511,11 +1507,14 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>                         ret = cloned;
>                         goto done;
>                 }
> +               /* Resort to copy_file_range if implemented. */
> +               ret = -EOPNOTSUPP;
>         }
>
> -       ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -                               flags);
> -       WARN_ON_ONCE(ret == -EOPNOTSUPP);
> +       if (file_out->f_op->copy_file_range)
> +               ret = file_out->f_op->copy_file_range(file_in, pos_in,
> +                                                     file_out, pos_out,
> +                                                     len, flags);
>  done:
>         if (ret > 0) {
>                 fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v6] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-19 21:18                         ` Olga Kornievskaia
@ 2021-02-19 21:52                           ` Amir Goldstein
  2021-02-21 19:58                           ` [PATCH v7] " Luis Henriques
  1 sibling, 0 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-02-19 21:52 UTC (permalink / raw)
  To: Olga Kornievskaia
  Cc: Luis Henriques, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel, linux-nfs

On Fri, Feb 19, 2021 at 11:18 PM Olga Kornievskaia <aglo@umich.edu> wrote:
>
> On Thu, Feb 18, 2021 at 12:33 PM Luis Henriques <lhenriques@suse.de> wrote:
> >
> > A regression has been reported by Nicolas Boichat, found while using the
> > copy_file_range syscall to copy a tracefs file.  Before commit
> > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > kernel would return -EXDEV to userspace when trying to copy a file across
> > different filesystems.  After this commit, the syscall doesn't fail anymore
> > and instead returns zero (zero bytes copied), as this file's content is
> > generated on-the-fly and thus reports a size of zero.
> >
> > This patch restores some cross-filesystem copy restrictions that existed
> > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > devices").  Filesystems are still allowed to fall-back to the VFS
> > generic_copy_file_range() implementation, but that has now to be done
> > explicitly.
> >
> > nfsd is also modified to fall-back into generic_copy_file_range() in case
> > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> >
> > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > ---
> > And v6 is upon us.  Behold!
>
>
> > Changes since v5
> > - check if ->copy_file_range is NULL before calling it
> > Changes since v4
> > - nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
> >   or -EXDEV.
> > Changes since v3
> > - dropped the COPY_FILE_SPLICE flag
> > - kept the f_op's checks early in generic_copy_file_checks, implementing
> >   Amir's suggestions
> > - modified nfsd to use generic_copy_file_range()
> > Changes since v2
> > - do all the required checks earlier, in generic_copy_file_checks(),
> >   adding new checks for ->remap_file_range
> > - new COPY_FILE_SPLICE flag
> > - don't remove filesystem's fallback to generic_copy_file_range()
> > - updated commit changelog (and subject)
> > Changes since v1 (after Amir review)
> > - restored do_copy_file_range() helper
> > - return -EOPNOTSUPP if fs doesn't implement CFR
> > - updated commit description
> >
> >  fs/nfsd/vfs.c   |  8 +++++++-
> >  fs/read_write.c | 53 ++++++++++++++++++++++++-------------------------
> >  2 files changed, 33 insertions(+), 28 deletions(-)
> >
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index 04937e51de56..23dab0fa9087 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
> >  ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
> >                              u64 dst_pos, u64 count)
> >  {
> > +       ssize_t ret;
> >
> >         /*
> >          * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> > @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
> >          * limit like this and pipeline multiple COPY requests.
> >          */
> >         count = min_t(u64, count, 1 << 22);
> > -       return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> > +       ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> > +
> > +       if (ret == -EOPNOTSUPP || ret == -EXDEV)
> > +               ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
> > +                                             count, 0);
> > +       return ret;
> >  }
> >
> >  __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > diff --git a/fs/read_write.c b/fs/read_write.c
> > index 75f764b43418..0348aaa9e237 100644
> > --- a/fs/read_write.c
> > +++ b/fs/read_write.c
> > @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> >  }
> >  EXPORT_SYMBOL(generic_copy_file_range);
> >
> > -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> > -                                 struct file *file_out, loff_t pos_out,
> > -                                 size_t len, unsigned int flags)
> > -{
> > -       /*
> > -        * Although we now allow filesystems to handle cross sb copy, passing
> > -        * a file of the wrong filesystem type to filesystem driver can result
> > -        * in an attempt to dereference the wrong type of ->private_data, so
> > -        * avoid doing that until we really have a good reason.  NFS defines
> > -        * several different file_system_type structures, but they all end up
> > -        * using the same ->copy_file_range() function pointer.
> > -        */
> > -       if (file_out->f_op->copy_file_range &&
> > -           file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> > -               return file_out->f_op->copy_file_range(file_in, pos_in,
> > -                                                      file_out, pos_out,
> > -                                                      len, flags);
> > -
> > -       return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> > -                                      flags);
> > -}
> > -
> >  /*
> >   * Performs necessary checks before doing a file copy
> >   *
> > @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
> >         loff_t size_in;
> >         int ret;
> >
> > +       /*
> > +        * Although we now allow filesystems to handle cross sb copy, passing
> > +        * a file of the wrong filesystem type to filesystem driver can result
> > +        * in an attempt to dereference the wrong type of ->private_data, so
> > +        * avoid doing that until we really have a good reason.  NFS defines
> > +        * several different file_system_type structures, but they all end up
> > +        * using the same ->copy_file_range() function pointer.
> > +        */
> > +       if (file_out->f_op->copy_file_range) {
> > +               if (file_in->f_op->copy_file_range !=
> > +                   file_out->f_op->copy_file_range)
> > +                       return -EXDEV;
> > +       } else if (file_in->f_op->remap_file_range) {
> > +               if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> > +                       return -EXDEV;
> > +       } else {
> > +                return -EOPNOTSUPP;
> > +       }
> > +
> >         ret = generic_file_rw_checks(file_in, file_out);
> >         if (ret)
> >                 return ret;
> > @@ -1499,8 +1496,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> >          * Try cloning first, this is supported by more file systems, and
> >          * more efficient if both clone and copy are supported (e.g. NFS).
> >          */
> > -       if (file_in->f_op->remap_file_range &&
> > -           file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
> > +       if (file_in->f_op->remap_file_range) {
> >                 loff_t cloned;
>
> This chunk breaks NFS. You are removing the check that the source and
> destination for the CLONE operation are the same superblock and that
> leads to the fact that when NFS does a copy between 2 different NFS
> servers, it would try CLONE first which is not allowed. NFS relied on
> this check to be done by the VFS layer. Either don't remove it or,
> otherwise, fix the NFS clone's code to not send the CLONE and error
> accordingly so that the COPY is done as it should have been.
>

Right, we need to add this check back (not only for NFS).

However, I was looking at the change that introduced this opportunistic
call for clone_file_range() into copy_file_range():

commit a76b5b04375f974579c83433b06466758c0c552c
Author: Christoph Hellwig <hch@lst.de>
Date:   Fri Dec 9 16:17:19 2016 -0800

    fs: try to clone files first in vfs_copy_file_range

    A clone is a perfectly fine implementation of a file copy, so most
    file systems just implement the copy that way.  Instead of duplicating
    this logic move it to the VFS.  Currently btrfs and XFS implement copies
    the same way as clones and there is no behavior change for them, cifs
    only implements clones and grow support for copy_file_range with this
    patch.  NFS implements both, so this will allow copy_file_range to work
    on servers that only implement CLONE and be lot more efficient on servers
    that implements CLONE and COPY.

And I was thinking to myself that like the change that brought us here
("vfs: allow copy_file_range to copy across devices"), this change was done
for a certain purpose (serve copy_file_range() by fs that implement CLONE),
but that last part (prefer CLONE over COPY) also sounds like an optimization
that nobody asked for and could lead to unexpected behavior down the road.

I think that if a filesystem implements both methods (COPY and CLONE)
and the user called the COPY API, we need to call the more specialized
COPY method and not try the CLONE method, because the filesystem should
be very capable of making this optimization internally.

This could have been a hypothetical question, but there are actually
two filesystems that implement both COPY and CLONE, so let's ask the
developers what they think VFS should call.

Olga, Trond, Steve, which methods of your filesystem do you think that
vfs_copy_file_range() should call?
1. Only copy_file_range()?
2. Both copy_file_range() and remap_file_range()?
3. CLONE before COPY or the other way around?

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [PATCH v7] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-19 21:18                         ` Olga Kornievskaia
  2021-02-19 21:52                           ` Amir Goldstein
@ 2021-02-21 19:58                           ` Luis Henriques
  2021-02-22  3:00                             ` Nicolas Boichat
  2021-02-22 10:24                             ` [PATCH v8] " Luis Henriques
  1 sibling, 2 replies; 93+ messages in thread
From: Luis Henriques @ 2021-02-21 19:58 UTC (permalink / raw)
  To: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig
  Cc: ceph-devel, linux-kernel, linux-cifs, samba-technical,
	linux-fsdevel, linux-nfs, Luis Henriques

A regression has been reported by Nicolas Boichat, found while using the
copy_file_range syscall to copy a tracefs file.  Before commit
5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
kernel would return -EXDEV to userspace when trying to copy a file across
different filesystems.  After this commit, the syscall doesn't fail anymore
and instead returns zero (zero bytes copied), as this file's content is
generated on-the-fly and thus reports a size of zero.

This patch restores some cross-filesystem copy restrictions that existed
prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
devices").  Filesystems are still allowed to fall back to the VFS
generic_copy_file_range() implementation, but that now has to be done
explicitly.

nfsd is also modified to fall back to generic_copy_file_range() in case
vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.

Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
Reported-by: Nicolas Boichat <drinkcat@chromium.org>
Signed-off-by: Luis Henriques <lhenriques@suse.de>
---
Changes since v6
- restored i_sb checks for the clone operation
Changes since v5
- check if ->copy_file_range is NULL before calling it
Changes since v4
- nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
  or -EXDEV.
Changes since v3
- dropped the COPY_FILE_SPLICE flag
- kept the f_op's checks early in generic_copy_file_checks, implementing
  Amir's suggestions
- modified nfsd to use generic_copy_file_range()
Changes since v2
- do all the required checks earlier, in generic_copy_file_checks(),
  adding new checks for ->remap_file_range
- new COPY_FILE_SPLICE flag
- don't remove filesystem's fallback to generic_copy_file_range()
- updated commit changelog (and subject)
Changes since v1 (after Amir review)
- restored do_copy_file_range() helper
- return -EOPNOTSUPP if fs doesn't implement CFR
- updated commit description

 fs/nfsd/vfs.c   |  8 +++++++-
 fs/read_write.c | 50 ++++++++++++++++++++++++-------------------------
 2 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 04937e51de56..23dab0fa9087 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
 ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
 			     u64 dst_pos, u64 count)
 {
+	ssize_t ret;
 
 	/*
 	 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
@@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
 	 * limit like this and pipeline multiple COPY requests.
 	 */
 	count = min_t(u64, count, 1 << 22);
-	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+	ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+
+	if (ret == -EOPNOTSUPP || ret == -EXDEV)
+		ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
+					      count, 0);
+	return ret;
 }
 
 __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..463345c0ee30 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 }
 EXPORT_SYMBOL(generic_copy_file_range);
 
-static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
-				  struct file *file_out, loff_t pos_out,
-				  size_t len, unsigned int flags)
-{
-	/*
-	 * Although we now allow filesystems to handle cross sb copy, passing
-	 * a file of the wrong filesystem type to filesystem driver can result
-	 * in an attempt to dereference the wrong type of ->private_data, so
-	 * avoid doing that until we really have a good reason.  NFS defines
-	 * several different file_system_type structures, but they all end up
-	 * using the same ->copy_file_range() function pointer.
-	 */
-	if (file_out->f_op->copy_file_range &&
-	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
-		return file_out->f_op->copy_file_range(file_in, pos_in,
-						       file_out, pos_out,
-						       len, flags);
-
-	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				       flags);
-}
-
 /*
  * Performs necessary checks before doing a file copy
  *
@@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
 	loff_t size_in;
 	int ret;
 
+	/*
+	 * Although we now allow filesystems to handle cross sb copy, passing
+	 * a file of the wrong filesystem type to filesystem driver can result
+	 * in an attempt to dereference the wrong type of ->private_data, so
+	 * avoid doing that until we really have a good reason.  NFS defines
+	 * several different file_system_type structures, but they all end up
+	 * using the same ->copy_file_range() function pointer.
+	 */
+	if (file_out->f_op->copy_file_range) {
+		if (file_in->f_op->copy_file_range !=
+		    file_out->f_op->copy_file_range)
+			return -EXDEV;
+	} else if (file_in->f_op->remap_file_range) {
+		if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+			return -EXDEV;
+	} else {
+                return -EOPNOTSUPP;
+	}
+
 	ret = generic_file_rw_checks(file_in, file_out);
 	if (ret)
 		return ret;
@@ -1511,11 +1508,14 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 			ret = cloned;
 			goto done;
 		}
+		/* Resort to copy_file_range if implemented. */
+		ret = -EOPNOTSUPP;
 	}
 
-	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				flags);
-	WARN_ON_ONCE(ret == -EOPNOTSUPP);
+	if (file_out->f_op->copy_file_range)
+		ret = file_out->f_op->copy_file_range(file_in, pos_in,
+						      file_out, pos_out,
+						      len, flags);
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v7] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-21 19:58                           ` [PATCH v7] " Luis Henriques
@ 2021-02-22  3:00                             ` Nicolas Boichat
  2021-02-22 10:24                             ` [PATCH v8] " Luis Henriques
  1 sibling, 0 replies; 93+ messages in thread
From: Nicolas Boichat @ 2021-02-22  3:00 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, ceph-devel,
	lkml, linux-cifs, samba-technical, linux-fsdevel, linux-nfs

On Mon, Feb 22, 2021 at 3:57 AM Luis Henriques <lhenriques@suse.de> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystem copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").  Filesystems are still allowed to fall-back to the VFS
> generic_copy_file_range() implementation, but that has now to be done
> explicitly.
>
> nfsd is also modified to fall-back into generic_copy_file_range() in case
> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>

Tested-by: Nicolas Boichat <drinkcat@chromium.org>

> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> Changes since v6
> - restored i_sb checks for the clone operation
> Changes since v5
> - check if ->copy_file_range is NULL before calling it
> Changes since v4
> - nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
>   or -EXDEV.
> Changes since v3
> - dropped the COPY_FILE_SPLICE flag
> - kept the f_op's checks early in generic_copy_file_checks, implementing
>   Amir's suggestions
> - modified nfsd to use generic_copy_file_range()
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
>   adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
>  fs/nfsd/vfs.c   |  8 +++++++-
>  fs/read_write.c | 50 ++++++++++++++++++++++++-------------------------
>  2 files changed, 32 insertions(+), 26 deletions(-)
> [snip]

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-21 19:58                           ` [PATCH v7] " Luis Henriques
  2021-02-22  3:00                             ` Nicolas Boichat
@ 2021-02-22 10:24                             ` Luis Henriques
  2021-02-22 10:46                               ` Amir Goldstein
                                                 ` (3 more replies)
  1 sibling, 4 replies; 93+ messages in thread
From: Luis Henriques @ 2021-02-22 10:24 UTC (permalink / raw)
  To: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig
  Cc: ceph-devel, linux-kernel, linux-cifs, samba-technical,
	linux-fsdevel, linux-nfs, Luis Henriques

A regression has been reported by Nicolas Boichat, found while using the
copy_file_range syscall to copy a tracefs file.  Before commit
5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
kernel would return -EXDEV to userspace when trying to copy a file across
different filesystems.  After this commit, the syscall doesn't fail anymore
and instead returns zero (zero bytes copied), as this file's content is
generated on-the-fly and thus reports a size of zero.

This patch restores some cross-filesystem copy restrictions that existed
prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
devices").  Filesystems are still allowed to fall-back to the VFS
generic_copy_file_range() implementation, but that now has to be done
explicitly.

nfsd is also modified to fall-back into generic_copy_file_range() in case
vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.

Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
Reported-by: Nicolas Boichat <drinkcat@chromium.org>
Signed-off-by: Luis Henriques <lhenriques@suse.de>
---
Changes since v7
- set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so that the
  error returned is always related to the 'copy' operation
Changes since v6
- restored i_sb checks for the clone operation
Changes since v5
- check if ->copy_file_range is NULL before calling it
Changes since v4
- nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
  or -EXDEV.
Changes since v3
- dropped the COPY_FILE_SPLICE flag
- kept the f_op's checks early in generic_copy_file_checks, implementing
  Amir's suggestions
- modified nfsd to use generic_copy_file_range()
Changes since v2
- do all the required checks earlier, in generic_copy_file_checks(),
  adding new checks for ->remap_file_range
- new COPY_FILE_SPLICE flag
- don't remove filesystem's fallback to generic_copy_file_range()
- updated commit changelog (and subject)
Changes since v1 (after Amir review)
- restored do_copy_file_range() helper
- return -EOPNOTSUPP if fs doesn't implement CFR
- updated commit description

 fs/nfsd/vfs.c   |  8 +++++++-
 fs/read_write.c | 49 ++++++++++++++++++++++++-------------------------
 2 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 04937e51de56..23dab0fa9087 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
 ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
 			     u64 dst_pos, u64 count)
 {
+	ssize_t ret;
 
 	/*
 	 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
@@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
 	 * limit like this and pipeline multiple COPY requests.
 	 */
 	count = min_t(u64, count, 1 << 22);
-	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+	ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+
+	if (ret == -EOPNOTSUPP || ret == -EXDEV)
+		ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
+					      count, 0);
+	return ret;
 }
 
 __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..5a26297fd410 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 }
 EXPORT_SYMBOL(generic_copy_file_range);
 
-static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
-				  struct file *file_out, loff_t pos_out,
-				  size_t len, unsigned int flags)
-{
-	/*
-	 * Although we now allow filesystems to handle cross sb copy, passing
-	 * a file of the wrong filesystem type to filesystem driver can result
-	 * in an attempt to dereference the wrong type of ->private_data, so
-	 * avoid doing that until we really have a good reason.  NFS defines
-	 * several different file_system_type structures, but they all end up
-	 * using the same ->copy_file_range() function pointer.
-	 */
-	if (file_out->f_op->copy_file_range &&
-	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
-		return file_out->f_op->copy_file_range(file_in, pos_in,
-						       file_out, pos_out,
-						       len, flags);
-
-	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				       flags);
-}
-
 /*
  * Performs necessary checks before doing a file copy
  *
@@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
 	loff_t size_in;
 	int ret;
 
+	/*
+	 * Although we now allow filesystems to handle cross sb copy, passing
+	 * a file of the wrong filesystem type to filesystem driver can result
+	 * in an attempt to dereference the wrong type of ->private_data, so
+	 * avoid doing that until we really have a good reason.  NFS defines
+	 * several different file_system_type structures, but they all end up
+	 * using the same ->copy_file_range() function pointer.
+	 */
+	if (file_out->f_op->copy_file_range) {
+		if (file_in->f_op->copy_file_range !=
+		    file_out->f_op->copy_file_range)
+			return -EXDEV;
+	} else if (file_in->f_op->remap_file_range) {
+		if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+			return -EXDEV;
+	} else {
+                return -EOPNOTSUPP;
+	}
+
 	ret = generic_file_rw_checks(file_in, file_out);
 	if (ret)
 		return ret;
@@ -1495,6 +1492,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 
 	file_start_write(file_out);
 
+	ret = -EOPNOTSUPP;
 	/*
 	 * Try cloning first, this is supported by more file systems, and
 	 * more efficient if both clone and copy are supported (e.g. NFS).
@@ -1513,9 +1511,10 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 		}
 	}
 
-	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-				flags);
-	WARN_ON_ONCE(ret == -EOPNOTSUPP);
+	if (file_out->f_op->copy_file_range)
+		ret = file_out->f_op->copy_file_range(file_in, pos_in,
+						      file_out, pos_out,
+						      len, flags);
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-22 10:24                             ` [PATCH v8] " Luis Henriques
@ 2021-02-22 10:46                               ` Amir Goldstein
  2021-02-22 16:25                               ` dai.ngo
                                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-02-22 10:46 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Jeff Layton, Steve French, Miklos Szeredi, Trond Myklebust,
	Anna Schumaker, Alexander Viro, Darrick J. Wong, Dave Chinner,
	Greg KH, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List

On Mon, Feb 22, 2021 at 12:23 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystem copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").  Filesystems are still allowed to fall-back to the VFS
> generic_copy_file_range() implementation, but that has now to be done
> explicitly.
>
> nfsd is also modified to fall-back into generic_copy_file_range() in case
> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---

Reviewed-by: Amir Goldstein <amir73il@gmail.com>

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-22 10:24                             ` [PATCH v8] " Luis Henriques
  2021-02-22 10:46                               ` Amir Goldstein
@ 2021-02-22 16:25                               ` dai.ngo
  2021-02-23 10:32                                 ` Luis Henriques
  2021-02-24  1:00                               ` Olga Kornievskaia
  2021-02-24 14:23                               ` [PATCH] copy_file_range.2: Kernel v5.12 updates Luis Henriques
  3 siblings, 1 reply; 93+ messages in thread
From: dai.ngo @ 2021-02-22 16:25 UTC (permalink / raw)
  To: Luis Henriques, Amir Goldstein, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig
  Cc: ceph-devel, linux-kernel, linux-cifs, samba-technical,
	linux-fsdevel, linux-nfs


On 2/22/21 2:24 AM, Luis Henriques wrote:
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystem copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").  Filesystems are still allowed to fall-back to the VFS
> generic_copy_file_range() implementation, but that has now to be done
> explicitly.
>
> nfsd is also modified to fall-back into generic_copy_file_range() in case
> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
> Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
> Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> Changes since v7
> - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so that the
>    error returned is always related to the 'copy' operation
> Changes since v6
> - restored i_sb checks for the clone operation
> Changes since v5
> - check if ->copy_file_range is NULL before calling it
> Changes since v4
> - nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
>    or -EXDEV.
> Changes since v3
> - dropped the COPY_FILE_SPLICE flag
> - kept the f_op's checks early in generic_copy_file_checks, implementing
>    Amir's suggestions
> - modified nfsd to use generic_copy_file_range()
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
>    adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
>   fs/nfsd/vfs.c   |  8 +++++++-
>   fs/read_write.c | 49 ++++++++++++++++++++++++-------------------------
>   2 files changed, 31 insertions(+), 26 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..23dab0fa9087 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
>   ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>   			     u64 dst_pos, u64 count)
>   {
> +	ssize_t ret;
>   
>   	/*
>   	 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>   	 * limit like this and pipeline multiple COPY requests.
>   	 */
>   	count = min_t(u64, count, 1 << 22);
> -	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +	ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +
> +	if (ret == -EOPNOTSUPP || ret == -EXDEV)
> +		ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
> +					      count, 0);
> +	return ret;
>   }
>   
>   __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..5a26297fd410 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>   }
>   EXPORT_SYMBOL(generic_copy_file_range);
>   
> -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> -				  struct file *file_out, loff_t pos_out,
> -				  size_t len, unsigned int flags)
> -{
> -	/*
> -	 * Although we now allow filesystems to handle cross sb copy, passing
> -	 * a file of the wrong filesystem type to filesystem driver can result
> -	 * in an attempt to dereference the wrong type of ->private_data, so
> -	 * avoid doing that until we really have a good reason.  NFS defines
> -	 * several different file_system_type structures, but they all end up
> -	 * using the same ->copy_file_range() function pointer.
> -	 */
> -	if (file_out->f_op->copy_file_range &&
> -	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> -		return file_out->f_op->copy_file_range(file_in, pos_in,
> -						       file_out, pos_out,
> -						       len, flags);
> -
> -	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -				       flags);
> -}
> -
>   /*
>    * Performs necessary checks before doing a file copy
>    *
> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>   	loff_t size_in;
>   	int ret;
>   
> +	/*
> +	 * Although we now allow filesystems to handle cross sb copy, passing
> +	 * a file of the wrong filesystem type to filesystem driver can result
> +	 * in an attempt to dereference the wrong type of ->private_data, so
> +	 * avoid doing that until we really have a good reason.  NFS defines
> +	 * several different file_system_type structures, but they all end up
> +	 * using the same ->copy_file_range() function pointer.
> +	 */
> +	if (file_out->f_op->copy_file_range) {
> +		if (file_in->f_op->copy_file_range !=
> +		    file_out->f_op->copy_file_range)
> +			return -EXDEV;
> +	} else if (file_in->f_op->remap_file_range) {
> +		if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> +			return -EXDEV;

I think this check is redundant, it's done in vfs_copy_file_range.
If this check is removed then the else clause below should be removed
also. Once this check and the else clause are removed then might as
well move the check of copy_file_range from here to vfs_copy_file_range.

-Dai

> +	} else {
> +                return -EOPNOTSUPP;
> +	}
> +
>   	ret = generic_file_rw_checks(file_in, file_out);
>   	if (ret)
>   		return ret;
> @@ -1495,6 +1492,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>   
>   	file_start_write(file_out);
>   
> +	ret = -EOPNOTSUPP;
>   	/*
>   	 * Try cloning first, this is supported by more file systems, and
>   	 * more efficient if both clone and copy are supported (e.g. NFS).
> @@ -1513,9 +1511,10 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>   		}
>   	}
>   
> -	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -				flags);
> -	WARN_ON_ONCE(ret == -EOPNOTSUPP);
> +	if (file_out->f_op->copy_file_range)
> +		ret = file_out->f_op->copy_file_range(file_in, pos_in,
> +						      file_out, pos_out,
> +						      len, flags);
>   done:
>   	if (ret > 0) {
>   		fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-22 16:25                               ` dai.ngo
@ 2021-02-23 10:32                                 ` Luis Henriques
  2021-02-23 15:28                                   ` Amir Goldstein
  2021-02-23 15:29                                   ` dai.ngo
  0 siblings, 2 replies; 93+ messages in thread
From: Luis Henriques @ 2021-02-23 10:32 UTC (permalink / raw)
  To: dai.ngo
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, linux-cifs,
	samba-technical, linux-fsdevel, linux-nfs

On Mon, Feb 22, 2021 at 08:25:27AM -0800, dai.ngo@oracle.com wrote:
> 
> On 2/22/21 2:24 AM, Luis Henriques wrote:
> > A regression has been reported by Nicolas Boichat, found while using the
> > copy_file_range syscall to copy a tracefs file.  Before commit
> > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > kernel would return -EXDEV to userspace when trying to copy a file across
> > different filesystems.  After this commit, the syscall doesn't fail anymore
> > and instead returns zero (zero bytes copied), as this file's content is
> > generated on-the-fly and thus reports a size of zero.
> > 
> > This patch restores some cross-filesystem copy restrictions that existed
> > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > devices").  Filesystems are still allowed to fall-back to the VFS
> > generic_copy_file_range() implementation, but that has now to be done
> > explicitly.
> > 
> > nfsd is also modified to fall-back into generic_copy_file_range() in case
> > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> > 
> > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
> > Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
> > Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
> > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > ---
> > Changes since v7
> > - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so that the
> >    error returned is always related to the 'copy' operation
> > Changes since v6
> > - restored i_sb checks for the clone operation
> > Changes since v5
> > - check if ->copy_file_range is NULL before calling it
> > Changes since v4
> > - nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
> >    or -EXDEV.
> > Changes since v3
> > - dropped the COPY_FILE_SPLICE flag
> > - kept the f_op's checks early in generic_copy_file_checks, implementing
> >    Amir's suggestions
> > - modified nfsd to use generic_copy_file_range()
> > Changes since v2
> > - do all the required checks earlier, in generic_copy_file_checks(),
> >    adding new checks for ->remap_file_range
> > - new COPY_FILE_SPLICE flag
> > - don't remove filesystem's fallback to generic_copy_file_range()
> > - updated commit changelog (and subject)
> > Changes since v1 (after Amir review)
> > - restored do_copy_file_range() helper
> > - return -EOPNOTSUPP if fs doesn't implement CFR
> > - updated commit description
> > 
> >   fs/nfsd/vfs.c   |  8 +++++++-
> >   fs/read_write.c | 49 ++++++++++++++++++++++++-------------------------
> >   2 files changed, 31 insertions(+), 26 deletions(-)
> > 
> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index 04937e51de56..23dab0fa9087 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
> >   ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
> >   			     u64 dst_pos, u64 count)
> >   {
> > +	ssize_t ret;
> >   	/*
> >   	 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> > @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
> >   	 * limit like this and pipeline multiple COPY requests.
> >   	 */
> >   	count = min_t(u64, count, 1 << 22);
> > -	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> > +	ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> > +
> > +	if (ret == -EOPNOTSUPP || ret == -EXDEV)
> > +		ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
> > +					      count, 0);
> > +	return ret;
> >   }
> >   __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > diff --git a/fs/read_write.c b/fs/read_write.c
> > index 75f764b43418..5a26297fd410 100644
> > --- a/fs/read_write.c
> > +++ b/fs/read_write.c
> > @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> >   }
> >   EXPORT_SYMBOL(generic_copy_file_range);
> > -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> > -				  struct file *file_out, loff_t pos_out,
> > -				  size_t len, unsigned int flags)
> > -{
> > -	/*
> > -	 * Although we now allow filesystems to handle cross sb copy, passing
> > -	 * a file of the wrong filesystem type to filesystem driver can result
> > -	 * in an attempt to dereference the wrong type of ->private_data, so
> > -	 * avoid doing that until we really have a good reason.  NFS defines
> > -	 * several different file_system_type structures, but they all end up
> > -	 * using the same ->copy_file_range() function pointer.
> > -	 */
> > -	if (file_out->f_op->copy_file_range &&
> > -	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> > -		return file_out->f_op->copy_file_range(file_in, pos_in,
> > -						       file_out, pos_out,
> > -						       len, flags);
> > -
> > -	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> > -				       flags);
> > -}
> > -
> >   /*
> >    * Performs necessary checks before doing a file copy
> >    *
> > @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
> >   	loff_t size_in;
> >   	int ret;
> > +	/*
> > +	 * Although we now allow filesystems to handle cross sb copy, passing
> > +	 * a file of the wrong filesystem type to filesystem driver can result
> > +	 * in an attempt to dereference the wrong type of ->private_data, so
> > +	 * avoid doing that until we really have a good reason.  NFS defines
> > +	 * several different file_system_type structures, but they all end up
> > +	 * using the same ->copy_file_range() function pointer.
> > +	 */
> > +	if (file_out->f_op->copy_file_range) {
> > +		if (file_in->f_op->copy_file_range !=
> > +		    file_out->f_op->copy_file_range)
> > +			return -EXDEV;
> > +	} else if (file_in->f_op->remap_file_range) {
> > +		if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> > +			return -EXDEV;
> 
> I think this check is redundant, it's done in vfs_copy_file_range.
> If this check is removed then the else clause below should be removed
> also. Once this check and the else clause are removed then might as
> > well move the check of copy_file_range from here to vfs_copy_file_range.
> 

I don't think it's really redundant, although I agree it is messy due to the
fact that we try to clone first instead of copying.

So, in the clone path, this is the only place where we return -EXDEV if:

1) we don't have ->copy_file_range *and*
2) we have ->remap_file_range but the i_sb are different.

The check in vfs_copy_file_range() is only executed if:

1) we have *valid* ->copy_file_range ops and/or
2) we have *valid* ->remap_file_range

So... if we remove the check in generic_copy_file_checks() as you suggest
and:
- we don't have ->copy_file_range,
- we have ->remap_file_range but
- the i_sb are different

we'll return -EOPNOTSUPP (the one set by the line "ret = -EOPNOTSUPP;" in
function vfs_copy_file_range()) instead of -EXDEV.

But I may have got it all wrong.  I've looked so many times at this code
that I'm probably useless at finding problems in it :-)

Cheers,
--
Luís

> -Dai
> 
> > +	} else {
> > +                return -EOPNOTSUPP;
> > +	}
> > +
> >   	ret = generic_file_rw_checks(file_in, file_out);
> >   	if (ret)
> >   		return ret;
> > @@ -1495,6 +1492,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> >   	file_start_write(file_out);
> > +	ret = -EOPNOTSUPP;
> >   	/*
> >   	 * Try cloning first, this is supported by more file systems, and
> >   	 * more efficient if both clone and copy are supported (e.g. NFS).
> > @@ -1513,9 +1511,10 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> >   		}
> >   	}
> > -	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> > -				flags);
> > -	WARN_ON_ONCE(ret == -EOPNOTSUPP);
> > +	if (file_out->f_op->copy_file_range)
> > +		ret = file_out->f_op->copy_file_range(file_in, pos_in,
> > +						      file_out, pos_out,
> > +						      len, flags);
> >   done:
> >   	if (ret > 0) {
> >   		fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-23 10:32                                 ` Luis Henriques
@ 2021-02-23 15:28                                   ` Amir Goldstein
  2021-02-23 15:29                                   ` dai.ngo
  1 sibling, 0 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-02-23 15:28 UTC (permalink / raw)
  To: Luis Henriques
  Cc: dai.ngo, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List

On Tue, Feb 23, 2021 at 12:31 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> On Mon, Feb 22, 2021 at 08:25:27AM -0800, dai.ngo@oracle.com wrote:
> >
> > On 2/22/21 2:24 AM, Luis Henriques wrote:
> > > A regression has been reported by Nicolas Boichat, found while using the
> > > copy_file_range syscall to copy a tracefs file.  Before commit
> > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > > kernel would return -EXDEV to userspace when trying to copy a file across
> > > different filesystems.  After this commit, the syscall doesn't fail anymore
> > > and instead returns zero (zero bytes copied), as this file's content is
> > > generated on-the-fly and thus reports a size of zero.
> > >
> > > This patch restores some cross-filesystem copy restrictions that existed
> > > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > devices").  Filesystems are still allowed to fall-back to the VFS
> > > generic_copy_file_range() implementation, but that has now to be done
> > > explicitly.
> > >
> > > nfsd is also modified to fall-back into generic_copy_file_range() in case
> > > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> > >
> > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > > Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
> > > Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
> > > Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
> > > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > > ---
> > > Changes since v7
> > > - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so that the
> > >    error returned is always related to the 'copy' operation
> > > Changes since v6
> > > - restored i_sb checks for the clone operation
> > > Changes since v5
> > > - check if ->copy_file_range is NULL before calling it
> > > Changes since v4
> > > - nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
> > >    or -EXDEV.
> > > Changes since v3
> > > - dropped the COPY_FILE_SPLICE flag
> > > - kept the f_op's checks early in generic_copy_file_checks, implementing
> > >    Amir's suggestions
> > > - modified nfsd to use generic_copy_file_range()
> > > Changes since v2
> > > - do all the required checks earlier, in generic_copy_file_checks(),
> > >    adding new checks for ->remap_file_range
> > > - new COPY_FILE_SPLICE flag
> > > - don't remove filesystem's fallback to generic_copy_file_range()
> > > - updated commit changelog (and subject)
> > > Changes since v1 (after Amir review)
> > > - restored do_copy_file_range() helper
> > > - return -EOPNOTSUPP if fs doesn't implement CFR
> > > - updated commit description
> > >
> > >   fs/nfsd/vfs.c   |  8 +++++++-
> > >   fs/read_write.c | 49 ++++++++++++++++++++++++-------------------------
> > >   2 files changed, 31 insertions(+), 26 deletions(-)
> > >
> > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > index 04937e51de56..23dab0fa9087 100644
> > > --- a/fs/nfsd/vfs.c
> > > +++ b/fs/nfsd/vfs.c
> > > @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
> > >   ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
> > >                          u64 dst_pos, u64 count)
> > >   {
> > > +   ssize_t ret;
> > >     /*
> > >      * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> > > @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
> > >      * limit like this and pipeline multiple COPY requests.
> > >      */
> > >     count = min_t(u64, count, 1 << 22);
> > > -   return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> > > +   ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> > > +
> > > +   if (ret == -EOPNOTSUPP || ret == -EXDEV)
> > > +           ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
> > > +                                         count, 0);
> > > +   return ret;
> > >   }
> > >   __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > diff --git a/fs/read_write.c b/fs/read_write.c
> > > index 75f764b43418..5a26297fd410 100644
> > > --- a/fs/read_write.c
> > > +++ b/fs/read_write.c
> > > @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > >   }
> > >   EXPORT_SYMBOL(generic_copy_file_range);
> > > -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> > > -                             struct file *file_out, loff_t pos_out,
> > > -                             size_t len, unsigned int flags)
> > > -{
> > > -   /*
> > > -    * Although we now allow filesystems to handle cross sb copy, passing
> > > -    * a file of the wrong filesystem type to filesystem driver can result
> > > -    * in an attempt to dereference the wrong type of ->private_data, so
> > > -    * avoid doing that until we really have a good reason.  NFS defines
> > > -    * several different file_system_type structures, but they all end up
> > > -    * using the same ->copy_file_range() function pointer.
> > > -    */
> > > -   if (file_out->f_op->copy_file_range &&
> > > -       file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> > > -           return file_out->f_op->copy_file_range(file_in, pos_in,
> > > -                                                  file_out, pos_out,
> > > -                                                  len, flags);
> > > -
> > > -   return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> > > -                                  flags);
> > > -}
> > > -
> > >   /*
> > >    * Performs necessary checks before doing a file copy
> > >    *
> > > @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
> > >     loff_t size_in;
> > >     int ret;
> > > +   /*
> > > +    * Although we now allow filesystems to handle cross sb copy, passing
> > > +    * a file of the wrong filesystem type to filesystem driver can result
> > > +    * in an attempt to dereference the wrong type of ->private_data, so
> > > +    * avoid doing that until we really have a good reason.  NFS defines
> > > +    * several different file_system_type structures, but they all end up
> > > +    * using the same ->copy_file_range() function pointer.
> > > +    */
> > > +   if (file_out->f_op->copy_file_range) {
> > > +           if (file_in->f_op->copy_file_range !=
> > > +               file_out->f_op->copy_file_range)
> > > +                   return -EXDEV;
> > > +   } else if (file_in->f_op->remap_file_range) {
> > > +           if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> > > +                   return -EXDEV;
> >
> > I think this check is redundant, it's done in vfs_copy_file_range.
> > If this check is removed then the else clause below should be removed
> > also. Once this check and the else clause are removed then might as
> > well move the the check of copy_file_range from here to vfs_copy_file_range.
> >
>
> I don't think it's really redundant, although I agree is messy due to the
> fact we try to clone first instead of copying them.
>

It was put here in early checks on purpose before the check for
zero size file.
I'm pretty sure this wasn't the case in earlier versions of the patch
and then it did not solve the reported problem.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-23 10:32                                 ` Luis Henriques
  2021-02-23 15:28                                   ` Amir Goldstein
@ 2021-02-23 15:29                                   ` dai.ngo
  2021-02-23 16:02                                     ` dai.ngo
  1 sibling, 1 reply; 93+ messages in thread
From: dai.ngo @ 2021-02-23 15:29 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, linux-cifs,
	samba-technical, linux-fsdevel, linux-nfs


On 2/23/21 2:32 AM, Luis Henriques wrote:
> On Mon, Feb 22, 2021 at 08:25:27AM -0800, dai.ngo@oracle.com wrote:
>> On 2/22/21 2:24 AM, Luis Henriques wrote:
>>> A regression has been reported by Nicolas Boichat, found while using the
>>> copy_file_range syscall to copy a tracefs file.  Before commit
>>> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
>>> kernel would return -EXDEV to userspace when trying to copy a file across
>>> different filesystems.  After this commit, the syscall doesn't fail anymore
>>> and instead returns zero (zero bytes copied), as this file's content is
>>> generated on-the-fly and thus reports a size of zero.
>>>
>>> This patch restores some cross-filesystem copy restrictions that existed
>>> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>>> devices").  Filesystems are still allowed to fall-back to the VFS
>>> generic_copy_file_range() implementation, but that has now to be done
>>> explicitly.
>>>
>>> nfsd is also modified to fall-back into generic_copy_file_range() in case
>>> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>>>
>>> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
>>> Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
>>> Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
>>> Link: https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
>>> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
>>> Signed-off-by: Luis Henriques <lhenriques@suse.de>
>>> ---
>>> Changes since v7
>>> - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so that the
>>>     error returned is always related to the 'copy' operation
>>> Changes since v6
>>> - restored i_sb checks for the clone operation
>>> Changes since v5
>>> - check if ->copy_file_range is NULL before calling it
>>> Changes since v4
>>> - nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
>>>     or -EXDEV.
>>> Changes since v3
>>> - dropped the COPY_FILE_SPLICE flag
>>> - kept the f_op's checks early in generic_copy_file_checks, implementing
>>>     Amir's suggestions
>>> - modified nfsd to use generic_copy_file_range()
>>> Changes since v2
>>> - do all the required checks earlier, in generic_copy_file_checks(),
>>>     adding new checks for ->remap_file_range
>>> - new COPY_FILE_SPLICE flag
>>> - don't remove filesystem's fallback to generic_copy_file_range()
>>> - updated commit changelog (and subject)
>>> Changes since v1 (after Amir review)
>>> - restored do_copy_file_range() helper
>>> - return -EOPNOTSUPP if fs doesn't implement CFR
>>> - updated commit description
>>>
>>>    fs/nfsd/vfs.c   |  8 +++++++-
>>>    fs/read_write.c | 49 ++++++++++++++++++++++++-------------------------
>>>    2 files changed, 31 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>>> index 04937e51de56..23dab0fa9087 100644
>>> --- a/fs/nfsd/vfs.c
>>> +++ b/fs/nfsd/vfs.c
>>> @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
>>>    ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>>>    			     u64 dst_pos, u64 count)
>>>    {
>>> +	ssize_t ret;
>>>    	/*
>>>    	 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
>>> @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>>>    	 * limit like this and pipeline multiple COPY requests.
>>>    	 */
>>>    	count = min_t(u64, count, 1 << 22);
>>> -	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>>> +	ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>>> +
>>> +	if (ret == -EOPNOTSUPP || ret == -EXDEV)
>>> +		ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
>>> +					      count, 0);
>>> +	return ret;
>>>    }
>>>    __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>> diff --git a/fs/read_write.c b/fs/read_write.c
>>> index 75f764b43418..5a26297fd410 100644
>>> --- a/fs/read_write.c
>>> +++ b/fs/read_write.c
>>> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>>>    }
>>>    EXPORT_SYMBOL(generic_copy_file_range);
>>> -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
>>> -				  struct file *file_out, loff_t pos_out,
>>> -				  size_t len, unsigned int flags)
>>> -{
>>> -	/*
>>> -	 * Although we now allow filesystems to handle cross sb copy, passing
>>> -	 * a file of the wrong filesystem type to filesystem driver can result
>>> -	 * in an attempt to dereference the wrong type of ->private_data, so
>>> -	 * avoid doing that until we really have a good reason.  NFS defines
>>> -	 * several different file_system_type structures, but they all end up
>>> -	 * using the same ->copy_file_range() function pointer.
>>> -	 */
>>> -	if (file_out->f_op->copy_file_range &&
>>> -	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
>>> -		return file_out->f_op->copy_file_range(file_in, pos_in,
>>> -						       file_out, pos_out,
>>> -						       len, flags);
>>> -
>>> -	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
>>> -				       flags);
>>> -}
>>> -
>>>    /*
>>>     * Performs necessary checks before doing a file copy
>>>     *
>>> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>>>    	loff_t size_in;
>>>    	int ret;
>>> +	/*
>>> +	 * Although we now allow filesystems to handle cross sb copy, passing
>>> +	 * a file of the wrong filesystem type to filesystem driver can result
>>> +	 * in an attempt to dereference the wrong type of ->private_data, so
>>> +	 * avoid doing that until we really have a good reason.  NFS defines
>>> +	 * several different file_system_type structures, but they all end up
>>> +	 * using the same ->copy_file_range() function pointer.
>>> +	 */
>>> +	if (file_out->f_op->copy_file_range) {
>>> +		if (file_in->f_op->copy_file_range !=
>>> +		    file_out->f_op->copy_file_range)
>>> +			return -EXDEV;
>>> +	} else if (file_in->f_op->remap_file_range) {
>>> +		if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
>>> +			return -EXDEV;
>> I think this check is redundant, it's done in vfs_copy_file_range.
>> If this check is removed then the else clause below should be removed
>> also. Once this check and the else clause are removed then might as
>> well move the the check of copy_file_range from here to vfs_copy_file_range.
>>
> I don't think it's really redundant, although I agree is messy due to the
> fact we try to clone first instead of copying them.
>
> So, in the clone path, this is the only place where we return -EXDEV if:
>
> 1) we don't have ->copy_file_range *and*
> 2) we have ->remap_file_range but the i_sb are different.
>
> The check in vfs_copy_file_range() is only executed if:
>
> 1) we have *valid* ->copy_file_range ops and/or
> 2) we have *valid* ->remap_file_range
>
> So... if we remove the check in generic_copy_file_checks() as you suggest
> and:
> - we don't have ->copy_file_range,
> - we have ->remap_file_range but
> - the i_sb are different
>
> we'll return the -EOPNOTSUPP (the one set in line "ret = -EOPNOTSUPP;" in
> function vfs_copy_file_range() ) instead of -EXDEV.

Yes, this is the difference. The NFS code handles both -EOPNOTSUPP and
-EXDEV by doing generic_copy_file_range.  Do any other consumers of
vfs_copy_file_range rely on -EXDEV and not -EOPNOTSUPP and which is
the correct error code for this case? It seems to me that -EOPNOTSUPP
is more appropriate than -EXDEV when (sb1 != sb2).

>
> But I may have got it all wrong.  I've looked so many times at this code
> that I'm probably useless at finding problems in it :-)

You're not alone, we all try to do the right thing :-)

-Dai

>
> Cheers,
> --
> Luís
>
>> -Dai
>>
>>> +	} else {
>>> +                return -EOPNOTSUPP;
>>> +	}
>>> +
>>>    	ret = generic_file_rw_checks(file_in, file_out);
>>>    	if (ret)
>>>    		return ret;
>>> @@ -1495,6 +1492,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>>>    	file_start_write(file_out);
>>> +	ret = -EOPNOTSUPP;
>>>    	/*
>>>    	 * Try cloning first, this is supported by more file systems, and
>>>    	 * more efficient if both clone and copy are supported (e.g. NFS).
>>> @@ -1513,9 +1511,10 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>>>    		}
>>>    	}
>>> -	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
>>> -				flags);
>>> -	WARN_ON_ONCE(ret == -EOPNOTSUPP);
>>> +	if (file_out->f_op->copy_file_range)
>>> +		ret = file_out->f_op->copy_file_range(file_in, pos_in,
>>> +						      file_out, pos_out,
>>> +						      len, flags);
>>>    done:
>>>    	if (ret > 0) {
>>>    		fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-23 15:29                                   ` dai.ngo
@ 2021-02-23 16:02                                     ` dai.ngo
  2021-02-23 16:47                                       ` Amir Goldstein
  2021-02-23 17:13                                       ` Olga Kornievskaia
  0 siblings, 2 replies; 93+ messages in thread
From: dai.ngo @ 2021-02-23 16:02 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, linux-cifs,
	samba-technical, linux-fsdevel, linux-nfs


On 2/23/21 7:29 AM, dai.ngo@oracle.com wrote:
>
> On 2/23/21 2:32 AM, Luis Henriques wrote:
>> On Mon, Feb 22, 2021 at 08:25:27AM -0800, dai.ngo@oracle.com wrote:
>>> On 2/22/21 2:24 AM, Luis Henriques wrote:
>>>> A regression has been reported by Nicolas Boichat, found while 
>>>> using the
>>>> copy_file_range syscall to copy a tracefs file.  Before commit
>>>> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
>>>> kernel would return -EXDEV to userspace when trying to copy a file 
>>>> across
>>>> different filesystems.  After this commit, the syscall doesn't fail 
>>>> anymore
>>>> and instead returns zero (zero bytes copied), as this file's 
>>>> content is
>>>> generated on-the-fly and thus reports a size of zero.
>>>>
>>>> This patch restores some cross-filesystem copy restrictions that 
>>>> existed
>>>> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy 
>>>> across
>>>> devices").  Filesystems are still allowed to fall-back to the VFS
>>>> generic_copy_file_range() implementation, but that has now to be done
>>>> explicitly.
>>>>
>>>> nfsd is also modified to fall-back into generic_copy_file_range() 
>>>> in case
>>>> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>>>>
>>>> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across 
>>>> devices")
>>>> Link: 
>>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
>>>> Link: 
>>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
>>>> Link: 
>>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
>>>> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
>>>> Signed-off-by: Luis Henriques <lhenriques@suse.de>
>>>> ---
>>>> Changes since v7
>>>> - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so 
>>>> that the
>>>>     error returned is always related to the 'copy' operation
>>>> Changes since v6
>>>> - restored i_sb checks for the clone operation
>>>> Changes since v5
>>>> - check if ->copy_file_range is NULL before calling it
>>>> Changes since v4
>>>> - nfsd falls-back to generic_copy_file_range() only *if* it gets 
>>>> -EOPNOTSUPP
>>>>     or -EXDEV.
>>>> Changes since v3
>>>> - dropped the COPY_FILE_SPLICE flag
>>>> - kept the f_op's checks early in generic_copy_file_checks, 
>>>> implementing
>>>>     Amir's suggestions
>>>> - modified nfsd to use generic_copy_file_range()
>>>> Changes since v2
>>>> - do all the required checks earlier, in generic_copy_file_checks(),
>>>>     adding new checks for ->remap_file_range
>>>> - new COPY_FILE_SPLICE flag
>>>> - don't remove filesystem's fallback to generic_copy_file_range()
>>>> - updated commit changelog (and subject)
>>>> Changes since v1 (after Amir review)
>>>> - restored do_copy_file_range() helper
>>>> - return -EOPNOTSUPP if fs doesn't implement CFR
>>>> - updated commit description
>>>>
>>>>    fs/nfsd/vfs.c   |  8 +++++++-
>>>>    fs/read_write.c | 49 
>>>> ++++++++++++++++++++++++-------------------------
>>>>    2 files changed, 31 insertions(+), 26 deletions(-)
>>>>
>>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>>>> index 04937e51de56..23dab0fa9087 100644
>>>> --- a/fs/nfsd/vfs.c
>>>> +++ b/fs/nfsd/vfs.c
>>>> @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file 
>>>> *nf_src, u64 src_pos,
>>>>    ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, 
>>>> struct file *dst,
>>>>                     u64 dst_pos, u64 count)
>>>>    {
>>>> +    ssize_t ret;
>>>>        /*
>>>>         * Limit copy to 4MB to prevent indefinitely blocking an nfsd
>>>> @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, 
>>>> u64 src_pos, struct file *dst,
>>>>         * limit like this and pipeline multiple COPY requests.
>>>>         */
>>>>        count = min_t(u64, count, 1 << 22);
>>>> -    return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>>>> +    ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>>>> +
>>>> +    if (ret == -EOPNOTSUPP || ret == -EXDEV)
>>>> +        ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
>>>> +                          count, 0);
>>>> +    return ret;
>>>>    }
>>>>    __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh 
>>>> *fhp,
>>>> diff --git a/fs/read_write.c b/fs/read_write.c
>>>> index 75f764b43418..5a26297fd410 100644
>>>> --- a/fs/read_write.c
>>>> +++ b/fs/read_write.c
>>>> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file 
>>>> *file_in, loff_t pos_in,
>>>>    }
>>>>    EXPORT_SYMBOL(generic_copy_file_range);
>>>> -static ssize_t do_copy_file_range(struct file *file_in, loff_t 
>>>> pos_in,
>>>> -                  struct file *file_out, loff_t pos_out,
>>>> -                  size_t len, unsigned int flags)
>>>> -{
>>>> -    /*
>>>> -     * Although we now allow filesystems to handle cross sb copy, 
>>>> passing
>>>> -     * a file of the wrong filesystem type to filesystem driver 
>>>> can result
>>>> -     * in an attempt to dereference the wrong type of 
>>>> ->private_data, so
>>>> -     * avoid doing that until we really have a good reason.  NFS 
>>>> defines
>>>> -     * several different file_system_type structures, but they all 
>>>> end up
>>>> -     * using the same ->copy_file_range() function pointer.
>>>> -     */
>>>> -    if (file_out->f_op->copy_file_range &&
>>>> -        file_out->f_op->copy_file_range == 
>>>> file_in->f_op->copy_file_range)
>>>> -        return file_out->f_op->copy_file_range(file_in, pos_in,
>>>> -                               file_out, pos_out,
>>>> -                               len, flags);
>>>> -
>>>> -    return generic_copy_file_range(file_in, pos_in, file_out, 
>>>> pos_out, len,
>>>> -                       flags);
>>>> -}
>>>> -
>>>>    /*
>>>>     * Performs necessary checks before doing a file copy
>>>>     *
>>>> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct 
>>>> file *file_in, loff_t pos_in,
>>>>        loff_t size_in;
>>>>        int ret;
>>>> +    /*
>>>> +     * Although we now allow filesystems to handle cross sb copy, 
>>>> passing
>>>> +     * a file of the wrong filesystem type to filesystem driver 
>>>> can result
>>>> +     * in an attempt to dereference the wrong type of 
>>>> ->private_data, so
>>>> +     * avoid doing that until we really have a good reason.  NFS 
>>>> defines
>>>> +     * several different file_system_type structures, but they all 
>>>> end up
>>>> +     * using the same ->copy_file_range() function pointer.
>>>> +     */
>>>> +    if (file_out->f_op->copy_file_range) {
>>>> +        if (file_in->f_op->copy_file_range !=
>>>> +            file_out->f_op->copy_file_range)
>>>> +            return -EXDEV;
>>>> +    } else if (file_in->f_op->remap_file_range) {
>>>> +        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
>>>> +            return -EXDEV;
>>> I think this check is redundant, it's done in vfs_copy_file_range.
>>> If this check is removed then the else clause below should be removed
>>> also. Once this check and the else clause are removed then might as
>>> well move the the check of copy_file_range from here to 
>>> vfs_copy_file_range.
>>>
>> I don't think it's really redundant, although I agree is messy due to 
>> the
>> fact we try to clone first instead of copying them.
>>
>> So, in the clone path, this is the only place where we return -EXDEV if:
>>
>> 1) we don't have ->copy_file_range *and*
>> 2) we have ->remap_file_range but the i_sb are different.
>>
>> The check in vfs_copy_file_range() is only executed if:
>>
>> 1) we have *valid* ->copy_file_range ops and/or
>> 2) we have *valid* ->remap_file_range
>>
>> So... if we remove the check in generic_copy_file_checks() as you 
>> suggest
>> and:
>> - we don't have ->copy_file_range,
>> - we have ->remap_file_range but
>> - the i_sb are different
>>
>> we'll return the -EOPNOTSUPP (the one set in line "ret = 
>> -EOPNOTSUPP;" in
>> function vfs_copy_file_range() ) instead of -EXDEV.
>
> Yes, this is the different.The NFS code handles both -EOPNOTSUPP and
> -EXDEVV by doing generic_copy_file_range.  Do any other consumers of
> vfs_copy_file_range rely on -EXDEV and not -EOPNOTSUPP and which is
> the correct error code for this case? It seems to me that -EOPNOTSUPP
> is more appropriate than EXDEV when (sb1 != sb2).

So with the current patch, for a clone operation across 2 filesystems:

   . if src and dst filesystem support both copy_file_range and
     remap_file_range then the code returns -EOPNOTSUPP.

   . if the filesystems only support remap_file_range then the
     code returns -EXDEV

This seems confusing, shouldn't only 1 error code be returned for this case?

-Dai

>
>>
>> But I may have got it all wrong.  I've looked so many times at this code
>> that I'm probably useless at finding problems in it :-)
>
> You're not alone, we all try to do the right thing :-)
>
> -Dai
>
>>
>> Cheers,
>> -- 
>> Luís
>>
>>> -Dai
>>>
>>>> +    } else {
>>>> +                return -EOPNOTSUPP;
>>>> +    }
>>>> +
>>>>        ret = generic_file_rw_checks(file_in, file_out);
>>>>        if (ret)
>>>>            return ret;
>>>> @@ -1495,6 +1492,7 @@ ssize_t vfs_copy_file_range(struct file 
>>>> *file_in, loff_t pos_in,
>>>>        file_start_write(file_out);
>>>> +    ret = -EOPNOTSUPP;
>>>>        /*
>>>>         * Try cloning first, this is supported by more file 
>>>> systems, and
>>>>         * more efficient if both clone and copy are supported (e.g. 
>>>> NFS).
>>>> @@ -1513,9 +1511,10 @@ ssize_t vfs_copy_file_range(struct file 
>>>> *file_in, loff_t pos_in,
>>>>            }
>>>>        }
>>>> -    ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
>>>> -                flags);
>>>> -    WARN_ON_ONCE(ret == -EOPNOTSUPP);
>>>> +    if (file_out->f_op->copy_file_range)
>>>> +        ret = file_out->f_op->copy_file_range(file_in, pos_in,
>>>> +                              file_out, pos_out,
>>>> +                              len, flags);
>>>>    done:
>>>>        if (ret > 0) {
>>>>            fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-23 16:02                                     ` dai.ngo
@ 2021-02-23 16:47                                       ` Amir Goldstein
  2021-02-23 16:57                                         ` dai.ngo
  2021-02-23 17:13                                       ` Olga Kornievskaia
  1 sibling, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-23 16:47 UTC (permalink / raw)
  To: dai.ngo
  Cc: Luis Henriques, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List

On Tue, Feb 23, 2021 at 6:02 PM <dai.ngo@oracle.com> wrote:
>
>
> On 2/23/21 7:29 AM, dai.ngo@oracle.com wrote:
> >
> > On 2/23/21 2:32 AM, Luis Henriques wrote:
> >> On Mon, Feb 22, 2021 at 08:25:27AM -0800, dai.ngo@oracle.com wrote:
> >>> On 2/22/21 2:24 AM, Luis Henriques wrote:
> >>>> A regression has been reported by Nicolas Boichat, found while
> >>>> using the
> >>>> copy_file_range syscall to copy a tracefs file.  Before commit
> >>>> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> >>>> kernel would return -EXDEV to userspace when trying to copy a file
> >>>> across
> >>>> different filesystems.  After this commit, the syscall doesn't fail
> >>>> anymore
> >>>> and instead returns zero (zero bytes copied), as this file's
> >>>> content is
> >>>> generated on-the-fly and thus reports a size of zero.
> >>>>
> >>>> This patch restores some cross-filesystem copy restrictions that
> >>>> existed
> >>>> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy
> >>>> across
> >>>> devices").  Filesystems are still allowed to fall-back to the VFS
> >>>> generic_copy_file_range() implementation, but that has now to be done
> >>>> explicitly.
> >>>>
> >>>> nfsd is also modified to fall-back into generic_copy_file_range()
> >>>> in case
> >>>> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> >>>>
> >>>> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> >>>> devices")
> >>>> Link:
> >>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
> >>>> Link:
> >>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
> >>>> Link:
> >>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
> >>>> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> >>>> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> >>>> ---
> >>>> Changes since v7
> >>>> - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so
> >>>> that the
> >>>>     error returned is always related to the 'copy' operation
> >>>> Changes since v6
> >>>> - restored i_sb checks for the clone operation
> >>>> Changes since v5
> >>>> - check if ->copy_file_range is NULL before calling it
> >>>> Changes since v4
> >>>> - nfsd falls-back to generic_copy_file_range() only *if* it gets
> >>>> -EOPNOTSUPP
> >>>>     or -EXDEV.
> >>>> Changes since v3
> >>>> - dropped the COPY_FILE_SPLICE flag
> >>>> - kept the f_op's checks early in generic_copy_file_checks,
> >>>> implementing
> >>>>     Amir's suggestions
> >>>> - modified nfsd to use generic_copy_file_range()
> >>>> Changes since v2
> >>>> - do all the required checks earlier, in generic_copy_file_checks(),
> >>>>     adding new checks for ->remap_file_range
> >>>> - new COPY_FILE_SPLICE flag
> >>>> - don't remove filesystem's fallback to generic_copy_file_range()
> >>>> - updated commit changelog (and subject)
> >>>> Changes since v1 (after Amir review)
> >>>> - restored do_copy_file_range() helper
> >>>> - return -EOPNOTSUPP if fs doesn't implement CFR
> >>>> - updated commit description
> >>>>
> >>>>    fs/nfsd/vfs.c   |  8 +++++++-
> >>>>    fs/read_write.c | 49
> >>>> ++++++++++++++++++++++++-------------------------
> >>>>    2 files changed, 31 insertions(+), 26 deletions(-)
> >>>>
> >>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >>>> index 04937e51de56..23dab0fa9087 100644
> >>>> --- a/fs/nfsd/vfs.c
> >>>> +++ b/fs/nfsd/vfs.c
> >>>> @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file
> >>>> *nf_src, u64 src_pos,
> >>>>    ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos,
> >>>> struct file *dst,
> >>>>                     u64 dst_pos, u64 count)
> >>>>    {
> >>>> +    ssize_t ret;
> >>>>        /*
> >>>>         * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> >>>> @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src,
> >>>> u64 src_pos, struct file *dst,
> >>>>         * limit like this and pipeline multiple COPY requests.
> >>>>         */
> >>>>        count = min_t(u64, count, 1 << 22);
> >>>> -    return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> >>>> +    ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> >>>> +
> >>>> +    if (ret == -EOPNOTSUPP || ret == -EXDEV)
> >>>> +        ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
> >>>> +                          count, 0);
> >>>> +    return ret;
> >>>>    }
> >>>>    __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh
> >>>> *fhp,
> >>>> diff --git a/fs/read_write.c b/fs/read_write.c
> >>>> index 75f764b43418..5a26297fd410 100644
> >>>> --- a/fs/read_write.c
> >>>> +++ b/fs/read_write.c
> >>>> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file
> >>>> *file_in, loff_t pos_in,
> >>>>    }
> >>>>    EXPORT_SYMBOL(generic_copy_file_range);
> >>>> -static ssize_t do_copy_file_range(struct file *file_in, loff_t
> >>>> pos_in,
> >>>> -                  struct file *file_out, loff_t pos_out,
> >>>> -                  size_t len, unsigned int flags)
> >>>> -{
> >>>> -    /*
> >>>> -     * Although we now allow filesystems to handle cross sb copy,
> >>>> passing
> >>>> -     * a file of the wrong filesystem type to filesystem driver
> >>>> can result
> >>>> -     * in an attempt to dereference the wrong type of
> >>>> ->private_data, so
> >>>> -     * avoid doing that until we really have a good reason.  NFS
> >>>> defines
> >>>> -     * several different file_system_type structures, but they all
> >>>> end up
> >>>> -     * using the same ->copy_file_range() function pointer.
> >>>> -     */
> >>>> -    if (file_out->f_op->copy_file_range &&
> >>>> -        file_out->f_op->copy_file_range ==
> >>>> file_in->f_op->copy_file_range)
> >>>> -        return file_out->f_op->copy_file_range(file_in, pos_in,
> >>>> -                               file_out, pos_out,
> >>>> -                               len, flags);
> >>>> -
> >>>> -    return generic_copy_file_range(file_in, pos_in, file_out,
> >>>> pos_out, len,
> >>>> -                       flags);
> >>>> -}
> >>>> -
> >>>>    /*
> >>>>     * Performs necessary checks before doing a file copy
> >>>>     *
> >>>> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct
> >>>> file *file_in, loff_t pos_in,
> >>>>        loff_t size_in;
> >>>>        int ret;
> >>>> +    /*
> >>>> +     * Although we now allow filesystems to handle cross sb copy,
> >>>> passing
> >>>> +     * a file of the wrong filesystem type to filesystem driver
> >>>> can result
> >>>> +     * in an attempt to dereference the wrong type of
> >>>> ->private_data, so
> >>>> +     * avoid doing that until we really have a good reason.  NFS
> >>>> defines
> >>>> +     * several different file_system_type structures, but they all
> >>>> end up
> >>>> +     * using the same ->copy_file_range() function pointer.
> >>>> +     */
> >>>> +    if (file_out->f_op->copy_file_range) {
> >>>> +        if (file_in->f_op->copy_file_range !=
> >>>> +            file_out->f_op->copy_file_range)
> >>>> +            return -EXDEV;
> >>>> +    } else if (file_in->f_op->remap_file_range) {
> >>>> +        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> >>>> +            return -EXDEV;
> >>> I think this check is redundant, it's done in vfs_copy_file_range.
> >>> If this check is removed then the else clause below should be removed
> >>> also. Once this check and the else clause are removed then might as
> >>> well move the the check of copy_file_range from here to
> >>> vfs_copy_file_range.
> >>>
> >> I don't think it's really redundant, although I agree is messy due to
> >> the
> >> fact we try to clone first instead of copying them.
> >>
> >> So, in the clone path, this is the only place where we return -EXDEV if:
> >>
> >> 1) we don't have ->copy_file_range *and*
> >> 2) we have ->remap_file_range but the i_sb are different.
> >>
> >> The check in vfs_copy_file_range() is only executed if:
> >>
> >> 1) we have *valid* ->copy_file_range ops and/or
> >> 2) we have *valid* ->remap_file_range
> >>
> >> So... if we remove the check in generic_copy_file_checks() as you
> >> suggest
> >> and:
> >> - we don't have ->copy_file_range,
> >> - we have ->remap_file_range but
> >> - the i_sb are different
> >>
> >> we'll return the -EOPNOTSUPP (the one set in line "ret =
> >> -EOPNOTSUPP;" in
> >> function vfs_copy_file_range() ) instead of -EXDEV.
> >
> > Yes, this is the different.The NFS code handles both -EOPNOTSUPP and
> > -EXDEVV by doing generic_copy_file_range.  Do any other consumers of
> > vfs_copy_file_range rely on -EXDEV and not -EOPNOTSUPP and which is
> > the correct error code for this case? It seems to me that -EOPNOTSUPP
> > is more appropriate than EXDEV when (sb1 != sb2).
>

EXDEV is the right code for:
filesystem supports the operation but not for sb1 != sb2.

> So with the current patch, for a clone operation across 2 filesystems:
>
>    . if src and dst filesystem support both copy_file_range and
>      map_file_range then the code returns -ENOTSUPPORT.
>

Why do you say that?
Which code are you referring to exactly?
Did you see this behavior in a test?

>    . if the filesystems only support map_file_range then the
>      code returns -EXDEV
>
> This seems confusing, shouldn't only 1 error code returned for this case?
>

From my read of the code, user will get -EXDEV in both the cases you
listed.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-23 16:47                                       ` Amir Goldstein
@ 2021-02-23 16:57                                         ` dai.ngo
       [not found]                                           ` <e3eed18b-fc7e-e687-608b-7f662017329c@oracle.com>
  2021-02-23 17:56                                           ` Luis Henriques
  0 siblings, 2 replies; 93+ messages in thread
From: dai.ngo @ 2021-02-23 16:57 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Luis Henriques, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List


On 2/23/21 8:47 AM, Amir Goldstein wrote:
> On Tue, Feb 23, 2021 at 6:02 PM <dai.ngo@oracle.com> wrote:
>>
>> On 2/23/21 7:29 AM, dai.ngo@oracle.com wrote:
>>> On 2/23/21 2:32 AM, Luis Henriques wrote:
>>>> On Mon, Feb 22, 2021 at 08:25:27AM -0800, dai.ngo@oracle.com wrote:
>>>>> On 2/22/21 2:24 AM, Luis Henriques wrote:
>>>>>> A regression has been reported by Nicolas Boichat, found while
>>>>>> using the
>>>>>> copy_file_range syscall to copy a tracefs file.  Before commit
>>>>>> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
>>>>>> kernel would return -EXDEV to userspace when trying to copy a file
>>>>>> across
>>>>>> different filesystems.  After this commit, the syscall doesn't fail
>>>>>> anymore
>>>>>> and instead returns zero (zero bytes copied), as this file's
>>>>>> content is
>>>>>> generated on-the-fly and thus reports a size of zero.
>>>>>>
>>>>>> This patch restores some cross-filesystem copy restrictions that
>>>>>> existed
>>>>>> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy
>>>>>> across
>>>>>> devices").  Filesystems are still allowed to fall-back to the VFS
>>>>>> generic_copy_file_range() implementation, but that has now to be done
>>>>>> explicitly.
>>>>>>
>>>>>> nfsd is also modified to fall-back into generic_copy_file_range()
>>>>>> in case
>>>>>> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>>>>>>
>>>>>> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>>>>>> devices")
>>>>>> Link:
>>>>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
>>>>>> Link:
>>>>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
>>>>>> Link:
>>>>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
>>>>>> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
>>>>>> Signed-off-by: Luis Henriques <lhenriques@suse.de>
>>>>>> ---
>>>>>> Changes since v7
>>>>>> - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so
>>>>>> that the
>>>>>>      error returned is always related to the 'copy' operation
>>>>>> Changes since v6
>>>>>> - restored i_sb checks for the clone operation
>>>>>> Changes since v5
>>>>>> - check if ->copy_file_range is NULL before calling it
>>>>>> Changes since v4
>>>>>> - nfsd falls-back to generic_copy_file_range() only *if* it gets
>>>>>> -EOPNOTSUPP
>>>>>>      or -EXDEV.
>>>>>> Changes since v3
>>>>>> - dropped the COPY_FILE_SPLICE flag
>>>>>> - kept the f_op's checks early in generic_copy_file_checks,
>>>>>> implementing
>>>>>>      Amir's suggestions
>>>>>> - modified nfsd to use generic_copy_file_range()
>>>>>> Changes since v2
>>>>>> - do all the required checks earlier, in generic_copy_file_checks(),
>>>>>>      adding new checks for ->remap_file_range
>>>>>> - new COPY_FILE_SPLICE flag
>>>>>> - don't remove filesystem's fallback to generic_copy_file_range()
>>>>>> - updated commit changelog (and subject)
>>>>>> Changes since v1 (after Amir review)
>>>>>> - restored do_copy_file_range() helper
>>>>>> - return -EOPNOTSUPP if fs doesn't implement CFR
>>>>>> - updated commit description
>>>>>>
>>>>>>     fs/nfsd/vfs.c   |  8 +++++++-
>>>>>>     fs/read_write.c | 49
>>>>>> ++++++++++++++++++++++++-------------------------
>>>>>>     2 files changed, 31 insertions(+), 26 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>>>>>> index 04937e51de56..23dab0fa9087 100644
>>>>>> --- a/fs/nfsd/vfs.c
>>>>>> +++ b/fs/nfsd/vfs.c
>>>>>> @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file
>>>>>> *nf_src, u64 src_pos,
>>>>>>     ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos,
>>>>>> struct file *dst,
>>>>>>                      u64 dst_pos, u64 count)
>>>>>>     {
>>>>>> +    ssize_t ret;
>>>>>>         /*
>>>>>>          * Limit copy to 4MB to prevent indefinitely blocking an nfsd
>>>>>> @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src,
>>>>>> u64 src_pos, struct file *dst,
>>>>>>          * limit like this and pipeline multiple COPY requests.
>>>>>>          */
>>>>>>         count = min_t(u64, count, 1 << 22);
>>>>>> -    return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>>>>>> +    ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>>>>>> +
>>>>>> +    if (ret == -EOPNOTSUPP || ret == -EXDEV)
>>>>>> +        ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
>>>>>> +                          count, 0);
>>>>>> +    return ret;
>>>>>>     }
>>>>>>     __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh
>>>>>> *fhp,
>>>>>> diff --git a/fs/read_write.c b/fs/read_write.c
>>>>>> index 75f764b43418..5a26297fd410 100644
>>>>>> --- a/fs/read_write.c
>>>>>> +++ b/fs/read_write.c
>>>>>> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file
>>>>>> *file_in, loff_t pos_in,
>>>>>>     }
>>>>>>     EXPORT_SYMBOL(generic_copy_file_range);
>>>>>> -static ssize_t do_copy_file_range(struct file *file_in, loff_t
>>>>>> pos_in,
>>>>>> -                  struct file *file_out, loff_t pos_out,
>>>>>> -                  size_t len, unsigned int flags)
>>>>>> -{
>>>>>> -    /*
>>>>>> -     * Although we now allow filesystems to handle cross sb copy,
>>>>>> passing
>>>>>> -     * a file of the wrong filesystem type to filesystem driver
>>>>>> can result
>>>>>> -     * in an attempt to dereference the wrong type of
>>>>>> ->private_data, so
>>>>>> -     * avoid doing that until we really have a good reason.  NFS
>>>>>> defines
>>>>>> -     * several different file_system_type structures, but they all
>>>>>> end up
>>>>>> -     * using the same ->copy_file_range() function pointer.
>>>>>> -     */
>>>>>> -    if (file_out->f_op->copy_file_range &&
>>>>>> -        file_out->f_op->copy_file_range ==
>>>>>> file_in->f_op->copy_file_range)
>>>>>> -        return file_out->f_op->copy_file_range(file_in, pos_in,
>>>>>> -                               file_out, pos_out,
>>>>>> -                               len, flags);
>>>>>> -
>>>>>> -    return generic_copy_file_range(file_in, pos_in, file_out,
>>>>>> pos_out, len,
>>>>>> -                       flags);
>>>>>> -}
>>>>>> -
>>>>>>     /*
>>>>>>      * Performs necessary checks before doing a file copy
>>>>>>      *
>>>>>> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct
>>>>>> file *file_in, loff_t pos_in,
>>>>>>         loff_t size_in;
>>>>>>         int ret;
>>>>>> +    /*
>>>>>> +     * Although we now allow filesystems to handle cross sb copy,
>>>>>> passing
>>>>>> +     * a file of the wrong filesystem type to filesystem driver
>>>>>> can result
>>>>>> +     * in an attempt to dereference the wrong type of
>>>>>> ->private_data, so
>>>>>> +     * avoid doing that until we really have a good reason.  NFS
>>>>>> defines
>>>>>> +     * several different file_system_type structures, but they all
>>>>>> end up
>>>>>> +     * using the same ->copy_file_range() function pointer.
>>>>>> +     */
>>>>>> +    if (file_out->f_op->copy_file_range) {
>>>>>> +        if (file_in->f_op->copy_file_range !=
>>>>>> +            file_out->f_op->copy_file_range)
>>>>>> +            return -EXDEV;
>>>>>> +    } else if (file_in->f_op->remap_file_range) {
>>>>>> +        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
>>>>>> +            return -EXDEV;
>>>>> I think this check is redundant, it's done in vfs_copy_file_range.
>>>>> If this check is removed then the else clause below should be removed
>>>>> also. Once this check and the else clause are removed then might as
>>>>> well move the the check of copy_file_range from here to
>>>>> vfs_copy_file_range.
>>>>>
>>>> I don't think it's really redundant, although I agree is messy due to
>>>> the
>>>> fact we try to clone first instead of copying them.
>>>>
>>>> So, in the clone path, this is the only place where we return -EXDEV if:
>>>>
>>>> 1) we don't have ->copy_file_range *and*
>>>> 2) we have ->remap_file_range but the i_sb are different.
>>>>
>>>> The check in vfs_copy_file_range() is only executed if:
>>>>
>>>> 1) we have *valid* ->copy_file_range ops and/or
>>>> 2) we have *valid* ->remap_file_range
>>>>
>>>> So... if we remove the check in generic_copy_file_checks() as you
>>>> suggest
>>>> and:
>>>> - we don't have ->copy_file_range,
>>>> - we have ->remap_file_range but
>>>> - the i_sb are different
>>>>
>>>> we'll return the -EOPNOTSUPP (the one set in line "ret =
>>>> -EOPNOTSUPP;" in
>>>> function vfs_copy_file_range() ) instead of -EXDEV.
>>> Yes, this is the different.The NFS code handles both -EOPNOTSUPP and
>>> -EXDEVV by doing generic_copy_file_range.  Do any other consumers of
>>> vfs_copy_file_range rely on -EXDEV and not -EOPNOTSUPP and which is
>>> the correct error code for this case? It seems to me that -EOPNOTSUPP
>>> is more appropriate than EXDEV when (sb1 != sb2).
> EXDEV is the right code for:
> filesystem supports the operation but not for sb1 != sb1.
>
>> So with the current patch, for a clone operation across 2 filesystems:
>>
>>     . if src and dst filesystem support both copy_file_range and
>>       map_file_range then the code returns -ENOTSUPPORT.
>>
> Why do you say that?
> Which code are you referring to exactly?

If the filesystems support both copy_file_range and remap_file_range,
it passes the check in generic_copy_file_checks but it fails the
check in vfs_copy_file_range and returns -EOPNOTSUPP (added by
the v8 patch).

-Dai

> Did you see this behavior in a test?
>
>>     . if the filesystems only support map_file_range then the
>>       code returns -EXDEV
>>
>> This seems confusing, shouldn't only 1 error code returned for this case?
>>
>  From my read of the code, user will get -EXDEV in both the cases you
> listed.
>
> Thanks,
> Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-23 16:02                                     ` dai.ngo
  2021-02-23 16:47                                       ` Amir Goldstein
@ 2021-02-23 17:13                                       ` Olga Kornievskaia
  1 sibling, 0 replies; 93+ messages in thread
From: Olga Kornievskaia @ 2021-02-23 17:13 UTC (permalink / raw)
  To: Dai Ngo
  Cc: Luis Henriques, Amir Goldstein, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Christoph Hellwig,
	ceph-devel, linux-kernel, CIFS, samba-technical, linux-fsdevel,
	linux-nfs

On Tue, Feb 23, 2021 at 11:03 AM <dai.ngo@oracle.com> wrote:
>
>
> On 2/23/21 7:29 AM, dai.ngo@oracle.com wrote:
> >
> > On 2/23/21 2:32 AM, Luis Henriques wrote:
> >> On Mon, Feb 22, 2021 at 08:25:27AM -0800, dai.ngo@oracle.com wrote:
> >>> On 2/22/21 2:24 AM, Luis Henriques wrote:
> >>>> A regression has been reported by Nicolas Boichat, found while
> >>>> using the
> >>>> copy_file_range syscall to copy a tracefs file.  Before commit
> >>>> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> >>>> kernel would return -EXDEV to userspace when trying to copy a file
> >>>> across
> >>>> different filesystems.  After this commit, the syscall doesn't fail
> >>>> anymore
> >>>> and instead returns zero (zero bytes copied), as this file's
> >>>> content is
> >>>> generated on-the-fly and thus reports a size of zero.
> >>>>
> >>>> This patch restores some cross-filesystem copy restrictions that
> >>>> existed
> >>>> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy
> >>>> across
> >>>> devices").  Filesystems are still allowed to fall-back to the VFS
> >>>> generic_copy_file_range() implementation, but that has now to be done
> >>>> explicitly.
> >>>>
> >>>> nfsd is also modified to fall-back into generic_copy_file_range()
> >>>> in case
> >>>> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> >>>>
> >>>> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> >>>> devices")
> >>>> Link:
> >>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
> >>>> Link:
> >>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
> >>>> Link:
> >>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
> >>>> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> >>>> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> >>>> ---
> >>>> Changes since v7
> >>>> - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so
> >>>> that the
> >>>>     error returned is always related to the 'copy' operation
> >>>> Changes since v6
> >>>> - restored i_sb checks for the clone operation
> >>>> Changes since v5
> >>>> - check if ->copy_file_range is NULL before calling it
> >>>> Changes since v4
> >>>> - nfsd falls-back to generic_copy_file_range() only *if* it gets
> >>>> -EOPNOTSUPP
> >>>>     or -EXDEV.
> >>>> Changes since v3
> >>>> - dropped the COPY_FILE_SPLICE flag
> >>>> - kept the f_op's checks early in generic_copy_file_checks,
> >>>> implementing
> >>>>     Amir's suggestions
> >>>> - modified nfsd to use generic_copy_file_range()
> >>>> Changes since v2
> >>>> - do all the required checks earlier, in generic_copy_file_checks(),
> >>>>     adding new checks for ->remap_file_range
> >>>> - new COPY_FILE_SPLICE flag
> >>>> - don't remove filesystem's fallback to generic_copy_file_range()
> >>>> - updated commit changelog (and subject)
> >>>> Changes since v1 (after Amir review)
> >>>> - restored do_copy_file_range() helper
> >>>> - return -EOPNOTSUPP if fs doesn't implement CFR
> >>>> - updated commit description
> >>>>
> >>>>    fs/nfsd/vfs.c   |  8 +++++++-
> >>>>    fs/read_write.c | 49
> >>>> ++++++++++++++++++++++++-------------------------
> >>>>    2 files changed, 31 insertions(+), 26 deletions(-)
> >>>>
> >>>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >>>> index 04937e51de56..23dab0fa9087 100644
> >>>> --- a/fs/nfsd/vfs.c
> >>>> +++ b/fs/nfsd/vfs.c
> >>>> @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file
> >>>> *nf_src, u64 src_pos,
> >>>>    ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos,
> >>>> struct file *dst,
> >>>>                     u64 dst_pos, u64 count)
> >>>>    {
> >>>> +    ssize_t ret;
> >>>>        /*
> >>>>         * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> >>>> @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src,
> >>>> u64 src_pos, struct file *dst,
> >>>>         * limit like this and pipeline multiple COPY requests.
> >>>>         */
> >>>>        count = min_t(u64, count, 1 << 22);
> >>>> -    return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> >>>> +    ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> >>>> +
> >>>> +    if (ret == -EOPNOTSUPP || ret == -EXDEV)
> >>>> +        ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
> >>>> +                          count, 0);
> >>>> +    return ret;
> >>>>    }
> >>>>    __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh
> >>>> *fhp,
> >>>> diff --git a/fs/read_write.c b/fs/read_write.c
> >>>> index 75f764b43418..5a26297fd410 100644
> >>>> --- a/fs/read_write.c
> >>>> +++ b/fs/read_write.c
> >>>> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file
> >>>> *file_in, loff_t pos_in,
> >>>>    }
> >>>>    EXPORT_SYMBOL(generic_copy_file_range);
> >>>> -static ssize_t do_copy_file_range(struct file *file_in, loff_t
> >>>> pos_in,
> >>>> -                  struct file *file_out, loff_t pos_out,
> >>>> -                  size_t len, unsigned int flags)
> >>>> -{
> >>>> -    /*
> >>>> -     * Although we now allow filesystems to handle cross sb copy,
> >>>> passing
> >>>> -     * a file of the wrong filesystem type to filesystem driver
> >>>> can result
> >>>> -     * in an attempt to dereference the wrong type of
> >>>> ->private_data, so
> >>>> -     * avoid doing that until we really have a good reason.  NFS
> >>>> defines
> >>>> -     * several different file_system_type structures, but they all
> >>>> end up
> >>>> -     * using the same ->copy_file_range() function pointer.
> >>>> -     */
> >>>> -    if (file_out->f_op->copy_file_range &&
> >>>> -        file_out->f_op->copy_file_range ==
> >>>> file_in->f_op->copy_file_range)
> >>>> -        return file_out->f_op->copy_file_range(file_in, pos_in,
> >>>> -                               file_out, pos_out,
> >>>> -                               len, flags);
> >>>> -
> >>>> -    return generic_copy_file_range(file_in, pos_in, file_out,
> >>>> pos_out, len,
> >>>> -                       flags);
> >>>> -}
> >>>> -
> >>>>    /*
> >>>>     * Performs necessary checks before doing a file copy
> >>>>     *
> >>>> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct
> >>>> file *file_in, loff_t pos_in,
> >>>>        loff_t size_in;
> >>>>        int ret;
> >>>> +    /*
> >>>> +     * Although we now allow filesystems to handle cross sb copy,
> >>>> passing
> >>>> +     * a file of the wrong filesystem type to filesystem driver
> >>>> can result
> >>>> +     * in an attempt to dereference the wrong type of
> >>>> ->private_data, so
> >>>> +     * avoid doing that until we really have a good reason.  NFS
> >>>> defines
> >>>> +     * several different file_system_type structures, but they all
> >>>> end up
> >>>> +     * using the same ->copy_file_range() function pointer.
> >>>> +     */
> >>>> +    if (file_out->f_op->copy_file_range) {
> >>>> +        if (file_in->f_op->copy_file_range !=
> >>>> +            file_out->f_op->copy_file_range)
> >>>> +            return -EXDEV;
> >>>> +    } else if (file_in->f_op->remap_file_range) {
> >>>> +        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> >>>> +            return -EXDEV;
> >>> I think this check is redundant, it's done in vfs_copy_file_range.
> >>> If this check is removed then the else clause below should be removed
> >>> also. Once this check and the else clause are removed then might as
> >>> well move the the check of copy_file_range from here to
> >>> vfs_copy_file_range.
> >>>
> >> I don't think it's really redundant, although I agree is messy due to
> >> the
> >> fact we try to clone first instead of copying them.
> >>
> >> So, in the clone path, this is the only place where we return -EXDEV if:
> >>
> >> 1) we don't have ->copy_file_range *and*
> >> 2) we have ->remap_file_range but the i_sb are different.
> >>
> >> The check in vfs_copy_file_range() is only executed if:
> >>
> >> 1) we have *valid* ->copy_file_range ops and/or
> >> 2) we have *valid* ->remap_file_range
> >>
> >> So... if we remove the check in generic_copy_file_checks() as you
> >> suggest
> >> and:
> >> - we don't have ->copy_file_range,
> >> - we have ->remap_file_range but
> >> - the i_sb are different
> >>
> >> we'll return the -EOPNOTSUPP (the one set in line "ret =
> >> -EOPNOTSUPP;" in
> >> function vfs_copy_file_range() ) instead of -EXDEV.
> >
> > Yes, this is the different.The NFS code handles both -EOPNOTSUPP and
> > -EXDEVV by doing generic_copy_file_range.  Do any other consumers of
> > vfs_copy_file_range rely on -EXDEV and not -EOPNOTSUPP and which is
> > the correct error code for this case? It seems to me that -EOPNOTSUPP
> > is more appropriate than EXDEV when (sb1 != sb2).
>
> So with the current patch, for a clone operation across 2 filesystems:

Wait, I can't get past "a clone operation across 2 filesystems"; I
thought there were not any options. It's not allowed? Then we go try
the copy. Those are two different steps, so the error codes might be
different.

>    . if src and dst filesystem support both copy_file_range and
>      map_file_range then the code returns -ENOTSUPPORT.
>
>    . if the filesystems only support map_file_range then the
>      code returns -EXDEV
>
> This seems confusing, shouldn't only 1 error code returned for this case?
>
> -Dai
>
> >
> >>
> >> But I may have got it all wrong.  I've looked so many times at this code
> >> that I'm probably useless at finding problems in it :-)
> >
> > You're not alone, we all try to do the right thing :-)
> >
> > -Dai
> >
> >>
> >> Cheers,
> >> --
> >> Luís
> >>
> >>> -Dai
> >>>
> >>>> +    } else {
> >>>> +                return -EOPNOTSUPP;
> >>>> +    }
> >>>> +
> >>>>        ret = generic_file_rw_checks(file_in, file_out);
> >>>>        if (ret)
> >>>>            return ret;
> >>>> @@ -1495,6 +1492,7 @@ ssize_t vfs_copy_file_range(struct file
> >>>> *file_in, loff_t pos_in,
> >>>>        file_start_write(file_out);
> >>>> +    ret = -EOPNOTSUPP;
> >>>>        /*
> >>>>         * Try cloning first, this is supported by more file
> >>>> systems, and
> >>>>         * more efficient if both clone and copy are supported (e.g.
> >>>> NFS).
> >>>> @@ -1513,9 +1511,10 @@ ssize_t vfs_copy_file_range(struct file
> >>>> *file_in, loff_t pos_in,
> >>>>            }
> >>>>        }
> >>>> -    ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> >>>> -                flags);
> >>>> -    WARN_ON_ONCE(ret == -EOPNOTSUPP);
> >>>> +    if (file_out->f_op->copy_file_range)
> >>>> +        ret = file_out->f_op->copy_file_range(file_in, pos_in,
> >>>> +                              file_out, pos_out,
> >>>> +                              len, flags);
> >>>>    done:
> >>>>        if (ret > 0) {
> >>>>            fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
       [not found]                                           ` <e3eed18b-fc7e-e687-608b-7f662017329c@oracle.com>
@ 2021-02-23 17:33                                             ` Amir Goldstein
  2021-02-24  0:13                                               ` dai.ngo
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-23 17:33 UTC (permalink / raw)
  To: dai.ngo
  Cc: Luis Henriques, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List

On Tue, Feb 23, 2021 at 7:31 PM <dai.ngo@oracle.com> wrote:
>
> On 2/23/21 8:57 AM, dai.ngo@oracle.com wrote:
>
>
> On 2/23/21 8:47 AM, Amir Goldstein wrote:
>
> On Tue, Feb 23, 2021 at 6:02 PM <dai.ngo@oracle.com> wrote:
>
>
> On 2/23/21 7:29 AM, dai.ngo@oracle.com wrote:
>
> On 2/23/21 2:32 AM, Luis Henriques wrote:
>
> On Mon, Feb 22, 2021 at 08:25:27AM -0800, dai.ngo@oracle.com wrote:
>
> On 2/22/21 2:24 AM, Luis Henriques wrote:
>
> A regression has been reported by Nicolas Boichat, found while
> using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file
> across
> different filesystems.  After this commit, the syscall doesn't fail
> anymore
> and instead returns zero (zero bytes copied), as this file's
> content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystem copy restrictions that
> existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy
> across
> devices").  Filesystems are still allowed to fall-back to the VFS
> generic_copy_file_range() implementation, but that has now to be done
> explicitly.
>
> nfsd is also modified to fall-back into generic_copy_file_range()
> in case
> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices")
> Link:
> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
> Link:
> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
> Link:
> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> Changes since v7
> - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so
> that the
>      error returned is always related to the 'copy' operation
> Changes since v6
> - restored i_sb checks for the clone operation
> Changes since v5
> - check if ->copy_file_range is NULL before calling it
> Changes since v4
> - nfsd falls-back to generic_copy_file_range() only *if* it gets
> -EOPNOTSUPP
>      or -EXDEV.
> Changes since v3
> - dropped the COPY_FILE_SPLICE flag
> - kept the f_op's checks early in generic_copy_file_checks,
> implementing
>      Amir's suggestions
> - modified nfsd to use generic_copy_file_range()
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
>      adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
>     fs/nfsd/vfs.c   |  8 +++++++-
>     fs/read_write.c | 49
> ++++++++++++++++++++++++-------------------------
>     2 files changed, 31 insertions(+), 26 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..23dab0fa9087 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file
> *nf_src, u64 src_pos,
>     ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos,
> struct file *dst,
>                      u64 dst_pos, u64 count)
>     {
> +    ssize_t ret;
>         /*
>          * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src,
> u64 src_pos, struct file *dst,
>          * limit like this and pipeline multiple COPY requests.
>          */
>         count = min_t(u64, count, 1 << 22);
> -    return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +    ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +
> +    if (ret == -EOPNOTSUPP || ret == -EXDEV)
> +        ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
> +                          count, 0);
> +    return ret;
>     }
>     __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh
> *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..5a26297fd410 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file
> *file_in, loff_t pos_in,
>     }
>     EXPORT_SYMBOL(generic_copy_file_range);
> -static ssize_t do_copy_file_range(struct file *file_in, loff_t
> pos_in,
> -                  struct file *file_out, loff_t pos_out,
> -                  size_t len, unsigned int flags)
> -{
> -    /*
> -     * Although we now allow filesystems to handle cross sb copy,
> passing
> -     * a file of the wrong filesystem type to filesystem driver
> can result
> -     * in an attempt to dereference the wrong type of
> ->private_data, so
> -     * avoid doing that until we really have a good reason.  NFS
> defines
> -     * several different file_system_type structures, but they all
> end up
> -     * using the same ->copy_file_range() function pointer.
> -     */
> -    if (file_out->f_op->copy_file_range &&
> -        file_out->f_op->copy_file_range ==
> file_in->f_op->copy_file_range)
> -        return file_out->f_op->copy_file_range(file_in, pos_in,
> -                               file_out, pos_out,
> -                               len, flags);
> -
> -    return generic_copy_file_range(file_in, pos_in, file_out,
> pos_out, len,
> -                       flags);
> -}
> -
>     /*
>      * Performs necessary checks before doing a file copy
>      *
> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct
> file *file_in, loff_t pos_in,
>         loff_t size_in;
>         int ret;
> +    /*
> +     * Although we now allow filesystems to handle cross sb copy,
> passing
> +     * a file of the wrong filesystem type to filesystem driver
> can result
> +     * in an attempt to dereference the wrong type of
> ->private_data, so
> +     * avoid doing that until we really have a good reason.  NFS
> defines
> +     * several different file_system_type structures, but they all
> end up
> +     * using the same ->copy_file_range() function pointer.
> +     */
> +    if (file_out->f_op->copy_file_range) {
> +        if (file_in->f_op->copy_file_range !=
> +            file_out->f_op->copy_file_range)
> +            return -EXDEV;
> +    } else if (file_in->f_op->remap_file_range) {
> +        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> +            return -EXDEV;
>
> I think this check is redundant, it's done in vfs_copy_file_range.
> If this check is removed then the else clause below should be removed
> also. Once this check and the else clause are removed then might as
> well move the check of copy_file_range from here to
> vfs_copy_file_range.
>
> I don't think it's really redundant, although I agree it is messy due to
> the
> fact we try to clone first instead of copying them.
>
> So, in the clone path, this is the only place where we return -EXDEV if:
>
> 1) we don't have ->copy_file_range *and*
> 2) we have ->remap_file_range but the i_sb are different.
>
> The check in vfs_copy_file_range() is only executed if:
>
> 1) we have *valid* ->copy_file_range ops and/or
> 2) we have *valid* ->remap_file_range
>
> So... if we remove the check in generic_copy_file_checks() as you
> suggest
> and:
> - we don't have ->copy_file_range,
> - we have ->remap_file_range but
> - the i_sb are different
>
> we'll return the -EOPNOTSUPP (the one set in line "ret =
> -EOPNOTSUPP;" in
> function vfs_copy_file_range() ) instead of -EXDEV.
>
> Yes, this is the difference.  The NFS code handles both -EOPNOTSUPP and
> -EXDEV by doing generic_copy_file_range.  Do any other consumers of
> vfs_copy_file_range rely on -EXDEV and not -EOPNOTSUPP, and which is
> the correct error code for this case? It seems to me that -EOPNOTSUPP
> is more appropriate than -EXDEV when (sb1 != sb2).
>
> EXDEV is the right code for:
> filesystem supports the operation but not for sb1 != sb2.
>
> So with the current patch, for a clone operation across 2 filesystems:
>
>     . if src and dst filesystems support both copy_file_range and
>       remap_file_range then the code returns -EOPNOTSUPP.
>
> Why do you say that?
> Which code are you referring to exactly?
>
>
> If the filesystems support both copy_file_range and remap_file_range,
> it passes the check in generic_copy_file_checks but it fails the
> check in vfs_copy_file_range and returns -EOPNOTSUPP (added by
> the v8 patch)
>
> Ok, I misread the code here. If it passes the check in generic_copy_file_checks
> and it fails the sb check in vfs_copy_file_range then it tries copy_file_range
> so it's ok.
>
> I think having the check in both generic_copy_file_checks and vfs_copy_file_range
> makes the code hard to read. What's the reason not to do the check only in
> vfs_copy_file_range?
>

You are going in circles.
I already answered that.
Please re-read the entire thread on all patch versions before commenting.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-23 16:57                                         ` dai.ngo
       [not found]                                           ` <e3eed18b-fc7e-e687-608b-7f662017329c@oracle.com>
@ 2021-02-23 17:56                                           ` Luis Henriques
  1 sibling, 0 replies; 93+ messages in thread
From: Luis Henriques @ 2021-02-23 17:56 UTC (permalink / raw)
  To: dai.ngo
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List

On Tue, Feb 23, 2021 at 08:57:38AM -0800, dai.ngo@oracle.com wrote:
> 
> On 2/23/21 8:47 AM, Amir Goldstein wrote:
> > On Tue, Feb 23, 2021 at 6:02 PM <dai.ngo@oracle.com> wrote:
> > > 
> > > On 2/23/21 7:29 AM, dai.ngo@oracle.com wrote:
> > > > On 2/23/21 2:32 AM, Luis Henriques wrote:
> > > > > On Mon, Feb 22, 2021 at 08:25:27AM -0800, dai.ngo@oracle.com wrote:
> > > > > > On 2/22/21 2:24 AM, Luis Henriques wrote:
> > > > > > > A regression has been reported by Nicolas Boichat, found while
> > > > > > > using the
> > > > > > > copy_file_range syscall to copy a tracefs file.  Before commit
> > > > > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > > > > > > kernel would return -EXDEV to userspace when trying to copy a file
> > > > > > > across
> > > > > > > different filesystems.  After this commit, the syscall doesn't fail
> > > > > > > anymore
> > > > > > > and instead returns zero (zero bytes copied), as this file's
> > > > > > > content is
> > > > > > > generated on-the-fly and thus reports a size of zero.
> > > > > > > 
> > > > > > > This patch restores some cross-filesystem copy restrictions that
> > > > > > > existed
> > > > > > > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy
> > > > > > > across
> > > > > > > devices").  Filesystems are still allowed to fall-back to the VFS
> > > > > > > generic_copy_file_range() implementation, but that has now to be done
> > > > > > > explicitly.
> > > > > > > 
> > > > > > > nfsd is also modified to fall-back into generic_copy_file_range()
> > > > > > > in case
> > > > > > > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> > > > > > > 
> > > > > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > > > > > devices")
> > > > > > > Link:
> > > > > > > https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
> > > > > > > Link:
> > > > > > > https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
> > > > > > > Link:
> > > > > > > https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
> > > > > > > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> > > > > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > > > > > > ---
> > > > > > > Changes since v7
> > > > > > > - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so
> > > > > > > that the
> > > > > > >      error returned is always related to the 'copy' operation
> > > > > > > Changes since v6
> > > > > > > - restored i_sb checks for the clone operation
> > > > > > > Changes since v5
> > > > > > > - check if ->copy_file_range is NULL before calling it
> > > > > > > Changes since v4
> > > > > > > - nfsd falls-back to generic_copy_file_range() only *if* it gets
> > > > > > > -EOPNOTSUPP
> > > > > > >      or -EXDEV.
> > > > > > > Changes since v3
> > > > > > > - dropped the COPY_FILE_SPLICE flag
> > > > > > > - kept the f_op's checks early in generic_copy_file_checks,
> > > > > > > implementing
> > > > > > >      Amir's suggestions
> > > > > > > - modified nfsd to use generic_copy_file_range()
> > > > > > > Changes since v2
> > > > > > > - do all the required checks earlier, in generic_copy_file_checks(),
> > > > > > >      adding new checks for ->remap_file_range
> > > > > > > - new COPY_FILE_SPLICE flag
> > > > > > > - don't remove filesystem's fallback to generic_copy_file_range()
> > > > > > > - updated commit changelog (and subject)
> > > > > > > Changes since v1 (after Amir review)
> > > > > > > - restored do_copy_file_range() helper
> > > > > > > - return -EOPNOTSUPP if fs doesn't implement CFR
> > > > > > > - updated commit description
> > > > > > > 
> > > > > > >     fs/nfsd/vfs.c   |  8 +++++++-
> > > > > > >     fs/read_write.c | 49
> > > > > > > ++++++++++++++++++++++++-------------------------
> > > > > > >     2 files changed, 31 insertions(+), 26 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > > > > index 04937e51de56..23dab0fa9087 100644
> > > > > > > --- a/fs/nfsd/vfs.c
> > > > > > > +++ b/fs/nfsd/vfs.c
> > > > > > > @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file
> > > > > > > *nf_src, u64 src_pos,
> > > > > > >     ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos,
> > > > > > > struct file *dst,
> > > > > > >                      u64 dst_pos, u64 count)
> > > > > > >     {
> > > > > > > +    ssize_t ret;
> > > > > > >         /*
> > > > > > >          * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> > > > > > > @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src,
> > > > > > > u64 src_pos, struct file *dst,
> > > > > > >          * limit like this and pipeline multiple COPY requests.
> > > > > > >          */
> > > > > > >         count = min_t(u64, count, 1 << 22);
> > > > > > > -    return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> > > > > > > +    ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> > > > > > > +
> > > > > > > +    if (ret == -EOPNOTSUPP || ret == -EXDEV)
> > > > > > > +        ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
> > > > > > > +                          count, 0);
> > > > > > > +    return ret;
> > > > > > >     }
> > > > > > >     __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh
> > > > > > > *fhp,
> > > > > > > diff --git a/fs/read_write.c b/fs/read_write.c
> > > > > > > index 75f764b43418..5a26297fd410 100644
> > > > > > > --- a/fs/read_write.c
> > > > > > > +++ b/fs/read_write.c
> > > > > > > @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file
> > > > > > > *file_in, loff_t pos_in,
> > > > > > >     }
> > > > > > >     EXPORT_SYMBOL(generic_copy_file_range);
> > > > > > > -static ssize_t do_copy_file_range(struct file *file_in, loff_t
> > > > > > > pos_in,
> > > > > > > -                  struct file *file_out, loff_t pos_out,
> > > > > > > -                  size_t len, unsigned int flags)
> > > > > > > -{
> > > > > > > -    /*
> > > > > > > -     * Although we now allow filesystems to handle cross sb copy,
> > > > > > > passing
> > > > > > > -     * a file of the wrong filesystem type to filesystem driver
> > > > > > > can result
> > > > > > > -     * in an attempt to dereference the wrong type of
> > > > > > > ->private_data, so
> > > > > > > -     * avoid doing that until we really have a good reason.  NFS
> > > > > > > defines
> > > > > > > -     * several different file_system_type structures, but they all
> > > > > > > end up
> > > > > > > -     * using the same ->copy_file_range() function pointer.
> > > > > > > -     */
> > > > > > > -    if (file_out->f_op->copy_file_range &&
> > > > > > > -        file_out->f_op->copy_file_range ==
> > > > > > > file_in->f_op->copy_file_range)
> > > > > > > -        return file_out->f_op->copy_file_range(file_in, pos_in,
> > > > > > > -                               file_out, pos_out,
> > > > > > > -                               len, flags);
> > > > > > > -
> > > > > > > -    return generic_copy_file_range(file_in, pos_in, file_out,
> > > > > > > pos_out, len,
> > > > > > > -                       flags);
> > > > > > > -}
> > > > > > > -
> > > > > > >     /*
> > > > > > >      * Performs necessary checks before doing a file copy
> > > > > > >      *
> > > > > > > @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct
> > > > > > > file *file_in, loff_t pos_in,
> > > > > > >         loff_t size_in;
> > > > > > >         int ret;
> > > > > > > +    /*
> > > > > > > +     * Although we now allow filesystems to handle cross sb copy,
> > > > > > > passing
> > > > > > > +     * a file of the wrong filesystem type to filesystem driver
> > > > > > > can result
> > > > > > > +     * in an attempt to dereference the wrong type of
> > > > > > > ->private_data, so
> > > > > > > +     * avoid doing that until we really have a good reason.  NFS
> > > > > > > defines
> > > > > > > +     * several different file_system_type structures, but they all
> > > > > > > end up
> > > > > > > +     * using the same ->copy_file_range() function pointer.
> > > > > > > +     */
> > > > > > > +    if (file_out->f_op->copy_file_range) {
> > > > > > > +        if (file_in->f_op->copy_file_range !=
> > > > > > > +            file_out->f_op->copy_file_range)
> > > > > > > +            return -EXDEV;
> > > > > > > +    } else if (file_in->f_op->remap_file_range) {
> > > > > > > +        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> > > > > > > +            return -EXDEV;
> > > > > > I think this check is redundant, it's done in vfs_copy_file_range.
> > > > > > If this check is removed then the else clause below should be removed
> > > > > > also. Once this check and the else clause are removed then might as
> > > > > > > well move the check of copy_file_range from here to
> > > > > > vfs_copy_file_range.
> > > > > > 
> > > > > > I don't think it's really redundant, although I agree it is messy due to
> > > > > > the
> > > > > > fact we try to clone first instead of copying them.
> > > > > 
> > > > > So, in the clone path, this is the only place where we return -EXDEV if:
> > > > > 
> > > > > 1) we don't have ->copy_file_range *and*
> > > > > 2) we have ->remap_file_range but the i_sb are different.
> > > > > 
> > > > > The check in vfs_copy_file_range() is only executed if:
> > > > > 
> > > > > 1) we have *valid* ->copy_file_range ops and/or
> > > > > 2) we have *valid* ->remap_file_range
> > > > > 
> > > > > So... if we remove the check in generic_copy_file_checks() as you
> > > > > suggest
> > > > > and:
> > > > > - we don't have ->copy_file_range,
> > > > > - we have ->remap_file_range but
> > > > > - the i_sb are different
> > > > > 
> > > > > we'll return the -EOPNOTSUPP (the one set in line "ret =
> > > > > -EOPNOTSUPP;" in
> > > > > function vfs_copy_file_range() ) instead of -EXDEV.
> > > > > Yes, this is the difference.  The NFS code handles both -EOPNOTSUPP and
> > > > > -EXDEV by doing generic_copy_file_range.  Do any other consumers of
> > > > > vfs_copy_file_range rely on -EXDEV and not -EOPNOTSUPP, and which is
> > > > > the correct error code for this case? It seems to me that -EOPNOTSUPP
> > > > > is more appropriate than -EXDEV when (sb1 != sb2).
> > EXDEV is the right code for:
> > filesystem supports the operation but not for sb1 != sb2.
> > 
> > > So with the current patch, for a clone operation across 2 filesystems:
> > > 
> > >     . if src and dst filesystems support both copy_file_range and
> > >       remap_file_range then the code returns -EOPNOTSUPP.
> > > 
> > Why do you say that?
> > Which code are you referring to exactly?
> 
> If the filesystems support both copy_file_range and remap_file_range,
> it passes the check in generic_copy_file_checks but it fails the
> check in vfs_copy_file_range and returns -EOPNOTSUPP (added by
> the v8 patch)

I'm sorry, but I simply can't see where this can happen.  If both file
operations are present (and all other checks pass), the code will first try
->remap_file_range.  If that succeeds, it bails out; if that fails, it tries
->copy_file_range.  The -EOPNOTSUPP is just there for the case where
->remap_file_range fails and ->copy_file_range isn't implemented.

[ <sigh> It would be so much easier if we didn't attempt to clone. ]

But as I said previously, I'm way beyond embarrassment now as I failed to
see too many obvious mistakes in previous versions :-)

Cheers,
--
Luís

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-23 17:33                                             ` Amir Goldstein
@ 2021-02-24  0:13                                               ` dai.ngo
  0 siblings, 0 replies; 93+ messages in thread
From: dai.ngo @ 2021-02-24  0:13 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Luis Henriques, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List

On 2/23/21 9:33 AM, Amir Goldstein wrote:
> On Tue, Feb 23, 2021 at 7:31 PM <dai.ngo@oracle.com> wrote:
>> On 2/23/21 8:57 AM, dai.ngo@oracle.com wrote:
>>
>>
>> On 2/23/21 8:47 AM, Amir Goldstein wrote:
>>
>> On Tue, Feb 23, 2021 at 6:02 PM <dai.ngo@oracle.com> wrote:
>>
>>
>> On 2/23/21 7:29 AM, dai.ngo@oracle.com wrote:
>>
>> On 2/23/21 2:32 AM, Luis Henriques wrote:
>>
>> On Mon, Feb 22, 2021 at 08:25:27AM -0800, dai.ngo@oracle.com wrote:
>>
>> On 2/22/21 2:24 AM, Luis Henriques wrote:
>>
>> A regression has been reported by Nicolas Boichat, found while
>> using the
>> copy_file_range syscall to copy a tracefs file.  Before commit
>> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
>> kernel would return -EXDEV to userspace when trying to copy a file
>> across
>> different filesystems.  After this commit, the syscall doesn't fail
>> anymore
>> and instead returns zero (zero bytes copied), as this file's
>> content is
>> generated on-the-fly and thus reports a size of zero.
>>
>> This patch restores some cross-filesystem copy restrictions that
>> existed
>> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy
>> across
>> devices").  Filesystems are still allowed to fall-back to the VFS
>> generic_copy_file_range() implementation, but that has now to be done
>> explicitly.
>>
>> nfsd is also modified to fall-back into generic_copy_file_range()
>> in case
>> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>>
>> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>> devices")
>> Link:
>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmi49dC6w$
>> Link:
>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx*BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/__;Kw!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmgCmMHzA$
>> Link:
>> https://urldefense.com/v3/__https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/__;!!GqivPVa7Brio!P1UWThiSkxbjfjFQWNYJmCxGEkiLFyvHjH6cS-G1ZTt1z-TeqwGQgQmzqItkrQ$
>> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
>> Signed-off-by: Luis Henriques <lhenriques@suse.de>
>> ---
>> Changes since v7
>> - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so
>> that the
>>       error returned is always related to the 'copy' operation
>> Changes since v6
>> - restored i_sb checks for the clone operation
>> Changes since v5
>> - check if ->copy_file_range is NULL before calling it
>> Changes since v4
>> - nfsd falls-back to generic_copy_file_range() only *if* it gets
>> -EOPNOTSUPP
>>       or -EXDEV.
>> Changes since v3
>> - dropped the COPY_FILE_SPLICE flag
>> - kept the f_op's checks early in generic_copy_file_checks,
>> implementing
>>       Amir's suggestions
>> - modified nfsd to use generic_copy_file_range()
>> Changes since v2
>> - do all the required checks earlier, in generic_copy_file_checks(),
>>       adding new checks for ->remap_file_range
>> - new COPY_FILE_SPLICE flag
>> - don't remove filesystem's fallback to generic_copy_file_range()
>> - updated commit changelog (and subject)
>> Changes since v1 (after Amir review)
>> - restored do_copy_file_range() helper
>> - return -EOPNOTSUPP if fs doesn't implement CFR
>> - updated commit description
>>
>>      fs/nfsd/vfs.c   |  8 +++++++-
>>      fs/read_write.c | 49
>> ++++++++++++++++++++++++-------------------------
>>      2 files changed, 31 insertions(+), 26 deletions(-)
>>
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index 04937e51de56..23dab0fa9087 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file
>> *nf_src, u64 src_pos,
>>      ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos,
>> struct file *dst,
>>                       u64 dst_pos, u64 count)
>>      {
>> +    ssize_t ret;
>>          /*
>>           * Limit copy to 4MB to prevent indefinitely blocking an nfsd
>> @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src,
>> u64 src_pos, struct file *dst,
>>           * limit like this and pipeline multiple COPY requests.
>>           */
>>          count = min_t(u64, count, 1 << 22);
>> -    return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>> +    ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>> +
>> +    if (ret == -EOPNOTSUPP || ret == -EXDEV)
>> +        ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
>> +                          count, 0);
>> +    return ret;
>>      }
>>      __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh
>> *fhp,
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index 75f764b43418..5a26297fd410 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file
>> *file_in, loff_t pos_in,
>>      }
>>      EXPORT_SYMBOL(generic_copy_file_range);
>> -static ssize_t do_copy_file_range(struct file *file_in, loff_t
>> pos_in,
>> -                  struct file *file_out, loff_t pos_out,
>> -                  size_t len, unsigned int flags)
>> -{
>> -    /*
>> -     * Although we now allow filesystems to handle cross sb copy,
>> passing
>> -     * a file of the wrong filesystem type to filesystem driver
>> can result
>> -     * in an attempt to dereference the wrong type of
>> ->private_data, so
>> -     * avoid doing that until we really have a good reason.  NFS
>> defines
>> -     * several different file_system_type structures, but they all
>> end up
>> -     * using the same ->copy_file_range() function pointer.
>> -     */
>> -    if (file_out->f_op->copy_file_range &&
>> -        file_out->f_op->copy_file_range ==
>> file_in->f_op->copy_file_range)
>> -        return file_out->f_op->copy_file_range(file_in, pos_in,
>> -                               file_out, pos_out,
>> -                               len, flags);
>> -
>> -    return generic_copy_file_range(file_in, pos_in, file_out,
>> pos_out, len,
>> -                       flags);
>> -}
>> -
>>      /*
>>       * Performs necessary checks before doing a file copy
>>       *
>> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct
>> file *file_in, loff_t pos_in,
>>          loff_t size_in;
>>          int ret;
>> +    /*
>> +     * Although we now allow filesystems to handle cross sb copy,
>> passing
>> +     * a file of the wrong filesystem type to filesystem driver
>> can result
>> +     * in an attempt to dereference the wrong type of
>> ->private_data, so
>> +     * avoid doing that until we really have a good reason.  NFS
>> defines
>> +     * several different file_system_type structures, but they all
>> end up
>> +     * using the same ->copy_file_range() function pointer.
>> +     */
>> +    if (file_out->f_op->copy_file_range) {
>> +        if (file_in->f_op->copy_file_range !=
>> +            file_out->f_op->copy_file_range)
>> +            return -EXDEV;
>> +    } else if (file_in->f_op->remap_file_range) {
>> +        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
>> +            return -EXDEV;
>>
>> I think this check is redundant, it's done in vfs_copy_file_range.
>> If this check is removed then the else clause below should be removed
>> also. Once this check and the else clause are removed then might as
>> well move the check of copy_file_range from here to
>> vfs_copy_file_range.
>>
>> I don't think it's really redundant, although I agree it is messy due to
>> the
>> fact we try to clone first instead of copying them.
>>
>> So, in the clone path, this is the only place where we return -EXDEV if:
>>
>> 1) we don't have ->copy_file_range *and*
>> 2) we have ->remap_file_range but the i_sb are different.
>>
>> The check in vfs_copy_file_range() is only executed if:
>>
>> 1) we have *valid* ->copy_file_range ops and/or
>> 2) we have *valid* ->remap_file_range
>>
>> So... if we remove the check in generic_copy_file_checks() as you
>> suggest
>> and:
>> - we don't have ->copy_file_range,
>> - we have ->remap_file_range but
>> - the i_sb are different
>>
>> we'll return the -EOPNOTSUPP (the one set in line "ret =
>> -EOPNOTSUPP;" in
>> function vfs_copy_file_range() ) instead of -EXDEV.
>>
>> Yes, this is the difference.  The NFS code handles both -EOPNOTSUPP and
>> -EXDEV by doing generic_copy_file_range.  Do any other consumers of
>> vfs_copy_file_range rely on -EXDEV and not -EOPNOTSUPP, and which is
>> the correct error code for this case? It seems to me that -EOPNOTSUPP
>> is more appropriate than -EXDEV when (sb1 != sb2).
>>
>> EXDEV is the right code for:
>> filesystem supports the operation but not for sb1 != sb2.
>>
>> So with the current patch, for a clone operation across 2 filesystems:
>>
>>      . if src and dst filesystems support both copy_file_range and
>>        remap_file_range then the code returns -EOPNOTSUPP.
>>
>> Why do you say that?
>> Which code are you referring to exactly?
>>
>>
>> If the filesystems support both copy_file_range and remap_file_range,
>> it passes the check in generic_copy_file_checks but it fails the
>> check in vfs_copy_file_range and returns -EOPNOTSUPP (added by
>> the v8 patch)
>>
>> Ok, I misread the code here. If it passes the check in generic_copy_file_checks
>> and it fails the sb check in vfs_copy_file_range then it tries copy_file_range
>> so it's ok.
>>
>> I think having the check in both generic_copy_file_checks and vfs_copy_file_range
>> makes the code hard to read. What's the reason not to do the check only in
>> vfs_copy_file_range?
>>
> You are going in circles.
> I already answered that.
> Please re-read the entire thread on all patch versions before commenting.

I'm fine with the patch as it is, as long as it does not break NFS.

I just think it's easier to read if the checks are done in
vfs_copy_file_range such as:

@@ -1495,6 +1473,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
  
         file_start_write(file_out);
  
+       if (file_out->f_op->copy_file_range == NULL &&
+           file_in->f_op->remap_file_range == NULL)
+               return -EOPNOTSUPP;     /* not sure this error is needed */
+
+       ret = -EXDEV;
         /*
          * Try cloning first, this is supported by more file systems, and
          * more efficient if both clone and copy are supported (e.g. NFS).
@@ -1513,9 +1496,10 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
                 }
         }
  
-       ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
-                               flags);
-       WARN_ON_ONCE(ret == -EOPNOTSUPP);
+       if (file_out->f_op->copy_file_range &&
+           file_out->f_op->copy_file_range == file_in->f_op->copy_file_range) {
+               ret = file_out->f_op->copy_file_range(file_in, pos_in,
+                               file_out, pos_out, len, flags);
  done:
         if (ret > 0) {
                 fsnotify_access(file_in);

Thanks,
-Dai

>
> Thanks,
> Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-22 10:24                             ` [PATCH v8] " Luis Henriques
  2021-02-22 10:46                               ` Amir Goldstein
  2021-02-22 16:25                               ` dai.ngo
@ 2021-02-24  1:00                               ` Olga Kornievskaia
  2021-02-24 10:23                                 ` Luis Henriques
  2021-02-24 14:23                               ` [PATCH] copy_file_range.2: Kernel v5.12 updates Luis Henriques
  3 siblings, 1 reply; 93+ messages in thread
From: Olga Kornievskaia @ 2021-02-24  1:00 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel, linux-nfs

On Mon, Feb 22, 2021 at 5:25 AM Luis Henriques <lhenriques@suse.de> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file.  Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems.  After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystem copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").  Filesystems are still allowed to fall-back to the VFS
> generic_copy_file_range() implementation, but that has now to be done
> explicitly.
>
> nfsd is also modified to fall-back into generic_copy_file_range() in case
> vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>

I tested v8 and I believe it works for NFS.

> ---
> Changes since v7
> - set 'ret' to '-EOPNOTSUPP' before the clone 'if' statement so that the
>   error returned is always related to the 'copy' operation
> Changes since v6
> - restored i_sb checks for the clone operation
> Changes since v5
> - check if ->copy_file_range is NULL before calling it
> Changes since v4
> - nfsd falls-back to generic_copy_file_range() only *if* it gets -EOPNOTSUPP
>   or -EXDEV.
> Changes since v3
> - dropped the COPY_FILE_SPLICE flag
> - kept the f_op's checks early in generic_copy_file_checks, implementing
>   Amir's suggestions
> - modified nfsd to use generic_copy_file_range()
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
>   adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
>  fs/nfsd/vfs.c   |  8 +++++++-
>  fs/read_write.c | 49 ++++++++++++++++++++++++-------------------------
>  2 files changed, 31 insertions(+), 26 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..23dab0fa9087 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -568,6 +568,7 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
>  ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>                              u64 dst_pos, u64 count)
>  {
> +       ssize_t ret;
>
>         /*
>          * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> @@ -578,7 +579,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
>          * limit like this and pipeline multiple COPY requests.
>          */
>         count = min_t(u64, count, 1 << 22);
> -       return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +       ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> +
> +       if (ret == -EOPNOTSUPP || ret == -EXDEV)
> +               ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
> +                                             count, 0);
> +       return ret;
>  }
>
>  __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..5a26297fd410 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>  }
>  EXPORT_SYMBOL(generic_copy_file_range);
>
> -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> -                                 struct file *file_out, loff_t pos_out,
> -                                 size_t len, unsigned int flags)
> -{
> -       /*
> -        * Although we now allow filesystems to handle cross sb copy, passing
> -        * a file of the wrong filesystem type to filesystem driver can result
> -        * in an attempt to dereference the wrong type of ->private_data, so
> -        * avoid doing that until we really have a good reason.  NFS defines
> -        * several different file_system_type structures, but they all end up
> -        * using the same ->copy_file_range() function pointer.
> -        */
> -       if (file_out->f_op->copy_file_range &&
> -           file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> -               return file_out->f_op->copy_file_range(file_in, pos_in,
> -                                                      file_out, pos_out,
> -                                                      len, flags);
> -
> -       return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -                                      flags);
> -}
> -
>  /*
>   * Performs necessary checks before doing a file copy
>   *
> @@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>         loff_t size_in;
>         int ret;
>
> +       /*
> +        * Although we now allow filesystems to handle cross sb copy, passing
> +        * a file of the wrong filesystem type to filesystem driver can result
> +        * in an attempt to dereference the wrong type of ->private_data, so
> +        * avoid doing that until we really have a good reason.  NFS defines
> +        * several different file_system_type structures, but they all end up
> +        * using the same ->copy_file_range() function pointer.
> +        */
> +       if (file_out->f_op->copy_file_range) {
> +               if (file_in->f_op->copy_file_range !=
> +                   file_out->f_op->copy_file_range)
> +                       return -EXDEV;
> +       } else if (file_in->f_op->remap_file_range) {
> +               if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> +                       return -EXDEV;
> +       } else {
> +                return -EOPNOTSUPP;
> +       }
> +
>         ret = generic_file_rw_checks(file_in, file_out);
>         if (ret)
>                 return ret;
> @@ -1495,6 +1492,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>
>         file_start_write(file_out);
>
> +       ret = -EOPNOTSUPP;
>         /*
>          * Try cloning first, this is supported by more file systems, and
>          * more efficient if both clone and copy are supported (e.g. NFS).
> @@ -1513,9 +1511,10 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>                 }
>         }
>
> -       ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> -                               flags);
> -       WARN_ON_ONCE(ret == -EOPNOTSUPP);
> +       if (file_out->f_op->copy_file_range)
> +               ret = file_out->f_op->copy_file_range(file_in, pos_in,
> +                                                     file_out, pos_out,
> +                                                     len, flags);
>  done:
>         if (ret > 0) {
>                 fsnotify_access(file_in);

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-24  1:00                               ` Olga Kornievskaia
@ 2021-02-24 10:23                                 ` Luis Henriques
  2021-02-24 10:44                                   ` Nicolas Boichat
  0 siblings, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-24 10:23 UTC (permalink / raw)
  To: Olga Kornievskaia
  Cc: Amir Goldstein, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel, linux-nfs

On Tue, Feb 23, 2021 at 08:00:54PM -0500, Olga Kornievskaia wrote:
> On Mon, Feb 22, 2021 at 5:25 AM Luis Henriques <lhenriques@suse.de> wrote:
> >
> > A regression has been reported by Nicolas Boichat, found while using the
> > copy_file_range syscall to copy a tracefs file.  Before commit
> > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > kernel would return -EXDEV to userspace when trying to copy a file across
> > different filesystems.  After this commit, the syscall doesn't fail anymore
> > and instead returns zero (zero bytes copied), as this file's content is
> > generated on-the-fly and thus reports a size of zero.
> >
> > This patch restores some cross-filesystem copy restrictions that existed
> > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > devices").  Filesystems are still allowed to fall-back to the VFS
> > generic_copy_file_range() implementation, but that has now to be done
> > explicitly.
> >
> > nfsd is also modified to fall-back into generic_copy_file_range() in case
> > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> >
> > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> 
> I tested v8 and I believe it works for NFS.

Thanks a lot for the testing.  And to everyone else for reviews,
feedback,... and patience.

I'll now go look into the manpage and see what needs to be changed.

Cheers,
--
Luís

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-24 10:23                                 ` Luis Henriques
@ 2021-02-24 10:44                                   ` Nicolas Boichat
  2021-04-09  5:23                                     ` Nicolas Boichat
  0 siblings, 1 reply; 93+ messages in thread
From: Nicolas Boichat @ 2021-02-24 10:44 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Olga Kornievskaia, Amir Goldstein, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel, linux-nfs

On Wed, Feb 24, 2021 at 6:22 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> On Tue, Feb 23, 2021 at 08:00:54PM -0500, Olga Kornievskaia wrote:
> > On Mon, Feb 22, 2021 at 5:25 AM Luis Henriques <lhenriques@suse.de> wrote:
> > >
> > > A regression has been reported by Nicolas Boichat, found while using the
> > > copy_file_range syscall to copy a tracefs file.  Before commit
> > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > > kernel would return -EXDEV to userspace when trying to copy a file across
> > > different filesystems.  After this commit, the syscall doesn't fail anymore
> > > and instead returns zero (zero bytes copied), as this file's content is
> > > generated on-the-fly and thus reports a size of zero.
> > >
> > > This patch restores some cross-filesystem copy restrictions that existed
> > > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > devices").  Filesystems are still allowed to fall-back to the VFS
> > > generic_copy_file_range() implementation, but that has now to be done
> > > explicitly.
> > >
> > > nfsd is also modified to fall-back into generic_copy_file_range() in case
> > > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> > >
> > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > > Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> > > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> > > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> >
> > I tested v8 and I believe it works for NFS.
>
> Thanks a lot for the testing.  And to everyone else for reviews,
> feedback,... and patience.

Thanks so much to you!!!

Works here, you can add my
Tested-by: Nicolas Boichat <drinkcat@chromium.org>

>
> I'll now go look into the manpage and see what needs to be changed.
>
> Cheers,
> --
> Luís

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-22 10:24                             ` [PATCH v8] " Luis Henriques
                                                 ` (2 preceding siblings ...)
  2021-02-24  1:00                               ` Olga Kornievskaia
@ 2021-02-24 14:23                               ` Luis Henriques
  2021-02-24 16:10                                 ` Amir Goldstein
                                                   ` (2 more replies)
  3 siblings, 3 replies; 93+ messages in thread
From: Luis Henriques @ 2021-02-24 14:23 UTC (permalink / raw)
  To: Alejandro Colomar, Michael Kerrisk, Amir Goldstein, Jeff Layton,
	Steve French, Miklos Szeredi, Trond Myklebust, Anna Schumaker,
	Alexander Viro, Darrick J. Wong, Dave Chinner, Greg KH,
	Nicolas Boichat, Ian Lance Taylor, Luis Lozano, Andreas Dilger,
	Olga Kornievskaia, Christoph Hellwig
  Cc: ceph-devel, linux-kernel, linux-cifs, samba-technical,
	linux-fsdevel, linux-nfs, linux-man, Luis Henriques

Update man-page with recent changes to this syscall.

Signed-off-by: Luis Henriques <lhenriques@suse.de>
---
Hi!

Here's a suggestion for fixing the manpage for copy_file_range().  Note that
I've assumed the fix will hit 5.12.

 man2/copy_file_range.2 | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
index 611a39b8026b..b0fd85e2631e 100644
--- a/man2/copy_file_range.2
+++ b/man2/copy_file_range.2
@@ -169,6 +169,9 @@ Out of memory.
 .B ENOSPC
 There is not enough space on the target filesystem to complete the copy.
 .TP
+.B EOPNOTSUPP
+The filesystem does not support this operation.
+.TP
 .B EOVERFLOW
 The requested source or destination range is too large to represent in the
 specified data types.
@@ -187,7 +190,7 @@ refers to an active swap file.
 .B EXDEV
 The files referred to by
 .IR fd_in " and " fd_out
-are not on the same mounted filesystem (pre Linux 5.3).
+are not on the same mounted filesystem (pre Linux 5.3 and post Linux 5.12).
 .SH VERSIONS
 The
 .BR copy_file_range ()
@@ -202,6 +205,11 @@ Applications should target the behaviour and requirements of 5.3 kernels.
 .PP
 First support for cross-filesystem copies was introduced in Linux 5.3.
 Older kernels will return -EXDEV when cross-filesystem copies are attempted.
+.PP
+After Linux 5.12, support for copies between different filesystems was dropped.
+However, individual filesystems may still provide
+.BR copy_file_range ()
+implementations that allow copies across different devices.
 .SH CONFORMING TO
 The
 .BR copy_file_range ()

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-24 14:23                               ` [PATCH] copy_file_range.2: Kernel v5.12 updates Luis Henriques
@ 2021-02-24 16:10                                 ` Amir Goldstein
  2021-02-25 10:21                                   ` Luis Henriques
  2021-02-26 22:18                                   ` Alejandro Colomar (man-pages)
  2021-03-01 14:41                                 ` [RFC v3] copy_file_range.2: Update cross-filesystem support for 5.12 Alejandro Colomar
  2021-03-04  9:38                                 ` [RFC v4] " Alejandro Colomar
  2 siblings, 2 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-02-24 16:10 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Alejandro Colomar, Michael Kerrisk, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

On Wed, Feb 24, 2021 at 4:22 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> Update man-page with recent changes to this syscall.
>
> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> ---
> Hi!
>
> Here's a suggestion for fixing the manpage for copy_file_range().  Note that
> I've assumed the fix will hit 5.12.
>
>  man2/copy_file_range.2 | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> index 611a39b8026b..b0fd85e2631e 100644
> --- a/man2/copy_file_range.2
> +++ b/man2/copy_file_range.2
> @@ -169,6 +169,9 @@ Out of memory.
>  .B ENOSPC
>  There is not enough space on the target filesystem to complete the copy.
>  .TP
> +.B EOPNOTSUPP
> +The filesystem does not support this operation.
> +.TP
>  .B EOVERFLOW
>  The requested source or destination range is too large to represent in the
>  specified data types.
> @@ -187,7 +190,7 @@ refers to an active swap file.
>  .B EXDEV
>  The files referred to by
>  .IR fd_in " and " fd_out
> -are not on the same mounted filesystem (pre Linux 5.3).
> +are not on the same mounted filesystem (pre Linux 5.3 and post Linux 5.12).

I think you need to drop the (Linux range) altogether.
What's missing here is the NFS cross server copy use case.
Maybe:

...are not on the same mounted filesystem and the source and target filesystems
do not support cross-filesystem copy.

You may refer the reader to VERSIONS section where it will say which
filesystems support cross-fs copy as of kernel version XXX (i.e. cifs and nfs).

>  .SH VERSIONS
>  The
>  .BR copy_file_range ()
> @@ -202,6 +205,11 @@ Applications should target the behaviour and requirements of 5.3 kernels.
>  .PP
>  First support for cross-filesystem copies was introduced in Linux 5.3.
>  Older kernels will return -EXDEV when cross-filesystem copies are attempted.
> +.PP
> +After Linux 5.12, support for copies between different filesystems was dropped.
> +However, individual filesystems may still provide
> +.BR copy_file_range ()
> +implementations that allow copies across different devices.

Again, this is not likely to stay uptodate for very long.
The stable kernels are expected to apply your patch (because it fixes
a regression)
so this should be phrased differently.
If it were me, I would provide all the details of the situation to
Michael and ask him
to write the best description for this section.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-24 16:10                                 ` Amir Goldstein
@ 2021-02-25 10:21                                   ` Luis Henriques
  2021-02-26 10:13                                     ` Alejandro Colomar (man-pages)
  2021-02-26 22:18                                   ` Alejandro Colomar (man-pages)
  1 sibling, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-02-25 10:21 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Alejandro Colomar, Michael Kerrisk, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

On Wed, Feb 24, 2021 at 06:10:45PM +0200, Amir Goldstein wrote:
> On Wed, Feb 24, 2021 at 4:22 PM Luis Henriques <lhenriques@suse.de> wrote:
> >
> > Update man-page with recent changes to this syscall.
> >
> > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > ---
> > Hi!
> >
> > Here's a suggestion for fixing the manpage for copy_file_range().  Note that
> > I've assumed the fix will hit 5.12.
> >
> >  man2/copy_file_range.2 | 10 +++++++++-
> >  1 file changed, 9 insertions(+), 1 deletion(-)
> >
> > diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> > index 611a39b8026b..b0fd85e2631e 100644
> > --- a/man2/copy_file_range.2
> > +++ b/man2/copy_file_range.2
> > @@ -169,6 +169,9 @@ Out of memory.
> >  .B ENOSPC
> >  There is not enough space on the target filesystem to complete the copy.
> >  .TP
> > +.B EOPNOTSUPP
> > +The filesystem does not support this operation.
> > +.TP
> >  .B EOVERFLOW
> >  The requested source or destination range is too large to represent in the
> >  specified data types.
> > @@ -187,7 +190,7 @@ refers to an active swap file.
> >  .B EXDEV
> >  The files referred to by
> >  .IR fd_in " and " fd_out
> > -are not on the same mounted filesystem (pre Linux 5.3).
> > +are not on the same mounted filesystem (pre Linux 5.3 and post Linux 5.12).
> 
> I think you need to drop the (Linux range) altogether.
> What's missing here is the NFS cross server copy use case.
> Maybe:
> 
> ...are not on the same mounted filesystem and the source and target filesystems
> do not support cross-filesystem copy.
> 
> You may refer the reader to VERSIONS section where it will say which
> filesystems support cross-fs copy as of kernel version XXX (i.e. cifs and nfs).
> 
> >  .SH VERSIONS
> >  The
> >  .BR copy_file_range ()
> > @@ -202,6 +205,11 @@ Applications should target the behaviour and requirements of 5.3 kernels.
> >  .PP
> >  First support for cross-filesystem copies was introduced in Linux 5.3.
> >  Older kernels will return -EXDEV when cross-filesystem copies are attempted.
> > +.PP
> > +After Linux 5.12, support for copies between different filesystems was dropped.
> > +However, individual filesystems may still provide
> > +.BR copy_file_range ()
> > +implementations that allow copies across different devices.
> 
> Again, this is not likely to stay uptodate for very long.
> The stable kernels are expected to apply your patch (because it fixes
> a regression)
> so this should be phrased differently.
> If it were me, I would provide all the details of the situation to
> Michael and ask him
> to write the best description for this section.

Thanks Amir.

Yeah, it's tricky.  Support was added and then dropped.  Since stable
kernels will be picking this patch, maybe the best thing to do is to not
mention the generic cross-filesystem support at all...?  Or simply say
that 5.3 temporarily supported it but that support was later dropped.

Michael (or Alejandro), would you be OK handling this yourself as Amir
suggested?

Cheers,
--
Luís

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-25 10:21                                   ` Luis Henriques
@ 2021-02-26 10:13                                     ` Alejandro Colomar (man-pages)
  2021-02-26 10:34                                       ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Alejandro Colomar (man-pages) @ 2021-02-26 10:13 UTC (permalink / raw)
  To: Luis Henriques, Amir Goldstein
  Cc: Michael Kerrisk, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

Hello Luis,

On 2/25/21 11:21 AM, Luis Henriques wrote:
> On Wed, Feb 24, 2021 at 06:10:45PM +0200, Amir Goldstein wrote:
>> If it were me, I would provide all the details of the situation to
>> Michael and ask him
>> to write the best description for this section.
> 
> Thanks Amir.
> 
> Yeah, it's tricky.  Support was added and then dropped.  Since stable
> kernels will be picking this patch, maybe the best thing to do is to not
> mention the generic cross-filesystem support at all...?  Or simply say
> that 5.3 temporarily supported it but that support was later dropped.
> 
> Michael (or Alejandro), would you be OK handling this yourself as Amir
> suggested?

Could you please provide a more detailed history of what is to be 
documented?

Thanks,

Alex

-- 
Alejandro Colomar
Linux man-pages comaintainer; https://www.kernel.org/doc/man-pages/
http://www.alejandro-colomar.es/

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-26 10:13                                     ` Alejandro Colomar (man-pages)
@ 2021-02-26 10:34                                       ` Amir Goldstein
  2021-02-26 11:15                                         ` Alejandro Colomar (man-pages)
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-26 10:34 UTC (permalink / raw)
  To: Alejandro Colomar (man-pages)
  Cc: Luis Henriques, Michael Kerrisk, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

On Fri, Feb 26, 2021 at 12:13 PM Alejandro Colomar (man-pages)
<alx.manpages@gmail.com> wrote:
>
> Hello Luis,
>
> On 2/25/21 11:21 AM, Luis Henriques wrote:
> > On Wed, Feb 24, 2021 at 06:10:45PM +0200, Amir Goldstein wrote:
> >> If it were me, I would provide all the details of the situation to
> >> Michael and ask him
> >> to write the best description for this section.
> >
> > Thanks Amir.
> >
> > Yeah, it's tricky.  Support was added and then dropped.  Since stable
> > kernels will be picking this patch, maybe the best thing to do is to not
> > mention the generic cross-filesystem support at all...?  Or simply say
> > that 5.3 temporarily supported it but that support was later dropped.
> >
> > Michael (or Alejandro), would you be OK handling this yourself as Amir
> > suggested?
>
> Could you please provide a more detailed history of what is to be
> documented?
>

Is this detailed enough? ;-)

https://lwn.net/Articles/846403/

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-26 10:34                                       ` Amir Goldstein
@ 2021-02-26 11:15                                         ` Alejandro Colomar (man-pages)
  2021-02-26 13:59                                           ` Jeff Layton
  0 siblings, 1 reply; 93+ messages in thread
From: Alejandro Colomar (man-pages) @ 2021-02-26 11:15 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Luis Henriques, Michael Kerrisk, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

Hello Amir,

On 2/26/21 11:34 AM, Amir Goldstein wrote:
> Is this detailed enough? ;-)
> 
> https://lwn.net/Articles/846403/

I'm sorry I can't read it yet:

[
Subscription required
The page you have tried to view (How useful should copy_file_range() 
be?) is currently available to LWN subscribers only. Reader 
subscriptions are a necessary way to fund the continued existence of LWN 
and the quality of its content.
[...]
(Alternatively, this item will become freely available on March 4, 2021)
]

However, the 4th of March is close enough, I guess.

Thanks,

Alex

-- 
Alejandro Colomar
Linux man-pages comaintainer; https://www.kernel.org/doc/man-pages/
http://www.alejandro-colomar.es/

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-26 11:15                                         ` Alejandro Colomar (man-pages)
@ 2021-02-26 13:59                                           ` Jeff Layton
  2021-02-26 21:26                                             ` Alejandro Colomar (man-pages)
  0 siblings, 1 reply; 93+ messages in thread
From: Jeff Layton @ 2021-02-26 13:59 UTC (permalink / raw)
  To: Alejandro Colomar (man-pages), Amir Goldstein
  Cc: Luis Henriques, Michael Kerrisk, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

On Fri, 2021-02-26 at 12:15 +0100, Alejandro Colomar (man-pages) wrote:
> Hello Amir,
> 
> On 2/26/21 11:34 AM, Amir Goldstein wrote:
> > Is this detailed enough? ;-)
> > 
> > https://lwn.net/Articles/846403/
> 
> I'm sorry I can't read it yet:
> 
> [
> Subscription required
> The page you have tried to view (How useful should copy_file_range() 
> be?) is currently available to LWN subscribers only. Reader 
> subscriptions are a necessary way to fund the continued existence of LWN 
> and the quality of its content.
> [...]
> (Alternatively, this item will become freely available on March 4, 2021)
> ]
> 


Here's a link that should work. I'm probably breaking the rules a bit as
a subscriber, but hopefully Jon won't mind too much. FWIW, I've found it
to be worthwhile to subscribe to LWN if you're doing a lot of kernel
development:

    https://lwn.net/SubscriberLink/846403/0fd639403e629cab/

Cheers,
-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-26 13:59                                           ` Jeff Layton
@ 2021-02-26 21:26                                             ` Alejandro Colomar (man-pages)
  0 siblings, 0 replies; 93+ messages in thread
From: Alejandro Colomar (man-pages) @ 2021-02-26 21:26 UTC (permalink / raw)
  To: Jeff Layton, Amir Goldstein
  Cc: Luis Henriques, Michael Kerrisk, Steve French, Miklos Szeredi,
	Trond Myklebust, Anna Schumaker, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

Hello Jeff,

On 2/26/21 2:59 PM, Jeff Layton wrote:
> Here's a link that should work. I'm probably breaking the rules a bit as
> a subscriber, but hopefully Jon won't mind too much. FWIW, I've found it
> to be worthwhile to subscribe to LWN if you're doing a lot of kernel
> development:
> 
>      https://lwn.net/SubscriberLink/846403/0fd639403e629cab/

Thanks!  (I already received the link privately some minutes before from 
various people.)

It seems that he considers it fair use :)

[[
Where is it appropriate to post a subscriber link?

Almost anywhere. Private mail, messages to project mailing lists, and 
blog entries are all appropriate. As long as people do not use 
subscriber links as a way to defeat our attempts to gain subscribers, we 
are happy to see them shared.
]]
<https://lwn.net/op/FAQ.lwn#site>

Cheers,

Alex

-- 
Alejandro Colomar
Linux man-pages comaintainer; https://www.kernel.org/doc/man-pages/
http://www.alejandro-colomar.es/

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-24 16:10                                 ` Amir Goldstein
  2021-02-25 10:21                                   ` Luis Henriques
@ 2021-02-26 22:18                                   ` Alejandro Colomar (man-pages)
  2021-02-27  5:41                                     ` Amir Goldstein
  1 sibling, 1 reply; 93+ messages in thread
From: Alejandro Colomar (man-pages) @ 2021-02-26 22:18 UTC (permalink / raw)
  To: Amir Goldstein, Luis Henriques
  Cc: Michael Kerrisk, Anna Schumaker, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Greg KH, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

Hello Amir, Luis,

On 2/24/21 5:10 PM, Amir Goldstein wrote:
> On Wed, Feb 24, 2021 at 4:22 PM Luis Henriques <lhenriques@suse.de> wrote:
>>
>> Update man-page with recent changes to this syscall.
>>
>> Signed-off-by: Luis Henriques <lhenriques@suse.de>
>> ---
>> Hi!
>>
>> Here's a suggestion for fixing the manpage for copy_file_range().  Note that
>> I've assumed the fix will hit 5.12.
>>
>>   man2/copy_file_range.2 | 10 +++++++++-
>>   1 file changed, 9 insertions(+), 1 deletion(-)
>>
>> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
>> index 611a39b8026b..b0fd85e2631e 100644
>> --- a/man2/copy_file_range.2
>> +++ b/man2/copy_file_range.2
>> @@ -169,6 +169,9 @@ Out of memory.
>>   .B ENOSPC
>>   There is not enough space on the target filesystem to complete the copy.
>>   .TP
>> +.B EOPNOTSUPP

I'll add the kernel version here:

.BR EOPNOTSUPP " (since Linux 5.12)"

>> +The filesystem does not support this operation
>> +.TP
>>   .B EOVERFLOW
>>   The requested source or destination range is too large to represent in the
>>   specified data types.
>> @@ -187,7 +190,7 @@ refers to an active swap file.
>>   .B EXDEV
>>   The files referred to by
>>   .IR fd_in " and " fd_out
>> -are not on the same mounted filesystem (pre Linux 5.3).
>> +are not on the same mounted filesystem (pre Linux 5.3 and post Linux 5.12).

I'm not sure that 'mounted' adds any value here.  Would you remove the 
word here?

It reads as if two separate devices with the same filesystem type would 
still give this error.

Per the LWN.net article Amir shared, this is permitted ("When called 
from user space, copy_file_range() will only try to copy a file across 
filesystems if the two are of the same type").

This behavior was slightly different before 5.3 AFAICR (was it?) ("until 
then, copy_file_range() refused to copy between files that were not 
located on the same filesystem.").  If that's the case, I'd specify the 
difference, or more probably split the error into two, one before 5.3, 
and one since 5.12.

> 
> I think you need to drop the (Linux range) altogether.

I'll keep the range.  Users of 5.3..5.11 might be surprised if the 
filesystems are different and they don't get an error, I think.

I reworded it to follow other pages conventions:

.BR EXDEV " (before Linux 5.3; or since Linux 5.12)"

which renders as:

        EXDEV (before Linux 5.3; or since Linux 5.12)
               The files referred to by fd_in and fd_out are not on
               the same mounted filesystem.


> What's missing here is the NFS cross server copy use case.
> Maybe:
> 
> ...are not on the same mounted filesystem and the source and target filesystems
> do not support cross-filesystem copy.

Yes.

Again, this wasn't true before 5.3, right?

> 
> You may refer the reader to VERSIONS section where it will say which
> filesystems support cross-fs copy as of kernel version XXX (i.e. cifs and nfs).
> 
>>   .SH VERSIONS
>>   The
>>   .BR copy_file_range ()
>> @@ -202,6 +205,11 @@ Applications should target the behaviour and requirements of 5.3 kernels.
>>   .PP
>>   First support for cross-filesystem copies was introduced in Linux 5.3.
>>   Older kernels will return -EXDEV when cross-filesystem copies are attempted.
>> +.PP
>> +After Linux 5.12, support for copies between different filesystems was dropped.
>> +However, individual filesystems may still provide
>> +.BR copy_file_range ()
>> +implementations that allow copies across different devices.
> 
> Again, this is not likely to stay uptodate for very long.
> The stable kernels are expected to apply your patch (because it fixes
> a regression)
> so this should be phrased differently.
> If it were me, I would provide all the details of the situation to
> Michael and ask him
> to write the best description for this section.

I'll look into more detail at this part in a later review.


On 2/26/21 11:34 AM, Amir Goldstein wrote:
 > Is this detailed enough? ;-)
 >
 > https://lwn.net/Articles/846403/

Yes, it is!



Thanks,

Alex

-- 
Alejandro Colomar
Linux man-pages comaintainer; https://www.kernel.org/doc/man-pages/
http://www.alejandro-colomar.es/

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-26 22:18                                   ` Alejandro Colomar (man-pages)
@ 2021-02-27  5:41                                     ` Amir Goldstein
  2021-02-27 12:20                                       ` Alejandro Colomar (man-pages)
  2021-02-27 23:08                                       ` [PATCH] copy_file_range.2: Kernel v5.12 updates Steve French
  0 siblings, 2 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-02-27  5:41 UTC (permalink / raw)
  To: Alejandro Colomar (man-pages)
  Cc: Luis Henriques, Michael Kerrisk, Anna Schumaker, Jeff Layton,
	Steve French, Miklos Szeredi, Trond Myklebust, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

On Sat, Feb 27, 2021 at 12:19 AM Alejandro Colomar (man-pages)
<alx.manpages@gmail.com> wrote:
>
> Hello Amir, Luis,
>
> On 2/24/21 5:10 PM, Amir Goldstein wrote:
> > On Wed, Feb 24, 2021 at 4:22 PM Luis Henriques <lhenriques@suse.de> wrote:
> >>
> >> Update man-page with recent changes to this syscall.
> >>
> >> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> >> ---
> >> Hi!
> >>
> >> Here's a suggestion for fixing the manpage for copy_file_range().  Note that
> >> I've assumed the fix will hit 5.12.
> >>
> >>   man2/copy_file_range.2 | 10 +++++++++-
> >>   1 file changed, 9 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> >> index 611a39b8026b..b0fd85e2631e 100644
> >> --- a/man2/copy_file_range.2
> >> +++ b/man2/copy_file_range.2
> >> @@ -169,6 +169,9 @@ Out of memory.
> >>   .B ENOSPC
> >>   There is not enough space on the target filesystem to complete the copy.
> >>   .TP
> >> +.B EOPNOTSUPP
>
> I'll add the kernel version here:
>
> .BR EOPNOTSUPP " (since Linux 5.12)"

Error could be returned prior to 5.3 and would be probably returned
by future stable kernels 5.3..5.12 too

>
> >> +The filesystem does not support this operation
> >> +.TP
> >>   .B EOVERFLOW
> >>   The requested source or destination range is too large to represent in the
> >>   specified data types.
> >> @@ -187,7 +190,7 @@ refers to an active swap file.
> >>   .B EXDEV
> >>   The files referred to by
> >>   .IR fd_in " and " fd_out
> >> -are not on the same mounted filesystem (pre Linux 5.3).
> >> +are not on the same mounted filesystem (pre Linux 5.3 and post Linux 5.12).
>
> I'm not sure that 'mounted' adds any value here.  Would you remove the
> word here?

See rename(2). 'mounted' in this context is explained there.
HOWEVER, it does not fit here.
copy_file_range() IS allowed between two mounts of the same filesystem instance.

To make things more complicated, it appears that cross mount clone is not
allowed via FICLONE/FICLONERANGE ioctl, so ioctl_ficlonerange(2) man page
also uses the 'mounted filesystem' terminology for EXDEV

As things stand now, because of the fallback to clone logic,
copy_file_range() provides a way for users to clone across different mounts
of the same filesystem instance, which they cannot do with the FICLONE ioctl.

Fun :)

BTW, I don't know if preventing cross mount clone was done intentionally,
but as I wrote in a comment in the code once:

        /*
         * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
         * the same mount. Practically, they only need to be on the same file
         * system.
         */

>
> It reads as if two separate devices with the same filesystem type would
> still give this error.
>
> Per the LWN.net article Amir shared, this is permitted ("When called
> from user space, copy_file_range() will only try to copy a file across
> filesystems if the two are of the same type").
>
> This behavior was slightly different before 5.3 AFAICR (was it?) ("until
> then, copy_file_range() refused to copy between files that were not
> located on the same filesystem.").  If that's the case, I'd specify the
> difference, or more probably split the error into two, one before 5.3,
> and one since 5.12.
>

True.

> >
> > I think you need to drop the (Linux range) altogether.
>
> I'll keep the range.  Users of 5.3..5.11 might be surprised if the
> filesystems are different and they don't get an error, I think.
>
> I reworded it to follow other pages conventions:
>
> .BR EXDEV " (before Linux 5.3; or since Linux 5.12)"
>
> which renders as:
>
>         EXDEV (before Linux 5.3; or since Linux 5.12)
>                The files referred to by fd_in and fd_out are not on
>                the same mounted filesystem.
>

drop 'mounted'

>
> > What's missing here is the NFS cross server copy use case.
> > Maybe:
> >
> > ...are not on the same mounted filesystem and the source and target filesystems
> > do not support cross-filesystem copy.
>
> Yes.
>
> Again, this wasn't true before 5.3, right?
>

Right.
Actually, v5.3 provides the vfs capabilities for filesystems to support
cross fs copy. I am not sure if NFS already implements cross fs copy in
v5.3 and not sure about cifs. Need to get input from nfs/cifs developers
or dig in the release notes for server-side copy.

> >
> > You may refer the reader to VERSIONS section where it will say which
> > filesystems support cross-fs copy as of kernel version XXX (i.e. cifs and nfs).
> >
> >>   .SH VERSIONS
> >>   The
> >>   .BR copy_file_range ()
> >> @@ -202,6 +205,11 @@ Applications should target the behaviour and requirements of 5.3 kernels.
> >>   .PP
> >>   First support for cross-filesystem copies was introduced in Linux 5.3.
> >>   Older kernels will return -EXDEV when cross-filesystem copies are attempted.
> >> +.PP
> >> +After Linux 5.12, support for copies between different filesystems was dropped.
> >> +However, individual filesystems may still provide
> >> +.BR copy_file_range ()
> >> +implementations that allow copies across different devices.
> >
> > Again, this is not likely to stay uptodate for very long.
> > The stable kernels are expected to apply your patch (because it fixes
> > a regression)
> > so this should be phrased differently.
> > If it were me, I would provide all the details of the situation to
> > Michael and ask him
> > to write the best description for this section.
>
> I'll look into more detail at this part in a later review.
>
>
> On 2/26/21 11:34 AM, Amir Goldstein wrote:
>  > Is this detailed enough? ;-)
>  >
>  > https://lwn.net/Articles/846403/
>
> Yes, it is!
>

Thanks to LWN :)

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-27  5:41                                     ` Amir Goldstein
@ 2021-02-27 12:20                                       ` Alejandro Colomar (man-pages)
  2021-02-27 13:49                                         ` [RFC v2] copy_file_range.2: Update cross-filesystem support for 5.12 Alejandro Colomar
  2021-02-27 23:08                                       ` [PATCH] copy_file_range.2: Kernel v5.12 updates Steve French
  1 sibling, 1 reply; 93+ messages in thread
From: Alejandro Colomar (man-pages) @ 2021-02-27 12:20 UTC (permalink / raw)
  To: Amir Goldstein, Greg KH
  Cc: Luis Henriques, Michael Kerrisk, Anna Schumaker, Jeff Layton,
	Steve French, Miklos Szeredi, Trond Myklebust, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

Hi Amir,

On 2/27/21 6:41 AM, Amir Goldstein wrote:
> On Sat, Feb 27, 2021 at 12:19 AM Alejandro Colomar (man-pages)
>> On 2/24/21 5:10 PM, Amir Goldstein wrote:
>>> On Wed, Feb 24, 2021 at 4:22 PM Luis Henriques <lhenriques@suse.de> wrote:
>>>>    .TP
>>>> +.B EOPNOTSUPP
>>
>> I'll add the kernel version here:
>>
>> .BR EOPNOTSUPP " (since Linux 5.12)"
> 
> Error could be returned prior to 5.3 and would be probably returned
> by future stable kernels 5.3..5.12 too

OK, I think I'll state <5.3 and >=5.12 for the moment, and if Greg adds 
that to stable 5.3..5.11 kernels, please update me.

>>>>    .B EXDEV
>>>>    The files referred to by
>>>>    .IR fd_in " and " fd_out
>>>> -are not on the same mounted filesystem (pre Linux 5.3).
>>>> +are not on the same mounted filesystem (pre Linux 5.3 and post Linux 5.12).
>>
>> I'm not sure that 'mounted' adds any value here.  Would you remove the
>> word here?
> 
> See rename(2). 'mounted' in this context is explained there.
> HOWEVER, it does not fit here.
> copy_file_range() IS allowed between two mounts of the same filesystem instance.

Also allowed for <5.3 ?

> 
> To make things more complicated, it appears that cross mount clone is not
> allowed via FICLONE/FICLONERANGE ioctl, so ioctl_ficlonerange(2) man page
> also uses the 'mounted filesystem' terminology for EXDEV
> 
> As things stand now, because of the fallback to clone logic,
> copy_file_range() provides a way for users to clone across different mounts
> of the same filesystem instance, which they cannot do with the FICLONE ioctl.
> 
> Fun :)
> 
> BTW, I don't know if preventing cross mount clone was done intentionally,
> but as I wrote in a comment in the code once:
> 
>          /*
>           * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
>           * the same mount. Practically, they only need to be on the same file
>           * system.
>           */

:)

> 
>>
>> It reads as if two separate devices with the same filesystem type would
>> still give this error.
>>
>> Per the LWN.net article Amir shared, this is permitted ("When called
>> from user space, copy_file_range() will only try to copy a file across
>> filesystems if the two are of the same type").
>>
>> This behavior was slightly different before 5.3 AFAICR (was it?) ("until
>> then, copy_file_range() refused to copy between files that were not
>> located on the same filesystem.").  If that's the case, I'd specify the
>> difference, or more probably split the error into two, one before 5.3,
>> and one since 5.12.
>>
> 
> True.
> 
>>>
>>> I think you need to drop the (Linux range) altogether.
>>
>> I'll keep the range.  Users of 5.3..5.11 might be surprised if the
>> filesystems are different and they don't get an error, I think.
>>
>> I reworded it to follow other pages conventions:
>>
>> .BR EXDEV " (before Linux 5.3; or since Linux 5.12)"
>>
>> which renders as:
>>
>>          EXDEV (before Linux 5.3; or since Linux 5.12)
>>                 The files referred to by fd_in and fd_out are not on
>>                 the same mounted filesystem.
>>
> 
> drop 'mounted'

Yes

> 
>>
>>> What's missing here is the NFS cross server copy use case.
>>> Maybe:
>>>
>>> ...are not on the same mounted filesystem and the source and target filesystems
>>> do not support cross-filesystem copy.
>>
>> Yes.
>>
>> Again, this wasn't true before 5.3, right?
>>
> 
> Right.
> Actually, v5.3 provides the vfs capabilities for filesystems to support
> cross fs copy. I am not sure if NFS already implements cross fs copy in
> v5.3 and not sure about cifs. Need to get input from nfs/cifs developers
> or dig in the release notes for server-side copy.

Okay
> Thanks to LWN :)

:)

Thanks,

Alex


-- 
Alejandro Colomar
Linux man-pages comaintainer; https://www.kernel.org/doc/man-pages/
http://www.alejandro-colomar.es/

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [RFC v2] copy_file_range.2: Update cross-filesystem support for 5.12
  2021-02-27 12:20                                       ` Alejandro Colomar (man-pages)
@ 2021-02-27 13:49                                         ` Alejandro Colomar
  2021-02-27 16:00                                           ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Alejandro Colomar @ 2021-02-27 13:49 UTC (permalink / raw)
  To: Amir Goldstein, Michael Kerrisk, linux-man, Luis Henriques
  Cc: Alejandro Colomar, Greg KH, Anna Schumaker, Jeff Layton,
	Steve French, Miklos Szeredi, Trond Myklebust, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Nicolas Boichat, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	Walter Harms

Linux 5.12 fixes a regression.

Cross-filesystem copies (introduced in 5.3) were buggy.

Move the statements documenting cross-fs to BUGS.
Kernels 5.3..5.11 should be patched soon.

State version information for some errors related to this.

Reported-by: Luis Henriques <lhenriques@suse.de>
Reported-by: Amir Goldstein <amir73il@gmail.com>
Related: <https://lwn.net/Articles/846403/>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Anna Schumaker <anna.schumaker@netapp.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Steve French <sfrench@samba.org>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Nicolas Boichat <drinkcat@chromium.org>
Cc: Ian Lance Taylor <iant@google.com>
Cc: Luis Lozano <llozano@chromium.org>
Cc: Andreas Dilger <adilger@dilger.ca>
Cc: Olga Kornievskaia <aglo@umich.edu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: ceph-devel <ceph-devel@vger.kernel.org>
Cc: linux-kernel <linux-kernel@vger.kernel.org>
Cc: CIFS <linux-cifs@vger.kernel.org>
Cc: samba-technical <samba-technical@lists.samba.org>
Cc: linux-fsdevel <linux-fsdevel@vger.kernel.org>
Cc: Linux NFS Mailing List <linux-nfs@vger.kernel.org>
Cc: Walter Harms <wharms@bfs.de>
Signed-off-by: Alejandro Colomar <alx.manpages@gmail.com>
---

Hi all,

Please check that this is correct.
I wrote it as I understood copy_file_range() from the LWN article,
and the conversation on this thread,
but maybe someone with more experience on this syscall can find bugs in my patch.

When kernels 5.3..5.11 fix this, some info could be compacted a bit more,
and maybe the BUGS section could be removed.

Also, I'd like to know which filesystems support cross-fs, and since when.

Amir, you said that it was only cifs and nfs (since when? 5.3? 5.12?).

Also, I'm a bit surprised that <5.3 could fail with EOPNOTSUPP
and it wasn't documented.  Is that for sure, Amir?

Thanks,

Alex

---
 man2/copy_file_range.2 | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
index 611a39b80..93f54889d 100644
--- a/man2/copy_file_range.2
+++ b/man2/copy_file_range.2
@@ -169,6 +169,9 @@ Out of memory.
 .B ENOSPC
 There is not enough space on the target filesystem to complete the copy.
 .TP
+.BR EOPNOTSUPP " (before Linux 5.3; or since Linux 5.12)"
+The filesystem does not support this operation.
+.TP
 .B EOVERFLOW
 The requested source or destination range is too large to represent in the
 specified data types.
@@ -184,10 +187,17 @@ or
 .I fd_out
 refers to an active swap file.
 .TP
-.B EXDEV
+.BR EXDEV " (before Linux 5.3)"
 The files referred to by
 .IR fd_in " and " fd_out
-are not on the same mounted filesystem (pre Linux 5.3).
+are not on the same filesystem.
+.TP
+.BR EXDEV " (since Linux 5.12)"
+The files referred to by
+.IR fd_in " and " fd_out
+are not on the same filesystem,
+and the source and target filesystems are not of the same type,
+or do not support cross-filesystem copy.
 .SH VERSIONS
 The
 .BR copy_file_range ()
@@ -195,13 +205,10 @@ system call first appeared in Linux 4.5, but glibc 2.27 provides a user-space
 emulation when it is not available.
 .\" https://sourceware.org/git/?p=glibc.git;a=commit;f=posix/unistd.h;h=bad7a0c81f501fbbcc79af9eaa4b8254441c4a1f
 .PP
-A major rework of the kernel implementation occurred in 5.3.
-Areas of the API that weren't clearly defined were clarified and the API bounds
-are much more strictly checked than on earlier kernels.
-Applications should target the behaviour and requirements of 5.3 kernels.
-.PP
-First support for cross-filesystem copies was introduced in Linux 5.3.
-Older kernels will return -EXDEV when cross-filesystem copies are attempted.
+Since 5.12,
+cross-filesystem copies can be achieved
+when both filesystems are of the same type,
+and that filesystem implements support for it.
 .SH CONFORMING TO
 The
 .BR copy_file_range ()
@@ -226,6 +233,10 @@ gives filesystems an opportunity to implement "copy acceleration" techniques,
 such as the use of reflinks (i.e., two or more inodes that share
 pointers to the same copy-on-write disk blocks)
 or server-side-copy (in the case of NFS).
+.SH BUGS
+In Linux kernels 5.3 to 5.11, cross-filesystem copies were supported.
+However, on some virtual filesystems, the call failed to copy,
+even though it may have reported success.
 .SH EXAMPLES
 .EX
 #define _GNU_SOURCE
-- 
2.30.1.721.g45526154a5


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC v2] copy_file_range.2: Update cross-filesystem support for 5.12
  2021-02-27 13:49                                         ` [RFC v2] copy_file_range.2: Update cross-filesystem support for 5.12 Alejandro Colomar
@ 2021-02-27 16:00                                           ` Amir Goldstein
  0 siblings, 0 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-02-27 16:00 UTC (permalink / raw)
  To: Alejandro Colomar
  Cc: Michael Kerrisk, linux-man, Luis Henriques, Greg KH,
	Anna Schumaker, Jeff Layton, Steve French, Miklos Szeredi,
	Trond Myklebust, Alexander Viro, Darrick J. Wong, Dave Chinner,
	Nicolas Boichat, Ian Lance Taylor, Luis Lozano, Andreas Dilger,
	Olga Kornievskaia, Christoph Hellwig, ceph-devel, linux-kernel,
	CIFS, samba-technical, linux-fsdevel, Linux NFS Mailing List,
	Walter Harms

On Sat, Feb 27, 2021 at 3:59 PM Alejandro Colomar
<alx.manpages@gmail.com> wrote:
>
> Linux 5.12 fixes a regression.
>
> Cross-filesystem copies (introduced in 5.3) were buggy.
>
> Move the statements documenting cross-fs to BUGS.
> Kernels 5.3..5.11 should be patched soon.
>
> State version information for some errors related to this.
>
> Reported-by: Luis Henriques <lhenriques@suse.de>
> Reported-by: Amir Goldstein <amir73il@gmail.com>
> Related: <https://lwn.net/Articles/846403/>
> Cc: Greg KH <gregkh@linuxfoundation.org>
> Cc: Michael Kerrisk <mtk.manpages@gmail.com>
> Cc: Anna Schumaker <anna.schumaker@netapp.com>
> Cc: Jeff Layton <jlayton@kernel.org>
> Cc: Steve French <sfrench@samba.org>
> Cc: Miklos Szeredi <miklos@szeredi.hu>
> Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Cc: Dave Chinner <dchinner@redhat.com>
> Cc: Nicolas Boichat <drinkcat@chromium.org>
> Cc: Ian Lance Taylor <iant@google.com>
> Cc: Luis Lozano <llozano@chromium.org>
> Cc: Andreas Dilger <adilger@dilger.ca>
> Cc: Olga Kornievskaia <aglo@umich.edu>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: ceph-devel <ceph-devel@vger.kernel.org>
> Cc: linux-kernel <linux-kernel@vger.kernel.org>
> Cc: CIFS <linux-cifs@vger.kernel.org>
> Cc: samba-technical <samba-technical@lists.samba.org>
> Cc: linux-fsdevel <linux-fsdevel@vger.kernel.org>
> Cc: Linux NFS Mailing List <linux-nfs@vger.kernel.org>
> Cc: Walter Harms <wharms@bfs.de>
> Signed-off-by: Alejandro Colomar <alx.manpages@gmail.com>
> ---
>
> Hi all,
>
> Please check that this is correct.
> I wrote it as I understood copy_file_range() from the LWN article,
> and the conversation on this thread,
> but maybe someone with more experience on this syscall can find bugs in my patch.
>
> When kernels 5.3..5.11 fix this, some info could be compacted a bit more,
> and maybe the BUGS section could be removed.
>
> Also, I'd like to know which filesystems support cross-fs, and since when.
>
> Amir, you said that it was only cifs and nfs (since when? 5.3? 5.12?).
>
> Also, I'm a bit surprised that <5.3 could fail with EOPNOTSUPP
> and it wasn't documented.  Is that for sure, Amir?

No. You are right. EOPNOTSUPP is new.
Kernel always fell back to sendfile(2) if the filesystem did not support
copy_file_range().

>
> Thanks,
>
> Alex
>
> ---
>  man2/copy_file_range.2 | 29 ++++++++++++++++++++---------
>  1 file changed, 20 insertions(+), 9 deletions(-)
>
> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> index 611a39b80..93f54889d 100644
> --- a/man2/copy_file_range.2
> +++ b/man2/copy_file_range.2
> @@ -169,6 +169,9 @@ Out of memory.
>  .B ENOSPC
>  There is not enough space on the target filesystem to complete the copy.
>  .TP
> +.BR EOPNOTSUPP " (before Linux 5.3; or since Linux 5.12)"
> +The filesystem does not support this operation.
> +.TP

so not before 5.3

>  .B EOVERFLOW
>  The requested source or destination range is too large to represent in the
>  specified data types.
> @@ -184,10 +187,17 @@ or
>  .I fd_out
>  refers to an active swap file.
>  .TP
> -.B EXDEV
> +.BR EXDEV " (before Linux 5.3)"
>  The files referred to by
>  .IR fd_in " and " fd_out
> -are not on the same mounted filesystem (pre Linux 5.3).
> +are not on the same filesystem.
> +.TP
> +.BR EXDEV " (since Linux 5.12)"
> +The files referred to by
> +.IR fd_in " and " fd_out
> +are not on the same filesystem,
> +and the source and target filesystems are not of the same type,
> +or do not support cross-filesystem copy.

ok.

>  .SH VERSIONS
>  The
>  .BR copy_file_range ()
> @@ -195,13 +205,10 @@ system call first appeared in Linux 4.5, but glibc 2.27 provides a user-space
>  emulation when it is not available.
>  .\" https://sourceware.org/git/?p=glibc.git;a=commit;f=posix/unistd.h;h=bad7a0c81f501fbbcc79af9eaa4b8254441c4a1f
>  .PP
> -A major rework of the kernel implementation occurred in 5.3.
> -Areas of the API that weren't clearly defined were clarified and the API bounds
> -are much more strictly checked than on earlier kernels.
> -Applications should target the behaviour and requirements of 5.3 kernels.
> -.PP

That information is useful. Why remove it?
FYI, the LTP tests written to validate the copy_file_range() API are not running
on kernel < 5.3 at all.

> -First support for cross-filesystem copies was introduced in Linux 5.3.
> -Older kernels will return -EXDEV when cross-filesystem copies are attempted.
> +Since 5.12,
> +cross-filesystem copies can be achieved
> +when both filesystems are of the same type,
> +and that filesystem implements support for it.
>  .SH CONFORMING TO
>  The
>  .BR copy_file_range ()
> @@ -226,6 +233,10 @@ gives filesystems an opportunity to implement "copy acceleration" techniques,
>  such as the use of reflinks (i.e., two or more inodes that share
>  pointers to the same copy-on-write disk blocks)
>  or server-side-copy (in the case of NFS).
> +.SH BUGS
> +In Linux kernels 5.3 to 5.11, cross-filesystem copies were supported.

I think it is a bit confusing to say "were supported", because how come
support went away from kernel 5.12? maybe something along the lines
that kernel implementation of copy was used if there was no filesystem
support for the operation...

> +However, on some virtual filesystems, the call failed to copy,
> +even though it may have reported success.
>  .SH EXAMPLES
>  .EX
>  #define _GNU_SOURCE
> --
> 2.30.1.721.g45526154a5
>

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-27  5:41                                     ` Amir Goldstein
  2021-02-27 12:20                                       ` Alejandro Colomar (man-pages)
@ 2021-02-27 23:08                                       ` Steve French
  2021-02-28  7:35                                         ` Amir Goldstein
  1 sibling, 1 reply; 93+ messages in thread
From: Steve French @ 2021-02-27 23:08 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Alejandro Colomar (man-pages),
	Luis Henriques, Michael Kerrisk, Anna Schumaker, Jeff Layton,
	Steve French, Miklos Szeredi, Trond Myklebust, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

On Fri, Feb 26, 2021 at 11:43 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Sat, Feb 27, 2021 at 12:19 AM Alejandro Colomar (man-pages)
> <alx.manpages@gmail.com> wrote:
> >
> > Hello Amir, Luis,
> >
> > On 2/24/21 5:10 PM, Amir Goldstein wrote:
> > > On Wed, Feb 24, 2021 at 4:22 PM Luis Henriques <lhenriques@suse.de> wrote:
> > >>
> > >> Update man-page with recent changes to this syscall.
> > >>
> > >> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > >> ---
> > >> Hi!
> > >>
> > >> Here's a suggestion for fixing the manpage for copy_file_range().  Note that
> > >> I've assumed the fix will hit 5.12.
> > >>
> > >>   man2/copy_file_range.2 | 10 +++++++++-
> > >>   1 file changed, 9 insertions(+), 1 deletion(-)
> > >>
> > >> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> > >> index 611a39b8026b..b0fd85e2631e 100644
> > >> --- a/man2/copy_file_range.2
> > >> +++ b/man2/copy_file_range.2
> > >> @@ -169,6 +169,9 @@ Out of memory.
> > >>   .B ENOSPC
> > >>   There is not enough space on the target filesystem to complete the copy.
> > >>   .TP
> > >> +.B EOPNOTSUPP
> >
> > I'll add the kernel version here:
> >
> > .BR EOPNOTSUPP " (since Linux 5.12)"
>
> Error could be returned prior to 5.3 and would be probably returned
> by future stable kernels 5.3..5.12 too
>
> >
> > >> +The filesystem does not support this operation
> > >> +.TP
> > >>   .B EOVERFLOW
> > >>   The requested source or destination range is too large to represent in the
> > >>   specified data types.
> > >> @@ -187,7 +190,7 @@ refers to an active swap file.
> > >>   .B EXDEV
> > >>   The files referred to by
> > >>   .IR fd_in " and " fd_out
> > >> -are not on the same mounted filesystem (pre Linux 5.3).
> > >> +are not on the same mounted filesystem (pre Linux 5.3 and post Linux 5.12).
> >
> > I'm not sure that 'mounted' adds any value here.  Would you remove the
> > word here?
>
> See rename(2). 'mounted' in this context is explained there.
> HOWEVER, it does not fit here.
> copy_file_range() IS allowed between two mounts of the same filesystem instance.
>
> To make things more complicated, it appears that cross mount clone is not
> allowed via FICLONE/FICLONERANGE ioctl, so ioctl_ficlonerange(2) man page
> also uses the 'mounted filesystem' terminology for EXDEV
>
> As things stand now, because of the fallback to clone logic,
> copy_file_range() provides a way for users to clone across different mounts
> of the same filesystem instance, which they cannot do with the FICLONE ioctl.
>
> Fun :)
>
> BTW, I don't know if preventing cross mount clone was done intentionally,
> but as I wrote in a comment in the code once:
>
>         /*
>          * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
>          * the same mount. Practically, they only need to be on the same file
>          * system.
>          */
>
> >
> > It reads as if two separate devices with the same filesystem type would
> > still give this error.
> >
> > Per the LWN.net article Amir shared, this is permitted ("When called
> > from user space, copy_file_range() will only try to copy a file across
> > filesystems if the two are of the same type").
> >
> > This behavior was slightly different before 5.3 AFAICR (was it?) ("until
> > then, copy_file_range() refused to copy between files that were not
> > located on the same filesystem.").  If that's the case, I'd specify the
> > difference, or more probably split the error into two, one before 5.3,
> > and one since 5.12.
> >
>
> True.
>
> > >
> > > I think you need to drop the (Linux range) altogether.
> >
> > I'll keep the range.  Users of 5.3..5.11 might be surprised if the
> > filesystems are different and they don't get an error, I think.
> >
> > I reworded it to follow other pages conventions:
> >
> > .BR EXDEV " (before Linux 5.3; or since Linux 5.12)"
> >
> > which renders as:
> >
> >         EXDEV (before Linux 5.3; or since Linux 5.12)
> >                The files referred to by fd_in and fd_out are not on
> >                the same mounted filesystem.
> >
>
> drop 'mounted'
>
> >
> > > What's missing here is the NFS cross server copy use case.
> > > Maybe:

At least for the SMB3 kernel server (ksmbd "cifsd") it looks like they use
splice.  And for the user space CIFS/SMB3 server (like Samba) they have a
configurable plug in library interface ("Samba VFS modules") that would allow
you to implement cross filesystem copy optimally for your version of Linux and
plug this into Samba with little work on your part.

> >
> > Again, this wasn't true before 5.3, right?
> >
>
> Right.
> Actually, v5.3 provides the vfs capabilities for filesystems to support
> cross fs copy. I am not sure if NFS already implements cross fs copy in
> v5.3 and not sure about cifs. Need to get input from nfs/cifs developers
> or dig in the release notes for server-side copy.

The SMB3 protocol has multiple ways to do "server side copy" (copy
offload to the server), some of which would apply to your example.
The case of "reflink" in many cases would be most efficient, and is supported
by the Linux client (see MS-SMB2 protocol specification section 3.3.5.15.18) but
is supported by fewer server file systems, so it is probably more important to
focus on the other mechanisms, which are server side copy rather than clone.
The most popular way, supported by most servers, is "CopyChunk" - 100s of
millions of systems support this (if not more) - see MS-SMB2 protocol
specification sections 2.2.31.1 and 3.3.5.15.16 - there are various cases where
two different SMB3 mounts on the same client could handle cross mount server
side copy.

There are other mechanisms supported by fewer servers, such as SMB3 ODX/T10
style copy offload (Windows and some others; see e.g. Gordon at Nexenta's
presentation
https://www.slideshare.net/gordonross/smb3-offload-data-transfer-odx),
which are still popular for virtualization workloads.  For this it could be
even more common for those to be different mounts on the client.  The Linux
client does not support the SMB3 ODX/T10 offload yet but it would be good to
add support for it.  There is a nice description of its additional benefits at
https://docs.microsoft.com/en-us/windows-hardware/drivers/storage/offloaded-data-transfer

But - yes, SMB3 on Linux can have cross mount file copy today, which is
far more efficient (having the server do the copy for us) than sending
large reads/writes back and forth over the network from the client.  In the
future I am hoping that use case becomes even more common over SMB3 as cloud
servers improve.


> > > You may refer the reader to VERSIONS section where it will say which
> > > filesystems support cross-fs copy as of kernel version XXX (i.e. cifs and nfs).
> > >
> > >>   .SH VERSIONS
> > >>   The
> > >>   .BR copy_file_range ()
> > >> @@ -202,6 +205,11 @@ Applications should target the behaviour and requirements of 5.3 kernels.
> > >>   .PP
> > >>   First support for cross-filesystem copies was introduced in Linux 5.3.
> > >>   Older kernels will return -EXDEV when cross-filesystem copies are attempted.
> > >> +.PP
> > >> +After Linux 5.12, support for copies between different filesystems was dropped.
> > >> +However, individual filesystems may still provide
> > >> +.BR copy_file_range ()
> > >> +implementations that allow copies across different devices.

Yes - this could be very important, especially for cifs (smb3) going forward.



-- 
Thanks,

Steve

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-27 23:08                                       ` [PATCH] copy_file_range.2: Kernel v5.12 updates Steve French
@ 2021-02-28  7:35                                         ` Amir Goldstein
  2021-02-28 22:25                                           ` Steve French
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-02-28  7:35 UTC (permalink / raw)
  To: Steve French
  Cc: Alejandro Colomar (man-pages),
	Luis Henriques, Michael Kerrisk, Anna Schumaker, Jeff Layton,
	Steve French, Miklos Szeredi, Trond Myklebust, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

On Sun, Feb 28, 2021 at 1:08 AM Steve French <smfrench@gmail.com> wrote:
>
> On Fri, Feb 26, 2021 at 11:43 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Sat, Feb 27, 2021 at 12:19 AM Alejandro Colomar (man-pages)
> > <alx.manpages@gmail.com> wrote:
> > >
> > > Hello Amir, Luis,
> > >
> > > On 2/24/21 5:10 PM, Amir Goldstein wrote:
> > > > On Wed, Feb 24, 2021 at 4:22 PM Luis Henriques <lhenriques@suse.de> wrote:
> > > >>
> > > >> Update man-page with recent changes to this syscall.
> > > >>
> > > >> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > > >> ---
> > > >> Hi!
> > > >>
> > > >> Here's a suggestion for fixing the manpage for copy_file_range().  Note that
> > > >> I've assumed the fix will hit 5.12.
> > > >>
> > > >>   man2/copy_file_range.2 | 10 +++++++++-
> > > >>   1 file changed, 9 insertions(+), 1 deletion(-)
> > > >>
> > > >> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> > > >> index 611a39b8026b..b0fd85e2631e 100644
> > > >> --- a/man2/copy_file_range.2
> > > >> +++ b/man2/copy_file_range.2
> > > >> @@ -169,6 +169,9 @@ Out of memory.
> > > >>   .B ENOSPC
> > > >>   There is not enough space on the target filesystem to complete the copy.
> > > >>   .TP
> > > >> +.B EOPNOTSUPP
> > >
> > > I'll add the kernel version here:
> > >
> > > .BR EOPNOTSUPP " (since Linux 5.12)"
> >
> > Error could be returned prior to 5.3 and would be probably returned
> > by future stable kernels 5.3..5.12 too
> >
> > >
> > > >> +The filesystem does not support this operation
> > > >> +.TP
> > > >>   .B EOVERFLOW
> > > >>   The requested source or destination range is too large to represent in the
> > > >>   specified data types.
> > > >> @@ -187,7 +190,7 @@ refers to an active swap file.
> > > >>   .B EXDEV
> > > >>   The files referred to by
> > > >>   .IR fd_in " and " fd_out
> > > >> -are not on the same mounted filesystem (pre Linux 5.3).
> > > >> +are not on the same mounted filesystem (pre Linux 5.3 and post Linux 5.12).
> > >
> > > I'm not sure that 'mounted' adds any value here.  Would you remove the
> > > word here?
> >
> > See rename(2). 'mounted' in this context is explained there.
> > HOWEVER, it does not fit here.
> > copy_file_range() IS allowed between two mounts of the same filesystem instance.
> >
> > To make things more complicated, it appears that cross mount clone is not
> > allowed via FICLONE/FICLONERANGE ioctl, so ioctl_ficlonerange(2) man page
> > also uses the 'mounted filesystem' terminology for EXDEV
> >
> > As things stand now, because of the fallback to clone logic,
> > copy_file_range() provides a way for users to clone across different mounts
> > of the same filesystem instance, which they cannot do with the FICLONE ioctl.
> >
> > Fun :)
> >
> > BTW, I don't know if preventing cross mount clone was done intentionally,
> > but as I wrote in a comment in the code once:
> >
> >         /*
> >          * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
> >          * the same mount. Practically, they only need to be on the same file
> >          * system.
> >          */
> >
> > >
> > > It reads as if two separate devices with the same filesystem type would
> > > still give this error.
> > >
> > > Per the LWN.net article Amir shared, this is permitted ("When called
> > > from user space, copy_file_range() will only try to copy a file across
> > > filesystems if the two are of the same type").
> > >
> > > This behavior was slightly different before 5.3 AFAICR (was it?) ("until
> > > then, copy_file_range() refused to copy between files that were not
> > > located on the same filesystem.").  If that's the case, I'd specify the
> > > difference, or more probably split the error into two, one before 5.3,
> > > and one since 5.12.
> > >
> >
> > True.
> >
> > > >
> > > > I think you need to drop the (Linux range) altogether.
> > >
> > > I'll keep the range.  Users of 5.3..5.11 might be surprised if the
> > > filesystems are different and they don't get an error, I think.
> > >
> > > I reworded it to follow other pages conventions:
> > >
> > > .BR EXDEV " (before Linux 5.3; or since Linux 5.12)"
> > >
> > > which renders as:
> > >
> > >         EXDEV (before Linux 5.3; or since Linux 5.12)
> > >                The files referred to by fd_in and fd_out are not on
> > >                the same mounted filesystem.
> > >
> >
> > drop 'mounted'
> >
> > >
> > > > What's missing here is the NFS cross server copy use case.
> > > > Maybe:
>
> At least for the SMB3 kernel server (ksmbd "cifsd") looks like they use splice.
> And for the user space CIFS/SMB3 server (like Samba) they have a configurable
> plug in library interface ("Samba VFS modules") that would allow you
> to implement
> cross filesystem copy optimally for your version of Linux and plug
> this into Samba
> with little work on your part.
>
> > >
> > > Again, this wasn't true before 5.3, right?
> > >
> >
> > Right.
> > Actually, v5.3 provides the vfs capabilities for filesystems to support
> > cross fs copy. I am not sure if NFS already implements cross fs copy in
> > v5.3 and not sure about cifs. Need to get input from nfs/cifs developers
> > or dig in the release notes for server-side copy.
>
> The SMB3 protocol has multiple ways to do "server side copy" (copy
> offload to the server), some of which would apply to your example.
> The case of "reflink" in many cases would be most efficient, and is supported
> by the Linux client (see MS-SMB2 protocol specification section 3.3.5.15.18) but
> is supported by fewer server file systems, so probably more important
> to focus on
> the other mechanisms which are server side copy rather than clone.  The most
> popular way, supported by most servers, is  "CopyChunk" - 100s of
> millions of systems
> support this (if not more) - see MS-SMB2 protocol specification
> section 2.2.31.1 and
> 3.3.5.15.16 - there are various cases where two different SMB3 mounts
> on the same
> client could handle cross mount server side copy.
>
> There are other mechanisms supported by fewer servers SMB3 ODX/T10 style copy
> offload (Windows and some others see e.g. Gordon at Nexenta's presentation
> https://www.slideshare.net/gordonross/smb3-offload-data-transfer-odx)
> but still popular for virtualization workloads.  For this it could be
> even more common
> for those to be different mounts on the client.  The Linux client does
> not support
> the SMB3 ODX/T10 offload yet but it would be good to add support for it.
> There is a nice description of its additional benefits at
> https://docs.microsoft.com/en-us/windows-hardware/drivers/storage/offloaded-data-transfer
>
> But - yes SMB3 on Linux can have cross mount file copy today, which is
> far more efficient

Can have? or does have?
IIUC, server-side copy ability exists for "same cifs fs" for a long time and
since v5.3, it is available for "same cifs connection", which is not exactly
the same as "same cifs fs" but also not really different for most people.
Can you elaborate on that?
Just assume the server can do anything. What can the Linux client do
since v5.3 or later?

> (having the server do the copy for us) rather than sending large
> reads/writes back and
> forth over the network from the client.  In the future I am hoping that use case
> becomes even more common over SMB3 as cloud servers improve.
>
>
> > > > You may refer the reader to VERSIONS section where it will say which
> > > > filesystems support cross-fs copy as of kernel version XXX (i.e. cifs and nfs).
> > > >
> > > >>   .SH VERSIONS
> > > >>   The
> > > >>   .BR copy_file_range ()
> > > >> @@ -202,6 +205,11 @@ Applications should target the behaviour and requirements of 5.3 kernels.
> > > >>   .PP
> > > >>   First support for cross-filesystem copies was introduced in Linux 5.3.
> > > >>   Older kernels will return -EXDEV when cross-filesystem copies are attempted.
> > > >> +.PP
> > > >> +After Linux 5.12, support for copies between different filesystems was dropped.
> > > >> +However, individual filesystems may still provide
> > > >> +.BR copy_file_range ()
> > > >> +implementations that allow copies across different devices.
>
> Yes - this could be very important, especially for cifs (smb3) going forward.
>
>
>
> --
> Thanks,
>
> Steve

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-28  7:35                                         ` Amir Goldstein
@ 2021-02-28 22:25                                           ` Steve French
  2021-03-01  6:18                                             ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Steve French @ 2021-02-28 22:25 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Alejandro Colomar (man-pages),
	Luis Henriques, Michael Kerrisk, Anna Schumaker, Jeff Layton,
	Steve French, Miklos Szeredi, Trond Myklebust, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man

On Sun, Feb 28, 2021 at 1:36 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Sun, Feb 28, 2021 at 1:08 AM Steve French <smfrench@gmail.com> wrote:
> >
> > On Fri, Feb 26, 2021 at 11:43 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > >
> > > On Sat, Feb 27, 2021 at 12:19 AM Alejandro Colomar (man-pages)
> > > <alx.manpages@gmail.com> wrote:
> > > >
> > > > Hello Amir, Luis,
> > > >
> > > > On 2/24/21 5:10 PM, Amir Goldstein wrote:
> > > > > On Wed, Feb 24, 2021 at 4:22 PM Luis Henriques <lhenriques@suse.de> wrote:
> > > > >>
> > > > >> Update man-page with recent changes to this syscall.
> > > > >>
> > > > >> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > > > >> ---
> > > > >> Hi!
> > > > >>
> > > > >> Here's a suggestion for fixing the manpage for copy_file_range().  Note that
> > > > >> I've assumed the fix will hit 5.12.
> > > > >>
> > > > >>   man2/copy_file_range.2 | 10 +++++++++-
> > > > >>   1 file changed, 9 insertions(+), 1 deletion(-)
> > > > >>
> > > > >> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> > > > >> index 611a39b8026b..b0fd85e2631e 100644
> > > > >> --- a/man2/copy_file_range.2
> > > > >> +++ b/man2/copy_file_range.2
> > > > >> @@ -169,6 +169,9 @@ Out of memory.
> > > > >>   .B ENOSPC
> > > > >>   There is not enough space on the target filesystem to complete the copy.
> > > > >>   .TP
> > > > >> +.B EOPNOTSUPP
> > > >
> > > > I'll add the kernel version here:
> > > >
> > > > .BR EOPNOTSUPP " (since Linux 5.12)"
> > >
> > > Error could be returned prior to 5.3 and would be probably returned
> > > by future stable kernels 5.3..5.12 too
> > >
> > > >
> > > > >> +The filesystem does not support this operation
> > > > >> +.TP
> > > > >>   .B EOVERFLOW
> > > > >>   The requested source or destination range is too large to represent in the
> > > > >>   specified data types.
> > > > >> @@ -187,7 +190,7 @@ refers to an active swap file.
> > > > >>   .B EXDEV
> > > > >>   The files referred to by
> > > > >>   .IR fd_in " and " fd_out
> > > > >> -are not on the same mounted filesystem (pre Linux 5.3).
> > > > >> +are not on the same mounted filesystem (pre Linux 5.3 and post Linux 5.12).
> > > >
> > > > I'm not sure that 'mounted' adds any value here.  Would you remove the
> > > > word here?
> > >
> > > See rename(2). 'mounted' in this context is explained there.
> > > HOWEVER, it does not fit here.
> > > copy_file_range() IS allowed between two mounts of the same filesystem instance.
> > >
> > > To make things more complicated, it appears that cross mount clone is not
> > > allowed via FICLONE/FICLONERANGE ioctl, so ioctl_ficlonerange(2) man page
> > > also uses the 'mounted filesystem' terminology for EXDEV
> > >
> > > As things stand now, because of the fallback to clone logic,
> > > copy_file_range() provides a way for users to clone across different mounts
> > > of the same filesystem instance, which they cannot do with the FICLONE ioctl.
> > >
> > > Fun :)
> > >
> > > BTW, I don't know if preventing cross mount clone was done intentionally,
> > > but as I wrote in a comment in the code once:
> > >
> > >         /*
> > >          * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
> > >          * the same mount. Practically, they only need to be on the same file
> > >          * system.
> > >          */
> > >
> > > >
> > > > It reads as if two separate devices with the same filesystem type would
> > > > still give this error.
> > > >
> > > > Per the LWN.net article Amir shared, this is permitted ("When called
> > > > from user space, copy_file_range() will only try to copy a file across
> > > > filesystems if the two are of the same type").
> > > >
> > > > This behavior was slightly different before 5.3 AFAICR (was it?) ("until
> > > > then, copy_file_range() refused to copy between files that were not
> > > > located on the same filesystem.").  If that's the case, I'd specify the
> > > > difference, or more probably split the error into two, one before 5.3,
> > > > and one since 5.12.
> > > >
> > >
> > > True.
> > >
> > > > >
> > > > > I think you need to drop the (Linux range) altogether.
> > > >
> > > > I'll keep the range.  Users of 5.3..5.11 might be surprised if the
> > > > filesystems are different and they don't get an error, I think.
> > > >
> > > > I reworded it to follow other pages conventions:
> > > >
> > > > .BR EXDEV " (before Linux 5.3; or since Linux 5.12)"
> > > >
> > > > which renders as:
> > > >
> > > >         EXDEV (before Linux 5.3; or since Linux 5.12)
> > > >                The files referred to by fd_in and fd_out are not on
> > > >                the same mounted filesystem.
> > > >
> > >
> > > drop 'mounted'
> > >
> > > >
> > > > > What's missing here is the NFS cross server copy use case.
> > > > > Maybe:
> >
> > At least for the SMB3 kernel server (ksmbd "cifsd") looks like they use splice.
> > And for the user space CIFS/SMB3 server (like Samba) they have a configurable
> > plug in library interface ("Samba VFS modules") that would allow you
> > to implement
> > cross filesystem copy optimally for your version of Linux and plug
> > this into Samba
> > with little work on your part.
> >
> > > >
> > > > Again, this wasn't true before 5.3, right?
> > > >
> > >
> > > Right.
> > > Actually, v5.3 provides the vfs capabilities for filesystems to support
> > > cross fs copy. I am not sure if NFS already implements cross fs copy in
> > > v5.3 and not sure about cifs. Need to get input from nfs/cifs developers
> > > or dig in the release notes for server-side copy.
> >
> > The SMB3 protocol has multiple ways to do "server side copy" (copy
> > offload to the server), some of which would apply to your example.
> > The case of "reflink" in many cases would be most efficient, and is supported
> > by the Linux client (see MS-SMB2 protocol specification section 3.3.5.15.18) but
> > is supported by fewer server file systems, so probably more important
> > to focus on
> > the other mechanisms which are server side copy rather than clone.  The most
> > popular way, supported by most servers, is  "CopyChunk" - 100s of
> > millions of systems
> > support this (if not more) - see MS-SMB2 protocol specification
> > section 2.2.31.1 and
> > 3.3.5.15.16 - there are various cases where two different SMB3 mounts
> > on the same
> > client could handle cross mount server side copy.
> >
> > There are other mechanisms supported by fewer servers SMB3 ODX/T10 style copy
> > offload (Windows and some others see e.g. Gordon at Nexenta's presentation
> > https://www.slideshare.net/gordonross/smb3-offload-data-transfer-odx)
> > but still popular for virtualization workloads.  For this it could be
> > even more common
> > for those to be different mounts on the client.  The Linux client does
> > not support
> > the SMB3 ODX/T10 offload yet but it would be good to add support for it.
> > There is a nice description of its additional benefits at
> > https://docs.microsoft.com/en-us/windows-hardware/drivers/storage/offloaded-data-transfer
> >
> > But - yes SMB3 on Linux can have cross mount file copy today, which is
> > far more efficient
>
> Can have? or does have?
> IIUC, server-side copy ability exists for "same cifs fs" for a long time and
> since v5.3, it is available for "same cifs connection", which is not exactly
> the same as "same cifs fs" but also not really different for most people.
> Can you elaborate about  that?
> Just assume the server can do anything. What can the Linux client do
> since v5.3 or later?

Inside the SMB3 client (cifs.ko) we check that the file handles provided
are for the same authenticated user to the same server, so
e.g. you could mount //server/share on /mnt1 and //server/anothershare on /mnt2
and do a copy_file_range from /mnt1/file1 to /mnt2/file2 even though these are
different mounts.   The cifs client should allow additional cases of cross mount
copy, but at least this helps for various common scenarios and is very widely
supported on most servers as well.


-- 
Thanks,

Steve

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH] copy_file_range.2: Kernel v5.12 updates
  2021-02-28 22:25                                           ` Steve French
@ 2021-03-01  6:18                                             ` Amir Goldstein
  0 siblings, 0 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-03-01  6:18 UTC (permalink / raw)
  To: Alejandro Colomar (man-pages)
  Cc: Luis Henriques, Michael Kerrisk, Anna Schumaker, Jeff Layton,
	Steve French, Miklos Szeredi, Trond Myklebust, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Nicolas Boichat,
	Ian Lance Taylor, Luis Lozano, Andreas Dilger, Olga Kornievskaia,
	Christoph Hellwig, ceph-devel, linux-kernel, CIFS,
	samba-technical, linux-fsdevel, Linux NFS Mailing List,
	linux-man, Steve French

On Mon, Mar 1, 2021 at 12:25 AM Steve French <smfrench@gmail.com> wrote:
>
> On Sun, Feb 28, 2021 at 1:36 AM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Sun, Feb 28, 2021 at 1:08 AM Steve French <smfrench@gmail.com> wrote:
> > >
> > > On Fri, Feb 26, 2021 at 11:43 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > > >
> > > > On Sat, Feb 27, 2021 at 12:19 AM Alejandro Colomar (man-pages)
> > > > <alx.manpages@gmail.com> wrote:
> > > > >
> > > > > Hello Amir, Luis,
> > > > >
> > > > > On 2/24/21 5:10 PM, Amir Goldstein wrote:
> > > > > > On Wed, Feb 24, 2021 at 4:22 PM Luis Henriques <lhenriques@suse.de> wrote:
> > > > > >>
> > > > > >> Update man-page with recent changes to this syscall.
> > > > > >>
> > > > > >> Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > > > > >> ---
> > > > > >> Hi!
> > > > > >>
> > > > > >> Here's a suggestion for fixing the manpage for copy_file_range().  Note that
> > > > > >> I've assumed the fix will hit 5.12.
> > > > > >>
> > > > > >>   man2/copy_file_range.2 | 10 +++++++++-
> > > > > >>   1 file changed, 9 insertions(+), 1 deletion(-)
> > > > > >>
> > > > > >> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> > > > > >> index 611a39b8026b..b0fd85e2631e 100644
> > > > > >> --- a/man2/copy_file_range.2
> > > > > >> +++ b/man2/copy_file_range.2
> > > > > >> @@ -169,6 +169,9 @@ Out of memory.
> > > > > >>   .B ENOSPC
> > > > > >>   There is not enough space on the target filesystem to complete the copy.
> > > > > >>   .TP
> > > > > >> +.B EOPNOTSUPP
> > > > >
> > > > > I'll add the kernel version here:
> > > > >
> > > > > .BR EOPNOTSUPP " (since Linux 5.12)"
> > > >
> > > > Error could be returned prior to 5.3 and would be probably returned
> > > > by future stable kernels 5.3..5.12 too
> > > >
> > > > >
> > > > > >> +The filesystem does not support this operation
> > > > > >> +.TP
> > > > > >>   .B EOVERFLOW
> > > > > >>   The requested source or destination range is too large to represent in the
> > > > > >>   specified data types.
> > > > > >> @@ -187,7 +190,7 @@ refers to an active swap file.
> > > > > >>   .B EXDEV
> > > > > >>   The files referred to by
> > > > > >>   .IR fd_in " and " fd_out
> > > > > >> -are not on the same mounted filesystem (pre Linux 5.3).
> > > > > >> +are not on the same mounted filesystem (pre Linux 5.3 and post Linux 5.12).
> > > > >
> > > > > I'm not sure that 'mounted' adds any value here.  Would you remove the
> > > > > word here?
> > > >
> > > > See rename(2). 'mounted' in this context is explained there.
> > > > HOWEVER, it does not fit here.
> > > > copy_file_range() IS allowed between two mounts of the same filesystem instance.
> > > >
> > > > To make things more complicated, it appears that cross mount clone is not
> > > > allowed via FICLONE/FICLONERANGE ioctl, so ioctl_ficlonerange(2) man page
> > > > also uses the 'mounted filesystem' terminology for EXDEV
> > > >
> > > > As things stand now, because of the fallback to clone logic,
> > > > copy_file_range() provides a way for users to clone across different mounts
> > > > of the same filesystem instance, which they cannot do with the FICLONE ioctl.
> > > >
> > > > Fun :)
> > > >
> > > > BTW, I don't know if preventing cross mount clone was done intentionally,
> > > > but as I wrote in a comment in the code once:
> > > >
> > > >         /*
> > > >          * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
> > > >          * the same mount. Practically, they only need to be on the same file
> > > >          * system.
> > > >          */
> > > >
> > > > >
> > > > > It reads as if two separate devices with the same filesystem type would
> > > > > still give this error.
> > > > >
> > > > > Per the LWN.net article Amir shared, this is permitted ("When called
> > > > > from user space, copy_file_range() will only try to copy a file across
> > > > > filesystems if the two are of the same type").
> > > > >
> > > > > This behavior was slightly different before 5.3 AFAICR (was it?) ("until
> > > > > then, copy_file_range() refused to copy between files that were not
> > > > > located on the same filesystem.").  If that's the case, I'd specify the
> > > > > difference, or more probably split the error into two, one before 5.3,
> > > > > and one since 5.12.
> > > > >
> > > >
> > > > True.
> > > >
> > > > > >
> > > > > > I think you need to drop the (Linux range) altogether.
> > > > >
> > > > > I'll keep the range.  Users of 5.3..5.11 might be surprised if the
> > > > > filesystems are different and they don't get an error, I think.
> > > > >
> > > > > I reworded it to follow other pages conventions:
> > > > >
> > > > > .BR EXDEV " (before Linux 5.3; or since Linux 5.12)"
> > > > >
> > > > > which renders as:
> > > > >
> > > > >         EXDEV (before Linux 5.3; or since Linux 5.12)
> > > > >                The files referred to by fd_in and fd_out are not on
> > > > >                the same mounted filesystem.
> > > > >
> > > >
> > > > drop 'mounted'
> > > >
> > > > >
> > > > > > What's missing here is the NFS cross server copy use case.
> > > > > > Maybe:
> > >
> > > At least for the SMB3 kernel server (ksmbd "cifsd"), it looks like they use
> > > splice.  And for the user space CIFS/SMB3 server (like Samba) they have a
> > > configurable plug-in library interface ("Samba VFS modules") that would
> > > allow you to implement cross filesystem copy optimally for your version of
> > > Linux and plug this into Samba with little work on your part.
> > >
> > > > >
> > > > > Again, this wasn't true before 5.3, right?
> > > > >
> > > >
> > > > Right.
> > > > Actually, v5.3 provides the vfs capabilities for filesystems to support
> > > > cross fs copy. I am not sure if NFS already implements cross fs copy in
> > > > v5.3 and not sure about cifs. Need to get input from nfs/cifs developers
> > > > or dig in the release notes for server-side copy.
> > >
> > > The SMB3 protocol has multiple ways to do "server side copy" (copy
> > > offload to the server), some of which would apply to your example.
> > > The case of "reflink" in many cases would be most efficient, and is supported
> > > by the Linux client (see MS-SMB2 protocol specification section 3.3.5.15.18) but
> > > is supported by fewer server file systems, so it is probably more important
> > > to focus on the other mechanisms, which are server side copy rather than
> > > clone.  The most popular way, supported by most servers, is "CopyChunk" -
> > > 100s of millions of systems support this (if not more) - see MS-SMB2
> > > protocol specification sections 2.2.31.1 and 3.3.5.15.16 - there are
> > > various cases where two different SMB3 mounts on the same client could
> > > handle cross mount server side copy.
> > >
> > > There are other mechanisms supported by fewer servers, such as SMB3 ODX/T10
> > > style copy offload (Windows and some others; see e.g. Gordon at Nexenta's
> > > presentation
> > > https://www.slideshare.net/gordonross/smb3-offload-data-transfer-odx)
> > > which are still popular for virtualization workloads.  For this it could be
> > > even more common for those to be different mounts on the client.  The Linux
> > > client does not support the SMB3 ODX/T10 offload yet, but it would be good
> > > to add support for it.
> > > There is a nice description of its additional benefits at
> > > https://docs.microsoft.com/en-us/windows-hardware/drivers/storage/offloaded-data-transfer
> > >
> > > But - yes SMB3 on Linux can have cross mount file copy today, which is
> > > far more efficient
> >
> > Can have? or does have?
> > IIUC, server-side copy ability exists for "same cifs fs" for a long time and
> > since v5.3, it is available for "same cifs connection", which is not exactly
> > the same as "same cifs fs" but also not really different for most people.
> > Can you elaborate on that?
> > Just assume the server can do anything. What can the Linux client do
> > since v5.3 or later?
>
> Inside the SMB3 client (cifs.ko) we check that the file handles provided
> are for the same authenticated user to the same server, so
> e.g. you could mount //server/share on /mnt1 and //server/anothershare on /mnt2
> and do a copy_file_range from /mnt1/file1 to /mnt2/file2 even though these are
> different mounts.   The cifs client should allow additional cases of cross mount
> copy, but at least this helps for various common scenarios and is very widely
> supported on most servers as well.
>

Got it. Thanks for clarifying.

So it appears that both cifs and nfs have supported cross-fs copy since v5.3,
and many other filesystems that support clone started supporting cross-mnt
(same fs) copy (implemented as clone) in v5.3 and still do to this day.

Alejandro, just to be clear, none of these changes are in v5.12 yet,
so please hold on to your patch for now.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [RFC v3] copy_file_range.2: Update cross-filesystem support for 5.12
  2021-02-24 14:23                               ` [PATCH] copy_file_range.2: Kernel v5.12 updates Luis Henriques
  2021-02-24 16:10                                 ` Amir Goldstein
@ 2021-03-01 14:41                                 ` Alejandro Colomar
  2021-03-01 14:58                                   ` Amir Goldstein
  2021-03-04  9:38                                 ` [RFC v4] " Alejandro Colomar
  2 siblings, 1 reply; 93+ messages in thread
From: Alejandro Colomar @ 2021-03-01 14:41 UTC (permalink / raw)
  To: linux-man, Amir Goldstein, Michael Kerrisk, Luis Henriques, Steve French
  Cc: Alejandro Colomar, Greg KH, Anna Schumaker, Jeff Layton,
	Miklos Szeredi, Trond Myklebust, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List, Walter Harms

Linux 5.12 fixes a regression.

Cross-filesystem (introduced in 5.3) copies were buggy.

Move the statements documenting cross-fs to BUGS.
Kernels 5.3..5.11 should be patched soon.

State version information for some errors related to this.

Reported-by: Luis Henriques <lhenriques@suse.de>
Reported-by: Amir Goldstein <amir73il@gmail.com>
Related: <https://lwn.net/Articles/846403/>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Anna Schumaker <anna.schumaker@netapp.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Steve French <sfrench@samba.org>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Nicolas Boichat <drinkcat@chromium.org>
Cc: Ian Lance Taylor <iant@google.com>
Cc: Luis Lozano <llozano@chromium.org>
Cc: Andreas Dilger <adilger@dilger.ca>
Cc: Olga Kornievskaia <aglo@umich.edu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: ceph-devel <ceph-devel@vger.kernel.org>
Cc: linux-kernel <linux-kernel@vger.kernel.org>
Cc: CIFS <linux-cifs@vger.kernel.org>
Cc: samba-technical <samba-technical@lists.samba.org>
Cc: linux-fsdevel <linux-fsdevel@vger.kernel.org>
Cc: Linux NFS Mailing List <linux-nfs@vger.kernel.org>
Cc: Walter Harms <wharms@bfs.de>
Signed-off-by: Alejandro Colomar <alx.manpages@gmail.com>
---

v3:
	- Don't remove some important text.
	- Reword BUGS.

---
Hi Amir,

I covered your comments.  I may need to add something else after your
discussion with Steve; please comment.

I tried to reword BUGS so that it's as specific and understandable as I can.
If you still find it not good enough, please comment :)

Thanks,

Alex

---
 man2/copy_file_range.2 | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
index 611a39b80..1c0df3f74 100644
--- a/man2/copy_file_range.2
+++ b/man2/copy_file_range.2
@@ -169,6 +169,9 @@ Out of memory.
 .B ENOSPC
 There is not enough space on the target filesystem to complete the copy.
 .TP
+.BR EOPNOTSUPP " (since Linux 5.12)"
+The filesystem does not support this operation.
+.TP
 .B EOVERFLOW
 The requested source or destination range is too large to represent in the
 specified data types.
@@ -184,10 +187,17 @@ or
 .I fd_out
 refers to an active swap file.
 .TP
-.B EXDEV
+.BR EXDEV " (before Linux 5.3)"
+The files referred to by
+.IR fd_in " and " fd_out
+are not on the same filesystem.
+.TP
+.BR EXDEV " (since Linux 5.12)"
 The files referred to by
 .IR fd_in " and " fd_out
-are not on the same mounted filesystem (pre Linux 5.3).
+are not on the same filesystem,
+and the source and target filesystems are not of the same type,
+or do not support cross-filesystem copy.
 .SH VERSIONS
 The
 .BR copy_file_range ()
@@ -200,8 +210,10 @@ Areas of the API that weren't clearly defined were clarified and the API bounds
 are much more strictly checked than on earlier kernels.
 Applications should target the behaviour and requirements of 5.3 kernels.
 .PP
-First support for cross-filesystem copies was introduced in Linux 5.3.
-Older kernels will return -EXDEV when cross-filesystem copies are attempted.
+Since 5.12,
+cross-filesystem copies can be achieved
+when both filesystems are of the same type,
+and that filesystem implements support for it.
 .SH CONFORMING TO
 The
 .BR copy_file_range ()
@@ -226,6 +238,12 @@ gives filesystems an opportunity to implement "copy acceleration" techniques,
 such as the use of reflinks (i.e., two or more inodes that share
 pointers to the same copy-on-write disk blocks)
 or server-side-copy (in the case of NFS).
+.SH BUGS
+In Linux kernels 5.3 to 5.11,
+cross-filesystem copies were supported by the kernel,
+instead of being supported by individual filesystems.
+However, on some virtual filesystems,
+the call failed to copy, while still reporting success.
 .SH EXAMPLES
 .EX
 #define _GNU_SOURCE
-- 
2.30.1.721.g45526154a5


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC v3] copy_file_range.2: Update cross-filesystem support for 5.12
  2021-03-01 14:41                                 ` [RFC v3] copy_file_range.2: Update cross-filesystem support for 5.12 Alejandro Colomar
@ 2021-03-01 14:58                                   ` Amir Goldstein
  0 siblings, 0 replies; 93+ messages in thread
From: Amir Goldstein @ 2021-03-01 14:58 UTC (permalink / raw)
  To: Alejandro Colomar
  Cc: linux-man, Michael Kerrisk, Luis Henriques, Steve French,
	Greg KH, Anna Schumaker, Jeff Layton, Miklos Szeredi,
	Trond Myklebust, Alexander Viro, Darrick J. Wong, Dave Chinner,
	Nicolas Boichat, Ian Lance Taylor, Luis Lozano, Andreas Dilger,
	Olga Kornievskaia, Christoph Hellwig, ceph-devel, linux-kernel,
	CIFS, samba-technical, linux-fsdevel, Linux NFS Mailing List,
	Walter Harms

On Mon, Mar 1, 2021 at 4:45 PM Alejandro Colomar <alx.manpages@gmail.com> wrote:
>
> Linux 5.12 fixes a regression.
>
> Cross-filesystem (introduced in 5.3) copies were buggy.
>
> Move the statements documenting cross-fs to BUGS.
> Kernels 5.3..5.11 should be patched soon.
>
> State version information for some errors related to this.
>
> Reported-by: Luis Henriques <lhenriques@suse.de>
> Reported-by: Amir Goldstein <amir73il@gmail.com>
> Related: <https://lwn.net/Articles/846403/>
> Cc: Greg KH <gregkh@linuxfoundation.org>
> Cc: Michael Kerrisk <mtk.manpages@gmail.com>
> Cc: Anna Schumaker <anna.schumaker@netapp.com>
> Cc: Jeff Layton <jlayton@kernel.org>
> Cc: Steve French <sfrench@samba.org>
> Cc: Miklos Szeredi <miklos@szeredi.hu>
> Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Cc: Dave Chinner <dchinner@redhat.com>
> Cc: Nicolas Boichat <drinkcat@chromium.org>
> Cc: Ian Lance Taylor <iant@google.com>
> Cc: Luis Lozano <llozano@chromium.org>
> Cc: Andreas Dilger <adilger@dilger.ca>
> Cc: Olga Kornievskaia <aglo@umich.edu>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: ceph-devel <ceph-devel@vger.kernel.org>
> Cc: linux-kernel <linux-kernel@vger.kernel.org>
> Cc: CIFS <linux-cifs@vger.kernel.org>
> Cc: samba-technical <samba-technical@lists.samba.org>
> Cc: linux-fsdevel <linux-fsdevel@vger.kernel.org>
> Cc: Linux NFS Mailing List <linux-nfs@vger.kernel.org>
> Cc: Walter Harms <wharms@bfs.de>
> Signed-off-by: Alejandro Colomar <alx.manpages@gmail.com>
> ---
>
> v3:
>         - Don't remove some important text.
>         - Reword BUGS.
>
> ---
> Hi Amir,
>
> I covered your comments.  I may need to add something else after your
> discussion with Steve; please comment.
>
> I tried to reword BUGS so that it's as specific and understandable as I can.
> If you still find it not good enough, please comment :)
>
> Thanks,
>
> Alex
>
> ---
>  man2/copy_file_range.2 | 26 ++++++++++++++++++++++----
>  1 file changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> index 611a39b80..1c0df3f74 100644
> --- a/man2/copy_file_range.2
> +++ b/man2/copy_file_range.2
> @@ -169,6 +169,9 @@ Out of memory.
>  .B ENOSPC
>  There is not enough space on the target filesystem to complete the copy.
>  .TP
> +.BR EOPNOTSUPP " (since Linux 5.12)"
> +The filesystem does not support this operation.
> +.TP
>  .B EOVERFLOW
>  The requested source or destination range is too large to represent in the
>  specified data types.
> @@ -184,10 +187,17 @@ or
>  .I fd_out
>  refers to an active swap file.
>  .TP
> -.B EXDEV
> +.BR EXDEV " (before Linux 5.3)"
> +The files referred to by
> +.IR fd_in " and " fd_out
> +are not on the same filesystem.
> +.TP
> +.BR EXDEV " (since Linux 5.12)"
>  The files referred to by
>  .IR fd_in " and " fd_out
> -are not on the same mounted filesystem (pre Linux 5.3).
> +are not on the same filesystem,
> +and the source and target filesystems are not of the same type,
> +or do not support cross-filesystem copy.
>  .SH VERSIONS
>  The
>  .BR copy_file_range ()
> @@ -200,8 +210,10 @@ Areas of the API that weren't clearly defined were clarified and the API bounds
>  are much more strictly checked than on earlier kernels.
>  Applications should target the behaviour and requirements of 5.3 kernels.
>  .PP
> -First support for cross-filesystem copies was introduced in Linux 5.3.
> -Older kernels will return -EXDEV when cross-filesystem copies are attempted.
> +Since 5.12,
> +cross-filesystem copies can be achieved
> +when both filesystems are of the same type,
> +and that filesystem implements support for it.

Maybe refer to BUGS here for pre 5.12 behavior?

>  .SH CONFORMING TO
>  The
>  .BR copy_file_range ()
> @@ -226,6 +238,12 @@ gives filesystems an opportunity to implement "copy acceleration" techniques,
>  such as the use of reflinks (i.e., two or more inodes that share
>  pointers to the same copy-on-write disk blocks)
>  or server-side-copy (in the case of NFS).
> +.SH BUGS
> +In Linux kernels 5.3 to 5.11,
> +cross-filesystem copies were supported by the kernel,
> +instead of being supported by individual filesystems.

Not so clear/accurate IMO. Maybe:

cross-filesystem copies were implemented by the kernel,
if the operation was not supported by individual filesystems.

> +However, on some virtual filesystems,
> +the call failed to copy, while still reporting success.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [RFC v4] copy_file_range.2: Update cross-filesystem support for 5.12
  2021-02-24 14:23                               ` [PATCH] copy_file_range.2: Kernel v5.12 updates Luis Henriques
  2021-02-24 16:10                                 ` Amir Goldstein
  2021-03-01 14:41                                 ` [RFC v3] copy_file_range.2: Update cross-filesystem support for 5.12 Alejandro Colomar
@ 2021-03-04  9:38                                 ` Alejandro Colomar
  2021-03-04 17:13                                   ` Darrick J. Wong
  2 siblings, 1 reply; 93+ messages in thread
From: Alejandro Colomar @ 2021-03-04  9:38 UTC (permalink / raw)
  To: linux-man, Amir Goldstein, Michael Kerrisk, Luis Henriques, Steve French
  Cc: Alejandro Colomar, Greg KH, Anna Schumaker, Jeff Layton,
	Miklos Szeredi, Trond Myklebust, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List, Walter Harms

Linux 5.12 fixes a regression.

Cross-filesystem (introduced in 5.3) copies were buggy.

Move the statements documenting cross-fs to BUGS.
Kernels 5.3..5.11 should be patched soon.

State version information for some errors related to this.

Reported-by: Luis Henriques <lhenriques@suse.de>
Reported-by: Amir Goldstein <amir73il@gmail.com>
Related: <https://lwn.net/Articles/846403/>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Anna Schumaker <anna.schumaker@netapp.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Steve French <sfrench@samba.org>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Nicolas Boichat <drinkcat@chromium.org>
Cc: Ian Lance Taylor <iant@google.com>
Cc: Luis Lozano <llozano@chromium.org>
Cc: Andreas Dilger <adilger@dilger.ca>
Cc: Olga Kornievskaia <aglo@umich.edu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: ceph-devel <ceph-devel@vger.kernel.org>
Cc: linux-kernel <linux-kernel@vger.kernel.org>
Cc: CIFS <linux-cifs@vger.kernel.org>
Cc: samba-technical <samba-technical@lists.samba.org>
Cc: linux-fsdevel <linux-fsdevel@vger.kernel.org>
Cc: Linux NFS Mailing List <linux-nfs@vger.kernel.org>
Cc: Walter Harms <wharms@bfs.de>
Signed-off-by: Alejandro Colomar <alx.manpages@gmail.com>
---

v3:
        - Don't remove some important text.
        - Reword BUGS.
v4:
	- Reword.
	- Link to BUGS.

Thanks, Amir, for all the help and better wordings.

Cheers,

Alex

---
 man2/copy_file_range.2 | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
index 611a39b80..f58bfea8f 100644
--- a/man2/copy_file_range.2
+++ b/man2/copy_file_range.2
@@ -169,6 +169,9 @@ Out of memory.
 .B ENOSPC
 There is not enough space on the target filesystem to complete the copy.
 .TP
+.BR EOPNOTSUPP " (since Linux 5.12)"
+The filesystem does not support this operation.
+.TP
 .B EOVERFLOW
 The requested source or destination range is too large to represent in the
 specified data types.
@@ -184,10 +187,17 @@ or
 .I fd_out
 refers to an active swap file.
 .TP
-.B EXDEV
+.BR EXDEV " (before Linux 5.3)"
+The files referred to by
+.IR fd_in " and " fd_out
+are not on the same filesystem.
+.TP
+.BR EXDEV " (since Linux 5.12)"
 The files referred to by
 .IR fd_in " and " fd_out
-are not on the same mounted filesystem (pre Linux 5.3).
+are not on the same filesystem,
+and the source and target filesystems are not of the same type,
+or do not support cross-filesystem copy.
 .SH VERSIONS
 The
 .BR copy_file_range ()
@@ -200,8 +210,11 @@ Areas of the API that weren't clearly defined were clarified and the API bounds
 are much more strictly checked than on earlier kernels.
 Applications should target the behaviour and requirements of 5.3 kernels.
 .PP
-First support for cross-filesystem copies was introduced in Linux 5.3.
-Older kernels will return -EXDEV when cross-filesystem copies are attempted.
+Since Linux 5.12,
+cross-filesystem copies can be achieved
+when both filesystems are of the same type,
+and that filesystem implements support for it.
+See BUGS for behavior prior to 5.12.
 .SH CONFORMING TO
 The
 .BR copy_file_range ()
@@ -226,6 +239,12 @@ gives filesystems an opportunity to implement "copy acceleration" techniques,
 such as the use of reflinks (i.e., two or more inodes that share
 pointers to the same copy-on-write disk blocks)
 or server-side-copy (in the case of NFS).
+.SH BUGS
+In Linux kernels 5.3 to 5.11,
+cross-filesystem copies were implemented by the kernel,
+if the operation was not supported by individual filesystems.
+However, on some virtual filesystems,
+the call failed to copy, while still reporting success.
 .SH EXAMPLES
 .EX
 #define _GNU_SOURCE
-- 
2.30.1.721.g45526154a5


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC v4] copy_file_range.2: Update cross-filesystem support for 5.12
  2021-03-04  9:38                                 ` [RFC v4] " Alejandro Colomar
@ 2021-03-04 17:13                                   ` Darrick J. Wong
  2021-03-04 18:24                                     ` Alejandro Colomar (man-pages)
  0 siblings, 1 reply; 93+ messages in thread
From: Darrick J. Wong @ 2021-03-04 17:13 UTC (permalink / raw)
  To: Alejandro Colomar
  Cc: linux-man, Amir Goldstein, Michael Kerrisk, Luis Henriques,
	Steve French, Greg KH, Anna Schumaker, Jeff Layton,
	Miklos Szeredi, Trond Myklebust, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List, Walter Harms

On Thu, Mar 04, 2021 at 10:38:07AM +0100, Alejandro Colomar wrote:
> Linux 5.12 fixes a regression.
> 
> Cross-filesystem (introduced in 5.3) copies were buggy.
> 
> Move the statements documenting cross-fs to BUGS.
> Kernels 5.3..5.11 should be patched soon.
> 
> State version information for some errors related to this.
> 
> Reported-by: Luis Henriques <lhenriques@suse.de>
> Reported-by: Amir Goldstein <amir73il@gmail.com>
> Related: <https://lwn.net/Articles/846403/>
> Cc: Greg KH <gregkh@linuxfoundation.org>
> Cc: Michael Kerrisk <mtk.manpages@gmail.com>
> Cc: Anna Schumaker <anna.schumaker@netapp.com>
> Cc: Jeff Layton <jlayton@kernel.org>
> Cc: Steve French <sfrench@samba.org>
> Cc: Miklos Szeredi <miklos@szeredi.hu>
> Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Cc: Dave Chinner <dchinner@redhat.com>
> Cc: Nicolas Boichat <drinkcat@chromium.org>
> Cc: Ian Lance Taylor <iant@google.com>
> Cc: Luis Lozano <llozano@chromium.org>
> Cc: Andreas Dilger <adilger@dilger.ca>
> Cc: Olga Kornievskaia <aglo@umich.edu>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: ceph-devel <ceph-devel@vger.kernel.org>
> Cc: linux-kernel <linux-kernel@vger.kernel.org>
> Cc: CIFS <linux-cifs@vger.kernel.org>
> Cc: samba-technical <samba-technical@lists.samba.org>
> Cc: linux-fsdevel <linux-fsdevel@vger.kernel.org>
> Cc: Linux NFS Mailing List <linux-nfs@vger.kernel.org>
> Cc: Walter Harms <wharms@bfs.de>
> Signed-off-by: Alejandro Colomar <alx.manpages@gmail.com>
> ---
> 
> v3:
>         - Don't remove some important text.
>         - Reword BUGS.
> v4:
> 	- Reword.
> 	- Link to BUGS.
> 
> Thanks, Amir, for all the help and better wordings.
> 
> Cheers,
> 
> Alex
> 
> ---
>  man2/copy_file_range.2 | 27 +++++++++++++++++++++++----
>  1 file changed, 23 insertions(+), 4 deletions(-)
> 
> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> index 611a39b80..f58bfea8f 100644
> --- a/man2/copy_file_range.2
> +++ b/man2/copy_file_range.2
> @@ -169,6 +169,9 @@ Out of memory.
>  .B ENOSPC
>  There is not enough space on the target filesystem to complete the copy.
>  .TP
> +.BR EOPNOTSUPP " (since Linux 5.12)"
> +The filesystem does not support this operation.
> +.TP
>  .B EOVERFLOW
>  The requested source or destination range is too large to represent in the
>  specified data types.
> @@ -184,10 +187,17 @@ or
>  .I fd_out
>  refers to an active swap file.
>  .TP
> -.B EXDEV
> +.BR EXDEV " (before Linux 5.3)"
> +The files referred to by
> +.IR fd_in " and " fd_out
> +are not on the same filesystem.
> +.TP
> +.BR EXDEV " (since Linux 5.12)"
>  The files referred to by
>  .IR fd_in " and " fd_out
> -are not on the same mounted filesystem (pre Linux 5.3).
> +are not on the same filesystem,
> +and the source and target filesystems are not of the same type,
> +or do not support cross-filesystem copy.
>  .SH VERSIONS
>  The
>  .BR copy_file_range ()
> @@ -200,8 +210,11 @@ Areas of the API that weren't clearly defined were clarified and the API bounds
>  are much more strictly checked than on earlier kernels.
>  Applications should target the behaviour and requirements of 5.3 kernels.
>  .PP
> -First support for cross-filesystem copies was introduced in Linux 5.3.
> -Older kernels will return -EXDEV when cross-filesystem copies are attempted.
> +Since Linux 5.12,
> +cross-filesystem copies can be achieved
> +when both filesystems are of the same type,
> +and that filesystem implements support for it.
> +See BUGS for behavior prior to 5.12.
>  .SH CONFORMING TO
>  The
>  .BR copy_file_range ()
> @@ -226,6 +239,12 @@ gives filesystems an opportunity to implement "copy acceleration" techniques,
>  such as the use of reflinks (i.e., two or more inodes that share
>  pointers to the same copy-on-write disk blocks)
>  or server-side-copy (in the case of NFS).
> +.SH BUGS
> +In Linux kernels 5.3 to 5.11,
> +cross-filesystem copies were implemented by the kernel,
> +if the operation was not supported by individual filesystems.
> +However, on some virtual filesystems,
> +the call failed to copy, while still reporting success.

...success, or merely a short copy?

(The rest looks reasonable (at least by c_f_r standards) to me.)

--D

>  .SH EXAMPLES
>  .EX
>  #define _GNU_SOURCE
> -- 
> 2.30.1.721.g45526154a5
> 

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC v4] copy_file_range.2: Update cross-filesystem support for 5.12
  2021-03-04 17:13                                   ` Darrick J. Wong
@ 2021-03-04 18:24                                     ` Alejandro Colomar (man-pages)
  2021-03-04 23:50                                       ` Darrick J. Wong
  0 siblings, 1 reply; 93+ messages in thread
From: Alejandro Colomar (man-pages) @ 2021-03-04 18:24 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: linux-man, Amir Goldstein, Michael Kerrisk, Luis Henriques,
	Steve French, Greg KH, Anna Schumaker, Jeff Layton,
	Miklos Szeredi, Trond Myklebust, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List, Walter Harms

Hi Darrick,

On 3/4/21 6:13 PM, Darrick J. Wong wrote:
> On Thu, Mar 04, 2021 at 10:38:07AM +0100, Alejandro Colomar wrote:
>> +However, on some virtual filesystems,
>> +the call failed to copy, while still reporting success.
> 
> ...success, or merely a short copy?

Okay.

> 
> (The rest looks reasonable (at least by c_f_r standards) to me.)

I'm curious, what does "c_f_r standards" mean? :)

Cheers,

Alex

-- 
Alejandro Colomar
Linux man-pages comaintainer; https://www.kernel.org/doc/man-pages/
http://www.alejandro-colomar.es/

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC v4] copy_file_range.2: Update cross-filesystem support for 5.12
  2021-03-04 18:24                                     ` Alejandro Colomar (man-pages)
@ 2021-03-04 23:50                                       ` Darrick J. Wong
  0 siblings, 0 replies; 93+ messages in thread
From: Darrick J. Wong @ 2021-03-04 23:50 UTC (permalink / raw)
  To: Alejandro Colomar (man-pages)
  Cc: linux-man, Amir Goldstein, Michael Kerrisk, Luis Henriques,
	Steve French, Greg KH, Anna Schumaker, Jeff Layton,
	Miklos Szeredi, Trond Myklebust, Alexander Viro, Darrick J. Wong,
	Dave Chinner, Nicolas Boichat, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Olga Kornievskaia, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel,
	Linux NFS Mailing List, Walter Harms

On Thu, Mar 04, 2021 at 07:24:02PM +0100, Alejandro Colomar (man-pages) wrote:
> Hi Darrick,
> 
> On 3/4/21 6:13 PM, Darrick J. Wong wrote:
> > On Thu, Mar 04, 2021 at 10:38:07AM +0100, Alejandro Colomar wrote:
> > > +However, on some virtual filesystems,
> > > +the call failed to copy, while still reporting success.
> > 
> > ...success, or merely a short copy?
> 
> Okay.
> 
> > 
> > (The rest looks reasonable (at least by c_f_r standards) to me.)
> 
> I'm curious, what does "c_f_r standards" mean? :)

c_f_r is shorthand for "copy_file_range".

As for standards... well... I'll just say that this being the /second/
major shift in behavior reflects our poor community development
processes.  The door to general cross-fs copies should not have been
thrown open with as little testing as it received.  There are legendary
dchinner rants about how obviously broken the generic fallback was when
it was introduced.

There's a reason why we usually wire up new kernel functionality on an
opt-in basis, and that is to foster gradual enablement as QA resources
permit.  It's one thing for maintainers to blow up their own subsystems
in isolation, and an entirely different thing to do it between projects
with no coordination.

Did c_f_r work between an ext4 and an xfs?  I have no idea.  It seemed
to work between xfses of a similar vintage and featureset, at least, but
that's about as much testing as I have ever managed.

--D

> 
> Cheers,
> 
> Alex
> 
> -- 
> Alejandro Colomar
> Linux man-pages comaintainer; https://www.kernel.org/doc/man-pages/
> http://www.alejandro-colomar.es/

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-02-24 10:44                                   ` Nicolas Boichat
@ 2021-04-09  5:23                                     ` Nicolas Boichat
  2021-04-09 13:39                                       ` Luis Henriques
  0 siblings, 1 reply; 93+ messages in thread
From: Nicolas Boichat @ 2021-04-09  5:23 UTC (permalink / raw)
  To: Luis Henriques
  Cc: Olga Kornievskaia, Amir Goldstein, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel, linux-nfs

On Wed, Feb 24, 2021 at 6:44 PM Nicolas Boichat <drinkcat@chromium.org> wrote:
>
> On Wed, Feb 24, 2021 at 6:22 PM Luis Henriques <lhenriques@suse.de> wrote:
> >
> > On Tue, Feb 23, 2021 at 08:00:54PM -0500, Olga Kornievskaia wrote:
> > > On Mon, Feb 22, 2021 at 5:25 AM Luis Henriques <lhenriques@suse.de> wrote:
> > > >
> > > > A regression has been reported by Nicolas Boichat, found while using the
> > > > copy_file_range syscall to copy a tracefs file.  Before commit
> > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > > > kernel would return -EXDEV to userspace when trying to copy a file across
> > > > different filesystems.  After this commit, the syscall doesn't fail anymore
> > > > and instead returns zero (zero bytes copied), as this file's content is
> > > > generated on-the-fly and thus reports a size of zero.
> > > >
> > > > This patch restores some cross-filesystem copy restrictions that existed
> > > > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > > devices").  Filesystems are still allowed to fall-back to the VFS
> > > > generic_copy_file_range() implementation, but that has now to be done
> > > > explicitly.
> > > >
> > > > nfsd is also modified to fall-back into generic_copy_file_range() in case
> > > > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> > > >
> > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > > > Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > > > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> > > > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> > > > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > >
> > > I tested v8 and I believe it works for NFS.
> >
> > Thanks a lot for the testing.  And to everyone else for reviews,
> > feedback,... and patience.
>
> Thanks so much to you!!!
>
> Works here, you can add my
> Tested-by: Nicolas Boichat <drinkcat@chromium.org>

What happened to this patch? It does not seem to have been picked up
yet? Any reason why?

> >
> > I'll now go look into the manpage and see what needs to be changed.
> >
> > Cheers,
> > --
> > Luís

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-04-09  5:23                                     ` Nicolas Boichat
@ 2021-04-09 13:39                                       ` Luis Henriques
  2021-04-09 13:50                                         ` Amir Goldstein
  0 siblings, 1 reply; 93+ messages in thread
From: Luis Henriques @ 2021-04-09 13:39 UTC (permalink / raw)
  To: Nicolas Boichat
  Cc: Olga Kornievskaia, Amir Goldstein, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Darrick J. Wong, Dave Chinner, Greg KH, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel, linux-nfs

Nicolas Boichat <drinkcat@chromium.org> writes:

> On Wed, Feb 24, 2021 at 6:44 PM Nicolas Boichat <drinkcat@chromium.org> wrote:
>>
>> On Wed, Feb 24, 2021 at 6:22 PM Luis Henriques <lhenriques@suse.de> wrote:
>> >
>> > On Tue, Feb 23, 2021 at 08:00:54PM -0500, Olga Kornievskaia wrote:
>> > > On Mon, Feb 22, 2021 at 5:25 AM Luis Henriques <lhenriques@suse.de> wrote:
>> > > >
>> > > > A regression has been reported by Nicolas Boichat, found while using the
>> > > > copy_file_range syscall to copy a tracefs file.  Before commit
>> > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
>> > > > kernel would return -EXDEV to userspace when trying to copy a file across
>> > > > different filesystems.  After this commit, the syscall doesn't fail anymore
>> > > > and instead returns zero (zero bytes copied), as this file's content is
>> > > > generated on-the-fly and thus reports a size of zero.
>> > > >
>> > > > This patch restores some cross-filesystem copy restrictions that existed
>> > > > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>> > > > devices").  Filesystems are still allowed to fall-back to the VFS
>> > > > generic_copy_file_range() implementation, but that has now to be done
>> > > > explicitly.
>> > > >
>> > > > nfsd is also modified to fall-back into generic_copy_file_range() in case
>> > > > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>> > > >
>> > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
>> > > > Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
>> > > > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
>> > > > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
>> > > > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
>> > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
>> > >
>> > > I tested v8 and I believe it works for NFS.
>> >
>> > Thanks a lot for the testing.  And to everyone else for reviews,
>> > feedback,... and patience.
>>
>> Thanks so much to you!!!
>>
>> Works here, you can add my
>> Tested-by: Nicolas Boichat <drinkcat@chromium.org>
>
> What happened to this patch? It does not seem to have been picked up
> yet? Any reason why?

Hmm... good question.  I'm not actually sure who would be picking it.  Al,
maybe...?

Cheers,
-- 
Luis

>
>> >
>> > I'll now go look into the manpage and see what needs to be changed.
>> >
>> > Cheers,
>> > --
>> > Luís


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-04-09 13:39                                       ` Luis Henriques
@ 2021-04-09 13:50                                         ` Amir Goldstein
  2021-04-23  4:40                                           ` Nicolas Boichat
  0 siblings, 1 reply; 93+ messages in thread
From: Amir Goldstein @ 2021-04-09 13:50 UTC (permalink / raw)
  To: Luis Henriques, Darrick J. Wong
  Cc: Nicolas Boichat, Olga Kornievskaia, Jeff Layton, Steve French,
	Miklos Szeredi, Trond Myklebust, Anna Schumaker, Alexander Viro,
	Dave Chinner, Greg KH, Ian Lance Taylor, Luis Lozano,
	Andreas Dilger, Christoph Hellwig, ceph-devel, linux-kernel,
	CIFS, samba-technical, linux-fsdevel, linux-nfs

On Fri, Apr 9, 2021 at 4:39 PM Luis Henriques <lhenriques@suse.de> wrote:
>
> Nicolas Boichat <drinkcat@chromium.org> writes:
>
> > On Wed, Feb 24, 2021 at 6:44 PM Nicolas Boichat <drinkcat@chromium.org> wrote:
> >>
> >> On Wed, Feb 24, 2021 at 6:22 PM Luis Henriques <lhenriques@suse.de> wrote:
> >> >
> >> > On Tue, Feb 23, 2021 at 08:00:54PM -0500, Olga Kornievskaia wrote:
> >> > > On Mon, Feb 22, 2021 at 5:25 AM Luis Henriques <lhenriques@suse.de> wrote:
> >> > > >
> >> > > > A regression has been reported by Nicolas Boichat, found while using the
> >> > > > copy_file_range syscall to copy a tracefs file.  Before commit
> >> > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> >> > > > kernel would return -EXDEV to userspace when trying to copy a file across
> >> > > > different filesystems.  After this commit, the syscall doesn't fail anymore
> >> > > > and instead returns zero (zero bytes copied), as this file's content is
> >> > > > generated on-the-fly and thus reports a size of zero.
> >> > > >
> >> > > > This patch restores some cross-filesystem copy restrictions that existed
> >> > > > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> >> > > > devices").  Filesystems are still allowed to fall-back to the VFS
> >> > > > generic_copy_file_range() implementation, but that has now to be done
> >> > > > explicitly.
> >> > > >
> >> > > > nfsd is also modified to fall-back into generic_copy_file_range() in case
> >> > > > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> >> > > >
> >> > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> >> > > > Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> >> > > > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> >> > > > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> >> > > > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> >> > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> >> > >
> >> > > I tested v8 and I believe it works for NFS.
> >> >
> >> > Thanks a lot for the testing.  And to everyone else for reviews,
> >> > feedback,... and patience.
> >>
> >> Thanks so much to you!!!
> >>
> >> Works here, you can add my
> >> Tested-by: Nicolas Boichat <drinkcat@chromium.org>
> >
> > What happened to this patch? It does not seem to have been picked up
> > yet? Any reason why?
>
> Hmm... good question.  I'm not actually sure who would be picking it.  Al,
> maybe...?
>

Darrick,

Would you mind taking this through your tree in case Al doesn't pick it up?

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-04-09 13:50                                         ` Amir Goldstein
@ 2021-04-23  4:40                                           ` Nicolas Boichat
  2021-05-03  8:54                                             ` Luis Henriques
  0 siblings, 1 reply; 93+ messages in thread
From: Nicolas Boichat @ 2021-04-23  4:40 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Luis Henriques, Darrick J. Wong, Olga Kornievskaia, Jeff Layton,
	Steve French, Miklos Szeredi, Trond Myklebust, Anna Schumaker,
	Alexander Viro, Dave Chinner, Greg KH, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel, linux-nfs

On Fri, Apr 9, 2021 at 9:50 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Fri, Apr 9, 2021 at 4:39 PM Luis Henriques <lhenriques@suse.de> wrote:
> >
> > Nicolas Boichat <drinkcat@chromium.org> writes:
> >
> > > On Wed, Feb 24, 2021 at 6:44 PM Nicolas Boichat <drinkcat@chromium.org> wrote:
> > >>
> > >> On Wed, Feb 24, 2021 at 6:22 PM Luis Henriques <lhenriques@suse.de> wrote:
> > >> >
> > >> > On Tue, Feb 23, 2021 at 08:00:54PM -0500, Olga Kornievskaia wrote:
> > >> > > On Mon, Feb 22, 2021 at 5:25 AM Luis Henriques <lhenriques@suse.de> wrote:
> > >> > > >
> > >> > > > A regression has been reported by Nicolas Boichat, found while using the
> > >> > > > copy_file_range syscall to copy a tracefs file.  Before commit
> > >> > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > >> > > > kernel would return -EXDEV to userspace when trying to copy a file across
> > >> > > > different filesystems.  After this commit, the syscall doesn't fail anymore
> > >> > > > and instead returns zero (zero bytes copied), as this file's content is
> > >> > > > generated on-the-fly and thus reports a size of zero.
> > >> > > >
> > >> > > > This patch restores some cross-filesystem copy restrictions that existed
> > >> > > > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > >> > > > devices").  Filesystems are still allowed to fall-back to the VFS
> > >> > > > generic_copy_file_range() implementation, but that has now to be done
> > >> > > > explicitly.
> > >> > > >
> > >> > > > nfsd is also modified to fall-back into generic_copy_file_range() in case
> > >> > > > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
> > >> > > >
> > >> > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > >> > > > Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
> > >> > > > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> > >> > > > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> > >> > > > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
> > >> > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
> > >> > >
> > >> > > I tested v8 and I believe it works for NFS.
> > >> >
> > >> > Thanks a lot for the testing.  And to everyone else for reviews,
> > >> > feedback,... and patience.
> > >>
> > >> Thanks so much to you!!!
> > >>
> > >> Works here, you can add my
> > >> Tested-by: Nicolas Boichat <drinkcat@chromium.org>
> > >
> > > What happened to this patch? It does not seem to have been picked up
> > > yet? Any reason why?
> >
> > Hmm... good question.  I'm not actually sure who would be picking it.  Al,
> > maybe...?
> >
>
> Darrick,
>
> Would you mind taking this through your tree in case Al doesn't pick it up?

Err, sorry for yet another ping... but it would be good to move
forward with those patches ,-P

Thanks!

> Thanks,
> Amir.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [PATCH v8] vfs: fix copy_file_range regression in cross-fs copies
  2021-04-23  4:40                                           ` Nicolas Boichat
@ 2021-05-03  8:54                                             ` Luis Henriques
  0 siblings, 0 replies; 93+ messages in thread
From: Luis Henriques @ 2021-05-03  8:54 UTC (permalink / raw)
  To: Nicolas Boichat
  Cc: Amir Goldstein, Darrick J. Wong, Olga Kornievskaia, Jeff Layton,
	Steve French, Miklos Szeredi, Trond Myklebust, Anna Schumaker,
	Alexander Viro, Dave Chinner, Greg KH, Ian Lance Taylor,
	Luis Lozano, Andreas Dilger, Christoph Hellwig, ceph-devel,
	linux-kernel, CIFS, samba-technical, linux-fsdevel, linux-nfs

Nicolas Boichat <drinkcat@chromium.org> writes:

> On Fri, Apr 9, 2021 at 9:50 PM Amir Goldstein <amir73il@gmail.com> wrote:
>>
>> On Fri, Apr 9, 2021 at 4:39 PM Luis Henriques <lhenriques@suse.de> wrote:
>> >
>> > Nicolas Boichat <drinkcat@chromium.org> writes:
>> >
>> > > On Wed, Feb 24, 2021 at 6:44 PM Nicolas Boichat <drinkcat@chromium.org> wrote:
>> > >>
>> > >> On Wed, Feb 24, 2021 at 6:22 PM Luis Henriques <lhenriques@suse.de> wrote:
>> > >> >
>> > >> > On Tue, Feb 23, 2021 at 08:00:54PM -0500, Olga Kornievskaia wrote:
>> > >> > > On Mon, Feb 22, 2021 at 5:25 AM Luis Henriques <lhenriques@suse.de> wrote:
>> > >> > > >
>> > >> > > > A regression has been reported by Nicolas Boichat, found while using the
>> > >> > > > copy_file_range syscall to copy a tracefs file.  Before commit
>> > >> > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
>> > >> > > > kernel would return -EXDEV to userspace when trying to copy a file across
>> > >> > > > different filesystems.  After this commit, the syscall doesn't fail anymore
>> > >> > > > and instead returns zero (zero bytes copied), as this file's content is
>> > >> > > > generated on-the-fly and thus reports a size of zero.
>> > >> > > >
>> > >> > > > This patch restores some cross-filesystem copy restrictions that existed
>> > >> > > > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>> > >> > > > devices").  Filesystems are still allowed to fall-back to the VFS
>> > >> > > > generic_copy_file_range() implementation, but that has now to be done
>> > >> > > > explicitly.
>> > >> > > >
>> > >> > > > nfsd is also modified to fall-back into generic_copy_file_range() in case
>> > >> > > > vfs_copy_file_range() fails with -EOPNOTSUPP or -EXDEV.
>> > >> > > >
>> > >> > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
>> > >> > > > Link: https://lore.kernel.org/linux-fsdevel/20210212044405.4120619-1-drinkcat@chromium.org/
>> > >> > > > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
>> > >> > > > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
>> > >> > > > Reported-by: Nicolas Boichat <drinkcat@chromium.org>
>> > >> > > > Signed-off-by: Luis Henriques <lhenriques@suse.de>
>> > >> > >
>> > >> > > I tested v8 and I believe it works for NFS.
>> > >> >
>> > >> > Thanks a lot for the testing.  And to everyone else for reviews,
>> > >> > feedback,... and patience.
>> > >>
>> > >> Thanks so much to you!!!
>> > >>
>> > >> Works here, you can add my
>> > >> Tested-by: Nicolas Boichat <drinkcat@chromium.org>
>> > >
>> > > What happened to this patch? It does not seem to have been picked up
>> > > yet? Any reason why?
>> >
>> > Hmm... good question.  I'm not actually sure who would be picking it.  Al,
>> > maybe...?
>> >
>>
>> Darrick,
>>
>> Would you mind taking this through your tree in case Al doesn't pick it up?
>
> Err, sorry for yet another ping... but it would be good to move
> forward with those patches ,-P

Yeah, I'm not sure what else to do, or who else to bug regarding this :-/

Cheers,
-- 
Luis

^ permalink raw reply	[flat|nested] 93+ messages in thread

end of thread, other threads:[~2021-05-03  8:52 UTC | newest]

Thread overview: 93+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <CAOQ4uxiFGjdvX2-zh5o46pn7RZhvbGHH0wpzLPuPOom91FwWeQ@mail.gmail.com>
2021-02-15 15:43 ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Luis Henriques
2021-02-15 16:02   ` Trond Myklebust
2021-02-16  0:25     ` Steve French
2021-02-15 16:34   ` Amir Goldstein
2021-02-15 16:53     ` Trond Myklebust
2021-02-15 17:24       ` Amir Goldstein
2021-02-15 18:57         ` Trond Myklebust
2021-02-15 19:43           ` Amir Goldstein
2021-02-16 11:17             ` Luis Henriques
2021-02-16 11:28               ` gregkh
2021-02-16 12:01                 ` Luis Henriques
2021-02-16 12:08                   ` Greg KH
2021-02-16 13:51               ` Amir Goldstein
2021-02-16 16:42                 ` Luis Henriques
2021-02-16 17:44                   ` Amir Goldstein
2021-02-16 18:55                     ` Luis Henriques
2021-02-16 19:20                       ` Amir Goldstein
2021-02-16 19:27                         ` Anna Schumaker
2021-02-16 19:31                           ` Steve French
2021-02-16 19:40                             ` Amir Goldstein
2021-02-16 21:15                               ` Steve French
2021-02-17  8:08                                 ` Amir Goldstein
2021-02-17 17:26                                   ` [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies Luis Henriques
2021-02-17 20:47                                     ` Amir Goldstein
2021-02-18  0:56                                     ` Nicolas Boichat
2021-02-18  5:32                                     ` Olga Kornievskaia
2021-02-18  6:47                                       ` Amir Goldstein
2021-02-18 16:28                                         ` Olga Kornievskaia
2021-02-18  7:43                                     ` Christoph Hellwig
2021-02-18  0:50                                   ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Andreas Dilger
2021-02-18  7:34                                     ` gregkh
2021-02-16 18:54                 ` Andreas Dilger
2021-02-17  4:45   ` Nicolas Boichat
2021-02-18  7:42   ` Christoph Hellwig
2021-02-18  9:10     ` Amir Goldstein
2021-02-18 10:29       ` Luis Henriques
2021-02-18 12:15         ` Luis Henriques
2021-02-18 12:49           ` Amir Goldstein
2021-02-18 14:36             ` [PATCH v4] vfs: fix copy_file_range regression in cross-fs copies Luis Henriques
2021-02-18 14:58               ` Amir Goldstein
2021-02-18 15:17                 ` [PATCH v5] " Luis Henriques
2021-02-18 15:53                   ` Amir Goldstein
2021-02-18 16:35                     ` Luis Henriques
2021-02-18 17:18                       ` [PATCH v6] " Luis Henriques
2021-02-19 21:18                         ` Olga Kornievskaia
2021-02-19 21:52                           ` Amir Goldstein
2021-02-21 19:58                           ` [PATCH v7] " Luis Henriques
2021-02-22  3:00                             ` Nicolas Boichat
2021-02-22 10:24                             ` [PATCH v8] " Luis Henriques
2021-02-22 10:46                               ` Amir Goldstein
2021-02-22 16:25                               ` dai.ngo
2021-02-23 10:32                                 ` Luis Henriques
2021-02-23 15:28                                   ` Amir Goldstein
2021-02-23 15:29                                   ` dai.ngo
2021-02-23 16:02                                     ` dai.ngo
2021-02-23 16:47                                       ` Amir Goldstein
2021-02-23 16:57                                         ` dai.ngo
     [not found]                                           ` <e3eed18b-fc7e-e687-608b-7f662017329c@oracle.com>
2021-02-23 17:33                                             ` Amir Goldstein
2021-02-24  0:13                                               ` dai.ngo
2021-02-23 17:56                                           ` Luis Henriques
2021-02-23 17:13                                       ` Olga Kornievskaia
2021-02-24  1:00                               ` Olga Kornievskaia
2021-02-24 10:23                                 ` Luis Henriques
2021-02-24 10:44                                   ` Nicolas Boichat
2021-04-09  5:23                                     ` Nicolas Boichat
2021-04-09 13:39                                       ` Luis Henriques
2021-04-09 13:50                                         ` Amir Goldstein
2021-04-23  4:40                                           ` Nicolas Boichat
2021-05-03  8:54                                             ` Luis Henriques
2021-02-24 14:23                               ` [PATCH] copy_file_range.2: Kernel v5.12 updates Luis Henriques
2021-02-24 16:10                                 ` Amir Goldstein
2021-02-25 10:21                                   ` Luis Henriques
2021-02-26 10:13                                     ` Alejandro Colomar (man-pages)
2021-02-26 10:34                                       ` Amir Goldstein
2021-02-26 11:15                                         ` Alejandro Colomar (man-pages)
2021-02-26 13:59                                           ` Jeff Layton
2021-02-26 21:26                                             ` Alejandro Colomar (man-pages)
2021-02-26 22:18                                   ` Alejandro Colomar (man-pages)
2021-02-27  5:41                                     ` Amir Goldstein
2021-02-27 12:20                                       ` Alejandro Colomar (man-pages)
2021-02-27 13:49                                         ` [RFC v2] copy_file_range.2: Update cross-filesystem support for 5.12 Alejandro Colomar
2021-02-27 16:00                                           ` Amir Goldstein
2021-02-27 23:08                                       ` [PATCH] copy_file_range.2: Kernel v5.12 updates Steve French
2021-02-28  7:35                                         ` Amir Goldstein
2021-02-28 22:25                                           ` Steve French
2021-03-01  6:18                                             ` Amir Goldstein
2021-03-01 14:41                                 ` [RFC v3] copy_file_range.2: Update cross-filesystem support for 5.12 Alejandro Colomar
2021-03-01 14:58                                   ` Amir Goldstein
2021-03-04  9:38                                 ` [RFC v4] " Alejandro Colomar
2021-03-04 17:13                                   ` Darrick J. Wong
2021-03-04 18:24                                     ` Alejandro Colomar (man-pages)
2021-03-04 23:50                                       ` Darrick J. Wong
2021-02-18 20:41       ` [PATCH v2] vfs: prevent copy_file_range to copy across devices Steve French

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).