All of lore.kernel.org
 help / color / mirror / Atom feed
* [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
@ 2016-10-12 13:33 Miklos Szeredi
  2016-10-13 18:45 ` Amir Goldstein
  2016-10-20 20:46 ` Vivek Goyal
  0 siblings, 2 replies; 14+ messages in thread
From: Miklos Szeredi @ 2016-10-12 13:33 UTC (permalink / raw)
  To: linux-unionfs
  Cc: linux-fsdevel, linux-kernel, Jeremy Eder, David Howells,
	Ratna Bolla, Gou Rao, Vinod Jayaraman, Al Viro, Vivek Goyal,
	Dave Chinner

This is a proof of concept patch to fix the following.

/ovl is in overlay mount and /ovl/foo exists on the lower layer only.

 rofd = open("/ovl/foo", O_RDONLY);
 rwfd = open("/ovl/foo", O_WRONLY); /* this causes copy up */
 write(rwfd, "bar", 3);
 read(rofd, buf, 3); 
 assert(memcmp(buf, "bar", 3) == 0);

Similar problem exists with an MAP_SHARED mmap created from rofd.

While this has only caused few problems (yum/dnf failure is the only one I know
of) and easily worked around in userspace, many see it as a proof that overlayfs
can never be a proper "POSIX" filesystem.

To quell those worries, here's a simple patch that should address the above.

The only VFS change is that f_op is initialized from f_path.dentry->d_inode
instead of file_inode(filp) in open.  The effect of this is that overlayfs can
intercept open and other file operations, while the file still effectively
belongs to the underlying fs.

The patch does not give up on the nice properties of overlayfs, like sharing the
page cache with the underlying files.  It does cause copy up in one case where
previously there wasn't one and that's the O_RDONLY/MAP_SHARED case.  I haven't
done much research into this, but running some tests in chroot didn't trigger
this.

Comments, testing are welcome.

Thanks,
Miklos

---
 fs/internal.h            |   1 -
 fs/open.c                |   2 +-
 fs/overlayfs/dir.c       |   2 +-
 fs/overlayfs/inode.c     | 175 +++++++++++++++++++++++++++++++++++++++++++----
 fs/overlayfs/overlayfs.h |   2 +-
 fs/overlayfs/super.c     |   7 +-
 include/linux/fs.h       |   1 +
 7 files changed, 169 insertions(+), 21 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index f4da3341b4a3..361ba1c12698 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -112,7 +112,6 @@ extern long do_handle_open(int mountdirfd,
 			   struct file_handle __user *ufh, int open_flag);
 extern int open_check_o_direct(struct file *f);
 extern int vfs_open(const struct path *, struct file *, const struct cred *);
-extern struct file *filp_clone_open(struct file *);
 
 /*
  * inode.c
diff --git a/fs/open.c b/fs/open.c
index a7719cfb7257..e21f1a6f77b7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -728,7 +728,7 @@ static int do_dentry_open(struct file *f,
 	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
 		f->f_mode |= FMODE_ATOMIC_POS;
 
-	f->f_op = fops_get(inode->i_fop);
+	f->f_op = fops_get(f->f_path.dentry->d_inode->i_fop);
 	if (unlikely(WARN_ON(!f->f_op))) {
 		error = -ENODEV;
 		goto cleanup_all;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 5f90ddf778ba..1ea511be6e37 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -534,7 +534,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
 		goto out;
 
 	err = -ENOMEM;
-	inode = ovl_new_inode(dentry->d_sb, mode);
+	inode = ovl_new_inode(dentry->d_sb, mode, rdev);
 	if (!inode)
 		goto out_drop_write;
 
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c18d6a4ff456..744c8eb7e947 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -11,6 +11,9 @@
 #include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/file.h>
 #include "overlayfs.h"
 
 static int ovl_copy_up_truncate(struct dentry *dentry)
@@ -381,7 +384,154 @@ static const struct inode_operations ovl_symlink_inode_operations = {
 	.update_time	= ovl_update_time,
 };
 
-static void ovl_fill_inode(struct inode *inode, umode_t mode)
+static const struct file_operations ovl_file_operations;
+
+static const struct file_operations *ovl_real_fop(struct file *file)
+{
+	return file_inode(file)->i_fop;
+}
+
+static int ovl_open(struct inode *realinode, struct file *file)
+{
+	int ret = 0;
+	const struct file_operations *fop = ovl_real_fop(file);
+	bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
+
+	/* Don't intercept upper file operations */
+	if (isupper)
+		replace_fops(file, fop);
+
+	if (fop->open)
+		ret = fop->open(realinode, file);
+
+	if (!isupper && WARN_ON(file->f_op != &ovl_file_operations)) {
+		if (fop->release)
+			fop->release(realinode, file);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static int ovl_release(struct inode *realinode, struct file *file)
+{
+	const struct file_operations *fop = ovl_real_fop(file);
+
+	if (fop->release)
+		return fop->release(realinode, file);
+
+	return 0;
+}
+
+static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
+	ssize_t ret = -EINVAL;
+
+	if (likely(!isupper)) {
+		const struct file_operations *fop = ovl_real_fop(file);
+
+		if (likely(fop->read_iter))
+			ret = fop->read_iter(iocb, to);
+	} else {
+		struct file *upperfile = filp_clone_open(file);
+
+		if (IS_ERR(upperfile)) {
+			ret = PTR_ERR(upperfile);
+		} else {
+			ret = vfs_iter_read(upperfile, to, &iocb->ki_pos);
+			fput(upperfile);
+		}
+	}
+
+	return ret;
+}
+
+static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
+{
+	const struct file_operations *fop = ovl_real_fop(file);
+
+	if (fop->llseek)
+		return fop->llseek(file, offset, whence);
+
+	return -EINVAL;
+}
+
+static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	const struct file_operations *fop = ovl_real_fop(file);
+	bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
+	int err;
+
+	/*
+	 * Treat MAP_SHARED as hint about future writes to the file (through
+	 * another file descriptor).  Caller might not have had such an intent,
+	 * but we hope MAP_PRIVATE will be used in most such cases.
+	 *
+	 * If we don't copy up now and the file is modified, it becomes really
+	 * difficult to change the mapping to match that of the file's content
+	 * later.
+	 */
+	if (unlikely(isupper || vma->vm_flags & VM_MAYSHARE)) {
+		if (!isupper) {
+			err = ovl_copy_up(file->f_path.dentry);
+			if (err)
+				goto out;
+		}
+
+		file = filp_clone_open(file);
+		err = PTR_ERR(file);
+		if (IS_ERR(file))
+			goto out;
+
+		fput(vma->vm_file);
+		/* transfer ref: */
+		vma->vm_file = file;
+		fop = file->f_op;
+	}
+	err = -EINVAL;
+	if (fop->mmap)
+		err = fop->mmap(file, vma);
+out:
+	return err;
+}
+
+static int ovl_fsync(struct file *file, loff_t start, loff_t end,
+		     int datasync)
+{
+	bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
+	int ret = -EINVAL;
+
+	if (likely(!isupper)) {
+		const struct file_operations *fop = ovl_real_fop(file);
+
+		if (likely(fop->fsync))
+		       ret = fop->fsync(file, start, end, datasync);
+	} else {
+		struct file *upperfile = filp_clone_open(file);
+
+		if (IS_ERR(upperfile)) {
+			ret = PTR_ERR(upperfile);
+		} else {
+			ret = vfs_fsync_range(upperfile, start, end, datasync);
+			fput(upperfile);
+		}
+	}
+
+	return ret;
+}
+
+static const struct file_operations ovl_file_operations = {
+	.open		= ovl_open,
+	.release	= ovl_release,
+	.read_iter	= ovl_read_iter,
+	.llseek		= ovl_llseek,
+	.mmap		= ovl_mmap,
+	.fsync		= ovl_fsync,
+};
+
+static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
 {
 	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
@@ -390,8 +540,12 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
 	inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
 #endif
 
-	mode &= S_IFMT;
-	switch (mode) {
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &ovl_file_inode_operations;
+		inode->i_fop = &ovl_file_operations;
+		break;
+
 	case S_IFDIR:
 		inode->i_op = &ovl_dir_inode_operations;
 		inode->i_fop = &ovl_dir_operations;
@@ -402,26 +556,19 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
 		break;
 
 	default:
-		WARN(1, "illegal file type: %i\n", mode);
-		/* Fall through */
-
-	case S_IFREG:
-	case S_IFSOCK:
-	case S_IFBLK:
-	case S_IFCHR:
-	case S_IFIFO:
 		inode->i_op = &ovl_file_inode_operations;
+		init_special_inode(inode, mode, rdev);
 		break;
 	}
 }
 
-struct inode *ovl_new_inode(struct super_block *sb, umode_t mode)
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
 {
 	struct inode *inode;
 
 	inode = new_inode(sb);
 	if (inode)
-		ovl_fill_inode(inode, mode);
+		ovl_fill_inode(inode, mode, rdev);
 
 	return inode;
 }
@@ -445,7 +592,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode)
 	inode = iget5_locked(sb, (unsigned long) realinode,
 			     ovl_inode_test, ovl_inode_set, realinode);
 	if (inode && inode->i_state & I_NEW) {
-		ovl_fill_inode(inode, realinode->i_mode);
+		ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
 		set_nlink(inode, realinode->i_nlink);
 		unlock_new_inode(inode);
 	}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e218e741cb99..95d0d86c2d54 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -195,7 +195,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
 int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
 bool ovl_is_private_xattr(const char *name);
 
-struct inode *ovl_new_inode(struct super_block *sb, umode_t mode);
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
 struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
 static inline void ovl_copyattr(struct inode *from, struct inode *to)
 {
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7e3f0127fc1a..daed68d13467 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -305,7 +305,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
 {
 	struct dentry *real;
 
-	if (d_is_dir(dentry)) {
+	if (!d_is_reg(dentry)) {
 		if (!inode || inode == d_inode(dentry))
 			return dentry;
 		goto bug;
@@ -579,7 +579,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 		if (upperdentry && !d_is_dir(upperdentry)) {
 			inode = ovl_get_inode(dentry->d_sb, realinode);
 		} else {
-			inode = ovl_new_inode(dentry->d_sb, realinode->i_mode);
+			inode = ovl_new_inode(dentry->d_sb, realinode->i_mode,
+					      realinode->i_rdev);
 			if (inode)
 				ovl_inode_init(inode, realinode, !!upperdentry);
 		}
@@ -1292,7 +1293,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	if (!oe)
 		goto out_put_cred;
 
-	root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR));
+	root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
 	if (!root_dentry)
 		goto out_free_oe;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bc65d5918140..cc7d1203b846 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2342,6 +2342,7 @@ extern struct file *filp_open(const char *, int, umode_t);
 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
 				   const char *, int, umode_t);
 extern struct file * dentry_open(const struct path *, int, const struct cred *);
+extern struct file *filp_clone_open(struct file *);
 extern int filp_close(struct file *, fl_owner_t id);
 
 extern struct filename *getname_flags(const char __user *, int, int *);

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-12 13:33 [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up Miklos Szeredi
@ 2016-10-13 18:45 ` Amir Goldstein
  2016-10-20 20:46 ` Vivek Goyal
  1 sibling, 0 replies; 14+ messages in thread
From: Amir Goldstein @ 2016-10-13 18:45 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-unionfs, linux-fsdevel, linux-kernel, Jeremy Eder,
	David Howells, Ratna Bolla, Gou Rao, Vinod Jayaraman, Al Viro,
	Vivek Goyal, Dave Chinner

On Wed, Oct 12, 2016 at 4:33 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> This is a proof of concept patch to fix the following.
>
> /ovl is in overlay mount and /ovl/foo exists on the lower layer only.
>
>  rofd = open("/ovl/foo", O_RDONLY);
>  rwfd = open("/ovl/foo", O_WRONLY); /* this causes copy up */
>  write(rwfd, "bar", 3);
>  read(rofd, buf, 3);
>  assert(memcmp(buf, "bar", 3) == 0);
>
> Similar problem exists with an MAP_SHARED mmap created from rofd.
>
> While this has only caused few problems (yum/dnf failure is the only one I know
> of) and easily worked around in userspace, many see it as a proof that overlayfs
> can never be a proper "POSIX" filesystem.
>
> To quell those worries, here's a simple patch that should address the above.
>
> The only VFS change is that f_op is initialized from f_path.dentry->d_inode
> instead of file_inode(filp) in open.  The effect of this is that overlayfs can
> intercept open and other file operations, while the file still effectively
> belongs to the underlying fs.
>
> The patch does not give up on the nice properties of overlayfs, like sharing the
> page cache with the underlying files.  It does cause copy up in one case where
> previously there wasn't one and that's the O_RDONLY/MAP_SHARED case.  I haven't
> done much research into this, but running some tests in chroot didn't trigger
> this.
>
> Comments, testing are welcome.

I ran the -g quick set of xfstest (overlay over ext4)
and there are no regressions - the same set of tests are failing.

I am trying to get to adding more xfstests for overlay and/or to
improve the way that the generic tests run on overlay.

>
> Thanks,
> Miklos
>
> ---
>  fs/internal.h            |   1 -
>  fs/open.c                |   2 +-
>  fs/overlayfs/dir.c       |   2 +-
>  fs/overlayfs/inode.c     | 175 +++++++++++++++++++++++++++++++++++++++++++----
>  fs/overlayfs/overlayfs.h |   2 +-
>  fs/overlayfs/super.c     |   7 +-
>  include/linux/fs.h       |   1 +
>  7 files changed, 169 insertions(+), 21 deletions(-)
>
> diff --git a/fs/internal.h b/fs/internal.h
> index f4da3341b4a3..361ba1c12698 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -112,7 +112,6 @@ extern long do_handle_open(int mountdirfd,
>                            struct file_handle __user *ufh, int open_flag);
>  extern int open_check_o_direct(struct file *f);
>  extern int vfs_open(const struct path *, struct file *, const struct cred *);
> -extern struct file *filp_clone_open(struct file *);
>
>  /*
>   * inode.c
> diff --git a/fs/open.c b/fs/open.c
> index a7719cfb7257..e21f1a6f77b7 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -728,7 +728,7 @@ static int do_dentry_open(struct file *f,
>         if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
>                 f->f_mode |= FMODE_ATOMIC_POS;
>
> -       f->f_op = fops_get(inode->i_fop);
> +       f->f_op = fops_get(f->f_path.dentry->d_inode->i_fop);

I said it before, I think we need to find a good macro name for this construct
maybe file_dentry() := f->f_path.dentry ?
and the few places that really need d_real should use a new macro
file_real_dentry()??


>         if (unlikely(WARN_ON(!f->f_op))) {
>                 error = -ENODEV;
>                 goto cleanup_all;
> diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
> index 5f90ddf778ba..1ea511be6e37 100644
> --- a/fs/overlayfs/dir.c
> +++ b/fs/overlayfs/dir.c
> @@ -534,7 +534,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
>                 goto out;
>
>         err = -ENOMEM;
> -       inode = ovl_new_inode(dentry->d_sb, mode);
> +       inode = ovl_new_inode(dentry->d_sb, mode, rdev);
>         if (!inode)
>                 goto out_drop_write;
>
> diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
> index c18d6a4ff456..744c8eb7e947 100644
> --- a/fs/overlayfs/inode.c
> +++ b/fs/overlayfs/inode.c
> @@ -11,6 +11,9 @@
>  #include <linux/slab.h>
>  #include <linux/xattr.h>
>  #include <linux/posix_acl.h>
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/file.h>
>  #include "overlayfs.h"
>
>  static int ovl_copy_up_truncate(struct dentry *dentry)
> @@ -381,7 +384,154 @@ static const struct inode_operations ovl_symlink_inode_operations = {
>         .update_time    = ovl_update_time,
>  };
>
> -static void ovl_fill_inode(struct inode *inode, umode_t mode)
> +static const struct file_operations ovl_file_operations;
> +
> +static const struct file_operations *ovl_real_fop(struct file *file)
> +{
> +       return file_inode(file)->i_fop;
> +}
> +
> +static int ovl_open(struct inode *realinode, struct file *file)
> +{
> +       int ret = 0;
> +       const struct file_operations *fop = ovl_real_fop(file);
> +       bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> +
> +       /* Don't intercept upper file operations */
> +       if (isupper)
> +               replace_fops(file, fop);
> +
> +       if (fop->open)
> +               ret = fop->open(realinode, file);
> +
> +       if (!isupper && WARN_ON(file->f_op != &ovl_file_operations)) {
> +               if (fop->release)
> +                       fop->release(realinode, file);
> +               return -EINVAL;
> +       }
> +
> +       return ret;
> +}
> +
> +static int ovl_release(struct inode *realinode, struct file *file)
> +{
> +       const struct file_operations *fop = ovl_real_fop(file);
> +
> +       if (fop->release)
> +               return fop->release(realinode, file);
> +
> +       return 0;
> +}
> +
> +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +       struct file *file = iocb->ki_filp;
> +       bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> +       ssize_t ret = -EINVAL;
> +
> +       if (likely(!isupper)) {
> +               const struct file_operations *fop = ovl_real_fop(file);
> +
> +               if (likely(fop->read_iter))
> +                       ret = fop->read_iter(iocb, to);
> +       } else {
> +               struct file *upperfile = filp_clone_open(file);
> +
> +               if (IS_ERR(upperfile)) {
> +                       ret = PTR_ERR(upperfile);
> +               } else {
> +                       ret = vfs_iter_read(upperfile, to, &iocb->ki_pos);
> +                       fput(upperfile);
> +               }
> +       }
> +
> +       return ret;
> +}
> +
> +static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
> +{
> +       const struct file_operations *fop = ovl_real_fop(file);
> +
> +       if (fop->llseek)
> +               return fop->llseek(file, offset, whence);
> +
> +       return -EINVAL;
> +}
> +
> +static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +       const struct file_operations *fop = ovl_real_fop(file);
> +       bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> +       int err;
> +
> +       /*
> +        * Treat MAP_SHARED as hint about future writes to the file (through
> +        * another file descriptor).  Caller might not have had such an intent,
> +        * but we hope MAP_PRIVATE will be used in most such cases.
> +        *
> +        * If we don't copy up now and the file is modified, it becomes really
> +        * difficult to change the mapping to match that of the file's content
> +        * later.
> +        */
> +       if (unlikely(isupper || vma->vm_flags & VM_MAYSHARE)) {
> +               if (!isupper) {
> +                       err = ovl_copy_up(file->f_path.dentry);
> +                       if (err)
> +                               goto out;
> +               }
> +
> +               file = filp_clone_open(file);
> +               err = PTR_ERR(file);
> +               if (IS_ERR(file))
> +                       goto out;
> +
> +               fput(vma->vm_file);
> +               /* transfer ref: */
> +               vma->vm_file = file;
> +               fop = file->f_op;
> +       }
> +       err = -EINVAL;
> +       if (fop->mmap)
> +               err = fop->mmap(file, vma);
> +out:
> +       return err;
> +}
> +
> +static int ovl_fsync(struct file *file, loff_t start, loff_t end,
> +                    int datasync)

I'm confused. Can fsync be called on a RO file?
I do not see that it can't, but I wonder what is the rational.

> +{
> +       bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> +       int ret = -EINVAL;
> +
> +       if (likely(!isupper)) {
> +               const struct file_operations *fop = ovl_real_fop(file);
> +
> +               if (likely(fop->fsync))
> +                      ret = fop->fsync(file, start, end, datasync);
> +       } else {
> +               struct file *upperfile = filp_clone_open(file);
> +
> +               if (IS_ERR(upperfile)) {
> +                       ret = PTR_ERR(upperfile);
> +               } else {
> +                       ret = vfs_fsync_range(upperfile, start, end, datasync);
> +                       fput(upperfile);
> +               }
> +       }
> +
> +       return ret;
> +}
> +
> +static const struct file_operations ovl_file_operations = {
> +       .open           = ovl_open,
> +       .release        = ovl_release,
> +       .read_iter      = ovl_read_iter,
> +       .llseek         = ovl_llseek,
> +       .mmap           = ovl_mmap,
> +       .fsync          = ovl_fsync,
> +};
> +
> +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
>  {
>         inode->i_ino = get_next_ino();
>         inode->i_mode = mode;
> @@ -390,8 +540,12 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
>         inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
>  #endif
>
> -       mode &= S_IFMT;
> -       switch (mode) {
> +       switch (mode & S_IFMT) {
> +       case S_IFREG:
> +               inode->i_op = &ovl_file_inode_operations;
> +               inode->i_fop = &ovl_file_operations;
> +               break;
> +
>         case S_IFDIR:
>                 inode->i_op = &ovl_dir_inode_operations;
>                 inode->i_fop = &ovl_dir_operations;
> @@ -402,26 +556,19 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
>                 break;
>
>         default:
> -               WARN(1, "illegal file type: %i\n", mode);
> -               /* Fall through */
> -
> -       case S_IFREG:
> -       case S_IFSOCK:
> -       case S_IFBLK:
> -       case S_IFCHR:
> -       case S_IFIFO:
>                 inode->i_op = &ovl_file_inode_operations;
> +               init_special_inode(inode, mode, rdev);
>                 break;
>         }
>  }
>
> -struct inode *ovl_new_inode(struct super_block *sb, umode_t mode)
> +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
>  {
>         struct inode *inode;
>
>         inode = new_inode(sb);
>         if (inode)
> -               ovl_fill_inode(inode, mode);
> +               ovl_fill_inode(inode, mode, rdev);
>
>         return inode;
>  }
> @@ -445,7 +592,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode)
>         inode = iget5_locked(sb, (unsigned long) realinode,
>                              ovl_inode_test, ovl_inode_set, realinode);
>         if (inode && inode->i_state & I_NEW) {
> -               ovl_fill_inode(inode, realinode->i_mode);
> +               ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
>                 set_nlink(inode, realinode->i_nlink);
>                 unlock_new_inode(inode);
>         }
> diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
> index e218e741cb99..95d0d86c2d54 100644
> --- a/fs/overlayfs/overlayfs.h
> +++ b/fs/overlayfs/overlayfs.h
> @@ -195,7 +195,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
>  int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
>  bool ovl_is_private_xattr(const char *name);
>
> -struct inode *ovl_new_inode(struct super_block *sb, umode_t mode);
> +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
>  struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
>  static inline void ovl_copyattr(struct inode *from, struct inode *to)
>  {
> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> index 7e3f0127fc1a..daed68d13467 100644
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -305,7 +305,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
>  {
>         struct dentry *real;
>
> -       if (d_is_dir(dentry)) {
> +       if (!d_is_reg(dentry)) {
>                 if (!inode || inode == d_inode(dentry))
>                         return dentry;
>                 goto bug;
> @@ -579,7 +579,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
>                 if (upperdentry && !d_is_dir(upperdentry)) {

Shouldn't this be && !d_is_reg(dentry) to align with the change in
ovl_d_real() above?


>                         inode = ovl_get_inode(dentry->d_sb, realinode);
>                 } else {
> -                       inode = ovl_new_inode(dentry->d_sb, realinode->i_mode);
> +                       inode = ovl_new_inode(dentry->d_sb, realinode->i_mode,
> +                                             realinode->i_rdev);
>                         if (inode)
>                                 ovl_inode_init(inode, realinode, !!upperdentry);
>                 }
> @@ -1292,7 +1293,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
>         if (!oe)
>                 goto out_put_cred;
>
> -       root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR));
> +       root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
>         if (!root_dentry)
>                 goto out_free_oe;
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index bc65d5918140..cc7d1203b846 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2342,6 +2342,7 @@ extern struct file *filp_open(const char *, int, umode_t);
>  extern struct file *file_open_root(struct dentry *, struct vfsmount *,
>                                    const char *, int, umode_t);
>  extern struct file * dentry_open(const struct path *, int, const struct cred *);
> +extern struct file *filp_clone_open(struct file *);
>  extern int filp_close(struct file *, fl_owner_t id);
>
>  extern struct filename *getname_flags(const char __user *, int, int *);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-unionfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-12 13:33 [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up Miklos Szeredi
  2016-10-13 18:45 ` Amir Goldstein
@ 2016-10-20 20:46 ` Vivek Goyal
  2016-10-20 20:54   ` Vivek Goyal
  2016-10-21  9:13   ` Amir Goldstein
  1 sibling, 2 replies; 14+ messages in thread
From: Vivek Goyal @ 2016-10-20 20:46 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-unionfs, linux-fsdevel, linux-kernel, Jeremy Eder,
	David Howells, Ratna Bolla, Gou Rao, Vinod Jayaraman, Al Viro,
	Dave Chinner

On Wed, Oct 12, 2016 at 03:33:26PM +0200, Miklos Szeredi wrote:
> This is a proof of concept patch to fix the following.
> 
> /ovl is in overlay mount and /ovl/foo exists on the lower layer only.
> 
>  rofd = open("/ovl/foo", O_RDONLY);
>  rwfd = open("/ovl/foo", O_WRONLY); /* this causes copy up */
>  write(rwfd, "bar", 3);
>  read(rofd, buf, 3); 
>  assert(memcmp(buf, "bar", 3) == 0);
> 
> Similar problem exists with an MAP_SHARED mmap created from rofd.
> 
> While this has only caused few problems (yum/dnf failure is the only one I know
> of) and easily worked around in userspace, many see it as a proof that overlayfs
> can never be a proper "POSIX" filesystem.
> 
> To quell those worries, here's a simple patch that should address the above.
> 
> The only VFS change is that f_op is initialized from f_path.dentry->d_inode
> instead of file_inode(filp) in open.  The effect of this is that overlayfs can
> intercept open and other file operations, while the file still effectively
> belongs to the underlying fs.
> 
> The patch does not give up on the nice properties of overlayfs, like sharing the
> page cache with the underlying files.  It does cause copy up in one case where
> previously there wasn't one and that's the O_RDONLY/MAP_SHARED case.  I haven't
> done much research into this, but running some tests in chroot didn't trigger
> this.
> 
> Comments, testing are welcome.

Hi Miklos, 

This looks like a very interesting idea. In fact once file has been copied
up and writen to, and if I do fstat(rofd), it shows the size of copied up
file but one can read the contents. So fixing that anomaly would be nice.

Hopefully O_RDONLY/MAP_SHARED is not a common case and we get away with
this forced copy up penalty.

[..]
> +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +	struct file *file = iocb->ki_filp;
> +	bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> +	ssize_t ret = -EINVAL;
> +
> +	if (likely(!isupper)) {
> +		const struct file_operations *fop = ovl_real_fop(file);
> +
> +		if (likely(fop->read_iter))
> +			ret = fop->read_iter(iocb, to);
> +	} else {
> +		struct file *upperfile = filp_clone_open(file);
> +

IIUC, every read of lower file will call filp_clone_open(). Looking at the
code of filp_clone_open(), I am concerned about the overhead of this call.
Is it significant? Don't want to be paying too much of penalty for read
operation on lower files. That would be a common case for containers.

BTW, I did a quick testing. Using docker launched a fedora container and
called "dnf update" inside that. And later I noticed following on serial
console.

Thanks
Vivek

[  309.075885] ======================================================
[  309.076841] [ INFO: possible circular locking dependency detected ]
[  309.077818] 4.9.0-rc1+ #197 Not tainted
[  309.078411] -------------------------------------------------------
[  309.079377] dnf/2468 is trying to acquire lock:
[  309.080082]  ([  309.080324] &type->s_vfs_rename_key
#2[  309.080942] ){+.+.+.}
, at: [  309.081435] [<ffffffff8129f652>] lock_rename+0x32/0x100
[  309.082261] 
[  309.082261] but task is already holding lock:
[  309.083158]  ([  309.083399] &mm->mmap_sem
){++++++}[  309.083974] , at: 
[  309.084316] [<ffffffff8121df0c>] vm_mmap_pgoff+0x8c/0x100
[  309.085150] 
[  309.085150] which lock already depends on the new lock.
[  309.085150] 
[  309.086393] 
[  309.086393] the existing dependency chain (in reverse order) is:
[  309.088279] 
-> #3[  309.088612]  (
&mm->mmap_sem[  309.089091] ){++++++}
[  309.089470] :
[  309.089735]        [  309.090046] [<ffffffff81112786>] lock_acquire+0xf6/0x1f0
[  309.090884]        [  309.091197] [<ffffffff812316c0>] __might_fault+0x70/0xa0
[  309.092047]        [  309.092357] [<ffffffff812aa585>] filldir+0xb5/0x140
[  309.093128]        [  309.093434] [<ffffffff8132f235>] call_filldir+0x65/0x130
[  309.094273]        [  309.094590] [<ffffffff8132fc0f>] ext4_readdir+0x6cf/0x8a0
[  309.095425]        [  309.095742] [<ffffffff812aa24b>] iterate_dir+0x17b/0x1b0
[  309.096572]        [  309.096878] [<ffffffff812aa76c>] SyS_getdents+0x9c/0x130
[  309.097716]        [  309.098026] [<ffffffff818a4bc1>] entry_SYSCALL_64_fastpath+0x1f/0xc2
[  309.099005] 
-> #2[  309.099304]  (
&type->i_mutex_dir_key[  309.099888] #3
){++++++}[  309.100301] :
[  309.100576]        [  309.100881] [<ffffffff81112786>] lock_acquire+0xf6/0x1f0
[  309.101711]        [  309.102017] [<ffffffff818a1279>] down_write+0x49/0x80
[  309.102798]        [  309.103098] [<ffffffff8129fea5>] vfs_rmdir+0x55/0x140
[  309.103878]        [  309.104179] [<ffffffff812a5bdd>] do_rmdir+0x1bd/0x230
[  309.104958]        [  309.105256] [<ffffffff812a6a12>] SyS_unlinkat+0x22/0x30
[  309.106063]        [  309.106364] [<ffffffff818a4bc1>] entry_SYSCALL_64_fastpath+0x1f/0xc2
[  309.107345] 
-> #1[  309.107661]  (
&type->i_mutex_dir_key[  309.108256] #3
/1[  309.108597] ){+.+.+.}
[  309.108971] :
[  309.109225]        [  309.109532] [<ffffffff81112786>] lock_acquire+0xf6/0x1f0
[  309.110370]        [  309.110686] [<ffffffff8110c8df>] down_write_nested+0x4f/0x80
[  309.111606]        [  309.111916] [<ffffffff8129f701>] lock_rename+0xe1/0x100
[  309.112752]        [  309.113062] [<ffffffff812a7842>] SyS_renameat+0x212/0x3f0
[  309.113918]        [  309.114224] [<ffffffff818a4bc1>] entry_SYSCALL_64_fastpath+0x1f/0xc2
[  309.115218] 
-> #0[  309.115525]  (
&type->s_vfs_rename_key[  309.116138] #2
){+.+.+.}[  309.116564] :
[  309.116837]        [  309.117146] [<ffffffff811121a0>] __lock_acquire+0x1110/0x12a0
[  309.118047]        [  309.118354] [<ffffffff81112786>] lock_acquire+0xf6/0x1f0
[  309.119193]        [  309.119500] [<ffffffff818a0159>] mutex_lock_nested+0x79/0x3c0
[  309.120406]        [  309.120723] [<ffffffff8129f652>] lock_rename+0x32/0x100
[  309.121564]        [  309.121875] [<ffffffffa02d1617>] ovl_copy_up_one+0xf7/0x6a0 [overlay]
[  309.122866]        [  309.123170] [<ffffffffa02d1ccb>] ovl_copy_up+0x10b/0x13d [overlay]
[  309.124136]        [  309.124442] [<ffffffffa02cd7a5>] ovl_mmap+0x55/0x90 [overlay]
[  309.125348]        [  309.125677] [<ffffffff8123f3a4>] mmap_region+0x394/0x630
[  309.126510]        [  309.126829] [<ffffffff8123fa86>] do_mmap+0x446/0x530
[  309.127616]        [  309.127928] [<ffffffff8121df3d>] vm_mmap_pgoff+0xbd/0x100
[  309.128783]        [  309.129092] [<ffffffff8123d5c1>] SyS_mmap_pgoff+0x1c1/0x290
[  309.129973]        [  309.130284] [<ffffffff8103b71b>] SyS_mmap+0x1b/0x30
[  309.131058]        [  309.131368] [<ffffffff818a4bc1>] entry_SYSCALL_64_fastpath+0x1f/0xc2
[  309.132377] 
[  309.132377] other info that might help us debug this:
[  309.132377] 
[  309.133608] Chain exists of:
  [  309.134084] &type->s_vfs_rename_key
#2[  309.134694]  --> 
&type->i_mutex_dir_key[  309.135324] #3
 --> [  309.135705] &mm->mmap_sem
[  309.136131] 
[  309.136131] 
[  309.136599]  Possible unsafe locking scenario:
[  309.136599] 
[  309.137499]        CPU0                    CPU1
[  309.138202]        ----                    ----
[  309.138905]   lock([  309.139211] &mm->mmap_sem
[  309.139646] );
[  309.139914]                                lock([  309.140602] &type->i_mutex_dir_key
#3[  309.141190] );
[  309.141472]                                lock([  309.142175] &mm->mmap_sem
[  309.142610] );
[  309.142877]   lock([  309.143186] &type->s_vfs_rename_key
#2[  309.143796] );
[  309.144084] 
[  309.144084]  *** DEADLOCK ***
[  309.144084] 
[  309.144991] 1 lock held by dnf/2468:
[  309.145543]  #0: [  309.145827]  (
&mm->mmap_sem[  309.146299] ){++++++}
, at: [  309.146782] [<ffffffff8121df0c>] vm_mmap_pgoff+0x8c/0x100
[  309.147634] 
[  309.147634] stack backtrace:
[  309.148310] CPU: 4 PID: 2468 Comm: dnf Not tainted 4.9.0-rc1+ #197
[  309.149257] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.2-20150714_191134- 04/01/2014
[  309.150717]  ffffc900033cf900 ffffffff8144b253 ffffffff828b29f0 ffffffff828e8d50
[  309.151942]  ffffc900033cf940 ffffffff8110f83e 00000000f7ac0000 ffff8801f7ac0860
[  309.153660]  ffff8801f7ac0000 ffff8801f7ac0838 0000000000000001 0000000000000000
[  309.154877] Call Trace:
[  309.155266]  [<ffffffff8144b253>] dump_stack+0x86/0xc3
[  309.156062]  [<ffffffff8110f83e>] print_circular_bug+0x1be/0x210
[  309.156991]  [<ffffffff811121a0>] __lock_acquire+0x1110/0x12a0
[  309.157896]  [<ffffffff8112ee3d>] ? debug_lockdep_rcu_enabled+0x1d/0x20
[  309.158912]  [<ffffffff813c3ed4>] ? avc_has_perm+0x34/0x290
[  309.159768]  [<ffffffff81112786>] lock_acquire+0xf6/0x1f0
[  309.160597]  [<ffffffff8129f652>] ? lock_rename+0x32/0x100
[  309.161438]  [<ffffffff818a0159>] mutex_lock_nested+0x79/0x3c0
[  309.162351]  [<ffffffff8129f652>] ? lock_rename+0x32/0x100
[  309.163202]  [<ffffffff813c92fb>] ? selinux_inode_getattr+0x8b/0xb0
[  309.164162]  [<ffffffff8129f652>] lock_rename+0x32/0x100
[  309.164981]  [<ffffffffa02d1617>] ovl_copy_up_one+0xf7/0x6a0 [overlay]
[  309.165987]  [<ffffffff813c3fd3>] ? avc_has_perm+0x133/0x290
[  309.166864]  [<ffffffff813c3ed4>] ? avc_has_perm+0x34/0x290
[  309.167723]  [<ffffffff810e348a>] ? __might_sleep+0x4a/0x80
[  309.168580]  [<ffffffffa02d1ccb>] ovl_copy_up+0x10b/0x13d [overlay]
[  309.169539]  [<ffffffffa02cd7a5>] ovl_mmap+0x55/0x90 [overlay]
[  309.170436]  [<ffffffff8123f3a4>] mmap_region+0x394/0x630
[  309.171269]  [<ffffffff8123fa86>] do_mmap+0x446/0x530
[  309.172064]  [<ffffffff8121df3d>] vm_mmap_pgoff+0xbd/0x100
[  309.172908]  [<ffffffff8123d5c1>] SyS_mmap_pgoff+0x1c1/0x290
[  309.173777]  [<ffffffff8103b71b>] SyS_mmap+0x1b/0x30
[  309.174539]  [<ffffffff818a4bc1>] entry_SYSCALL_64_fastpath+0x1f/0xc2

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-20 20:46 ` Vivek Goyal
@ 2016-10-20 20:54   ` Vivek Goyal
  2016-10-21  8:53     ` Amir Goldstein
  2016-10-21  9:12     ` Miklos Szeredi
  2016-10-21  9:13   ` Amir Goldstein
  1 sibling, 2 replies; 14+ messages in thread
From: Vivek Goyal @ 2016-10-20 20:54 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-unionfs, linux-fsdevel, linux-kernel, Jeremy Eder,
	David Howells, Ratna Bolla, Gou Rao, Vinod Jayaraman, Al Viro,
	Dave Chinner

On Thu, Oct 20, 2016 at 04:46:30PM -0400, Vivek Goyal wrote:

[..]
> > +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > +{
> > +	struct file *file = iocb->ki_filp;
> > +	bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> > +	ssize_t ret = -EINVAL;
> > +
> > +	if (likely(!isupper)) {
> > +		const struct file_operations *fop = ovl_real_fop(file);
> > +
> > +		if (likely(fop->read_iter))
> > +			ret = fop->read_iter(iocb, to);
> > +	} else {
> > +		struct file *upperfile = filp_clone_open(file);
> > +
> 
> IIUC, every read of lower file will call filp_clone_open(). Looking at the
> code of filp_clone_open(), I am concerned about the overhead of this call.
> Is it significant? Don't want to be paying too much of penalty for read
> operation on lower files. That would be a common case for containers.
> 

Looks like I read the code in reverse. So if I open a file read-only,
and if it has not been copied up, I will simply call read_iter() on
lower filesystem. But if file has been copied up, then I will call
filp_clone_open() and pay the cost. And this will continue till this
file is closed by caller. 

When file is opened again, by that time it is upper file and we will
install real fop in file (instead of overlay fop).

Vivek

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-20 20:54   ` Vivek Goyal
@ 2016-10-21  8:53     ` Amir Goldstein
  2016-10-21 20:13       ` Vivek Goyal
  2016-10-21  9:12     ` Miklos Szeredi
  1 sibling, 1 reply; 14+ messages in thread
From: Amir Goldstein @ 2016-10-21  8:53 UTC (permalink / raw)
  To: Vivek Goyal, Miklos Szeredi
  Cc: linux-unionfs, linux-fsdevel, linux-kernel, Jeremy Eder,
	David Howells, Ratna Bolla, Gou Rao, Vinod Jayaraman, Al Viro,
	Dave Chinner

On Thu, Oct 20, 2016 at 11:54 PM, Vivek Goyal <vgoyal@redhat.com> wrote:
> On Thu, Oct 20, 2016 at 04:46:30PM -0400, Vivek Goyal wrote:
>
> [..]
>> > +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
>> > +{
>> > +   struct file *file = iocb->ki_filp;
>> > +   bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
>> > +   ssize_t ret = -EINVAL;
>> > +
>> > +   if (likely(!isupper)) {
>> > +           const struct file_operations *fop = ovl_real_fop(file);
>> > +
>> > +           if (likely(fop->read_iter))
>> > +                   ret = fop->read_iter(iocb, to);
>> > +   } else {
>> > +           struct file *upperfile = filp_clone_open(file);
>> > +
>>
>> IIUC, every read of lower file will call filp_clone_open(). Looking at the
>> code of filp_clone_open(), I am concerned about the overhead of this call.
>> Is it significant? Don't want to be paying too much of penalty for read
>> operation on lower files. That would be a common case for containers.
>>
>
> Looks like I read the code in reverse. So if I open a file read-only,
> and if it has not been copied up, I will simply call read_iter() on
> lower filesystem. But if file has been copied up, then I will call
> filp_clone_open() and pay the cost. And this will continue till this
> file is closed by caller.
>

I wonder if that cost could be reduced by calling replace_fd() or
some variant of it to install the cloned file onto the rofd after the
first access??

> When file is opened again, by that time it is upper file and we will
> install real fop in file (instead of overlay fop).
>
> Vivek
> --
> To unsubscribe from this list: send the line "unsubscribe linux-unionfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-20 20:54   ` Vivek Goyal
  2016-10-21  8:53     ` Amir Goldstein
@ 2016-10-21  9:12     ` Miklos Szeredi
  2016-10-21 13:31       ` Vivek Goyal
  1 sibling, 1 reply; 14+ messages in thread
From: Miklos Szeredi @ 2016-10-21  9:12 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: linux-unionfs, linux-fsdevel, linux-kernel, Jeremy Eder,
	David Howells, Gou Rao, Vinod Jayaraman, Al Viro, Dave Chinner

On Thu, Oct 20, 2016 at 04:54:08PM -0400, Vivek Goyal wrote:
> On Thu, Oct 20, 2016 at 04:46:30PM -0400, Vivek Goyal wrote:
> 
> [..]
> > > +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > > +{
> > > +	struct file *file = iocb->ki_filp;
> > > +	bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> > > +	ssize_t ret = -EINVAL;
> > > +
> > > +	if (likely(!isupper)) {
> > > +		const struct file_operations *fop = ovl_real_fop(file);
> > > +
> > > +		if (likely(fop->read_iter))
> > > +			ret = fop->read_iter(iocb, to);
> > > +	} else {
> > > +		struct file *upperfile = filp_clone_open(file);
> > > +
> > 
> > IIUC, every read of lower file will call filp_clone_open(). Looking at the
> > code of filp_clone_open(), I am concerned about the overhead of this call.
> > Is it significant? Don't want to be paying too much of penalty for read
> > operation on lower files. That would be a common case for containers.
> > 
> 
> Looks like I read the code in reverse. So if I open a file read-only,
> and if it has not been copied up, I will simply call read_iter() on
> lower filesystem. But if file has been copied up, then I will call
> filp_clone_open() and pay the cost. And this will continue till this
> file is closed by caller. 
> 
> When file is opened again, by that time it is upper file and we will
> install real fop in file (instead of overlay fop).

Right.

The lockdep issue seems to be real, we can't take i_mutex and s_vfs_rename_mutex
while mmap_sem is locked.  Fortunately copy up doesn't need mmap_sem, so we can
do it while unlocked and retry the mmap.

Here's an incremental workaround patch.

I don't like adding such workarounds to the VFS/MM but they are really cheap for
the non-overlay case and there doesn't appear to be an alternative in this case.

Thanks,
Miklos

---
 fs/overlayfs/inode.c |   19 +++++--------------
 mm/util.c            |   22 ++++++++++++++++++++++
 2 files changed, 27 insertions(+), 14 deletions(-)

--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -419,21 +419,12 @@ static int ovl_mmap(struct file *file, s
 	bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
 	int err;
 
-	/*
-	 * Treat MAP_SHARED as hint about future writes to the file (through
-	 * another file descriptor).  Caller might not have had such an intent,
-	 * but we hope MAP_PRIVATE will be used in most such cases.
-	 *
-	 * If we don't copy up now and the file is modified, it becomes really
-	 * difficult to change the mapping to match that of the file's content
-	 * later.
-	 */
 	if (unlikely(isupper || vma->vm_flags & VM_MAYSHARE)) {
-		if (!isupper) {
-			err = ovl_copy_up(file->f_path.dentry);
-			if (err)
-				goto out;
-		}
+		/*
+		 * File should have been copied up by now. See vm_mmap_pgoff().
+		 */
+		if (WARN_ON(!isupper))
+			return -EIO;
 
 		file = filp_clone_open(file);
 		err = PTR_ERR(file);
--- a/mm/util.c
+++ b/mm/util.c
@@ -297,6 +297,28 @@ unsigned long vm_mmap_pgoff(struct file
 
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
+		/*
+		 * Special treatment for overlayfs:
+		 *
+		 * Take MAP_SHARED/PROT_READ as hint about future writes to the
+		 * file (through another file descriptor).  Caller might not
+		 * have had such an intent, but we hope MAP_PRIVATE will be used
+		 * in most such cases.
+		 *
+		 * If we don't copy up now and the file is modified, it becomes
+		 * really difficult to change the mapping to match that of the
+		 * file's content later.
+		 *
+		 * Copy up needs to be done without mmap_sem since it takes vfs
+		 * locks which would potentially deadlock under mmap_sem.
+		 */
+		if ((flag & MAP_SHARED) && !(prot & PROT_WRITE)) {
+			void *p = d_real(file->f_path.dentry, NULL, O_WRONLY);
+
+			if (IS_ERR(p))
+				return PTR_ERR(p);
+		}
+
 		if (down_write_killable(&mm->mmap_sem))
 			return -EINTR;
 		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-20 20:46 ` Vivek Goyal
  2016-10-20 20:54   ` Vivek Goyal
@ 2016-10-21  9:13   ` Amir Goldstein
  2016-10-21  9:30     ` Miklos Szeredi
  1 sibling, 1 reply; 14+ messages in thread
From: Amir Goldstein @ 2016-10-21  9:13 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Miklos Szeredi, linux-unionfs, linux-fsdevel, linux-kernel,
	Jeremy Eder, David Howells, Gou Rao, Vinod Jayaraman, Al Viro,
	Dave Chinner

On Thu, Oct 20, 2016 at 11:46 PM, Vivek Goyal <vgoyal@redhat.com> wrote:
> On Wed, Oct 12, 2016 at 03:33:26PM +0200, Miklos Szeredi wrote:
>> This is a proof of concept patch to fix the following.
>>
>> /ovl is in overlay mount and /ovl/foo exists on the lower layer only.
>>
>>  rofd = open("/ovl/foo", O_RDONLY);
>>  rwfd = open("/ovl/foo", O_WRONLY); /* this causes copy up */
>>  write(rwfd, "bar", 3);
>>  read(rofd, buf, 3);
>>  assert(memcmp(buf, "bar", 3) == 0);
>>
>> Similar problem exists with an MAP_SHARED mmap created from rofd.
>>
>> While this has only caused few problems (yum/dnf failure is the only one I know
>> of) and easily worked around in userspace, many see it as a proof that overlayfs
>> can never be a proper "POSIX" filesystem.
>>
>> To quell those worries, here's a simple patch that should address the above.
>>
>> The only VFS change is that f_op is initialized from f_path.dentry->d_inode
>> instead of file_inode(filp) in open.  The effect of this is that overlayfs can
>> intercept open and other file operations, while the file still effectively
>> belongs to the underlying fs.
>>
>> The patch does not give up on the nice properties of overlayfs, like sharing the
>> page cache with the underlying files.  It does cause copy up in one case where
>> previously there wasn't one and that's the O_RDONLY/MAP_SHARED case.  I haven't
>> done much research into this, but running some tests in chroot didn't trigger
>> this.
>>
>> Comments, testing are welcome.
>
> Hi Miklos,
>
> This looks like a very interesting idea. In fact once file has been copied
> up and writen to, and if I do fstat(rofd), it shows the size of copied up
> file but one can read the contents. So fixing that anomaly would be nice.
>

I think it would be a good idea in general to stabilize the overlay ino/dev
throughout copy-up, same as Miklos suggested to do for directories, to
all files:
pure upper uses upper ino + overlayfs dev
non-pure upper uses lower ino + overlayfs dev

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-21  9:13   ` Amir Goldstein
@ 2016-10-21  9:30     ` Miklos Szeredi
  2016-10-21 13:18       ` Amir Goldstein
  0 siblings, 1 reply; 14+ messages in thread
From: Miklos Szeredi @ 2016-10-21  9:30 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Vivek Goyal, linux-unionfs, linux-fsdevel, linux-kernel,
	Jeremy Eder, David Howells, Gou Rao, Vinod Jayaraman, Al Viro,
	Dave Chinner

On Fri, Oct 21, 2016 at 11:13 AM, Amir Goldstein <amir73il@gmail.com> wrote:

> I think it would be a good idea in general to stabilize the overlay ino/dev
> throughout copy-up, same as Miklos suggested to do for directories, to
> all files:
> pure upper uses upper ino + overlayfs dev
> non-pure upper uses lower ino + overlayfs dev

Making st_ino, st_dev and d_ino behave consistently would be the next big step.

The above scheme only works if lower and upper are on the same
filesystem.  Otherwise there can be collisions between the lower and
upper inode numbers.  Perhaps you meant:

- pure upper uses upper ino + upper dev
- non-pure upper uses lower ino + overlayfs dev

It works for the single lower layer case, but again breaks if there
are multiple lower layers.   And d_ino in a merged directory could
still get us into trouble.  And find -xdev would not do what you'd
expect with a "normal" filesystem.

So there doesn't appear to be any easy solutions to this...

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-21  9:30     ` Miklos Szeredi
@ 2016-10-21 13:18       ` Amir Goldstein
  0 siblings, 0 replies; 14+ messages in thread
From: Amir Goldstein @ 2016-10-21 13:18 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Vivek Goyal, linux-unionfs, linux-fsdevel, linux-kernel,
	Jeremy Eder, David Howells, Gou Rao, Vinod Jayaraman, Al Viro,
	Dave Chinner

On Fri, Oct 21, 2016 at 12:30 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Fri, Oct 21, 2016 at 11:13 AM, Amir Goldstein <amir73il@gmail.com> wrote:
>
>> I think it would be a good idea in general to stabilize the overlay ino/dev
>> throughout copy-up, same as Miklos suggested to do for directories, to
>> all files:
>> pure upper uses upper ino + overlayfs dev
>> non-pure upper uses lower ino + overlayfs dev
>
> Making st_ino, st_dev and d_ino behave consistently would be the next big step.
>
> The above scheme only works if lower and upper are on the same
> filesystem.  Otherwise there can be collisions between the lower and
> upper inode numbers.  Perhaps you meant:
>
> - pure upper uses upper ino + upper dev
> - non-pure upper uses lower ino + overlayfs dev
>
> It works for the single lower layer case, but again breaks if there
> are multiple lower layers.   And d_ino in a merged directory could
> still get us into trouble.  And find -xdev would not do what you'd
> expect with a "normal" filesystem.
>
> So there doesn't appear to be any easy solutions to this...
>

Not for the general case there isn't, but I was actually thinking of
the docker case
and there is a lot that can be done for the use case of lower and upper on the
same fs to make overlayfs more compliant.
Since it's quite a common use case, perhaps its worth the special treatment.

Amir.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-21  9:12     ` Miklos Szeredi
@ 2016-10-21 13:31       ` Vivek Goyal
  0 siblings, 0 replies; 14+ messages in thread
From: Vivek Goyal @ 2016-10-21 13:31 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-unionfs, linux-fsdevel, linux-kernel, Jeremy Eder,
	David Howells, Gou Rao, Vinod Jayaraman, Al Viro, Dave Chinner

On Fri, Oct 21, 2016 at 11:12:11AM +0200, Miklos Szeredi wrote:
> On Thu, Oct 20, 2016 at 04:54:08PM -0400, Vivek Goyal wrote:
> > On Thu, Oct 20, 2016 at 04:46:30PM -0400, Vivek Goyal wrote:
> > 
> > [..]
> > > > +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > > > +{
> > > > +	struct file *file = iocb->ki_filp;
> > > > +	bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> > > > +	ssize_t ret = -EINVAL;
> > > > +
> > > > +	if (likely(!isupper)) {
> > > > +		const struct file_operations *fop = ovl_real_fop(file);
> > > > +
> > > > +		if (likely(fop->read_iter))
> > > > +			ret = fop->read_iter(iocb, to);
> > > > +	} else {
> > > > +		struct file *upperfile = filp_clone_open(file);
> > > > +
> > > 
> > > IIUC, every read of lower file will call filp_clone_open(). Looking at the
> > > code of filp_clone_open(), I am concerned about the overhead of this call.
> > > Is it significant? Don't want to be paying too much of penalty for read
> > > operation on lower files. That would be a common case for containers.
> > > 
> > 
> > Looks like I read the code in reverse. So if I open a file read-only,
> > and if it has not been copied up, I will simply call read_iter() on
> > lower filesystem. But if file has been copied up, then I will call
> > filp_clone_open() and pay the cost. And this will continue till this
> > file is closed by caller. 
> > 
> > When file is opened again, by that time it is upper file and we will
> > install real fop in file (instead of overlay fop).
> 
> Right.
> 
> The lockdep issue seems to be real, we can't take i_mutex and s_vfs_rename_mutex
> while mmap_sem is locked.  Fortunately copy up doesn't need mmap_sem, so we can
> do it while unlocked and retry the mmap.
> 
> Here's an incremental workaround patch.
> 
> I don't like adding such workarounds to the VFS/MM but they are really cheap for
> the non-overlay case and there doesn't appear to be an alternative in this case.

This incremental patch does fix the locking warning issue I was seeing.

Vivek

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-21  8:53     ` Amir Goldstein
@ 2016-10-21 20:13       ` Vivek Goyal
  2016-10-22  7:24         ` Amir Goldstein
  0 siblings, 1 reply; 14+ messages in thread
From: Vivek Goyal @ 2016-10-21 20:13 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, linux-unionfs, linux-fsdevel, linux-kernel,
	Jeremy Eder, David Howells, Ratna Bolla, Gou Rao,
	Vinod Jayaraman, Al Viro, Dave Chinner

On Fri, Oct 21, 2016 at 11:53:41AM +0300, Amir Goldstein wrote:
> On Thu, Oct 20, 2016 at 11:54 PM, Vivek Goyal <vgoyal@redhat.com> wrote:
> > On Thu, Oct 20, 2016 at 04:46:30PM -0400, Vivek Goyal wrote:
> >
> > [..]
> >> > +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
> >> > +{
> >> > +   struct file *file = iocb->ki_filp;
> >> > +   bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
> >> > +   ssize_t ret = -EINVAL;
> >> > +
> >> > +   if (likely(!isupper)) {
> >> > +           const struct file_operations *fop = ovl_real_fop(file);
> >> > +
> >> > +           if (likely(fop->read_iter))
> >> > +                   ret = fop->read_iter(iocb, to);
> >> > +   } else {
> >> > +           struct file *upperfile = filp_clone_open(file);
> >> > +
> >>
> >> IIUC, every read of lower file will call filp_clone_open(). Looking at the
> >> code of filp_clone_open(), I am concerned about the overhead of this call.
> >> Is it significant? Don't want to be paying too much of penalty for read
> >> operation on lower files. That would be a common case for containers.
> >>
> >
> > Looks like I read the code in reverse. So if I open a file read-only,
> > and if it has not been copied up, I will simply call read_iter() on
> > lower filesystem. But if file has been copied up, then I will call
> > filp_clone_open() and pay the cost. And this will continue till this
> > file is closed by caller.
> >
> 
> I wonder if that cost could be reduced by calling replace_fd() or
> some variant of it to install the cloned file onto the rofd after the
> first access??

Hmm.., Interesting. Will something like following work? This applies on
top of Miklos's patch. It seems to work for me. It might be completely
broken/racy though. Somebody who understands this code well, will have
to have a look.

---
 fs/file.c            |   41 +++++++++++++++++++++++++++++++++++++++++
 fs/overlayfs/inode.c |    1 +
 2 files changed, 42 insertions(+)

Index: rhvgoyal-linux/fs/overlayfs/inode.c
===================================================================
--- rhvgoyal-linux.orig/fs/overlayfs/inode.c	2016-10-21 15:43:05.391488406 -0400
+++ rhvgoyal-linux/fs/overlayfs/inode.c	2016-10-21 16:07:57.409420795 -0400
@@ -416,6 +416,7 @@ static ssize_t ovl_read_iter(struct kioc
 		if (IS_ERR(upperfile)) {
 			ret = PTR_ERR(upperfile);
 		} else {
+			replace_file(file, upperfile);
 			ret = vfs_iter_read(upperfile, to, &iocb->ki_pos);
 			fput(upperfile);
 		}
Index: rhvgoyal-linux/fs/file.c
===================================================================
--- rhvgoyal-linux.orig/fs/file.c	2016-10-21 15:43:05.391488406 -0400
+++ rhvgoyal-linux/fs/file.c	2016-10-21 16:08:18.168420795 -0400
@@ -864,6 +864,47 @@ Ebusy:
 	return -EBUSY;
 }
 
+
+int replace_file(struct file *old_file, struct file *new_file)
+{
+#define MAX_TO_FREE	8
+	int n, idx = 0;
+	struct files_struct *files = current->files;
+	struct fdtable *fdt;
+	struct file *to_free[MAX_TO_FREE];
+	bool retry = false;
+
+try_again:
+	spin_lock(&files->file_lock);
+	for (n = 0, fdt = files_fdtable(files); n < fdt->max_fds; n++) {
+                struct file *file;
+                file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
+                if (!file)
+                        continue;
+		if (file == old_file) {
+			get_file(new_file);
+			rcu_assign_pointer(fdt->fd[n], new_file);
+			to_free[idx++] = file;
+			if (idx >= MAX_TO_FREE) {
+				retry = true;
+				break;
+			}
+		}
+        }
+	spin_unlock(&files->file_lock);
+	while (idx) {
+		filp_close(to_free[--idx], files);
+	}
+
+	if (retry) {
+		retry = false;
+		idx = 0;
+		goto try_again;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(replace_file);
+
 int replace_fd(unsigned fd, struct file *file, unsigned flags)
 {
 	int err;

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-21 20:13       ` Vivek Goyal
@ 2016-10-22  7:24         ` Amir Goldstein
  2016-10-22 15:39           ` Amir Goldstein
  0 siblings, 1 reply; 14+ messages in thread
From: Amir Goldstein @ 2016-10-22  7:24 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Miklos Szeredi, linux-unionfs, linux-fsdevel, linux-kernel,
	Jeremy Eder, David Howells, Ratna Bolla, Gou Rao,
	Vinod Jayaraman, Al Viro, Dave Chinner

On Fri, Oct 21, 2016 at 11:13 PM, Vivek Goyal <vgoyal@redhat.com> wrote:
> On Fri, Oct 21, 2016 at 11:53:41AM +0300, Amir Goldstein wrote:
>> On Thu, Oct 20, 2016 at 11:54 PM, Vivek Goyal <vgoyal@redhat.com> wrote:
>> > On Thu, Oct 20, 2016 at 04:46:30PM -0400, Vivek Goyal wrote:
>> >
>> > [..]
>> >> > +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
>> >> > +{
>> >> > +   struct file *file = iocb->ki_filp;
>> >> > +   bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
>> >> > +   ssize_t ret = -EINVAL;
>> >> > +
>> >> > +   if (likely(!isupper)) {
>> >> > +           const struct file_operations *fop = ovl_real_fop(file);
>> >> > +
>> >> > +           if (likely(fop->read_iter))
>> >> > +                   ret = fop->read_iter(iocb, to);
>> >> > +   } else {
>> >> > +           struct file *upperfile = filp_clone_open(file);
>> >> > +
>> >>
>> >> IIUC, every read of lower file will call filp_clone_open(). Looking at the
>> >> code of filp_clone_open(), I am concerned about the overhead of this call.
>> >> Is it significant? Don't want to be paying too much of penalty for read
>> >> operation on lower files. That would be a common case for containers.
>> >>
>> >
>> > Looks like I read the code in reverse. So if I open a file read-only,
>> > and if it has not been copied up, I will simply call read_iter() on
>> > lower filesystem. But if file has been copied up, then I will call
>> > filp_clone_open() and pay the cost. And this will continue till this
>> > file is closed by caller.
>> >
>>
>> I wonder if that cost could be reduced by calling replace_fd() or
>> some variant of it to install the cloned file onto the rofd after the
>> first access??
>
> Hmm.., Interesting. Will something like following work? This applies on
> top of Miklos's patch. It seems to work for me. It might be completely
> broken/racy though. Somebody who understands this code well, will have
> to have a look.
>

The idea sounded scary already when I suggested it :)
See below what I think is scary about this implementation...

Thanks for following through.


> ---
>  fs/file.c            |   41 +++++++++++++++++++++++++++++++++++++++++
>  fs/overlayfs/inode.c |    1 +
>  2 files changed, 42 insertions(+)
>
> Index: rhvgoyal-linux/fs/overlayfs/inode.c
> ===================================================================
> --- rhvgoyal-linux.orig/fs/overlayfs/inode.c    2016-10-21 15:43:05.391488406 -0400
> +++ rhvgoyal-linux/fs/overlayfs/inode.c 2016-10-21 16:07:57.409420795 -0400
> @@ -416,6 +416,7 @@ static ssize_t ovl_read_iter(struct kioc
>                 if (IS_ERR(upperfile)) {
>                         ret = PTR_ERR(upperfile);
>                 } else {
> +                       replace_file(file, upperfile);

When fdtable is not shared (single threaded process), after this call
I think that file pointer
may be free (?), because file is not reference counted.
Although I did not see any code in VFS callers trying to dereference
the file pointer after
calling read_iter(), this seems like a dangerous practice, so will
need to a way to fix that.

>                         ret = vfs_iter_read(upperfile, to, &iocb->ki_pos);
>                         fput(upperfile);
>                 }
> Index: rhvgoyal-linux/fs/file.c
> ===================================================================
> --- rhvgoyal-linux.orig/fs/file.c       2016-10-21 15:43:05.391488406 -0400
> +++ rhvgoyal-linux/fs/file.c    2016-10-21 16:08:18.168420795 -0400
> @@ -864,6 +864,47 @@ Ebusy:
>         return -EBUSY;
>  }
>
> +
> +int replace_file(struct file *old_file, struct file *new_file)
> +{
> +#define MAX_TO_FREE    8
> +       int n, idx = 0;
> +       struct files_struct *files = current->files;
> +       struct fdtable *fdt;
> +       struct file *to_free[MAX_TO_FREE];
> +       bool retry = false;
> +
> +try_again:
> +       spin_lock(&files->file_lock);
> +       for (n = 0, fdt = files_fdtable(files); n < fdt->max_fds; n++) {
> +                struct file *file;
> +                file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
> +                if (!file)
> +                        continue;
> +               if (file == old_file) {
> +                       get_file(new_file);
> +                       rcu_assign_pointer(fdt->fd[n], new_file);
> +                       to_free[idx++] = file;
> +                       if (idx >= MAX_TO_FREE) {
> +                               retry = true;
> +                               break;
> +                       }
> +               }
> +        }
> +       spin_unlock(&files->file_lock);
> +       while (idx) {
> +               filp_close(to_free[--idx], files);
> +       }
> +
> +       if (retry) {
> +               retry = false;
> +               idx = 0;
> +               goto try_again;
> +       }
> +       return 0;
> +}
> +EXPORT_SYMBOL(replace_file);
> +
>  int replace_fd(unsigned fd, struct file *file, unsigned flags)
>  {
>         int err;

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-22  7:24         ` Amir Goldstein
@ 2016-10-22 15:39           ` Amir Goldstein
  2016-10-24  8:11             ` Miklos Szeredi
  0 siblings, 1 reply; 14+ messages in thread
From: Amir Goldstein @ 2016-10-22 15:39 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Miklos Szeredi, linux-unionfs, linux-fsdevel, linux-kernel,
	Jeremy Eder, David Howells, Gou Rao, Vinod Jayaraman, Al Viro,
	Dave Chinner

On Sat, Oct 22, 2016 at 10:24 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Fri, Oct 21, 2016 at 11:13 PM, Vivek Goyal <vgoyal@redhat.com> wrote:
>> On Fri, Oct 21, 2016 at 11:53:41AM +0300, Amir Goldstein wrote:
>>> On Thu, Oct 20, 2016 at 11:54 PM, Vivek Goyal <vgoyal@redhat.com> wrote:
>>> > On Thu, Oct 20, 2016 at 04:46:30PM -0400, Vivek Goyal wrote:
>>> >
>>> > [..]
>>> >> > +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>> >> > +{
>>> >> > +   struct file *file = iocb->ki_filp;
>>> >> > +   bool isupper = OVL_TYPE_UPPER(ovl_path_type(file->f_path.dentry));
>>> >> > +   ssize_t ret = -EINVAL;
>>> >> > +
>>> >> > +   if (likely(!isupper)) {
>>> >> > +           const struct file_operations *fop = ovl_real_fop(file);
>>> >> > +
>>> >> > +           if (likely(fop->read_iter))
>>> >> > +                   ret = fop->read_iter(iocb, to);
>>> >> > +   } else {
>>> >> > +           struct file *upperfile = filp_clone_open(file);
>>> >> > +
>>> >>
>>> >> IIUC, every read of lower file will call filp_clone_open(). Looking at the
>>> >> code of filp_clone_open(), I am concerned about the overhead of this call.
>>> >> Is it significant? Don't want to be paying too much of penalty for read
>>> >> operation on lower files. That would be a common case for containers.
>>> >>
>>> >
>>> > Looks like I read the code in reverse. So if I open a file read-only,
>>> > and if it has not been copied up, I will simply call read_iter() on
>>> > lower filesystem. But if file has been copied up, then I will call
>>> > filp_clone_open() and pay the cost. And this will continue till this
>>> > file is closed by caller.
>>> >
>>>
>>> I wonder if that cost could be reduced by calling replace_fd() or
>>> some variant of it to install the cloned file onto the rofd after the
>>> first access??
>>
>> Hmm.., Interesting. Will something like following work? This applies on
>> top of Miklos's patch. It seems to work for me. It might be completely
>> broken/racy though. Somebody who understands this code well, will have
>> to have a look.
>>
>
> The idea sounded scary already when I suggested it :)
> See below what I think is scary about this implementation...
>
> Thanks for following through.
>
>
>> ---
>>  fs/file.c            |   41 +++++++++++++++++++++++++++++++++++++++++
>>  fs/overlayfs/inode.c |    1 +
>>  2 files changed, 42 insertions(+)
>>
>> Index: rhvgoyal-linux/fs/overlayfs/inode.c
>> ===================================================================
>> --- rhvgoyal-linux.orig/fs/overlayfs/inode.c    2016-10-21 15:43:05.391488406 -0400
>> +++ rhvgoyal-linux/fs/overlayfs/inode.c 2016-10-21 16:07:57.409420795 -0400
>> @@ -416,6 +416,7 @@ static ssize_t ovl_read_iter(struct kioc
>>                 if (IS_ERR(upperfile)) {
>>                         ret = PTR_ERR(upperfile);
>>                 } else {
>> +                       replace_file(file, upperfile);
>
> When fdtable is not shared (single threaded process), after this call
> I think that file pointer
> may be free (?), because file is not reference counted.
> Although I did not see any code in VFS callers trying to dereference
> the file pointer after
> calling read_iter(), this seems like a dangerous practice, so will
> need to a way to fix that.
>

My bad. file pointer is freed in work_task_run(), so replace_file()
should be just as safe as replace_fd() and do_dup2().

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up
  2016-10-22 15:39           ` Amir Goldstein
@ 2016-10-24  8:11             ` Miklos Szeredi
  0 siblings, 0 replies; 14+ messages in thread
From: Miklos Szeredi @ 2016-10-24  8:11 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Vivek Goyal, linux-unionfs, linux-fsdevel, linux-kernel,
	Jeremy Eder, David Howells, Gou Rao, Vinod Jayaraman, Al Viro,
	Dave Chinner

On Sat, Oct 22, 2016 at 5:39 PM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Sat, Oct 22, 2016 at 10:24 AM, Amir Goldstein <amir73il@gmail.com> wrote:
>> On Fri, Oct 21, 2016 at 11:13 PM, Vivek Goyal <vgoyal@redhat.com> wrote:

>>> ---
>>>  fs/file.c            |   41 +++++++++++++++++++++++++++++++++++++++++
>>>  fs/overlayfs/inode.c |    1 +
>>>  2 files changed, 42 insertions(+)
>>>
>>> Index: rhvgoyal-linux/fs/overlayfs/inode.c
>>> ===================================================================
>>> --- rhvgoyal-linux.orig/fs/overlayfs/inode.c    2016-10-21 15:43:05.391488406 -0400
>>> +++ rhvgoyal-linux/fs/overlayfs/inode.c 2016-10-21 16:07:57.409420795 -0400
>>> @@ -416,6 +416,7 @@ static ssize_t ovl_read_iter(struct kioc
>>>                 if (IS_ERR(upperfile)) {
>>>                         ret = PTR_ERR(upperfile);
>>>                 } else {
>>> +                       replace_file(file, upperfile);

I think it's a cool idea.  But I'm not even going to look at the
implementation for now, because it's such a rare corner case, that
trying to optimize it should really be the last thing we do after
everything else is working fine (and only if it actually turns out to
be a thing that somebody actually cares about).

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2016-10-24  8:11 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-12 13:33 [POC/RFC PATCH] overlayfs: fix data inconsistency at copy up Miklos Szeredi
2016-10-13 18:45 ` Amir Goldstein
2016-10-20 20:46 ` Vivek Goyal
2016-10-20 20:54   ` Vivek Goyal
2016-10-21  8:53     ` Amir Goldstein
2016-10-21 20:13       ` Vivek Goyal
2016-10-22  7:24         ` Amir Goldstein
2016-10-22 15:39           ` Amir Goldstein
2016-10-24  8:11             ` Miklos Szeredi
2016-10-21  9:12     ` Miklos Szeredi
2016-10-21 13:31       ` Vivek Goyal
2016-10-21  9:13   ` Amir Goldstein
2016-10-21  9:30     ` Miklos Szeredi
2016-10-21 13:18       ` Amir Goldstein

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.