($INBOX_DIR/description missing)
 help / color / Atom feed
* [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
@ 2020-06-12  0:46 Mike Kravetz
  2020-06-12  0:46 ` [PATCH v4 2/2] ovl: call underlying get_unmapped_area() routine. propogate FMODE_HUGETLBFS Mike Kravetz
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Mike Kravetz @ 2020-06-12  0:46 UTC (permalink / raw)
  To: linux-mm, linux-fsdevel, overlayfs, linux-kernel
  Cc: Al Viro, Miklos Szeredi, Matthew Wilcox, Colin Walters,
	Andrew Morton, syzbot, syzkaller-bugs, Mike Kravetz

The routine is_file_hugepages() checks f_op == hugetlbfs_file_operations
to determine if the file resides in hugetlbfs.  This is problematic when
the file is on a union or overlay.  Instead, define a new file mode
FMODE_HUGETLBFS which is set when a hugetlbfs file is opened.  The mode
can easily be copied to other 'files' derived from the original hugetlbfs
file.

With this change hugetlbfs_file_operations can be static as it should be.

There is also a (duplicate) set of shm file operations used for the routine
is_file_shm_hugepages().  Instead of setting/using special f_op's, just
propagate the FMODE_HUGETLBFS mode.  This means is_file_shm_hugepages() and
the duplicate f_ops can be removed.

While cleaning things up, change the name of is_file_hugepages() to
is_file_hugetlbfs().  The term hugepages is a bit ambiguous.

A subsequent patch will propagate FMODE_HUGETLBFS in overlayfs.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 fs/hugetlbfs/inode.c    |  7 +++++++
 fs/io_uring.c           |  2 +-
 include/linux/fs.h      |  3 +++
 include/linux/hugetlb.h | 10 ++++------
 include/linux/shm.h     |  5 -----
 ipc/shm.c               | 34 ++++++++--------------------------
 mm/memfd.c              |  2 +-
 mm/mmap.c               |  8 ++++----
 8 files changed, 28 insertions(+), 43 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 991c60c7ffe0..5c0c50a88c84 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -324,6 +324,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return retval;
 }
 
+static int hugetlbfs_open(struct inode *inode, struct file *file)
+{
+	file->f_mode |= FMODE_HUGETLBFS;
+	return 0;
+}
+
 static int hugetlbfs_write_begin(struct file *file,
 			struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
@@ -1112,6 +1118,7 @@ static void init_once(void *foo)
 
 const struct file_operations hugetlbfs_file_operations = {
 	.read_iter		= hugetlbfs_read_iter,
+	.open			= hugetlbfs_open,
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
diff --git a/fs/io_uring.c b/fs/io_uring.c
index bb25e3997d41..96e8a4bb610a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7123,7 +7123,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 				struct vm_area_struct *vma = vmas[j];
 
 				if (vma->vm_file &&
-				    !is_file_hugepages(vma->vm_file)) {
+				    !is_file_hugetlbfs(vma->vm_file)) {
 					ret = -EOPNOTSUPP;
 					break;
 				}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 45cc10cdf6dd..99af9513f9ab 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File does not contribute to nr_files count */
 #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
 
+/* File is in hugetlbfs filesystem */
+#define FMODE_HUGETLBFS		((__force fmode_t)0x40000000)
+
 /*
  * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
  * that indicates that they should check the contents of the iovec are
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 43a1cef8f0f1..aa3408775464 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -429,18 +429,16 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
 	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
 }
 
-extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				struct user_struct **user, int creat_flags,
 				int page_size_log);
 
-static inline bool is_file_hugepages(struct file *file)
+static inline bool is_file_hugetlbfs(struct file *file)
 {
-	if (file->f_op == &hugetlbfs_file_operations)
+	if (unlikely(file->f_mode & FMODE_HUGETLBFS))
 		return true;
-
-	return is_file_shm_hugepages(file);
+	return false;
 }
 
 static inline struct hstate *hstate_inode(struct inode *i)
@@ -449,7 +447,7 @@ static inline struct hstate *hstate_inode(struct inode *i)
 }
 #else /* !CONFIG_HUGETLBFS */
 
-#define is_file_hugepages(file)			false
+#define is_file_hugetlbfs(file)			false
 static inline struct file *
 hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
 		struct user_struct **user, int creat_flags,
diff --git a/include/linux/shm.h b/include/linux/shm.h
index d8e69aed3d32..1ab62d7b334f 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -16,7 +16,6 @@ struct sysv_shm {
 
 long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
 	      unsigned long shmlba);
-bool is_file_shm_hugepages(struct file *file);
 void exit_shm(struct task_struct *task);
 #define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist)
 #else
@@ -30,10 +29,6 @@ static inline long do_shmat(int shmid, char __user *shmaddr,
 {
 	return -ENOSYS;
 }
-static inline bool is_file_shm_hugepages(struct file *file)
-{
-	return false;
-}
 static inline void exit_shm(struct task_struct *task)
 {
 }
diff --git a/ipc/shm.c b/ipc/shm.c
index 0ba6add05b35..8f119b1d6170 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -285,7 +285,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
 	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	shm_rmid(ns, shp);
 	shm_unlock(shp);
-	if (!is_file_hugepages(shm_file))
+	if (!is_file_hugetlbfs(shm_file))
 		shmem_lock(shm_file, 0, shp->mlock_user);
 	else if (shp->mlock_user)
 		user_shm_unlock(i_size_read(file_inode(shm_file)),
@@ -560,24 +560,6 @@ static const struct file_operations shm_file_operations = {
 	.fallocate	= shm_fallocate,
 };
 
-/*
- * shm_file_operations_huge is now identical to shm_file_operations,
- * but we keep it distinct for the sake of is_file_shm_hugepages().
- */
-static const struct file_operations shm_file_operations_huge = {
-	.mmap		= shm_mmap,
-	.fsync		= shm_fsync,
-	.release	= shm_release,
-	.get_unmapped_area	= shm_get_unmapped_area,
-	.llseek		= noop_llseek,
-	.fallocate	= shm_fallocate,
-};
-
-bool is_file_shm_hugepages(struct file *file)
-{
-	return file->f_op == &shm_file_operations_huge;
-}
-
 static const struct vm_operations_struct shm_vm_ops = {
 	.open	= shm_open,	/* callback for a new vm-area open */
 	.close	= shm_close,	/* callback for when the vm-area is released */
@@ -698,7 +680,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 no_id:
 	ipc_update_pid(&shp->shm_cprid, NULL);
 	ipc_update_pid(&shp->shm_lprid, NULL);
-	if (is_file_hugepages(file) && shp->mlock_user)
+	if (is_file_hugetlbfs(file) && shp->mlock_user)
 		user_shm_unlock(size, shp->mlock_user);
 	fput(file);
 	ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
@@ -836,7 +818,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp,
 
 	inode = file_inode(shp->shm_file);
 
-	if (is_file_hugepages(shp->shm_file)) {
+	if (is_file_hugetlbfs(shp->shm_file)) {
 		struct address_space *mapping = inode->i_mapping;
 		struct hstate *h = hstate_file(shp->shm_file);
 		*rss_add += pages_per_huge_page(h) * mapping->nrpages;
@@ -1102,7 +1084,7 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd)
 	}
 
 	shm_file = shp->shm_file;
-	if (is_file_hugepages(shm_file))
+	if (is_file_hugetlbfs(shm_file))
 		goto out_unlock0;
 
 	if (cmd == SHM_LOCK) {
@@ -1523,10 +1505,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
 		goto out_nattch;
 	}
 
-	file = alloc_file_clone(base, f_flags,
-			  is_file_hugepages(base) ?
-				&shm_file_operations_huge :
-				&shm_file_operations);
+	file = alloc_file_clone(base, f_flags, &shm_file_operations);
 	err = PTR_ERR(file);
 	if (IS_ERR(file)) {
 		kfree(sfd);
@@ -1534,6 +1513,9 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
 		goto out_nattch;
 	}
 
+	/* copy hugetlbfs mode for is_file_hugetlbfs() */
+	file->f_mode |= (base->f_mode & FMODE_HUGETLBFS);
+
 	sfd->id = shp->shm_perm.id;
 	sfd->ns = get_ipc_ns(ns);
 	sfd->file = base;
diff --git a/mm/memfd.c b/mm/memfd.c
index 2647c898990c..e6c16b6bf3f6 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -123,7 +123,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
 		return &SHMEM_I(file_inode(file))->seals;
 
 #ifdef CONFIG_HUGETLBFS
-	if (is_file_hugepages(file))
+	if (is_file_hugetlbfs(file))
 		return &HUGETLBFS_I(file_inode(file))->seals;
 #endif
 
diff --git a/mm/mmap.c b/mm/mmap.c
index f609e9ec4a25..703a9680a937 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1538,7 +1538,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 			vm_flags |= VM_NORESERVE;
 
 		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
-		if (file && is_file_hugepages(file))
+		if (file && is_file_hugetlbfs(file))
 			vm_flags |= VM_NORESERVE;
 	}
 
@@ -1562,10 +1562,10 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 		file = fget(fd);
 		if (!file)
 			return -EBADF;
-		if (is_file_hugepages(file))
+		if (is_file_hugetlbfs(file))
 			len = ALIGN(len, huge_page_size(hstate_file(file)));
 		retval = -EINVAL;
-		if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+		if (unlikely(flags & MAP_HUGETLB && !is_file_hugetlbfs(file)))
 			goto out_fput;
 	} else if (flags & MAP_HUGETLB) {
 		struct user_struct *user = NULL;
@@ -1678,7 +1678,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 	 * hugetlb has its own accounting separate from the core VM
 	 * VM_HUGETLB may not be set yet so we cannot check for that flag.
 	 */
-	if (file && is_file_hugepages(file))
+	if (file && is_file_hugetlbfs(file))
 		return 0;
 
 	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
-- 
2.25.4


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v4 2/2] ovl: call underlying get_unmapped_area() routine. propogate FMODE_HUGETLBFS
  2020-06-12  0:46 [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files Mike Kravetz
@ 2020-06-12  0:46 ` Mike Kravetz
  2020-06-14 12:50   ` Amir Goldstein
  2020-06-12  1:53 ` [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files Matthew Wilcox
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: Mike Kravetz @ 2020-06-12  0:46 UTC (permalink / raw)
  To: linux-mm, linux-fsdevel, overlayfs, linux-kernel
  Cc: Al Viro, Miklos Szeredi, Matthew Wilcox, Colin Walters,
	Andrew Morton, syzbot, syzkaller-bugs, Mike Kravetz

The core routine get_unmapped_area will call a filesystem specific version
of get_unmapped_area if it exists in file operations.  If a file is on a
union/overlay, overlayfs does not contain a get_unmapped_area f_op and the
underlying filesystem routine may be ignored.  Add an overlayfs f_op to call
the underlying f_op if it exists.

The routine is_file_hugetlbfs() is used to determine if a file is on
hugetlbfs.  This is determined by f_mode & FMODE_HUGETLBFS.  Copy the mode
to the overlayfs file during open so that is_file_hugetlbfs() will work as
intended.

These two issues can result in the BUG as shown in [1].

[1] https://lore.kernel.org/linux-mm/000000000000b4684e05a2968ca6@google.com/

Reported-by: syzbot+d6ec23007e951dadf3de@syzkaller.appspotmail.com
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 fs/overlayfs/file.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 87c362f65448..41e5746ba3c6 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -124,6 +124,8 @@ static int ovl_real_fdget(const struct file *file, struct fd *real)
 	return ovl_real_fdget_meta(file, real, false);
 }
 
+#define OVL_F_MODE_TO_UPPER	(FMODE_HUGETLBFS)
+
 static int ovl_open(struct inode *inode, struct file *file)
 {
 	struct file *realfile;
@@ -140,6 +142,9 @@ static int ovl_open(struct inode *inode, struct file *file)
 	if (IS_ERR(realfile))
 		return PTR_ERR(realfile);
 
+	/* Copy modes from underlying file */
+	file->f_mode |= (realfile->f_mode & OVL_F_MODE_TO_UPPER);
+
 	file->private_data = realfile;
 
 	return 0;
@@ -757,6 +762,21 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
 			    remap_flags, op);
 }
 
+#ifdef CONFIG_MMU
+static unsigned long ovl_get_unmapped_area(struct file *file,
+				unsigned long uaddr, unsigned long len,
+				unsigned long pgoff, unsigned long flags)
+{
+	struct file *realfile = file->private_data;
+
+	return (realfile->f_op->get_unmapped_area ?:
+		current->mm->get_unmapped_area)(realfile,
+						uaddr, len, pgoff, flags);
+}
+#else
+#define ovl_get_unmapped_area NULL
+#endif
+
 const struct file_operations ovl_file_operations = {
 	.open		= ovl_open,
 	.release	= ovl_release,
@@ -774,6 +794,7 @@ const struct file_operations ovl_file_operations = {
 
 	.copy_file_range	= ovl_copy_file_range,
 	.remap_file_range	= ovl_remap_file_range,
+	.get_unmapped_area	= ovl_get_unmapped_area,
 };
 
 int __init ovl_aio_request_cache_init(void)
-- 
2.25.4


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-12  0:46 [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files Mike Kravetz
  2020-06-12  0:46 ` [PATCH v4 2/2] ovl: call underlying get_unmapped_area() routine. propogate FMODE_HUGETLBFS Mike Kravetz
@ 2020-06-12  1:53 ` Matthew Wilcox
  2020-06-12  1:58 ` Al Viro
  2020-06-12  6:28 ` [RFC PATCH] hugetlb: hugetlbfs_file_operations can be static kernel test robot
  3 siblings, 0 replies; 17+ messages in thread
From: Matthew Wilcox @ 2020-06-12  1:53 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-mm, linux-fsdevel, overlayfs, linux-kernel, Al Viro,
	Miklos Szeredi, Colin Walters, Andrew Morton, syzbot,
	syzkaller-bugs

On Thu, Jun 11, 2020 at 05:46:43PM -0700, Mike Kravetz wrote:
> The routine is_file_hugepages() checks f_op == hugetlbfs_file_operations
> to determine if the file resides in hugetlbfs.  This is problematic when
> the file is on a union or overlay.  Instead, define a new file mode
> FMODE_HUGETLBFS which is set when a hugetlbfs file is opened.  The mode
> can easily be copied to other 'files' derived from the original hugetlbfs
> file.
> 
> With this change hugetlbfs_file_operations can be static as it should be.
> 
> There is also a (duplicate) set of shm file operations used for the routine
> is_file_shm_hugepages().  Instead of setting/using special f_op's, just
> propagate the FMODE_HUGETLBFS mode.  This means is_file_shm_hugepages() and
> the duplicate f_ops can be removed.
> 
> While cleaning things up, change the name of is_file_hugepages() to
> is_file_hugetlbfs().  The term hugepages is a bit ambiguous.

I was going to have objections to this before I read it more carefully
and realised that the "shm" here is sysvipc and doesn't have anything
to do with the huge page support in shmfs.

> A subsequent patch will propagate FMODE_HUGETLBFS in overlayfs.
> 
> Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>

Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>

I might have suggested splitting the rename of is_file_hugetlbfs() from
the rest of this patch, but I wouldn't resend to change that.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-12  0:46 [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files Mike Kravetz
  2020-06-12  0:46 ` [PATCH v4 2/2] ovl: call underlying get_unmapped_area() routine. propogate FMODE_HUGETLBFS Mike Kravetz
  2020-06-12  1:53 ` [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files Matthew Wilcox
@ 2020-06-12  1:58 ` Al Viro
  2020-06-12 21:51   ` Mike Kravetz
  2020-06-12  6:28 ` [RFC PATCH] hugetlb: hugetlbfs_file_operations can be static kernel test robot
  3 siblings, 1 reply; 17+ messages in thread
From: Al Viro @ 2020-06-12  1:58 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-mm, linux-fsdevel, overlayfs, linux-kernel, Miklos Szeredi,
	Matthew Wilcox, Colin Walters, Andrew Morton, syzbot,
	syzkaller-bugs

On Thu, Jun 11, 2020 at 05:46:43PM -0700, Mike Kravetz wrote:
> The routine is_file_hugepages() checks f_op == hugetlbfs_file_operations
> to determine if the file resides in hugetlbfs.  This is problematic when
> the file is on a union or overlay.  Instead, define a new file mode
> FMODE_HUGETLBFS which is set when a hugetlbfs file is opened.  The mode
> can easily be copied to other 'files' derived from the original hugetlbfs
> file.
> 
> With this change hugetlbfs_file_operations can be static as it should be.
> 
> There is also a (duplicate) set of shm file operations used for the routine
> is_file_shm_hugepages().  Instead of setting/using special f_op's, just
> propagate the FMODE_HUGETLBFS mode.  This means is_file_shm_hugepages() and
> the duplicate f_ops can be removed.

s/HUGETLBFS/HUGEPAGES/, please.

> While cleaning things up, change the name of is_file_hugepages() to
> is_file_hugetlbfs().  The term hugepages is a bit ambiguous.

Don't, especially since the very next patch adds such on overlayfs...

Incidentally, can a hugetlbfs be a lower layer, while the upper one
is a normal filesystem?  What should happen on copyup?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [RFC PATCH] hugetlb: hugetlbfs_file_operations can be static
  2020-06-12  0:46 [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files Mike Kravetz
                   ` (2 preceding siblings ...)
  2020-06-12  1:58 ` Al Viro
@ 2020-06-12  6:28 ` kernel test robot
  3 siblings, 0 replies; 17+ messages in thread
From: kernel test robot @ 2020-06-12  6:28 UTC (permalink / raw)
  To: Mike Kravetz, linux-mm, linux-fsdevel, overlayfs, linux-kernel
  Cc: kbuild-all, Al Viro, Miklos Szeredi, Matthew Wilcox,
	Colin Walters, Andrew Morton, Linux Memory Management List


Signed-off-by: kernel test robot <lkp@intel.com>
---
 inode.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5c0c50a88c84b..98d044be8a5cf 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -41,7 +41,7 @@
 
 static const struct super_operations hugetlbfs_ops;
 static const struct address_space_operations hugetlbfs_aops;
-const struct file_operations hugetlbfs_file_operations;
+static const struct file_operations hugetlbfs_file_operations;
 static const struct inode_operations hugetlbfs_dir_inode_operations;
 static const struct inode_operations hugetlbfs_inode_operations;
 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-12  1:58 ` Al Viro
@ 2020-06-12 21:51   ` Mike Kravetz
  2020-06-13  6:53     ` Amir Goldstein
  0 siblings, 1 reply; 17+ messages in thread
From: Mike Kravetz @ 2020-06-12 21:51 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-mm, linux-fsdevel, overlayfs, linux-kernel, Miklos Szeredi,
	Matthew Wilcox, Colin Walters, Andrew Morton, syzbot,
	syzkaller-bugs

On 6/11/20 6:58 PM, Al Viro wrote:
> On Thu, Jun 11, 2020 at 05:46:43PM -0700, Mike Kravetz wrote:
>> The routine is_file_hugepages() checks f_op == hugetlbfs_file_operations
>> to determine if the file resides in hugetlbfs.  This is problematic when
>> the file is on a union or overlay.  Instead, define a new file mode
>> FMODE_HUGETLBFS which is set when a hugetlbfs file is opened.  The mode
>> can easily be copied to other 'files' derived from the original hugetlbfs
>> file.
>>
>> With this change hugetlbfs_file_operations can be static as it should be.
>>
>> There is also a (duplicate) set of shm file operations used for the routine
>> is_file_shm_hugepages().  Instead of setting/using special f_op's, just
>> propagate the FMODE_HUGETLBFS mode.  This means is_file_shm_hugepages() and
>> the duplicate f_ops can be removed.
> 
> s/HUGETLBFS/HUGEPAGES/, please.
> 
>> While cleaning things up, change the name of is_file_hugepages() to
>> is_file_hugetlbfs().  The term hugepages is a bit ambiguous.
> 
> Don't, especially since the very next patch adds such on overlayfs...

Ok. This is just something I thought might clarify things.  I seem to
recall questions about 'huge page' routines such as "is that for THP or
hugetlb huge pages"?  That was my motivation for the change.  Since this
is only about hugetlbfs, make it explicit.

> Incidentally, can a hugetlbfs be a lower layer, while the upper one
> is a normal filesystem?  What should happen on copyup?

Yes, that seems to work as expected.  When accessed for write the hugetlb
file is copied to the normal filesystem.

The BUG found by syzbot actually has a single hugetlbfs as both lower and
upper.  With the BUG 'fixed', I am not exactly sure what the expected
behavior is in this case.  I may be wrong, but I would expect any operations
that can be performed on a stand alone hugetlbfs to also be performed on
the overlay.  However, mmap() still fails.  I will look into it.

I also looked at normal filesystem lower and hugetlbfs upper.  Yes, overlayfs
allows this.  This is somewhat 'interesting' as write() is not supported in
hugetlbfs.  Writing to files in the overlay actually ended up writing to
files in the lower filesystem.  That seems wrong, but overlayfs is new to me.

Earlier in the discussion of these issues, Colin Walters asked "Is there any
actual valid use case for mounting an overlayfs on top of hugetlbfs?"  I can
not think of one.  Perhaps we should consider limiting the ways in which
hugetlbfs can be used in overlayfs?  Preventing it from being an upper
filesystem might be a good start?  Or, do people think making hugetlbfs and
overlayfs play nice together is useful?
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-12 21:51   ` Mike Kravetz
@ 2020-06-13  6:53     ` Amir Goldstein
  2020-06-13 14:38       ` Matthew Wilcox
                         ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Amir Goldstein @ 2020-06-13  6:53 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Al Viro, Linux MM, linux-fsdevel, overlayfs, linux-kernel,
	Miklos Szeredi, Matthew Wilcox, Colin Walters, Andrew Morton,
	syzbot, syzkaller-bugs

> > Incidentally, can a hugetlbfs be a lower layer, while the upper one
> > is a normal filesystem?  What should happen on copyup?
>
> Yes, that seems to work as expected.  When accessed for write the hugetlb
> file is copied to the normal filesystem.
>
> The BUG found by syzbot actually has a single hugetlbfs as both lower and
> upper.  With the BUG 'fixed', I am not exactly sure what the expected
> behavior is in this case.  I may be wrong, but I would expect any operations
> that can be performed on a stand alone hugetlbfs to also be performed on
> the overlay.  However, mmap() still fails.  I will look into it.
>
> I also looked at normal filesystem lower and hugetlbfs upper.  Yes, overlayfs
> allows this.  This is somewhat 'interesting' as write() is not supported in
> hugetlbfs.  Writing to files in the overlay actually ended up writing to
> files in the lower filesystem.  That seems wrong, but overlayfs is new to me.
>

I am not sure how that happened, but I think that ovl_open_realfile()
needs to fixup f_mode flags FMODE_CAN_WRITE | FMODE_CAN_READ
after open_with_fake_path().

> Earlier in the discussion of these issues, Colin Walters asked "Is there any
> actual valid use case for mounting an overlayfs on top of hugetlbfs?"  I can
> not think of one.  Perhaps we should consider limiting the ways in which
> hugetlbfs can be used in overlayfs?  Preventing it from being an upper
> filesystem might be a good start?  Or, do people think making hugetlbfs and
> overlayfs play nice together is useful?

If people think that making hugetlbfs and overlayfs play nice together maybe
they should work on this problem. It doesn't look like either
hugetlbfs developers
nor overlayfs developers care much about the combination.
Your concern, I assume, is fixing the syzbot issue.

I agree with Colin's remark about adding limitations, but it would be a shame
if overlay had to special case hugetlbfs. It would have been better if we could
find a property of hugetlbfs that makes it inapplicable for overlayfs
upper/lower
or stacking fs in general.

The simplest thing for you to do in order to shush syzbot is what procfs does:
        /*
         * procfs isn't actually a stacking filesystem; however, there is
         * too much magic going on inside it to permit stacking things on
         * top of it
         */
        s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;

Currently, the only in-tree stacking fs are overlayfs and ecryptfs, but there
are some out of tree implementations as well (shiftfs).
So you may only take that option if you do not care about the combination
of hugetlbfs with any of the above.

overlayfs support of mmap is not as good as one might hope.
overlayfs.rst says:
"If a file residing on a lower layer is opened for read-only and then
 memory mapped with MAP_SHARED, then subsequent changes to
 the file are not reflected in the memory mapping."

So if I were you, I wouldn't go trying to fix overlayfs-huguetlb interop...

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-13  6:53     ` Amir Goldstein
@ 2020-06-13 14:38       ` Matthew Wilcox
  2020-06-13 19:12       ` Mike Kravetz
  2020-06-15  8:24       ` Miklos Szeredi
  2 siblings, 0 replies; 17+ messages in thread
From: Matthew Wilcox @ 2020-06-13 14:38 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Mike Kravetz, Al Viro, Linux MM, linux-fsdevel, overlayfs,
	linux-kernel, Miklos Szeredi, Colin Walters, Andrew Morton,
	syzbot, syzkaller-bugs

On Sat, Jun 13, 2020 at 09:53:24AM +0300, Amir Goldstein wrote:
> Currently, the only in-tree stacking fs are overlayfs and ecryptfs, but there
> are some out of tree implementations as well (shiftfs).
> So you may only take that option if you do not care about the combination
> of hugetlbfs with any of the above.

I could see shiftfs being interesting, maybe.  I don't really see
the usecase for layering overlayfs or ecryptfs on top of a ram-based
filesystem.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-13  6:53     ` Amir Goldstein
  2020-06-13 14:38       ` Matthew Wilcox
@ 2020-06-13 19:12       ` Mike Kravetz
  2020-06-15  7:53         ` Miklos Szeredi
  2020-06-15  8:24       ` Miklos Szeredi
  2 siblings, 1 reply; 17+ messages in thread
From: Mike Kravetz @ 2020-06-13 19:12 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Al Viro, Linux MM, linux-fsdevel, overlayfs, linux-kernel,
	Miklos Szeredi, Matthew Wilcox, Colin Walters, Andrew Morton,
	syzbot, syzkaller-bugs

On 6/12/20 11:53 PM, Amir Goldstein wrote:
>>> Incidentally, can a hugetlbfs be a lower layer, while the upper one
>>> is a normal filesystem?  What should happen on copyup?
>>
>> Yes, that seems to work as expected.  When accessed for write the hugetlb
>> file is copied to the normal filesystem.
>>
>> The BUG found by syzbot actually has a single hugetlbfs as both lower and
>> upper.  With the BUG 'fixed', I am not exactly sure what the expected
>> behavior is in this case.  I may be wrong, but I would expect any operations
>> that can be performed on a stand alone hugetlbfs to also be performed on
>> the overlay.  However, mmap() still fails.  I will look into it.
>>
>> I also looked at normal filesystem lower and hugetlbfs upper.  Yes, overlayfs
>> allows this.  This is somewhat 'interesting' as write() is not supported in
>> hugetlbfs.  Writing to files in the overlay actually ended up writing to
>> files in the lower filesystem.  That seems wrong, but overlayfs is new to me.
>>
> 
> I am not sure how that happened, but I think that ovl_open_realfile()
> needs to fixup f_mode flags FMODE_CAN_WRITE | FMODE_CAN_READ
> after open_with_fake_path().
> 
>> Earlier in the discussion of these issues, Colin Walters asked "Is there any
>> actual valid use case for mounting an overlayfs on top of hugetlbfs?"  I can
>> not think of one.  Perhaps we should consider limiting the ways in which
>> hugetlbfs can be used in overlayfs?  Preventing it from being an upper
>> filesystem might be a good start?  Or, do people think making hugetlbfs and
>> overlayfs play nice together is useful?
> 
> If people think that making hugetlbfs and overlayfs play nice together maybe
> they should work on this problem. It doesn't look like either
> hugetlbfs developers
> nor overlayfs developers care much about the combination.

Thanks Amir,

As a hugetlbfs developer, I do not know of a use case for interoperability
with overlayfs.  So yes, I am not too interested in making them work well
together.  However, if there was an actual use case I would be more than
happy to consider doing the work.  Just hate to put effort into fixing up
two 'special' filesystems for functionality that may not be used.

I can't speak for overlayfs developers.

> Your concern, I assume, is fixing the syzbot issue.

That is the primary concern.  We should not BUG!  After fixing that up, Al
asked how these things worked together.  I honestly did not look at
interoperability before that.  I am not sure if anyone has done that in the
past.

> I agree with Colin's remark about adding limitations, but it would be a shame
> if overlay had to special case hugetlbfs. It would have been better if we could
> find a property of hugetlbfs that makes it inapplicable for overlayfs
> upper/lower
> or stacking fs in general.
> 
> The simplest thing for you to do in order to shush syzbot is what procfs does:
>         /*
>          * procfs isn't actually a stacking filesystem; however, there is
>          * too much magic going on inside it to permit stacking things on
>          * top of it
>          */
>         s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
> 
> Currently, the only in-tree stacking fs are overlayfs and ecryptfs, but there
> are some out of tree implementations as well (shiftfs).
> So you may only take that option if you do not care about the combination
> of hugetlbfs with any of the above.
> 
> overlayfs support of mmap is not as good as one might hope.
> overlayfs.rst says:
> "If a file residing on a lower layer is opened for read-only and then
>  memory mapped with MAP_SHARED, then subsequent changes to
>  the file are not reflected in the memory mapping."
> 
> So if I were you, I wouldn't go trying to fix overlayfs-huguetlb interop...

Thanks again,

I'll look at something as simple as s_stack_depth.
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 2/2] ovl: call underlying get_unmapped_area() routine. propogate FMODE_HUGETLBFS
  2020-06-12  0:46 ` [PATCH v4 2/2] ovl: call underlying get_unmapped_area() routine. propogate FMODE_HUGETLBFS Mike Kravetz
@ 2020-06-14 12:50   ` Amir Goldstein
  0 siblings, 0 replies; 17+ messages in thread
From: Amir Goldstein @ 2020-06-14 12:50 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Linux MM, linux-fsdevel, overlayfs, linux-kernel, Al Viro,
	Miklos Szeredi, Matthew Wilcox, Colin Walters, Andrew Morton,
	syzbot, syzkaller-bugs


[-- Attachment #1: Type: text/plain, Size: 2106 bytes --]

On Fri, Jun 12, 2020 at 3:57 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> The core routine get_unmapped_area will call a filesystem specific version
> of get_unmapped_area if it exists in file operations.  If a file is on a
> union/overlay, overlayfs does not contain a get_unmapped_area f_op and the
> underlying filesystem routine may be ignored.  Add an overlayfs f_op to call
> the underlying f_op if it exists.
>
> The routine is_file_hugetlbfs() is used to determine if a file is on
> hugetlbfs.  This is determined by f_mode & FMODE_HUGETLBFS.  Copy the mode
> to the overlayfs file during open so that is_file_hugetlbfs() will work as
> intended.
>
> These two issues can result in the BUG as shown in [1].
>
> [1] https://lore.kernel.org/linux-mm/000000000000b4684e05a2968ca6@google.com/
>
> Reported-by: syzbot+d6ec23007e951dadf3de@syzkaller.appspotmail.com
> Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  fs/overlayfs/file.c | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
>
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 87c362f65448..41e5746ba3c6 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -124,6 +124,8 @@ static int ovl_real_fdget(const struct file *file, struct fd *real)
>         return ovl_real_fdget_meta(file, real, false);
>  }
>
> +#define OVL_F_MODE_TO_UPPER    (FMODE_HUGETLBFS)
> +
>  static int ovl_open(struct inode *inode, struct file *file)
>  {
>         struct file *realfile;
> @@ -140,6 +142,9 @@ static int ovl_open(struct inode *inode, struct file *file)
>         if (IS_ERR(realfile))
>                 return PTR_ERR(realfile);
>
> +       /* Copy modes from underlying file */
> +       file->f_mode |= (realfile->f_mode & OVL_F_MODE_TO_UPPER);
> +

The name OVL_F_MODE_TO_UPPER is strange because
ovl_open() may be opening a file from lower or from upper.

Please see attached patches.
They are not enough to solve the syzbot repro, but if you do want to
fix hugetlb/overlay interop I think they will be necessary.

Thanks,
Amir.

[-- Attachment #2: 0002-ovl-warn-about-unsupported-file-operations-on-upper-.patch.txt --]
[-- Type: text/plain, Size: 4831 bytes --]

From 691915fd8d4bb2c0f944b7d8e504a323b8e53b69 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 14 Jun 2020 13:51:30 +0300
Subject: [PATCH 2/2] ovl: warn about unsupported file operations on upper fs

syzbot has reported a crash in a test case where overlayfs uses
hugetlbfs as upper fs.

Since hugetlbfs has no write() nor write_iter() file ops, there seems
to be little value in supporting this configuration, however, we do not
want to regress strange use cases that me be using such configuration.

As a minimum, check for this case and warn about it on mount time as
well as adding this check for the strict requirements set of remote
upper fs.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/overlayfs/super.c | 60 ++++++++++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 91476bc422f9..5bee70eb8f64 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -6,6 +6,7 @@
 
 #include <uapi/linux/magic.h>
 #include <linux/fs.h>
+#include <linux/file.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
 #include <linux/mount.h>
@@ -1131,42 +1132,61 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
 }
 
 /*
- * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and
+ * Returns 1 if required ops are supported, 0 if not supported and
  * negative values if error is encountered.
  */
-static int ovl_check_rename_whiteout(struct dentry *workdir)
+static int ovl_check_supported_ops(struct ovl_fs *ofs)
 {
-	struct inode *dir = d_inode(workdir);
-	struct dentry *temp;
+	struct inode *dir = d_inode(ofs->workdir);
+	struct dentry *temp = NULL;
 	struct dentry *dest;
 	struct dentry *whiteout;
 	struct name_snapshot name;
+	struct path path;
+	struct file *file;
+	unsigned int unsupported;
 	int err;
 
 	inode_lock_nested(dir, I_MUTEX_PARENT);
 
-	temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0));
-	err = PTR_ERR(temp);
-	if (IS_ERR(temp))
+	path.mnt = ovl_upper_mnt(ofs);
+	path.dentry = ovl_create_temp(ofs->workdir, OVL_CATTR(S_IFREG | 0));
+	err = PTR_ERR(path.dentry);
+	if (IS_ERR(path.dentry))
 		goto out_unlock;
 
-	dest = ovl_lookup_temp(workdir);
-	err = PTR_ERR(dest);
-	if (IS_ERR(dest)) {
-		dput(temp);
+	temp = path.dentry;
+	file = dentry_open(&path, O_RDWR, current_cred());
+	err = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out_unlock;
+
+	unsupported = OVL_UPPER_FMODE_MASK & ~file->f_mode;
+	fput(file);
+	if (unsupported) {
+		err = 0;
+		pr_warn("upper fs does not support required file operations (%x).\n",
+			unsupported);
 		goto out_unlock;
 	}
 
+	dest = ovl_lookup_temp(ofs->workdir);
+	err = PTR_ERR(dest);
+	if (IS_ERR(dest))
+		goto out_unlock;
+
 	/* Name is inline and stable - using snapshot as a copy helper */
 	take_dentry_name_snapshot(&name, temp);
 	err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT);
 	if (err) {
-		if (err == -EINVAL)
+		if (err == -EINVAL) {
 			err = 0;
+			pr_warn("upper fs does not support RENAME_WHITEOUT.\n");
+		}
 		goto cleanup_temp;
 	}
 
-	whiteout = lookup_one_len(name.name.name, workdir, name.name.len);
+	whiteout = lookup_one_len(name.name.name, ofs->workdir, name.name.len);
 	err = PTR_ERR(whiteout);
 	if (IS_ERR(whiteout))
 		goto cleanup_temp;
@@ -1181,10 +1201,10 @@ static int ovl_check_rename_whiteout(struct dentry *workdir)
 cleanup_temp:
 	ovl_cleanup(dir, temp);
 	release_dentry_name_snapshot(&name);
-	dput(temp);
 	dput(dest);
 
 out_unlock:
+	dput(temp);
 	inode_unlock(dir);
 
 	return err;
@@ -1195,7 +1215,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
 {
 	struct vfsmount *mnt = ovl_upper_mnt(ofs);
 	struct dentry *temp;
-	bool rename_whiteout;
+	bool supported_ops;
 	bool d_type;
 	int fh_type;
 	int err;
@@ -1235,14 +1255,12 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
 		pr_warn("upper fs does not support tmpfile.\n");
 
 
-	/* Check if upper/work fs supports RENAME_WHITEOUT */
-	err = ovl_check_rename_whiteout(ofs->workdir);
+	/* Check if upper/work fs supports required ops */
+	err = ovl_check_supported_ops(ofs);
 	if (err < 0)
 		goto out;
 
-	rename_whiteout = err;
-	if (!rename_whiteout)
-		pr_warn("upper fs does not support RENAME_WHITEOUT.\n");
+	supported_ops = err;
 
 	/*
 	 * Check if upper/work fs supports trusted.overlay.* xattr
@@ -1264,7 +1282,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
 	 * we can enforce strict requirements for remote upper fs.
 	 */
 	if (ovl_dentry_remote(ofs->workdir) &&
-	    (!d_type || !rename_whiteout || ofs->noxattr)) {
+	    (!d_type || !supported_ops || ofs->noxattr)) {
 		pr_err("upper fs missing required features.\n");
 		err = -EINVAL;
 		goto out;
-- 
2.17.1


[-- Attachment #3: 0001-ovl-inherit-supported-ops-f_mode-flags-from-final-re.patch.txt --]
[-- Type: text/plain, Size: 3515 bytes --]

From bd4e44245f2f162da37b02dc1e0e7458cd28a5de Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 14 Jun 2020 10:44:09 +0300
Subject: [PATCH 1/2] ovl: inherit supported ops f_mode flags from final real
 file

Mike Kravetz reported that when hugetlbfs is used as overlayfs upper
layer, writes do not fail, but result in writing to lower layer.

This is surprising because hugeltbfs file does not have write() nor
write_iter() method.

Regardless of the question whether or not this type of filesystem should
be allowed as overlayfs upper layer, overlayfs file should emulate the
supported ops of the underlying files, so at least in the case where
underlying file ops cannot change as result of copy up, the overlayfs
file should inherit the f_mode flags indicating the supported ops of
the underlying file.

Reported-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/overlayfs/copy_up.c   | 11 +++++++++++
 fs/overlayfs/file.c      | 11 +++++++++++
 fs/overlayfs/overlayfs.h |  5 +++++
 3 files changed, 27 insertions(+)

diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 79dd052c7dbf..424f2a170f11 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -953,6 +953,17 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)
 	return true;
 }
 
+/* May need copy up in the future? */
+bool ovl_may_need_copy_up(struct dentry *dentry)
+{
+	int flags = O_RDONLY;
+
+	if (ovl_upper_mnt(OVL_FS(dentry->d_sb)))
+		flags = O_RDWR;
+
+	return ovl_open_need_copy_up(dentry, flags);
+}
+
 int ovl_maybe_copy_up(struct dentry *dentry, int flags)
 {
 	int err = 0;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 01820e654a21..01dd3ed723df 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -153,6 +153,17 @@ static int ovl_open(struct inode *inode, struct file *file)
 	if (IS_ERR(realfile))
 		return PTR_ERR(realfile);
 
+	/*
+	 * Overlay file supported ops are a super set of the underlying file
+	 * supported ops and we do not change them when file is copied up.
+	 * But if file cannot be copied up, then there is no need to advertize
+	 * more supported ops than underlying file actually has.
+	 */
+	if (!ovl_may_need_copy_up(file_dentry(file))) {
+		file->f_mode &= ~OVL_UPPER_FMODE_MASK;
+		file->f_mode |= realfile->f_mode & OVL_UPPER_FMODE_MASK;
+	}
+
 	file->private_data = realfile;
 
 	return 0;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index b725c7f15ff4..6748c28ff477 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -110,6 +110,10 @@ struct ovl_fh {
 #define OVL_FH_FID_OFFSET	(OVL_FH_WIRE_OFFSET + \
 				 offsetof(struct ovl_fb, fid))
 
+/* f_mode bits expected to be set on an upper file */
+#define OVL_UPPER_FMODE_MASK (FMODE_CAN_READ | FMODE_CAN_WRITE | \
+			      FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE)
+
 static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int err = vfs_rmdir(dir, dentry);
@@ -485,6 +489,7 @@ int ovl_copy_up(struct dentry *dentry);
 int ovl_copy_up_with_data(struct dentry *dentry);
 int ovl_copy_up_flags(struct dentry *dentry, int flags);
 int ovl_maybe_copy_up(struct dentry *dentry, int flags);
+bool ovl_may_need_copy_up(struct dentry *dentry);
 int ovl_copy_xattr(struct dentry *old, struct dentry *new);
 int ovl_set_attr(struct dentry *upper, struct kstat *stat);
 struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-13 19:12       ` Mike Kravetz
@ 2020-06-15  7:53         ` Miklos Szeredi
  2020-06-15 10:05           ` Amir Goldstein
  2020-06-15 23:45           ` Mike Kravetz
  0 siblings, 2 replies; 17+ messages in thread
From: Miklos Szeredi @ 2020-06-15  7:53 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Amir Goldstein, Al Viro, Linux MM, linux-fsdevel, overlayfs,
	linux-kernel, Matthew Wilcox, Colin Walters, Andrew Morton,
	syzbot, syzkaller-bugs

On Sat, Jun 13, 2020 at 9:12 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 6/12/20 11:53 PM, Amir Goldstein wrote:

> As a hugetlbfs developer, I do not know of a use case for interoperability
> with overlayfs.  So yes, I am not too interested in making them work well
> together.  However, if there was an actual use case I would be more than
> happy to consider doing the work.  Just hate to put effort into fixing up
> two 'special' filesystems for functionality that may not be used.
>
> I can't speak for overlayfs developers.

As I said, I only know of tmpfs being upper layer as a valid use case.
   Does that work with hugepages?  How would I go about testing that?

> > I agree with Colin's remark about adding limitations, but it would be a shame
> > if overlay had to special case hugetlbfs. It would have been better if we could
> > find a property of hugetlbfs that makes it inapplicable for overlayfs
> > upper/lower
> > or stacking fs in general.
> >
> > The simplest thing for you to do in order to shush syzbot is what procfs does:
> >         /*
> >          * procfs isn't actually a stacking filesystem; however, there is
> >          * too much magic going on inside it to permit stacking things on
> >          * top of it
> >          */
> >         s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
> >
> > Currently, the only in-tree stacking fs are overlayfs and ecryptfs, but there
> > are some out of tree implementations as well (shiftfs).
> > So you may only take that option if you do not care about the combination
> > of hugetlbfs with any of the above.
> >
> > overlayfs support of mmap is not as good as one might hope.
> > overlayfs.rst says:
> > "If a file residing on a lower layer is opened for read-only and then
> >  memory mapped with MAP_SHARED, then subsequent changes to
> >  the file are not reflected in the memory mapping."
> >
> > So if I were you, I wouldn't go trying to fix overlayfs-huguetlb interop...
>
> Thanks again,
>
> I'll look at something as simple as s_stack_depth.

Agree.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-13  6:53     ` Amir Goldstein
  2020-06-13 14:38       ` Matthew Wilcox
  2020-06-13 19:12       ` Mike Kravetz
@ 2020-06-15  8:24       ` Miklos Szeredi
  2020-06-15 17:48         ` Mike Kravetz
  2 siblings, 1 reply; 17+ messages in thread
From: Miklos Szeredi @ 2020-06-15  8:24 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Mike Kravetz, Al Viro, Linux MM, linux-fsdevel, overlayfs,
	linux-kernel, Matthew Wilcox, Colin Walters, Andrew Morton,
	syzbot, syzkaller-bugs

On Sat, Jun 13, 2020 at 8:53 AM Amir Goldstein <amir73il@gmail.com> wrote:

> > I also looked at normal filesystem lower and hugetlbfs upper.  Yes, overlayfs
> > allows this.  This is somewhat 'interesting' as write() is not supported in
> > hugetlbfs.  Writing to files in the overlay actually ended up writing to
> > files in the lower filesystem.  That seems wrong, but overlayfs is new to me.

Yes, this very definitely should not happen.

> I am not sure how that happened, but I think that ovl_open_realfile()
> needs to fixup f_mode flags FMODE_CAN_WRITE | FMODE_CAN_READ
> after open_with_fake_path().

Okay, but how did the write actually get to the lower layer?

I failed to reproduce this.  Mike, how did you trigger this?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-15  7:53         ` Miklos Szeredi
@ 2020-06-15 10:05           ` Amir Goldstein
  2020-06-15 13:01             ` Miklos Szeredi
  2020-06-15 23:45           ` Mike Kravetz
  1 sibling, 1 reply; 17+ messages in thread
From: Amir Goldstein @ 2020-06-15 10:05 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Mike Kravetz, Al Viro, Linux MM, linux-fsdevel, overlayfs,
	linux-kernel, Matthew Wilcox, Colin Walters, Andrew Morton,
	syzbot, syzkaller-bugs

On Mon, Jun 15, 2020 at 10:53 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Sat, Jun 13, 2020 at 9:12 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >
> > On 6/12/20 11:53 PM, Amir Goldstein wrote:
>
> > As a hugetlbfs developer, I do not know of a use case for interoperability
> > with overlayfs.  So yes, I am not too interested in making them work well
> > together.  However, if there was an actual use case I would be more than
> > happy to consider doing the work.  Just hate to put effort into fixing up
> > two 'special' filesystems for functionality that may not be used.
> >
> > I can't speak for overlayfs developers.
>
> As I said, I only know of tmpfs being upper layer as a valid use case.
>    Does that work with hugepages?  How would I go about testing that?

Simple, after enabling CONFIG_HUGETLBFS:

diff --git a/mount_union.py b/mount_union.py
index fae8899..4070c70 100644
--- a/mount_union.py
+++ b/mount_union.py
@@ -15,7 +15,7 @@ def mount_union(ctx):
         snapshot_mntroot = cfg.snapshot_mntroot()
         if cfg.should_mount_upper():
             system("mount " + upper_mntroot + " 2>/dev/null"
-                    " || mount -t tmpfs upper_layer " + upper_mntroot)
+                    " || mount -t hugetlbfs upper_layer " + upper_mntroot)
         layer_mntroot = upper_mntroot + "/" + ctx.curr_layer()
         upperdir = layer_mntroot + "/u"
         workdir = layer_mntroot + "/w"

It fails colossally, because hugetlbfs, does not have write_iter().
It is only meant as an interface to create named maps of huge pages.
So I don't really see the use case for using it as upper.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-15 10:05           ` Amir Goldstein
@ 2020-06-15 13:01             ` Miklos Szeredi
  0 siblings, 0 replies; 17+ messages in thread
From: Miklos Szeredi @ 2020-06-15 13:01 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Mike Kravetz, Al Viro, Linux MM, linux-fsdevel, overlayfs,
	linux-kernel, Matthew Wilcox, Colin Walters, Andrew Morton,
	syzbot, syzkaller-bugs

On Mon, Jun 15, 2020 at 12:05 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, Jun 15, 2020 at 10:53 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
> >
> > On Sat, Jun 13, 2020 at 9:12 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > >
> > > On 6/12/20 11:53 PM, Amir Goldstein wrote:
> >
> > > As a hugetlbfs developer, I do not know of a use case for interoperability
> > > with overlayfs.  So yes, I am not too interested in making them work well
> > > together.  However, if there was an actual use case I would be more than
> > > happy to consider doing the work.  Just hate to put effort into fixing up
> > > two 'special' filesystems for functionality that may not be used.
> > >
> > > I can't speak for overlayfs developers.
> >
> > As I said, I only know of tmpfs being upper layer as a valid use case.
> >    Does that work with hugepages?  How would I go about testing that?
>
> Simple, after enabling CONFIG_HUGETLBFS:
>
> diff --git a/mount_union.py b/mount_union.py
> index fae8899..4070c70 100644
> --- a/mount_union.py
> +++ b/mount_union.py
> @@ -15,7 +15,7 @@ def mount_union(ctx):
>          snapshot_mntroot = cfg.snapshot_mntroot()
>          if cfg.should_mount_upper():
>              system("mount " + upper_mntroot + " 2>/dev/null"
> -                    " || mount -t tmpfs upper_layer " + upper_mntroot)
> +                    " || mount -t hugetlbfs upper_layer " + upper_mntroot)
>          layer_mntroot = upper_mntroot + "/" + ctx.curr_layer()
>          upperdir = layer_mntroot + "/u"
>          workdir = layer_mntroot + "/w"
>
> It fails colossally, because hugetlbfs, does not have write_iter().
> It is only meant as an interface to create named maps of huge pages.
> So I don't really see the use case for using it as upper.

Right.

I was actually asking about the tmpfs+hugepages, not the hugetlbfs case.

In the tmpfs case it looks like the lack of ->get_unmapped_area() in
overlayfs could still be an issue.   But I'm not sure how to trigger
that.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-15  8:24       ` Miklos Szeredi
@ 2020-06-15 17:48         ` Mike Kravetz
  0 siblings, 0 replies; 17+ messages in thread
From: Mike Kravetz @ 2020-06-15 17:48 UTC (permalink / raw)
  To: Miklos Szeredi, Amir Goldstein
  Cc: Al Viro, Linux MM, linux-fsdevel, overlayfs, linux-kernel,
	Matthew Wilcox, Colin Walters, Andrew Morton, syzbot,
	syzkaller-bugs

On 6/15/20 1:24 AM, Miklos Szeredi wrote:
> On Sat, Jun 13, 2020 at 8:53 AM Amir Goldstein <amir73il@gmail.com> wrote:
> 
>>> I also looked at normal filesystem lower and hugetlbfs upper.  Yes, overlayfs
>>> allows this.  This is somewhat 'interesting' as write() is not supported in
>>> hugetlbfs.  Writing to files in the overlay actually ended up writing to
>>> files in the lower filesystem.  That seems wrong, but overlayfs is new to me.
> 
> Yes, this very definitely should not happen.
> 
>> I am not sure how that happened, but I think that ovl_open_realfile()
>> needs to fixup f_mode flags FMODE_CAN_WRITE | FMODE_CAN_READ
>> after open_with_fake_path().
> 
> Okay, but how did the write actually get to the lower layer?
> 
> I failed to reproduce this.  Mike, how did you trigger this?

My apologies!!!

I reviewed my testing and found that it was incorrectly writing to the
lower filesystem.  Writing to any file in the union will fail.
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-15  7:53         ` Miklos Szeredi
  2020-06-15 10:05           ` Amir Goldstein
@ 2020-06-15 23:45           ` Mike Kravetz
  2020-06-16  9:01             ` Miklos Szeredi
  1 sibling, 1 reply; 17+ messages in thread
From: Mike Kravetz @ 2020-06-15 23:45 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Amir Goldstein, Al Viro, Linux MM, linux-fsdevel, overlayfs,
	linux-kernel, Matthew Wilcox, Colin Walters, Andrew Morton,
	syzbot, syzkaller-bugs

On 6/15/20 12:53 AM, Miklos Szeredi wrote:
> On Sat, Jun 13, 2020 at 9:12 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>> On 6/12/20 11:53 PM, Amir Goldstein wrote:
>>>
>>> The simplest thing for you to do in order to shush syzbot is what procfs does:
>>>         /*
>>>          * procfs isn't actually a stacking filesystem; however, there is
>>>          * too much magic going on inside it to permit stacking things on
>>>          * top of it
>>>          */
>>>         s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
>>>
>>> Currently, the only in-tree stacking fs are overlayfs and ecryptfs, but there
>>> are some out of tree implementations as well (shiftfs).
>>> So you may only take that option if you do not care about the combination
>>> of hugetlbfs with any of the above.
>>>
>>> overlayfs support of mmap is not as good as one might hope.
>>> overlayfs.rst says:
>>> "If a file residing on a lower layer is opened for read-only and then
>>>  memory mapped with MAP_SHARED, then subsequent changes to
>>>  the file are not reflected in the memory mapping."
>>>
>>> So if I were you, I wouldn't go trying to fix overlayfs-huguetlb interop...
>>
>> Thanks again,
>>
>> I'll look at something as simple as s_stack_depth.
> 
> Agree.

Apologies again for in the incorrect information about writing to lower
filesystem.

Stacking ecryptfs on hugetlbfs does not work either.  Here is what happens
when trying to create a new file.

[ 1188.863425] ecryptfs_write_metadata_to_contents: Error attempting to write header information to lower file; rc = [-22]
[ 1188.865469] ecryptfs_write_metadata: Error writing metadata out to lower file; rc = [-22]
[ 1188.867022] Error writing headers; rc = [-22]

I like Amir's idea of just setting s_stack_depth in hugetlbfs to prevent
stacking.

From 0fbed66b37c18919ea7edd47b113c97644f49362 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Mon, 15 Jun 2020 14:37:52 -0700
Subject: [PATCH] hugetlbfs: prevent filesystem stacking of hugetlbfs

syzbot found issues with having hugetlbfs on a union/overlay as reported
in [1].  Due to the limitations (no write) and special functionality of
hugetlbfs, it does not work well in filesystem stacking.  There are no
know use cases for hugetlbfs stacking.  Rather than making modifications
to get hugetlbfs working in such environments, simply prevent stacking.

[1] https://lore.kernel.org/linux-mm/000000000000b4684e05a2968ca6@google.com/

Reported-by: syzbot+d6ec23007e951dadf3de@syzkaller.appspotmail.com
Suggested-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 fs/hugetlbfs/inode.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 991c60c7ffe0..f32759c8e84d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1313,6 +1313,12 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_magic = HUGETLBFS_MAGIC;
 	sb->s_op = &hugetlbfs_ops;
 	sb->s_time_gran = 1;
+
+	/*
+	 * Due to the special and limited functionality of hugetlbfs, it does
+	 * not work well as a stacking filesystem.
+	 */
+	sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
 	sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
 	if (!sb->s_root)
 		goto out_free;
-- 
2.25.4


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files
  2020-06-15 23:45           ` Mike Kravetz
@ 2020-06-16  9:01             ` Miklos Szeredi
  0 siblings, 0 replies; 17+ messages in thread
From: Miklos Szeredi @ 2020-06-16  9:01 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Amir Goldstein, Al Viro, Linux MM, linux-fsdevel, overlayfs,
	linux-kernel, Matthew Wilcox, Colin Walters, Andrew Morton,
	syzbot, syzkaller-bugs

On Tue, Jun 16, 2020 at 1:45 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 6/15/20 12:53 AM, Miklos Szeredi wrote:
> > On Sat, Jun 13, 2020 at 9:12 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >> On 6/12/20 11:53 PM, Amir Goldstein wrote:
> >>>
> >>> The simplest thing for you to do in order to shush syzbot is what procfs does:
> >>>         /*
> >>>          * procfs isn't actually a stacking filesystem; however, there is
> >>>          * too much magic going on inside it to permit stacking things on
> >>>          * top of it
> >>>          */
> >>>         s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
> >>>
> >>> Currently, the only in-tree stacking fs are overlayfs and ecryptfs, but there
> >>> are some out of tree implementations as well (shiftfs).
> >>> So you may only take that option if you do not care about the combination
> >>> of hugetlbfs with any of the above.
> >>>
> >>> overlayfs support of mmap is not as good as one might hope.
> >>> overlayfs.rst says:
> >>> "If a file residing on a lower layer is opened for read-only and then
> >>>  memory mapped with MAP_SHARED, then subsequent changes to
> >>>  the file are not reflected in the memory mapping."
> >>>
> >>> So if I were you, I wouldn't go trying to fix overlayfs-huguetlb interop...
> >>
> >> Thanks again,
> >>
> >> I'll look at something as simple as s_stack_depth.
> >
> > Agree.
>
> Apologies again for in the incorrect information about writing to lower
> filesystem.
>
> Stacking ecryptfs on hugetlbfs does not work either.  Here is what happens
> when trying to create a new file.
>
> [ 1188.863425] ecryptfs_write_metadata_to_contents: Error attempting to write header information to lower file; rc = [-22]
> [ 1188.865469] ecryptfs_write_metadata: Error writing metadata out to lower file; rc = [-22]
> [ 1188.867022] Error writing headers; rc = [-22]
>
> I like Amir's idea of just setting s_stack_depth in hugetlbfs to prevent
> stacking.
>
> From 0fbed66b37c18919ea7edd47b113c97644f49362 Mon Sep 17 00:00:00 2001
> From: Mike Kravetz <mike.kravetz@oracle.com>
> Date: Mon, 15 Jun 2020 14:37:52 -0700
> Subject: [PATCH] hugetlbfs: prevent filesystem stacking of hugetlbfs
>
> syzbot found issues with having hugetlbfs on a union/overlay as reported
> in [1].  Due to the limitations (no write) and special functionality of
> hugetlbfs, it does not work well in filesystem stacking.  There are no
> know use cases for hugetlbfs stacking.  Rather than making modifications
> to get hugetlbfs working in such environments, simply prevent stacking.
>
> [1] https://lore.kernel.org/linux-mm/000000000000b4684e05a2968ca6@google.com/
>
> Reported-by: syzbot+d6ec23007e951dadf3de@syzkaller.appspotmail.com
> Suggested-by: Amir Goldstein <amir73il@gmail.com>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>

Acked-by: Miklos Szeredi <mszeredi@redhat.com>

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, back to index

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-12  0:46 [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files Mike Kravetz
2020-06-12  0:46 ` [PATCH v4 2/2] ovl: call underlying get_unmapped_area() routine. propogate FMODE_HUGETLBFS Mike Kravetz
2020-06-14 12:50   ` Amir Goldstein
2020-06-12  1:53 ` [PATCH v4 1/2] hugetlb: use f_mode & FMODE_HUGETLBFS to identify hugetlbfs files Matthew Wilcox
2020-06-12  1:58 ` Al Viro
2020-06-12 21:51   ` Mike Kravetz
2020-06-13  6:53     ` Amir Goldstein
2020-06-13 14:38       ` Matthew Wilcox
2020-06-13 19:12       ` Mike Kravetz
2020-06-15  7:53         ` Miklos Szeredi
2020-06-15 10:05           ` Amir Goldstein
2020-06-15 13:01             ` Miklos Szeredi
2020-06-15 23:45           ` Mike Kravetz
2020-06-16  9:01             ` Miklos Szeredi
2020-06-15  8:24       ` Miklos Szeredi
2020-06-15 17:48         ` Mike Kravetz
2020-06-12  6:28 ` [RFC PATCH] hugetlb: hugetlbfs_file_operations can be static kernel test robot

($INBOX_DIR/description missing)

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-unionfs/0 linux-unionfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-unionfs linux-unionfs/ https://lore.kernel.org/linux-unionfs \
		linux-unionfs@vger.kernel.org
	public-inbox-index linux-unionfs

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-unionfs


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git