All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mount: fix mounting of detached mounts onto targets that reside on shared mounts
@ 2021-03-04 17:41 Christian Brauner
  2021-03-04 18:10 ` Christian Brauner
  2021-03-05 16:22 ` Christoph Hellwig
  0 siblings, 2 replies; 4+ messages in thread
From: Christian Brauner @ 2021-03-04 17:41 UTC (permalink / raw)
  To: Christoph Hellwig, David Howells, Al Viro; +Cc: linux-fsdevel, stable

Creating a series of detached mounts, attaching them to the filesystem,
and unmounting them can be used to trigger an integer overflow in
ns->mounts causing the kernel to block any new mounts in count_mounts()
and returning ENOSPC because it falsely assumes that the maximum number
of mounts in the mount namespace has been reached, i.e. it thinks it
can't fit the new mounts into the mount namespace anymore.

Depending on the number of mounts in your system, this can be reproduced
on any kernel that supportes open_tree() and move_mount() with the
following instructions:

1. Compile the following program "repro.c" via "make repro"
  > cat repro.c

  #define _GNU_SOURCE
  #include <errno.h>
  #include <fcntl.h>
  #include <getopt.h>
  #include <limits.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <linux/mount.h>
  #include <sys/syscall.h>
  #include <sys/types.h>
  #include <unistd.h>

  /* open_tree() */
  #ifndef OPEN_TREE_CLONE
  #define OPEN_TREE_CLONE 1
  #endif

  #ifndef OPEN_TREE_CLOEXEC
  #define OPEN_TREE_CLOEXEC O_CLOEXEC
  #endif

  #ifndef __NR_open_tree
  	#if defined __alpha__
  		#define __NR_open_tree 538
  	#elif defined _MIPS_SIM
  		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
  			#define __NR_open_tree 4428
  		#endif
  		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
  			#define __NR_open_tree 6428
  		#endif
  		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
  			#define __NR_open_tree 5428
  		#endif
  	#elif defined __ia64__
  		#define __NR_open_tree (428 + 1024)
  	#else
  		#define __NR_open_tree 428
  	#endif
  #endif

  /* move_mount() */
  #ifndef MOVE_MOUNT_F_EMPTY_PATH
  #define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
  #endif

  #ifndef __NR_move_mount
  	#if defined __alpha__
  		#define __NR_move_mount 539
  	#elif defined _MIPS_SIM
  		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
  			#define __NR_move_mount 4429
  		#endif
  		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
  			#define __NR_move_mount 6429
  		#endif
  		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
  			#define __NR_move_mount 5429
  		#endif
  	#elif defined __ia64__
  		#define __NR_move_mount (428 + 1024)
  	#else
  		#define __NR_move_mount 429
  	#endif
  #endif

  static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
  {
  	return syscall(__NR_open_tree, dfd, filename, flags);
  }

  static inline int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd,
  				 const char *to_pathname, unsigned int flags)
  {
  	return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);
  }

  static void usage(void)
  {
  	const char *text = "mount-new [--recursive] <source> <target>\n";
  	fprintf(stderr, "%s", text);
  	_exit(EXIT_SUCCESS);
  }

  #define exit_usage(format, ...)                         \
  	({                                              \
  		fprintf(stderr, format, ##__VA_ARGS__); \
  		usage();                                \
  	})

  #define exit_log(format, ...)                           \
  	({                                              \
  		fprintf(stderr, format, ##__VA_ARGS__); \
  		exit(EXIT_FAILURE);                     \
  	})

  static const struct option longopts[] = {
  	{"recursive",	no_argument,		0,	'a'},
  	{"help",	no_argument,		0,	'b'},
  	{ NULL,		no_argument,		0,	 0 },
  };

  int main(int argc, char *argv[])
  {
  	int index = 0;
  	bool recursive = false;
  	int fd_tree, new_argc, ret;
  	char *source, *target;
  	char *const *new_argv;

  	while ((ret = getopt_long_only(argc, argv, "", longopts, &index)) != -1) {
  		switch (ret) {
  		case 'a':
  			recursive = true;
  			break;
  		case 'b':
  			/* fallthrough */
  		default:
  			usage();
  		}
  	}

  	new_argv = &argv[optind];
  	new_argc = argc - optind;
  	if (new_argc < 2)
  		exit_usage("Missing source or target mountpoint\n\n");
  	source = new_argv[0];
  	target = new_argv[1];

  	fd_tree = sys_open_tree(-EBADF, source,
  				OPEN_TREE_CLONE |
  				OPEN_TREE_CLOEXEC |
  				AT_EMPTY_PATH |
  				(recursive ? AT_RECURSIVE : 0));
  	if (fd_tree < 0) {
  		exit_log("%m - Failed to open %s\n", source);
  		exit(EXIT_FAILURE);
  	}

  	ret = sys_move_mount(fd_tree, "", -EBADF, target, MOVE_MOUNT_F_EMPTY_PATH);
  	if (ret < 0)
  		exit_log("%m - Failed to attach mount to %s\n", target);
  	close(fd_tree);

  	exit(EXIT_SUCCESS);
  }

2. Run a loop like:

while true; do sudo ./repro; done

and wait for the kernel to refuse any new mounts by returning ENOSPC.
Depending on the number of mounts you have on the system this might take
shorter or longer.

The root cause of this is that detached mounts aren't handled correctly
when the destination mount point is MS_SHARED causing  a borked mount
tree and ultimately to a miscalculation of the number of mounts in the
mount namespace.

Detached mounts created via
open_tree(fd, path, OPEN_TREE_CLONE)
are essentially like an unattached new mount, or an unattached
bind-mount. They can then later on be attached to the filesystem via
move_mount() which calles into attach_recursive_mount(). Part of
attaching it to the filesystem is making sure that mounts get correctly
propagated in case the destination mountpoint is MS_SHARED, i.e.  is a
shared mountpoint. This is done by calling into propagate_mnt() which
walks the list of peers calling propagate_one() on each mount in this
list making sure it receives the propagation event.
For new mounts and bind-mounts propagate_one() knows to skip them by
realizing that there's no mount namespace attached to them.
However, detached mounts have an anonymous mount namespace attached to
them and so they aren't skipped causing the mount table to get wonky and
ultimately preventing any new mounts via the ENOSPC issue.

So teach propagation to handle detached mounts by making it aware of
them. I've been tracking this issue down for the last couple of days by
unmounting everything in my current mount table leaving only /proc and
/sys mounted and running the reproducer above verifying the counting of
the mounts. With this fix the counts are correct and the ENOSPC issue
can't be reproduced.

Fixes: 2db154b3ea8e ("vfs: syscall: Add move_mount(2) to move mounts around")
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: <stable@vger.kernel.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/pnode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/pnode.h b/fs/pnode.h
index 26f74e092bd9..988f1aa9b02a 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -12,7 +12,7 @@
 
 #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
 #define IS_MNT_SLAVE(m) ((m)->mnt_master)
-#define IS_MNT_NEW(m)  (!(m)->mnt_ns)
+#define IS_MNT_NEW(m)  (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
 #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
 #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
 #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)

base-commit: f69d02e37a85645aa90d18cacfff36dba370f797
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] mount: fix mounting of detached mounts onto targets that reside on shared mounts
  2021-03-04 17:41 [PATCH] mount: fix mounting of detached mounts onto targets that reside on shared mounts Christian Brauner
@ 2021-03-04 18:10 ` Christian Brauner
  2021-03-05 16:22 ` Christoph Hellwig
  1 sibling, 0 replies; 4+ messages in thread
From: Christian Brauner @ 2021-03-04 18:10 UTC (permalink / raw)
  To: Christoph Hellwig, David Howells, Al Viro; +Cc: linux-fsdevel, stable

On Thu, Mar 04, 2021 at 06:41:55PM +0100, Christian Brauner wrote:
> Creating a series of detached mounts, attaching them to the filesystem,
> and unmounting them can be used to trigger an integer overflow in
> ns->mounts causing the kernel to block any new mounts in count_mounts()
> and returning ENOSPC because it falsely assumes that the maximum number
> of mounts in the mount namespace has been reached, i.e. it thinks it
> can't fit the new mounts into the mount namespace anymore.
> 
> Depending on the number of mounts in your system, this can be reproduced
> on any kernel that supportes open_tree() and move_mount() with the
> following instructions:
> 
> 1. Compile the following program "repro.c" via "make repro"
>   > cat repro.c
> 
>   #define _GNU_SOURCE
>   #include <errno.h>
>   #include <fcntl.h>
>   #include <getopt.h>
>   #include <limits.h>
>   #include <stdbool.h>
>   #include <stdio.h>
>   #include <stdlib.h>
>   #include <linux/mount.h>
>   #include <sys/syscall.h>
>   #include <sys/types.h>
>   #include <unistd.h>
> 
>   /* open_tree() */
>   #ifndef OPEN_TREE_CLONE
>   #define OPEN_TREE_CLONE 1
>   #endif
> 
>   #ifndef OPEN_TREE_CLOEXEC
>   #define OPEN_TREE_CLOEXEC O_CLOEXEC
>   #endif
> 
>   #ifndef __NR_open_tree
>   	#if defined __alpha__
>   		#define __NR_open_tree 538
>   	#elif defined _MIPS_SIM
>   		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
>   			#define __NR_open_tree 4428
>   		#endif
>   		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
>   			#define __NR_open_tree 6428
>   		#endif
>   		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
>   			#define __NR_open_tree 5428
>   		#endif
>   	#elif defined __ia64__
>   		#define __NR_open_tree (428 + 1024)
>   	#else
>   		#define __NR_open_tree 428
>   	#endif
>   #endif
> 
>   /* move_mount() */
>   #ifndef MOVE_MOUNT_F_EMPTY_PATH
>   #define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
>   #endif
> 
>   #ifndef __NR_move_mount
>   	#if defined __alpha__
>   		#define __NR_move_mount 539
>   	#elif defined _MIPS_SIM
>   		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
>   			#define __NR_move_mount 4429
>   		#endif
>   		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
>   			#define __NR_move_mount 6429
>   		#endif
>   		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
>   			#define __NR_move_mount 5429
>   		#endif
>   	#elif defined __ia64__
>   		#define __NR_move_mount (428 + 1024)
>   	#else
>   		#define __NR_move_mount 429
>   	#endif
>   #endif
> 
>   static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
>   {
>   	return syscall(__NR_open_tree, dfd, filename, flags);
>   }
> 
>   static inline int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd,
>   				 const char *to_pathname, unsigned int flags)
>   {
>   	return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);
>   }
> 
>   static void usage(void)
>   {
>   	const char *text = "mount-new [--recursive] <source> <target>\n";
>   	fprintf(stderr, "%s", text);
>   	_exit(EXIT_SUCCESS);
>   }
> 
>   #define exit_usage(format, ...)                         \
>   	({                                              \
>   		fprintf(stderr, format, ##__VA_ARGS__); \
>   		usage();                                \
>   	})
> 
>   #define exit_log(format, ...)                           \
>   	({                                              \
>   		fprintf(stderr, format, ##__VA_ARGS__); \
>   		exit(EXIT_FAILURE);                     \
>   	})
> 
>   static const struct option longopts[] = {
>   	{"recursive",	no_argument,		0,	'a'},
>   	{"help",	no_argument,		0,	'b'},
>   	{ NULL,		no_argument,		0,	 0 },
>   };
> 
>   int main(int argc, char *argv[])
>   {
>   	int index = 0;
>   	bool recursive = false;
>   	int fd_tree, new_argc, ret;
>   	char *source, *target;
>   	char *const *new_argv;
> 
>   	while ((ret = getopt_long_only(argc, argv, "", longopts, &index)) != -1) {
>   		switch (ret) {
>   		case 'a':
>   			recursive = true;
>   			break;
>   		case 'b':
>   			/* fallthrough */
>   		default:
>   			usage();
>   		}
>   	}
> 
>   	new_argv = &argv[optind];
>   	new_argc = argc - optind;
>   	if (new_argc < 2)
>   		exit_usage("Missing source or target mountpoint\n\n");
>   	source = new_argv[0];
>   	target = new_argv[1];
> 
>   	fd_tree = sys_open_tree(-EBADF, source,
>   				OPEN_TREE_CLONE |
>   				OPEN_TREE_CLOEXEC |
>   				AT_EMPTY_PATH |
>   				(recursive ? AT_RECURSIVE : 0));
>   	if (fd_tree < 0) {
>   		exit_log("%m - Failed to open %s\n", source);
>   		exit(EXIT_FAILURE);
>   	}
> 
>   	ret = sys_move_mount(fd_tree, "", -EBADF, target, MOVE_MOUNT_F_EMPTY_PATH);
>   	if (ret < 0)
>   		exit_log("%m - Failed to attach mount to %s\n", target);
>   	close(fd_tree);
> 
>   	exit(EXIT_SUCCESS);
>   }
> 
> 2. Run a loop like:
> 
> while true; do sudo ./repro; done

The full reproducer is:

while true; do sudo ./repro /mnt /mnt; sudo umount -l /mnt; done

> 
> and wait for the kernel to refuse any new mounts by returning ENOSPC.
> Depending on the number of mounts you have on the system this might take
> shorter or longer.
> 
> The root cause of this is that detached mounts aren't handled correctly
> when the destination mount point is MS_SHARED causing  a borked mount
> tree and ultimately to a miscalculation of the number of mounts in the
> mount namespace.
> 
> Detached mounts created via
> open_tree(fd, path, OPEN_TREE_CLONE)
> are essentially like an unattached new mount, or an unattached
> bind-mount. They can then later on be attached to the filesystem via
> move_mount() which calles into attach_recursive_mount(). Part of
> attaching it to the filesystem is making sure that mounts get correctly
> propagated in case the destination mountpoint is MS_SHARED, i.e.  is a
> shared mountpoint. This is done by calling into propagate_mnt() which
> walks the list of peers calling propagate_one() on each mount in this
> list making sure it receives the propagation event.
> For new mounts and bind-mounts propagate_one() knows to skip them by
> realizing that there's no mount namespace attached to them.
> However, detached mounts have an anonymous mount namespace attached to
> them and so they aren't skipped causing the mount table to get wonky and
> ultimately preventing any new mounts via the ENOSPC issue.
> 
> So teach propagation to handle detached mounts by making it aware of
> them. I've been tracking this issue down for the last couple of days by
> unmounting everything in my current mount table leaving only /proc and
> /sys mounted and running the reproducer above verifying the counting of
> the mounts. With this fix the counts are correct and the ENOSPC issue
> can't be reproduced.
> 
> Fixes: 2db154b3ea8e ("vfs: syscall: Add move_mount(2) to move mounts around")
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: David Howells <dhowells@redhat.com>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: linux-fsdevel@vger.kernel.org
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
> ---
>  fs/pnode.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/pnode.h b/fs/pnode.h
> index 26f74e092bd9..988f1aa9b02a 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -12,7 +12,7 @@
>  
>  #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
>  #define IS_MNT_SLAVE(m) ((m)->mnt_master)
> -#define IS_MNT_NEW(m)  (!(m)->mnt_ns)
> +#define IS_MNT_NEW(m)  (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
>  #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
>  #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
>  #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
> 
> base-commit: f69d02e37a85645aa90d18cacfff36dba370f797
> -- 
> 2.27.0
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] mount: fix mounting of detached mounts onto targets that reside on shared mounts
  2021-03-04 17:41 [PATCH] mount: fix mounting of detached mounts onto targets that reside on shared mounts Christian Brauner
  2021-03-04 18:10 ` Christian Brauner
@ 2021-03-05 16:22 ` Christoph Hellwig
  1 sibling, 0 replies; 4+ messages in thread
From: Christoph Hellwig @ 2021-03-05 16:22 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Christoph Hellwig, David Howells, Al Viro, linux-fsdevel, stable

On Thu, Mar 04, 2021 at 06:41:55PM +0100, Christian Brauner wrote:
> Creating a series of detached mounts, attaching them to the filesystem,
> and unmounting them can be used to trigger an integer overflow in
> ns->mounts causing the kernel to block any new mounts in count_mounts()
> and returning ENOSPC because it falsely assumes that the maximum number
> of mounts in the mount namespace has been reached, i.e. it thinks it
> can't fit the new mounts into the mount namespace anymore.
> 
> Depending on the number of mounts in your system, this can be reproduced
> on any kernel that supportes open_tree() and move_mount() with the
> following instructions:
> 
> 1. Compile the following program "repro.c" via "make repro"
>   > cat repro.c

Can you wire this up for xfstests?

The patch itself looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH] mount: fix mounting of detached mounts onto targets that reside on shared mounts
@ 2021-03-06 10:10 Christian Brauner
  0 siblings, 0 replies; 4+ messages in thread
From: Christian Brauner @ 2021-03-06 10:10 UTC (permalink / raw)
  To: Christoph Hellwig, David Howells, Al Viro
  Cc: linux-fsdevel, stable, Christian Brauner

Creating a series of detached mounts, attaching them to the filesystem,
and unmounting them can be used to trigger an integer overflow in
ns->mounts causing the kernel to block any new mounts in count_mounts()
and returning ENOSPC because it falsely assumes that the maximum number
of mounts in the mount namespace has been reached, i.e. it thinks it
can't fit the new mounts into the mount namespace anymore.

Depending on the number of mounts in your system, this can be reproduced
on any kernel that supportes open_tree() and move_mount() by compiling
and running the following program:

  /* SPDX-License-Identifier: LGPL-2.1+ */

  #define _GNU_SOURCE
  #include <errno.h>
  #include <fcntl.h>
  #include <getopt.h>
  #include <limits.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/mount.h>
  #include <sys/stat.h>
  #include <sys/syscall.h>
  #include <sys/types.h>
  #include <unistd.h>

  /* open_tree() */
  #ifndef OPEN_TREE_CLONE
  #define OPEN_TREE_CLONE 1
  #endif

  #ifndef OPEN_TREE_CLOEXEC
  #define OPEN_TREE_CLOEXEC O_CLOEXEC
  #endif

  #ifndef __NR_open_tree
          #if defined __alpha__
                  #define __NR_open_tree 538
          #elif defined _MIPS_SIM
                  #if _MIPS_SIM == _MIPS_SIM_ABI32        /* o32 */
                          #define __NR_open_tree 4428
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_NABI32       /* n32 */
                          #define __NR_open_tree 6428
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_ABI64        /* n64 */
                          #define __NR_open_tree 5428
                  #endif
          #elif defined __ia64__
                  #define __NR_open_tree (428 + 1024)
          #else
                  #define __NR_open_tree 428
          #endif
  #endif

  /* move_mount() */
  #ifndef MOVE_MOUNT_F_EMPTY_PATH
  #define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
  #endif

  #ifndef __NR_move_mount
          #if defined __alpha__
                  #define __NR_move_mount 539
          #elif defined _MIPS_SIM
                  #if _MIPS_SIM == _MIPS_SIM_ABI32        /* o32 */
                          #define __NR_move_mount 4429
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_NABI32       /* n32 */
                          #define __NR_move_mount 6429
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_ABI64        /* n64 */
                          #define __NR_move_mount 5429
                  #endif
          #elif defined __ia64__
                  #define __NR_move_mount (428 + 1024)
          #else
                  #define __NR_move_mount 429
          #endif
  #endif

  static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
  {
          return syscall(__NR_open_tree, dfd, filename, flags);
  }

  static inline int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd,
                                   const char *to_pathname, unsigned int flags)
  {
          return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);
  }

  static bool is_shared_mountpoint(const char *path)
  {
          bool shared = false;
          FILE *f = NULL;
          char *line = NULL;
          int i;
          size_t len = 0;

          f = fopen("/proc/self/mountinfo", "re");
          if (!f)
                  return 0;

          while (getline(&line, &len, f) > 0) {
                  char *slider1, *slider2;

                  for (slider1 = line, i = 0; slider1 && i < 4; i++)
                          slider1 = strchr(slider1 + 1, ' ');

                  if (!slider1)
                          continue;

                  slider2 = strchr(slider1 + 1, ' ');
                  if (!slider2)
                          continue;

                  *slider2 = '\0';
                  if (strcmp(slider1 + 1, path) == 0) {
                          /* This is the path. Is it shared? */
                          slider1 = strchr(slider2 + 1, ' ');
                          if (slider1 && strstr(slider1, "shared:")) {
                                  shared = true;
                                  break;
                          }
                  }
          }
          fclose(f);
          free(line);

          return shared;
  }

  static void usage(void)
  {
          const char *text = "mount-new [--recursive] <base-dir>\n";
          fprintf(stderr, "%s", text);
          _exit(EXIT_SUCCESS);
  }

  #define exit_usage(format, ...)                              \
          ({                                                   \
                  fprintf(stderr, format "\n", ##__VA_ARGS__); \
                  usage();                                     \
          })

  #define exit_log(format, ...)                                \
          ({                                                   \
                  fprintf(stderr, format "\n", ##__VA_ARGS__); \
                  exit(EXIT_FAILURE);                          \
          })

  static const struct option longopts[] = {
          {"help",        no_argument,            0,      'a'},
          { NULL,         no_argument,            0,       0 },
  };

  int main(int argc, char *argv[])
  {
          int exit_code = EXIT_SUCCESS, index = 0;
          int dfd, fd_tree, new_argc, ret;
          char *base_dir;
          char *const *new_argv;
          char target[PATH_MAX];

          while ((ret = getopt_long_only(argc, argv, "", longopts, &index)) != -1) {
                  switch (ret) {
                  case 'a':
                          /* fallthrough */
                  default:
                          usage();
                  }
          }

          new_argv = &argv[optind];
          new_argc = argc - optind;
          if (new_argc < 1)
                  exit_usage("Missing base directory\n");
          base_dir = new_argv[0];

          if (*base_dir != '/')
                  exit_log("Please specify an absolute path");

          /* Ensure that target is a shared mountpoint. */
          if (!is_shared_mountpoint(base_dir))
                  exit_log("Please ensure that \"%s\" is a shared mountpoint", base_dir);

          dfd = open(base_dir, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
          if (dfd < 0)
                  exit_log("%m - Failed to open base directory \"%s\"", base_dir);

          ret = mkdirat(dfd, "detached-move-mount", 0755);
          if (ret < 0)
                  exit_log("%m - Failed to create required temporary directories");

          ret = snprintf(target, sizeof(target), "%s/detached-move-mount", base_dir);
          if (ret < 0 || (size_t)ret >= sizeof(target))
                  exit_log("%m - Failed to assemble target path");

          /*
           * Having a mount table with 10000 mounts is already quite excessive
           * and shoult account even for weird test systems.
           */
          for (size_t i = 0; i < 10000; i++) {
                  fd_tree = sys_open_tree(dfd, "detached-move-mount",
                                          OPEN_TREE_CLONE |
                                          OPEN_TREE_CLOEXEC |
                                          AT_EMPTY_PATH);
                  if (fd_tree < 0) {
                          fprintf(stderr, "%m - Failed to open %d(detached-move-mount)", dfd);
                          exit_code = EXIT_FAILURE;
                          break;
                  }

                  ret = sys_move_mount(fd_tree, "", dfd, "detached-move-mount", MOVE_MOUNT_F_EMPTY_PATH);
                  if (ret < 0) {
                          if (errno == ENOSPC)
                                  fprintf(stderr, "%m - Buggy mount counting");
                          else
                                  fprintf(stderr, "%m - Failed to attach mount to %d(detached-move-mount)", dfd);
                          exit_code = EXIT_FAILURE;
                          break;
                  }
                  close(fd_tree);

                  ret = umount2(target, MNT_DETACH);
                  if (ret < 0) {
                          fprintf(stderr, "%m - Failed to unmount %s", target);
                          exit_code = EXIT_FAILURE;
                          break;
                  }
          }

          (void)unlinkat(dfd, "detached-move-mount", AT_REMOVEDIR);
          close(dfd);

          exit(exit_code);
  }

and wait for the kernel to refuse any new mounts by returning ENOSPC.
How many iterations are needed depends on the number of mounts in your
system. Assuming you have something like 50 mounts on a standard system
it should be almost instantaneous.

The root cause of this is that detached mounts aren't handled correctly
when source and target mount are identical and reside on a shared mount
causing a broken mount tree where the detached source itself is
propagated which propagation prevents for regular bind-mounts and new
mounts. This ultimately leads to a miscalculation of the number of
mounts in the mount namespace.

Detached mounts created via
open_tree(fd, path, OPEN_TREE_CLONE)
are essentially like an unattached new mount, or an unattached
bind-mount. They can then later on be attached to the filesystem via
move_mount() which calls into attach_recursive_mount(). Part of
attaching it to the filesystem is making sure that mounts get correctly
propagated in case the destination mountpoint is MS_SHARED, i.e. is a
shared mountpoint. This is done by calling into propagate_mnt() which
walks the list of peers calling propagate_one() on each mount in this
list making sure it receives the propagation event.
The propagate_one() functions thereby skips both new mounts and bind
mounts to not propagate them "into themselves". Both are identified by
checking whether the mount is already attached to any mount namespace in
mnt->mnt_ns. The is what the IS_MNT_NEW() helper is responsible for.

However, detached mounts have an anonymous mount namespace attached to
them stashed in mnt->mnt_ns which means that IS_MNT_NEW() doesn't
realize they need to be skipped causing the mount to propagate "into
itself" breaking the mount table and causing a disconnect between the
number of mounts recorded as being beneath or reachable from the target
mountpoint and the number of mounts actually recorded/counted in
ns->mounts ultimately causing an overflow which in turn prevents any new
mounts via the ENOSPC issue.

So teach propagation to handle detached mounts by making it aware of
them. I've been tracking this issue down for the last couple of days and
then verifying that the fix is correct by
unmounting everything in my current mount table leaving only /proc and
/sys mounted and running the reproducer above overnight verifying the
number of mounts counted in ns->mounts. With this fix the counts are
correct and the ENOSPC issue can't be reproduced.

This change will only have an effect on mounts created with the new
mount API since detached mounts cannot be created with the old mount API
so regressions are extremely unlikely.

Fixes: 2db154b3ea8e ("vfs: syscall: Add move_mount(2) to move mounts around")
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: <stable@vger.kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/pnode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/pnode.h b/fs/pnode.h
index 26f74e092bd9..988f1aa9b02a 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -12,7 +12,7 @@
 
 #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
 #define IS_MNT_SLAVE(m) ((m)->mnt_master)
-#define IS_MNT_NEW(m)  (!(m)->mnt_ns)
+#define IS_MNT_NEW(m)  (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
 #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
 #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
 #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)

base-commit: f69d02e37a85645aa90d18cacfff36dba370f797
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-03-06 10:11 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-04 17:41 [PATCH] mount: fix mounting of detached mounts onto targets that reside on shared mounts Christian Brauner
2021-03-04 18:10 ` Christian Brauner
2021-03-05 16:22 ` Christoph Hellwig
2021-03-06 10:10 Christian Brauner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.