All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Fix a race in put_mountpoint.
@ 2016-12-31  4:10 Krister Johansen
  2016-12-31  6:17 ` Al Viro
  0 siblings, 1 reply; 63+ messages in thread
From: Krister Johansen @ 2016-12-31  4:10 UTC (permalink / raw)
  To: Alexander Viro; +Cc: linux-fsdevel, Eric W. Biederman

This can cause a panic when simultaneous callers of put_mountpoint
attempt to free the same mountpoint.  This occurs because some callers
hold the mount_hash_lock, while others hold the namespace lock.  Some
even hold both.

In this submitter's case, the panic manifested itself as a GP fault in
put_mountpoint() when it called hlist_del() and attempted to dereference
a m_hash.pprev that had been poisoned by another thread.

Instead of trying to force all mountpoint hash users back under the
namespace lock, add locks that cover just the mountpoint hash.  This
uses hlist_bl to protect against simultaneous additions and removals,
and RCU for lookups.

Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>
---
 fs/mount.h     |  6 ++++--
 fs/namespace.c | 62 ++++++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 2c856fc..1a2f41a 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -3,6 +3,7 @@
 #include <linux/poll.h>
 #include <linux/ns_common.h>
 #include <linux/fs_pin.h>
+#include <linux/rculist_bl.h>
 
 struct mnt_namespace {
 	atomic_t		count;
@@ -24,10 +25,11 @@ struct mnt_pcp {
 };
 
 struct mountpoint {
-	struct hlist_node m_hash;
+	struct hlist_bl_node m_hash;
 	struct dentry *m_dentry;
 	struct hlist_head m_list;
-	int m_count;
+	atomic_t m_count;
+	struct rcu_head m_rcu;
 };
 
 struct mount {
diff --git a/fs/namespace.c b/fs/namespace.c
index b5b1259..7c29420 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -63,7 +63,7 @@ static int mnt_id_start = 0;
 static int mnt_group_start = 1;
 
 static struct hlist_head *mount_hashtable __read_mostly;
-static struct hlist_head *mountpoint_hashtable __read_mostly;
+static struct hlist_bl_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
 static DECLARE_RWSEM(namespace_sem);
 
@@ -89,7 +89,7 @@ static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *den
 	return &mount_hashtable[tmp & m_hash_mask];
 }
 
-static inline struct hlist_head *mp_hash(struct dentry *dentry)
+static inline struct hlist_bl_head *mp_hash(struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
 	tmp = tmp + (tmp >> mp_hash_shift);
@@ -727,27 +727,36 @@ bool __is_local_mountpoint(struct dentry *dentry)
 
 static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
 {
-	struct hlist_head *chain = mp_hash(dentry);
+	struct hlist_bl_head *chain = mp_hash(dentry);
+	struct hlist_bl_node *node;
 	struct mountpoint *mp;
 
-	hlist_for_each_entry(mp, chain, m_hash) {
+	rcu_read_lock();
+	hlist_bl_for_each_entry_rcu(mp, node, chain, m_hash) {
 		if (mp->m_dentry == dentry) {
 			/* might be worth a WARN_ON() */
-			if (d_unlinked(dentry))
-				return ERR_PTR(-ENOENT);
-			mp->m_count++;
-			return mp;
+			if (d_unlinked(dentry)) {
+				mp = ERR_PTR(-ENOENT);
+				goto out;
+			}
+			if (atomic_inc_not_zero(&mp->m_count))
+				goto out;
 		}
 	}
-	return NULL;
+	mp = NULL;
+out:
+	rcu_read_unlock();
+	return mp;
 }
 
 static struct mountpoint *new_mountpoint(struct dentry *dentry)
 {
-	struct hlist_head *chain = mp_hash(dentry);
+	struct hlist_bl_head *chain = mp_hash(dentry);
 	struct mountpoint *mp;
 	int ret;
 
+	WARN_ON(!rwsem_is_locked(&namespace_sem));
+
 	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
 	if (!mp)
 		return ERR_PTR(-ENOMEM);
@@ -759,22 +768,37 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
 	}
 
 	mp->m_dentry = dentry;
-	mp->m_count = 1;
-	hlist_add_head(&mp->m_hash, chain);
+	init_rcu_head(&mp->m_rcu);
+	atomic_set(&mp->m_count, 1);
+	hlist_bl_lock(chain);
+	hlist_bl_add_head_rcu(&mp->m_hash, chain);
+	hlist_bl_unlock(chain);
 	INIT_HLIST_HEAD(&mp->m_list);
 	return mp;
 }
 
+static void free_mountpoint(struct rcu_head *head)
+{
+	struct mountpoint *mp;
+
+	mp = container_of(head, struct mountpoint, m_rcu);
+	kfree(mp);
+}
+
 static void put_mountpoint(struct mountpoint *mp)
 {
-	if (!--mp->m_count) {
+	if (atomic_dec_and_test(&mp->m_count)) {
 		struct dentry *dentry = mp->m_dentry;
+		struct hlist_bl_head *chain = mp_hash(dentry);
+
 		BUG_ON(!hlist_empty(&mp->m_list));
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags &= ~DCACHE_MOUNTED;
 		spin_unlock(&dentry->d_lock);
-		hlist_del(&mp->m_hash);
-		kfree(mp);
+		hlist_bl_lock(chain);
+		hlist_bl_del_rcu(&mp->m_hash);
+		hlist_bl_unlock(chain);
+		call_rcu(&mp->m_rcu, free_mountpoint);
 	}
 }
 
@@ -846,7 +870,7 @@ void mnt_set_mountpoint(struct mount *mnt,
 			struct mountpoint *mp,
 			struct mount *child_mnt)
 {
-	mp->m_count++;
+	atomic_inc(&mp->m_count);
 	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
 	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
 	child_mnt->mnt_parent = mnt;
@@ -3120,7 +3144,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	/* make certain new is below the root */
 	if (!is_path_reachable(new_mnt, new.dentry, &root))
 		goto out4;
-	root_mp->m_count++; /* pin it so it won't go away */
+	atomic_inc(&root_mp->m_count); /* pin it so it won't go away */
 	lock_mount_hash();
 	detach_mnt(new_mnt, &parent_path);
 	detach_mnt(root_mnt, &root_parent);
@@ -3199,7 +3223,7 @@ void __init mnt_init(void)
 				0,
 				&m_hash_shift, &m_hash_mask, 0, 0);
 	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
-				sizeof(struct hlist_head),
+				sizeof(struct hlist_bl_head),
 				mphash_entries, 19,
 				0,
 				&mp_hash_shift, &mp_hash_mask, 0, 0);
@@ -3210,7 +3234,7 @@ void __init mnt_init(void)
 	for (u = 0; u <= m_hash_mask; u++)
 		INIT_HLIST_HEAD(&mount_hashtable[u]);
 	for (u = 0; u <= mp_hash_mask; u++)
-		INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
+		INIT_HLIST_BL_HEAD(&mountpoint_hashtable[u]);
 
 	kernfs_init();
 
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] Fix a race in put_mountpoint.
  2016-12-31  4:10 [PATCH] Fix a race in put_mountpoint Krister Johansen
@ 2016-12-31  6:17 ` Al Viro
  2017-01-03  0:51   ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Al Viro @ 2016-12-31  6:17 UTC (permalink / raw)
  To: Krister Johansen; +Cc: linux-fsdevel, Eric W. Biederman

On Fri, Dec 30, 2016 at 08:10:01PM -0800, Krister Johansen wrote:
> This can cause a panic when simultaneous callers of put_mountpoint
> attempt to free the same mountpoint.  This occurs because some callers
> hold the mount_hash_lock, while others hold the namespace lock.  Some
> even hold both.
> 
> In this submitter's case, the panic manifested itself as a GP fault in
> put_mountpoint() when it called hlist_del() and attempted to dereference
> a m_hash.pprev that had been poisioned by another thread.
> 
> Instead of trying to force all mountpoint hash users back under the
> namespace lock, add locks that cover just the mountpoint hash.  This
> uses hlist_bl to protect against simlultaneous additions and removals,
> and RCU for lookups.

Too complicated, IMO.  Look at the call graph for that sucker:
put_mountpoint
        <- unhash_mnt
                <- detach_mnt
                        <- attach_recursive_mnt [under mount_lock]
                        <- pivot_root [under mount_lock]
                <- umount_mnt
                        <- mntput_no_expire [under mount_lock]
                        <- umount_tree
                                <- do_umount [under mount_lock]
                                <- __detach_mounts [under mount_lock]
                                <- copy_tree [under mount_lock]
                                <- drop_collected_mounts [under mount_lock]
                                <- attach_recursive_mnt [under mount_lock]
                                <- do_loopback [under mount_lock]
                                <- mark_mounts_for_expiry [under mount_lock]
                                <- shrink_submounts
                                        <- do_umount [under mount_lock]
                        <- __detach_mounts [under mount_lock]
        <- __detach_mounts [right after dropping mount_lock]
        <- unlock_mount
        <- pivot_root
Now, __detach_mounts() thing is trivially fixed - we just move that call
one line up.  unhash_mnt() is all covered.  Which leaves us with unlock_mount()
and pivot_root() and both are _not_ hot paths - not by any stretch of
imagination.  We also have lookup_mountpoint(), which is not hot either.
Note that hash insertions and lookups are serialized on namespace lock;
it's only removal from final mntput() that can be triggered outside.  So
playing with bitlocks is absolutely pointless.

Let's do this: make sure that all callers of lookup_mountpoint() have
lock_mount (at least read_seqlock_excl), pull the call of put_mountpoint()
in __detach_mounts() under mount_lock and slap read_seqlock_excl() around
the calls in unlock_mount() and pivot_root().  Note that read_seqlock_excl()
*is* exclusive - the "read" part in it is about "don't bump the seqcount
part of mount_lock".  I.e. the patch below ought to fix that and it's much
simpler than your variant.  Do you see any holes in the above?

diff --git a/fs/namespace.c b/fs/namespace.c
index b5b1259e064f..ca98a8ff2732 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1595,11 +1595,11 @@ void __detach_mounts(struct dentry *dentry)
 	struct mount *mnt;
 
 	namespace_lock();
+	lock_mount_hash();
 	mp = lookup_mountpoint(dentry);
 	if (IS_ERR_OR_NULL(mp))
 		goto out_unlock;
 
-	lock_mount_hash();
 	event++;
 	while (!hlist_empty(&mp->m_list)) {
 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
@@ -1609,9 +1609,9 @@ void __detach_mounts(struct dentry *dentry)
 		}
 		else umount_tree(mnt, UMOUNT_CONNECTED);
 	}
-	unlock_mount_hash();
 	put_mountpoint(mp);
 out_unlock:
+	unlock_mount_hash();
 	namespace_unlock();
 }
 
@@ -2038,7 +2038,10 @@ static struct mountpoint *lock_mount(struct path *path)
 	namespace_lock();
 	mnt = lookup_mnt(path);
 	if (likely(!mnt)) {
-		struct mountpoint *mp = lookup_mountpoint(dentry);
+		struct mountpoint *mp;
+		read_seqlock_excl(&mount_lock);
+		mp = lookup_mountpoint(dentry);
+		read_sequnlock_excl(&mount_lock);
 		if (!mp)
 			mp = new_mountpoint(dentry);
 		if (IS_ERR(mp)) {
@@ -2059,7 +2062,9 @@ static struct mountpoint *lock_mount(struct path *path)
 static void unlock_mount(struct mountpoint *where)
 {
 	struct dentry *dentry = where->m_dentry;
+	read_seqlock_excl(&mount_lock);
 	put_mountpoint(where);
+	read_sequnlock_excl(&mount_lock);
 	namespace_unlock();
 	inode_unlock(dentry->d_inode);
 }
@@ -3137,7 +3142,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	list_del_init(&new_mnt->mnt_expire);
 	unlock_mount_hash();
 	chroot_fs_refs(&root, &new);
+	read_seqlock_excl(&mount_lock);
 	put_mountpoint(root_mp);
+	read_sequnlock_excl(&mount_lock);
 	error = 0;
 out4:
 	unlock_mount(old_mp);

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] Fix a race in put_mountpoint.
  2016-12-31  6:17 ` Al Viro
@ 2017-01-03  0:51   ` Eric W. Biederman
  2017-01-03  1:48     ` Al Viro
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-03  0:51 UTC (permalink / raw)
  To: Al Viro; +Cc: Krister Johansen, linux-fsdevel

Al Viro <viro@ZenIV.linux.org.uk> writes:

> On Fri, Dec 30, 2016 at 08:10:01PM -0800, Krister Johansen wrote:
>> This can cause a panic when simultaneous callers of put_mountpoint
>> attempt to free the same mountpoint.  This occurs because some callers
>> hold the mount_hash_lock, while others hold the namespace lock.  Some
>> even hold both.
>> 
>> In this submitter's case, the panic manifested itself as a GP fault in
>> put_mountpoint() when it called hlist_del() and attempted to dereference
>> a m_hash.pprev that had been poisioned by another thread.
>> 
>> Instead of trying to force all mountpoint hash users back under the
>> namespace lock, add locks that cover just the mountpoint hash.  This
>> uses hlist_bl to protect against simlultaneous additions and removals,
>> and RCU for lookups.
>
> Too complicated, IMO.  Look at the call graph for that sucker:
> put_mountpoint
>         <- unhash_mnt
>                 <- detach_mnt
>                         <- attach_recursive_mnt [under mount_lock]
>                         <- pivot_root [under mount_lock]
>                 <- umount_mnt
>                         <- mntput_no_expire [under mount_lock]
>                         <- umount_tree
>                                 <- do_umount [under mount_lock]
>                                 <- __detach_mounts [under mount_lock]
>                                 <- copy_tree [under mount_lock]
>                                 <- drop_collected_mounts [under mount_lock]
>                                 <- attach_recursive_mnt [under mount_lock]
>                                 <- do_loopback [under mount_lock]
>                                 <- mark_mounts_for_expiry [under mount_lock]
>                                 <- shrink_submounts
>                                         <- do_umount [under mount_lock]
>                         <- __detach_mounts [under mount_lock]
>         <- __detach_mounts [right after dropping mount_lock]
>         <- unlock_mount
>         <- pivot_root
> Now, __detach_mounts() thing is trivially fixed - we just move that call
> one line up.  unhash_mnt() is all covered.  Which leaves us with unlock_mount()
> and pivot_root() and both are _not_ hot paths - not by any stretch of
> imagination.  We also have lookup_mountpoint(), which is not hot either.
> Note that hash insertions and lookups are serialized on namespace lock;
> it's only removal from final mntput() that can be triggered outside.  So
> playing with bitlocks is absolutely pointless.
>
> Let's do this: make sure that all callers of lookup_mountpoint() have
> lock_mount (at least read_seqlock_excl), pull the call of put_mountpoint()
> in __detach_mounts() under mount_lock and slap read_seqlock_excl() around
> the calls in unlock_mount() and pivot_root().  Note that read_seqlock_excl()
> *is* exclusive - the "read" part in it is about "don't bump the seqcount
> part of mount_lock".  I.e. the patch below ought to fix that and it's much
> simpler than your variant.  Do you see any holes in the above?

The only significant thing I see is that you have not taken the
mount_lock on the path where new_mountpoint adds the new struct
mountpoint into the mountpoint hash table.

Just for managing this fix we need a: "Cc: stable@vger.kernel.org"
and a "Fixes: ce07d891a089 ("mnt: Honor MNT_LOCKED when detaching mounts")"
As the bug is 2 years old at this point.

I will start with your patch and see about adding handling the missing
locking in new_mountpoint and see where it gets me.

Eric

> diff --git a/fs/namespace.c b/fs/namespace.c
> index b5b1259e064f..ca98a8ff2732 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1595,11 +1595,11 @@ void __detach_mounts(struct dentry *dentry)
>  	struct mount *mnt;
>  
>  	namespace_lock();
> +	lock_mount_hash();
>  	mp = lookup_mountpoint(dentry);
>  	if (IS_ERR_OR_NULL(mp))
>  		goto out_unlock;
>  
> -	lock_mount_hash();
>  	event++;
>  	while (!hlist_empty(&mp->m_list)) {
>  		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
> @@ -1609,9 +1609,9 @@ void __detach_mounts(struct dentry *dentry)
>  		}
>  		else umount_tree(mnt, UMOUNT_CONNECTED);
>  	}
> -	unlock_mount_hash();
>  	put_mountpoint(mp);
>  out_unlock:
> +	unlock_mount_hash();
>  	namespace_unlock();
>  }
>  
> @@ -2038,7 +2038,10 @@ static struct mountpoint *lock_mount(struct path *path)
>  	namespace_lock();
>  	mnt = lookup_mnt(path);
>  	if (likely(!mnt)) {
> -		struct mountpoint *mp = lookup_mountpoint(dentry);
> +		struct mountpoint *mp;
> +		read_seqlock_excl(&mount_lock);
> +		mp = lookup_mountpoint(dentry);
> +		read_sequnlock_excl(&mount_lock);
>  		if (!mp)
>  			mp = new_mountpoint(dentry);
>  		if (IS_ERR(mp)) {
> @@ -2059,7 +2062,9 @@ static struct mountpoint *lock_mount(struct path *path)
>  static void unlock_mount(struct mountpoint *where)
>  {
>  	struct dentry *dentry = where->m_dentry;
> +	read_seqlock_excl(&mount_lock);
>  	put_mountpoint(where);
> +	read_sequnlock_excl(&mount_lock);
>  	namespace_unlock();
>  	inode_unlock(dentry->d_inode);
>  }
> @@ -3137,7 +3142,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
>  	list_del_init(&new_mnt->mnt_expire);
>  	unlock_mount_hash();
>  	chroot_fs_refs(&root, &new);
> +	read_seqlock_excl(&mount_lock);
>  	put_mountpoint(root_mp);
> +	read_sequnlock_excl(&mount_lock);
>  	error = 0;
>  out4:
>  	unlock_mount(old_mp);

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] Fix a race in put_mountpoint.
  2017-01-03  0:51   ` Eric W. Biederman
@ 2017-01-03  1:48     ` Al Viro
  2017-01-03  3:17       ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Al Viro @ 2017-01-03  1:48 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Krister Johansen, linux-fsdevel

On Tue, Jan 03, 2017 at 01:51:36PM +1300, Eric W. Biederman wrote:

> The only significant thing I see is that you have not taken the
> mount_lock on the path where new_mountpoint adds the new struct
> mountpoint into the mountpoint hash table.

Umm...  Point, but I really don't like that bouncing mount_lock up
and down there.  It's not going to cause any serious overhead,
but it just looks ugly... ;-/

Let me think for a while...

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] Fix a race in put_mountpoint.
  2017-01-03  1:48     ` Al Viro
@ 2017-01-03  3:17       ` Eric W. Biederman
  2017-01-03  4:00         ` Al Viro
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-03  3:17 UTC (permalink / raw)
  To: Al Viro; +Cc: Krister Johansen, linux-fsdevel

Al Viro <viro@ZenIV.linux.org.uk> writes:

> On Tue, Jan 03, 2017 at 01:51:36PM +1300, Eric W. Biederman wrote:
>
>> The only significant thing I see is that you have not taken the
>> mount_lock on the path where new_mountpoint adds the new struct
>> mountpoint into the mountpoint hash table.
>
> Umm...  Point, but I really don't like that bouncing mount_lock up
> and down there.  It's not going to cause any serious overhead,
> but it just looks ugly... ;-/
>
> Let me think for a while...

The other possibility is to grab namespace_sem in mntput_no_expire
around the call of umount_mnt.  That is the only path where
put_mountpoint can be called where we are not holding namespace_sem.
That works in the small but I haven't traced the callers of mntput and
mntput_no_expire yet to see if it works in practice.

Eric



^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] Fix a race in put_mountpoint.
  2017-01-03  3:17       ` Eric W. Biederman
@ 2017-01-03  4:00         ` Al Viro
  2017-01-04  3:52           ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Al Viro @ 2017-01-03  4:00 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Krister Johansen, linux-fsdevel

On Tue, Jan 03, 2017 at 04:17:14PM +1300, Eric W. Biederman wrote:
> Al Viro <viro@ZenIV.linux.org.uk> writes:
> 
> > On Tue, Jan 03, 2017 at 01:51:36PM +1300, Eric W. Biederman wrote:
> >
> >> The only significant thing I see is that you have not taken the
> >> mount_lock on the path where new_mountpoint adds the new struct
> >> mountpoint into the mountpoint hash table.
> >
> > Umm...  Point, but I really don't like that bouncing mount_lock up
> > and down there.  It's not going to cause any serious overhead,
> > but it just looks ugly... ;-/
> >
> > Let me think for a while...
> 
> The other possibility is to grab namespace_sem in mntput_no_expire
> around the call of umount_mnt.  That is the only path where
> put_mountpoint can be called where we are not holding namespace_sem.
> That works in the small but I haven't traced the callers of mntput and
> mntput_no_expire yet to see if it works in practice.

No, that's a really bad idea.  Final mntput should _not_ happen under
namespace_lock, but I don't want grabbing it in that place.

How about this instead:

diff --git a/fs/namespace.c b/fs/namespace.c
index b5b1259e064f..20fc797277f8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -742,29 +742,6 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
 	return NULL;
 }
 
-static struct mountpoint *new_mountpoint(struct dentry *dentry)
-{
-	struct hlist_head *chain = mp_hash(dentry);
-	struct mountpoint *mp;
-	int ret;
-
-	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
-	if (!mp)
-		return ERR_PTR(-ENOMEM);
-
-	ret = d_set_mounted(dentry);
-	if (ret) {
-		kfree(mp);
-		return ERR_PTR(ret);
-	}
-
-	mp->m_dentry = dentry;
-	mp->m_count = 1;
-	hlist_add_head(&mp->m_hash, chain);
-	INIT_HLIST_HEAD(&mp->m_list);
-	return mp;
-}
-
 static void put_mountpoint(struct mountpoint *mp)
 {
 	if (!--mp->m_count) {
@@ -1595,11 +1572,11 @@ void __detach_mounts(struct dentry *dentry)
 	struct mount *mnt;
 
 	namespace_lock();
+	lock_mount_hash();
 	mp = lookup_mountpoint(dentry);
 	if (IS_ERR_OR_NULL(mp))
 		goto out_unlock;
 
-	lock_mount_hash();
 	event++;
 	while (!hlist_empty(&mp->m_list)) {
 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
@@ -1609,9 +1586,9 @@ void __detach_mounts(struct dentry *dentry)
 		}
 		else umount_tree(mnt, UMOUNT_CONNECTED);
 	}
-	unlock_mount_hash();
 	put_mountpoint(mp);
 out_unlock:
+	unlock_mount_hash();
 	namespace_unlock();
 }
 
@@ -2027,8 +2004,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 
 static struct mountpoint *lock_mount(struct path *path)
 {
+	struct mountpoint *mp = NULL;
 	struct vfsmount *mnt;
 	struct dentry *dentry = path->dentry;
+	int err;
+
 retry:
 	inode_lock(dentry->d_inode);
 	if (unlikely(cant_mount(dentry))) {
@@ -2037,29 +2017,60 @@ static struct mountpoint *lock_mount(struct path *path)
 	}
 	namespace_lock();
 	mnt = lookup_mnt(path);
-	if (likely(!mnt)) {
-		struct mountpoint *mp = lookup_mountpoint(dentry);
-		if (!mp)
-			mp = new_mountpoint(dentry);
-		if (IS_ERR(mp)) {
+	if (unlikely(mnt)) {
+		namespace_unlock();
+		inode_unlock(path->dentry->d_inode);
+		path_put(path);
+		path->mnt = mnt;
+		dentry = path->dentry = dget(mnt->mnt_root);
+		goto retry;
+	}
+
+	/*
+	 * OK, we have namespace_lock held, nothing is overmounting
+	 * *path and inode of mountpoint to be is locked.
+	 */
+	if (likely(!d_mountpoint(dentry)))
+		mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+	read_seqlock_excl(&mount_lock);
+	if (!mp && !d_mountpoint(dentry)) {
+		read_sequnlock_excl(&mount_lock);
+		mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+		read_seqlock_excl(&mount_lock);
+	}
+	if (d_mountpoint(dentry)) {
+		kfree(mp);
+		mp = lookup_mountpoint(dentry);
+	} else {
+		if (unlikely(!mp)) {
+			read_sequnlock_excl(&mount_lock);
 			namespace_unlock();
 			inode_unlock(dentry->d_inode);
-			return mp;
+			return ERR_PTR(-ENOMEM);
 		}
-		return mp;
+		err = d_set_mounted(dentry);
+		if (unlikely(err)) {
+			kfree(mp);
+			read_sequnlock_excl(&mount_lock);
+			namespace_unlock();
+			inode_unlock(dentry->d_inode);
+			return ERR_PTR(err);
+		}
+		mp->m_dentry = dentry;
+		mp->m_count = 1;
+		hlist_add_head(&mp->m_hash, mp_hash(dentry));
+		INIT_HLIST_HEAD(&mp->m_list);
 	}
-	namespace_unlock();
-	inode_unlock(path->dentry->d_inode);
-	path_put(path);
-	path->mnt = mnt;
-	dentry = path->dentry = dget(mnt->mnt_root);
-	goto retry;
+	read_sequnlock_excl(&mount_lock);
+	return mp;
 }
 
 static void unlock_mount(struct mountpoint *where)
 {
 	struct dentry *dentry = where->m_dentry;
+	read_seqlock_excl(&mount_lock);
 	put_mountpoint(where);
+	read_sequnlock_excl(&mount_lock);
 	namespace_unlock();
 	inode_unlock(dentry->d_inode);
 }
@@ -3137,7 +3148,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	list_del_init(&new_mnt->mnt_expire);
 	unlock_mount_hash();
 	chroot_fs_refs(&root, &new);
+	read_seqlock_excl(&mount_lock);
 	put_mountpoint(root_mp);
+	read_sequnlock_excl(&mount_lock);
 	error = 0;
 out4:
 	unlock_mount(old_mp);

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] Fix a race in put_mountpoint.
  2017-01-03  4:00         ` Al Viro
@ 2017-01-04  3:52           ` Eric W. Biederman
  2017-01-04  3:53             ` [PATCH] mnt: Protect the mountpoint hashtable with mount_lock Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-04  3:52 UTC (permalink / raw)
  To: Al Viro; +Cc: Krister Johansen, linux-fsdevel

Al Viro <viro@ZenIV.linux.org.uk> writes:

> On Tue, Jan 03, 2017 at 04:17:14PM +1300, Eric W. Biederman wrote:
>> Al Viro <viro@ZenIV.linux.org.uk> writes:
>> 
>> > On Tue, Jan 03, 2017 at 01:51:36PM +1300, Eric W. Biederman wrote:
>> >
>> >> The only significant thing I see is that you have not taken the
>> >> mount_lock on the path where new_mountpoint adds the new struct
>> >> mountpoint into the mountpoint hash table.
>> >
>> > Umm...  Point, but I really don't like that bouncing mount_lock up
>> > and down there.  It's not going to cause any serious overhead,
>> > but it just looks ugly... ;-/
>> >
>> > Let me think for a while...
>> 
>> The other possibility is to grab namespace_sem in mntput_no_expire
>> around the call of umount_mnt.  That is the only path where
>> put_mountpoint can be called where we are not holding namespace_sem.
>> That works in the small but I haven't traced the callers of mntput and
>> mntput_no_expire yet to see if it works in practice.
>
> No, that's a really bad idea.  Final mntput should _not_ happen under
> namespace_lock, but I don't want grabbing it in that place.

Agreed.  That just makes the code harder to maintain later on.

> How about this instead:

I really don't like the logic inlined as my patch to kill shadow mounts
needs to call acquire a mountpoint which may not already have been
allocated as well.

Beyond that we can make the logic simpler by causing d_set_mounted to
fail if the flag is already set and syncrhonize on that.  Which means
we don't have to verify the ordering between mount_lock
and rename_lock (from d_set_mounted) is not a problem, which makes
backports easier to verify.

Patch follows.

Eric

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] mnt: Protect the mountpoint hashtable with mount_lock
  2017-01-04  3:52           ` Eric W. Biederman
@ 2017-01-04  3:53             ` Eric W. Biederman
  2017-01-04 21:04               ` [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts Eric W. Biederman
  2017-01-06  7:00               ` [PATCH] mnt: Protect the mountpoint hashtable with mount_lock Krister Johansen
  0 siblings, 2 replies; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-04  3:53 UTC (permalink / raw)
  To: Al Viro; +Cc: Krister Johansen, linux-fsdevel


Protecting the mountpoint hashtable with namespace_sem was sufficient
until a call to umount_mnt was added to mntput_no_expire.  At which
point it became possible for multiple calls of put_mountpoint on
the same hash chain to happen on the same time.

Krister Johansen <kjlx@templeofstupid.com> reported:
> This can cause a panic when simultaneous callers of put_mountpoint
> attempt to free the same mountpoint.  This occurs because some callers
> hold the mount_hash_lock, while others hold the namespace lock.  Some
> even hold both.
>
> In this submitter's case, the panic manifested itself as a GP fault in
> put_mountpoint() when it called hlist_del() and attempted to dereference
> a m_hash.pprev that had been poisioned by another thread.

Al Viro observed that the simple fix is to switch from using the namespace_sem
to the mount_lock to protect the mountpoint hash table.

I have taken Al's suggested patch, moved put_mountpoint in pivot_root
(instead of taking mount_lock an additional time), and have replaced
new_mountpoint with get_mountpoint, a function that does the hash table
lookup and addition under the mount_lock.   The introduction of get_mountpoint
ensures that only the mount_lock is needed to manipulate the mountpoint
hashtable.

d_set_mounted is modified to only set DCACHE_MOUNTED if it is not
already set.  This allows get_mountpoint to use the setting of
DCACHE_MOUNTED to ensure adding a struct mountpoint for a dentry
happens exactly once.

Cc: stable@vger.kernel.org
Fixes: ce07d891a089 ("mnt: Honor MNT_LOCKED when detaching mounts")
Reported-by: Krister Johansen <kjlx@templeofstupid.com>
Suggested-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/dcache.c    |  7 +++++--
 fs/namespace.c | 64 +++++++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 769903dbc19d..95d71eda8142 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1336,8 +1336,11 @@ int d_set_mounted(struct dentry *dentry)
 	}
 	spin_lock(&dentry->d_lock);
 	if (!d_unlinked(dentry)) {
-		dentry->d_flags |= DCACHE_MOUNTED;
-		ret = 0;
+		ret = -EBUSY;
+		if (!d_mountpoint(dentry)) {
+			dentry->d_flags |= DCACHE_MOUNTED;
+			ret = 0;
+		}
 	}
  	spin_unlock(&dentry->d_lock);
 out:
diff --git a/fs/namespace.c b/fs/namespace.c
index b5b1259e064f..487ba30bb5c6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -742,26 +742,50 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
 	return NULL;
 }
 
-static struct mountpoint *new_mountpoint(struct dentry *dentry)
+static struct mountpoint *get_mountpoint(struct dentry *dentry)
 {
-	struct hlist_head *chain = mp_hash(dentry);
-	struct mountpoint *mp;
+	struct mountpoint *mp, *new = NULL;
 	int ret;
 
-	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
-	if (!mp)
+	if (d_mountpoint(dentry)) {
+mountpoint:
+		read_seqlock_excl(&mount_lock);
+		mp = lookup_mountpoint(dentry);
+		read_sequnlock_excl(&mount_lock);
+		if (mp)
+			goto done;
+	}
+
+	if (!new)
+		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+	if (!new)
 		return ERR_PTR(-ENOMEM);
 
+
+	/* Exactly one processes may set d_mounted */
 	ret = d_set_mounted(dentry);
-	if (ret) {
-		kfree(mp);
-		return ERR_PTR(ret);
-	}
 
-	mp->m_dentry = dentry;
-	mp->m_count = 1;
-	hlist_add_head(&mp->m_hash, chain);
-	INIT_HLIST_HEAD(&mp->m_list);
+	/* Someone else set d_mounted? */
+	if (ret == -EBUSY)
+		goto mountpoint;
+
+	/* The dentry is not available as a mountpoint? */
+	mp = ERR_PTR(ret);
+	if (ret)
+		goto done;
+
+	/* Add the new mountpoint to the hash table */
+	read_seqlock_excl(&mount_lock);
+	new->m_dentry = dentry;
+	new->m_count = 1;
+	hlist_add_head(&new->m_hash, mp_hash(dentry));
+	INIT_HLIST_HEAD(&new->m_list);
+	read_sequnlock_excl(&mount_lock);
+
+	mp = new;
+	new = NULL;
+done:
+	kfree(new);
 	return mp;
 }
 
@@ -1595,11 +1619,11 @@ void __detach_mounts(struct dentry *dentry)
 	struct mount *mnt;
 
 	namespace_lock();
+	lock_mount_hash();
 	mp = lookup_mountpoint(dentry);
 	if (IS_ERR_OR_NULL(mp))
 		goto out_unlock;
 
-	lock_mount_hash();
 	event++;
 	while (!hlist_empty(&mp->m_list)) {
 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
@@ -1609,9 +1633,9 @@ void __detach_mounts(struct dentry *dentry)
 		}
 		else umount_tree(mnt, UMOUNT_CONNECTED);
 	}
-	unlock_mount_hash();
 	put_mountpoint(mp);
 out_unlock:
+	unlock_mount_hash();
 	namespace_unlock();
 }
 
@@ -2038,9 +2062,7 @@ static struct mountpoint *lock_mount(struct path *path)
 	namespace_lock();
 	mnt = lookup_mnt(path);
 	if (likely(!mnt)) {
-		struct mountpoint *mp = lookup_mountpoint(dentry);
-		if (!mp)
-			mp = new_mountpoint(dentry);
+		struct mountpoint *mp = get_mountpoint(dentry);
 		if (IS_ERR(mp)) {
 			namespace_unlock();
 			inode_unlock(dentry->d_inode);
@@ -2059,7 +2081,11 @@ static struct mountpoint *lock_mount(struct path *path)
 static void unlock_mount(struct mountpoint *where)
 {
 	struct dentry *dentry = where->m_dentry;
+
+	read_seqlock_excl(&mount_lock);
 	put_mountpoint(where);
+	read_sequnlock_excl(&mount_lock);
+
 	namespace_unlock();
 	inode_unlock(dentry->d_inode);
 }
@@ -3135,9 +3161,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	/* A moved mount should not expire automatically */
 	list_del_init(&new_mnt->mnt_expire);
+	put_mountpoint(root_mp);
 	unlock_mount_hash();
 	chroot_fs_refs(&root, &new);
-	put_mountpoint(root_mp);
 	error = 0;
 out4:
 	unlock_mount(old_mp);
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-04  3:53             ` [PATCH] mnt: Protect the mountpoint hashtable with mount_lock Eric W. Biederman
@ 2017-01-04 21:04               ` Eric W. Biederman
  2017-01-07  5:06                 ` Al Viro
  2017-05-14  2:15                 ` Andrei Vagin
  2017-01-06  7:00               ` [PATCH] mnt: Protect the mountpoint hashtable with mount_lock Krister Johansen
  1 sibling, 2 replies; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-04 21:04 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai


Ever since mount propagation was introduced, in cases where a mount is
propagated to a parent mount and mountpoint pair that is already in use, the
code has placed the new mount behind the old mount in the mount hash
table.

This implementation detail is problematic as it allows creating
arbitrary length mount hash chains.

Furthermore it invalidates the constraint maintained elsewhere in the
mount code that a parent mount and a mountpoint pair will have exactly
one mount upon them.  Making it hard to deal with and to talk about
this special case in the mount code.

Modify mount propagation to notice when there is already a mount at
the parent mount and mountpoint where a new mount is propagating to
and place that preexisting mount on top of the new mount.

Modify unmount propagation to notice when a mount that is being
unmounted has another mount on top of it (and no other children), and
to replace the unmounted mount with the mount on top of it.

Move the MNT_UMOUNT test from __lookup_mnt_last into
__propagate_umount as that is the only call of __lookup_mnt_last where
MNT_UMOUNT may be set on any mount visible in the mount hash table.

These modifications allow:
 - __lookup_mnt_last to be removed.
 - attach_shadows to be renamed __attach_mnt and the it's shadow
   handling to be removed.
 - commit_tree to be simplified
 - copy_tree to be simplified

The result is an easier to understand tree of mounts that does not
allow creation of arbitrary length hash chains in the mount hash table.

v2: Updated to mnt_change_mountpoint to not call dput or mntput
and instead to decrement the counts directly.  It is guaranteed
that there will be other references when mnt_change_mountpoint is
called so this is safe.

Cc: stable@vger.kernel.org
Fixes: b90fa9ae8f51 ("[PATCH] shared mount handling: bind and rbind")
Tested-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---

Since the last version some of you may have seen I have modified
my implementation of mnt_change_mountpoint so that it no longer calls
mntput or dput but instead relies on the knowledge that it can not
possibly have the last reference to the mnt and dentry of interest.
This avoids code checking tools from complaining bitterly.

This is on top of my previous patch that sorts out locking of the
mountpoint hash table.  After time giving ample time for review I intend
to push this and the previous bug fix to Linus.

 fs/mount.h     |   1 -
 fs/namespace.c | 110 +++++++++++++++++++++++++++++++--------------------------
 fs/pnode.c     |  27 ++++++++++----
 fs/pnode.h     |   2 ++
 4 files changed, 82 insertions(+), 58 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 2c856fc47ae3..2826543a131d 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
 }
 
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
-extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
 extern int __legitimize_mnt(struct vfsmount *, unsigned);
 extern bool legitimize_mnt(struct vfsmount *, unsigned);
diff --git a/fs/namespace.c b/fs/namespace.c
index 487ba30bb5c6..91ccfb73f0e0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 }
 
 /*
- * find the last mount at @dentry on vfsmount @mnt.
- * mount_lock must be held.
- */
-struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
-{
-	struct mount *p, *res = NULL;
-	p = __lookup_mnt(mnt, dentry);
-	if (!p)
-		goto out;
-	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-		res = p;
-	hlist_for_each_entry_continue(p, mnt_hash) {
-		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
-			break;
-		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-			res = p;
-	}
-out:
-	return res;
-}
-
-/*
  * lookup_mnt - Return the first child mount mounted at path
  *
  * "First" means first mounted chronologically.  If you create the
@@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
 	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
 }
 
+static void __attach_mnt(struct mount *mnt, struct mount *parent)
+{
+	hlist_add_head_rcu(&mnt->mnt_hash,
+			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
+	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
 			struct mountpoint *mp)
 {
 	mnt_set_mountpoint(parent, mp, mnt);
-	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
-	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	__attach_mnt(mnt, parent);
 }
 
-static void attach_shadowed(struct mount *mnt,
-			struct mount *parent,
-			struct mount *shadows)
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
 {
-	if (shadows) {
-		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
-		list_add(&mnt->mnt_child, &shadows->mnt_child);
-	} else {
-		hlist_add_head_rcu(&mnt->mnt_hash,
-				m_hash(&parent->mnt, mnt->mnt_mountpoint));
-		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
-	}
+	struct mountpoint *old_mp = mnt->mnt_mp;
+	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
+	struct mount *old_parent = mnt->mnt_parent;
+
+	list_del_init(&mnt->mnt_child);
+	hlist_del_init(&mnt->mnt_mp_list);
+	hlist_del_init_rcu(&mnt->mnt_hash);
+
+	attach_mnt(mnt, parent, mp);
+
+	put_mountpoint(old_mp);
+
+	/*
+	 * Safely avoid even the suggestion this code might sleep or
+	 * lock the mount hash by taking avantage of the knowlege that
+	 * mnt_change_mounpoint will not release the final reference
+	 * to a mountpoint.
+	 *
+	 * During mounting, another mount will continue to use the old
+	 * mountpoint and during unmounting, the old mountpoint will
+	 * continue to exist until namespace_unlock which happens well
+	 * after mnt_change_mountpoint.
+	 */
+	spin_lock(&old_mountpoint->d_lock);
+	old_mountpoint->d_lockref.count--;
+	spin_unlock(&old_mountpoint->d_lock);
+
+	mnt_add_count(old_parent, -1);
 }
 
 /*
  * vfsmount lock must be held for write
  */
-static void commit_tree(struct mount *mnt, struct mount *shadows)
+static void commit_tree(struct mount *mnt)
 {
 	struct mount *parent = mnt->mnt_parent;
 	struct mount *m;
@@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 	n->mounts += n->pending_mounts;
 	n->pending_mounts = 0;
 
-	attach_shadowed(mnt, parent, shadows);
+	__attach_mnt(mnt, parent);
 	touch_mnt_namespace(n);
 }
 
@@ -1764,7 +1766,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 			continue;
 
 		for (s = r; s; s = next_mnt(s, r)) {
-			struct mount *t = NULL;
 			if (!(flag & CL_COPY_UNBINDABLE) &&
 			    IS_MNT_UNBINDABLE(s)) {
 				s = skip_mnt_tree(s);
@@ -1786,14 +1787,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 				goto out;
 			lock_mount_hash();
 			list_add_tail(&q->mnt_list, &res->mnt_list);
-			mnt_set_mountpoint(parent, p->mnt_mp, q);
-			if (!list_empty(&parent->mnt_mounts)) {
-				t = list_last_entry(&parent->mnt_mounts,
-					struct mount, mnt_child);
-				if (t->mnt_mp != p->mnt_mp)
-					t = NULL;
-			}
-			attach_shadowed(q, parent, t);
+			attach_mnt(q, parent, p->mnt_mp);
 			unlock_mount_hash();
 		}
 	}
@@ -1992,10 +1986,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 {
 	HLIST_HEAD(tree_list);
 	struct mnt_namespace *ns = dest_mnt->mnt_ns;
+	struct mountpoint *smp;
 	struct mount *child, *p;
 	struct hlist_node *n;
 	int err;
 
+	/* Preallocate a mountpoint in case the new mounts need
+	 * to be tucked under other mounts.
+	 */
+	smp = get_mountpoint(source_mnt->mnt.mnt_root);
+	if (IS_ERR(smp))
+		return PTR_ERR(smp);
+
 	/* Is there space to add these mounts to the mount namespace? */
 	if (!parent_path) {
 		err = count_mounts(ns, source_mnt);
@@ -2022,17 +2024,22 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-		commit_tree(source_mnt, NULL);
+		commit_tree(source_mnt);
 	}
 
 	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
-		struct mount *q;
 		hlist_del_init(&child->mnt_hash);
-		q = __lookup_mnt_last(&child->mnt_parent->mnt,
-				      child->mnt_mountpoint);
-		commit_tree(child, q);
+		if (child->mnt.mnt_root == smp->m_dentry) {
+			struct mount *q;
+			q = __lookup_mnt(&child->mnt_parent->mnt,
+					 child->mnt_mountpoint);
+			if (q)
+				mnt_change_mountpoint(child, smp, q);
+		}
+		commit_tree(child);
 	}
 	unlock_mount_hash();
+	put_mountpoint(smp);
 
 	return 0;
 
@@ -2046,6 +2053,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	cleanup_group_ids(source_mnt, NULL);
  out:
 	ns->pending_mounts = 0;
+	put_mountpoint(smp);
 	return err;
 }
 
diff --git a/fs/pnode.c b/fs/pnode.c
index 06a793f4ae38..eb4331240fd1 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -327,6 +327,9 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
  */
 static inline int do_refcount_check(struct mount *mnt, int count)
 {
+	struct mount *topper = __lookup_mnt(&mnt->mnt, mnt->mnt.mnt_root);
+	if (topper)
+		count++;
 	return mnt_get_count(mnt) > count;
 }
 
@@ -359,7 +362,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 
 	for (m = propagation_next(parent, parent); m;
 	     		m = propagation_next(m, parent)) {
-		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
 		if (child && list_empty(&child->mnt_mounts) &&
 		    (ret = do_refcount_check(child, 1)))
 			break;
@@ -381,7 +384,7 @@ void propagate_mount_unlock(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
 		if (child)
 			child->mnt.mnt_flags &= ~MNT_LOCKED;
 	}
@@ -399,9 +402,11 @@ static void mark_umount_candidates(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt_last(&m->mnt,
+		struct mount *child = __lookup_mnt(&m->mnt,
 						mnt->mnt_mountpoint);
-		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
+			continue;
+		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
 			SET_MNT_MARK(child);
 		}
 	}
@@ -420,8 +425,8 @@ static void __propagate_umount(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-
-		struct mount *child = __lookup_mnt_last(&m->mnt,
+		struct mount *topper;
+		struct mount *child = __lookup_mnt(&m->mnt,
 						mnt->mnt_mountpoint);
 		/*
 		 * umount the child only if the child has no children
@@ -430,6 +435,16 @@ static void __propagate_umount(struct mount *mnt)
 		if (!child || !IS_MNT_MARKED(child))
 			continue;
 		CLEAR_MNT_MARK(child);
+
+		/* If there is exactly one mount covering all of child
+		 * replace child with that mount.
+		 */
+		topper = __lookup_mnt(&child->mnt, child->mnt.mnt_root);
+		if (topper &&
+		    (child->mnt_mounts.next == &topper->mnt_child) &&
+		    (topper->mnt_child.next == &child->mnt_mounts))
+			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, topper);
+
 		if (list_empty(&child->mnt_mounts)) {
 			list_del_init(&child->mnt_child);
 			child->mnt.mnt_flags |= MNT_UMOUNT;
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..dc87e65becd2 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
 unsigned int mnt_get_count(struct mount *mnt);
 void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 			struct mount *);
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
+			   struct mount *mnt);
 struct mount *copy_tree(struct mount *, struct dentry *, int);
 bool is_path_reachable(struct mount *, struct dentry *,
 			 const struct path *root);
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] mnt: Protect the mountpoint hashtable with mount_lock
  2017-01-04  3:53             ` [PATCH] mnt: Protect the mountpoint hashtable with mount_lock Eric W. Biederman
  2017-01-04 21:04               ` [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts Eric W. Biederman
@ 2017-01-06  7:00               ` Krister Johansen
  1 sibling, 0 replies; 63+ messages in thread
From: Krister Johansen @ 2017-01-06  7:00 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel

On Wed, Jan 04, 2017 at 04:53:59PM +1300, Eric W. Biederman wrote:
> 
> Protecting the mountpoint hashtable with namespace_sem was sufficient
> until a call to umount_mnt was added to mntput_no_expire.  At which
> point it became possible for multiple calls of put_mountpoint on
> the same hash chain to happen on the same time.
> 
> Kristen Johansen <kjlx@templeofstupid.com> reported:
> > This can cause a panic when simultaneous callers of put_mountpoint
> > attempt to free the same mountpoint.  This occurs because some callers
> > hold the mount_hash_lock, while others hold the namespace lock.  Some
> > even hold both.
> >
> > In this submitter's case, the panic manifested itself as a GP fault in
> > put_mountpoint() when it called hlist_del() and attempted to dereference
> > a m_hash.pprev that had been poisioned by another thread.
> 
> Al Viro observed that the simple fix is to switch from using the namespace_sem
> to the mount_lock to protect the mountpoint hash table.
> 
> I have taken Al's suggested patch moved put_mountpoint in pivot_root
> (instead of taking mount_lock an additional time), and have replaced
> new_mountpoint with get_mountpoint a function that does the hash table
> lookup and addition under the mount_lock.   The introduction of get_mounptoint
> ensures that only the mount_lock is needed to manipulate the mountpoint
> hashtable.
> 
> d_set_mounted is modified to only set DCACHE_MOUNTED if it is not
> already set.  This allows get_mountpoint to use the setting of
> DCACHE_MOUNTED to ensure adding a struct mountpoint for a dentry
> happens exactly once.
> 
> Cc: stable@vger.kernel.org
> Fixes: ce07d891a089 ("mnt: Honor MNT_LOCKED when detaching mounts")
> Reported-by: Krister Johansen <kjlx@templeofstupid.com>
> Suggested-by: Al Viro <viro@ZenIV.linux.org.uk>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---

Sorry for the slow reply.

This looks right to me.  I just pulled in the patch and went through all
of the code paths in cscope.  Everything is now under the mount_lock,
which solves the problem from my perspective.  Feel free to put me down
as a reviewed-by if my vote counts.

There's another issue with MNT_LOCKED and detached mounts that I've been
investigating.  I'd be curious to get your opinion before I write any
code.  I'll send that out in a separate e-mail, though.

-K

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-04 21:04               ` [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts Eric W. Biederman
@ 2017-01-07  5:06                 ` Al Viro
  2017-01-11  0:10                   ` Eric W. Biederman
  2017-05-14  2:15                 ` Andrei Vagin
  1 sibling, 1 reply; 63+ messages in thread
From: Al Viro @ 2017-01-07  5:06 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai

On Thu, Jan 05, 2017 at 10:04:14AM +1300, Eric W. Biederman wrote:

>  - attach_shadows to be renamed __attach_mnt and the it's shadow
>    handling to be removed.

Er...  s/the it's/its/, presumably?  Or am I misparsing that?

> v2: Updated to mnt_change_mountpoint to not call dput or mntput
> and instead to decrement the counts directly.  It is guaranteed
> that there will be other references when mnt_change_mountpoint is
> called so this is safe.

> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
>  {

Too generic name, IMO, and I really wonder if "mount" (== interpose) and
"umount" (== excise?) cases would be better off separately.

> +	struct mountpoint *old_mp = mnt->mnt_mp;
> +	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
> +	struct mount *old_parent = mnt->mnt_parent;
> +
> +	list_del_init(&mnt->mnt_child);
> +	hlist_del_init(&mnt->mnt_mp_list);
> +	hlist_del_init_rcu(&mnt->mnt_hash);
> +
> +	attach_mnt(mnt, parent, mp);
> +
> +	put_mountpoint(old_mp);

> +	 *
> +	 * During mounting, another mount will continue to use the old
> +	 * mountpoint and during unmounting, the old mountpoint will
> +	 * continue to exist until namespace_unlock which happens well
> +	 * after mnt_change_mountpoint.
> +	 */

Umm...  AFAICS, in the former case "another mount" is simply parent, right?

> +	spin_lock(&old_mountpoint->d_lock);
> +	old_mountpoint->d_lockref.count--;
> +	spin_unlock(&old_mountpoint->d_lock);
> +	mnt_add_count(old_parent, -1);


> +		if (child->mnt.mnt_root == smp->m_dentry) {

Explain, please.  In which case is that condition _not_ satisfied, and
what should happen i

> +			struct mount *q;
> +			q = __lookup_mnt(&child->mnt_parent->mnt,
> +					 child->mnt_mountpoint);
> +			if (q)
> +				mnt_change_mountpoint(child, smp, q);


>  	unlock_mount_hash();
> +	put_mountpoint(smp);

Wrong order...

>  	ns->pending_mounts = 0;
> +	put_mountpoint(smp);

... and worse yet here.


>  static inline int do_refcount_check(struct mount *mnt, int count)
>  {
> +	struct mount *topper = __lookup_mnt(&mnt->mnt, mnt->mnt.mnt_root);
> +	if (topper)
> +		count++;
>  	return mnt_get_count(mnt) > count;
>  }

> @@ -359,7 +362,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>  
>  	for (m = propagation_next(parent, parent); m;
>  	     		m = propagation_next(m, parent)) {
> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>  		if (child && list_empty(&child->mnt_mounts) &&
>  		    (ret = do_refcount_check(child, 1)))
>  			break;

Er...  You do realize that you can end up with more that one such
propagation, right?  IOW, there might be more than one thing slipped in.

> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
>  			SET_MNT_MARK(child);

Reread the condition, please...  And yes, I realize that original is
also rather odd; at a guess it was meant to be !(, not (!, but that's
just a guess - it's your code, IIRC.

> @@ -420,8 +425,8 @@ static void __propagate_umount(struct mount *mnt)
>  
>  	for (m = propagation_next(parent, parent); m;
>  			m = propagation_next(m, parent)) {
> -
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> +		struct mount *topper;
> +		struct mount *child = __lookup_mnt(&m->mnt,
>  						mnt->mnt_mountpoint);
>  		/*
>  		 * umount the child only if the child has no children
> @@ -430,6 +435,16 @@ static void __propagate_umount(struct mount *mnt)
>  		if (!child || !IS_MNT_MARKED(child))
>  			continue;
>  		CLEAR_MNT_MARK(child);
> +
> +		/* If there is exactly one mount covering all of child
> +		 * replace child with that mount.
> +		 */
> +		topper = __lookup_mnt(&child->mnt, child->mnt.mnt_root);
> +		if (topper &&

> +		    (child->mnt_mounts.next == &topper->mnt_child) &&
> +		    (topper->mnt_child.next == &child->mnt_mounts))

Weird way to spell child->mnt_mounts.next == child->mnt_mounts.prev, that...
Or, perhaps, the entire thing ought to be
		if (list_is_singular(&child->mnt_mounts)) {
			topper = list_first_entry(&child->mnt_mounts,
						  struct mount, mnt_child);
			if (topper->mnt_parent == child &&
			    topped->mnt_mountpoint == child->mnt.mnt_root)

to avoid hash lookups.

> +			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, topper);


FWIW, the my main worry here is your handling of the umount.  For
example, what happens if
	* something is mounted on A (m1)
	* something else is mounted on A/bar (m2)
	* D is a slave of C
	* something has been mounted on D/foo (n)
	* you do mount --rbind A C/foo (m1' on C/foo, m2' on m1/bar,
					m1'' interposed on D/foo under n,
					m2'' on m1''/bar,
					m1'' slave of m1', m2'' slave of m2)
	* you make C/foo and C/foo/bar private (m1'' and m2'' are not getting
					propagation from m1' and m2' anymore)
	* you umount C/foo/bar		(m2' is unmounted)
	* you umount C/foo
m1' gets unmounted, all right, but what of m1''?  D is a slave of C, so we
get propagation of umount from C/foo to D/foo; m1'' still has m2'' attached
to it.  AFAICS, your logics will happily slip m1'' from under n (assuming
that n itself is not busy), and leak both m1'' and m2''.

OTOH, the case of multiple slip-under (Z is slave of Y, which is a slave of X,
mount on Z, then mount of Y, then mount on X) the check for being busy would
do very odd things.

Something's fishy on the umount side...

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-07  5:06                 ` Al Viro
@ 2017-01-11  0:10                   ` Eric W. Biederman
  2017-01-11  4:11                     ` Al Viro
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-11  0:10 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai

Al Viro <viro@ZenIV.linux.org.uk> writes:

> On Thu, Jan 05, 2017 at 10:04:14AM +1300, Eric W. Biederman wrote:
>
>>  - attach_shadows to be renamed __attach_mnt and the it's shadow
>>    handling to be removed.
>
> Er...  s/the it's/its/, presumably?  Or am I misparsing that?
>
>> v2: Updated to mnt_change_mountpoint to not call dput or mntput
>> and instead to decrement the counts directly.  It is guaranteed
>> that there will be other references when mnt_change_mountpoint is
>> called so this is safe.
>
>> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
>>  {
>
> Too generic name, IMO, and I really wonder if "mount" (== interpose) and
> "umount" (== excise?) cases would be better off separately.
>
>> +	struct mountpoint *old_mp = mnt->mnt_mp;
>> +	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
>> +	struct mount *old_parent = mnt->mnt_parent;
>> +
>> +	list_del_init(&mnt->mnt_child);
>> +	hlist_del_init(&mnt->mnt_mp_list);
>> +	hlist_del_init_rcu(&mnt->mnt_hash);
>> +
>> +	attach_mnt(mnt, parent, mp);
>> +
>> +	put_mountpoint(old_mp);
>
>> +	 *
>> +	 * During mounting, another mount will continue to use the old
>> +	 * mountpoint and during unmounting, the old mountpoint will
>> +	 * continue to exist until namespace_unlock which happens well
>> +	 * after mnt_change_mountpoint.
>> +	 */
>
> Umm...  AFAICS, in the former case "another mount" is simply parent,
> right?

Yes it is the new parent mountpoint.  I was looking at it from a
different perspective.


>> +	spin_lock(&old_mountpoint->d_lock);
>> +	old_mountpoint->d_lockref.count--;
>> +	spin_unlock(&old_mountpoint->d_lock);
>> +	mnt_add_count(old_parent, -1);
>
>
>> +		if (child->mnt.mnt_root == smp->m_dentry) {
>
> Explain, please.  In which case is that condition _not_ satisfied, and
> what should happen i

When a tree is grafted in that condition does not apply to the lower
leaves of the tree.  At the same time nothing needs to be done for those
leaves.  Only the primary mountpoint needs to worry about tucking.


>> +			struct mount *q;
>> +			q = __lookup_mnt(&child->mnt_parent->mnt,
>> +					 child->mnt_mountpoint);
>> +			if (q)
>> +				mnt_change_mountpoint(child, smp, q);
>
>
>>  	unlock_mount_hash();
>> +	put_mountpoint(smp);
>
> Wrong order...
>
>>  	ns->pending_mounts = 0;
>> +	put_mountpoint(smp);
>
> ... and worse yet here.

Definitely.  I totally spaced on propagating the locking changes to this
patch when I rebased it.

>>  static inline int do_refcount_check(struct mount *mnt, int count)
>>  {
>> +	struct mount *topper = __lookup_mnt(&mnt->mnt, mnt->mnt.mnt_root);
>> +	if (topper)
>> +		count++;
>>  	return mnt_get_count(mnt) > count;
>>  }
>
>> @@ -359,7 +362,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>>  
>>  	for (m = propagation_next(parent, parent); m;
>>  	     		m = propagation_next(m, parent)) {
>> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>>  		if (child && list_empty(&child->mnt_mounts) &&
>>  		    (ret = do_refcount_check(child, 1)))
>>  			break;
>
> Er...  You do realize that you can end up with more that one such
> propagation, right?  IOW, there might be more than one thing slipped
> in.

So I have stared at this a lot and I don't see what you seem to be
seeing here.  I do however see that propagate_mount_busy has been
buggy since the beginning, as it only fails in the propagation case
if list of children is empty.

I also see my modification to the code is buggy since the list empty
precludes my changes to do_refcount_check from being effective.

I have looked hard and your point with multiple propagations eludes me.

I am going to add a patch to fix propagate_mount_busy, and then rebase
this patch on top of that, and post it all for review.


>> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
>>  			SET_MNT_MARK(child);
>
> Reread the condition, please...  And yes, I realize that original is
> also rather odd; at a guess it was meant to be !(, not (!, but that's
> just a guess - it's your code, IIRC.

The intent is to find all trees that we can unmount where the point
at which the tree meets the rest of the mounts is not locked.

The mark is later used to see if it ok to unmount a mount or if
we will reveal information to userspace (by breaking a lock).

Therefore the mark needs to be set if the mount is unlocked,
and recursively the mark needs to be set for every child of
that mount where the mark is set (the second condition).

Which makes the code essentially correct.

Unfortunately the code does not handle multiple unmounts from the same
parent mount point.  Which means shadow/side mount support and untucking
of mounts fails to handle multiple unmounts from the same parent mount.

The way the mark is used fundamentally assumes only one operation on
each mountpoint, and that is broken.

I intend to work on propagute_umount some more and fix that brokenness
and hopefully fix the performance issues as well.  But I am leaving that
work for another change as it is going to require stripping out and
replacing algorithms, and so far I don't have good solutions.

>> @@ -420,8 +425,8 @@ static void __propagate_umount(struct mount *mnt)
>>  
>>  	for (m = propagation_next(parent, parent); m;
>>  			m = propagation_next(m, parent)) {
>> -
>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>> +		struct mount *topper;
>> +		struct mount *child = __lookup_mnt(&m->mnt,
>>  						mnt->mnt_mountpoint);
>>  		/*
>>  		 * umount the child only if the child has no children
>> @@ -430,6 +435,16 @@ static void __propagate_umount(struct mount *mnt)
>>  		if (!child || !IS_MNT_MARKED(child))
>>  			continue;
>>  		CLEAR_MNT_MARK(child);
>> +
>> +		/* If there is exactly one mount covering all of child
>> +		 * replace child with that mount.
>> +		 */
>> +		topper = __lookup_mnt(&child->mnt, child->mnt.mnt_root);
>> +		if (topper &&
>
>> +		    (child->mnt_mounts.next == &topper->mnt_child) &&
>> +		    (topper->mnt_child.next == &child->mnt_mounts))
>
> Weird way to spell child->mnt_mounts.next == child->mnt_mounts.prev, that...
> Or, perhaps, the entire thing ought to be

Except it is clearer than that.  It verifies not just that there is one
list item but that topper is that one list item.

Further it is the exact same logic as is used in do_check_refcnt and
at this stage in development I figured it was more important to have
a recognizable pattern than to have the absolute most performant code.
Especially as the basic complexity of the code is the same either way.

> 		if (list_is_singular(&child->mnt_mounts)) {
> 			topper = list_first_entry(&child->mnt_mounts,
> 						  struct mount, mnt_child);
> 			if (topper->mnt_parent == child &&
> 			    topped->mnt_mountpoint == child->mnt.mnt_root)
>
> to avoid hash lookups.

That it would and now that I see that list_is_singular exists it looks
like a reasonable option.

>> +			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, topper);
>
>
> FWIW, the my main worry here is your handling of the umount.  For
> example, what happens if
> 	* something is mounted on A (m1)
> 	* something else is mounted on A/bar (m2)
> 	* D is a slave of C
> 	* something has been mounted on D/foo (n)
> 	* you do mount --rbind A C/foo (m1' on C/foo, m2' on m1'/bar,
> 					m1'' interposed on D/foo under n,
> 					m2'' on m1''/bar,
> 					m1'' slave of m1', m2'' slave of m2)
> 	* you make C/foo and C/foo/bar private (m1'' and m2'' are not getting
> 					propagation from m1' and m2' anymore)
> 	* you umount C/foo/bar		(m2' is unmounted)
> 	* you umount C/foo
> m1' gets unmounted, all right, but what of m1''?  D is a slave of C, so we
> get propagation of umount from C/foo to D/foo; m1'' still has m2'' attached
> to it.  AFAICS, your logics will happily slip m1'' from under n (assuming
> that n itself is not busy), and leak both m1'' and m2''.

Yes.  This is exactly the same behavior we have today without my patch.
The only difference is who is the parent mount.

$ cat > viro1.sh << EOF
#!/bin/sh
set -e
set -x

mount -t tmpfs base /mnt
mkdir -p /mnt/A
mount -t tmpfs m1 /mnt/A
mkdir -p /mnt/A/bar
mount -t tmpfs m2 /mnt/A/bar

mkdir -p /mnt/D
mkdir -p /mnt/C
mount -t tmpfs mC /mnt/C
mkdir -p /mnt/C/foo
mount --make-shared /mnt/C
mount --bind /mnt/C /mnt/D
mount --make-slave /mnt/D
mount -t tmpfs n /mnt/D/foo
mount --rbind /mnt/A /mnt/C/foo

echo
cat /proc/self/mountinfo

mount --make-private /mnt/C/foo
mount --make-private /mnt/C/foo/bar

echo
cat /proc/self/mountinfo

umount /mnt/C/foo/bar

echo
cat /proc/self/mountinfo

umount /mnt/C/foo

echo
cat /proc/self/mountinfo
EOF
$ chmod +x ./viro1.sh
$ unshare -Urm ./viro1.sh

At least when I run the above on a kernel with and without my patch
under discussion all I see different is mnt_id of the parent.  Which is
exactly what we should expect from this change.

Did I make a mistake in creating my script?

Or are you referring to the fact that propagate_mount_busy is just
plain buggy?

> OTOH, the case of multiple slip-under (Z is slave of Y, which is a slave of X,
> mount on Z, then mount of Y, then mount on X) the check for being busy would
> do very odd things.

I don't see what you are referring to.  Reposting shortly.

Eric

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-11  0:10                   ` Eric W. Biederman
@ 2017-01-11  4:11                     ` Al Viro
  2017-01-11 16:03                       ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Al Viro @ 2017-01-11  4:11 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai

On Wed, Jan 11, 2017 at 01:10:57PM +1300, Eric W. Biederman wrote:
> >> +		if (child->mnt.mnt_root == smp->m_dentry) {
> >
> > Explain, please.  In which case is that condition _not_ satisfied, and
> > what should happen i
> 
> When a tree is grafted in that condition does not apply to the lower
> leaves of the tree.  At the same time nothing needs to be done for those
> leaves.  Only the primary mountpoint needs to worry about tucking.

	How in hell would those lower leaves end up on the list in
attach_recursive_mnt()?  IDGI...

> >> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
> >>  			SET_MNT_MARK(child);
> >
> > Reread the condition, please...  And yes, I realize that original is
> > also rather odd; at a guess it was meant to be !(, not (!, but that's
> > just a guess - it's your code, IIRC.
> 
> The intent is to find all trees that we can unmount where the point
> at which the tree meets the rest of the mounts is not locked.
> 
> The mark is later used to see if it is ok to unmount a mount or if
> we will reveal information to userspace (by breaking a lock).
> 
> Therefore the mark needs to be set if the mount is unlocked,
> and recursively the mark needs to be set for every child of
> that mount where the mark is set (the second condition).

*blink*

You have "if mount is not locked - mark it; if mount is already marked -
mark it again".  The latter part (|| IS_MNT_MARKED(mnt), that is) looks
very odd, won't you agree?  What the hell was that (its counterpart in
the earlier code) about?

I could understand something along the lines "mark it unless it's locked
or already marked", but your code is "mark it if it's not locked *or*
if it's already marked".  Makes no sense in that form.

> > FWIW, the my main worry here is your handling of the umount.  For
> > example, what happens if
> > 	* something is mounted on A (m1)
> > 	* something else is mounted on A/bar (m2)
> > 	* D is a slave of C
> > 	* something has been mounted on D/foo (n)
> > 	* you do mount --rbind A C/foo (m1' on C/foo, m2' on m1'/bar,
> > 					m1'' interposed on D/foo under n,
> > 					m2'' on m1''/bar,
> > 					m1'' slave of m1', m2'' slave of m2)
> > 	* you make C/foo and C/foo/bar private (m1'' and m2'' are not getting
> > 					propagation from m1' and m2' anymore)
> > 	* you umount C/foo/bar		(m2' is unmounted)
> > 	* you umount C/foo
> > m1' gets unmounted, all right, but what of m1''?  D is a slave of C, so we
> > get propagation of umount from C/foo to D/foo; m1'' still has m2'' attached
> > to it.  AFAICS, your logics will happily slip m1'' from under n (assuming
> > that n itself is not busy), and leak both m1'' and m2''.
> 
> Yes.  This is exactly the same behavior we have today without my patch.
> The only difference is who is the parent mount.

Not quite.  In the current tree m1'' should get stuck there (and be exposed
when n gets unmounted); AFAICS, your change will have it kicked out, with
m2'' still attached and still contributing to refcount of m1''.

I might be missing something (and I hadn't checked your script - right now
I'm at 16 hours of uptime after only 4 hours of sleep).  Will take a look
at that after I grab some sleep...

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-11  4:11                     ` Al Viro
@ 2017-01-11 16:03                       ` Eric W. Biederman
  2017-01-11 16:18                         ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Eric W. Biederman
  2017-01-12  5:03                         ` [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts Al Viro
  0 siblings, 2 replies; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-11 16:03 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai

Al Viro <viro@ZenIV.linux.org.uk> writes:

> On Wed, Jan 11, 2017 at 01:10:57PM +1300, Eric W. Biederman wrote:
>> >> +		if (child->mnt.mnt_root == smp->m_dentry) {
>> >
>> > Explain, please.  In which case is that condition _not_ satisfied, and
>> > what should happen i
>> 
>> When a tree is grafted in that condition does not apply to the lower
>> leaves of the tree.  At the same time nothing needs to be done for those
>> leaves.  Only the primary mountpoint needs to worry about tucking.
>
> 	How in hell would those lower leaves end up on the list in
> attach_recursive_mnt()?  IDGI...

The submounts of a mount tree that is being attached need to have
commit_tree called on them to attach them to a mount namespace.

>
>> >> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
>> >>  			SET_MNT_MARK(child);
>> >
>> > Reread the condition, please...  And yes, I realize that original is
>> > also rather odd; at a guess it was meant to be !(, not (!, but that's
>> > just a guess - it's your code, IIRC.
>> 
>> The intent is to find all trees that we can unmount where the point
>> at which the tree meets the rest of the mounts is not locked.
>> 
>> The mark is later used to see if it ok to unmount a mount or if
>> we will reveal information to userspace (by breaking a lock).
>> 
>> Therefore the mark needs to be set if the mount is unlocked,
>> and recursively the mark needs to be set for every child of
>> that mount where the mark is set (the second condition).
>
> *blink*
>
> You have "if mount is not locked - mark it; if mount is already marked -
> mark it again".  The latter part (|| IS_MNT_MARKED(mnt), that is) looks
> very odd, won't you agree?  What the hell was that (its counterpart in
> the earlier code) about?

Not mark it again.  If the parent is marked mark the child.

This is about finding subtrees where the root of the subtree is unlocked
but the children may be locked to that root.  Still we can safely
unmount the entire subtree without revealing anything to userspace.

The walk is designed to happen from parent to the child mounts.

> I could understand something along the lines "mark it unless it's locked
> or already marked", but your code is "mark it if it's not locked *or*
> if it's already marked".  Makes no sense in that form.
>
>> > FWIW, the my main worry here is your handling of the umount.  For
>> > example, what happens if
>> > 	* something is mounted on A (m1)
>> > 	* something else is mounted on A/bar (m2)
>> > 	* D is a slave of C
>> > 	* something has been mounted on D/foo (n)
>> > 	* you do mount --rbind A C/foo (m1' on C/foo, m2' on m1'/bar,
>> > 					m1'' interposed on D/foo under n,
>> > 					m2'' on m1''/bar,
>> > 					m1'' slave of m1', m2'' slave of m2)
>> > 	* you make C/foo and C/foo/bar private (m1'' and m2'' are not getting
>> > 					propagation from m1' and m2' anymore)
>> > 	* you umount C/foo/bar		(m2' is unmounted)
>> > 	* you umount C/foo
>> > m1' gets unmounted, all right, but what of m1''?  D is a slave of C, so we
>> > get propagation of umount from C/foo to D/foo; m1'' still has m2'' attached
>> > to it.  AFAICS, your logics will happily slip m1'' from under n (assuming
>> > that n itself is not busy), and leak both m1'' and m2''.
>> 
>> Yes.  This is exactly the same behavior we have today without my patch.
>> The only difference is who is the parent mount.
>
> Not quite.  In the current tree m1'' should get stuck there (and be exposed
> when n gets unmounted); AFAICS, your change will have it kicked out, with
> m2'' still attached and still contributing to refcount of m1''.
>
> I might be missing something (and I hadn't checked your script - right now
> I'm at 16 hours of uptime after only 4 hours of sleep).  Will take a look
> at that after I grab some sleep...

Please look with fresh rested eyes.  I will see about posting a new
version of the patch shortly.

Eric


^ permalink raw reply	[flat|nested] 63+ messages in thread

* [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts.
  2017-01-11 16:03                       ` Eric W. Biederman
@ 2017-01-11 16:18                         ` Eric W. Biederman
  2017-01-11 16:19                           ` [REVIEW][PATCH 2/2] mnt: Tuck mounts under others instead of creating shadow/side mounts Eric W. Biederman
                                             ` (3 more replies)
  2017-01-12  5:03                         ` [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts Al Viro
  1 sibling, 4 replies; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-11 16:18 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai


When I look at what propagate_mount_busy is trying to do and I look
at the code closely I discover there is a great disconnect between the
two.  In the ordinary non-propagation case propagate_mount_busy has
been verifying that there are no submounts and that there are no
extraneous references on the mount.

For mounts that the unmount would propagate to propagate_mount_busy has
been verifying that there are no extraneous references only if there
are no submounts.  Which is nonsense.

Therefore rework the logic in propagate_mount_busy so that for each
mount it examines, it considers that mount busy if that mount has
children or if there are extraneous references to that mount.

While this check was incorrect we could leak mounts instead of simply
failing umount.

Cc: stable@vger.kernel.org
Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---

If you don't figure this fix is worth it after all of this time please
let me know.  This feels like the proper thing to do, and I don't expect
it will break anyone to fix this.

 fs/pnode.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 06a793f4ae38..12fafa711114 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -344,7 +344,6 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 {
 	struct mount *m, *child;
 	struct mount *parent = mnt->mnt_parent;
-	int ret = 0;
 
 	if (mnt == parent)
 		return do_refcount_check(mnt, refcnt);
@@ -360,11 +359,13 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 	for (m = propagation_next(parent, parent); m;
 	     		m = propagation_next(m, parent)) {
 		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
-		if (child && list_empty(&child->mnt_mounts) &&
-		    (ret = do_refcount_check(child, 1)))
-			break;
+		if (!child)
+			continue;
+		if (!list_empty(&child->mnt_mounts) ||
+		    do_refcount_check(child, 1))
+			return 1;
 	}
-	return ret;
+	return 0;
 }
 
 /*
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [REVIEW][PATCH 2/2] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-11 16:18                         ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Eric W. Biederman
@ 2017-01-11 16:19                           ` Eric W. Biederman
  2017-01-12  5:45                             ` Al Viro
  2017-01-12  5:30                           ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Al Viro
                                             ` (2 subsequent siblings)
  3 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-11 16:19 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai


Ever since mount propagation was introduced, in cases where a mount is
propagated to a parent mount and mountpoint pair that is already in
use, the code has placed the new mount behind the old mount in the
mount hash table.
This implementation detail is problematic as it allows creating
arbitrary length mount hash chains.

Furthermore it invalidates the constraint maintained elsewhere in the
mount code that a parent mount and a mountpoint pair will have exactly
one mount upon them.  Making it hard to deal with and to talk about
this special case in the mount code.

Modify mount propagation to notice when there is already a mount at
the parent mount and mountpoint where a new mount is propagating to
and place that preexisting mount on top of the new mount.

Modify unmount propagation to notice when a mount that is being
unmounted has another mount on top of it (and no other children), and
to replace the unmounted mount with the mount on top of it.

Move the MNT_UMOUNT test from __lookup_mnt_last into
__propagate_umount as that is the only call of __lookup_mnt_last where
MNT_UMOUNT may be set on any mount visible in the mount hash table.

These modifications allow:
 - __lookup_mnt_last to be removed.
 - attach_shadows to be renamed __attach_mnt and its shadow
   handling to be removed.
 - commit_tree to be simplified
 - copy_tree to be simplified

The result is an easier to understand tree of mounts that does not
allow creation of arbitrary length hash chains in the mount hash table.

v2: Updated to mnt_change_mountpoint to not call dput or mntput
and instead to decrement the counts directly.  It is guaranteed
that there will be other references when mnt_change_mountpoint is
called so this is safe.

v3: Moved put_mountpoint under mount_lock in attach_recursive_mnt
    As the locking in fs/namespace.c changed between v2 and v3.

v4: Reworked the logic in propagate_mount_busy and __propagate_umount
    that detects when a mount completely covers another mount.

Cc: stable@vger.kernel.org
Fixes: b90fa9ae8f51 ("[PATCH] shared mount handling: bind and rbind")
Tested-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/mount.h     |   1 -
 fs/namespace.c | 114 +++++++++++++++++++++++++++++++--------------------------
 fs/pnode.c     |  55 +++++++++++++++++++++++-----
 fs/pnode.h     |   2 +
 4 files changed, 111 insertions(+), 61 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 2c856fc47ae3..2826543a131d 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
 }
 
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
-extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
 extern int __legitimize_mnt(struct vfsmount *, unsigned);
 extern bool legitimize_mnt(struct vfsmount *, unsigned);
diff --git a/fs/namespace.c b/fs/namespace.c
index 487ba30bb5c6..e076f51944d2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 }
 
 /*
- * find the last mount at @dentry on vfsmount @mnt.
- * mount_lock must be held.
- */
-struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
-{
-	struct mount *p, *res = NULL;
-	p = __lookup_mnt(mnt, dentry);
-	if (!p)
-		goto out;
-	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-		res = p;
-	hlist_for_each_entry_continue(p, mnt_hash) {
-		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
-			break;
-		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-			res = p;
-	}
-out:
-	return res;
-}
-
-/*
  * lookup_mnt - Return the first child mount mounted at path
  *
  * "First" means first mounted chronologically.  If you create the
@@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
 	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
 }
 
+static void __attach_mnt(struct mount *mnt, struct mount *parent)
+{
+	hlist_add_head_rcu(&mnt->mnt_hash,
+			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
+	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
 			struct mountpoint *mp)
 {
 	mnt_set_mountpoint(parent, mp, mnt);
-	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
-	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	__attach_mnt(mnt, parent);
 }
 
-static void attach_shadowed(struct mount *mnt,
-			struct mount *parent,
-			struct mount *shadows)
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
 {
-	if (shadows) {
-		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
-		list_add(&mnt->mnt_child, &shadows->mnt_child);
-	} else {
-		hlist_add_head_rcu(&mnt->mnt_hash,
-				m_hash(&parent->mnt, mnt->mnt_mountpoint));
-		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
-	}
+	struct mountpoint *old_mp = mnt->mnt_mp;
+	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
+	struct mount *old_parent = mnt->mnt_parent;
+
+	list_del_init(&mnt->mnt_child);
+	hlist_del_init(&mnt->mnt_mp_list);
+	hlist_del_init_rcu(&mnt->mnt_hash);
+
+	attach_mnt(mnt, parent, mp);
+
+	put_mountpoint(old_mp);
+
+	/*
+	 * Safely avoid even the suggestion this code might sleep or
+	 * lock the mount hash by taking advantage of the knowledge that
+	 * mnt_change_mountpoint will not release the final reference
+	 * to a mountpoint.
+	 *
+	 * During mounting, the mount passed in as the parent mount will
+	 * continue to use the old mountpoint and during unmounting, the
+	 * old mountpoint will continue to exist until namespace_unlock,
+	 * which happens well after mnt_change_mountpoint.
+	 */
+	spin_lock(&old_mountpoint->d_lock);
+	old_mountpoint->d_lockref.count--;
+	spin_unlock(&old_mountpoint->d_lock);
+
+	mnt_add_count(old_parent, -1);
 }
 
 /*
  * vfsmount lock must be held for write
  */
-static void commit_tree(struct mount *mnt, struct mount *shadows)
+static void commit_tree(struct mount *mnt)
 {
 	struct mount *parent = mnt->mnt_parent;
 	struct mount *m;
@@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 	n->mounts += n->pending_mounts;
 	n->pending_mounts = 0;
 
-	attach_shadowed(mnt, parent, shadows);
+	__attach_mnt(mnt, parent);
 	touch_mnt_namespace(n);
 }
 
@@ -1764,7 +1766,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 			continue;
 
 		for (s = r; s; s = next_mnt(s, r)) {
-			struct mount *t = NULL;
 			if (!(flag & CL_COPY_UNBINDABLE) &&
 			    IS_MNT_UNBINDABLE(s)) {
 				s = skip_mnt_tree(s);
@@ -1786,14 +1787,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 				goto out;
 			lock_mount_hash();
 			list_add_tail(&q->mnt_list, &res->mnt_list);
-			mnt_set_mountpoint(parent, p->mnt_mp, q);
-			if (!list_empty(&parent->mnt_mounts)) {
-				t = list_last_entry(&parent->mnt_mounts,
-					struct mount, mnt_child);
-				if (t->mnt_mp != p->mnt_mp)
-					t = NULL;
-			}
-			attach_shadowed(q, parent, t);
+			attach_mnt(q, parent, p->mnt_mp);
 			unlock_mount_hash();
 		}
 	}
@@ -1992,10 +1986,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 {
 	HLIST_HEAD(tree_list);
 	struct mnt_namespace *ns = dest_mnt->mnt_ns;
+	struct mountpoint *smp;
 	struct mount *child, *p;
 	struct hlist_node *n;
 	int err;
 
+	/* Preallocate a mountpoint in case the new mounts need
+	 * to be tucked under other mounts.
+	 */
+	smp = get_mountpoint(source_mnt->mnt.mnt_root);
+	if (IS_ERR(smp))
+		return PTR_ERR(smp);
+
 	/* Is there space to add these mounts to the mount namespace? */
 	if (!parent_path) {
 		err = count_mounts(ns, source_mnt);
@@ -2022,16 +2024,21 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-		commit_tree(source_mnt, NULL);
+		commit_tree(source_mnt);
 	}
 
 	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
-		struct mount *q;
 		hlist_del_init(&child->mnt_hash);
-		q = __lookup_mnt_last(&child->mnt_parent->mnt,
-				      child->mnt_mountpoint);
-		commit_tree(child, q);
+		if (child->mnt.mnt_root == smp->m_dentry) {
+			struct mount *q;
+			q = __lookup_mnt(&child->mnt_parent->mnt,
+					 child->mnt_mountpoint);
+			if (q)
+				mnt_change_mountpoint(child, smp, q);
+		}
+		commit_tree(child);
 	}
+	put_mountpoint(smp);
 	unlock_mount_hash();
 
 	return 0;
@@ -2046,6 +2053,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	cleanup_group_ids(source_mnt, NULL);
  out:
 	ns->pending_mounts = 0;
+
+	read_seqlock_excl(&mount_lock);
+	put_mountpoint(smp);
+	read_sequnlock_excl(&mount_lock);
+
 	return err;
 }
 
diff --git a/fs/pnode.c b/fs/pnode.c
index 12fafa711114..2cadc58b22ec 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -322,6 +322,22 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 	return ret;
 }
 
+static struct mount *find_topper(struct mount *mnt)
+{
+	/* If there is exactly one mount covering mnt completely return it. */
+	struct mount *child;
+
+	if (!list_is_singular(&mnt->mnt_mounts))
+		return NULL;
+
+	child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
+	if (child->mnt_parent != mnt ||
+	    child->mnt_mountpoint != mnt->mnt.mnt_root)
+		return NULL;
+
+	return child;
+}
+
 /*
  * return true if the refcount is greater than count
  */
@@ -342,7 +358,7 @@ static inline int do_refcount_check(struct mount *mnt, int count)
  */
 int propagate_mount_busy(struct mount *mnt, int refcnt)
 {
-	struct mount *m, *child;
+	struct mount *m, *child, *topper;
 	struct mount *parent = mnt->mnt_parent;
 
 	if (mnt == parent)
@@ -358,11 +374,21 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 
 	for (m = propagation_next(parent, parent); m;
 	     		m = propagation_next(m, parent)) {
-		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+		int count = 1;
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
 		if (!child)
 			continue;
-		if (!list_empty(&child->mnt_mounts) ||
-		    do_refcount_check(child, 1))
+
+		/* Is there exactly one mount on the child that covers
+		 * it completely whose reference should be ignored?
+		 */
+		topper = find_topper(child);
+		if (topper)
+			count += 1;
+		else if (!list_empty(&child->mnt_mounts))
+			return 1;
+
+		if (do_refcount_check(child, count))
 			return 1;
 	}
 	return 0;
@@ -382,7 +408,7 @@ void propagate_mount_unlock(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
 		if (child)
 			child->mnt.mnt_flags &= ~MNT_LOCKED;
 	}
@@ -400,9 +426,11 @@ static void mark_umount_candidates(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt_last(&m->mnt,
+		struct mount *child = __lookup_mnt(&m->mnt,
 						mnt->mnt_mountpoint);
-		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
+			continue;
+		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
 			SET_MNT_MARK(child);
 		}
 	}
@@ -421,8 +449,8 @@ static void __propagate_umount(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-
-		struct mount *child = __lookup_mnt_last(&m->mnt,
+		struct mount *topper;
+		struct mount *child = __lookup_mnt(&m->mnt,
 						mnt->mnt_mountpoint);
 		/*
 		 * umount the child only if the child has no children
@@ -431,6 +459,15 @@ static void __propagate_umount(struct mount *mnt)
 		if (!child || !IS_MNT_MARKED(child))
 			continue;
 		CLEAR_MNT_MARK(child);
+
+		/* If there is exactly one mount covering all of child
+		 * replace child with that mount.
+		 */
+		topper = find_topper(child);
+		if (topper)
+			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
+					      topper);
+
 		if (list_empty(&child->mnt_mounts)) {
 			list_del_init(&child->mnt_child);
 			child->mnt.mnt_flags |= MNT_UMOUNT;
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..dc87e65becd2 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
 unsigned int mnt_get_count(struct mount *mnt);
 void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 			struct mount *);
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
+			   struct mount *mnt);
 struct mount *copy_tree(struct mount *, struct dentry *, int);
 bool is_path_reachable(struct mount *, struct dentry *,
 			 const struct path *root);
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-11 16:03                       ` Eric W. Biederman
  2017-01-11 16:18                         ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Eric W. Biederman
@ 2017-01-12  5:03                         ` Al Viro
  1 sibling, 0 replies; 63+ messages in thread
From: Al Viro @ 2017-01-12  5:03 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai

On Thu, Jan 12, 2017 at 05:03:42AM +1300, Eric W. Biederman wrote:
> Al Viro <viro@ZenIV.linux.org.uk> writes:
> 
> > On Wed, Jan 11, 2017 at 01:10:57PM +1300, Eric W. Biederman wrote:
> >> >> +		if (child->mnt.mnt_root == smp->m_dentry) {
> >> >
> >> > Explain, please.  In which case is that condition _not_ satisfied, and
> >> > what should happen i
> >> 
> >> When a tree is grafted in that condition does not apply to the lower
> >> leaves of the tree.  At the same time nothing needs to be done for those
> >> leaves.  Only the primary mountpoint needs to worry about tucking.
> >
> > 	How in hell would those lower leaves end up on the list in
> > attach_recursive_mnt()?  IDGI...
> 
> The submounts of a mount tree that is being attached need to have
> commit_tree called on them to attach them to a mount namespace.

Huh?  commit_tree() is called once for each copy of the source tree.  This
        list_add_tail(&head, &mnt->mnt_list);
        list_for_each_entry(m, &head, mnt_list)
                m->mnt_ns = n;
is what goes through submounts in each of them, _not_ the loop in the caller.

What we get out of propagate_mnt() is the list of copies of source tree,
one for each of the mountpoints that should get propagation from the
target.
	->mnt_mountpoint/->mnt_parent is fully set for all nodes.
	Everything except the roots of those trees is hashed and
has ->mnt_child set up.
	->mnt_hash of the roots of those copies host the cyclic list,
anchored in tree_list passed to propagate_mnt().
	->mnt_list in each copy forms an unanchored cyclic list
going through all mounts in that copy.

The loop in attach_recursive_mnt() takes the tree_list apart and for each
element (== each copy of source tree) we have commit_tree() called once,
doing the remaining work:
	* splices the ->mnt_list into the namespace's mount list
	* sets ->mnt_ns for all nodes (root and submounts alike)
	* sets ->mnt_child and ->mnt_hash for the root.

Again, the loop in attach_recursive_mnt() is over the set of secondary
copies of the source tree; it goes *only* through their roots.  Submounts
are seen only by commit_tree(), in the list_for_each_entry() loop in
that function.  Hell, try to add else WARN_ON(1); to that if (...) of yours
and see if you can trigger it if you don't believe the above...

> > You have "if mount is not locked - mark it; if mount is already marked -
> > mark it again".  The latter part (|| IS_MNT_MARKED(mnt), that is) looks
> > very odd, won't you agree?  What the hell was that (its counterpart in
> > the earlier code) about?
> 
> Not mark it again.  If the parent is marked mark the child.

*doh*

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts.
  2017-01-11 16:18                         ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Eric W. Biederman
  2017-01-11 16:19                           ` [REVIEW][PATCH 2/2] mnt: Tuck mounts under others instead of creating shadow/side mounts Eric W. Biederman
@ 2017-01-12  5:30                           ` Al Viro
  2017-01-20  7:18                             ` Eric W. Biederman
  2017-01-13 20:32                           ` Andrei Vagin
  2017-01-20 23:18                           ` Ram Pai
  3 siblings, 1 reply; 63+ messages in thread
From: Al Viro @ 2017-01-12  5:30 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai

On Thu, Jan 12, 2017 at 05:18:12AM +1300, Eric W. Biederman wrote:
> 
> When I look at what propagate_mount_busy is trying to do and I look
> at the code closely I discover there is a great disconnect between the
> two.  In the ordinary non-propagation case propagate_mount_busy has
> been verifying that there are no submounts and that there are no
> extraneous references on the mount.
> 
> For mounts that the unmount would propagate to propagate_mount_busy has
> been verifying that there are no extraneous references only if there
> are no submounts.  Which is nonsense.

... because?

> Thefore rework the logic in propgate_mount_busy so that for each
> mount it examines it considers that mount busy if that mount has
> children or if there are extraneous references to that mount.
> 
> While this check was incorrect we could leak mounts instead of simply
> failing umount.

	What do you mean, leak?  We ended up not unmounting them, and they
stayed around until umount of whatever they'd been shadowed by/slipped under
had exposed them and they got explicitly unmounted.

	This is not a leak in a sense of "data structure is unreachable and
will never be freed", unlike the one your previous version had introduced.

Your change might very well be a nicer behaviour - or a DoS in making.
But it really deserves more detailed rationale than that and yes, it
is a user-visible change.  With rather insane userland setups in that
area (*cough* systemd *cough* docker), it's _not_ obviously correct.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 2/2] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-11 16:19                           ` [REVIEW][PATCH 2/2] mnt: Tuck mounts under others instead of creating shadow/side mounts Eric W. Biederman
@ 2017-01-12  5:45                             ` Al Viro
  2017-01-20  7:20                               ` Eric W. Biederman
  2017-01-20  7:26                               ` [PATCH v5] " Eric W. Biederman
  0 siblings, 2 replies; 63+ messages in thread
From: Al Viro @ 2017-01-12  5:45 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai

On Thu, Jan 12, 2017 at 05:19:34AM +1300, Eric W. Biederman wrote:

> +		if (child->mnt.mnt_root == smp->m_dentry) {
> +			struct mount *q;
> +			q = __lookup_mnt(&child->mnt_parent->mnt,
> +					 child->mnt_mountpoint);
> +			if (q)
> +				mnt_change_mountpoint(child, smp, q);
> +		}

This is wrong; condition will be true for *all* mounts seen by that loop.  
Feel free to add else WARN_ON(1); to the line above and try to trigger
it.  You are misinterpreting what propagate_mnt() and commit_tree() are
doing - the loop in commit_tree() goes through the submounts and sets ->mnt_ns
on those.  The rest of the fields is already set up by that point.  For roots
of those copies we need to set ->mnt_hash/->mnt_child as well, but for
all submounts it's already been done by copy_tree().  Again, commit_tree()
is called once per secondary copy of source tree, not once per created
mount.

> +static struct mount *find_topper(struct mount *mnt)
> +{
> +	/* If there is exactly one mount covering mnt completely return it. */
> +	struct mount *child;
> +
> +	if (!list_is_singular(&mnt->mnt_mounts))
> +		return NULL;
> +
> +	child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
> +	if (child->mnt_parent != mnt ||

The first part can't happen.  Turn that into WARN_ON(child->mnt_parent != mnt)
if you wish, but that never occurs unless the data structures are corrupted.

> +	    child->mnt_mountpoint != mnt->mnt.mnt_root)
> +		return NULL;

> @@ -342,7 +358,7 @@ static inline int do_refcount_check(struct mount *mnt, int count)
>   */
>  int propagate_mount_busy(struct mount *mnt, int refcnt)
>  {
> -	struct mount *m, *child;
> +	struct mount *m, *child, *topper;
>  	struct mount *parent = mnt->mnt_parent;
>  
>  	if (mnt == parent)
> @@ -358,11 +374,21 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>  
>  	for (m = propagation_next(parent, parent); m;
>  	     		m = propagation_next(m, parent)) {
> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> +		int count = 1;
> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>  		if (!child)
>  			continue;
> -		if (!list_empty(&child->mnt_mounts) ||
> -		    do_refcount_check(child, 1))
> +
> +		/* Is there exactly one mount on the child that covers
> +		 * it completely whose reference should be ignored?
> +		 */
> +		topper = find_topper(child);
> +		if (topper)
> +			count += 1;
> +		else if (!list_empty(&child->mnt_mounts))
> +			return 1;
> +
> +		if (do_refcount_check(child, count))
>  			return 1;

Again, subject to the comments re semantics change (see the reply to previous
patch).

> @@ -431,6 +459,15 @@ static void __propagate_umount(struct mount *mnt)
>  		if (!child || !IS_MNT_MARKED(child))
>  			continue;
>  		CLEAR_MNT_MARK(child);
> +
> +		/* If there is exactly one mount covering all of child
> +		 * replace child with that mount.
> +		 */
> +		topper = find_topper(child);
> +		if (topper)
> +			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
> +					      topper);
> +
>  		if (list_empty(&child->mnt_mounts)) {
>  			list_del_init(&child->mnt_child);
>  			child->mnt.mnt_flags |= MNT_UMOUNT;

Umm...  With fallthrough from "is completely overmounted" case?  And
I'm not sure I understand what that list_empty() is doing there after
your previous semantics change - how _can_ we reach that point with
non-empty ->mnt_mounts now?

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts.
  2017-01-11 16:18                         ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Eric W. Biederman
  2017-01-11 16:19                           ` [REVIEW][PATCH 2/2] mnt: Tuck mounts under others instead of creating shadow/side mounts Eric W. Biederman
  2017-01-12  5:30                           ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Al Viro
@ 2017-01-13 20:32                           ` Andrei Vagin
  2017-01-18 19:20                             ` Andrei Vagin
  2017-01-20 23:18                           ` Ram Pai
  3 siblings, 1 reply; 63+ messages in thread
From: Andrei Vagin @ 2017-01-13 20:32 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Ram Pai

Hi Eric,

Something wrong is in this patch. Pls, take a look at this script:

[root@fc24 ~]# unshare -m bash -x xxx.sh 
+ set -x -e -m
+ mount --make-rprivate /
+ mount --make-shared /
+ mount -t tmpfs xxx /mnt
+ mount --make-private /mnt
+ mkdir /mnt/yyy
+ mount -t tmpfs xxx /mnt/yyy
+ sleep 1
+ unshare --propagation unchanged -m sleep 1000
+ pid=452
+ umount /mnt/yyy
+ umount /mnt/
umount: /mnt/: target is busy
        (In some cases useful info about processes that
         use the device is found by lsof(8) or fuser(1).)
+ echo FAIL
FAIL
+ kill 452
+ wait
xxx.sh: line 15:   452 Terminated              unshare --propagation
unchanged -m sleep 1000


[root@fc24 ~]# cat xxx.sh 
set -x -e -m

mount --make-rprivate /
mount --make-shared /
mount -t tmpfs xxx /mnt
mount --make-private /mnt
mkdir /mnt/yyy
mount -t tmpfs xxx /mnt/yyy
unshare --propagation unchanged -m sleep 1000 &
sleep 1
pid=$!
umount /mnt/yyy
umount /mnt/ || echo FAIL
kill $pid
wait


On Thu, Jan 12, 2017 at 05:18:12AM +1300, Eric W. Biederman wrote:
> 
> When I look at what propagate_mount_busy is trying to do and I look
> at the code closely I discover there is a great disconnect between the
> two.  In the ordinary non-propagation case propagate_mount_busy has
> been verifying that there are no submounts and that there are no
> extraneous references on the mount.
> 
> For mounts that the unmount would propagate to propagate_mount_busy has
> been verifying that there are no extraneous references only if there
> are no submounts.  Which is nonsense.
> 
> Thefore rework the logic in propgate_mount_busy so that for each
> mount it examines it considers that mount busy if that mount has
> children or if there are extraneous references to that mount.
> 
> While this check was incorrect we could leak mounts instead of simply
> failing umount.
> 
> Cc: stable@vger.kernel.org
> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
> 
> If you don't figure this fix is worth it after all of this time please
> let me know.  This feels like the proper thing to do, and I don't expect
> it will break anyone to fix this.
> 
>  fs/pnode.c | 11 ++++++-----
>  1 file changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 06a793f4ae38..12fafa711114 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -344,7 +344,6 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>  {
>  	struct mount *m, *child;
>  	struct mount *parent = mnt->mnt_parent;
> -	int ret = 0;
>  
>  	if (mnt == parent)
>  		return do_refcount_check(mnt, refcnt);
> @@ -360,11 +359,13 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>  	for (m = propagation_next(parent, parent); m;
>  	     		m = propagation_next(m, parent)) {
>  		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> -		if (child && list_empty(&child->mnt_mounts) &&
> -		    (ret = do_refcount_check(child, 1)))
> -			break;
> +		if (!child)
> +			continue;
> +		if (!list_empty(&child->mnt_mounts) ||
> +		    do_refcount_check(child, 1))
> +			return 1;
>  	}
> -	return ret;
> +	return 0;
>  }
>  
>  /*
> -- 
> 2.10.1
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts.
  2017-01-13 20:32                           ` Andrei Vagin
@ 2017-01-18 19:20                             ` Andrei Vagin
  0 siblings, 0 replies; 63+ messages in thread
From: Andrei Vagin @ 2017-01-18 19:20 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Ram Pai

On Fri, Jan 13, 2017 at 12:31:59PM -0800, Andrei Vagin wrote:
> Hi Eric,
> 
> Something wrong is in this patch. Pls, take a look at this script:

Actually, it works as expected. The reason for this error is what was
fixed in this patch. I'm sorry for the noise.

Tested-by: Andrei Vagin <avagin@virtuozzo.com>

> 
> [root@fc24 ~]# unshare -m bash -x xxx.sh 
> + set -x -e -m
> + mount --make-rprivate /
> + mount --make-shared /
> + mount -t tmpfs xxx /mnt
> + mount --make-private /mnt
> + mkdir /mnt/yyy
> + mount -t tmpfs xxx /mnt/yyy
> + sleep 1
> + unshare --propagation unchanged -m sleep 1000
> + pid=452
> + umount /mnt/yyy
> + umount /mnt/
> umount: /mnt/: target is busy
>         (In some cases useful info about processes that
>          use the device is found by lsof(8) or fuser(1).)
> + echo FAIL
> FAIL
> + kill 452
> + wait
> xxx.sh: line 15:   452 Terminated              unshare --propagation
> unchanged -m sleep 1000
> 
> 
> [root@fc24 ~]# cat xxx.sh 
> set -x -e -m
> 
> mount --make-rprivate /
> mount --make-shared /
> mount -t tmpfs xxx /mnt
> mount --make-private /mnt
> mkdir /mnt/yyy
> mount -t tmpfs xxx /mnt/yyy
> unshare --propagation unchanged -m sleep 1000 &
> sleep 1
> pid=$!
> umount /mnt/yyy
> umount /mnt/ || echo FAIL
> kill $pid
> wait
> 
> 
> On Thu, Jan 12, 2017 at 05:18:12AM +1300, Eric W. Biederman wrote:
> > 
> > When I look at what propagate_mount_busy is trying to do and I look
> > at the code closely I discover there is a great disconnect between the
> > two.  In the ordinary non-propagation case propagate_mount_busy has
> > been verifying that there are no submounts and that there are no
> > extraneous references on the mount.
> > 
> > For mounts that the unmount would propagate to propagate_mount_busy has
> > been verifying that there are no extraneous references only if there
> > are no submounts.  Which is nonsense.
> > 
> > Thefore rework the logic in propgate_mount_busy so that for each
> > mount it examines it considers that mount busy if that mount has
> > children or if there are extraneous references to that mount.
> > 
> > While this check was incorrect we could leak mounts instead of simply
> > failing umount.
> > 
> > Cc: stable@vger.kernel.org
> > Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> > Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> > ---
> > 
> > If you don't figure this fix is worth it after all of this time please
> > let me know.  This feels like the proper thing to do, and I don't expect
> > it will break anyone to fix this.
> > 
> >  fs/pnode.c | 11 ++++++-----
> >  1 file changed, 6 insertions(+), 5 deletions(-)
> > 
> > diff --git a/fs/pnode.c b/fs/pnode.c
> > index 06a793f4ae38..12fafa711114 100644
> > --- a/fs/pnode.c
> > +++ b/fs/pnode.c
> > @@ -344,7 +344,6 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> >  {
> >  	struct mount *m, *child;
> >  	struct mount *parent = mnt->mnt_parent;
> > -	int ret = 0;
> >  
> >  	if (mnt == parent)
> >  		return do_refcount_check(mnt, refcnt);
> > @@ -360,11 +359,13 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> >  	for (m = propagation_next(parent, parent); m;
> >  	     		m = propagation_next(m, parent)) {
> >  		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> > -		if (child && list_empty(&child->mnt_mounts) &&
> > -		    (ret = do_refcount_check(child, 1)))
> > -			break;
> > +		if (!child)
> > +			continue;
> > +		if (!list_empty(&child->mnt_mounts) ||
> > +		    do_refcount_check(child, 1))
> > +			return 1;
> >  	}
> > -	return ret;
> > +	return 0;
> >  }
> >  
> >  /*
> > -- 
> > 2.10.1
> > 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts.
  2017-01-12  5:30                           ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Al Viro
@ 2017-01-20  7:18                             ` Eric W. Biederman
  0 siblings, 0 replies; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-20  7:18 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai

Al Viro <viro@ZenIV.linux.org.uk> writes:

> On Thu, Jan 12, 2017 at 05:18:12AM +1300, Eric W. Biederman wrote:
>> 
>> When I look at what propagate_mount_busy is trying to do and I look
>> at the code closely I discover there is a great disconnect between the
>> two.  In the ordinary non-propagation case propagate_mount_busy has
>> been verifying that there are no submounts and that there are no
>> extraneous references on the mount.
>> 
>> For mounts that the unmount would propagate to propagate_mount_busy has
>> been verifying that there are no extraneous references only if there
>> are no submounts.  Which is nonsense.
>
> ... because?
>
>> Thefore rework the logic in propgate_mount_busy so that for each
>> mount it examines it considers that mount busy if that mount has
>> children or if there are extraneous references to that mount.
>> 
>> While this check was incorrect we could leak mounts instead of simply
>> failing umount.
>
> 	What do you mean, leak?  We ended up not unmounting them, and they
> stayed around until umount of whatever they'd been shadowed by/slipped under
> had exposed them and they got explicitly unmounted.

Leak in the sense of userspace expecting everything to be cleaned up and
it was not.  My concerns exist in the presence of a slave mount with
something mounted on it.  Nothing exotic needs to exist.

> 	This is not a leak in a sense of "data structure is unreachable and
> will never be freed", unlike the one your previous version had introduced.
>
> Your change might very well be a nicer behaviour - or a DoS in making.
> But it really deserves more detailed rationale than that and yes, it
> is a user-visible change.  With rather insane userland setups in that
> area (*cough* systemd *cough* docker), it's _not_ obviously correct.

I wrote this patch primarily because I looked at what the code was doing
and saw semantics that make no obvious sense to me given my experience
with how unmount ordinarily works, I needed to point that out so
we can have this conversation independently.  Having just looked and
doubled checked I can say this is how umount is documented to behave in
Documentation/filesystems/shared-subtrees.

My experience with umount in other contexts is either umount succeeds,
umount fails because something is making the mount busy, or a lazy
umount is requested and users of the mount are ignored.

The way umount propagation works adds another case into my understanding
of umount behavior.  Namely that the umount will be propagated to some
places but not to other places depending on the presence of submounts.
For me at least that violates the principle of least surprise.  I do not
understand if we are going to give up in some cases if the mount is busy
but not in other cases why we even bother looking at propagated mounts.

Given this behavior has existed for a decade and we have some very
creative pieces of userspace code I completely agree that my case for
making this change is insufficiently strong.  To actually make this
change would require extensive testing to verify I don't introduce any
regressions in userspace applications.

This patch was my way of pointing out the very strange (to my eyes)
behaviour of umount propagation ignoring some cases of busy mounts, and
asking if that was what you were concerned about in
propagate_mount_busy.

As my case is insufficient for this change.  And my concerns about what
you were concerned about with propagate_mount_busy have been addressed I
am going to drop this patch.

Eric

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 2/2] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-12  5:45                             ` Al Viro
@ 2017-01-20  7:20                               ` Eric W. Biederman
  2017-01-20  7:26                               ` [PATCH v5] " Eric W. Biederman
  1 sibling, 0 replies; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-20  7:20 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai

Al Viro <viro@ZenIV.linux.org.uk> writes:

> On Thu, Jan 12, 2017 at 05:19:34AM +1300, Eric W. Biederman wrote:
>
>> +		if (child->mnt.mnt_root == smp->m_dentry) {
>> +			struct mount *q;
>> +			q = __lookup_mnt(&child->mnt_parent->mnt,
>> +					 child->mnt_mountpoint);
>> +			if (q)
>> +				mnt_change_mountpoint(child, smp, q);
>> +		}
>
> This is wrong; condition will be true for *all* mounts seen by that loop.  
> Feel free to add else WARN_ON(1); to the line above and try to trigger
> it.  You are misinterpreting what propagate_mnt() and commit_tree() are
> doing - the loop in commit_tree() goes through the submounts and sets ->mnt_ns
> on those.  The of the fields is already set up by that point.  For roots
> of those copies we need to set ->mnt_hash/->mnt_child as well, but for
> all submounts it's already been done by copy_tree().  Again, commit_tree()
> is called once per secondary copy of source tree, not once per created
> mount.

*doh*

>> +static struct mount *find_topper(struct mount *mnt)
>> +{
>> +	/* If there is exactly one mount covering mnt completely return it. */
>> +	struct mount *child;
>> +
>> +	if (!list_is_singular(&mnt->mnt_mounts))
>> +		return NULL;
>> +
>> +	child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
>> +	if (child->mnt_parent != mnt ||
>
> The first part can't happen.  Turn that into WARN_ON(child->mnt_parent != mnt)
> if you wish, but that never occurs unless the data structures are
> corrupted.

Agreed.

>> +	    child->mnt_mountpoint != mnt->mnt.mnt_root)
>> +		return NULL;
>> @@ -431,6 +459,15 @@ static void __propagate_umount(struct mount *mnt)
>>  		if (!child || !IS_MNT_MARKED(child))
>>  			continue;
>>  		CLEAR_MNT_MARK(child);
>> +
>> +		/* If there is exactly one mount covering all of child
>> +		 * replace child with that mount.
>> +		 */
>> +		topper = find_topper(child);
>> +		if (topper)
>> +			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
>> +					      topper);
>> +
>>  		if (list_empty(&child->mnt_mounts)) {
>>  			list_del_init(&child->mnt_child);
>>  			child->mnt.mnt_flags |= MNT_UMOUNT;
>
> Umm...  With fallthrough from "is completely overmounted" case?  And
> I'm not sure I understand what that list_empty() is doing there after
> your previous semantics change - how _can_ we reach that point with
> non-empty ->mnt_mounts now?

With the semantic change to propagate_mnt_busy, to reach the list_empty
with a non-empty mnt_mounts requires a umount(MNT_DETACH) as that
skips the propagate_mnt_busy call.

Without the semantic change it is even easier to get there.

A respin of this patch without the semantic change in a moment.

Eric

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-12  5:45                             ` Al Viro
  2017-01-20  7:20                               ` Eric W. Biederman
@ 2017-01-20  7:26                               ` Eric W. Biederman
  2017-01-21  3:58                                 ` Ram Pai
  1 sibling, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-20  7:26 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, Andrei Vagin, Ram Pai


Ever since mount propagation was introduced, in cases where a mount is
propagated to a parent mount and mountpoint pair that is already in use the
code has placed the new mount behind the old mount in the mount hash
table.

This implementation detail is problematic as it allows creating
arbitrary length mount hash chains.

Furthermore it invalidates the constraint maintained elsewhere in the
mount code that a parent mount and a mountpoint pair will have exactly
one mount upon them.  Making it hard to deal with and to talk about
this special case in the mount code.

Modify mount propagation to notice when there is already a mount at
the parent mount and mountpoint where a new mount is propagating to
and place that preexisting mount on top of the new mount.

Modify unmount propagation to notice when a mount that is being
unmounted has another mount on top of it (and no other children), and
to replace the unmounted mount with the mount on top of it.

Move the MNT_UMOUNT test from __lookup_mnt_last into
__propagate_umount as that is the only call of __lookup_mnt_last where
MNT_UMOUNT may be set on any mount visible in the mount hash table.

These modifications allow:
 - __lookup_mnt_last to be removed.
 - attach_shadows to be renamed __attach_mnt and its shadow
   handling to be removed.
 - commit_tree to be simplified
 - copy_tree to be simplified

The result is an easier to understand tree of mounts that does not
allow creation of arbitrary length hash chains in the mount hash table.

v2: Updated mnt_change_mountpoint to not call dput or mntput
and instead to decrement the counts directly.  It is guaranteed
that there will be other references when mnt_change_mountpoint is
called so this is safe.

v3: Moved put_mountpoint under mount_lock in attach_recursive_mnt
    As the locking in fs/namespace.c changed between v2 and v3.

v4: Reworked the logic in propagate_mount_busy and __propagate_umount
    that detects when a mount completely covers another mount.

v5: Removed unnecessary tests whose result is always true in
    find_topper and attach_recursive_mnt.

Cc: stable@vger.kernel.org
Fixes: b90fa9ae8f51 ("[PATCH] shared mount handling: bind and rbind")
Tested-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/mount.h     |   1 -
 fs/namespace.c | 110 +++++++++++++++++++++++++++++++--------------------------
 fs/pnode.c     |  61 +++++++++++++++++++++++++-------
 fs/pnode.h     |   2 ++
 4 files changed, 111 insertions(+), 63 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 2c856fc47ae3..2826543a131d 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
 }
 
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
-extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
 extern int __legitimize_mnt(struct vfsmount *, unsigned);
 extern bool legitimize_mnt(struct vfsmount *, unsigned);
diff --git a/fs/namespace.c b/fs/namespace.c
index 487ba30bb5c6..8a3b6c1b16ff 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 }
 
 /*
- * find the last mount at @dentry on vfsmount @mnt.
- * mount_lock must be held.
- */
-struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
-{
-	struct mount *p, *res = NULL;
-	p = __lookup_mnt(mnt, dentry);
-	if (!p)
-		goto out;
-	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-		res = p;
-	hlist_for_each_entry_continue(p, mnt_hash) {
-		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
-			break;
-		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-			res = p;
-	}
-out:
-	return res;
-}
-
-/*
  * lookup_mnt - Return the first child mount mounted at path
  *
  * "First" means first mounted chronologically.  If you create the
@@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
 	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
 }
 
+static void __attach_mnt(struct mount *mnt, struct mount *parent)
+{
+	hlist_add_head_rcu(&mnt->mnt_hash,
+			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
+	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
 			struct mountpoint *mp)
 {
 	mnt_set_mountpoint(parent, mp, mnt);
-	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
-	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	__attach_mnt(mnt, parent);
 }
 
-static void attach_shadowed(struct mount *mnt,
-			struct mount *parent,
-			struct mount *shadows)
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
 {
-	if (shadows) {
-		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
-		list_add(&mnt->mnt_child, &shadows->mnt_child);
-	} else {
-		hlist_add_head_rcu(&mnt->mnt_hash,
-				m_hash(&parent->mnt, mnt->mnt_mountpoint));
-		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
-	}
+	struct mountpoint *old_mp = mnt->mnt_mp;
+	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
+	struct mount *old_parent = mnt->mnt_parent;
+
+	list_del_init(&mnt->mnt_child);
+	hlist_del_init(&mnt->mnt_mp_list);
+	hlist_del_init_rcu(&mnt->mnt_hash);
+
+	attach_mnt(mnt, parent, mp);
+
+	put_mountpoint(old_mp);
+
+	/*
+	 * Safely avoid even the suggestion this code might sleep or
+	 * lock the mount hash by taking advantage of the knowledge that
+	 * mnt_change_mountpoint will not release the final reference
+	 * to a mountpoint.
+	 *
+	 * During mounting, the mount passed in as the parent mount will
+	 * continue to use the old mountpoint and during unmounting, the
+	 * old mountpoint will continue to exist until namespace_unlock,
+	 * which happens well after mnt_change_mountpoint.
+	 */
+	spin_lock(&old_mountpoint->d_lock);
+	old_mountpoint->d_lockref.count--;
+	spin_unlock(&old_mountpoint->d_lock);
+
+	mnt_add_count(old_parent, -1);
 }
 
 /*
  * vfsmount lock must be held for write
  */
-static void commit_tree(struct mount *mnt, struct mount *shadows)
+static void commit_tree(struct mount *mnt)
 {
 	struct mount *parent = mnt->mnt_parent;
 	struct mount *m;
@@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 	n->mounts += n->pending_mounts;
 	n->pending_mounts = 0;
 
-	attach_shadowed(mnt, parent, shadows);
+	__attach_mnt(mnt, parent);
 	touch_mnt_namespace(n);
 }
 
@@ -1764,7 +1766,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 			continue;
 
 		for (s = r; s; s = next_mnt(s, r)) {
-			struct mount *t = NULL;
 			if (!(flag & CL_COPY_UNBINDABLE) &&
 			    IS_MNT_UNBINDABLE(s)) {
 				s = skip_mnt_tree(s);
@@ -1786,14 +1787,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 				goto out;
 			lock_mount_hash();
 			list_add_tail(&q->mnt_list, &res->mnt_list);
-			mnt_set_mountpoint(parent, p->mnt_mp, q);
-			if (!list_empty(&parent->mnt_mounts)) {
-				t = list_last_entry(&parent->mnt_mounts,
-					struct mount, mnt_child);
-				if (t->mnt_mp != p->mnt_mp)
-					t = NULL;
-			}
-			attach_shadowed(q, parent, t);
+			attach_mnt(q, parent, p->mnt_mp);
 			unlock_mount_hash();
 		}
 	}
@@ -1992,10 +1986,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 {
 	HLIST_HEAD(tree_list);
 	struct mnt_namespace *ns = dest_mnt->mnt_ns;
+	struct mountpoint *smp;
 	struct mount *child, *p;
 	struct hlist_node *n;
 	int err;
 
+	/* Preallocate a mountpoint in case the new mounts need
+	 * to be tucked under other mounts.
+	 */
+	smp = get_mountpoint(source_mnt->mnt.mnt_root);
+	if (IS_ERR(smp))
+		return PTR_ERR(smp);
+
 	/* Is there space to add these mounts to the mount namespace? */
 	if (!parent_path) {
 		err = count_mounts(ns, source_mnt);
@@ -2022,16 +2024,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-		commit_tree(source_mnt, NULL);
+		commit_tree(source_mnt);
 	}
 
 	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
 		struct mount *q;
 		hlist_del_init(&child->mnt_hash);
-		q = __lookup_mnt_last(&child->mnt_parent->mnt,
-				      child->mnt_mountpoint);
-		commit_tree(child, q);
+		q = __lookup_mnt(&child->mnt_parent->mnt,
+				 child->mnt_mountpoint);
+		if (q)
+			mnt_change_mountpoint(child, smp, q);
+		commit_tree(child);
 	}
+	put_mountpoint(smp);
 	unlock_mount_hash();
 
 	return 0;
@@ -2046,6 +2051,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	cleanup_group_ids(source_mnt, NULL);
  out:
 	ns->pending_mounts = 0;
+
+	read_seqlock_excl(&mount_lock);
+	put_mountpoint(smp);
+	read_sequnlock_excl(&mount_lock);
+
 	return err;
 }
 
diff --git a/fs/pnode.c b/fs/pnode.c
index 06a793f4ae38..5bc7896d122a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -322,6 +322,21 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 	return ret;
 }
 
+static struct mount *find_topper(struct mount *mnt)
+{
+	/* If there is exactly one mount covering mnt completely return it. */
+	struct mount *child;
+
+	if (!list_is_singular(&mnt->mnt_mounts))
+		return NULL;
+
+	child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
+	if (child->mnt_mountpoint != mnt->mnt.mnt_root)
+		return NULL;
+
+	return child;
+}
+
 /*
  * return true if the refcount is greater than count
  */
@@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count)
  */
 int propagate_mount_busy(struct mount *mnt, int refcnt)
 {
-	struct mount *m, *child;
+	struct mount *m, *child, *topper;
 	struct mount *parent = mnt->mnt_parent;
-	int ret = 0;
 
 	if (mnt == parent)
 		return do_refcount_check(mnt, refcnt);
@@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 
 	for (m = propagation_next(parent, parent); m;
 	     		m = propagation_next(m, parent)) {
-		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
-		if (child && list_empty(&child->mnt_mounts) &&
-		    (ret = do_refcount_check(child, 1)))
-			break;
+		int count = 1;
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
+		if (!child)
+			continue;
+
+		/* Is there exactly one mount on the child that covers
+		 * it completely whose reference should be ignored?
+		 */
+		topper = find_topper(child);
+		if (topper)
+			count += 1;
+		else if (!list_empty(&child->mnt_mounts))
+			continue;
+
+		if (do_refcount_check(child, count))
+			return 1;
 	}
-	return ret;
+	return 0;
 }
 
 /*
@@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
 		if (child)
 			child->mnt.mnt_flags &= ~MNT_LOCKED;
 	}
@@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt_last(&m->mnt,
+		struct mount *child = __lookup_mnt(&m->mnt,
 						mnt->mnt_mountpoint);
-		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
+			continue;
+		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
 			SET_MNT_MARK(child);
 		}
 	}
@@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-
-		struct mount *child = __lookup_mnt_last(&m->mnt,
+		struct mount *topper;
+		struct mount *child = __lookup_mnt(&m->mnt,
 						mnt->mnt_mountpoint);
 		/*
 		 * umount the child only if the child has no children
@@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt)
 		if (!child || !IS_MNT_MARKED(child))
 			continue;
 		CLEAR_MNT_MARK(child);
+
+		/* If there is exactly one mount covering all of child
+		 * replace child with that mount.
+		 */
+		topper = find_topper(child);
+		if (topper)
+			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
+					      topper);
+
 		if (list_empty(&child->mnt_mounts)) {
 			list_del_init(&child->mnt_child);
 			child->mnt.mnt_flags |= MNT_UMOUNT;
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..dc87e65becd2 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
 unsigned int mnt_get_count(struct mount *mnt);
 void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 			struct mount *);
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
+			   struct mount *mnt);
 struct mount *copy_tree(struct mount *, struct dentry *, int);
 bool is_path_reachable(struct mount *, struct dentry *,
 			 const struct path *root);
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts.
  2017-01-11 16:18                         ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Eric W. Biederman
                                             ` (2 preceding siblings ...)
  2017-01-13 20:32                           ` Andrei Vagin
@ 2017-01-20 23:18                           ` Ram Pai
  2017-01-23  8:15                             ` Eric W. Biederman
  3 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-01-20 23:18 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

On Thu, Jan 12, 2017 at 05:18:12AM +1300, Eric W. Biederman wrote:
> 
> When I look at what propagate_mount_busy is trying to do and I look
> at the code closely I discover there is a great disconnect between the
> two.  In the ordinary non-propagation case propagate_mount_busy has
> been verifying that there are no submounts and that there are no
> extraneous references on the mount.
> 
> For mounts that the unmount would propagate to propagate_mount_busy has
> been verifying that there are no extraneous references only if there
> are no submounts.  Which is nonsense.


the reason why we had to do it that way was because there were
situations where it was impossible to umount anything...

take for example.

(1) mount --make-shared A

(2) mount --bind A  A/a    

The tree looks like this

 	A
	|
        B

(3) mount --bind A  B/a    
The tree looks like this
 	A
	|
 	B B'   (B' becomes a shadow mount)
	|
        C


(4) mount --make-slave A
	At this point B and C are peers and A is a slave.

(5) umount B' 
	NOTE: This used to be possible a decade ago if the process doing
	the umount had access to its dentry.
    The tree looks like this
 	A
	|
 	B
	|
        C

Now if you try to unmount C,  it becomes impossible, reason being...

B is the parent of C.
So the umount propagates to A.  But A has B mounted at the same
location.  But B is busy since it has got a child C.
So the entire umount has to fail.  There is no way to umount it all.
Kind of stuck for ever.  That is the reason; in those days a decade ago,
we relaxed the rule to let go propagated mounts that had children.

The above example is a simplest case that demonstrates the phenomenon.

Given that, the current code does not allow any process to reach shadow
mount B' and given that we are getting rid of shadow mounts, I think we
should allow the code changes you propose in this patch.

RP

	
> 
> Therefore rework the logic in propagate_mount_busy so that for each
> mount it examines it considers that mount busy if that mount has
> children or if there are extraneous references to that mount.
> 
> While this check was incorrect we could leak mounts instead of simply
> failing umount.
> 
> Cc: stable@vger.kernel.org
> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
> 
> If you don't figure this fix is worth it after all of this time please
> let me know.  This feels like the proper thing to do, and I don't expect
> it will break anyone to fix this.
> 
>  fs/pnode.c | 11 ++++++-----
>  1 file changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 06a793f4ae38..12fafa711114 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -344,7 +344,6 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>  {
>  	struct mount *m, *child;
>  	struct mount *parent = mnt->mnt_parent;
> -	int ret = 0;
> 
>  	if (mnt == parent)
>  		return do_refcount_check(mnt, refcnt);
> @@ -360,11 +359,13 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>  	for (m = propagation_next(parent, parent); m;
>  	     		m = propagation_next(m, parent)) {
>  		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> -		if (child && list_empty(&child->mnt_mounts) &&
> -		    (ret = do_refcount_check(child, 1)))
> -			break;
> +		if (!child)
> +			continue;
> +		if (!list_empty(&child->mnt_mounts) ||
> +		    do_refcount_check(child, 1))
> +			return 1;
>  	}
> -	return ret;
> +	return 0;
>  }
> 
>  /*
> -- 
> 2.10.1

-- 
Ram Pai


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-20  7:26                               ` [PATCH v5] " Eric W. Biederman
@ 2017-01-21  3:58                                 ` Ram Pai
  2017-01-21  4:15                                   ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-01-21  3:58 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

On Fri, Jan 20, 2017 at 08:26:33PM +1300, Eric W. Biederman wrote:
> 
> Ever since mount propagation was introduced, in cases where a mount is
> propagated to a parent mount and mountpoint pair that is already in use the
> code has placed the new mount behind the old mount in the mount hash
> table.
> 
> This implementation detail is problematic as it allows creating
> arbitrary length mount hash chains.
> 
> Furthermore it invalidates the constraint maintained elsewhere in the
> mount code that a parent mount and a mountpoint pair will have exactly
> one mount upon them.  Making it hard to deal with and to talk about
> this special case in the mount code.
> 
> Modify mount propagation to notice when there is already a mount at
> the parent mount and mountpoint where a new mount is propagating to
> and place that preexisting mount on top of the new mount.
> 
> Modify unmount propagation to notice when a mount that is being
> unmounted has another mount on top of it (and no other children), and
> to replace the unmounted mount with the mount on top of it.
> 
> Move the MNT_UMOUNT test from __lookup_mnt_last into
> __propagate_umount as that is the only call of __lookup_mnt_last where
> MNT_UMOUNT may be set on any mount visible in the mount hash table.
> 
> These modifications allow:
>  - __lookup_mnt_last to be removed.
>  - attach_shadows to be renamed __attach_mnt and its shadow
>    handling to be removed.
>  - commit_tree to be simplified
>  - copy_tree to be simplified
> 
> The result is an easier to understand tree of mounts that does not
> allow creation of arbitrary length hash chains in the mount hash table.
> 
> v2: Updated to mnt_change_mountpoint to not call dput or mntput
> and instead to decrement the counts directly.  It is guaranteed
> that there will be other references when mnt_change_mountpoint is
> called so this is safe.
> 
> v3: Moved put_mountpoint under mount_lock in attach_recursive_mnt
>     As the locking in fs/namespace.c changed between v2 and v3.
> 
> v4: Reworked the logic in propagate_mount_busy and __propagate_umount
>     that detects when a mount completely covers another mount.
> 
> v5: Removed unnecessary tests whose result is always true in
>     find_topper and attach_recursive_mnt.
> 
> Cc: stable@vger.kernel.org
> Fixes: b90fa9ae8f51 ("[PATCH] shared mount handling: bind and rbind")
> Tested-by: Andrei Vagin <avagin@virtuozzo.com>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  fs/mount.h     |   1 -
>  fs/namespace.c | 110 +++++++++++++++++++++++++++++++--------------------------
>  fs/pnode.c     |  61 +++++++++++++++++++++++++-------
>  fs/pnode.h     |   2 ++
>  4 files changed, 111 insertions(+), 63 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index 2c856fc47ae3..2826543a131d 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
>  }
> 
>  extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
> -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
> 
>  extern int __legitimize_mnt(struct vfsmount *, unsigned);
>  extern bool legitimize_mnt(struct vfsmount *, unsigned);
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 487ba30bb5c6..8a3b6c1b16ff 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
>  }
> 
>  /*
> - * find the last mount at @dentry on vfsmount @mnt.
> - * mount_lock must be held.
> - */
> -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
> -{
> -	struct mount *p, *res = NULL;
> -	p = __lookup_mnt(mnt, dentry);
> -	if (!p)
> -		goto out;
> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> -		res = p;
> -	hlist_for_each_entry_continue(p, mnt_hash) {
> -		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
> -			break;
> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> -			res = p;
> -	}
> -out:
> -	return res;
> -}
> -
> -/*
>   * lookup_mnt - Return the first child mount mounted at path
>   *
>   * "First" means first mounted chronologically.  If you create the
> @@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
>  	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
>  }
> 
> +static void __attach_mnt(struct mount *mnt, struct mount *parent)
> +{
> +	hlist_add_head_rcu(&mnt->mnt_hash,
> +			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
> +	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> +}
> +
>  /*
>   * vfsmount lock must be held for write
>   */
> @@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
>  			struct mountpoint *mp)
>  {
>  	mnt_set_mountpoint(parent, mp, mnt);
> -	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
> -	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> +	__attach_mnt(mnt, parent);
>  }
> 
> -static void attach_shadowed(struct mount *mnt,
> -			struct mount *parent,
> -			struct mount *shadows)
> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
>  {
> -	if (shadows) {
> -		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
> -		list_add(&mnt->mnt_child, &shadows->mnt_child);
> -	} else {
> -		hlist_add_head_rcu(&mnt->mnt_hash,
> -				m_hash(&parent->mnt, mnt->mnt_mountpoint));
> -		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> -	}
> +	struct mountpoint *old_mp = mnt->mnt_mp;
> +	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
> +	struct mount *old_parent = mnt->mnt_parent;
> +
> +	list_del_init(&mnt->mnt_child);
> +	hlist_del_init(&mnt->mnt_mp_list);
> +	hlist_del_init_rcu(&mnt->mnt_hash);
> +
> +	attach_mnt(mnt, parent, mp);
> +
> +	put_mountpoint(old_mp);
> +
> +	/*
> +	 * Safely avoid even the suggestion this code might sleep or
> +	 * lock the mount hash by taking advantage of the knowledge that
> +	 * mnt_change_mountpoint will not release the final reference
> +	 * to a mountpoint.
> +	 *
> +	 * During mounting, the mount passed in as the parent mount will
> +	 * continue to use the old mountpoint and during unmounting, the
> +	 * old mountpoint will continue to exist until namespace_unlock,
> +	 * which happens well after mnt_change_mountpoint.
> +	 */
> +	spin_lock(&old_mountpoint->d_lock);
> +	old_mountpoint->d_lockref.count--;
> +	spin_unlock(&old_mountpoint->d_lock);
> +
> +	mnt_add_count(old_parent, -1);
>  }
> 
>  /*
>   * vfsmount lock must be held for write
>   */
> -static void commit_tree(struct mount *mnt, struct mount *shadows)
> +static void commit_tree(struct mount *mnt)
>  {
>  	struct mount *parent = mnt->mnt_parent;
>  	struct mount *m;
> @@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
>  	n->mounts += n->pending_mounts;
>  	n->pending_mounts = 0;
> 
> -	attach_shadowed(mnt, parent, shadows);
> +	__attach_mnt(mnt, parent);
>  	touch_mnt_namespace(n);
>  }
> 
> @@ -1764,7 +1766,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
>  			continue;
> 
>  		for (s = r; s; s = next_mnt(s, r)) {
> -			struct mount *t = NULL;
>  			if (!(flag & CL_COPY_UNBINDABLE) &&
>  			    IS_MNT_UNBINDABLE(s)) {
>  				s = skip_mnt_tree(s);
> @@ -1786,14 +1787,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
>  				goto out;
>  			lock_mount_hash();
>  			list_add_tail(&q->mnt_list, &res->mnt_list);
> -			mnt_set_mountpoint(parent, p->mnt_mp, q);
> -			if (!list_empty(&parent->mnt_mounts)) {
> -				t = list_last_entry(&parent->mnt_mounts,
> -					struct mount, mnt_child);
> -				if (t->mnt_mp != p->mnt_mp)
> -					t = NULL;
> -			}
> -			attach_shadowed(q, parent, t);
> +			attach_mnt(q, parent, p->mnt_mp);
>  			unlock_mount_hash();
>  		}
>  	}
> @@ -1992,10 +1986,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>  {
>  	HLIST_HEAD(tree_list);
>  	struct mnt_namespace *ns = dest_mnt->mnt_ns;
> +	struct mountpoint *smp;
>  	struct mount *child, *p;
>  	struct hlist_node *n;
>  	int err;
> 
> +	/* Preallocate a mountpoint in case the new mounts need
> +	 * to be tucked under other mounts.
> +	 */
> +	smp = get_mountpoint(source_mnt->mnt.mnt_root);
> +	if (IS_ERR(smp))
> +		return PTR_ERR(smp);
> +
>  	/* Is there space to add these mounts to the mount namespace? */
>  	if (!parent_path) {
>  		err = count_mounts(ns, source_mnt);
> @@ -2022,16 +2024,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>  		touch_mnt_namespace(source_mnt->mnt_ns);
>  	} else {
>  		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
> -		commit_tree(source_mnt, NULL);
> +		commit_tree(source_mnt);
>  	}
> 
>  	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
>  		struct mount *q;
>  		hlist_del_init(&child->mnt_hash);
> -		q = __lookup_mnt_last(&child->mnt_parent->mnt,
> -				      child->mnt_mountpoint);
> -		commit_tree(child, q);
> +		q = __lookup_mnt(&child->mnt_parent->mnt,
> +				 child->mnt_mountpoint);
> +		if (q)
> +			mnt_change_mountpoint(child, smp, q);
> +		commit_tree(child);
>  	}
> +	put_mountpoint(smp);
>  	unlock_mount_hash();
> 
>  	return 0;
> @@ -2046,6 +2051,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>  	cleanup_group_ids(source_mnt, NULL);
>   out:
>  	ns->pending_mounts = 0;
> +
> +	read_seqlock_excl(&mount_lock);
> +	put_mountpoint(smp);
> +	read_sequnlock_excl(&mount_lock);
> +
>  	return err;
>  }
> 
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 06a793f4ae38..5bc7896d122a 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -322,6 +322,21 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
>  	return ret;
>  }
> 
> +static struct mount *find_topper(struct mount *mnt)
> +{
> +	/* If there is exactly one mount covering mnt completely return it. */
> +	struct mount *child;
> +
> +	if (!list_is_singular(&mnt->mnt_mounts))
> +		return NULL;
> +
> +	child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
> +	if (child->mnt_mountpoint != mnt->mnt.mnt_root)
> +		return NULL;
> +
> +	return child;
> +}
> +
>  /*
>   * return true if the refcount is greater than count
>   */
> @@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count)
>   */
>  int propagate_mount_busy(struct mount *mnt, int refcnt)
>  {
> -	struct mount *m, *child;
> +	struct mount *m, *child, *topper;
>  	struct mount *parent = mnt->mnt_parent;
> -	int ret = 0;
> 
>  	if (mnt == parent)
>  		return do_refcount_check(mnt, refcnt);
> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> 
>  	for (m = propagation_next(parent, parent); m;
>  	     		m = propagation_next(m, parent)) {
> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> -		if (child && list_empty(&child->mnt_mounts) &&
> -		    (ret = do_refcount_check(child, 1)))
> -			break;
> +		int count = 1;
> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
> +		if (!child)
> +			continue;
> +
> +		/* Is there exactly one mount on the child that covers
> +		 * it completely whose reference should be ignored?
> +		 */
> +		topper = find_topper(child);

This is tricky. I understand it is trying to identify the case where a
mount got tucked-in because of propagation.  But this will not
distinguish the case where a mount got over-mounted genuinely, not because of
propagation, but because of explicit user action.


example:

case 1: (explicit user action)
	B is a slave of A
	mount something on A/a , it will propagate to B/a
	and than mount something on B/a

case 2: (tucked mount)
	B is a slave of A
	mount something on B/a
	and than mount something on A/a

Both case 1 and case 2 lead to the same mount configuration.


	  however 'umount A/a' in case 1 should fail.
	  and 'umount A/a' in case 2 should pass.

Right? in other words, umounts of 'tucked mounts' should pass(case 2).
	whereas umounts of mounts on which overmounts exist should
		fail.(case 1)

maybe we need a flag to identify tucked mounts?

RP





> +		if (topper)
> +			count += 1;
> +		else if (!list_empty(&child->mnt_mounts))
> +			continue;
> +
> +		if (do_refcount_check(child, count))
> +			return 1;
>  	}
> -	return ret;
> +	return 0;
>  }
> 
>  /*
> @@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt)
> 
>  	for (m = propagation_next(parent, parent); m;
>  			m = propagation_next(m, parent)) {
> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>  		if (child)
>  			child->mnt.mnt_flags &= ~MNT_LOCKED;
>  	}
> @@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt)
> 
>  	for (m = propagation_next(parent, parent); m;
>  			m = propagation_next(m, parent)) {
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> +		struct mount *child = __lookup_mnt(&m->mnt,
>  						mnt->mnt_mountpoint);
> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
> +		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
> +			continue;
> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
>  			SET_MNT_MARK(child);
>  		}
>  	}
> @@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt)
> 
>  	for (m = propagation_next(parent, parent); m;
>  			m = propagation_next(m, parent)) {
> -
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> +		struct mount *topper;
> +		struct mount *child = __lookup_mnt(&m->mnt,
>  						mnt->mnt_mountpoint);
>  		/*
>  		 * umount the child only if the child has no children
> @@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt)
>  		if (!child || !IS_MNT_MARKED(child))
>  			continue;
>  		CLEAR_MNT_MARK(child);
> +
> +		/* If there is exactly one mount covering all of child
> +		 * replace child with that mount.
> +		 */
> +		topper = find_topper(child);
> +		if (topper)
> +			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
> +					      topper);
> +
>  		if (list_empty(&child->mnt_mounts)) {
>  			list_del_init(&child->mnt_child);
>  			child->mnt.mnt_flags |= MNT_UMOUNT;
> diff --git a/fs/pnode.h b/fs/pnode.h
> index 550f5a8b4fcf..dc87e65becd2 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
>  unsigned int mnt_get_count(struct mount *mnt);
>  void mnt_set_mountpoint(struct mount *, struct mountpoint *,
>  			struct mount *);
> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
> +			   struct mount *mnt);
>  struct mount *copy_tree(struct mount *, struct dentry *, int);
>  bool is_path_reachable(struct mount *, struct dentry *,
>  			 const struct path *root);
> -- 
> 2.10.1

-- 
Ram Pai


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-21  3:58                                 ` Ram Pai
@ 2017-01-21  4:15                                   ` Eric W. Biederman
  2017-01-23 19:02                                     ` Ram Pai
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-21  4:15 UTC (permalink / raw)
  To: Ram Pai; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

Ram Pai <linuxram@us.ibm.com> writes:

>> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>> 
>>  	for (m = propagation_next(parent, parent); m;
>>  	     		m = propagation_next(m, parent)) {
>> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>> -		if (child && list_empty(&child->mnt_mounts) &&
>> -		    (ret = do_refcount_check(child, 1)))
>> -			break;
>> +		int count = 1;
>> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>> +		if (!child)
>> +			continue;
>> +
>> +		/* Is there exactly one mount on the child that covers
>> +		 * it completely whose reference should be ignored?
>> +		 */
>> +		topper = find_topper(child);
>
> This is tricky. I understand it is trying to identify the case where a
> mount got tucked-in because of propagation.  But this will not
> distinguish the case where a mount got over-mounted genuinely, not because of
> propagation, but because of explicit user action.
>
>
> example:
>
> case 1: (explicit user action)
> 	B is a slave of A
> 	mount something on A/a , it will propagate to B/a
> 	and than mount something on B/a
>
> case 2: (tucked mount)
> 	B is a slave of A
> 	mount something on B/a
> 	and than mount something on A/a
>
> Both case 1 and case 2 lead to the same mount configuration.
>
>
> 	  however 'umount A/a' in case 1 should fail.
> 	  and 'umount A/a' in case 2 should pass.
>
> Right? in other words, umounts of 'tucked mounts' should pass(case 2).
> 	whereas umounts of mounts on which overmounts exist should
> 		fail.(case 1)

Looking at your example.  I agree that case 1 will fail today.
However my actual expectation would be for both mount configurations
to behave the same.  In both cases something has been explicitly mounted
on B/a and something has propagated to B/a.  In both cases the mount
on top is what was explicitly mounted, and the mount below is what was
propagated to B/a.

I don't see why the order of operations should matter.

> maybe we need a flag to identify tucked mounts?

To preserve our exact current semantics yes.

The mount configurations that are delibearately constructed that I am
aware of are comparatively simple.  I don't think anyone has even taken
advantage of the shadow/side mounts at this point.  I made a reasonable
effort to find out and no one was even aware they existed.  Much less
what they were.  And certainly no one I talked to could find code that
used them.

So I think we are fine with a very modest semantic change here.
Especially one that appears to make the semantics more consistent and
predictable.

I also expect the checkpoint/restart folks will appreciate the change
as by giving them options it will make it easier to reconstruct
complicated mount trees.

Eric


>> +
>> +		if (do_refcount_check(child, count))
>> +			return 1;
>>  	}
>> -	return ret;
>> +	return 0;
>>  }
>> 
>>  /*
>> @@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt)
>> 
>>  	for (m = propagation_next(parent, parent); m;
>>  			m = propagation_next(m, parent)) {
>> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>>  		if (child)
>>  			child->mnt.mnt_flags &= ~MNT_LOCKED;
>>  	}
>> @@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt)
>> 
>>  	for (m = propagation_next(parent, parent); m;
>>  			m = propagation_next(m, parent)) {
>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>> +		struct mount *child = __lookup_mnt(&m->mnt,
>>  						mnt->mnt_mountpoint);
>> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
>> +		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
>> +			continue;
>> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
>>  			SET_MNT_MARK(child);
>>  		}
>>  	}
>> @@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt)
>> 
>>  	for (m = propagation_next(parent, parent); m;
>>  			m = propagation_next(m, parent)) {
>> -
>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>> +		struct mount *topper;
>> +		struct mount *child = __lookup_mnt(&m->mnt,
>>  						mnt->mnt_mountpoint);
>>  		/*
>>  		 * umount the child only if the child has no children
>> @@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt)
>>  		if (!child || !IS_MNT_MARKED(child))
>>  			continue;
>>  		CLEAR_MNT_MARK(child);
>> +
>> +		/* If there is exactly one mount covering all of child
>> +		 * replace child with that mount.
>> +		 */
>> +		topper = find_topper(child);
>> +		if (topper)
>> +			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
>> +					      topper);
>> +
>>  		if (list_empty(&child->mnt_mounts)) {
>>  			list_del_init(&child->mnt_child);
>>  			child->mnt.mnt_flags |= MNT_UMOUNT;
>> diff --git a/fs/pnode.h b/fs/pnode.h
>> index 550f5a8b4fcf..dc87e65becd2 100644
>> --- a/fs/pnode.h
>> +++ b/fs/pnode.h
>> @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
>>  unsigned int mnt_get_count(struct mount *mnt);
>>  void mnt_set_mountpoint(struct mount *, struct mountpoint *,
>>  			struct mount *);
>> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
>> +			   struct mount *mnt);
>>  struct mount *copy_tree(struct mount *, struct dentry *, int);
>>  bool is_path_reachable(struct mount *, struct dentry *,
>>  			 const struct path *root);
>> -- 
>> 2.10.1

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts.
  2017-01-20 23:18                           ` Ram Pai
@ 2017-01-23  8:15                             ` Eric W. Biederman
  2017-01-23 17:04                               ` Ram Pai
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-23  8:15 UTC (permalink / raw)
  To: Ram Pai; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

Ram Pai <linuxram@us.ibm.com> writes:

> On Thu, Jan 12, 2017 at 05:18:12AM +1300, Eric W. Biederman wrote:
>> 
>> When I look at what propagate_mount_busy is trying to do and I look
>> at the code closely I discover there is a great disconnect between the
>> two.  In the ordinary non-propagation case propagate_mount_busy has
>> been verifying that there are no submounts and that there are no
>> extraneous references on the mount.
>> 
>> For mounts that the unmount would propagate to propagate_mount_busy has
>> been verifying that there are no extraneous references only if there
>> are no submounts.  Which is nonsense.
>
>
> the reason why we had to do it that way was because there were
> situations where it was impossible to umount anything...
>
> take for example.
>
> (1) mount --make-shared A
>
> (2) mount --bind A  A/a    
>
> The tree looks like this
>
>  	A
> 	|
>         B
>
> (3) mount --bind A  B/a    
> The tree looks like this
>  	A
> 	|
>  	B B'   (B' becomes a shadow mount)
> 	|
>         C
>
>
> (4) mount --make-slave A
> 	At this point B and C are peers and A is a slave.
>
> (5) umount B' 
> 	NOTE: This used to be possible a decade ago if the process doing
> 	the umount had access to its dentry.
>     The tree looks like this
>  	A
> 	|
>  	B
> 	|
>         C
>
> Now if you try to unmount C,  it becomes impossible, reason being...
>
> B is the parent of C.
> So the umount propagates to A.  But A has B mounted at the same
> location.  But B is busy since it has got a child C.
> So the entire umount has to fail.  There is no way to umount it all.
> Kind of stuck for ever.  That is the reason; in those days a decade ago,
> we relaxed the rule to let go propagated mounts that had children.
>
> The above example is a simplest case that demonstrates the phenomenon.
>
> Given that, the current code does not allow any process to reach shadow
> mount B' and given that we are getting rid of shadow mounts, I think we
> should allow the code changes you propose in this patch.

Thank you very much for the good description of why propagate_mount_busy
works the way it does.

I just finished taking a hard look at this and in fact the current code
does allow reaching B' via umount propagation.  My other patch changes
exactly how you have to reach it but it is still possible to umount B'

At the same time those mounts have always been unmountable with
"umount -l" aka MOUNT_DETACH.

Have you ever encountered a non-contrived situation that leads to this
kind of problem?

I expect if we can verify that docker, systemd, and similar pieces of
the linux ecosystem are not depending on the exact details of the
propagation of the umount busy check we should be able to remove this.

Last I looked the uses of mount and umount were all quite simple, so I
think it is very possible to make this change.  Especially as it is now
much harder to get into the situation you describe.

Eric


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts.
  2017-01-23  8:15                             ` Eric W. Biederman
@ 2017-01-23 17:04                               ` Ram Pai
  0 siblings, 0 replies; 63+ messages in thread
From: Ram Pai @ 2017-01-23 17:04 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

On Mon, Jan 23, 2017 at 09:15:02PM +1300, Eric W. Biederman wrote:
> Ram Pai <linuxram@us.ibm.com> writes:
> 
> > On Thu, Jan 12, 2017 at 05:18:12AM +1300, Eric W. Biederman wrote:
> >> 
> >> When I look at what propagate_mount_busy is trying to do and I look
> >> at the code closely I discover there is a great disconnect between the
> >> two.  In the ordinary non-propagation case propagate_mount_busy has
> >> been verifying that there are no submounts and that there are no
> >> extraneous references on the mount.
> >> 
> >> For mounts that the unmount would propagate to propagate_mount_busy has
> >> been verifying that there are no extraneous references only if there
> >> are no submounts.  Which is nonsense.
> >
> >
> > the reason why we had to do it that way was because there were
> > situations where it was impossible to umount anything...
> >
> > take for example.
> >
> > (1) mount --make-shared A
> >
> > (2) mount --bind A  A/a    
> >
> > The tree looks like this
> >
> >  	A
> > 	|
> >         B
> >
> > (3) mount --bind A  B/a    
> > The tree looks like this
> >  	A
> > 	|
> >  	B B'   (B' becomes a shadow mount)
> > 	|
> >         C
> >
> >
> > (4) mount --make-slave A
> > 	At this point B and C are peers and A is a slave.
> >
> > (5) umount B' 
> > 	NOTE: This used to be possible a decade ago if the process doing
> > 	the umount had access to its dentry.
> >     The tree looks like this
> >  	A
> > 	|
> >  	B
> > 	|
> >         C
> >
> > Now if you try to unmount C,  it becomes impossible, reason being...
> >
> > B is the parent of C.
> > So the umount propagates to A.  But A has B mounted at the same
> > location.  But B is busy since it has got a child C.
> > So the entire umount has to fail.  There is no way to umount it all.
> > Kind of stuck for ever.  That is the reason; in those days a decade ago,
> > we relaxed the rule to let go propagated mounts that had children.
> >
> > The above example is a simplest case that demonstrates the phenomenon.
> >
> > Given that, the current code does not allow any process to reach shadow
> > mount B' and given that we are getting rid of shadow mounts, I think we
> > should allow the code changes you propose in this patch.
> 
> Thank you very much for the good description of why propagate_mount_busy
> works the way it does.
> 
> I just finished taking a hard look at this and in fact the current code
> does allow reaching B' via umount propagation.  My other patch changes
> exactly how you have to reach it but it is still possible to umount B'
> 
> At the same time those mounts have alwasy been unmountable with
> "umount -l" aka MOUNT_DETACH.
> 
> Have you ever encountered a non-contrived situation that leads to this
> kind of problem?

No. Simple cases don't expose any of these hidden issues. The devil is in
the detail. There used to be a test suite, which I believe has been
integrated into LTP, that had all kinds of contrived cases to expose as
many hidden issues as possible.

RP


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-21  4:15                                   ` Eric W. Biederman
@ 2017-01-23 19:02                                     ` Ram Pai
  2017-01-24  0:16                                       ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-01-23 19:02 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

On Sat, Jan 21, 2017 at 05:15:29PM +1300, Eric W. Biederman wrote:
> Ram Pai <linuxram@us.ibm.com> writes:
> 
> >> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> >> 
> >>  	for (m = propagation_next(parent, parent); m;
> >>  	     		m = propagation_next(m, parent)) {
> >> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> >> -		if (child && list_empty(&child->mnt_mounts) &&
> >> -		    (ret = do_refcount_check(child, 1)))
> >> -			break;
> >> +		int count = 1;
> >> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
> >> +		if (!child)
> >> +			continue;
> >> +
> >> +		/* Is there exactly one mount on the child that covers
> >> +		 * it completely whose reference should be ignored?
> >> +		 */
> >> +		topper = find_topper(child);
> >
> > This is tricky. I understand it is trying to identify the case where a
> > mount got tucked-in because of propagation.  But this will not
> > distinguish the case where a mount got over-mounted genuinely, not because of
> > propagation, but because of explicit user action.
> >
> >
> > example:
> >
> > case 1: (explicit user action)
> > 	B is a slave of A
> > 	mount something on A/a , it will propagate to B/a
> > 	and than mount something on B/a
> >
> > case 2: (tucked mount)
> > 	B is a slave of A
> > 	mount something on B/a
> > 	and than mount something on A/a
> >
> > Both case 1 and case 2 lead to the same mount configuration.
> >
> >
> > 	  however 'umount A/a' in case 1 should fail.
> > 	  and 'umount A/a' in case 2 should pass.
> >
> > Right? in other words, umounts of 'tucked mounts' should pass(case 2).
> > 	whereas umounts of mounts on which overmounts exist should
> > 		fail.(case 1)
> 
> Looking at your example.  I agree that case 1 will fail today.

And should continue to fail. right? Your semantics change will pass it.

> However my actual expectation would be for both mount configurations
> to behave the same.  In both cases something has been explicitly mounted
> on B/a and something has propagated to B/a.  In both cases the mount
> on top is what was explicitly mounted, and the mount below is what was
> propagated to B/a.
> 
> I don't see why the order of operations should matter.

One of the subtle expectation is reversibility.

Mount followed immediately by unmount has always passed and that is the
standard expectation always. Your proposed code will ensure that.

However there is one other subtle expectation.

A mount cannot disappear if a user has explicitly mounted on top of it.

your proposed code will not meet that expectation. 

In other words, these two expectations make it behave differently even
when; arguably, they feel like the same configuration.

> 
> > maybe we need a flag to identify tucked mounts?
> 
> To preserve our exact current semantics yes.
> 
> The mount configurations that are delibearately constructed that I am
> aware of are comparatively simple.  I don't think anyone has even taken
> advantage of the shadow/side mounts at this point.  I made a reasonable
> effort to find out and no one was even aware they existed.  Much less
> what they were.  And certainly no one I talked to could find code that
> used them.

But someday, even if it's after a decade, someone ;) will
stumble into this semantics and wonder 'why?'. It's better to get it right
sooner. Sorry, I am blaming myself for keeping some of the problems
open, thinking no one will bump into them.


RP


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-23 19:02                                     ` Ram Pai
@ 2017-01-24  0:16                                       ` Eric W. Biederman
  2017-02-03 10:54                                         ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-01-24  0:16 UTC (permalink / raw)
  To: Ram Pai; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

Ram Pai <linuxram@us.ibm.com> writes:

> On Sat, Jan 21, 2017 at 05:15:29PM +1300, Eric W. Biederman wrote:
>> Ram Pai <linuxram@us.ibm.com> writes:
>> 
>> >> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>> >> 
>> >>  	for (m = propagation_next(parent, parent); m;
>> >>  	     		m = propagation_next(m, parent)) {
>> >> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>> >> -		if (child && list_empty(&child->mnt_mounts) &&
>> >> -		    (ret = do_refcount_check(child, 1)))
>> >> -			break;
>> >> +		int count = 1;
>> >> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>> >> +		if (!child)
>> >> +			continue;
>> >> +
>> >> +		/* Is there exactly one mount on the child that covers
>> >> +		 * it completely whose reference should be ignored?
>> >> +		 */
>> >> +		topper = find_topper(child);
>> >
>> > This is tricky. I understand it is trying to identify the case where a
>> > mount got tucked-in because of propagation.  But this will not
>> > distinguish the case where a mount got over-mounted genuinely, not because of
>> > propagation, but because of explicit user action.
>> >
>> >
>> > example:
>> >
>> > case 1: (explicit user action)
>> > 	B is a slave of A
>> > 	mount something on A/a , it will propagate to B/a
>> > 	and than mount something on B/a
>> >
>> > case 2: (tucked mount)
>> > 	B is a slave of A
>> > 	mount something on B/a
>> > 	and than mount something on A/a
>> >
>> > Both case 1 and case 2 lead to the same mount configuration.
>> >
>> >
>> > 	  however 'umount A/a' in case 1 should fail.
>> > 	  and 'umount A/a' in case 2 should pass.
>> >
>> > Right? in other words, umounts of 'tucked mounts' should pass(case 2).
>> > 	whereas umounts of mounts on which overmounts exist should
>> > 		fail.(case 1)
>> 
>> Looking at your example.  I agree that case 1 will fail today.
>
> And should continue to fail. right? Your semantics change will pass it.

I don't see why it should continue to fail.

>> However my actual expectation would be for both mount configurations
>> to behave the same.  In both cases something has been explicitly mounted
>> on B/a and something has propagated to B/a.  In both cases the mount
>> on top is what was explicitly mounted, and the mount below is what was
>> propagated to B/a.
>> 
>> I don't see why the order of operations should matter.
>
> One of the subtle expectation is reversibility.
>
> Mount followed immediately by unmount has always passed and that is the
> standard expectation always. Your proposed code will ensure that.
>
> However there is one other subtle expectaton.
>
> A mount cannot disappear if a user has explicitly mounted on top of it.
>
> your proposed code will not meet that expectation. 
>
> In other words, these two expectations make it behave differently even
> when; arguably, they feel like the same configuration.

I am not seeing that.



>> 
>> > maybe we need a flag to identify tucked mounts?
>> 
>> To preserve our exact current semantics yes.
>> 
>> The mount configurations that are delibearately constructed that I am
>> aware of are comparatively simple.  I don't think anyone has even taken
>> advantage of the shadow/side mounts at this point.  I made a reasonable
>> effort to find out and no one was even aware they existed.  Much less
>> what they were.  And certainly no one I talked to could find code that
>> used them.
>
> But someday; even if its after a decade, someone ;) will
> stumble into this semantics and wonder 'why?'. Its better to get it right
> sooner. Sorry, I am blaming myself; for keeping some of the problems
> open thinking no one will bump into them.

Oh definitely.  If we have people ready to talk it through I am happy to
dot as many i's and cross as many t's as we productively can.

I was just pointing out that I don't have any reason to expect that any
one depends on the subtle details of the implementation today so we
still have some wiggle room to fix them.  Even if they are visible to
user space.

Then I see Andrei Vagin's patch for checkpoint/restore and the mount
namespace and I start suspecting that will be the point where all of the
subtle details get locked in stone because checkpont/restore will have
to preserve every possible configuration of mount namespaces.

My main concern at this point is to get the code to a point where a
malicious user in a user namespace can not cause problems for root
in the primary mount namespace.  Even if root did open himself for all
kinds of trouble by running "mount --make-rshared /".  As that is
essentially required to use mount propagation at all.


Eric


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-24  0:16                                       ` Eric W. Biederman
@ 2017-02-03 10:54                                         ` Eric W. Biederman
  2017-02-03 17:10                                           ` Ram Pai
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-02-03 10:54 UTC (permalink / raw)
  To: Ram Pai; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

ebiederm@xmission.com (Eric W. Biederman) writes:

> Ram Pai <linuxram@us.ibm.com> writes:
>
>> On Sat, Jan 21, 2017 at 05:15:29PM +1300, Eric W. Biederman wrote:
>>> Ram Pai <linuxram@us.ibm.com> writes:
>>> 
>>> >> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>>> >> 
>>> >>  	for (m = propagation_next(parent, parent); m;
>>> >>  	     		m = propagation_next(m, parent)) {
>>> >> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>>> >> -		if (child && list_empty(&child->mnt_mounts) &&
>>> >> -		    (ret = do_refcount_check(child, 1)))
>>> >> -			break;
>>> >> +		int count = 1;
>>> >> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>>> >> +		if (!child)
>>> >> +			continue;
>>> >> +
>>> >> +		/* Is there exactly one mount on the child that covers
>>> >> +		 * it completely whose reference should be ignored?
>>> >> +		 */
>>> >> +		topper = find_topper(child);
>>> >
>>> > This is tricky. I understand it is trying to identify the case where a
>>> > mount got tucked-in because of propagation.  But this will not
>>> > distinguish the case where a mount got over-mounted genuinely, not because of
>>> > propagation, but because of explicit user action.
>>> >
>>> >
>>> > example:
>>> >
>>> > case 1: (explicit user action)
>>> > 	B is a slave of A
>>> > 	mount something on A/a , it will propagate to B/a
>>> > 	and than mount something on B/a
>>> >
>>> > case 2: (tucked mount)
>>> > 	B is a slave of A
>>> > 	mount something on B/a
>>> > 	and than mount something on A/a
>>> >
>>> > Both case 1 and case 2 lead to the same mount configuration.
>>> >
>>> >
>>> > 	  however 'umount A/a' in case 1 should fail.
>>> > 	  and 'umount A/a' in case 2 should pass.
>>> >
>>> > Right? in other words, umounts of 'tucked mounts' should pass(case 2).
>>> > 	whereas umounts of mounts on which overmounts exist should
>>> > 		fail.(case 1)
>>> 
>>> Looking at your example.  I agree that case 1 will fail today.
>>
>> And should continue to fail. right? Your semantics change will pass it.
>
> I don't see why it should continue to fail.
>
>>> However my actual expectation would be for both mount configurations
>>> to behave the same.  In both cases something has been explicitly mounted
>>> on B/a and something has propagated to B/a.  In both cases the mount
>>> on top is what was explicitly mounted, and the mount below is what was
>>> propagated to B/a.
>>> 
>>> I don't see why the order of operations should matter.
>>
>> One of the subtle expectation is reversibility.
>>
>> Mount followed immediately by unmount has always passed and that is the
>> standard expectation always. Your proposed code will ensure that.
>>
>> However there is one other subtle expectaton.
>>
>> A mount cannot disappear if a user has explicitly mounted on top of it.
>>
>> your proposed code will not meet that expectation. 
>>
>> In other words, these two expectations make it behave differently even
>> when; arguably, they feel like the same configuration.
>
> I am not seeing that.
>
>
>
>>> 
>>> > maybe we need a flag to identify tucked mounts?
>>> 
>>> To preserve our exact current semantics yes.
>>> 
>>> The mount configurations that are delibearately constructed that I am
>>> aware of are comparatively simple.  I don't think anyone has even taken
>>> advantage of the shadow/side mounts at this point.  I made a reasonable
>>> effort to find out and no one was even aware they existed.  Much less
>>> what they were.  And certainly no one I talked to could find code that
>>> used them.
>>
>> But someday; even if its after a decade, someone ;) will
>> stumble into this semantics and wonder 'why?'. Its better to get it right
>> sooner. Sorry, I am blaming myself; for keeping some of the problems
>> open thinking no one will bump into them.
>
> Oh definitely.  If we have people ready to talk it through I am happy to
> dot as many i's and cross as many t's as we productively can.
>
> I was just pointing out that I don't have any reason to expect that any
> one depends on the subtle details of the implementation today so we
> still have some wiggle room to fix them.  Even if they are visible to
> user space.

So I haven't seen a reply, and we are getting awfully close to the merge
window.  Is there anything concrete we can do to ease concerns?

Right now I am thinking my last version of the patch is the likely the
best we have time and energy to manage and it would be good to merge it
before the code bit rots.

Eric

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-02-03 10:54                                         ` Eric W. Biederman
@ 2017-02-03 17:10                                           ` Ram Pai
  2017-02-03 18:26                                             ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-02-03 17:10 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

On Fri, Feb 03, 2017 at 11:54:21PM +1300, Eric W. Biederman wrote:
> ebiederm@xmission.com (Eric W. Biederman) writes:
> 
> > Ram Pai <linuxram@us.ibm.com> writes:
> >
> >> On Sat, Jan 21, 2017 at 05:15:29PM +1300, Eric W. Biederman wrote:
> >>> Ram Pai <linuxram@us.ibm.com> writes:
> >>> 
> >>> >> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> >>> >> 
> >>> >>  	for (m = propagation_next(parent, parent); m;
> >>> >>  	     		m = propagation_next(m, parent)) {
> >>> >> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> >>> >> -		if (child && list_empty(&child->mnt_mounts) &&
> >>> >> -		    (ret = do_refcount_check(child, 1)))
> >>> >> -			break;
> >>> >> +		int count = 1;
> >>> >> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
> >>> >> +		if (!child)
> >>> >> +			continue;
> >>> >> +
> >>> >> +		/* Is there exactly one mount on the child that covers
> >>> >> +		 * it completely whose reference should be ignored?
> >>> >> +		 */
> >>> >> +		topper = find_topper(child);
> >>> >
> >>> > This is tricky. I understand it is trying to identify the case where a
> >>> > mount got tucked-in because of propagation.  But this will not
> >>> > distinguish the case where a mount got over-mounted genuinely, not because of
> >>> > propagation, but because of explicit user action.
> >>> >
> >>> >
> >>> > example:
> >>> >
> >>> > case 1: (explicit user action)
> >>> > 	B is a slave of A
> >>> > 	mount something on A/a , it will propagate to B/a
> >>> > 	and than mount something on B/a
> >>> >
> >>> > case 2: (tucked mount)
> >>> > 	B is a slave of A
> >>> > 	mount something on B/a
> >>> > 	and than mount something on A/a
> >>> >
> >>> > Both case 1 and case 2 lead to the same mount configuration.
> >>> >
> >>> >
> >>> > 	  however 'umount A/a' in case 1 should fail.
> >>> > 	  and 'umount A/a' in case 2 should pass.
> >>> >
> >>> > Right? in other words, umounts of 'tucked mounts' should pass(case 2).
> >>> > 	whereas umounts of mounts on which overmounts exist should
> >>> > 		fail.(case 1)
> >>> 
> >>> Looking at your example.  I agree that case 1 will fail today.
> >>
> >> And should continue to fail. right? Your semantics change will pass it.
> >
> > I don't see why it should continue to fail.
> >
> >>> However my actual expectation would be for both mount configurations
> >>> to behave the same.  In both cases something has been explicitly mounted
> >>> on B/a and something has propagated to B/a.  In both cases the mount
> >>> on top is what was explicitly mounted, and the mount below is what was
> >>> propagated to B/a.
> >>> 
> >>> I don't see why the order of operations should matter.
> >>
> >> One of the subtle expectation is reversibility.
> >>
> >> Mount followed immediately by unmount has always passed and that is the
> >> standard expectation always. Your proposed code will ensure that.
> >>
> >> However there is one other subtle expectaton.
> >>
> >> A mount cannot disappear if a user has explicitly mounted on top of it.
> >>
> >> your proposed code will not meet that expectation. 
> >>
> >> In other words, these two expectations make it behave differently even
> >> when; arguably, they feel like the same configuration.
> >
> > I am not seeing that.
> >
> >
> >
> >>> 
> >>> > maybe we need a flag to identify tucked mounts?
> >>> 
> >>> To preserve our exact current semantics yes.
> >>> 
> >>> The mount configurations that are delibearately constructed that I am
> >>> aware of are comparatively simple.  I don't think anyone has even taken
> >>> advantage of the shadow/side mounts at this point.  I made a reasonable
> >>> effort to find out and no one was even aware they existed.  Much less
> >>> what they were.  And certainly no one I talked to could find code that
> >>> used them.
> >>
> >> But someday; even if its after a decade, someone ;) will
> >> stumble into this semantics and wonder 'why?'. Its better to get it right
> >> sooner. Sorry, I am blaming myself; for keeping some of the problems
> >> open thinking no one will bump into them.
> >
> > Oh definitely.  If we have people ready to talk it through I am happy to
> > dot as many i's and cross as many t's as we productively can.
> >
> > I was just pointing out that I don't have any reason to expect that any
> > one depends on the subtle details of the implementation today so we
> > still have some wiggle room to fix them.  Even if they are visible to
> > user space.
> 
> So I haven't seen a reply, and we are getting awfully close to the merge
> window.  Is there anything concrete we can do to ease concerns?
> 
> Right now I am thinking my last version of the patch is the likely the
> best we have time and energy to manage and it would be good to merge it
> before the code bit rots.

I was waiting for some other opinions on the behavior, since I
continue to think that 'one should not be able to unmount mounts on
which a user has explicitly mounted upon'. I am happy to be overruled,
since your patch significantly improves the rest of the semantics.

Viro?

RP

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-02-03 17:10                                           ` Ram Pai
@ 2017-02-03 18:26                                             ` Eric W. Biederman
  2017-02-03 20:28                                               ` Ram Pai
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-02-03 18:26 UTC (permalink / raw)
  To: Ram Pai; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

Ram Pai <linuxram@us.ibm.com> writes:

> On Fri, Feb 03, 2017 at 11:54:21PM +1300, Eric W. Biederman wrote:
>> ebiederm@xmission.com (Eric W. Biederman) writes:
>> 
>> > Ram Pai <linuxram@us.ibm.com> writes:
>> >
>> >> On Sat, Jan 21, 2017 at 05:15:29PM +1300, Eric W. Biederman wrote:
>> >>> Ram Pai <linuxram@us.ibm.com> writes:
>> >>> 
>> >>> >> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>> >>> >> 
>> >>> >>  	for (m = propagation_next(parent, parent); m;
>> >>> >>  	     		m = propagation_next(m, parent)) {
>> >>> >> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>> >>> >> -		if (child && list_empty(&child->mnt_mounts) &&
>> >>> >> -		    (ret = do_refcount_check(child, 1)))
>> >>> >> -			break;
>> >>> >> +		int count = 1;
>> >>> >> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>> >>> >> +		if (!child)
>> >>> >> +			continue;
>> >>> >> +
>> >>> >> +		/* Is there exactly one mount on the child that covers
>> >>> >> +		 * it completely whose reference should be ignored?
>> >>> >> +		 */
>> >>> >> +		topper = find_topper(child);
>> >>> >
>> >>> > This is tricky. I understand it is trying to identify the case where a
>> >>> > mount got tucked-in because of propagation.  But this will not
>> >>> > distinguish the case where a mount got over-mounted genuinely, not because of
>> >>> > propagation, but because of explicit user action.
>> >>> >
>> >>> >
>> >>> > example:
>> >>> >
>> >>> > case 1: (explicit user action)
>> >>> > 	B is a slave of A
>> >>> > 	mount something on A/a , it will propagate to B/a
>> >>> > 	and than mount something on B/a
>> >>> >
>> >>> > case 2: (tucked mount)
>> >>> > 	B is a slave of A
>> >>> > 	mount something on B/a
>> >>> > 	and than mount something on A/a
>> >>> >
>> >>> > Both case 1 and case 2 lead to the same mount configuration.
>> >>> >
>> >>> >
>> >>> > 	  however 'umount A/a' in case 1 should fail.
>> >>> > 	  and 'umount A/a' in case 2 should pass.
>> >>> >
>> >>> > Right? in other words, umounts of 'tucked mounts' should pass(case 2).
>> >>> > 	whereas umounts of mounts on which overmounts exist should
>> >>> > 		fail.(case 1)
>> >>> 
>> >>> Looking at your example.  I agree that case 1 will fail today.
>> >>
>> >> And should continue to fail. right? Your semantics change will pass it.
>> >
>> > I don't see why it should continue to fail.
>> >
>> >>> However my actual expectation would be for both mount configurations
>> >>> to behave the same.  In both cases something has been explicitly mounted
>> >>> on B/a and something has propagated to B/a.  In both cases the mount
>> >>> on top is what was explicitly mounted, and the mount below is what was
>> >>> propagated to B/a.
>> >>> 
>> >>> I don't see why the order of operations should matter.
>> >>
>> >> One of the subtle expectation is reversibility.
>> >>
>> >> Mount followed immediately by unmount has always passed and that is the
>> >> standard expectation always. Your proposed code will ensure that.
>> >>
>> >> However there is one other subtle expectaton.
>> >>
>> >> A mount cannot disappear if a user has explicitly mounted on top of it.
>> >>
>> >> your proposed code will not meet that expectation. 
>> >>
>> >> In other words, these two expectations make it behave differently even
>> >> when; arguably, they feel like the same configuration.
>> >
>> > I am not seeing that.
>> >
>> >
>> >
>> >>> 
>> >>> > maybe we need a flag to identify tucked mounts?
>> >>> 
>> >>> To preserve our exact current semantics yes.
>> >>> 
>> >>> The mount configurations that are delibearately constructed that I am
>> >>> aware of are comparatively simple.  I don't think anyone has even taken
>> >>> advantage of the shadow/side mounts at this point.  I made a reasonable
>> >>> effort to find out and no one was even aware they existed.  Much less
>> >>> what they were.  And certainly no one I talked to could find code that
>> >>> used them.
>> >>
>> >> But someday; even if its after a decade, someone ;) will
>> >> stumble into this semantics and wonder 'why?'. Its better to get it right
>> >> sooner. Sorry, I am blaming myself; for keeping some of the problems
>> >> open thinking no one will bump into them.
>> >
>> > Oh definitely.  If we have people ready to talk it through I am happy to
>> > dot as many i's and cross as many t's as we productively can.
>> >
>> > I was just pointing out that I don't have any reason to expect that any
>> > one depends on the subtle details of the implementation today so we
>> > still have some wiggle room to fix them.  Even if they are visible to
>> > user space.
>> 
>> So I haven't seen a reply, and we are getting awfully close to the merge
>> window.  Is there anything concrete we can do to ease concerns?
>> 
>> Right now I am thinking my last version of the patch is the likely the
>> best we have time and energy to manage and it would be good to merge it
>> before the code bit rots.
>
> I was waiting for some other opinions on the behavior, since I
> continue to think that 'one should not be able to unmount mounts on
> which a user has explicitly mounted upon'. I am happy to be overruled,
> since your patch significantly improves the rest of the semantics.
>
> Viro?

Ram Pai, just to be clear you were hoping to add the logic below to my patch?

My objections to the snippet below are:

- It makes it hard for the CRIU folks (yet more state they have to find
  and restore).

- It feels subjectively worse to me.

- We already have cases where mounts are unmounted transparently (umount on rmdir).

- Al Viro claims that the side/shadow mounts are ordinary mounts and
  maintaining this extra logic that remembers if we tucked one mount
  under another seems to make this them less ordinary.

- The symmetry for unmounting exists for a tucked mount.  We can unmount
  it via propagation or we can unmount the mount above it, and then we
  can unmount the new underlying mount.  So I don't see why we don't
  want symmetry in the other case just because we mounted on top of
  the mount and rather than had the mount tucked under us.

diff --git a/fs/namespace.c b/fs/namespace.c
index 8bfad42c1ccf..8b00e0548438 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2047,8 +2047,10 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		hlist_del_init(&child->mnt_hash);
 		q = __lookup_mnt(&child->mnt_parent->mnt,
 				 child->mnt_mountpoint);
-		if (q)
+		if (q) {
 			mnt_change_mountpoint(child, smp, q);
+			child->mnt.mnt_flags |= MNT_TUCKED;
+		}
 		commit_tree(child);
 	}
 	put_mountpoint(smp);
diff --git a/fs/pnode.c b/fs/pnode.c
index 5bc7896d122a..e2a6ac68feb9 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -327,6 +327,9 @@ static struct mount *find_topper(struct mount *mnt)
 	/* If there is exactly one mount covering mnt completely return it. */
 	struct mount *child;
 
+	if (!(mnt->mnt.mnt_flags & MNT_TUCKED))
+		return NULL;
+	
 	if (!list_is_singular(&mnt->mnt_mounts))
 		return NULL;
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 8e0352af06b7..25ca398b19b3 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -52,6 +52,7 @@ struct mnt_namespace;
 
 #define MNT_INTERNAL	0x4000
 
+#define MNT_TUCKED		0x020000
 #define MNT_LOCK_ATIME		0x040000
 #define MNT_LOCK_NOEXEC		0x080000
 #define MNT_LOCK_NOSUID		0x100000

Eric

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-02-03 18:26                                             ` Eric W. Biederman
@ 2017-02-03 20:28                                               ` Ram Pai
  2017-02-03 20:58                                                 ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-02-03 20:28 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

On Sat, Feb 04, 2017 at 07:26:20AM +1300, Eric W. Biederman wrote:
> Ram Pai <linuxram@us.ibm.com> writes:
> 
> > On Fri, Feb 03, 2017 at 11:54:21PM +1300, Eric W. Biederman wrote:
> >> ebiederm@xmission.com (Eric W. Biederman) writes:
> >> 
> >> > Ram Pai <linuxram@us.ibm.com> writes:
> >> >
> >> >> On Sat, Jan 21, 2017 at 05:15:29PM +1300, Eric W. Biederman wrote:
> >> >>> Ram Pai <linuxram@us.ibm.com> writes:
> >> >>> 
> >> >>> >> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> >> >>> >> 
> >> >>> >>  	for (m = propagation_next(parent, parent); m;
> >> >>> >>  	     		m = propagation_next(m, parent)) {
> >> >>> >> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> >> >>> >> -		if (child && list_empty(&child->mnt_mounts) &&
> >> >>> >> -		    (ret = do_refcount_check(child, 1)))
> >> >>> >> -			break;
> >> >>> >> +		int count = 1;
> >> >>> >> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
> >> >>> >> +		if (!child)
> >> >>> >> +			continue;
> >> >>> >> +
> >> >>> >> +		/* Is there exactly one mount on the child that covers
> >> >>> >> +		 * it completely whose reference should be ignored?
> >> >>> >> +		 */
> >> >>> >> +		topper = find_topper(child);
> >> >>> >
> >> >>> > This is tricky. I understand it is trying to identify the case where a
> >> >>> > mount got tucked-in because of propagation.  But this will not
> >> >>> > distinguish the case where a mount got over-mounted genuinely, not because of
> >> >>> > propagation, but because of explicit user action.
> >> >>> >
> >> >>> >
> >> >>> > example:
> >> >>> >
> >> >>> > case 1: (explicit user action)
> >> >>> > 	B is a slave of A
> >> >>> > 	mount something on A/a , it will propagate to B/a
> >> >>> > 	and than mount something on B/a
> >> >>> >
> >> >>> > case 2: (tucked mount)
> >> >>> > 	B is a slave of A
> >> >>> > 	mount something on B/a
> >> >>> > 	and than mount something on A/a
> >> >>> >
> >> >>> > Both case 1 and case 2 lead to the same mount configuration.
> >> >>> >
> >> >>> >
> >> >>> > 	  however 'umount A/a' in case 1 should fail.
> >> >>> > 	  and 'umount A/a' in case 2 should pass.
> >> >>> >
> >> >>> > Right? in other words, umounts of 'tucked mounts' should pass(case 2).
> >> >>> > 	whereas umounts of mounts on which overmounts exist should
> >> >>> > 		fail.(case 1)
> >> >>> 
> >> >>> Looking at your example.  I agree that case 1 will fail today.
> >> >>
> >> >> And should continue to fail. right? Your semantics change will pass it.
> >> >
> >> > I don't see why it should continue to fail.
> >> >
> >> >>> However my actual expectation would be for both mount configurations
> >> >>> to behave the same.  In both cases something has been explicitly mounted
> >> >>> on B/a and something has propagated to B/a.  In both cases the mount
> >> >>> on top is what was explicitly mounted, and the mount below is what was
> >> >>> propagated to B/a.
> >> >>> 
> >> >>> I don't see why the order of operations should matter.
> >> >>
> >> >> One of the subtle expectation is reversibility.
> >> >>
> >> >> Mount followed immediately by unmount has always passed and that is the
> >> >> standard expectation always. Your proposed code will ensure that.
> >> >>
> >> >> However there is one other subtle expectaton.
> >> >>
> >> >> A mount cannot disappear if a user has explicitly mounted on top of it.
> >> >>
> >> >> your proposed code will not meet that expectation. 
> >> >>
> >> >> In other words, these two expectations make it behave differently even
> >> >> when; arguably, they feel like the same configuration.
> >> >
> >> > I am not seeing that.
> >> >
> >> >
> >> >
> >> >>> 
> >> >>> > maybe we need a flag to identify tucked mounts?
> >> >>> 
> >> >>> To preserve our exact current semantics yes.
> >> >>> 
> >> >>> The mount configurations that are delibearately constructed that I am
> >> >>> aware of are comparatively simple.  I don't think anyone has even taken
> >> >>> advantage of the shadow/side mounts at this point.  I made a reasonable
> >> >>> effort to find out and no one was even aware they existed.  Much less
> >> >>> what they were.  And certainly no one I talked to could find code that
> >> >>> used them.
> >> >>
> >> >> But someday; even if its after a decade, someone ;) will
> >> >> stumble into this semantics and wonder 'why?'. Its better to get it right
> >> >> sooner. Sorry, I am blaming myself; for keeping some of the problems
> >> >> open thinking no one will bump into them.
> >> >
> >> > Oh definitely.  If we have people ready to talk it through I am happy to
> >> > dot as many i's and cross as many t's as we productively can.
> >> >
> >> > I was just pointing out that I don't have any reason to expect that any
> >> > one depends on the subtle details of the implementation today so we
> >> > still have some wiggle room to fix them.  Even if they are visible to
> >> > user space.
> >> 
> >> So I haven't seen a reply, and we are getting awfully close to the merge
> >> window.  Is there anything concrete we can do to ease concerns?
> >> 
> >> Right now I am thinking my last version of the patch is the likely the
> >> best we have time and energy to manage and it would be good to merge it
> >> before the code bit rots.
> >
> > I was waiting for some other opinions on the behavior, since I
> > continue to think that 'one should not be able to unmount mounts on
> > which a user has explicitly mounted upon'. I am happy to be overruled,
> > since your patch significantly improves the rest of the semantics.
> >
> > Viro?
> 
> Ram Pai, just to be clear you were hoping to add the logic below to my patch?

Yes. the behavior of your patch below is what I was proposing.

> 
> My objections to the snippet below are:
> 
> - It makes it hard for the CRIU folks (yet more state they have to find
>   and restore).

true. unfortunately one more subtle detail to be aware of.

> 
> - It feels subjectively worse to me.
> 
> - We already have cases where mounts are unmounted transparently (umount on rmdir).

sorry. i am not aware of this case. some details will help.

> 
> - Al Viro claims that the side/shadow mounts are ordinary mounts and
>   maintaining this extra logic that remembers if we tucked one mount
>   under another seems to make this them less ordinary.

I tend to argue that they are a bit more than ordinary, for they have the
ability to tuck.

> 
> - The symmetry for unmounting exists for a tucked mount.  We can unmount
>   it via propagation or we can unmount the mount above it, and then we
>   can unmount the new underlying mount.

this is fine with me.

>   So I don't see why we don't
>   want symmetry in the other case just because we mounted on top of
>   the mount and rather than had the mount tucked under us.

A tucked mount should be un-tuckable. I agree.  But a non-tucked mount
cannot pretend to be tucked and this is where I disagree.


> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 8bfad42c1ccf..8b00e0548438 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2047,8 +2047,10 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>  		hlist_del_init(&child->mnt_hash);
>  		q = __lookup_mnt(&child->mnt_parent->mnt,
>  				 child->mnt_mountpoint);
> -		if (q)
> +		if (q) {
>  			mnt_change_mountpoint(child, smp, q);
> +			child->mnt.mnt_flags |= MNT_TUCKED;
> +		}
>  		commit_tree(child);
>  	}
>  	put_mountpoint(smp);
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 5bc7896d122a..e2a6ac68feb9 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -327,6 +327,9 @@ static struct mount *find_topper(struct mount *mnt)
>  	/* If there is exactly one mount covering mnt completely return it. */
>  	struct mount *child;
> 
> +	if (!(mnt->mnt.mnt_flags & MNT_TUCKED))
> +		return NULL;
> +	
>  	if (!list_is_singular(&mnt->mnt_mounts))
>  		return NULL;
> 
> diff --git a/include/linux/mount.h b/include/linux/mount.h
> index 8e0352af06b7..25ca398b19b3 100644
> --- a/include/linux/mount.h
> +++ b/include/linux/mount.h
> @@ -52,6 +52,7 @@ struct mnt_namespace;
> 
>  #define MNT_INTERNAL	0x4000
> 
> +#define MNT_TUCKED		0x020000
>  #define MNT_LOCK_ATIME		0x040000
>  #define MNT_LOCK_NOEXEC		0x080000
>  #define MNT_LOCK_NOSUID		0x100000
> 
> Eric

-- 
Ram Pai

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-02-03 20:28                                               ` Ram Pai
@ 2017-02-03 20:58                                                 ` Eric W. Biederman
  2017-02-06  3:25                                                   ` Andrei Vagin
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-02-03 20:58 UTC (permalink / raw)
  To: Ram Pai; +Cc: Al Viro, linux-fsdevel, Andrei Vagin

Ram Pai <linuxram@us.ibm.com> writes:

> On Sat, Feb 04, 2017 at 07:26:20AM +1300, Eric W. Biederman wrote:
>> Ram Pai <linuxram@us.ibm.com> writes:
>> 
>> > On Fri, Feb 03, 2017 at 11:54:21PM +1300, Eric W. Biederman wrote:
>> >> ebiederm@xmission.com (Eric W. Biederman) writes:
>> >> 
>> >> > Ram Pai <linuxram@us.ibm.com> writes:
>> >> >
>> >> >> On Sat, Jan 21, 2017 at 05:15:29PM +1300, Eric W. Biederman wrote:
>> >> >>> Ram Pai <linuxram@us.ibm.com> writes:
>> >> >>> 
>> >> >>> >> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>> >> >>> >> 
>> >> >>> >>  	for (m = propagation_next(parent, parent); m;
>> >> >>> >>  	     		m = propagation_next(m, parent)) {
>> >> >>> >> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>> >> >>> >> -		if (child && list_empty(&child->mnt_mounts) &&
>> >> >>> >> -		    (ret = do_refcount_check(child, 1)))
>> >> >>> >> -			break;
>> >> >>> >> +		int count = 1;
>> >> >>> >> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>> >> >>> >> +		if (!child)
>> >> >>> >> +			continue;
>> >> >>> >> +
>> >> >>> >> +		/* Is there exactly one mount on the child that covers
>> >> >>> >> +		 * it completely whose reference should be ignored?
>> >> >>> >> +		 */
>> >> >>> >> +		topper = find_topper(child);
>> >> >>> >
>> >> >>> > This is tricky. I understand it is trying to identify the case where a
>> >> >>> > mount got tucked-in because of propagation.  But this will not
>> >> >>> > distinguish the case where a mount got over-mounted genuinely, not because of
>> >> >>> > propagation, but because of explicit user action.
>> >> >>> >
>> >> >>> >
>> >> >>> > example:
>> >> >>> >
>> >> >>> > case 1: (explicit user action)
>> >> >>> > 	B is a slave of A
>> >> >>> > 	mount something on A/a , it will propagate to B/a
>> >> >>> > 	and than mount something on B/a
>> >> >>> >
>> >> >>> > case 2: (tucked mount)
>> >> >>> > 	B is a slave of A
>> >> >>> > 	mount something on B/a
>> >> >>> > 	and than mount something on A/a
>> >> >>> >
>> >> >>> > Both case 1 and case 2 lead to the same mount configuration.
>> >> >>> >
>> >> >>> >
>> >> >>> > 	  however 'umount A/a' in case 1 should fail.
>> >> >>> > 	  and 'umount A/a' in case 2 should pass.
>> >> >>> >
>> >> >>> > Right? in other words, umounts of 'tucked mounts' should pass(case 2).
>> >> >>> > 	whereas umounts of mounts on which overmounts exist should
>> >> >>> > 		fail.(case 1)
>> >> >>> 
>> >> >>> Looking at your example.  I agree that case 1 will fail today.
>> >> >>
>> >> >> And should continue to fail. right? Your semantics change will pass it.
>> >> >
>> >> > I don't see why it should continue to fail.
>> >> >
>> >> >>> However my actual expectation would be for both mount configurations
>> >> >>> to behave the same.  In both cases something has been explicitly mounted
>> >> >>> on B/a and something has propagated to B/a.  In both cases the mount
>> >> >>> on top is what was explicitly mounted, and the mount below is what was
>> >> >>> propagated to B/a.
>> >> >>> 
>> >> >>> I don't see why the order of operations should matter.
>> >> >>
>> >> >> One of the subtle expectation is reversibility.
>> >> >>
>> >> >> Mount followed immediately by unmount has always passed and that is the
>> >> >> standard expectation always. Your proposed code will ensure that.
>> >> >>
>> >> >> However there is one other subtle expectaton.
>> >> >>
>> >> >> A mount cannot disappear if a user has explicitly mounted on top of it.
>> >> >>
>> >> >> your proposed code will not meet that expectation. 
>> >> >>
>> >> >> In other words, these two expectations make it behave differently even
>> >> >> when; arguably, they feel like the same configuration.
>> >> >
>> >> > I am not seeing that.
>> >> >
>> >> >
>> >> >
>> >> >>> 
>> >> >>> > maybe we need a flag to identify tucked mounts?
>> >> >>> 
>> >> >>> To preserve our exact current semantics yes.
>> >> >>> 
>> >> >>> The mount configurations that are delibearately constructed that I am
>> >> >>> aware of are comparatively simple.  I don't think anyone has even taken
>> >> >>> advantage of the shadow/side mounts at this point.  I made a reasonable
>> >> >>> effort to find out and no one was even aware they existed.  Much less
>> >> >>> what they were.  And certainly no one I talked to could find code that
>> >> >>> used them.
>> >> >>
>> >> >> But someday; even if its after a decade, someone ;) will
>> >> >> stumble into this semantics and wonder 'why?'. Its better to get it right
>> >> >> sooner. Sorry, I am blaming myself; for keeping some of the problems
>> >> >> open thinking no one will bump into them.
>> >> >
>> >> > Oh definitely.  If we have people ready to talk it through I am happy to
>> >> > dot as many i's and cross as many t's as we productively can.
>> >> >
>> >> > I was just pointing out that I don't have any reason to expect that any
>> >> > one depends on the subtle details of the implementation today so we
>> >> > still have some wiggle room to fix them.  Even if they are visible to
>> >> > user space.
>> >> 
>> >> So I haven't seen a reply, and we are getting awfully close to the merge
>> >> window.  Is there anything concrete we can do to ease concerns?
>> >> 
>> >> Right now I am thinking my last version of the patch is the likely the
>> >> best we have time and energy to manage and it would be good to merge it
>> >> before the code bit rots.
>> >
>> > I was waiting for some other opinions on the behavior, since I
>> > continue to think that 'one should not be able to unmount mounts on
>> > which a user has explicitly mounted upon'. I am happy to be overruled,
>> > since your patch significantly improves the rest of the semantics.
>> >
>> > Viro?
>> 
>> Ram Pai, just to be clear you were hoping to add the logic below to my patch?
>
> Yes. the behavior of your patch below is what I was proposing.
>
>> 
>> My objections to the snippet below are:
>> 
>> - It makes it hard for the CRIU folks (yet more state they have to find
>>   and restore).
>
> true. unfortunately one more subtle detail to be aware off.

A bit more than that, as it means that it requires an almost exact
playback of the sequence of mounts in all mount namespaces to
get to the point of reproducing a mount namespace.

>> - It feels subjectively worse to me.
>> 
>> - We already have cases where mounts are unmounted transparently (umount on rmdir).
>
> sorry. i am not aware of this case. some details will help.

The question:

What happens when we rmdir a directory that has a mount on it in another
mount namespace?

What happens when someone on the nfs server deletes a directory there
is a mount on?


It used to be that we returned -EBUSY, and refused the rmdir operation,
and we lied in the vfs about the nfs dentry being deleted to preserve
the mount.

In recent kernels I have done the work so that we transparently unmount
the mounts and allow the rmdir to happen.  An unprivileged user mounting
over say glibc and blocking the yum update of it is a pretty serious
bug.

>> - Al Viro claims that the side/shadow mounts are ordinary mounts and
>>   maintaining this extra logic that remembers if we tucked one mount
>>   under another seems to make this them less ordinary.
>
> I tend to argue that they are a bit more than ordinary, for they have the
> ability to tuck.
>
>> 
>> - The symmetry for unmounting exists for a tucked mount.  We can unmount
>>   it via propagation or we can unmount the mount above it, and then we
>>   can unmount the new underlying mount.
>
> this is fine with me.
>
>>   So I don't see why we don't
>>   want symmetry in the other case just because we mounted on top of
>>   the mount and rather than had the mount tucked under us.
>
> A tucked mount should be un-tuckable. I agree.  But a non-tucked mount
> cannot pretend to be tucked and this is where I disagree.

I have always seen the question as: Should a mount that is propagated be
unmountable via umount propagation.

Which leads me to think that allowing the umount propagation when it
won't change the applications view of files and filesystems is a good
thing.  From my perspective it also better preserves the reversibility
property that is important.   The mount propagated and now the unmount
propagated.


From a system management point of view one of the largest practical
problems with mount namespaces and mount propagation is: mounts that
propagate into another mount namespaces but don't get unmounted.


Which is to say not unmounting something (especially silently) and
leaving the filesystem busy when something could be unmounted is a
practical problem for people.


I am going to be out for a week, and I am leaving in a few minutes.
So I am going to push my patch to the my for-next branch, so there
is a reasonable chance of merging things when the merge window opens.

If the feedback is to add the MNT_TUCKED annotations to make the patch
suitable for merging to Linus's tree I will take care of that when
I get back.

Eric

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-02-03 20:58                                                 ` Eric W. Biederman
@ 2017-02-06  3:25                                                   ` Andrei Vagin
  2017-02-06 21:40                                                     ` Ram Pai
  0 siblings, 1 reply; 63+ messages in thread
From: Andrei Vagin @ 2017-02-06  3:25 UTC (permalink / raw)
  To: Ram Pai; +Cc: Eric W. Biederman, Al Viro, linux-fsdevel

On Sat, Feb 04, 2017 at 09:58:39AM +1300, Eric W. Biederman wrote:
> Ram Pai <linuxram@us.ibm.com> writes:
> 
> > On Sat, Feb 04, 2017 at 07:26:20AM +1300, Eric W. Biederman wrote:
> >> Ram Pai <linuxram@us.ibm.com> writes:
> >> 
> >> > On Fri, Feb 03, 2017 at 11:54:21PM +1300, Eric W. Biederman wrote:
> >> >> ebiederm@xmission.com (Eric W. Biederman) writes:
> >> >> 
> >> >> > Ram Pai <linuxram@us.ibm.com> writes:
> >> >> >
> >> >> >> On Sat, Jan 21, 2017 at 05:15:29PM +1300, Eric W. Biederman wrote:
> >> >> >>> Ram Pai <linuxram@us.ibm.com> writes:
> >> >> >>> 
> >> >> >>> >> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> >> >> >>> >> 
> >> >> >>> >>  	for (m = propagation_next(parent, parent); m;
> >> >> >>> >>  	     		m = propagation_next(m, parent)) {
> >> >> >>> >> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> >> >> >>> >> -		if (child && list_empty(&child->mnt_mounts) &&
> >> >> >>> >> -		    (ret = do_refcount_check(child, 1)))
> >> >> >>> >> -			break;
> >> >> >>> >> +		int count = 1;
> >> >> >>> >> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
> >> >> >>> >> +		if (!child)
> >> >> >>> >> +			continue;
> >> >> >>> >> +
> >> >> >>> >> +		/* Is there exactly one mount on the child that covers
> >> >> >>> >> +		 * it completely whose reference should be ignored?
> >> >> >>> >> +		 */
> >> >> >>> >> +		topper = find_topper(child);
> >> >> >>> >
> >> >> >>> > This is tricky. I understand it is trying to identify the case where a
> >> >> >>> > mount got tucked-in because of propagation.  But this will not
> >> >> >>> > distinguish the case where a mount got over-mounted genuinely, not because of
> >> >> >>> > propagation, but because of explicit user action.
> >> >> >>> >
> >> >> >>> >
> >> >> >>> > example:
> >> >> >>> >
> >> >> >>> > case 1: (explicit user action)
> >> >> >>> > 	B is a slave of A
> >> >> >>> > 	mount something on A/a , it will propagate to B/a
> >> >> >>> > 	and than mount something on B/a
> >> >> >>> >
> >> >> >>> > case 2: (tucked mount)
> >> >> >>> > 	B is a slave of A
> >> >> >>> > 	mount something on B/a
> >> >> >>> > 	and than mount something on A/a
> >> >> >>> >
> >> >> >>> > Both case 1 and case 2 lead to the same mount configuration.
> >> >> >>> >
> >> >> >>> >
> >> >> >>> > 	  however 'umount A/a' in case 1 should fail.
> >> >> >>> > 	  and 'umount A/a' in case 2 should pass.
> >> >> >>> >
> >> >> >>> > Right? in other words, umounts of 'tucked mounts' should pass(case 2).
> >> >> >>> > 	whereas umounts of mounts on which overmounts exist should
> >> >> >>> > 		fail.(case 1)
> >> >> >>> 
> >> >> >>> Looking at your example.  I agree that case 1 will fail today.
> >> >> >>
> >> >> >> And should continue to fail. right? Your semantics change will pass it.
> >> >> >
> >> >> > I don't see why it should continue to fail.
> >> >> >
> >> >> >>> However my actual expectation would be for both mount configurations
> >> >> >>> to behave the same.  In both cases something has been explicitly mounted
> >> >> >>> on B/a and something has propagated to B/a.  In both cases the mount
> >> >> >>> on top is what was explicitly mounted, and the mount below is what was
> >> >> >>> propagated to B/a.
> >> >> >>> 
> >> >> >>> I don't see why the order of operations should matter.
> >> >> >>
> >> >> >> One of the subtle expectation is reversibility.
> >> >> >>
> >> >> >> Mount followed immediately by unmount has always passed and that is the
> >> >> >> standard expectation always. Your proposed code will ensure that.
> >> >> >>
> >> >> >> However there is one other subtle expectaton.
> >> >> >>
> >> >> >> A mount cannot disappear if a user has explicitly mounted on top of it.
> >> >> >>
> >> >> >> your proposed code will not meet that expectation. 
> >> >> >>
> >> >> >> In other words, these two expectations make it behave differently even
> >> >> >> when; arguably, they feel like the same configuration.
> >> >> >
> >> >> > I am not seeing that.
> >> >> >
> >> >> >
> >> >> >
> >> >> >>> 
> >> >> >>> > maybe we need a flag to identify tucked mounts?
> >> >> >>> 
> >> >> >>> To preserve our exact current semantics yes.
> >> >> >>> 
> >> >> >>> The mount configurations that are delibearately constructed that I am
> >> >> >>> aware of are comparatively simple.  I don't think anyone has even taken
> >> >> >>> advantage of the shadow/side mounts at this point.  I made a reasonable
> >> >> >>> effort to find out and no one was even aware they existed.  Much less
> >> >> >>> what they were.  And certainly no one I talked to could find code that
> >> >> >>> used them.
> >> >> >>
> >> >> >> But someday; even if its after a decade, someone ;) will
> >> >> >> stumble into this semantics and wonder 'why?'. Its better to get it right
> >> >> >> sooner. Sorry, I am blaming myself; for keeping some of the problems
> >> >> >> open thinking no one will bump into them.
> >> >> >
> >> >> > Oh definitely.  If we have people ready to talk it through I am happy to
> >> >> > dot as many i's and cross as many t's as we productively can.
> >> >> >
> >> >> > I was just pointing out that I don't have any reason to expect that any
> >> >> > one depends on the subtle details of the implementation today so we
> >> >> > still have some wiggle room to fix them.  Even if they are visible to
> >> >> > user space.
> >> >> 
> >> >> So I haven't seen a reply, and we are getting awfully close to the merge
> >> >> window.  Is there anything concrete we can do to ease concerns?
> >> >> 
> >> >> Right now I am thinking my last version of the patch is the likely the
> >> >> best we have time and energy to manage and it would be good to merge it
> >> >> before the code bit rots.
> >> >
> >> > I was waiting for some other opinions on the behavior, since I
> >> > continue to think that 'one should not be able to unmount mounts on
> >> > which a user has explicitly mounted upon'. I am happy to be overruled,
> >> > since your patch significantly improves the rest of the semantics.
> >> >
> >> > Viro?
> >> 
> >> Ram Pai, just to be clear you were hoping to add the logic below to my patch?
> >
> > Yes. the behavior of your patch below is what I was proposing.
> >
> >> 
> >> My objections to the snippet below are:
> >> 
> >> - It makes it hard for the CRIU folks (yet more state they have to find
> >>   and restore).
> >
> > true. unfortunately one more subtle detail to be aware off.
> 
> A bit more than that, as it means that it requires an almost exact
> playback of the sequence of mounts in all mount namespaces to
> get to the point of reproducing a mount namespace.

Currently dump and restore of mount namespaces is the most complicated
part of CRIU. The main problem is that we don't know how a tree of
mounts was created.  Mounts have two types of relationships:
child<->parent and shared groups. Currently both of these relationships
can't be restored directly.  We can not add a mount into an existing
group, the group can only be inherited from a source mount.  And we can
not restore "tucked" mounts, which can only appear due to
propagation.

Now we don't know an algorithm to dump and restore any set of mount
points for a reasonable time. The problem becomes more complex if we
start thinking how to restore mount namespaces which live in different
user namespaces.


This patch from Eric together with my patch https://lkml.org/lkml/2017/1/23/712
can solve the problem of dumping and restoring mount namespaces.

> 
> >> - It feels subjectively worse to me.
> >> 
> >> - We already have cases where mounts are unmounted transparently (umount on rmdir).
> >
> > sorry. i am not aware of this case. some details will help.
> 
> The question:
> 
> What happens when we rmdir a directory that has a mount on it in another
> mount namespace?
> 
> What happens when someone on the nfs server deletes a directory there
> is a mount on?
> 
> 
> It used to be that we returned -EBUSY, and refused the rmdir operation,
> and we lied in the vfs about the nfs dentry being deleted to preserve
> the mount.
> 
> In recent kernels I have done the work so that we transparently unmount
> the mounts and allow the rmdir to happen.  An unprivileged user mounting
> over say glibc and blocking the yum update of it is a pretty serious
> bug.
> 
> >> - Al Viro claims that the side/shadow mounts are ordinary mounts and
> >>   maintaining this extra logic that remembers if we tucked one mount
> >>   under another seems to make this them less ordinary.
> >
> > I tend to argue that they are a bit more than ordinary, for they have the
> > ability to tuck.
> >
> >> 
> >> - The symmetry for unmounting exists for a tucked mount.  We can unmount
> >>   it via propagation or we can unmount the mount above it, and then we
> >>   can unmount the new underlying mount.
> >
> > this is fine with me.
> >
> >>   So I don't see why we don't
> >>   want symmetry in the other case just because we mounted on top of
> >>   the mount and rather than had the mount tucked under us.
> >
> > A tucked mount should be un-tuckable. I agree.  But a non-tucked mount
> > cannot pretend to be tucked and this is where I disagree.
> 
> I have always seen the question as: Should a mount that is propagated be
> unmountable via umount propagation.
> 
> Which leads me to think that allowing the umount propagation when it
> won't change the applications view of files and filesystems is a good
> thing.  From my perspective it also better preserves the reversability
> property that is important.   The mount propgated and now the unmount
> propagated.
> 
> 
> From a system management point of view one of the largest practical
> problems with mount namespaces and mount propagation is: mounts that
> propagate into another mount namespaces but don't get unmounted.
> 
> 
> Which is to say not unmounting something (especially silently) and
> leaving the filesystem busy when something could be unmounted is a
> practical problem for people.
> 
> 
> I am going to be out for a week, and I am leaving in a few minutes.
> So I am going to push my patch to the my for-next branch, so there
> is a reasonable chance of merging things when the merge window opens.
> 
> If the feedback is to add the MNT_TUCKED annotations to make the patch
> suitable for merging to Linus's tree I will take care of that when
> I get back.
> 
> Eric
> 
> 
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-02-06  3:25                                                   ` Andrei Vagin
@ 2017-02-06 21:40                                                     ` Ram Pai
  2017-02-07  6:35                                                       ` Andrei Vagin
  0 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-02-06 21:40 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Eric W. Biederman, Al Viro, linux-fsdevel

On Sun, Feb 05, 2017 at 07:25:09PM -0800, Andrei Vagin wrote:
> On Sat, Feb 04, 2017 at 09:58:39AM +1300, Eric W. Biederman wrote:
> > Ram Pai <linuxram@us.ibm.com> writes:
> > 
> > > On Sat, Feb 04, 2017 at 07:26:20AM +1300, Eric W. Biederman wrote:
> > >> Ram Pai <linuxram@us.ibm.com> writes:
> > >> 
> > >> > On Fri, Feb 03, 2017 at 11:54:21PM +1300, Eric W. Biederman wrote:
> > >> >> ebiederm@xmission.com (Eric W. Biederman) writes:
> > >> >> 
> > >> >> > Ram Pai <linuxram@us.ibm.com> writes:
> > >> >> >
> > >> >> >> On Sat, Jan 21, 2017 at 05:15:29PM +1300, Eric W. Biederman wrote:
> > >> >> >>> Ram Pai <linuxram@us.ibm.com> writes:
> > >> >> >>> 
> > >> >> >>> >> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> > >> >> >>> >> 
....snip....
> > 
> > A bit more than that, as it means that it requires an almost exact
> > playback of the sequence of mounts in all mount namespaces to
> > get to the point of reproducing a mount namespace.

> 
> Currently dump and restore of mount namespaces is the most complicated
> part of CRIU. The main problem is that we don't know how a tree of
> mounts was created.  Mounts have two types of relationships:
> child<->parent and shared groups. Currently both this relationships
> can't be restored directly.  We can not add a mount into an existing
> group, the group can be only inherited from a source mount.  And we can
> not restore "tucked" mounts, which can be only appeared due to
> propagation.
> 
> Now we don't know an algorithm to dump and restore any set of mount
> points for a reqsonable time. The problem becomes more complex if we
> start thinking how to restore mount namespaces which lives in different
> user namespaces.
> 
> 
> This patch from Eric together with my patch https://lkml.org/lkml/2017/1/23/712
> can solve the problem of dumping and restoring mount namespaces.

Let's say we exposed the tucked flag for the mount in the
/proc/*/mountinfo, then there will be no need to know the history.
It's just a matter of setting that flag as part of
your new MS_SET_GROUP mount() call. right?

Looking at the patch at https://lkml.org/lkml/2017/1/23/712, I am
guessing that CRIU dumps information from mountinfo to restore it
later.

RP

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-02-06 21:40                                                     ` Ram Pai
@ 2017-02-07  6:35                                                       ` Andrei Vagin
  0 siblings, 0 replies; 63+ messages in thread
From: Andrei Vagin @ 2017-02-07  6:35 UTC (permalink / raw)
  To: Ram Pai; +Cc: Eric W. Biederman, Al Viro, linux-fsdevel

On Mon, Feb 06, 2017 at 01:40:53PM -0800, Ram Pai wrote:
> On Sun, Feb 05, 2017 at 07:25:09PM -0800, Andrei Vagin wrote:
> > On Sat, Feb 04, 2017 at 09:58:39AM +1300, Eric W. Biederman wrote:
> > > Ram Pai <linuxram@us.ibm.com> writes:
> > > 
> > > > On Sat, Feb 04, 2017 at 07:26:20AM +1300, Eric W. Biederman wrote:
> > > >> Ram Pai <linuxram@us.ibm.com> writes:
> > > >> 
> > > >> > On Fri, Feb 03, 2017 at 11:54:21PM +1300, Eric W. Biederman wrote:
> > > >> >> ebiederm@xmission.com (Eric W. Biederman) writes:
> > > >> >> 
> > > >> >> > Ram Pai <linuxram@us.ibm.com> writes:
> > > >> >> >
> > > >> >> >> On Sat, Jan 21, 2017 at 05:15:29PM +1300, Eric W. Biederman wrote:
> > > >> >> >>> Ram Pai <linuxram@us.ibm.com> writes:
> > > >> >> >>> 
> > > >> >> >>> >> @@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> > > >> >> >>> >> 
> ....snip....
> > > 
> > > A bit more than that, as it means that it requires an almost exact
> > > playback of the sequence of mounts in all mount namespaces to
> > > get to the point of reproducing a mount namespace.
> 
> > 
> > Currently dump and restore of mount namespaces is the most complicated
> > part of CRIU. The main problem is that we don't know how a tree of
> > mounts was created.  Mounts have two types of relationships:
> > child<->parent and shared groups. Currently both this relationships
> > can't be restored directly.  We can not add a mount into an existing
> > group, the group can be only inherited from a source mount.  And we can
> > not restore "tucked" mounts, which can be only appeared due to

Sorry, I used a wrong term. I mean "shadow" mounts.

> > propagation.
> > 
> > Now we don't know an algorithm to dump and restore any set of mount
> > points for a reqsonable time. The problem becomes more complex if we
> > start thinking how to restore mount namespaces which lives in different
> > user namespaces.
> > 
> > 
> > This patch from Eric together with my patch https://lkml.org/lkml/2017/1/23/712
> > can solve the problem of dumping and restoring mount namespaces.
> 
> Lets say we exposed the tucked flag for the mount in the
> /proc/*/mountinfo, than there will be no need to know the history.
> Its just a matter or setting that flag as part of
> your new MS_SET_GROUP mount() call. right?

This is right.

> 
> Looking at the patch at https://lkml.org/lkml/2017/1/23/712, I am
> guessing the CRUI dumps information from mountinfo to restore it
> later.
> 
> RP
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-01-04 21:04               ` [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts Eric W. Biederman
  2017-01-07  5:06                 ` Al Viro
@ 2017-05-14  2:15                 ` Andrei Vagin
  2017-05-14  4:05                   ` Eric W. Biederman
  1 sibling, 1 reply; 63+ messages in thread
From: Andrei Vagin @ 2017-05-14  2:15 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Ram Pai

Hi Eric,

I found one issue with this patch. Take a look at the following script.
The idea is simple: we call mount twice and then we call umount twice,
but the second umount fails.

[root@fc24 ~]# cat m.sh
#!/bin/sh

set -x -e
mount -t tmpfs xxx /mnt
mkdir -p /mnt/1
mkdir -p /mnt/2
mount --bind /mnt/1 /mnt/1
mount --make-shared /mnt/1
mount --bind /mnt/1 /mnt/2
mkdir -p /mnt/1/1
for i in `seq 2`; do
	mount --bind /mnt/1/1 /mnt/1/1
done
for i in `seq 2`; do
	umount /mnt/1/1 || {
		cat /proc/self/mountinfo | grep xxx
		exit 1
	}
done

[root@fc24 ~]# unshare -Urm  ./m.sh
+ mount -t tmpfs xxx /mnt
+ mkdir -p /mnt/1
+ mkdir -p /mnt/2
+ mount --bind /mnt/1 /mnt/1
+ mount --make-shared /mnt/1
+ mount --bind /mnt/1 /mnt/2
+ mkdir -p /mnt/1/1
++ seq 2
+ for i in '`seq 2`'
+ mount --bind /mnt/1/1 /mnt/1/1
+ for i in '`seq 2`'
+ mount --bind /mnt/1/1 /mnt/1/1
++ seq 2
+ for i in '`seq 2`'
+ umount /mnt/1/1
+ for i in '`seq 2`'
+ umount /mnt/1/1
umount: /mnt/1/1: not mounted
+ cat /proc/self/mountinfo
+ grep xxx
147 116 0:42 / /mnt rw,relatime - tmpfs xxx rw
148 147 0:42 /1 /mnt/1 rw,relatime shared:65 - tmpfs xxx rw
149 147 0:42 /1 /mnt/2 rw,relatime shared:65 - tmpfs xxx rw
157 149 0:42 /1/1 /mnt/2/1 rw,relatime shared:65 - tmpfs xxx rw
+ exit 1

And you can see that /mnt/2 contains a mount, but it should not.

Thanks,
Andrei

On Thu, Jan 05, 2017 at 10:04:14AM +1300, Eric W. Biederman wrote:
> 
> Ever since mount propagation was introduced in cases where a mount in
> propagated to parent mount mountpoint pair that is already in use the
> code has placed the new mount behind the old mount in the mount hash
> table.
> 
> This implementation detail is problematic as it allows creating
> arbitrary length mount hash chains.
> 
> Furthermore it invalidates the constraint maintained elsewhere in the
> mount code that a parent mount and a mountpoint pair will have exactly
> one mount upon them.  Making it hard to deal with and to talk about
> this special case in the mount code.
> 
> Modify mount propagation to notice when there is already a mount at
> the parent mount and mountpoint where a new mount is propagating to
> and place that preexisting mount on top of the new mount.
> 
> Modify unmount propagation to notice when a mount that is being
> unmounted has another mount on top of it (and no other children), and
> to replace the unmounted mount with the mount on top of it.
> 
> Move the MNT_UMUONT test from __lookup_mnt_last into
> __propagate_umount as that is the only call of __lookup_mnt_last where
> MNT_UMOUNT may be set on any mount visible in the mount hash table.
> 
> These modifications allow:
>  - __lookup_mnt_last to be removed.
>  - attach_shadows to be renamed __attach_mnt and the it's shadow
>    handling to be removed.
>  - commit_tree to be simplified
>  - copy_tree to be simplified
> 
> The result is an easier to understand tree of mounts that does not
> allow creation of arbitrary length hash chains in the mount hash table.
> 
> v2: Updated to mnt_change_mountpoint to not call dput or mntput
> and instead to decrement the counts directly.  It is guaranteed
> that there will be other references when mnt_change_mountpoint is
> called so this is safe.
> 
> Cc: stable@vger.kernel.org
> Fixes: b90fa9ae8f51 ("[PATCH] shared mount handling: bind and rbind")
> Tested-by: Andrei Vagin <avagin@virtuozzo.com>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
> 
> Since the last version some of you may have seen I have modified
> my implementation of mnt_change_mountpoint so that it no longer calls
> mntput or dput but instead relies on the knowledge that it can not
> possibly have the last reference to the mnt and dentry of interest.
> This avoids code checking tools from complaining bitterly.
> 
> This is on top of my previous patch that sorts out locking of the
> mountpoint hash table.  After time giving ample time for review I intend
> to push this and the previous bug fix to Linus.
> 
>  fs/mount.h     |   1 -
>  fs/namespace.c | 110 +++++++++++++++++++++++++++++++--------------------------
>  fs/pnode.c     |  27 ++++++++++----
>  fs/pnode.h     |   2 ++
>  4 files changed, 82 insertions(+), 58 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index 2c856fc47ae3..2826543a131d 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
>  }
>  
>  extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
> -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
>  
>  extern int __legitimize_mnt(struct vfsmount *, unsigned);
>  extern bool legitimize_mnt(struct vfsmount *, unsigned);
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 487ba30bb5c6..91ccfb73f0e0 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
>  }
>  
>  /*
> - * find the last mount at @dentry on vfsmount @mnt.
> - * mount_lock must be held.
> - */
> -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
> -{
> -	struct mount *p, *res = NULL;
> -	p = __lookup_mnt(mnt, dentry);
> -	if (!p)
> -		goto out;
> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> -		res = p;
> -	hlist_for_each_entry_continue(p, mnt_hash) {
> -		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
> -			break;
> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> -			res = p;
> -	}
> -out:
> -	return res;
> -}
> -
> -/*
>   * lookup_mnt - Return the first child mount mounted at path
>   *
>   * "First" means first mounted chronologically.  If you create the
> @@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
>  	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
>  }
>  
> +static void __attach_mnt(struct mount *mnt, struct mount *parent)
> +{
> +	hlist_add_head_rcu(&mnt->mnt_hash,
> +			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
> +	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> +}
> +
>  /*
>   * vfsmount lock must be held for write
>   */
> @@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
>  			struct mountpoint *mp)
>  {
>  	mnt_set_mountpoint(parent, mp, mnt);
> -	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
> -	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> +	__attach_mnt(mnt, parent);
>  }
>  
> -static void attach_shadowed(struct mount *mnt,
> -			struct mount *parent,
> -			struct mount *shadows)
> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
>  {
> -	if (shadows) {
> -		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
> -		list_add(&mnt->mnt_child, &shadows->mnt_child);
> -	} else {
> -		hlist_add_head_rcu(&mnt->mnt_hash,
> -				m_hash(&parent->mnt, mnt->mnt_mountpoint));
> -		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> -	}
> +	struct mountpoint *old_mp = mnt->mnt_mp;
> +	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
> +	struct mount *old_parent = mnt->mnt_parent;
> +
> +	list_del_init(&mnt->mnt_child);
> +	hlist_del_init(&mnt->mnt_mp_list);
> +	hlist_del_init_rcu(&mnt->mnt_hash);
> +
> +	attach_mnt(mnt, parent, mp);
> +
> +	put_mountpoint(old_mp);
> +
> +	/*
> +	 * Safely avoid even the suggestion this code might sleep or
> +	 * lock the mount hash by taking avantage of the knowlege that
> +	 * mnt_change_mounpoint will not release the final reference
> +	 * to a mountpoint.
> +	 *
> +	 * During mounting, another mount will continue to use the old
> +	 * mountpoint and during unmounting, the old mountpoint will
> +	 * continue to exist until namespace_unlock which happens well
> +	 * after mnt_change_mountpoint.
> +	 */
> +	spin_lock(&old_mountpoint->d_lock);
> +	old_mountpoint->d_lockref.count--;
> +	spin_unlock(&old_mountpoint->d_lock);
> +
> +	mnt_add_count(old_parent, -1);
>  }
>  
>  /*
>   * vfsmount lock must be held for write
>   */
> -static void commit_tree(struct mount *mnt, struct mount *shadows)
> +static void commit_tree(struct mount *mnt)
>  {
>  	struct mount *parent = mnt->mnt_parent;
>  	struct mount *m;
> @@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
>  	n->mounts += n->pending_mounts;
>  	n->pending_mounts = 0;
>  
> -	attach_shadowed(mnt, parent, shadows);
> +	__attach_mnt(mnt, parent);
>  	touch_mnt_namespace(n);
>  }
>  
> @@ -1764,7 +1766,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
>  			continue;
>  
>  		for (s = r; s; s = next_mnt(s, r)) {
> -			struct mount *t = NULL;
>  			if (!(flag & CL_COPY_UNBINDABLE) &&
>  			    IS_MNT_UNBINDABLE(s)) {
>  				s = skip_mnt_tree(s);
> @@ -1786,14 +1787,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
>  				goto out;
>  			lock_mount_hash();
>  			list_add_tail(&q->mnt_list, &res->mnt_list);
> -			mnt_set_mountpoint(parent, p->mnt_mp, q);
> -			if (!list_empty(&parent->mnt_mounts)) {
> -				t = list_last_entry(&parent->mnt_mounts,
> -					struct mount, mnt_child);
> -				if (t->mnt_mp != p->mnt_mp)
> -					t = NULL;
> -			}
> -			attach_shadowed(q, parent, t);
> +			attach_mnt(q, parent, p->mnt_mp);
>  			unlock_mount_hash();
>  		}
>  	}
> @@ -1992,10 +1986,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>  {
>  	HLIST_HEAD(tree_list);
>  	struct mnt_namespace *ns = dest_mnt->mnt_ns;
> +	struct mountpoint *smp;
>  	struct mount *child, *p;
>  	struct hlist_node *n;
>  	int err;
>  
> +	/* Preallocate a mountpoint in case the new mounts need
> +	 * to be tucked under other mounts.
> +	 */
> +	smp = get_mountpoint(source_mnt->mnt.mnt_root);
> +	if (IS_ERR(smp))
> +		return PTR_ERR(smp);
> +
>  	/* Is there space to add these mounts to the mount namespace? */
>  	if (!parent_path) {
>  		err = count_mounts(ns, source_mnt);
> @@ -2022,17 +2024,22 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>  		touch_mnt_namespace(source_mnt->mnt_ns);
>  	} else {
>  		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
> -		commit_tree(source_mnt, NULL);
> +		commit_tree(source_mnt);
>  	}
>  
>  	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
> -		struct mount *q;
>  		hlist_del_init(&child->mnt_hash);
> -		q = __lookup_mnt_last(&child->mnt_parent->mnt,
> -				      child->mnt_mountpoint);
> -		commit_tree(child, q);
> +		if (child->mnt.mnt_root == smp->m_dentry) {
> +			struct mount *q;
> +			q = __lookup_mnt(&child->mnt_parent->mnt,
> +					 child->mnt_mountpoint);
> +			if (q)
> +				mnt_change_mountpoint(child, smp, q);
> +		}
> +		commit_tree(child);
>  	}
>  	unlock_mount_hash();
> +	put_mountpoint(smp);
>  
>  	return 0;
>  
> @@ -2046,6 +2053,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>  	cleanup_group_ids(source_mnt, NULL);
>   out:
>  	ns->pending_mounts = 0;
> +	put_mountpoint(smp);
>  	return err;
>  }
>  
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 06a793f4ae38..eb4331240fd1 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -327,6 +327,9 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
>   */
>  static inline int do_refcount_check(struct mount *mnt, int count)
>  {
> +	struct mount *topper = __lookup_mnt(&mnt->mnt, mnt->mnt.mnt_root);
> +	if (topper)
> +		count++;
>  	return mnt_get_count(mnt) > count;
>  }
>  
> @@ -359,7 +362,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>  
>  	for (m = propagation_next(parent, parent); m;
>  	     		m = propagation_next(m, parent)) {
> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>  		if (child && list_empty(&child->mnt_mounts) &&
>  		    (ret = do_refcount_check(child, 1)))
>  			break;
> @@ -381,7 +384,7 @@ void propagate_mount_unlock(struct mount *mnt)
>  
>  	for (m = propagation_next(parent, parent); m;
>  			m = propagation_next(m, parent)) {
> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>  		if (child)
>  			child->mnt.mnt_flags &= ~MNT_LOCKED;
>  	}
> @@ -399,9 +402,11 @@ static void mark_umount_candidates(struct mount *mnt)
>  
>  	for (m = propagation_next(parent, parent); m;
>  			m = propagation_next(m, parent)) {
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> +		struct mount *child = __lookup_mnt(&m->mnt,
>  						mnt->mnt_mountpoint);
> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
> +		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
> +			continue;
> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
>  			SET_MNT_MARK(child);
>  		}
>  	}
> @@ -420,8 +425,8 @@ static void __propagate_umount(struct mount *mnt)
>  
>  	for (m = propagation_next(parent, parent); m;
>  			m = propagation_next(m, parent)) {
> -
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> +		struct mount *topper;
> +		struct mount *child = __lookup_mnt(&m->mnt,
>  						mnt->mnt_mountpoint);
>  		/*
>  		 * umount the child only if the child has no children
> @@ -430,6 +435,16 @@ static void __propagate_umount(struct mount *mnt)
>  		if (!child || !IS_MNT_MARKED(child))
>  			continue;
>  		CLEAR_MNT_MARK(child);
> +
> +		/* If there is exactly one mount covering all of child
> +		 * replace child with that mount.
> +		 */
> +		topper = __lookup_mnt(&child->mnt, child->mnt.mnt_root);
> +		if (topper &&
> +		    (child->mnt_mounts.next == &topper->mnt_child) &&
> +		    (topper->mnt_child.next == &child->mnt_mounts))
> +			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, topper);
> +
>  		if (list_empty(&child->mnt_mounts)) {
>  			list_del_init(&child->mnt_child);
>  			child->mnt.mnt_flags |= MNT_UMOUNT;
> diff --git a/fs/pnode.h b/fs/pnode.h
> index 550f5a8b4fcf..dc87e65becd2 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
>  unsigned int mnt_get_count(struct mount *mnt);
>  void mnt_set_mountpoint(struct mount *, struct mountpoint *,
>  			struct mount *);
> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
> +			   struct mount *mnt);
>  struct mount *copy_tree(struct mount *, struct dentry *, int);
>  bool is_path_reachable(struct mount *, struct dentry *,
>  			 const struct path *root);
> -- 
> 2.10.1
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-05-14  2:15                 ` Andrei Vagin
@ 2017-05-14  4:05                   ` Eric W. Biederman
  2017-05-14  9:26                     ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-14  4:05 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Al Viro, linux-fsdevel, Ram Pai

Andrei Vagin <avagin@virtuozzo.com> writes:

> Hi Eric,
>
> I found one issue about this patch. Take a look at the next script. The
> idea is simple, we call mount twice and than we call umount twice, but
> the second umount fails.

Definitely odd.  I will take a look.

Eric

>
> [root@fc24 ~]# cat m.sh
> #!/bin/sh
>
> set -x -e
> mount -t tmpfs xxx /mnt
> mkdir -p /mnt/1
> mkdir -p /mnt/2
> mount --bind /mnt/1 /mnt/1
> mount --make-shared /mnt/1
> mount --bind /mnt/1 /mnt/2
> mkdir -p /mnt/1/1
> for i in `seq 2`; do
> 	mount --bind /mnt/1/1 /mnt/1/1
> done
> for i in `seq 2`; do
> 	umount /mnt/1/1 || {
> 		cat /proc/self/mountinfo | grep xxx
> 		exit 1
> 	}
> done
>
> [root@fc24 ~]# unshare -Urm  ./m.sh
> + mount -t tmpfs xxx /mnt
> + mkdir -p /mnt/1
> + mkdir -p /mnt/2
> + mount --bind /mnt/1 /mnt/1
> + mount --make-shared /mnt/1
> + mount --bind /mnt/1 /mnt/2
> + mkdir -p /mnt/1/1
> ++ seq 2
> + for i in '`seq 2`'
> + mount --bind /mnt/1/1 /mnt/1/1
> + for i in '`seq 2`'
> + mount --bind /mnt/1/1 /mnt/1/1
> ++ seq 2
> + for i in '`seq 2`'
> + umount /mnt/1/1
> + for i in '`seq 2`'
> + umount /mnt/1/1
> umount: /mnt/1/1: not mounted
> + cat /proc/self/mountinfo
> + grep xxx
> 147 116 0:42 / /mnt rw,relatime - tmpfs xxx rw
> 148 147 0:42 /1 /mnt/1 rw,relatime shared:65 - tmpfs xxx rw
> 149 147 0:42 /1 /mnt/2 rw,relatime shared:65 - tmpfs xxx rw
> 157 149 0:42 /1/1 /mnt/2/1 rw,relatime shared:65 - tmpfs xxx rw
> + exit 1
>
> And you can see that /mnt/2 contains a mount, but it should not.
>
> Thanks,
> Andrei
>
> On Thu, Jan 05, 2017 at 10:04:14AM +1300, Eric W. Biederman wrote:
>> 
>> Ever since mount propagation was introduced in cases where a mount in
>> propagated to parent mount mountpoint pair that is already in use the
>> code has placed the new mount behind the old mount in the mount hash
>> table.
>> 
>> This implementation detail is problematic as it allows creating
>> arbitrary length mount hash chains.
>> 
>> Furthermore it invalidates the constraint maintained elsewhere in the
>> mount code that a parent mount and a mountpoint pair will have exactly
>> one mount upon them.  Making it hard to deal with and to talk about
>> this special case in the mount code.
>> 
>> Modify mount propagation to notice when there is already a mount at
>> the parent mount and mountpoint where a new mount is propagating to
>> and place that preexisting mount on top of the new mount.
>> 
>> Modify unmount propagation to notice when a mount that is being
>> unmounted has another mount on top of it (and no other children), and
>> to replace the unmounted mount with the mount on top of it.
>> 
>> Move the MNT_UMUONT test from __lookup_mnt_last into
>> __propagate_umount as that is the only call of __lookup_mnt_last where
>> MNT_UMOUNT may be set on any mount visible in the mount hash table.
>> 
>> These modifications allow:
>>  - __lookup_mnt_last to be removed.
>>  - attach_shadows to be renamed __attach_mnt and the it's shadow
>>    handling to be removed.
>>  - commit_tree to be simplified
>>  - copy_tree to be simplified
>> 
>> The result is an easier to understand tree of mounts that does not
>> allow creation of arbitrary length hash chains in the mount hash table.
>> 
>> v2: Updated to mnt_change_mountpoint to not call dput or mntput
>> and instead to decrement the counts directly.  It is guaranteed
>> that there will be other references when mnt_change_mountpoint is
>> called so this is safe.
>> 
>> Cc: stable@vger.kernel.org
>> Fixes: b90fa9ae8f51 ("[PATCH] shared mount handling: bind and rbind")
>> Tested-by: Andrei Vagin <avagin@virtuozzo.com>
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>> 
>> Since the last version some of you may have seen I have modified
>> my implementation of mnt_change_mountpoint so that it no longer calls
>> mntput or dput but instead relies on the knowledge that it can not
>> possibly have the last reference to the mnt and dentry of interest.
>> This avoids code checking tools from complaining bitterly.
>> 
>> This is on top of my previous patch that sorts out locking of the
>> mountpoint hash table.  After time giving ample time for review I intend
>> to push this and the previous bug fix to Linus.
>> 
>>  fs/mount.h     |   1 -
>>  fs/namespace.c | 110 +++++++++++++++++++++++++++++++--------------------------
>>  fs/pnode.c     |  27 ++++++++++----
>>  fs/pnode.h     |   2 ++
>>  4 files changed, 82 insertions(+), 58 deletions(-)
>> 
>> diff --git a/fs/mount.h b/fs/mount.h
>> index 2c856fc47ae3..2826543a131d 100644
>> --- a/fs/mount.h
>> +++ b/fs/mount.h
>> @@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
>>  }
>>  
>>  extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
>> -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
>>  
>>  extern int __legitimize_mnt(struct vfsmount *, unsigned);
>>  extern bool legitimize_mnt(struct vfsmount *, unsigned);
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 487ba30bb5c6..91ccfb73f0e0 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
>>  }
>>  
>>  /*
>> - * find the last mount at @dentry on vfsmount @mnt.
>> - * mount_lock must be held.
>> - */
>> -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
>> -{
>> -	struct mount *p, *res = NULL;
>> -	p = __lookup_mnt(mnt, dentry);
>> -	if (!p)
>> -		goto out;
>> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
>> -		res = p;
>> -	hlist_for_each_entry_continue(p, mnt_hash) {
>> -		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
>> -			break;
>> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
>> -			res = p;
>> -	}
>> -out:
>> -	return res;
>> -}
>> -
>> -/*
>>   * lookup_mnt - Return the first child mount mounted at path
>>   *
>>   * "First" means first mounted chronologically.  If you create the
>> @@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
>>  	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
>>  }
>>  
>> +static void __attach_mnt(struct mount *mnt, struct mount *parent)
>> +{
>> +	hlist_add_head_rcu(&mnt->mnt_hash,
>> +			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
>> +	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
>> +}
>> +
>>  /*
>>   * vfsmount lock must be held for write
>>   */
>> @@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
>>  			struct mountpoint *mp)
>>  {
>>  	mnt_set_mountpoint(parent, mp, mnt);
>> -	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
>> -	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
>> +	__attach_mnt(mnt, parent);
>>  }
>>  
>> -static void attach_shadowed(struct mount *mnt,
>> -			struct mount *parent,
>> -			struct mount *shadows)
>> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
>>  {
>> -	if (shadows) {
>> -		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
>> -		list_add(&mnt->mnt_child, &shadows->mnt_child);
>> -	} else {
>> -		hlist_add_head_rcu(&mnt->mnt_hash,
>> -				m_hash(&parent->mnt, mnt->mnt_mountpoint));
>> -		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
>> -	}
>> +	struct mountpoint *old_mp = mnt->mnt_mp;
>> +	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
>> +	struct mount *old_parent = mnt->mnt_parent;
>> +
>> +	list_del_init(&mnt->mnt_child);
>> +	hlist_del_init(&mnt->mnt_mp_list);
>> +	hlist_del_init_rcu(&mnt->mnt_hash);
>> +
>> +	attach_mnt(mnt, parent, mp);
>> +
>> +	put_mountpoint(old_mp);
>> +
>> +	/*
>> +	 * Safely avoid even the suggestion this code might sleep or
>> +	 * lock the mount hash by taking avantage of the knowlege that
>> +	 * mnt_change_mounpoint will not release the final reference
>> +	 * to a mountpoint.
>> +	 *
>> +	 * During mounting, another mount will continue to use the old
>> +	 * mountpoint and during unmounting, the old mountpoint will
>> +	 * continue to exist until namespace_unlock which happens well
>> +	 * after mnt_change_mountpoint.
>> +	 */
>> +	spin_lock(&old_mountpoint->d_lock);
>> +	old_mountpoint->d_lockref.count--;
>> +	spin_unlock(&old_mountpoint->d_lock);
>> +
>> +	mnt_add_count(old_parent, -1);
>>  }
>>  
>>  /*
>>   * vfsmount lock must be held for write
>>   */
>> -static void commit_tree(struct mount *mnt, struct mount *shadows)
>> +static void commit_tree(struct mount *mnt)
>>  {
>>  	struct mount *parent = mnt->mnt_parent;
>>  	struct mount *m;
>> @@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
>>  	n->mounts += n->pending_mounts;
>>  	n->pending_mounts = 0;
>>  
>> -	attach_shadowed(mnt, parent, shadows);
>> +	__attach_mnt(mnt, parent);
>>  	touch_mnt_namespace(n);
>>  }
>>  
>> @@ -1764,7 +1766,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
>>  			continue;
>>  
>>  		for (s = r; s; s = next_mnt(s, r)) {
>> -			struct mount *t = NULL;
>>  			if (!(flag & CL_COPY_UNBINDABLE) &&
>>  			    IS_MNT_UNBINDABLE(s)) {
>>  				s = skip_mnt_tree(s);
>> @@ -1786,14 +1787,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
>>  				goto out;
>>  			lock_mount_hash();
>>  			list_add_tail(&q->mnt_list, &res->mnt_list);
>> -			mnt_set_mountpoint(parent, p->mnt_mp, q);
>> -			if (!list_empty(&parent->mnt_mounts)) {
>> -				t = list_last_entry(&parent->mnt_mounts,
>> -					struct mount, mnt_child);
>> -				if (t->mnt_mp != p->mnt_mp)
>> -					t = NULL;
>> -			}
>> -			attach_shadowed(q, parent, t);
>> +			attach_mnt(q, parent, p->mnt_mp);
>>  			unlock_mount_hash();
>>  		}
>>  	}
>> @@ -1992,10 +1986,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>>  {
>>  	HLIST_HEAD(tree_list);
>>  	struct mnt_namespace *ns = dest_mnt->mnt_ns;
>> +	struct mountpoint *smp;
>>  	struct mount *child, *p;
>>  	struct hlist_node *n;
>>  	int err;
>>  
>> +	/* Preallocate a mountpoint in case the new mounts need
>> +	 * to be tucked under other mounts.
>> +	 */
>> +	smp = get_mountpoint(source_mnt->mnt.mnt_root);
>> +	if (IS_ERR(smp))
>> +		return PTR_ERR(smp);
>> +
>>  	/* Is there space to add these mounts to the mount namespace? */
>>  	if (!parent_path) {
>>  		err = count_mounts(ns, source_mnt);
>> @@ -2022,17 +2024,22 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>>  		touch_mnt_namespace(source_mnt->mnt_ns);
>>  	} else {
>>  		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
>> -		commit_tree(source_mnt, NULL);
>> +		commit_tree(source_mnt);
>>  	}
>>  
>>  	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
>> -		struct mount *q;
>>  		hlist_del_init(&child->mnt_hash);
>> -		q = __lookup_mnt_last(&child->mnt_parent->mnt,
>> -				      child->mnt_mountpoint);
>> -		commit_tree(child, q);
>> +		if (child->mnt.mnt_root == smp->m_dentry) {
>> +			struct mount *q;
>> +			q = __lookup_mnt(&child->mnt_parent->mnt,
>> +					 child->mnt_mountpoint);
>> +			if (q)
>> +				mnt_change_mountpoint(child, smp, q);
>> +		}
>> +		commit_tree(child);
>>  	}
>>  	unlock_mount_hash();
>> +	put_mountpoint(smp);
>>  
>>  	return 0;
>>  
>> @@ -2046,6 +2053,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>>  	cleanup_group_ids(source_mnt, NULL);
>>   out:
>>  	ns->pending_mounts = 0;
>> +	put_mountpoint(smp);
>>  	return err;
>>  }
>>  
>> diff --git a/fs/pnode.c b/fs/pnode.c
>> index 06a793f4ae38..eb4331240fd1 100644
>> --- a/fs/pnode.c
>> +++ b/fs/pnode.c
>> @@ -327,6 +327,9 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
>>   */
>>  static inline int do_refcount_check(struct mount *mnt, int count)
>>  {
>> +	struct mount *topper = __lookup_mnt(&mnt->mnt, mnt->mnt.mnt_root);
>> +	if (topper)
>> +		count++;
>>  	return mnt_get_count(mnt) > count;
>>  }
>>  
>> @@ -359,7 +362,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>>  
>>  	for (m = propagation_next(parent, parent); m;
>>  	     		m = propagation_next(m, parent)) {
>> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>>  		if (child && list_empty(&child->mnt_mounts) &&
>>  		    (ret = do_refcount_check(child, 1)))
>>  			break;
>> @@ -381,7 +384,7 @@ void propagate_mount_unlock(struct mount *mnt)
>>  
>>  	for (m = propagation_next(parent, parent); m;
>>  			m = propagation_next(m, parent)) {
>> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>>  		if (child)
>>  			child->mnt.mnt_flags &= ~MNT_LOCKED;
>>  	}
>> @@ -399,9 +402,11 @@ static void mark_umount_candidates(struct mount *mnt)
>>  
>>  	for (m = propagation_next(parent, parent); m;
>>  			m = propagation_next(m, parent)) {
>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>> +		struct mount *child = __lookup_mnt(&m->mnt,
>>  						mnt->mnt_mountpoint);
>> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
>> +		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
>> +			continue;
>> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
>>  			SET_MNT_MARK(child);
>>  		}
>>  	}
>> @@ -420,8 +425,8 @@ static void __propagate_umount(struct mount *mnt)
>>  
>>  	for (m = propagation_next(parent, parent); m;
>>  			m = propagation_next(m, parent)) {
>> -
>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>> +		struct mount *topper;
>> +		struct mount *child = __lookup_mnt(&m->mnt,
>>  						mnt->mnt_mountpoint);
>>  		/*
>>  		 * umount the child only if the child has no children
>> @@ -430,6 +435,16 @@ static void __propagate_umount(struct mount *mnt)
>>  		if (!child || !IS_MNT_MARKED(child))
>>  			continue;
>>  		CLEAR_MNT_MARK(child);
>> +
>> +		/* If there is exactly one mount covering all of child
>> +		 * replace child with that mount.
>> +		 */
>> +		topper = __lookup_mnt(&child->mnt, child->mnt.mnt_root);
>> +		if (topper &&
>> +		    (child->mnt_mounts.next == &topper->mnt_child) &&
>> +		    (topper->mnt_child.next == &child->mnt_mounts))
>> +			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, topper);
>> +
>>  		if (list_empty(&child->mnt_mounts)) {
>>  			list_del_init(&child->mnt_child);
>>  			child->mnt.mnt_flags |= MNT_UMOUNT;
>> diff --git a/fs/pnode.h b/fs/pnode.h
>> index 550f5a8b4fcf..dc87e65becd2 100644
>> --- a/fs/pnode.h
>> +++ b/fs/pnode.h
>> @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
>>  unsigned int mnt_get_count(struct mount *mnt);
>>  void mnt_set_mountpoint(struct mount *, struct mountpoint *,
>>  			struct mount *);
>> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
>> +			   struct mount *mnt);
>>  struct mount *copy_tree(struct mount *, struct dentry *, int);
>>  bool is_path_reachable(struct mount *, struct dentry *,
>>  			 const struct path *root);
>> -- 
>> 2.10.1
>> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-05-14  4:05                   ` Eric W. Biederman
@ 2017-05-14  9:26                     ` Eric W. Biederman
  2017-05-15 18:27                       ` Andrei Vagin
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-14  9:26 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Al Viro, linux-fsdevel, Ram Pai

ebiederm@xmission.com (Eric W. Biederman) writes:

> Andrei Vagin <avagin@virtuozzo.com> writes:
>
>> Hi Eric,
>>
>> I found one issue about this patch. Take a look at the next script. The
>> idea is simple, we call mount twice and than we call umount twice, but
>> the second umount fails.
>
> Definitely an odd.  I will take a look.

After a little more looking I have to say the only thing I can see wrong
in the behavior is that the first umount doesn't unmount everything.

Changing the mount propagation tree while it is being traversed
apparently prevents a complete traversal of the propagation tree.

Last time I was looking at this I was playing with a multipass algorithm
because of these peculiarities that happen with propagation trees.

I suspect we are going to need to fix umount to behave correctly before
we tune it for performance, so that we actually know what the correct
behavior is.

Eric


>>
>> [root@fc24 ~]# cat m.sh
>> #!/bin/sh
>>
>> set -x -e
>> mount -t tmpfs xxx /mnt
>> mkdir -p /mnt/1
>> mkdir -p /mnt/2
>> mount --bind /mnt/1 /mnt/1
>> mount --make-shared /mnt/1
>> mount --bind /mnt/1 /mnt/2
>> mkdir -p /mnt/1/1
>> for i in `seq 2`; do
>> 	mount --bind /mnt/1/1 /mnt/1/1
>> done
>> for i in `seq 2`; do
>> 	umount /mnt/1/1 || {
>> 		cat /proc/self/mountinfo | grep xxx
>> 		exit 1
>> 	}
>> done
>>
>> [root@fc24 ~]# unshare -Urm  ./m.sh
>> + mount -t tmpfs xxx /mnt
>> + mkdir -p /mnt/1
>> + mkdir -p /mnt/2
>> + mount --bind /mnt/1 /mnt/1
>> + mount --make-shared /mnt/1
>> + mount --bind /mnt/1 /mnt/2
>> + mkdir -p /mnt/1/1
>> ++ seq 2
>> + for i in '`seq 2`'
>> + mount --bind /mnt/1/1 /mnt/1/1
>> + for i in '`seq 2`'
>> + mount --bind /mnt/1/1 /mnt/1/1
>> ++ seq 2
>> + for i in '`seq 2`'
>> + umount /mnt/1/1
>> + for i in '`seq 2`'
>> + umount /mnt/1/1
>> umount: /mnt/1/1: not mounted
>> + cat /proc/self/mountinfo
>> + grep xxx
>> 147 116 0:42 / /mnt rw,relatime - tmpfs xxx rw
>> 148 147 0:42 /1 /mnt/1 rw,relatime shared:65 - tmpfs xxx rw
>> 149 147 0:42 /1 /mnt/2 rw,relatime shared:65 - tmpfs xxx rw
>> 157 149 0:42 /1/1 /mnt/2/1 rw,relatime shared:65 - tmpfs xxx rw
>> + exit 1
>>
>> And you can see that /mnt/2 contains a mount, but it should not.
>>
>> Thanks,
>> Andrei
>>
>> On Thu, Jan 05, 2017 at 10:04:14AM +1300, Eric W. Biederman wrote:
>>> 
>>> Ever since mount propagation was introduced in cases where a mount in
>>> propagated to parent mount mountpoint pair that is already in use the
>>> code has placed the new mount behind the old mount in the mount hash
>>> table.
>>> 
>>> This implementation detail is problematic as it allows creating
>>> arbitrary length mount hash chains.
>>> 
>>> Furthermore it invalidates the constraint maintained elsewhere in the
>>> mount code that a parent mount and a mountpoint pair will have exactly
>>> one mount upon them.  Making it hard to deal with and to talk about
>>> this special case in the mount code.
>>> 
>>> Modify mount propagation to notice when there is already a mount at
>>> the parent mount and mountpoint where a new mount is propagating to
>>> and place that preexisting mount on top of the new mount.
>>> 
>>> Modify unmount propagation to notice when a mount that is being
>>> unmounted has another mount on top of it (and no other children), and
>>> to replace the unmounted mount with the mount on top of it.
>>> 
>>> Move the MNT_UMUONT test from __lookup_mnt_last into
>>> __propagate_umount as that is the only call of __lookup_mnt_last where
>>> MNT_UMOUNT may be set on any mount visible in the mount hash table.
>>> 
>>> These modifications allow:
>>>  - __lookup_mnt_last to be removed.
>>>  - attach_shadows to be renamed __attach_mnt and the it's shadow
>>>    handling to be removed.
>>>  - commit_tree to be simplified
>>>  - copy_tree to be simplified
>>> 
>>> The result is an easier to understand tree of mounts that does not
>>> allow creation of arbitrary length hash chains in the mount hash table.
>>> 
>>> v2: Updated to mnt_change_mountpoint to not call dput or mntput
>>> and instead to decrement the counts directly.  It is guaranteed
>>> that there will be other references when mnt_change_mountpoint is
>>> called so this is safe.
>>> 
>>> Cc: stable@vger.kernel.org
>>> Fixes: b90fa9ae8f51 ("[PATCH] shared mount handling: bind and rbind")
>>> Tested-by: Andrei Vagin <avagin@virtuozzo.com>
>>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>> ---
>>> 
>>> Since the last version some of you may have seen I have modified
>>> my implementation of mnt_change_mountpoint so that it no longer calls
>>> mntput or dput but instead relies on the knowledge that it can not
>>> possibly have the last reference to the mnt and dentry of interest.
>>> This avoids code checking tools from complaining bitterly.
>>> 
>>> This is on top of my previous patch that sorts out locking of the
>>> mountpoint hash table.  After time giving ample time for review I intend
>>> to push this and the previous bug fix to Linus.
>>> 
>>>  fs/mount.h     |   1 -
>>>  fs/namespace.c | 110 +++++++++++++++++++++++++++++++--------------------------
>>>  fs/pnode.c     |  27 ++++++++++----
>>>  fs/pnode.h     |   2 ++
>>>  4 files changed, 82 insertions(+), 58 deletions(-)
>>> 
>>> diff --git a/fs/mount.h b/fs/mount.h
>>> index 2c856fc47ae3..2826543a131d 100644
>>> --- a/fs/mount.h
>>> +++ b/fs/mount.h
>>> @@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
>>>  }
>>>  
>>>  extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
>>> -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
>>>  
>>>  extern int __legitimize_mnt(struct vfsmount *, unsigned);
>>>  extern bool legitimize_mnt(struct vfsmount *, unsigned);
>>> diff --git a/fs/namespace.c b/fs/namespace.c
>>> index 487ba30bb5c6..91ccfb73f0e0 100644
>>> --- a/fs/namespace.c
>>> +++ b/fs/namespace.c
>>> @@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
>>>  }
>>>  
>>>  /*
>>> - * find the last mount at @dentry on vfsmount @mnt.
>>> - * mount_lock must be held.
>>> - */
>>> -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
>>> -{
>>> -	struct mount *p, *res = NULL;
>>> -	p = __lookup_mnt(mnt, dentry);
>>> -	if (!p)
>>> -		goto out;
>>> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
>>> -		res = p;
>>> -	hlist_for_each_entry_continue(p, mnt_hash) {
>>> -		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
>>> -			break;
>>> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
>>> -			res = p;
>>> -	}
>>> -out:
>>> -	return res;
>>> -}
>>> -
>>> -/*
>>>   * lookup_mnt - Return the first child mount mounted at path
>>>   *
>>>   * "First" means first mounted chronologically.  If you create the
>>> @@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
>>>  	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
>>>  }
>>>  
>>> +static void __attach_mnt(struct mount *mnt, struct mount *parent)
>>> +{
>>> +	hlist_add_head_rcu(&mnt->mnt_hash,
>>> +			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
>>> +	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
>>> +}
>>> +
>>>  /*
>>>   * vfsmount lock must be held for write
>>>   */
>>> @@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
>>>  			struct mountpoint *mp)
>>>  {
>>>  	mnt_set_mountpoint(parent, mp, mnt);
>>> -	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
>>> -	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
>>> +	__attach_mnt(mnt, parent);
>>>  }
>>>  
>>> -static void attach_shadowed(struct mount *mnt,
>>> -			struct mount *parent,
>>> -			struct mount *shadows)
>>> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
>>>  {
>>> -	if (shadows) {
>>> -		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
>>> -		list_add(&mnt->mnt_child, &shadows->mnt_child);
>>> -	} else {
>>> -		hlist_add_head_rcu(&mnt->mnt_hash,
>>> -				m_hash(&parent->mnt, mnt->mnt_mountpoint));
>>> -		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
>>> -	}
>>> +	struct mountpoint *old_mp = mnt->mnt_mp;
>>> +	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
>>> +	struct mount *old_parent = mnt->mnt_parent;
>>> +
>>> +	list_del_init(&mnt->mnt_child);
>>> +	hlist_del_init(&mnt->mnt_mp_list);
>>> +	hlist_del_init_rcu(&mnt->mnt_hash);
>>> +
>>> +	attach_mnt(mnt, parent, mp);
>>> +
>>> +	put_mountpoint(old_mp);
>>> +
>>> +	/*
>>> +	 * Safely avoid even the suggestion this code might sleep or
>>> +	 * lock the mount hash by taking avantage of the knowlege that
>>> +	 * mnt_change_mounpoint will not release the final reference
>>> +	 * to a mountpoint.
>>> +	 *
>>> +	 * During mounting, another mount will continue to use the old
>>> +	 * mountpoint and during unmounting, the old mountpoint will
>>> +	 * continue to exist until namespace_unlock which happens well
>>> +	 * after mnt_change_mountpoint.
>>> +	 */
>>> +	spin_lock(&old_mountpoint->d_lock);
>>> +	old_mountpoint->d_lockref.count--;
>>> +	spin_unlock(&old_mountpoint->d_lock);
>>> +
>>> +	mnt_add_count(old_parent, -1);
>>>  }
>>>  
>>>  /*
>>>   * vfsmount lock must be held for write
>>>   */
>>> -static void commit_tree(struct mount *mnt, struct mount *shadows)
>>> +static void commit_tree(struct mount *mnt)
>>>  {
>>>  	struct mount *parent = mnt->mnt_parent;
>>>  	struct mount *m;
>>> @@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
>>>  	n->mounts += n->pending_mounts;
>>>  	n->pending_mounts = 0;
>>>  
>>> -	attach_shadowed(mnt, parent, shadows);
>>> +	__attach_mnt(mnt, parent);
>>>  	touch_mnt_namespace(n);
>>>  }
>>>  
>>> @@ -1764,7 +1766,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
>>>  			continue;
>>>  
>>>  		for (s = r; s; s = next_mnt(s, r)) {
>>> -			struct mount *t = NULL;
>>>  			if (!(flag & CL_COPY_UNBINDABLE) &&
>>>  			    IS_MNT_UNBINDABLE(s)) {
>>>  				s = skip_mnt_tree(s);
>>> @@ -1786,14 +1787,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
>>>  				goto out;
>>>  			lock_mount_hash();
>>>  			list_add_tail(&q->mnt_list, &res->mnt_list);
>>> -			mnt_set_mountpoint(parent, p->mnt_mp, q);
>>> -			if (!list_empty(&parent->mnt_mounts)) {
>>> -				t = list_last_entry(&parent->mnt_mounts,
>>> -					struct mount, mnt_child);
>>> -				if (t->mnt_mp != p->mnt_mp)
>>> -					t = NULL;
>>> -			}
>>> -			attach_shadowed(q, parent, t);
>>> +			attach_mnt(q, parent, p->mnt_mp);
>>>  			unlock_mount_hash();
>>>  		}
>>>  	}
>>> @@ -1992,10 +1986,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>>>  {
>>>  	HLIST_HEAD(tree_list);
>>>  	struct mnt_namespace *ns = dest_mnt->mnt_ns;
>>> +	struct mountpoint *smp;
>>>  	struct mount *child, *p;
>>>  	struct hlist_node *n;
>>>  	int err;
>>>  
>>> +	/* Preallocate a mountpoint in case the new mounts need
>>> +	 * to be tucked under other mounts.
>>> +	 */
>>> +	smp = get_mountpoint(source_mnt->mnt.mnt_root);
>>> +	if (IS_ERR(smp))
>>> +		return PTR_ERR(smp);
>>> +
>>>  	/* Is there space to add these mounts to the mount namespace? */
>>>  	if (!parent_path) {
>>>  		err = count_mounts(ns, source_mnt);
>>> @@ -2022,17 +2024,22 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>>>  		touch_mnt_namespace(source_mnt->mnt_ns);
>>>  	} else {
>>>  		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
>>> -		commit_tree(source_mnt, NULL);
>>> +		commit_tree(source_mnt);
>>>  	}
>>>  
>>>  	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
>>> -		struct mount *q;
>>>  		hlist_del_init(&child->mnt_hash);
>>> -		q = __lookup_mnt_last(&child->mnt_parent->mnt,
>>> -				      child->mnt_mountpoint);
>>> -		commit_tree(child, q);
>>> +		if (child->mnt.mnt_root == smp->m_dentry) {
>>> +			struct mount *q;
>>> +			q = __lookup_mnt(&child->mnt_parent->mnt,
>>> +					 child->mnt_mountpoint);
>>> +			if (q)
>>> +				mnt_change_mountpoint(child, smp, q);
>>> +		}
>>> +		commit_tree(child);
>>>  	}
>>>  	unlock_mount_hash();
>>> +	put_mountpoint(smp);
>>>  
>>>  	return 0;
>>>  
>>> @@ -2046,6 +2053,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>>>  	cleanup_group_ids(source_mnt, NULL);
>>>   out:
>>>  	ns->pending_mounts = 0;
>>> +	put_mountpoint(smp);
>>>  	return err;
>>>  }
>>>  
>>> diff --git a/fs/pnode.c b/fs/pnode.c
>>> index 06a793f4ae38..eb4331240fd1 100644
>>> --- a/fs/pnode.c
>>> +++ b/fs/pnode.c
>>> @@ -327,6 +327,9 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
>>>   */
>>>  static inline int do_refcount_check(struct mount *mnt, int count)
>>>  {
>>> +	struct mount *topper = __lookup_mnt(&mnt->mnt, mnt->mnt.mnt_root);
>>> +	if (topper)
>>> +		count++;
>>>  	return mnt_get_count(mnt) > count;
>>>  }
>>>  
>>> @@ -359,7 +362,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>>>  
>>>  	for (m = propagation_next(parent, parent); m;
>>>  	     		m = propagation_next(m, parent)) {
>>> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>>> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>>>  		if (child && list_empty(&child->mnt_mounts) &&
>>>  		    (ret = do_refcount_check(child, 1)))
>>>  			break;
>>> @@ -381,7 +384,7 @@ void propagate_mount_unlock(struct mount *mnt)
>>>  
>>>  	for (m = propagation_next(parent, parent); m;
>>>  			m = propagation_next(m, parent)) {
>>> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
>>> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>>>  		if (child)
>>>  			child->mnt.mnt_flags &= ~MNT_LOCKED;
>>>  	}
>>> @@ -399,9 +402,11 @@ static void mark_umount_candidates(struct mount *mnt)
>>>  
>>>  	for (m = propagation_next(parent, parent); m;
>>>  			m = propagation_next(m, parent)) {
>>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>>> +		struct mount *child = __lookup_mnt(&m->mnt,
>>>  						mnt->mnt_mountpoint);
>>> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
>>> +		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
>>> +			continue;
>>> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
>>>  			SET_MNT_MARK(child);
>>>  		}
>>>  	}
>>> @@ -420,8 +425,8 @@ static void __propagate_umount(struct mount *mnt)
>>>  
>>>  	for (m = propagation_next(parent, parent); m;
>>>  			m = propagation_next(m, parent)) {
>>> -
>>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>>> +		struct mount *topper;
>>> +		struct mount *child = __lookup_mnt(&m->mnt,
>>>  						mnt->mnt_mountpoint);
>>>  		/*
>>>  		 * umount the child only if the child has no children
>>> @@ -430,6 +435,16 @@ static void __propagate_umount(struct mount *mnt)
>>>  		if (!child || !IS_MNT_MARKED(child))
>>>  			continue;
>>>  		CLEAR_MNT_MARK(child);
>>> +
>>> +		/* If there is exactly one mount covering all of child
>>> +		 * replace child with that mount.
>>> +		 */
>>> +		topper = __lookup_mnt(&child->mnt, child->mnt.mnt_root);
>>> +		if (topper &&
>>> +		    (child->mnt_mounts.next == &topper->mnt_child) &&
>>> +		    (topper->mnt_child.next == &child->mnt_mounts))
>>> +			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, topper);
>>> +
>>>  		if (list_empty(&child->mnt_mounts)) {
>>>  			list_del_init(&child->mnt_child);
>>>  			child->mnt.mnt_flags |= MNT_UMOUNT;
>>> diff --git a/fs/pnode.h b/fs/pnode.h
>>> index 550f5a8b4fcf..dc87e65becd2 100644
>>> --- a/fs/pnode.h
>>> +++ b/fs/pnode.h
>>> @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
>>>  unsigned int mnt_get_count(struct mount *mnt);
>>>  void mnt_set_mountpoint(struct mount *, struct mountpoint *,
>>>  			struct mount *);
>>> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
>>> +			   struct mount *mnt);
>>>  struct mount *copy_tree(struct mount *, struct dentry *, int);
>>>  bool is_path_reachable(struct mount *, struct dentry *,
>>>  			 const struct path *root);
>>> -- 
>>> 2.10.1
>>> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-05-14  9:26                     ` Eric W. Biederman
@ 2017-05-15 18:27                       ` Andrei Vagin
  2017-05-15 19:42                         ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Andrei Vagin @ 2017-05-15 18:27 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Ram Pai

[-- Attachment #1: Type: text/plain, Size: 17274 bytes --]

On Sun, May 14, 2017 at 04:26:18AM -0500, Eric W. Biederman wrote:
> ebiederm@xmission.com (Eric W. Biederman) writes:
> 
> > Andrei Vagin <avagin@virtuozzo.com> writes:
> >
> >> Hi Eric,
> >>
> >> I found one issue about this patch. Take a look at the next script. The
> >> idea is simple, we call mount twice and than we call umount twice, but
> >> the second umount fails.
> >
> > Definitely an odd.  I will take a look.
> 
> After a little more looking I have to say the only thing I can see wrong
> in the behavior is that the first umount doesn't unmount everything.
> 
> Changing the mount propagation tree while it is being traversed
> apparently prevents a complete traversal of the propgation tree.

Would it be enough to find a topper which will not be umounted? I attached a
patch which does this. I can't find a reason why this would not work.

The idea of the patch is that we try to find a topper, check whether it is
marked, and if it is marked, then we try to find the next one. All
marked toppers are umounted and the first unmarked topper is attached to
the parent mount.

And I think we need to add tests in tools/testing/selftests/...

> 
> Last time I was looking at this I was playing with multipass algorithm
> because of these peculiarities that happen with propagation trees.
> 
> I suspect we are going to need to fix umount to behave correct before
> we tune it for performance.  So that we actually know what correct
> behavior is.
> 
> Eric
> 
> 
> >>
> >> [root@fc24 ~]# cat m.sh
> >> #!/bin/sh
> >>
> >> set -x -e
> >> mount -t tmpfs xxx /mnt
> >> mkdir -p /mnt/1
> >> mkdir -p /mnt/2
> >> mount --bind /mnt/1 /mnt/1
> >> mount --make-shared /mnt/1
> >> mount --bind /mnt/1 /mnt/2
> >> mkdir -p /mnt/1/1
> >> for i in `seq 2`; do
> >> 	mount --bind /mnt/1/1 /mnt/1/1
> >> done
> >> for i in `seq 2`; do
> >> 	umount /mnt/1/1 || {
> >> 		cat /proc/self/mountinfo | grep xxx
> >> 		exit 1
> >> 	}
> >> done
> >>
> >> [root@fc24 ~]# unshare -Urm  ./m.sh
> >> + mount -t tmpfs xxx /mnt
> >> + mkdir -p /mnt/1
> >> + mkdir -p /mnt/2
> >> + mount --bind /mnt/1 /mnt/1
> >> + mount --make-shared /mnt/1
> >> + mount --bind /mnt/1 /mnt/2
> >> + mkdir -p /mnt/1/1
> >> ++ seq 2
> >> + for i in '`seq 2`'
> >> + mount --bind /mnt/1/1 /mnt/1/1
> >> + for i in '`seq 2`'
> >> + mount --bind /mnt/1/1 /mnt/1/1
> >> ++ seq 2
> >> + for i in '`seq 2`'
> >> + umount /mnt/1/1
> >> + for i in '`seq 2`'
> >> + umount /mnt/1/1
> >> umount: /mnt/1/1: not mounted
> >> + cat /proc/self/mountinfo
> >> + grep xxx
> >> 147 116 0:42 / /mnt rw,relatime - tmpfs xxx rw
> >> 148 147 0:42 /1 /mnt/1 rw,relatime shared:65 - tmpfs xxx rw
> >> 149 147 0:42 /1 /mnt/2 rw,relatime shared:65 - tmpfs xxx rw
> >> 157 149 0:42 /1/1 /mnt/2/1 rw,relatime shared:65 - tmpfs xxx rw
> >> + exit 1
> >>
> >> And you can see that /mnt/2 contains a mount, but it should not.
> >>
> >> Thanks,
> >> Andrei
> >>
> >> On Thu, Jan 05, 2017 at 10:04:14AM +1300, Eric W. Biederman wrote:
> >>> 
> >>> Ever since mount propagation was introduced in cases where a mount in
> >>> propagated to parent mount mountpoint pair that is already in use the
> >>> code has placed the new mount behind the old mount in the mount hash
> >>> table.
> >>> 
> >>> This implementation detail is problematic as it allows creating
> >>> arbitrary length mount hash chains.
> >>> 
> >>> Furthermore it invalidates the constraint maintained elsewhere in the
> >>> mount code that a parent mount and a mountpoint pair will have exactly
> >>> one mount upon them.  Making it hard to deal with and to talk about
> >>> this special case in the mount code.
> >>> 
> >>> Modify mount propagation to notice when there is already a mount at
> >>> the parent mount and mountpoint where a new mount is propagating to
> >>> and place that preexisting mount on top of the new mount.
> >>> 
> >>> Modify unmount propagation to notice when a mount that is being
> >>> unmounted has another mount on top of it (and no other children), and
> >>> to replace the unmounted mount with the mount on top of it.
> >>> 
> >>> Move the MNT_UMUONT test from __lookup_mnt_last into
> >>> __propagate_umount as that is the only call of __lookup_mnt_last where
> >>> MNT_UMOUNT may be set on any mount visible in the mount hash table.
> >>> 
> >>> These modifications allow:
> >>>  - __lookup_mnt_last to be removed.
> >>>  - attach_shadows to be renamed __attach_mnt and the it's shadow
> >>>    handling to be removed.
> >>>  - commit_tree to be simplified
> >>>  - copy_tree to be simplified
> >>> 
> >>> The result is an easier to understand tree of mounts that does not
> >>> allow creation of arbitrary length hash chains in the mount hash table.
> >>> 
> >>> v2: Updated to mnt_change_mountpoint to not call dput or mntput
> >>> and instead to decrement the counts directly.  It is guaranteed
> >>> that there will be other references when mnt_change_mountpoint is
> >>> called so this is safe.
> >>> 
> >>> Cc: stable@vger.kernel.org
> >>> Fixes: b90fa9ae8f51 ("[PATCH] shared mount handling: bind and rbind")
> >>> Tested-by: Andrei Vagin <avagin@virtuozzo.com>
> >>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> >>> ---
> >>> 
> >>> Since the last version some of you may have seen I have modified
> >>> my implementation of mnt_change_mountpoint so that it no longer calls
> >>> mntput or dput but instead relies on the knowledge that it can not
> >>> possibly have the last reference to the mnt and dentry of interest.
> >>> This avoids code checking tools from complaining bitterly.
> >>> 
> >>> This is on top of my previous patch that sorts out locking of the
> >>> mountpoint hash table.  After time giving ample time for review I intend
> >>> to push this and the previous bug fix to Linus.
> >>> 
> >>>  fs/mount.h     |   1 -
> >>>  fs/namespace.c | 110 +++++++++++++++++++++++++++++++--------------------------
> >>>  fs/pnode.c     |  27 ++++++++++----
> >>>  fs/pnode.h     |   2 ++
> >>>  4 files changed, 82 insertions(+), 58 deletions(-)
> >>> 
> >>> diff --git a/fs/mount.h b/fs/mount.h
> >>> index 2c856fc47ae3..2826543a131d 100644
> >>> --- a/fs/mount.h
> >>> +++ b/fs/mount.h
> >>> @@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
> >>>  }
> >>>  
> >>>  extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
> >>> -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
> >>>  
> >>>  extern int __legitimize_mnt(struct vfsmount *, unsigned);
> >>>  extern bool legitimize_mnt(struct vfsmount *, unsigned);
> >>> diff --git a/fs/namespace.c b/fs/namespace.c
> >>> index 487ba30bb5c6..91ccfb73f0e0 100644
> >>> --- a/fs/namespace.c
> >>> +++ b/fs/namespace.c
> >>> @@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
> >>>  }
> >>>  
> >>>  /*
> >>> - * find the last mount at @dentry on vfsmount @mnt.
> >>> - * mount_lock must be held.
> >>> - */
> >>> -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
> >>> -{
> >>> -	struct mount *p, *res = NULL;
> >>> -	p = __lookup_mnt(mnt, dentry);
> >>> -	if (!p)
> >>> -		goto out;
> >>> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> >>> -		res = p;
> >>> -	hlist_for_each_entry_continue(p, mnt_hash) {
> >>> -		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
> >>> -			break;
> >>> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> >>> -			res = p;
> >>> -	}
> >>> -out:
> >>> -	return res;
> >>> -}
> >>> -
> >>> -/*
> >>>   * lookup_mnt - Return the first child mount mounted at path
> >>>   *
> >>>   * "First" means first mounted chronologically.  If you create the
> >>> @@ -878,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
> >>>  	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
> >>>  }
> >>>  
> >>> +static void __attach_mnt(struct mount *mnt, struct mount *parent)
> >>> +{
> >>> +	hlist_add_head_rcu(&mnt->mnt_hash,
> >>> +			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
> >>> +	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> >>> +}
> >>> +
> >>>  /*
> >>>   * vfsmount lock must be held for write
> >>>   */
> >>> @@ -886,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
> >>>  			struct mountpoint *mp)
> >>>  {
> >>>  	mnt_set_mountpoint(parent, mp, mnt);
> >>> -	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
> >>> -	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> >>> +	__attach_mnt(mnt, parent);
> >>>  }
> >>>  
> >>> -static void attach_shadowed(struct mount *mnt,
> >>> -			struct mount *parent,
> >>> -			struct mount *shadows)
> >>> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
> >>>  {
> >>> -	if (shadows) {
> >>> -		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
> >>> -		list_add(&mnt->mnt_child, &shadows->mnt_child);
> >>> -	} else {
> >>> -		hlist_add_head_rcu(&mnt->mnt_hash,
> >>> -				m_hash(&parent->mnt, mnt->mnt_mountpoint));
> >>> -		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> >>> -	}
> >>> +	struct mountpoint *old_mp = mnt->mnt_mp;
> >>> +	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
> >>> +	struct mount *old_parent = mnt->mnt_parent;
> >>> +
> >>> +	list_del_init(&mnt->mnt_child);
> >>> +	hlist_del_init(&mnt->mnt_mp_list);
> >>> +	hlist_del_init_rcu(&mnt->mnt_hash);
> >>> +
> >>> +	attach_mnt(mnt, parent, mp);
> >>> +
> >>> +	put_mountpoint(old_mp);
> >>> +
> >>> +	/*
> >>> +	 * Safely avoid even the suggestion this code might sleep or
> >>> +	 * lock the mount hash by taking avantage of the knowlege that
> >>> +	 * mnt_change_mounpoint will not release the final reference
> >>> +	 * to a mountpoint.
> >>> +	 *
> >>> +	 * During mounting, another mount will continue to use the old
> >>> +	 * mountpoint and during unmounting, the old mountpoint will
> >>> +	 * continue to exist until namespace_unlock which happens well
> >>> +	 * after mnt_change_mountpoint.
> >>> +	 */
> >>> +	spin_lock(&old_mountpoint->d_lock);
> >>> +	old_mountpoint->d_lockref.count--;
> >>> +	spin_unlock(&old_mountpoint->d_lock);
> >>> +
> >>> +	mnt_add_count(old_parent, -1);
> >>>  }
> >>>  
> >>>  /*
> >>>   * vfsmount lock must be held for write
> >>>   */
> >>> -static void commit_tree(struct mount *mnt, struct mount *shadows)
> >>> +static void commit_tree(struct mount *mnt)
> >>>  {
> >>>  	struct mount *parent = mnt->mnt_parent;
> >>>  	struct mount *m;
> >>> @@ -925,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
> >>>  	n->mounts += n->pending_mounts;
> >>>  	n->pending_mounts = 0;
> >>>  
> >>> -	attach_shadowed(mnt, parent, shadows);
> >>> +	__attach_mnt(mnt, parent);
> >>>  	touch_mnt_namespace(n);
> >>>  }
> >>>  
> >>> @@ -1764,7 +1766,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
> >>>  			continue;
> >>>  
> >>>  		for (s = r; s; s = next_mnt(s, r)) {
> >>> -			struct mount *t = NULL;
> >>>  			if (!(flag & CL_COPY_UNBINDABLE) &&
> >>>  			    IS_MNT_UNBINDABLE(s)) {
> >>>  				s = skip_mnt_tree(s);
> >>> @@ -1786,14 +1787,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
> >>>  				goto out;
> >>>  			lock_mount_hash();
> >>>  			list_add_tail(&q->mnt_list, &res->mnt_list);
> >>> -			mnt_set_mountpoint(parent, p->mnt_mp, q);
> >>> -			if (!list_empty(&parent->mnt_mounts)) {
> >>> -				t = list_last_entry(&parent->mnt_mounts,
> >>> -					struct mount, mnt_child);
> >>> -				if (t->mnt_mp != p->mnt_mp)
> >>> -					t = NULL;
> >>> -			}
> >>> -			attach_shadowed(q, parent, t);
> >>> +			attach_mnt(q, parent, p->mnt_mp);
> >>>  			unlock_mount_hash();
> >>>  		}
> >>>  	}
> >>> @@ -1992,10 +1986,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
> >>>  {
> >>>  	HLIST_HEAD(tree_list);
> >>>  	struct mnt_namespace *ns = dest_mnt->mnt_ns;
> >>> +	struct mountpoint *smp;
> >>>  	struct mount *child, *p;
> >>>  	struct hlist_node *n;
> >>>  	int err;
> >>>  
> >>> +	/* Preallocate a mountpoint in case the new mounts need
> >>> +	 * to be tucked under other mounts.
> >>> +	 */
> >>> +	smp = get_mountpoint(source_mnt->mnt.mnt_root);
> >>> +	if (IS_ERR(smp))
> >>> +		return PTR_ERR(smp);
> >>> +
> >>>  	/* Is there space to add these mounts to the mount namespace? */
> >>>  	if (!parent_path) {
> >>>  		err = count_mounts(ns, source_mnt);
> >>> @@ -2022,17 +2024,22 @@ static int attach_recursive_mnt(struct mount *source_mnt,
> >>>  		touch_mnt_namespace(source_mnt->mnt_ns);
> >>>  	} else {
> >>>  		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
> >>> -		commit_tree(source_mnt, NULL);
> >>> +		commit_tree(source_mnt);
> >>>  	}
> >>>  
> >>>  	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
> >>> -		struct mount *q;
> >>>  		hlist_del_init(&child->mnt_hash);
> >>> -		q = __lookup_mnt_last(&child->mnt_parent->mnt,
> >>> -				      child->mnt_mountpoint);
> >>> -		commit_tree(child, q);
> >>> +		if (child->mnt.mnt_root == smp->m_dentry) {
> >>> +			struct mount *q;
> >>> +			q = __lookup_mnt(&child->mnt_parent->mnt,
> >>> +					 child->mnt_mountpoint);
> >>> +			if (q)
> >>> +				mnt_change_mountpoint(child, smp, q);
> >>> +		}
> >>> +		commit_tree(child);
> >>>  	}
> >>>  	unlock_mount_hash();
> >>> +	put_mountpoint(smp);
> >>>  
> >>>  	return 0;
> >>>  
> >>> @@ -2046,6 +2053,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
> >>>  	cleanup_group_ids(source_mnt, NULL);
> >>>   out:
> >>>  	ns->pending_mounts = 0;
> >>> +	put_mountpoint(smp);
> >>>  	return err;
> >>>  }
> >>>  
> >>> diff --git a/fs/pnode.c b/fs/pnode.c
> >>> index 06a793f4ae38..eb4331240fd1 100644
> >>> --- a/fs/pnode.c
> >>> +++ b/fs/pnode.c
> >>> @@ -327,6 +327,9 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
> >>>   */
> >>>  static inline int do_refcount_check(struct mount *mnt, int count)
> >>>  {
> >>> +	struct mount *topper = __lookup_mnt(&mnt->mnt, mnt->mnt.mnt_root);
> >>> +	if (topper)
> >>> +		count++;
> >>>  	return mnt_get_count(mnt) > count;
> >>>  }
> >>>  
> >>> @@ -359,7 +362,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> >>>  
> >>>  	for (m = propagation_next(parent, parent); m;
> >>>  	     		m = propagation_next(m, parent)) {
> >>> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> >>> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
> >>>  		if (child && list_empty(&child->mnt_mounts) &&
> >>>  		    (ret = do_refcount_check(child, 1)))
> >>>  			break;
> >>> @@ -381,7 +384,7 @@ void propagate_mount_unlock(struct mount *mnt)
> >>>  
> >>>  	for (m = propagation_next(parent, parent); m;
> >>>  			m = propagation_next(m, parent)) {
> >>> -		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
> >>> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
> >>>  		if (child)
> >>>  			child->mnt.mnt_flags &= ~MNT_LOCKED;
> >>>  	}
> >>> @@ -399,9 +402,11 @@ static void mark_umount_candidates(struct mount *mnt)
> >>>  
> >>>  	for (m = propagation_next(parent, parent); m;
> >>>  			m = propagation_next(m, parent)) {
> >>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> >>> +		struct mount *child = __lookup_mnt(&m->mnt,
> >>>  						mnt->mnt_mountpoint);
> >>> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
> >>> +		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
> >>> +			continue;
> >>> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
> >>>  			SET_MNT_MARK(child);
> >>>  		}
> >>>  	}
> >>> @@ -420,8 +425,8 @@ static void __propagate_umount(struct mount *mnt)
> >>>  
> >>>  	for (m = propagation_next(parent, parent); m;
> >>>  			m = propagation_next(m, parent)) {
> >>> -
> >>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> >>> +		struct mount *topper;
> >>> +		struct mount *child = __lookup_mnt(&m->mnt,
> >>>  						mnt->mnt_mountpoint);
> >>>  		/*
> >>>  		 * umount the child only if the child has no children
> >>> @@ -430,6 +435,16 @@ static void __propagate_umount(struct mount *mnt)
> >>>  		if (!child || !IS_MNT_MARKED(child))
> >>>  			continue;
> >>>  		CLEAR_MNT_MARK(child);
> >>> +
> >>> +		/* If there is exactly one mount covering all of child
> >>> +		 * replace child with that mount.
> >>> +		 */
> >>> +		topper = __lookup_mnt(&child->mnt, child->mnt.mnt_root);
> >>> +		if (topper &&
> >>> +		    (child->mnt_mounts.next == &topper->mnt_child) &&
> >>> +		    (topper->mnt_child.next == &child->mnt_mounts))
> >>> +			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, topper);
> >>> +
> >>>  		if (list_empty(&child->mnt_mounts)) {
> >>>  			list_del_init(&child->mnt_child);
> >>>  			child->mnt.mnt_flags |= MNT_UMOUNT;
> >>> diff --git a/fs/pnode.h b/fs/pnode.h
> >>> index 550f5a8b4fcf..dc87e65becd2 100644
> >>> --- a/fs/pnode.h
> >>> +++ b/fs/pnode.h
> >>> @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
> >>>  unsigned int mnt_get_count(struct mount *mnt);
> >>>  void mnt_set_mountpoint(struct mount *, struct mountpoint *,
> >>>  			struct mount *);
> >>> +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
> >>> +			   struct mount *mnt);
> >>>  struct mount *copy_tree(struct mount *, struct dentry *, int);
> >>>  bool is_path_reachable(struct mount *, struct dentry *,
> >>>  			 const struct path *root);
> >>> -- 
> >>> 2.10.1
> >>> 

[-- Attachment #2: patch --]
[-- Type: text/plain, Size: 2126 bytes --]

diff --git a/fs/pnode.c b/fs/pnode.c
index 5bc7896..7aefd06 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -435,6 +435,17 @@ static void mark_umount_candidates(struct mount *mnt)
 	}
 }
 
+static void __marked_umount(struct mount *mnt, struct mount *child)
+{
+	CLEAR_MNT_MARK(child);
+
+	if (list_empty(&child->mnt_mounts)) {
+		list_del_init(&child->mnt_child);
+		child->mnt.mnt_flags |= MNT_UMOUNT;
+		list_move_tail(&child->mnt_list, &mnt->mnt_list);
+	}
+}
+
 /*
  * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
  * parent propagates to.
@@ -448,8 +459,13 @@ static void __propagate_umount(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		struct mount *topper;
-		struct mount *child = __lookup_mnt(&m->mnt,
+		struct mount *topper, *topper_to_umount;
+		struct mount *child;
+
+		if (m->mnt.mnt_flags & MNT_UMOUNT)
+			continue;
+
+		child = __lookup_mnt(&m->mnt,
 						mnt->mnt_mountpoint);
 		/*
 		 * umount the child only if the child has no children
@@ -457,21 +473,28 @@ static void __propagate_umount(struct mount *mnt)
 		 */
 		if (!child || !IS_MNT_MARKED(child))
 			continue;
-		CLEAR_MNT_MARK(child);
 
-		/* If there is exactly one mount covering all of child
+		/* If there is exactly one mount covering all of child
 		 * replace child with that mount.
 		 */
-		topper = find_topper(child);
+		topper = topper_to_umount = child;
+		while (1) {
+			topper = find_topper(topper);
+			if (topper == NULL)
+				break;
+			if (!IS_MNT_MARKED(topper))
+				break;
+			topper_to_umount = topper;
+		}
 		if (topper)
-			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
-					      topper);
+			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, topper);
 
-		if (list_empty(&child->mnt_mounts)) {
-			list_del_init(&child->mnt_child);
-			child->mnt.mnt_flags |= MNT_UMOUNT;
-			list_move_tail(&child->mnt_list, &mnt->mnt_list);
+		while (topper_to_umount != child) {
+			__marked_umount(mnt, topper_to_umount);
+			topper_to_umount = topper_to_umount->mnt_parent;
 		}
+
+		__marked_umount(mnt, child);
 	}
 }
 

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts.
  2017-05-15 18:27                       ` Andrei Vagin
@ 2017-05-15 19:42                         ` Eric W. Biederman
  2017-05-15 20:10                           ` [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-15 19:42 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Al Viro, linux-fsdevel, Ram Pai

Andrei Vagin <avagin@virtuozzo.com> writes:

> On Sun, May 14, 2017 at 04:26:18AM -0500, Eric W. Biederman wrote:
>> ebiederm@xmission.com (Eric W. Biederman) writes:
>> 
>> > Andrei Vagin <avagin@virtuozzo.com> writes:
>> >
>> >> Hi Eric,
>> >>
>> >> I found one issue about this patch. Take a look at the next script. The
>> >> idea is simple, we call mount twice and than we call umount twice, but
>> >> the second umount fails.
>> >
>> > Definitely an odd.  I will take a look.
>> 
>> After a little more looking I have to say the only thing I can see wrong
>> in the behavior is that the first umount doesn't unmount everything.
>> 
>> Changing the mount propagation tree while it is being traversed
>> apparently prevents a complete traversal of the propgation tree.
>
> Is it be enough to find topper which will not be umounted? I attached a
> patch which does this. I can't find a reason why this will not work.
>
> The idea of the patch is that we try to find a topper, check whether it is
> marked, and if it is marked, then we try to find the next one. All
> marked toppers are umounted and the first unmarked topper is attached to
> the parent mount.

That isn't completely wrong.  But I don't believe the it is the proper
logic.

Fundamentally the issue is that we are reparenting while remounting.
This results in mounts that should be unmounted moving before they
are unmounted.

Which means that if we leave all of the mnt_change_mountpoint work
for a final pass over the mounts everything works correctly.

Which is fabulous news.

That means multiple passes through a mount propagation tree for a given
mountpoint can no longer change what winds up unmounted.  Which
means we can skip subtrees that have already seen mount propagation.
Which means the optimizations I was playing with earlier fundamentally
will work without changing the semantics of MNT_DETACH.

> And I think we need to add tests in tools/testing/selftests/...

Feel free.

Patch to sort this immediate issue out in a moment...

Eric

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass
  2017-05-15 19:42                         ` Eric W. Biederman
@ 2017-05-15 20:10                           ` Eric W. Biederman
  2017-05-15 23:12                             ` Andrei Vagin
                                               ` (3 more replies)
  0 siblings, 4 replies; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-15 20:10 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Al Viro, linux-fsdevel, Ram Pai


It was observed that in some pathological cases the current code
does not unmount everything it should.  After investigation it was
determined that the issue is that mnt_change_mountpoint can change
which mounts are available to be unmounted during mount propagation,
which is wrong.

The trivial reproducer is:
$ cat ./pathological.sh

mount -t tmpfs test-base /mnt
cd /mnt
mkdir 1 2 1/1
mount --bind 1 1
mount --make-shared 1
mount --bind 1 2
mount --bind 1/1 1/1
mount --bind 1/1 1/1
echo
grep test-base /proc/self/mountinfo
umount 1/1
echo
grep test-base /proc/self/mountinfo

$ unshare -Urm ./pathological.sh

The expected output looks like:
46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000

46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000

The output without the fix looks like:
46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000

46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000

That last mount in the output was in the propagation tree to be unmounted but
was missed because mnt_change_mountpoint changed its parent before the walk
through the mount propagation tree observed it.

Cc: stable@vger.kernel.org
Fixes: 1064f874abc0 ("mnt: Tuck mounts under others instead of creating shadow/side mounts.")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/mount.h     |  1 +
 fs/namespace.c |  1 +
 fs/pnode.c     | 35 ++++++++++++++++++++++++++++++-----
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index bf1fda6eed8f..ede5a1d5cf99 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,6 +58,7 @@ struct mount {
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
 	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+	struct list_head mnt_reparent;	/* reparent list entry */
 #ifdef CONFIG_FSNOTIFY
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
diff --git a/fs/namespace.c b/fs/namespace.c
index 8bd3e4d448b9..51e49866e1fe 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -236,6 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
+		INIT_LIST_HEAD(&mnt->mnt_reparent);
 		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
 	}
 	return mnt;
diff --git a/fs/pnode.c b/fs/pnode.c
index 5bc7896d122a..52aca0a118ff 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -439,7 +439,7 @@ static void mark_umount_candidates(struct mount *mnt)
  * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
  * parent propagates to.
  */
-static void __propagate_umount(struct mount *mnt)
+static void __propagate_umount(struct mount *mnt, struct list_head *to_reparent)
 {
 	struct mount *parent = mnt->mnt_parent;
 	struct mount *m;
@@ -464,17 +464,38 @@ static void __propagate_umount(struct mount *mnt)
 		 */
 		topper = find_topper(child);
 		if (topper)
-			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
-					      topper);
+			list_add_tail(&topper->mnt_reparent, to_reparent);
 
-		if (list_empty(&child->mnt_mounts)) {
+		if (topper || list_empty(&child->mnt_mounts)) {
 			list_del_init(&child->mnt_child);
+			list_del_init(&child->mnt_reparent);
 			child->mnt.mnt_flags |= MNT_UMOUNT;
 			list_move_tail(&child->mnt_list, &mnt->mnt_list);
 		}
 	}
 }
 
+static void reparent_mounts(struct list_head *to_reparent)
+{
+	while (!list_empty(to_reparent)) {
+		struct mount *mnt, *parent;
+		struct mountpoint *mp;
+
+		mnt = list_first_entry(to_reparent, struct mount, mnt_reparent);
+		list_del_init(&mnt->mnt_reparent);
+
+		/* Where should this mount be reparented to? */
+		mp = mnt->mnt_mp;
+		parent = mnt->mnt_parent;
+		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
+			mp = parent->mnt_mp;
+			parent = parent->mnt_parent;
+		}
+
+		mnt_change_mountpoint(parent, mp, mnt);
+	}
+}
+
 /*
  * collect all mounts that receive propagation from the mount in @list,
  * and return these additional mounts in the same list.
@@ -485,11 +506,15 @@ static void __propagate_umount(struct mount *mnt)
 int propagate_umount(struct list_head *list)
 {
 	struct mount *mnt;
+	LIST_HEAD(to_reparent);
 
 	list_for_each_entry_reverse(mnt, list, mnt_list)
 		mark_umount_candidates(mnt);
 
 	list_for_each_entry(mnt, list, mnt_list)
-		__propagate_umount(mnt);
+		__propagate_umount(mnt, &to_reparent);
+
+	reparent_mounts(&to_reparent);
+
 	return 0;
 }
-- 
2.10.1

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass
  2017-05-15 20:10                           ` [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass Eric W. Biederman
@ 2017-05-15 23:12                             ` Andrei Vagin
  2017-05-16  5:42                             ` [PATCH] test: check a case when a mount is propagated between exiting mounts Andrei Vagin
                                               ` (2 subsequent siblings)
  3 siblings, 0 replies; 63+ messages in thread
From: Andrei Vagin @ 2017-05-15 23:12 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Ram Pai

On Mon, May 15, 2017 at 03:10:38PM -0500, Eric W. Biederman wrote:
> 
> It was observed that in some pathological cases the current code
> does not unmount everything it should.  After investigation it was
> determined that the issue is that mnt_change_mountpoint can change
> which mounts are available to be unmounted during mount propagation,
> which is wrong.
> 
> The trivial reproducer is:
> $ cat ./pathological.sh
> 
> mount -t tmpfs test-base /mnt
> cd /mnt
> mkdir 1 2 1/1
> mount --bind 1 1
> mount --make-shared 1
> mount --bind 1 2
> mount --bind 1/1 1/1
> mount --bind 1/1 1/1
> echo
> grep test-base /proc/self/mountinfo
> umount 1/1
> echo
> grep test-base /proc/self/mountinfo
> 
> $ unshare -Urm ./pathological.sh
> 
> The expected output looks like:
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> The output without the fix looks like:
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> That last mount in the output was in the propagation tree to be unmounted but
> was missed because mnt_change_mountpoint changed its parent before the walk
> through the mount propagation tree observed it.
> 

It works for me and the patch looks correct. Thanks!

Acked-by: Andrei Vagin <avagin@virtuozzo.com>

> Cc: stable@vger.kernel.org
> Fixes: 1064f874abc0 ("mnt: Tuck mounts under others instead of creating shadow/side mounts.")
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  fs/mount.h     |  1 +
>  fs/namespace.c |  1 +
>  fs/pnode.c     | 35 ++++++++++++++++++++++++++++++-----
>  3 files changed, 32 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index bf1fda6eed8f..ede5a1d5cf99 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -58,6 +58,7 @@ struct mount {
>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>  	struct mountpoint *mnt_mp;	/* where is it mounted */
>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
> +	struct list_head mnt_reparent;	/* reparent list entry */
>  #ifdef CONFIG_FSNOTIFY
>  	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
>  	__u32 mnt_fsnotify_mask;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 8bd3e4d448b9..51e49866e1fe 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -236,6 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
>  		INIT_LIST_HEAD(&mnt->mnt_slave);
>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
> +		INIT_LIST_HEAD(&mnt->mnt_reparent);
>  		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
>  	}
>  	return mnt;
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 5bc7896d122a..52aca0a118ff 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -439,7 +439,7 @@ static void mark_umount_candidates(struct mount *mnt)
>   * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
>   * parent propagates to.
>   */
> -static void __propagate_umount(struct mount *mnt)
> +static void __propagate_umount(struct mount *mnt, struct list_head *to_reparent)
>  {
>  	struct mount *parent = mnt->mnt_parent;
>  	struct mount *m;
> @@ -464,17 +464,38 @@ static void __propagate_umount(struct mount *mnt)
>  		 */
>  		topper = find_topper(child);
>  		if (topper)
> -			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
> -					      topper);
> +			list_add_tail(&topper->mnt_reparent, to_reparent);
>  
> -		if (list_empty(&child->mnt_mounts)) {
> +		if (topper || list_empty(&child->mnt_mounts)) {
>  			list_del_init(&child->mnt_child);
> +			list_del_init(&child->mnt_reparent);
>  			child->mnt.mnt_flags |= MNT_UMOUNT;
>  			list_move_tail(&child->mnt_list, &mnt->mnt_list);
>  		}
>  	}
>  }
>  
> +static void reparent_mounts(struct list_head *to_reparent)
> +{
> +	while (!list_empty(to_reparent)) {
> +		struct mount *mnt, *parent;
> +		struct mountpoint *mp;
> +
> +		mnt = list_first_entry(to_reparent, struct mount, mnt_reparent);
> +		list_del_init(&mnt->mnt_reparent);
> +
> +		/* Where should this mount be reparented to? */
> +		mp = mnt->mnt_mp;
> +		parent = mnt->mnt_parent;
> +		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
> +			mp = parent->mnt_mp;
> +			parent = parent->mnt_parent;
> +		}
> +
> +		mnt_change_mountpoint(parent, mp, mnt);
> +	}
> +}
> +
>  /*
>   * collect all mounts that receive propagation from the mount in @list,
>   * and return these additional mounts in the same list.
> @@ -485,11 +506,15 @@ static void __propagate_umount(struct mount *mnt)
>  int propagate_umount(struct list_head *list)
>  {
>  	struct mount *mnt;
> +	LIST_HEAD(to_reparent);
>  
>  	list_for_each_entry_reverse(mnt, list, mnt_list)
>  		mark_umount_candidates(mnt);
>  
>  	list_for_each_entry(mnt, list, mnt_list)
> -		__propagate_umount(mnt);
> +		__propagate_umount(mnt, &to_reparent);
> +
> +	reparent_mounts(&to_reparent);
> +
>  	return 0;
>  }
> -- 
> 2.10.1
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] test: check a case when a mount is propagated between exiting mounts
  2017-05-15 20:10                           ` [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass Eric W. Biederman
  2017-05-15 23:12                             ` Andrei Vagin
@ 2017-05-16  5:42                             ` Andrei Vagin
  2017-05-17  5:54                             ` [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order Eric W. Biederman
  2017-05-22  8:15                             ` [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass Ram Pai
  3 siblings, 0 replies; 63+ messages in thread
From: Andrei Vagin @ 2017-05-16  5:42 UTC (permalink / raw)
  To: Eric W . Biederman
  Cc: Alexander Viro, linux-fsdevel, linux-kernel, Andrei Vagin, Shuah Khan

This test checks two behaviour cases:

When a mount is propagated to a place which is already busy, the new
mount is inserted between parent and old mount.

When a mount that is being unmounted due to propagation has another
mount on top of it, it is replaced by the top mount.

Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 tools/testing/selftests/mount/Makefile             | 19 +++--
 tools/testing/selftests/mount/test-reparent-mounts | 92 ++++++++++++++++++++++
 2 files changed, 105 insertions(+), 6 deletions(-)
 create mode 100755 tools/testing/selftests/mount/test-reparent-mounts

diff --git a/tools/testing/selftests/mount/Makefile b/tools/testing/selftests/mount/Makefile
index 9093d7f..5927230 100644
--- a/tools/testing/selftests/mount/Makefile
+++ b/tools/testing/selftests/mount/Makefile
@@ -6,11 +6,18 @@ TEST_GEN_PROGS := unprivileged-remount-test
 
 include ../lib.mk
 
-override RUN_TESTS := if [ -f /proc/self/uid_map ] ; \
-		      then	\
-				./unprivileged-remount-test ; \
-		      else	\
-				echo "WARN: No /proc/self/uid_map exist, test skipped." ; \
-		      fi
+override define RUN_TESTS
+	if [ -f /proc/self/uid_map ] ; \
+	then	\
+		./unprivileged-remount-test ; \
+	else	\
+		echo "WARN: No /proc/self/uid_map exist, test skipped." ; \
+	fi
+	unshare -Urm ./test-reparent-mounts
+	unshare -Urm ./test-reparent-mounts -c
+	unshare -Urm ./test-reparent-mounts -s
+	unshare -Urm ./test-reparent-mounts -s -S
+endef
+
 override EMIT_TESTS := echo "$(RUN_TESTS)"
 
diff --git a/tools/testing/selftests/mount/test-reparent-mounts b/tools/testing/selftests/mount/test-reparent-mounts
new file mode 100755
index 0000000..57ae300
--- /dev/null
+++ b/tools/testing/selftests/mount/test-reparent-mounts
@@ -0,0 +1,92 @@
+#!/bin/sh
+
+# This test checks two following behaviour cases:
+#
+# When a mount is propagated to a place which is already busy, the new mount is
+# inserted between parent and old mount.
+#
+# When a mount that is being unmounted due to propagation has another mount on
+# top of it, it is replaced by the top mount.
+
+ITER=3
+
+set -e
+
+usage()
+{
+	echo " ./$0 [OPTIONS]
+This test checks a case when a mount has to be propagated under another mount.
+	-c - create a mount which is visible only from the second tree
+	-s - make a second tree as a slave to the first one
+	-S - create a sub-mount when the send tree is a slave to the first one
+	-i - how many times to call mount
+"
+}
+
+while getopts "csi:hS" opt; do
+   case $opt in
+   c )  with_child=1;;
+   s )  make_slave=1;;
+   S )  slave_child=1;;
+   i )  ITER=$OPTARG;;
+   h )  usage; exit 0 ;;
+   esac
+done
+
+shift $(($OPTIND - 1))
+
+if [ -n "$1" ]; then
+	usage
+	exit 1
+fi
+
+mount -t tmpfs test /mnt
+mkdir /mnt/main
+mkdir /mnt/second
+mount --bind /mnt/main /mnt/main
+mount --make-shared /mnt/main
+mount --bind /mnt/main /mnt/second
+mkdir -p /mnt/main/sub
+
+if [ -n "$make_slave" ]; then
+	mount --make-slave /mnt/second
+	if [ -n "slave_child" ]; then
+		mount -t tmpfs slave_child /mnt/second/sub/
+		touch /mnt/second/sub/slave_child
+	fi
+fi
+
+if [ -n "$make_slave" ]; then
+	mount --make-slave /mnt/second
+	if [ -n "$slave_child" ]; then
+		mount -t tmpfs test_slave /mnt/second/sub/
+		touch /mnt/second/sub/slave_child
+	fi
+fi
+
+for i in `seq $ITER`; do
+	mount --bind /mnt/main/sub /mnt/main/sub
+done
+
+if [ -n "$with_child" ]; then
+	mkdir /mnt/second/sub/child
+	mount --make-private /mnt/second/sub
+	mount --bind /mnt/second/sub/child /mnt/second/sub/child
+fi
+if [ -n "$slave_child" ]; then
+	test -f /mnt/second/sub/slave_child
+fi
+
+umount /mnt/main/sub
+
+if [ -n "$with_child" ]; then
+	umount /mnt/second/sub/child
+	umount /mnt/second/sub
+fi
+if [ -n "$slave_child" ]; then
+	test -f /mnt/second/sub/slave_child
+	umount /mnt/second/sub
+fi
+
+umount /mnt/second
+umount /mnt/main
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order
  2017-05-15 20:10                           ` [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass Eric W. Biederman
  2017-05-15 23:12                             ` Andrei Vagin
  2017-05-16  5:42                             ` [PATCH] test: check a case when a mount is propagated between exiting mounts Andrei Vagin
@ 2017-05-17  5:54                             ` Eric W. Biederman
  2017-05-17  5:55                               ` [REVIEW][PATCH 2/2] mnt: Make propagate_umount less slow for overlapping mount propagation trees Eric W. Biederman
                                                 ` (2 more replies)
  2017-05-22  8:15                             ` [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass Ram Pai
  3 siblings, 3 replies; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-17  5:54 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Al Viro, linux-fsdevel, Ram Pai


While investigating some poor umount performance I realized that in
the case of overlapping mount trees where some of the mounts are locked
the code has been failing to unmount all of the mounts it should
have been unmounting.

This failure to unmount all of the necessary
mounts can be reproduced with:

$ cat locked_mounts_test.sh

mount -t tmpfs test-base /mnt
mount --make-shared /mnt
mkdir -p /mnt/b

mount -t tmpfs test1 /mnt/b
mount --make-shared /mnt/b
mkdir -p /mnt/b/10

mount -t tmpfs test2 /mnt/b/10
mount --make-shared /mnt/b/10
mkdir -p /mnt/b/10/20

mount --rbind /mnt/b /mnt/b/10/20

unshare -Urm --propagation unchanged /bin/sh -c 'sleep 5; if [ $(grep test /proc/self/mountinfo | wc -l) -eq 1 ] ; then echo SUCCESS ; else echo FAILURE ; fi'
sleep 1
umount -l /mnt/b
wait %%

$ unshare -Urm ./locked_mounts_test.sh

This failure is corrected by removing the prepass that marks mounts
that may be umounted.

A first pass is added that umounts mounts if possible and, if not, sets
the mount mark if they could be unmounted were they not locked, and adds
them to a list of umount possibilities.  This first pass reconsiders
the mount's parent if it is on the list of umount possibilities, ensuring
that information about umountability will pass from child to parent.

A second pass then walks through all mounts that are umounted and processes
their children unmounting them or marking them for reparenting.

A last pass cleans up the state on the mounts that could not be umounted
and if applicable reparents them to their first parent that remained
mounted.

While a bit longer than the old code this code is much more robust
as it allows information to flow up from the leaves and down
from the trunk making the order in which mounts are encountered
in the umount propagation tree irrelevant.

Cc: stable@vger.kernel.org
Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/mount.h     |   2 +-
 fs/namespace.c |   2 +-
 fs/pnode.c     | 144 ++++++++++++++++++++++++++++++++++-----------------------
 3 files changed, 88 insertions(+), 60 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index ede5a1d5cf99..de45d9e76748 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,7 +58,7 @@ struct mount {
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
 	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
-	struct list_head mnt_reparent;	/* reparent list entry */
+	struct list_head mnt_umounting; /* list entry for umount propagation */
 #ifdef CONFIG_FSNOTIFY
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
diff --git a/fs/namespace.c b/fs/namespace.c
index 51e49866e1fe..5e3dcbeb1de5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -236,7 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
-		INIT_LIST_HEAD(&mnt->mnt_reparent);
+		INIT_LIST_HEAD(&mnt->mnt_umounting);
 		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
 	}
 	return mnt;
diff --git a/fs/pnode.c b/fs/pnode.c
index 52aca0a118ff..fbaca7df2eb0 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -413,86 +413,95 @@ void propagate_mount_unlock(struct mount *mnt)
 	}
 }
 
-/*
- * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
- */
-static void mark_umount_candidates(struct mount *mnt)
+static void umount_one(struct mount *mnt, struct list_head *to_umount)
 {
-	struct mount *parent = mnt->mnt_parent;
-	struct mount *m;
-
-	BUG_ON(parent == mnt);
-
-	for (m = propagation_next(parent, parent); m;
-			m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt(&m->mnt,
-						mnt->mnt_mountpoint);
-		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
-			continue;
-		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
-			SET_MNT_MARK(child);
-		}
-	}
+	CLEAR_MNT_MARK(mnt);
+	mnt->mnt.mnt_flags |= MNT_UMOUNT;
+	list_del_init(&mnt->mnt_child);
+	list_del_init(&mnt->mnt_umounting);
+	list_move_tail(&mnt->mnt_list, to_umount);
 }
 
 /*
  * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
  * parent propagates to.
  */
-static void __propagate_umount(struct mount *mnt, struct list_head *to_reparent)
+static bool __propagate_umount(struct mount *mnt,
+			       struct list_head *to_umount,
+			       struct list_head *to_restore)
 {
-	struct mount *parent = mnt->mnt_parent;
-	struct mount *m;
+	bool progress = false;
+	struct mount *child;
 
-	BUG_ON(parent == mnt);
+	/*
+	 * The state of the parent won't change if this mount is
+	 * already unmounted or marked as without children.
+	 */
+	if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
+		goto out;
 
-	for (m = propagation_next(parent, parent); m;
-			m = propagation_next(m, parent)) {
-		struct mount *topper;
-		struct mount *child = __lookup_mnt(&m->mnt,
-						mnt->mnt_mountpoint);
-		/*
-		 * umount the child only if the child has no children
-		 * and the child is marked safe to unmount.
-		 */
-		if (!child || !IS_MNT_MARKED(child))
+	/* Verify topper is the only grandchild that has not been
+	 * speculatively unmounted.
+	 */
+	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+		if (child->mnt_mountpoint == mnt->mnt.mnt_root)
 			continue;
-		CLEAR_MNT_MARK(child);
+		if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
+			continue;
+		/* Found a mounted child */
+		goto children;
+	}
 
-		/* If there is exactly one mount covering all of child
-		 * replace child with that mount.
-		 */
-		topper = find_topper(child);
-		if (topper)
-			list_add_tail(&topper->mnt_reparent, to_reparent);
+	/* Mark mounts that can be unmounted if not locked */
+	SET_MNT_MARK(mnt);
+	progress = true;
 
-		if (topper || list_empty(&child->mnt_mounts)) {
-			list_del_init(&child->mnt_child);
-			list_del_init(&child->mnt_reparent);
-			child->mnt.mnt_flags |= MNT_UMOUNT;
-			list_move_tail(&child->mnt_list, &mnt->mnt_list);
+	/* If a mount is without children and not locked umount it. */
+	if (!IS_MNT_LOCKED(mnt)) {
+		umount_one(mnt, to_umount);
+	} else {
+children:
+		list_move_tail(&mnt->mnt_umounting, to_restore);
+	}
+out:
+	return progress;
+}
+
+static void umount_list(struct list_head *to_umount,
+			struct list_head *to_restore)
+{
+	struct mount *mnt, *child, *tmp;
+	list_for_each_entry(mnt, to_umount, mnt_list) {
+		list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
+			/* topper? */
+			if (child->mnt_mountpoint == mnt->mnt.mnt_root)
+				list_move_tail(&child->mnt_umounting, to_restore);
+			else
+				umount_one(child, to_umount);
 		}
 	}
 }
 
-static void reparent_mounts(struct list_head *to_reparent)
+static void restore_mounts(struct list_head *to_restore)
 {
-	while (!list_empty(to_reparent)) {
+	/* Restore mounts to a clean working state */
+	while (!list_empty(to_restore)) {
 		struct mount *mnt, *parent;
 		struct mountpoint *mp;
 
-		mnt = list_first_entry(to_reparent, struct mount, mnt_reparent);
-		list_del_init(&mnt->mnt_reparent);
+		mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
+		CLEAR_MNT_MARK(mnt);
+		list_del_init(&mnt->mnt_umounting);
 
-		/* Where should this mount be reparented to? */
+		/* Should this mount be reparented? */
 		mp = mnt->mnt_mp;
 		parent = mnt->mnt_parent;
 		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
 			mp = parent->mnt_mp;
 			parent = parent->mnt_parent;
 		}
-
-		mnt_change_mountpoint(parent, mp, mnt);
+		if (parent != mnt->mnt_parent)
+			mnt_change_mountpoint(parent, mp, mnt);
 	}
 }
 
@@ -506,15 +515,34 @@ static void reparent_mounts(struct list_head *to_reparent)
 int propagate_umount(struct list_head *list)
 {
 	struct mount *mnt;
-	LIST_HEAD(to_reparent);
+	LIST_HEAD(to_restore);
+	LIST_HEAD(to_umount);
 
-	list_for_each_entry_reverse(mnt, list, mnt_list)
-		mark_umount_candidates(mnt);
+	list_for_each_entry(mnt, list, mnt_list) {
+		struct mount *parent = mnt->mnt_parent;
+		struct mount *m;
 
-	list_for_each_entry(mnt, list, mnt_list)
-		__propagate_umount(mnt, &to_reparent);
+		for (m = propagation_next(parent, parent); m;
+		     m = propagation_next(m, parent)) {
+			struct mount *child = __lookup_mnt(&m->mnt,
+							   mnt->mnt_mountpoint);
+			if (!child)
+				continue;
+
+			/* Check the child and parents while progress is made */
+			while (__propagate_umount(child,
+						  &to_umount, &to_restore)) {
+				/* Is the parent a umount candidate? */
+				child = child->mnt_parent;
+				if (list_empty(&child->mnt_umounting))
+					break;
+			}
+		}
+	}
 
-	reparent_mounts(&to_reparent);
+	umount_list(&to_umount, &to_restore);
+	restore_mounts(&to_restore);
+	list_splice_tail(&to_umount, list);
 
 	return 0;
 }
-- 
2.10.1

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [REVIEW][PATCH 2/2] mnt: Make propagate_umount less slow for overlapping mount propagation trees
  2017-05-17  5:54                             ` [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order Eric W. Biederman
@ 2017-05-17  5:55                               ` Eric W. Biederman
  2017-05-17 22:48                                 ` Andrei Vagin
  2017-05-24 20:42                               ` [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order Ram Pai
  2017-05-30  6:07                               ` Ram Pai
  2 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-17  5:55 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Al Viro, linux-fsdevel, Ram Pai


Andrei Vagin pointed out that the time to execute propagate_umount can go
non-linear (and take a ludicrous amount of time) when the mount
propagation trees of the mounts to be unmounted by a lazy unmount
overlap.

Make the walk of the mount propagation trees nearly linear by
remembering which mounts have already been visited, allowing
subsequent walks to detect when walking a mount propagation tree or a
subtree of a mount propagation tree would be duplicate work and to skip
them entirely.

Walk the list of mounts whose propagation trees need to be traversed
from the mount highest in the mount tree to mounts lower in the mount
tree so that odds are higher that the code will walk the largest trees
first, allowing later tree walks to be skipped entirely.

Add cleanup_umount_visitations to remove the code's memory of which
mounts have been visited.

Add the functions last_slave and skip_propagation_subtree to allow
skipping appropriate parts of the mount propagation tree without
needing to change the logic of the rest of the code.

A script to generate overlapping mount propagation trees:

$ cat run.sh
set -e
mount -t tmpfs zdtm /mnt
mkdir -p /mnt/1 /mnt/2
mount -t tmpfs zdtm /mnt/1
mount --make-shared /mnt/1
mkdir /mnt/1/1

iteration=10
if [ -n "$1" ] ; then
	iteration=$1
fi

for i in $(seq $iteration); do
	mount --bind /mnt/1/1 /mnt/1/1
done

mount --rbind /mnt/1 /mnt/2

TIMEFORMAT='%Rs'
nr=$(( ( 2 ** ( $iteration + 1 ) ) + 1 ))
echo -n "umount -l /mnt/1 -> $nr        "
time umount -l /mnt/1

nr=$(cat /proc/self/mountinfo | grep zdtm | wc -l )
time umount -l /mnt/2

$ for i in $(seq 9 19); do echo $i; unshare -Urm bash ./run.sh $i; done

Here are the performance numbers with and without the patch:

     mhash |  8192   |  8192  | 1048576 | 1048576
    mounts | before  | after  |  before | after
    ------------------------------------------------
      1025 |  0.040s | 0.016s |  0.038s | 0.019s
      2049 |  0.094s | 0.017s |  0.080s | 0.018s
      4097 |  0.243s | 0.019s |  0.206s | 0.023s
      8193 |  1.202s | 0.028s |  1.562s | 0.032s
     16385 |  9.635s | 0.036s |  9.952s | 0.041s
     32769 | 60.928s | 0.063s | 44.321s | 0.064s
     65537 |         | 0.097s |         | 0.097s
    131073 |         | 0.233s |         | 0.176s
    262145 |         | 0.653s |         | 0.344s
    524289 |         | 2.305s |         | 0.735s
   1048577 |         | 7.107s |         | 2.603s

Andrei Vagin reports fixing the performance problem is part of the
work to fix CVE-2016-6213.

Cc: stable@vger.kernel.org
Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
Reported-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/pnode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index fbaca7df2eb0..53d411a371ce 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p)
 	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
 }
 
+static inline struct mount *last_slave(struct mount *p)
+{
+	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
+}
+
 static inline struct mount *next_slave(struct mount *p)
 {
 	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
@@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m,
 	}
 }
 
+static struct mount *skip_propagation_subtree(struct mount *m,
+						struct mount *origin)
+{
+	/*
+	 * Advance m such that propagation_next will not return
+	 * the slaves of m.
+	 */
+	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+		m = last_slave(m);
+
+	return m;
+}
+
 static struct mount *next_group(struct mount *m, struct mount *origin)
 {
 	while (1) {
@@ -505,6 +523,15 @@ static void restore_mounts(struct list_head *to_restore)
 	}
 }
 
+static void cleanup_umount_visitations(struct list_head *visited)
+{
+	while (!list_empty(visited)) {
+		struct mount *mnt =
+			list_first_entry(visited, struct mount, mnt_umounting);
+		list_del_init(&mnt->mnt_umounting);
+	}
+}
+
 /*
  * collect all mounts that receive propagation from the mount in @list,
  * and return these additional mounts in the same list.
@@ -517,11 +544,23 @@ int propagate_umount(struct list_head *list)
 	struct mount *mnt;
 	LIST_HEAD(to_restore);
 	LIST_HEAD(to_umount);
+	LIST_HEAD(visited);
 
-	list_for_each_entry(mnt, list, mnt_list) {
+	/* Find candidates for unmounting */
+	list_for_each_entry_reverse(mnt, list, mnt_list) {
 		struct mount *parent = mnt->mnt_parent;
 		struct mount *m;
 
+		/*
+		 * If this mount has already been visited it is known that its
+		 * entire peer group and all of their slaves in the propagation
+		 * tree for the mountpoint have already been visited and there is
+		 * no need to visit them again.
+		 */
+		if (!list_empty(&mnt->mnt_umounting))
+			continue;
+
+		list_add_tail(&mnt->mnt_umounting, &visited);
 		for (m = propagation_next(parent, parent); m;
 		     m = propagation_next(m, parent)) {
 			struct mount *child = __lookup_mnt(&m->mnt,
@@ -529,6 +568,27 @@ int propagate_umount(struct list_head *list)
 			if (!child)
 				continue;
 
+			if (!list_empty(&child->mnt_umounting)) {
+				/*
+				 * If the child has already been visited it is
+				 * known that its entire peer group and all of
+				 * their slaves in the propagation tree for the
+				 * mountpoint have already been visited and there
+				 * is no need to visit this subtree again.
+				 */
+				m = skip_propagation_subtree(m, parent);
+				continue;
+			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
+				/*
+				 * We have come across a partially unmounted
+				 * mount in the list that has not been visited yet.
+				 * Remember it has been visited and continue
+				 * about our merry way.
+				 */
+				list_add_tail(&child->mnt_umounting, &visited);
+				continue;
+			}
+
 			/* Check the child and parents while progress is made */
 			while (__propagate_umount(child,
 						  &to_umount, &to_restore)) {
@@ -542,6 +602,7 @@ int propagate_umount(struct list_head *list)
 
 	umount_list(&to_umount, &to_restore);
 	restore_mounts(&to_restore);
+	cleanup_umount_visitations(&visited);
 	list_splice_tail(&to_umount, list);
 
 	return 0;
-- 
2.10.1

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 2/2] mnt: Make propagate_umount less slow for overlapping mount propagation trees
  2017-05-17  5:55                               ` [REVIEW][PATCH 2/2] mnt: Make propagate_umount less slow for overlapping mount propagation trees Eric W. Biederman
@ 2017-05-17 22:48                                 ` Andrei Vagin
  2017-05-17 23:26                                   ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Andrei Vagin @ 2017-05-17 22:48 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Ram Pai

Hi Eric,

I tested both patches and I haven't found any issue. Thanks.

On Wed, May 17, 2017 at 12:55:16AM -0500, Eric W. Biederman wrote:
> 
> Andrei Vagin pointed out that time to executue propagate_umount can go
> non-linear (and take a ludicrious amount of time) when the mount
> propogation trees of the mounts to be unmunted by a lazy unmount
> overlap.
> 
> Make the walk of the mount propagation trees nearly linear by
> remembering which mounts have already been visited, allowing
> subsequent walks to detect when walking a mount propgation tree or a
> subtree of a mount propgation tree would be duplicate work and to skip
> them entirely.
> 
> Walk the list of mounts whose propgatation trees need to be traversed
> from the mount highest in the mount tree to mounts lower in the mount
> tree so that odds are higher that the code will walk the largest trees
> first, allowing later tree walks to be skipped entirely.
> 
> Add cleanup_umount_visitation to remover the code's memory of which
> mounts have been visited.
> 
> Add the functions last_slave and skip_propagation_subtree to allow
> skipping appropriate parts of the mount propagation tree without
> needing to change the logic of the rest of the code.
> 
> A script to generate overlapping mount propagation trees:
> 
> $ cat runs.h
> set -e
> mount -t tmpfs zdtm /mnt
> mkdir -p /mnt/1 /mnt/2
> mount -t tmpfs zdtm /mnt/1
> mount --make-shared /mnt/1
> mkdir /mnt/1/1
> 
> iteration=10
> if [ -n "$1" ] ; then
> 	iteration=$1
> fi
> 
> for i in $(seq $iteration); do
> 	mount --bind /mnt/1/1 /mnt/1/1
> done
> 
> mount --rbind /mnt/1 /mnt/2
> 
> TIMEFORMAT='%Rs'
> nr=$(( ( 2 ** ( $iteration + 1 ) ) + 1 ))
> echo -n "umount -l /mnt/1 -> $nr        "
> time umount -l /mnt/1
> 
> nr=$(cat /proc/self/mountinfo | grep zdtm | wc -l )
> time umount -l /mnt/2
> 
> $ for i in $(seq 9 19); do echo $i; unshare -Urm bash ./run.sh $i; done
> 
> Here are the performance numbers with and without the patch:
> 
>      mhash |  8192   |  8192  | 1048576 | 1048576
>     mounts | before  | after  |  before | after
>     ------------------------------------------------
>       1025 |  0.040s | 0.016s |  0.038s | 0.019s
>       2049 |  0.094s | 0.017s |  0.080s | 0.018s
>       4097 |  0.243s | 0.019s |  0.206s | 0.023s
>       8193 |  1.202s | 0.028s |  1.562s | 0.032s
>      16385 |  9.635s | 0.036s |  9.952s | 0.041s
>      32769 | 60.928s | 0.063s | 44.321s | 0.064s
>      65537 |         | 0.097s |         | 0.097s
>     131073 |         | 0.233s |         | 0.176s
>     262145 |         | 0.653s |         | 0.344s
>     524289 |         | 2.305s |         | 0.735s
>    1048577 |         | 7.107s |         | 2.603s
> 
> Andrei Vagin reports fixing the performance problem is part of the
> work to fix CVE-2016-6213.
> 
> Cc: stable@vger.kernel.org
> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> Reported-by: Andrei Vagin <avagin@openvz.org>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  fs/pnode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 62 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/pnode.c b/fs/pnode.c
> index fbaca7df2eb0..53d411a371ce 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p)
>  	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
>  }
>  
> +static inline struct mount *last_slave(struct mount *p)
> +{
> +	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
> +}
> +
>  static inline struct mount *next_slave(struct mount *p)
>  {
>  	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
> @@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m,
>  	}
>  }
>  
> +static struct mount *skip_propagation_subtree(struct mount *m,
> +						struct mount *origin)
> +{
> +	/*
> +	 * Advance m such that propagation_next will not return
> +	 * the slaves of m.
> +	 */
> +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
> +		m = last_slave(m);
> +
> +	return m;
> +}
> +
>  static struct mount *next_group(struct mount *m, struct mount *origin)
>  {
>  	while (1) {
> @@ -505,6 +523,15 @@ static void restore_mounts(struct list_head *to_restore)
>  	}
>  }
>  
> +static void cleanup_umount_visitations(struct list_head *visited)
> +{
> +	while (!list_empty(visited)) {
> +		struct mount *mnt =
> +			list_first_entry(visited, struct mount, mnt_umounting);
> +		list_del_init(&mnt->mnt_umounting);
> +	}
> +}
> +
>  /*
>   * collect all mounts that receive propagation from the mount in @list,
>   * and return these additional mounts in the same list.
> @@ -517,11 +544,23 @@ int propagate_umount(struct list_head *list)
>  	struct mount *mnt;
>  	LIST_HEAD(to_restore);
>  	LIST_HEAD(to_umount);
> +	LIST_HEAD(visited);
>  
> -	list_for_each_entry(mnt, list, mnt_list) {
> +	/* Find candidates for unmounting */
> +	list_for_each_entry_reverse(mnt, list, mnt_list) {
>  		struct mount *parent = mnt->mnt_parent;
>  		struct mount *m;
>  
> +		/*
> +		 * If this mount has already been visited it is known that it's
> +		 * entire peer group and all of their slaves in the propagation
> +		 * tree for the mountpoint has already been visited and there is
> +		 * no need to visit them again.
> +		 */
> +		if (!list_empty(&mnt->mnt_umounting))
> +			continue;
> +
> +		list_add_tail(&mnt->mnt_umounting, &visited);
>  		for (m = propagation_next(parent, parent); m;
>  		     m = propagation_next(m, parent)) {
>  			struct mount *child = __lookup_mnt(&m->mnt,
> @@ -529,6 +568,27 @@ int propagate_umount(struct list_head *list)
>  			if (!child)
>  				continue;
>  
> +			if (!list_empty(&child->mnt_umounting)) {
> +				/*
> +				 * If the child has already been visited it is
> +				 * know that it's entire peer group and all of
> +				 * their slaves in the propgation tree for the
> +				 * mountpoint has already been visited and there
> +				 * is no need to visit this subtree again.
> +				 */
> +				m = skip_propagation_subtree(m, parent);
> +				continue;
> +			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
> +				/*
> +				 * We have come accross an partially unmounted
> +				 * mount in list that has not been visited yet.
> +				 * Remember it has been visited and continue
> +				 * about our merry way.
> +				 */
> +				list_add_tail(&child->mnt_umounting, &visited);
> +				continue;
> +			}
> +
>  			/* Check the child and parents while progress is made */
>  			while (__propagate_umount(child,
>  						  &to_umount, &to_restore)) {
> @@ -542,6 +602,7 @@ int propagate_umount(struct list_head *list)
>  
>  	umount_list(&to_umount, &to_restore);
>  	restore_mounts(&to_restore);
> +	cleanup_umount_visitations(&visited);
>  	list_splice_tail(&to_umount, list);
>  
>  	return 0;
> -- 
> 2.10.1
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 2/2] mnt: Make propagate_umount less slow for overlapping mount propagation trees
  2017-05-17 22:48                                 ` Andrei Vagin
@ 2017-05-17 23:26                                   ` Eric W. Biederman
  2017-05-18  0:51                                     ` Andrei Vagin
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-17 23:26 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Al Viro, linux-fsdevel, Ram Pai

Andrei Vagin <avagin@virtuozzo.com> writes:

> Hi Eric,
>
> I tested both patches and I haven't found any issue. Thanks.

Can I get a Tested-by an Acked-by or a Reviewed-by?

Apologies this took so long to get to this point; until I realized that
we could move mnt_change_mountpoint into a separate pass, this didn't
look possible.

Eric

>
> On Wed, May 17, 2017 at 12:55:16AM -0500, Eric W. Biederman wrote:
>> 
>> Andrei Vagin pointed out that time to executue propagate_umount can go
>> non-linear (and take a ludicrious amount of time) when the mount
>> propogation trees of the mounts to be unmunted by a lazy unmount
>> overlap.
>> 
>> Make the walk of the mount propagation trees nearly linear by
>> remembering which mounts have already been visited, allowing
>> subsequent walks to detect when walking a mount propgation tree or a
>> subtree of a mount propgation tree would be duplicate work and to skip
>> them entirely.
>> 
>> Walk the list of mounts whose propgatation trees need to be traversed
>> from the mount highest in the mount tree to mounts lower in the mount
>> tree so that odds are higher that the code will walk the largest trees
>> first, allowing later tree walks to be skipped entirely.
>> 
>> Add cleanup_umount_visitation to remover the code's memory of which
>> mounts have been visited.
>> 
>> Add the functions last_slave and skip_propagation_subtree to allow
>> skipping appropriate parts of the mount propagation tree without
>> needing to change the logic of the rest of the code.
>> 
>> A script to generate overlapping mount propagation trees:
>> 
>> $ cat runs.h
>> set -e
>> mount -t tmpfs zdtm /mnt
>> mkdir -p /mnt/1 /mnt/2
>> mount -t tmpfs zdtm /mnt/1
>> mount --make-shared /mnt/1
>> mkdir /mnt/1/1
>> 
>> iteration=10
>> if [ -n "$1" ] ; then
>> 	iteration=$1
>> fi
>> 
>> for i in $(seq $iteration); do
>> 	mount --bind /mnt/1/1 /mnt/1/1
>> done
>> 
>> mount --rbind /mnt/1 /mnt/2
>> 
>> TIMEFORMAT='%Rs'
>> nr=$(( ( 2 ** ( $iteration + 1 ) ) + 1 ))
>> echo -n "umount -l /mnt/1 -> $nr        "
>> time umount -l /mnt/1
>> 
>> nr=$(cat /proc/self/mountinfo | grep zdtm | wc -l )
>> time umount -l /mnt/2
>> 
>> $ for i in $(seq 9 19); do echo $i; unshare -Urm bash ./run.sh $i; done
>> 
>> Here are the performance numbers with and without the patch:
>> 
>>      mhash |  8192   |  8192  | 1048576 | 1048576
>>     mounts | before  | after  |  before | after
>>     ------------------------------------------------
>>       1025 |  0.040s | 0.016s |  0.038s | 0.019s
>>       2049 |  0.094s | 0.017s |  0.080s | 0.018s
>>       4097 |  0.243s | 0.019s |  0.206s | 0.023s
>>       8193 |  1.202s | 0.028s |  1.562s | 0.032s
>>      16385 |  9.635s | 0.036s |  9.952s | 0.041s
>>      32769 | 60.928s | 0.063s | 44.321s | 0.064s
>>      65537 |         | 0.097s |         | 0.097s
>>     131073 |         | 0.233s |         | 0.176s
>>     262145 |         | 0.653s |         | 0.344s
>>     524289 |         | 2.305s |         | 0.735s
>>    1048577 |         | 7.107s |         | 2.603s
>> 
>> Andrei Vagin reports fixing the performance problem is part of the
>> work to fix CVE-2016-6213.
>> 
>> Cc: stable@vger.kernel.org
>> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
>> Reported-by: Andrei Vagin <avagin@openvz.org>
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>>  fs/pnode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 62 insertions(+), 1 deletion(-)
>> 
>> diff --git a/fs/pnode.c b/fs/pnode.c
>> index fbaca7df2eb0..53d411a371ce 100644
>> --- a/fs/pnode.c
>> +++ b/fs/pnode.c
>> @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p)
>>  	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
>>  }
>>  
>> +static inline struct mount *last_slave(struct mount *p)
>> +{
>> +	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
>> +}
>> +
>>  static inline struct mount *next_slave(struct mount *p)
>>  {
>>  	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
>> @@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m,
>>  	}
>>  }
>>  
>> +static struct mount *skip_propagation_subtree(struct mount *m,
>> +						struct mount *origin)
>> +{
>> +	/*
>> +	 * Advance m such that propagation_next will not return
>> +	 * the slaves of m.
>> +	 */
>> +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
>> +		m = last_slave(m);
>> +
>> +	return m;
>> +}
>> +
>>  static struct mount *next_group(struct mount *m, struct mount *origin)
>>  {
>>  	while (1) {
>> @@ -505,6 +523,15 @@ static void restore_mounts(struct list_head *to_restore)
>>  	}
>>  }
>>  
>> +static void cleanup_umount_visitations(struct list_head *visited)
>> +{
>> +	while (!list_empty(visited)) {
>> +		struct mount *mnt =
>> +			list_first_entry(visited, struct mount, mnt_umounting);
>> +		list_del_init(&mnt->mnt_umounting);
>> +	}
>> +}
>> +
>>  /*
>>   * collect all mounts that receive propagation from the mount in @list,
>>   * and return these additional mounts in the same list.
>> @@ -517,11 +544,23 @@ int propagate_umount(struct list_head *list)
>>  	struct mount *mnt;
>>  	LIST_HEAD(to_restore);
>>  	LIST_HEAD(to_umount);
>> +	LIST_HEAD(visited);
>>  
>> -	list_for_each_entry(mnt, list, mnt_list) {
>> +	/* Find candidates for unmounting */
>> +	list_for_each_entry_reverse(mnt, list, mnt_list) {
>>  		struct mount *parent = mnt->mnt_parent;
>>  		struct mount *m;
>>  
>> +		/*
>> +		 * If this mount has already been visited it is known that it's
>> +		 * entire peer group and all of their slaves in the propagation
>> +		 * tree for the mountpoint has already been visited and there is
>> +		 * no need to visit them again.
>> +		 */
>> +		if (!list_empty(&mnt->mnt_umounting))
>> +			continue;
>> +
>> +		list_add_tail(&mnt->mnt_umounting, &visited);
>>  		for (m = propagation_next(parent, parent); m;
>>  		     m = propagation_next(m, parent)) {
>>  			struct mount *child = __lookup_mnt(&m->mnt,
>> @@ -529,6 +568,27 @@ int propagate_umount(struct list_head *list)
>>  			if (!child)
>>  				continue;
>>  
>> +			if (!list_empty(&child->mnt_umounting)) {
>> +				/*
>> +				 * If the child has already been visited it is
>> +				 * know that it's entire peer group and all of
>> +				 * their slaves in the propgation tree for the
>> +				 * mountpoint has already been visited and there
>> +				 * is no need to visit this subtree again.
>> +				 */
>> +				m = skip_propagation_subtree(m, parent);
>> +				continue;
>> +			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
>> +				/*
>> +				 * We have come accross an partially unmounted
>> +				 * mount in list that has not been visited yet.
>> +				 * Remember it has been visited and continue
>> +				 * about our merry way.
>> +				 */
>> +				list_add_tail(&child->mnt_umounting, &visited);
>> +				continue;
>> +			}
>> +
>>  			/* Check the child and parents while progress is made */
>>  			while (__propagate_umount(child,
>>  						  &to_umount, &to_restore)) {
>> @@ -542,6 +602,7 @@ int propagate_umount(struct list_head *list)
>>  
>>  	umount_list(&to_umount, &to_restore);
>>  	restore_mounts(&to_restore);
>> +	cleanup_umount_visitations(&visited);
>>  	list_splice_tail(&to_umount, list);
>>  
>>  	return 0;
>> -- 
>> 2.10.1
>> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 2/2] mnt: Make propagate_umount less slow for overlapping mount propagation trees
  2017-05-17 23:26                                   ` Eric W. Biederman
@ 2017-05-18  0:51                                     ` Andrei Vagin
  0 siblings, 0 replies; 63+ messages in thread
From: Andrei Vagin @ 2017-05-18  0:51 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Al Viro, linux-fsdevel, Ram Pai

On Wed, May 17, 2017 at 06:26:09PM -0500, Eric W. Biederman wrote:
> Andrei Vagin <avagin@virtuozzo.com> writes:
> 
> > Hi Eric,
> >
> > I tested both patches and I haven't found any issue. Thanks.
> 
> Can I get a Tested-by an Acked-by or a Reviewed-by?

Sure. I read patches and they look good for me.

Reviewed-by: Andrei Vagin <avagin@virtuozzo.com>

Thanks,
Andrei

> 
> Apologies this took so long to get to this point until I realized that
> we could move mnt_change_mountpoint into a separate pass this didn't
> look possible.
> 
> Eric
> 
> >
> > On Wed, May 17, 2017 at 12:55:16AM -0500, Eric W. Biederman wrote:
> >> 
> >> Andrei Vagin pointed out that time to executue propagate_umount can go
> >> non-linear (and take a ludicrious amount of time) when the mount
> >> propogation trees of the mounts to be unmunted by a lazy unmount
> >> overlap.
> >> 
> >> Make the walk of the mount propagation trees nearly linear by
> >> remembering which mounts have already been visited, allowing
> >> subsequent walks to detect when walking a mount propgation tree or a
> >> subtree of a mount propgation tree would be duplicate work and to skip
> >> them entirely.
> >> 
> >> Walk the list of mounts whose propgatation trees need to be traversed
> >> from the mount highest in the mount tree to mounts lower in the mount
> >> tree so that odds are higher that the code will walk the largest trees
> >> first, allowing later tree walks to be skipped entirely.
> >> 
> >> Add cleanup_umount_visitation to remover the code's memory of which
> >> mounts have been visited.
> >> 
> >> Add the functions last_slave and skip_propagation_subtree to allow
> >> skipping appropriate parts of the mount propagation tree without
> >> needing to change the logic of the rest of the code.
> >> 
> >> A script to generate overlapping mount propagation trees:
> >> 
> >> $ cat runs.h
> >> set -e
> >> mount -t tmpfs zdtm /mnt
> >> mkdir -p /mnt/1 /mnt/2
> >> mount -t tmpfs zdtm /mnt/1
> >> mount --make-shared /mnt/1
> >> mkdir /mnt/1/1
> >> 
> >> iteration=10
> >> if [ -n "$1" ] ; then
> >> 	iteration=$1
> >> fi
> >> 
> >> for i in $(seq $iteration); do
> >> 	mount --bind /mnt/1/1 /mnt/1/1
> >> done
> >> 
> >> mount --rbind /mnt/1 /mnt/2
> >> 
> >> TIMEFORMAT='%Rs'
> >> nr=$(( ( 2 ** ( $iteration + 1 ) ) + 1 ))
> >> echo -n "umount -l /mnt/1 -> $nr        "
> >> time umount -l /mnt/1
> >> 
> >> nr=$(cat /proc/self/mountinfo | grep zdtm | wc -l )
> >> time umount -l /mnt/2
> >> 
> >> $ for i in $(seq 9 19); do echo $i; unshare -Urm bash ./run.sh $i; done
> >> 
> >> Here are the performance numbers with and without the patch:
> >> 
> >>      mhash |  8192   |  8192  | 1048576 | 1048576
> >>     mounts | before  | after  |  before | after
> >>     ------------------------------------------------
> >>       1025 |  0.040s | 0.016s |  0.038s | 0.019s
> >>       2049 |  0.094s | 0.017s |  0.080s | 0.018s
> >>       4097 |  0.243s | 0.019s |  0.206s | 0.023s
> >>       8193 |  1.202s | 0.028s |  1.562s | 0.032s
> >>      16385 |  9.635s | 0.036s |  9.952s | 0.041s
> >>      32769 | 60.928s | 0.063s | 44.321s | 0.064s
> >>      65537 |         | 0.097s |         | 0.097s
> >>     131073 |         | 0.233s |         | 0.176s
> >>     262145 |         | 0.653s |         | 0.344s
> >>     524289 |         | 2.305s |         | 0.735s
> >>    1048577 |         | 7.107s |         | 2.603s
> >> 
> >> Andrei Vagin reports fixing the performance problem is part of the
> >> work to fix CVE-2016-6213.
> >> 
> >> Cc: stable@vger.kernel.org
> >> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> >> Reported-by: Andrei Vagin <avagin@openvz.org>
> >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> >> ---
> >>  fs/pnode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >>  1 file changed, 62 insertions(+), 1 deletion(-)
> >> 
> >> diff --git a/fs/pnode.c b/fs/pnode.c
> >> index fbaca7df2eb0..53d411a371ce 100644
> >> --- a/fs/pnode.c
> >> +++ b/fs/pnode.c
> >> @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p)
> >>  	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
> >>  }
> >>  
> >> +static inline struct mount *last_slave(struct mount *p)
> >> +{
> >> +	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
> >> +}
> >> +
> >>  static inline struct mount *next_slave(struct mount *p)
> >>  {
> >>  	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
> >> @@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m,
> >>  	}
> >>  }
> >>  
> >> +static struct mount *skip_propagation_subtree(struct mount *m,
> >> +						struct mount *origin)
> >> +{
> >> +	/*
> >> +	 * Advance m such that propagation_next will not return
> >> +	 * the slaves of m.
> >> +	 */
> >> +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
> >> +		m = last_slave(m);
> >> +
> >> +	return m;
> >> +}
> >> +
> >>  static struct mount *next_group(struct mount *m, struct mount *origin)
> >>  {
> >>  	while (1) {
> >> @@ -505,6 +523,15 @@ static void restore_mounts(struct list_head *to_restore)
> >>  	}
> >>  }
> >>  
> >> +static void cleanup_umount_visitations(struct list_head *visited)
> >> +{
> >> +	while (!list_empty(visited)) {
> >> +		struct mount *mnt =
> >> +			list_first_entry(visited, struct mount, mnt_umounting);
> >> +		list_del_init(&mnt->mnt_umounting);
> >> +	}
> >> +}
> >> +
> >>  /*
> >>   * collect all mounts that receive propagation from the mount in @list,
> >>   * and return these additional mounts in the same list.
> >> @@ -517,11 +544,23 @@ int propagate_umount(struct list_head *list)
> >>  	struct mount *mnt;
> >>  	LIST_HEAD(to_restore);
> >>  	LIST_HEAD(to_umount);
> >> +	LIST_HEAD(visited);
> >>  
> >> -	list_for_each_entry(mnt, list, mnt_list) {
> >> +	/* Find candidates for unmounting */
> >> +	list_for_each_entry_reverse(mnt, list, mnt_list) {
> >>  		struct mount *parent = mnt->mnt_parent;
> >>  		struct mount *m;
> >>  
> >> +		/*
> >> +		 * If this mount has already been visited it is known that it's
> >> +		 * entire peer group and all of their slaves in the propagation
> >> +		 * tree for the mountpoint has already been visited and there is
> >> +		 * no need to visit them again.
> >> +		 */
> >> +		if (!list_empty(&mnt->mnt_umounting))
> >> +			continue;
> >> +
> >> +		list_add_tail(&mnt->mnt_umounting, &visited);
> >>  		for (m = propagation_next(parent, parent); m;
> >>  		     m = propagation_next(m, parent)) {
> >>  			struct mount *child = __lookup_mnt(&m->mnt,
> >> @@ -529,6 +568,27 @@ int propagate_umount(struct list_head *list)
> >>  			if (!child)
> >>  				continue;
> >>  
> >> +			if (!list_empty(&child->mnt_umounting)) {
> >> +				/*
> >> +				 * If the child has already been visited it is
> >> +				 * know that it's entire peer group and all of
> >> +				 * their slaves in the propgation tree for the
> >> +				 * mountpoint has already been visited and there
> >> +				 * is no need to visit this subtree again.
> >> +				 */
> >> +				m = skip_propagation_subtree(m, parent);
> >> +				continue;
> >> +			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
> >> +				/*
> >> +				 * We have come accross an partially unmounted
> >> +				 * mount in list that has not been visited yet.
> >> +				 * Remember it has been visited and continue
> >> +				 * about our merry way.
> >> +				 */
> >> +				list_add_tail(&child->mnt_umounting, &visited);
> >> +				continue;
> >> +			}
> >> +
> >>  			/* Check the child and parents while progress is made */
> >>  			while (__propagate_umount(child,
> >>  						  &to_umount, &to_restore)) {
> >> @@ -542,6 +602,7 @@ int propagate_umount(struct list_head *list)
> >>  
> >>  	umount_list(&to_umount, &to_restore);
> >>  	restore_mounts(&to_restore);
> >> +	cleanup_umount_visitations(&visited);
> >>  	list_splice_tail(&to_umount, list);
> >>  
> >>  	return 0;
> >> -- 
> >> 2.10.1
> >> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass
  2017-05-15 20:10                           ` [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass Eric W. Biederman
                                               ` (2 preceding siblings ...)
  2017-05-17  5:54                             ` [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order Eric W. Biederman
@ 2017-05-22  8:15                             ` Ram Pai
  2017-05-22 18:33                               ` Eric W. Biederman
  3 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-05-22  8:15 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

On Mon, May 15, 2017 at 03:10:38PM -0500, Eric W. Biederman wrote:
> 
> It was observed that in some pathological cases the current code
> does not unmount everything it should.  After investigation it was
> determined that the issue is that mnt_change_mntpoint can change
> which mounts are available to be unmounted during mount propagation,
> which is wrong.
> 
> The trivial reproducer is:
> $ cat ./pathological.sh
> 
> mount -t tmpfs test-base /mnt
> cd /mnt
> mkdir 1 2 1/1
> mount --bind 1 1
> mount --make-shared 1
> mount --bind 1 2
> mount --bind 1/1 1/1
> mount --bind 1/1 1/1
> echo
> grep test-base /proc/self/mountinfo
> umount 1/1
> echo
> grep test-base /proc/self/mountinfo
> 
> $ unshare -Urm ./pathological.sh
> 
> The expected output looks like:
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> The output without the fix looks like:
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> 
> That last mount in the output was in the propgation tree to be unmounted but
> was missed because the mnt_change_mountpoint changed it's parent before the walk
> through the mount propagation tree observed it.
> 

The patch looks correct to me.
Reviewed-by: Ram Pai <linuxram@us.ibm.com>

BTW: The logic of find_topper() looks not-so-accurate to me. Why don't we
explicitly flag tucked mounts with MNT_TUCKED, and use that information
to determine if the child is really a topper?  Currently we treat a child
as the topper if it is entirely covering. How do we disambiguate that from
an entirely-covering mount that is explicitly mounted by the administrator?
A topper situation is applicable only when tucked, right?


> Cc: stable@vger.kernel.org
> Fixes: 1064f874abc0 ("mnt: Tuck mounts under others instead of creating shadow/side mounts.")
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  fs/mount.h     |  1 +
>  fs/namespace.c |  1 +
>  fs/pnode.c     | 35 ++++++++++++++++++++++++++++++-----
>  3 files changed, 32 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index bf1fda6eed8f..ede5a1d5cf99 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -58,6 +58,7 @@ struct mount {
>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>  	struct mountpoint *mnt_mp;	/* where is it mounted */
>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
> +	struct list_head mnt_reparent;	/* reparent list entry */
>  #ifdef CONFIG_FSNOTIFY
>  	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
>  	__u32 mnt_fsnotify_mask;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 8bd3e4d448b9..51e49866e1fe 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -236,6 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
>  		INIT_LIST_HEAD(&mnt->mnt_slave);
>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
> +		INIT_LIST_HEAD(&mnt->mnt_reparent);
>  		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
>  	}
>  	return mnt;
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 5bc7896d122a..52aca0a118ff 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -439,7 +439,7 @@ static void mark_umount_candidates(struct mount *mnt)
>   * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
>   * parent propagates to.
>   */
> -static void __propagate_umount(struct mount *mnt)
> +static void __propagate_umount(struct mount *mnt, struct list_head *to_reparent)
>  {
>  	struct mount *parent = mnt->mnt_parent;
>  	struct mount *m;
> @@ -464,17 +464,38 @@ static void __propagate_umount(struct mount *mnt)
>  		 */
>  		topper = find_topper(child);
>  		if (topper)
> -			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
> -					      topper);
> +			list_add_tail(&topper->mnt_reparent, to_reparent);
> 
> -		if (list_empty(&child->mnt_mounts)) {
> +		if (topper || list_empty(&child->mnt_mounts)) {
>  			list_del_init(&child->mnt_child);
> +			list_del_init(&child->mnt_reparent);
>  			child->mnt.mnt_flags |= MNT_UMOUNT;
>  			list_move_tail(&child->mnt_list, &mnt->mnt_list);
>  		}
>  	}
>  }
> 
> +static void reparent_mounts(struct list_head *to_reparent)
> +{
> +	while (!list_empty(to_reparent)) {
> +		struct mount *mnt, *parent;
> +		struct mountpoint *mp;
> +
> +		mnt = list_first_entry(to_reparent, struct mount, mnt_reparent);
> +		list_del_init(&mnt->mnt_reparent);
> +
> +		/* Where should this mount be reparented to? */
> +		mp = mnt->mnt_mp;
> +		parent = mnt->mnt_parent;
> +		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
> +			mp = parent->mnt_mp;
> +			parent = parent->mnt_parent;
> +		}
> +
> +		mnt_change_mountpoint(parent, mp, mnt);
> +	}
> +}
> +
>  /*
>   * collect all mounts that receive propagation from the mount in @list,
>   * and return these additional mounts in the same list.
> @@ -485,11 +506,15 @@ static void __propagate_umount(struct mount *mnt)
>  int propagate_umount(struct list_head *list)
>  {
>  	struct mount *mnt;
> +	LIST_HEAD(to_reparent);
> 
>  	list_for_each_entry_reverse(mnt, list, mnt_list)
>  		mark_umount_candidates(mnt);
> 
>  	list_for_each_entry(mnt, list, mnt_list)
> -		__propagate_umount(mnt);
> +		__propagate_umount(mnt, &to_reparent);
> +
> +	reparent_mounts(&to_reparent);
> +
>  	return 0;
>  }
> -- 
> 2.10.1

-- 
Ram Pai

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass
  2017-05-22  8:15                             ` [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass Ram Pai
@ 2017-05-22 18:33                               ` Eric W. Biederman
  2017-05-22 22:34                                 ` Ram Pai
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-22 18:33 UTC (permalink / raw)
  To: Ram Pai; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

Ram Pai <linuxram@us.ibm.com> writes:

> On Mon, May 15, 2017 at 03:10:38PM -0500, Eric W. Biederman wrote:
>> 
>> It was observed that in some pathlogical cases that the current code
>> does not unmount everything it should.  After investigation it was
>> determined that the issue is that mnt_change_mntpoint can can change
>> which mounts are available to be unmounted during mount propagation
>> which is wrong.
>> 
>> The trivial reproducer is:
>> $ cat ./pathological.sh
>> 
>> mount -t tmpfs test-base /mnt
>> cd /mnt
>> mkdir 1 2 1/1
>> mount --bind 1 1
>> mount --make-shared 1
>> mount --bind 1 2
>> mount --bind 1/1 1/1
>> mount --bind 1/1 1/1
>> echo
>> grep test-base /proc/self/mountinfo
>> umount 1/1
>> echo
>> grep test-base /proc/self/mountinfo
>> 
>> $ unshare -Urm ./pathological.sh
>> 
>> The expected output looks like:
>> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 
>> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 
>> The output without the fix looks like:
>> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 
>> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> 
>> That last mount in the output was in the propgation tree to be unmounted but
>> was missed because the mnt_change_mountpoint changed it's parent before the walk
>> through the mount propagation tree observed it.
>> 
>
> Looks patch correct to me.
> Reviewed-by: Ram Pai <linuxram@us.ibm.com>
>
> BTW: The logic of find_topper() looks not-so-accurate to me. Why dont we
> explicitly flag tucked mounts with MNT_TUCKED, and use that information
> to determine if the child is really a topper?  Currently we determine
> the topper if it entirely is covering. How do we diambiguate that from an
> entirely-covering-mount that is explicitly mounted by the administrator?
> A topper situation is applicable only when tucked, right?

The current code explicitly does not care about the difference.
The code just restricts untucking mounts of any kind to umount
propagation.

This is where we have previously disagreed.

A short summary of our previous discussions:
Eric Biederman: find_topper makes tucked mounts ordinary mounts and is simple.
Eric Biederman: I don't see a compelling case for a MNT_TUCKED flag
Eric Biederman: I think the change is a nice behavioral improvement
Ram Pai: a MNT_TUCKED flag would perfectly preserve existing behavior
Ram Pai: find_topper while not perfect is better than the previous
         very special case for side/shadow mounts

With respect to backwards compatibility the set of bugs I am fixing
shows that it is possible to have some very egregious bugs in this
area and in practice no one cares.


Without a MNT_TUCKED flag I can readily tell what the following
code should do by simple inspection of the mount
propagation information in /proc/self/mountinfo:

$ mount -t tmpfs test-base /mnt
$ cd /mnt
$ mkdir -p 1 2 1/1
$ mount --bind 1 1
$ mount --make-shared 1
$ mount --bind 1 2
$ mount --bind 1/1 1/1
$ mount --bind 1/1 1/1
$ umount 1/1

Before the umount /proc/self/mountinfo shows:
 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=502,gid=502
 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502

So it is clear to me that umount /mnt/1/1 should just leave:
/mnt
/mnt/1
/mnt/2

I would argue that is what the code should always have done.

I believe the code with a MNT_TUCKED flag would leave:
/mnt
/mnt/1
/mnt/1/1
/mnt/2
But in truth it makes my head hurt to think about it.
I don't see that MNT_TUCKED adds anything except aditional code
complexity.

I don't actually see what the value is in keeping mounts that you can
not use (because they are overmounted) around.

If the scenarios we were talking about were all limited to performing a
mount and then undoing that mount I could almost see some value in a
MNT_TUCKED flag.  Given that one of the justifications for tucking mounts
in the first place is what happens when you umount something on a slave
mount, I really don't like it.  As now I get the question: what happens
on a slave mount where a mount has been propagated and tucked, and
then the topper is unmounted and a new topper is added?  Should unmount
on the parent untuck the propagated mount or leave it there?  It was
propagated, it was tucked, but it wasn't tucked under what is currently on
top.

I much prefer the current semantics where we just say mount propagation
can tuck and untuck things, and the history of how the mount tree got
into its current shape is not important.

Given how difficult it has been to make this code performant and correct
I am not particularly eager to add complexity for unnecessary bug
compatibility.  But if it creates a breaking regression for something
(other than a regression test) I am willing to add MNT_TUCKED.

Eric

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass
  2017-05-22 18:33                               ` Eric W. Biederman
@ 2017-05-22 22:34                                 ` Ram Pai
  2017-05-23 13:58                                   ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-05-22 22:34 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

On Mon, May 22, 2017 at 01:33:05PM -0500, Eric W. Biederman wrote:
> Ram Pai <linuxram@us.ibm.com> writes:
> 
> > On Mon, May 15, 2017 at 03:10:38PM -0500, Eric W. Biederman wrote:
> >> 
> >> It was observed that in some pathlogical cases that the current code
> >> does not unmount everything it should.  After investigation it was
> >> determined that the issue is that mnt_change_mntpoint can can change
> >> which mounts are available to be unmounted during mount propagation
> >> which is wrong.
> >> 
> >> The trivial reproducer is:
> >> $ cat ./pathological.sh
> >> 
> >> mount -t tmpfs test-base /mnt
> >> cd /mnt
> >> mkdir 1 2 1/1
> >> mount --bind 1 1
> >> mount --make-shared 1
> >> mount --bind 1 2
> >> mount --bind 1/1 1/1
> >> mount --bind 1/1 1/1
> >> echo
> >> grep test-base /proc/self/mountinfo
> >> umount 1/1
> >> echo
> >> grep test-base /proc/self/mountinfo
> >> 
> >> $ unshare -Urm ./pathological.sh
> >> 
> >> The expected output looks like:
> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 
> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 
> >> The output without the fix looks like:
> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 
> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
> >> 
> >> That last mount in the output was in the propgation tree to be unmounted but
> >> was missed because the mnt_change_mountpoint changed it's parent before the walk
> >> through the mount propagation tree observed it.
> >> 
> >
> > Looks patch correct to me.
> > Reviewed-by: Ram Pai <linuxram@us.ibm.com>
> >
> > BTW: The logic of find_topper() looks not-so-accurate to me. Why dont we
> > explicitly flag tucked mounts with MNT_TUCKED, and use that information
> > to determine if the child is really a topper?  Currently we determine
> > the topper if it entirely is covering. How do we diambiguate that from an
> > entirely-covering-mount that is explicitly mounted by the administrator?
> > A topper situation is applicable only when tucked, right?
> 
> In the current code explictly does not care about the difference.
> The code just restricts untucking mounts of any kind to umount
> propagation.
> 
> This is where we have previously disagreed.
> 
> A short summary of our previous discussions:
> Eric Biederman: find_topper makes tucked mounts ordinary mounts and is simple.
> Eric Biederman: I don't see a compelling case for a MNT_TUCKED flag
> Eric Biederman: I think the change is a nice behavioral improvement
> Ram Pai: a MNT_TUCKED flag would perfectly preserve existing behavior
> Ram Pai: find_topper while not perfect is better than the previous
>          very special case for side/shadow mounts
> 
> With respect to backwards compatibility the set of bugs I am fixing
> shows that it is possible to have some very egregious bugs in this
> area and in practice no one cares.
> 
> 
> Without a MNT_TUCKED flag I can readily tell what the following
> code should do by simply inspection of the of the mount
> propgation information in /proc/self/mountinfo:
> 
Step 1>  $ mount -t tmpfs test-base /mnt
Step 2>  $ cd /mnt
Step 3>  $ mkdir -p 1 2 1/1
Step 4>  $ mount --bind 1 1
Step 5>  $ mount --make-shared 1
Step 6>  $ mount --bind 1 2
Step 7>  $ mount --bind 1/1 1/1
Step 8>  $ mount --bind 1/1 1/1
Step 9>  $ umount 1/1
> 
> Before the umount /proc/self/mountinfo shows:
>  46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=502,gid=502
>  47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>  52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
> 
> So it is clear to me that umount /mnt/1/1 should just leave:
> /mnt
> /mnt/1
> /mnt/2

This is one other place where we disagree.  I expect things to peel
back to the state the trees were in when Step 7 was executed, which
is 
/mnt 
/mnt/1
/mnt/2
/mnt/1/1
/mnt/2/1
And all tucked mounts disapper.

Don't get me wrong. I don't think we will agree because we have different
expectations. There is no standard on what to expect. Someone
authoritative -- maybe Al Viro -- has to define what to expect. 

> 
> I would argue that is what the code should always have done.
> 
> I believe the code with a MNT_TUCKED flag would leave:
> /mnt
> /mnt/1
> /mnt/1/1
> /mnt/2
> But in truth it makes my head hurt to think about it.

Yes it is extremely mind-bending; sometimes mind-bleeding. :-(

> I don't see that MNT_TUCKED adds anything except aditional code
> complexity.
> 
> I don't actually see what the value is in keeping mounts that you can
> not use (because they are overmounted) around.

I argue that MNT_TUCKED leaves markers that can be used to determine
what can be taken out and what needs to be kept.

I will stop here and say.. there is value in marking TUCKED mounts.
Someone will run into some obscure issue in the future; probably a
decade from now, and the same story will repeat.

I wish there was a mathematical formula, where you plugin the operation
and a state of the trees, and the new state of the mount-trees emerge.

For now your patches look good to me.
RP

> 
> If the scenarios we were talking about were all limited to perfoming a
> mount and then undoing that mount I could almost see some value in a
> MNT_TUCKED flag.  Given that one of the justications for tucking mounts
> in the first place is what happens when you umount something on a slave
> mount I really don't like it.  As now I get the question what happens
> on a slave mount where a mount has been propagated and tucked, and
> then the topper is unmounted and a new topper is added.  Should unmount
> on the parent untuck the propagated mount or leave it there?  It was
> propagated it was tucked but it wasn't tucked under what is currenty on
> top.
> 
> I much prefer the current semantics where we just say mount propagation
> can tuck and untuck things, and the history of how the mount tree got
> into its current shape is not important.
> 
> Given how difficult it has been to make this code performant and correct
> I am not particularly eager to add complexity for unnecessary bug
> compatibility.  But if it creates a breaking regression for something
> (other than a regression test) I am willing to add MNT_TUCKED.
> 
> Eric

-- 
Ram Pai

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass
  2017-05-22 22:34                                 ` Ram Pai
@ 2017-05-23 13:58                                   ` Eric W. Biederman
  0 siblings, 0 replies; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-23 13:58 UTC (permalink / raw)
  To: Ram Pai; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

Ram Pai <linuxram@us.ibm.com> writes:

> On Mon, May 22, 2017 at 01:33:05PM -0500, Eric W. Biederman wrote:
>> Ram Pai <linuxram@us.ibm.com> writes:
>> 
>> > On Mon, May 15, 2017 at 03:10:38PM -0500, Eric W. Biederman wrote:
>> >> 
>> >> It was observed that in some pathlogical cases that the current code
>> >> does not unmount everything it should.  After investigation it was
>> >> determined that the issue is that mnt_change_mntpoint can can change
>> >> which mounts are available to be unmounted during mount propagation
>> >> which is wrong.
>> >> 
>> >> The trivial reproducer is:
>> >> $ cat ./pathological.sh
>> >> 
>> >> mount -t tmpfs test-base /mnt
>> >> cd /mnt
>> >> mkdir 1 2 1/1
>> >> mount --bind 1 1
>> >> mount --make-shared 1
>> >> mount --bind 1 2
>> >> mount --bind 1/1 1/1
>> >> mount --bind 1/1 1/1
>> >> echo
>> >> grep test-base /proc/self/mountinfo
>> >> umount 1/1
>> >> echo
>> >> grep test-base /proc/self/mountinfo
>> >> 
>> >> $ unshare -Urm ./pathological.sh
>> >> 
>> >> The expected output looks like:
>> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 
>> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 
>> >> The output without the fix looks like:
>> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 
>> >> 46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=1000,gid=1000
>> >> 47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 52 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=1000,gid=1000
>> >> 
>> >> That last mount in the output was in the propgation tree to be unmounted but
>> >> was missed because the mnt_change_mountpoint changed it's parent before the walk
>> >> through the mount propagation tree observed it.
>> >> 
>> >
>> > Looks patch correct to me.
>> > Reviewed-by: Ram Pai <linuxram@us.ibm.com>
>> >
>> > BTW: The logic of find_topper() looks not-so-accurate to me. Why dont we
>> > explicitly flag tucked mounts with MNT_TUCKED, and use that information
>> > to determine if the child is really a topper?  Currently we determine
>> > the topper if it entirely is covering. How do we diambiguate that from an
>> > entirely-covering-mount that is explicitly mounted by the administrator?
>> > A topper situation is applicable only when tucked, right?
>> 
>> In the current code explictly does not care about the difference.
>> The code just restricts untucking mounts of any kind to umount
>> propagation.
>> 
>> This is where we have previously disagreed.
>> 
>> A short summary of our previous discussions:
>> Eric Biederman: find_topper makes tucked mounts ordinary mounts and is simple.
>> Eric Biederman: I don't see a compelling case for a MNT_TUCKED flag
>> Eric Biederman: I think the change is a nice behavioral improvement
>> Ram Pai: a MNT_TUCKED flag would perfectly preserve existing behavior
>> Ram Pai: find_topper while not perfect is better than the previous
>>          very special case for side/shadow mounts
>> 
>> With respect to backwards compatibility the set of bugs I am fixing
>> shows that it is possible to have some very egregious bugs in this
>> area and in practice no one cares.
>> 
>> 
>> Without a MNT_TUCKED flag I can readily tell what the following
>> code should do by simply inspection of the of the mount
>> propgation information in /proc/self/mountinfo:
>> 
> Step 1>  $ mount -t tmpfs test-base /mnt
> Step 2>  $ cd /mnt
> Step 3>  $ mkdir -p 1 2 1/1
> Step 4>  $ mount --bind 1 1
> Step 5>  $ mount --make-shared 1
> Step 6>  $ mount --bind 1 2
> Step 7>  $ mount --bind 1/1 1/1
> Step 8>  $ mount --bind 1/1 1/1
> Step 9>  $ umount 1/1
>> 
>> Before the umount /proc/self/mountinfo shows:
>>  46 31 0:25 / /mnt rw,relatime - tmpfs test-base rw,uid=502,gid=502
>>  47 46 0:25 /1 /mnt/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  48 46 0:25 /1 /mnt/2 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  49 54 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  50 53 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  51 49 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  54 47 0:25 /1/1 /mnt/1/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  53 48 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>>  52 50 0:25 /1/1 /mnt/2/1 rw,relatime shared:1 - tmpfs test-base rw,uid=502,gid=502
>> 
>> So it is clear to me that umount /mnt/1/1 should just leave:
>> /mnt
>> /mnt/1
>> /mnt/2
>
> This is one other place where we disagree....  I expect things to peel
> back to the state the trees were, when the Step 7 was executed. which
> is 
> /mnt 
> /mnt/1
> /mnt/2
> /mnt/1/1
> /mnt/2/1
> And all tucked mounts disapper.
>
> Dont get me wrong. I dont think we will agree because we have different
> expections. There is no standard on what to expect. Someone
> authoritative; may be Al Viro, has to define what to expect. 
>
>> 
>> I would argue that is what the code should always have done.
>> 
>> I believe the code with a MNT_TUCKED flag would leave:
>> /mnt
>> /mnt/1
>> /mnt/1/1
>> /mnt/2
>> But in truth it makes my head hurt to think about it.
>
> Yes it is extremely mind-bending; sometimes mind-bleeding. :-(
>
>> I don't see that MNT_TUCKED adds anything except aditional code
>> complexity.
>> 
>> I don't actually see what the value is in keeping mounts that you can
>> not use (because they are overmounted) around.
>
> I argue that MNT_TUCKED leaves markers that can be used to determine
> what can be taken out and what needs to be kept.
>
> I will stop here and say.. there is value in marking TUCKED mounts.
> Someone will run into some obscure issue in the future; probably a
> decade from now, and the same story will repeat.
>
> I wish there was a mathematical formula, where you plugin the operation
> and a state of the trees, and the new state of the mount-trees emerge.
>
> For now your patches look good to me.

Then let me ask you this.  Please look at the two successor patches to
this.  I am confident in them but I also know I am human and may have
missed something.  

Then on top of those let's look at marking tucked mounts.  If we write
the code and examine motivating cases where the behavior differs we
should be able to make an informed choice.

I say motivating cases as there are use cases with slave mounts that
motivated the support of tucking mounts.  I figure if we go back through
and examine those we can at least see if there are any cases where
without marking them we have a practical issue.  Or perhaps we will
see that for all cases that we can think of that matter there are no
differences.

I am motivated to solve this because after fixing the performance issues
we need to find a way for some set of mount namespaces to recreate their
mount propagation tree on a different machine.   That is needed for
CRIU and it may be needed for the plan9 case where logging into a remote
system you could bring all of your filesystems with you into your own
personal mount namespace.

Eric

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order
  2017-05-17  5:54                             ` [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order Eric W. Biederman
  2017-05-17  5:55                               ` [REVIEW][PATCH 2/2] mnt: Make propagate_umount less slow for overlapping mount propagation trees Eric W. Biederman
@ 2017-05-24 20:42                               ` Ram Pai
  2017-05-24 21:54                                 ` Eric W. Biederman
  2017-05-30  6:07                               ` Ram Pai
  2 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-05-24 20:42 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

On Wed, May 17, 2017 at 12:54:34AM -0500, Eric W. Biederman wrote:
> 
> While investigating some poor umount performance I realized that in
> the case of overlapping mount trees where some of the mounts are locked
> the code has been failing to unmount all of the mounts it should
> have been unmounting.
> 
> This failure to unmount all of the necessary
> mounts can be reproduced with:
> 
> $ cat locked_mounts_test.sh
> 
> mount -t tmpfs test-base /mnt
> mount --make-shared /mnt
> mkdir -p /mnt/b
> 
> mount -t tmpfs test1 /mnt/b
> mount --make-shared /mnt/b
> mkdir -p /mnt/b/10
> 
> mount -t tmpfs test2 /mnt/b/10
> mount --make-shared /mnt/b/10
> mkdir -p /mnt/b/10/20
> 
> mount --rbind /mnt/b /mnt/b/10/20
> 
> unshare -Urm --propagation unchaged /bin/sh -c 'sleep 5; if [ $(grep test /proc/self/mountinfo | wc -l) -eq 1 ] ; then echo SUCCESS ; else echo FAILURE ; fi'
> sleep 1
> umount -l /mnt/b
> wait %%
> 
> $ unshare -Urm ./locked_mounts_test.sh
> 
> This failure is corrected by removing the prepass that marks mounts
> that may be umounted.
> 
> A first pass is added that umounts mounts if possible and if not sets
> mount mark if they could be unmounted if they weren't locked and adds
> them to a list to umount possibilities.  This first pass reconsiders
> the mounts parent if it is on the list of umount possibilities, ensuring
> that information of umoutability will pass from child to mount parent.
> 
> A second pass then walks through all mounts that are umounted and processes
> their children unmounting them or marking them for reparenting.
> 
> A last pass cleans up the state on the mounts that could not be umounted
> and if applicable reparents them to their first parent that remained
> mounted.
> 
> While a bit longer than the old code this code is much more robust
> as it allows information to flow up from the leaves and down
> from the trunk making the order in which mounts are encountered
> in the umount propgation tree irrelevant.

Eric,
	I think we can accomplish what you want in a much simpler way.
       	Would the patch below; UNTESTED BUT COMPILED, resolve your
	issue?

	Its a two pass unmount. First pass marks mounts that can
	be unmounted, and second pass does the necessary unlinks.
	It does mark TUCKED mounts, and uses that information
	to peel off the correct mounts. Key points are

	a) a tucked mount never entertains any unmount propagation
	 	on its root dentry.

	b) when the child on the root dentry of a tucked mount is
	   unmounted, the tucked mount is not a tucked mount anymore.

	c) if the child is a tucked mount, then its child is reparented
	   to the parent.


Signed-off-by: "Ram Pai" <linuxram@us.ibm.com>

fs/namespace.c        |    4 ++-
fs/pnode.c            |   53 +++++++++++++++++++++++++++++++++++++-------------
fs/pnode.h            |    3 ++
include/linux/mount.h |    1 

diff --git a/fs/namespace.c b/fs/namespace.c
index cc1375ef..ff3ec90 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2050,8 +2050,10 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		hlist_del_init(&child->mnt_hash);
 		q = __lookup_mnt(&child->mnt_parent->mnt,
 				 child->mnt_mountpoint);
-		if (q)
+		if (q) {
 			mnt_change_mountpoint(child, smp, q);
+			SET_MNT_TUCKED(child);
+		}
 		commit_tree(child);
 	}
 	put_mountpoint(smp);
diff --git a/fs/pnode.c b/fs/pnode.c
index 5bc7896..b44a544 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -448,31 +448,58 @@ static void __propagate_umount(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		struct mount *topper;
-		struct mount *child = __lookup_mnt(&m->mnt,
-						mnt->mnt_mountpoint);
-		/*
-		 * umount the child only if the child has no children
-		 * and the child is marked safe to unmount.
+		struct mount *topper, *child;
+
+		/* Tucked mount must drop umount propagation events on
+		 * its **root dentry**.
+		 * The tucked mount did not exist when that child came
+		 * into existence. It never received that mount propagation.
+		 * Hence it should never entertain the umount propagation
+		 * aswell.
 		 */
+		if (IS_MNT_TUCKED(m) && list_is_singular(&mnt->mnt_mounts))
+			continue;
+
+
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
+
 		if (!child || !IS_MNT_MARKED(child))
 			continue;
+
 		CLEAR_MNT_MARK(child);
 
-		/* If there is exactly one mount covering all of child
-		 * replace child with that mount.
-		 */
-		topper = find_topper(child);
-		if (topper)
-			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
-					      topper);
+		if (IS_MNT_TUCKED(child) &&
+			(list_is_singular(&child->mnt_mounts))) {
+			topper = find_topper(child);
+			if (topper) {
+				mnt_change_mountpoint(child->mnt_parent,
+					child->mnt_mp, topper);
+				CLEAR_MNT_TUCKED(child); /*lets be precise*/
+			}
+		}
 
 		if (list_empty(&child->mnt_mounts)) {
 			list_del_init(&child->mnt_child);
 			child->mnt.mnt_flags |= MNT_UMOUNT;
 			list_move_tail(&child->mnt_list, &mnt->mnt_list);
 		}
+#if 0
+	       	else {
+			mntput(child); /* mark it for deletion. It will
+				       	  be deleted whenever it looses
+					  all its remaining references.
+					  TODO: some more thought
+					  needed, please validate */
+		}
+#endif
 	}
+
+	/*
+	 * This explicit umount operation is exposing the parent.
+	 * In case the parent was a 'tucked' mount, it cannot be so
+	 * anymore.
+	 */
+	CLEAR_MNT_TUCKED(parent);
 }
 
 /*
diff --git a/fs/pnode.h b/fs/pnode.h
index dc87e65..9ebd1a8 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -18,8 +18,11 @@
 #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
 #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
 #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
+#define SET_MNT_TUCKED(m) ((m)->mnt.mnt_flags |= MNT_TUCKED)
 #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
+#define CLEAR_MNT_TUCKED(m) ((m)->mnt.mnt_flags &= ~MNT_TUCKED)
 #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
+#define IS_MNT_TUCKED(m) ((m)->mnt.mnt_flags & MNT_TUCKED)
 
 #define CL_EXPIRE    		0x01
 #define CL_SLAVE     		0x02
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 8e0352a..41674e7 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -62,6 +62,7 @@
 #define MNT_SYNC_UMOUNT		0x2000000
 #define MNT_MARKED		0x4000000
 #define MNT_UMOUNT		0x8000000
+#define MNT_TUCKED		0x10000000
 
 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order
  2017-05-24 20:42                               ` [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order Ram Pai
@ 2017-05-24 21:54                                 ` Eric W. Biederman
  2017-05-24 22:35                                   ` Ram Pai
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-24 21:54 UTC (permalink / raw)
  To: Ram Pai; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

Ram Pai <linuxram@us.ibm.com> writes:

> On Wed, May 17, 2017 at 12:54:34AM -0500, Eric W. Biederman wrote:
>> 
>> While investigating some poor umount performance I realized that in
>> the case of overlapping mount trees where some of the mounts are locked
>> the code has been failing to unmount all of the mounts it should
>> have been unmounting.
>> 
>> This failure to unmount all of the necessary
>> mounts can be reproduced with:
>> 
>> $ cat locked_mounts_test.sh
>> 
>> mount -t tmpfs test-base /mnt
>> mount --make-shared /mnt
>> mkdir -p /mnt/b
>> 
>> mount -t tmpfs test1 /mnt/b
>> mount --make-shared /mnt/b
>> mkdir -p /mnt/b/10
>> 
>> mount -t tmpfs test2 /mnt/b/10
>> mount --make-shared /mnt/b/10
>> mkdir -p /mnt/b/10/20
>> 
>> mount --rbind /mnt/b /mnt/b/10/20
>> 
>> unshare -Urm --propagation unchaged /bin/sh -c 'sleep 5; if [ $(grep test /proc/self/mountinfo | wc -l) -eq 1 ] ; then echo SUCCESS ; else echo FAILURE ; fi'
>> sleep 1
>> umount -l /mnt/b
>> wait %%
>> 
>> $ unshare -Urm ./locked_mounts_test.sh
>> 
>> This failure is corrected by removing the prepass that marks mounts
>> that may be umounted.
>> 
>> A first pass is added that umounts mounts if possible and if not sets
>> mount mark if they could be unmounted if they weren't locked and adds
>> them to a list to umount possibilities.  This first pass reconsiders
>> the mounts parent if it is on the list of umount possibilities, ensuring
>> that information of umoutability will pass from child to mount parent.
>> 
>> A second pass then walks through all mounts that are umounted and processes
>> their children unmounting them or marking them for reparenting.
>> 
>> A last pass cleans up the state on the mounts that could not be umounted
>> and if applicable reparents them to their first parent that remained
>> mounted.
>> 
>> While a bit longer than the old code this code is much more robust
>> as it allows information to flow up from the leaves and down
>> from the trunk making the order in which mounts are encountered
>> in the umount propgation tree irrelevant.
>
> Eric,
> 	I think we can accomplish what you want in a much simpler way.
>        	Would the patch below; UNTESTED BUT COMPILED, resolve your
> 	issue?

The reason I came up with an algorithm where the information flows
both directions is that especially in the case of umount -l
but even in some rare cases of a simple umount, the ordering
of the mount propagation tree can result in parent mounts being
visited before the child mounts.

This case shows in in the case of a mount or a set of mounts
being mounted below itself.

So no.   Irregardless of tucked mount state we can't do this.

I see this also doesn't have the change to move mnt_change_mountpoint
into another pass.  That one is quite important from a practical
point of view as that means the way the mount tree changes in umount
is the same irrespective of the number of times a mount shows
up in the mount propagation trees.  Which is a very important
property to have for optimizing umount -l.  Which in
the worst case allows reduces umount from O(N^2+) to roughly O(N).

All of what I am doing should have no effect on an implementation of
MNT_TUCKED.

That said your code to deal with MNT_TUCKED seems reasonable.

Eric

>
> 	Its a two pass unmount. First pass marks mounts that can
> 	be unmounted, and second pass does the neccessary unlinks.
> 	It does mark TUCKED mounts, and uses that information
> 	to peel off the correct mounts. Key points are
>
> 	a) a tucked mount never entertain any unmount propagation
> 	 	on its root dentry.
>
> 	b) when the child on the root dentry of a tucked mount is
> 	   unmounted, the tucked mount is not a tucked mount anymore.
>
> 	c) if the child is a tucked mount, than its child is reparented
> 	   to the parent.
>
>
> Signed-off-by: "Ram Pai" <linuxram@us.ibm.com>
>
> fs/namespace.c        |    4 ++-
> fs/pnode.c            |   53 +++++++++++++++++++++++++++++++++++++-------------
> fs/pnode.h            |    3 ++
> include/linux/mount.h |    1 
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index cc1375ef..ff3ec90 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2050,8 +2050,10 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>  		hlist_del_init(&child->mnt_hash);
>  		q = __lookup_mnt(&child->mnt_parent->mnt,
>  				 child->mnt_mountpoint);
> -		if (q)
> +		if (q) {
>  			mnt_change_mountpoint(child, smp, q);
> +			SET_MNT_TUCKED(child);
> +		}
>  		commit_tree(child);
>  	}
>  	put_mountpoint(smp);
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 5bc7896..b44a544 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -448,31 +448,58 @@ static void __propagate_umount(struct mount *mnt)
>  
>  	for (m = propagation_next(parent, parent); m;
>  			m = propagation_next(m, parent)) {
> -		struct mount *topper;
> -		struct mount *child = __lookup_mnt(&m->mnt,
> -						mnt->mnt_mountpoint);
> -		/*
> -		 * umount the child only if the child has no children
> -		 * and the child is marked safe to unmount.
> +		struct mount *topper, *child;
> +
> +		/* Tucked mount must drop umount propagation events on
> +		 * its **root dentry**.
> +		 * The tucked mount did not exist when that child came
> +		 * into existence. It never received that mount propagation.
> +		 * Hence it should never entertain the umount propagation
> +		 * aswell.
>  		 */
> +		if (IS_MNT_TUCKED(m) && list_is_singular(&mnt->mnt_mounts))
> +			continue;
> +
> +
> +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
> +
>  		if (!child || !IS_MNT_MARKED(child))
>  			continue;
> +
>  		CLEAR_MNT_MARK(child);
>  
> -		/* If there is exactly one mount covering all of child
> -		 * replace child with that mount.
> -		 */
> -		topper = find_topper(child);
> -		if (topper)
> -			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
> -					      topper);
> +		if (IS_MNT_TUCKED(child) &&
> +			(list_is_singular(&child->mnt_mounts))) {
> +			topper = find_topper(child);
> +			if (topper) {
> +				mnt_change_mountpoint(child->mnt_parent,
> +					child->mnt_mp, topper);
> +				CLEAR_MNT_TUCKED(child); /*lets be precise*/
> +			}
> +		}
>  
>  		if (list_empty(&child->mnt_mounts)) {
>  			list_del_init(&child->mnt_child);
>  			child->mnt.mnt_flags |= MNT_UMOUNT;
>  			list_move_tail(&child->mnt_list, &mnt->mnt_list);
>  		}
> +#if 0
> +	       	else {
> +			mntput(child); /* mark it for deletion. It will
> +				       	  be deleted whenever it looses
> +					  all its remaining references.
> +					  TODO: some more thought
> +					  needed, please validate */
> +		}
> +#endif
>  	}
> +
> +	/*
> +	 * This explicit umount operation is exposing the parent.
> +	 * In case the parent was a 'tucked' mount, it cannot be so
> +	 * anymore.
> +	 */
> +	CLEAR_MNT_TUCKED(parent);
>  }
>  
>  /*
> diff --git a/fs/pnode.h b/fs/pnode.h
> index dc87e65..9ebd1a8 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -18,8 +18,11 @@
>  #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
>  #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
>  #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
> +#define SET_MNT_TUCKED(m) ((m)->mnt.mnt_flags |= MNT_TUCKED)
>  #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
> +#define CLEAR_MNT_TUCKED(m) ((m)->mnt.mnt_flags &= ~MNT_TUCKED)
>  #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
> +#define IS_MNT_TUCKED(m) ((m)->mnt.mnt_flags & MNT_TUCKED)
>  
>  #define CL_EXPIRE    		0x01
>  #define CL_SLAVE     		0x02
> diff --git a/include/linux/mount.h b/include/linux/mount.h
> index 8e0352a..41674e7 100644
> --- a/include/linux/mount.h
> +++ b/include/linux/mount.h
> @@ -62,6 +62,7 @@
>  #define MNT_SYNC_UMOUNT		0x2000000
>  #define MNT_MARKED		0x4000000
>  #define MNT_UMOUNT		0x8000000
> +#define MNT_TUCKED		0x10000000
>  
>  struct vfsmount {
>  	struct dentry *mnt_root;	/* root of the mounted tree */

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order
  2017-05-24 21:54                                 ` Eric W. Biederman
@ 2017-05-24 22:35                                   ` Ram Pai
  0 siblings, 0 replies; 63+ messages in thread
From: Ram Pai @ 2017-05-24 22:35 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

On Wed, May 24, 2017 at 04:54:34PM -0500, Eric W. Biederman wrote:
> Ram Pai <linuxram@us.ibm.com> writes:
> 
> > On Wed, May 17, 2017 at 12:54:34AM -0500, Eric W. Biederman wrote:
> >> 
> >> While investigating some poor umount performance I realized that in
> >> the case of overlapping mount trees where some of the mounts are locked
> >> the code has been failing to unmount all of the mounts it should
> >> have been unmounting.
> >> 
> >> This failure to unmount all of the necessary
> >> mounts can be reproduced with:
> >> 
> >> $ cat locked_mounts_test.sh
> >> 
> >> mount -t tmpfs test-base /mnt
> >> mount --make-shared /mnt
> >> mkdir -p /mnt/b
> >> 
> >> mount -t tmpfs test1 /mnt/b
> >> mount --make-shared /mnt/b
> >> mkdir -p /mnt/b/10
> >> 
> >> mount -t tmpfs test2 /mnt/b/10
> >> mount --make-shared /mnt/b/10
> >> mkdir -p /mnt/b/10/20
> >> 
> >> mount --rbind /mnt/b /mnt/b/10/20
> >> 
> >> unshare -Urm --propagation unchaged /bin/sh -c 'sleep 5; if [ $(grep test /proc/self/mountinfo | wc -l) -eq 1 ] ; then echo SUCCESS ; else echo FAILURE ; fi'
> >> sleep 1
> >> umount -l /mnt/b
> >> wait %%
> >> 
> >> $ unshare -Urm ./locked_mounts_test.sh
> >> 
> >> This failure is corrected by removing the prepass that marks mounts
> >> that may be umounted.
> >> 
> >> A first pass is added that umounts mounts if possible and if not sets
> >> mount mark if they could be unmounted if they weren't locked and adds
> >> them to a list to umount possibilities.  This first pass reconsiders
> >> the mounts parent if it is on the list of umount possibilities, ensuring
> >> that information of umoutability will pass from child to mount parent.
> >> 
> >> A second pass then walks through all mounts that are umounted and processes
> >> their children unmounting them or marking them for reparenting.
> >> 
> >> A last pass cleans up the state on the mounts that could not be umounted
> >> and if applicable reparents them to their first parent that remained
> >> mounted.
> >> 
> >> While a bit longer than the old code this code is much more robust
> >> as it allows information to flow up from the leaves and down
> >> from the trunk making the order in which mounts are encountered
> >> in the umount propgation tree irrelevant.
> >
> > Eric,
> > 	I think we can accomplish what you want in a much simpler way.
> >        	Would the patch below; UNTESTED BUT COMPILED, resolve your
> > 	issue?
> 
> The reason I came up with an algorithm where the information flows
> both directions is that especially in the case of umount -l
> but even in some rare cases of a simple umount, the ordering
> of the mount propagation tree can result in parent mounts being
> visited before the child mounts.
> 
> This case shows in in the case of a mount or a set of mounts
> being mounted below itself.
> 
> So no.   Irregardless of tucked mount state we can't do this.

Ok. I thought I had taken care, regardless of the order in which the mounts
were encountered. I need to understand your patch better. Will relook at it later
tonight.

RP

> 
> I see this also doesn't have the change to move mnt_change_mountpoint
> into another pass.  That one is quite important from a practical
> point of view as that means the way the mount tree changes in umount
> is the same irrespective of the number of times a mount shows
> up in the mount propagation trees.  Which is a very important
> property to have for optimizing umount -l.  Which in
> the worst case allows reduces umount from O(N^2+) to roughly O(N).
> 
> All of what I am doing should have not effect on an implementation of
> MNT_TUCKED.
> 
> That said your code to deal with MNT_TUCKED seems reasonable.
> 
> Eric
> 
> >
> > 	Its a two pass unmount. First pass marks mounts that can
> > 	be unmounted, and second pass does the neccessary unlinks.
> > 	It does mark TUCKED mounts, and uses that information
> > 	to peel off the correct mounts. Key points are
> >
> > 	a) a tucked mount never entertain any unmount propagation
> > 	 	on its root dentry.
> >
> > 	b) when the child on the root dentry of a tucked mount is
> > 	   unmounted, the tucked mount is not a tucked mount anymore.
> >
> > 	c) if the child is a tucked mount, than its child is reparented
> > 	   to the parent.
> >
> >
> > Signed-off-by: "Ram Pai" <linuxram@us.ibm.com>
> >
> > fs/namespace.c        |    4 ++-
> > fs/pnode.c            |   53 +++++++++++++++++++++++++++++++++++++-------------
> > fs/pnode.h            |    3 ++
> > include/linux/mount.h |    1 
> >
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index cc1375ef..ff3ec90 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -2050,8 +2050,10 @@ static int attach_recursive_mnt(struct mount *source_mnt,
> >  		hlist_del_init(&child->mnt_hash);
> >  		q = __lookup_mnt(&child->mnt_parent->mnt,
> >  				 child->mnt_mountpoint);
> > -		if (q)
> > +		if (q) {
> >  			mnt_change_mountpoint(child, smp, q);
> > +			SET_MNT_TUCKED(child);
> > +		}
> >  		commit_tree(child);
> >  	}
> >  	put_mountpoint(smp);
> > diff --git a/fs/pnode.c b/fs/pnode.c
> > index 5bc7896..b44a544 100644
> > --- a/fs/pnode.c
> > +++ b/fs/pnode.c
> > @@ -448,31 +448,58 @@ static void __propagate_umount(struct mount *mnt)
> >  
> >  	for (m = propagation_next(parent, parent); m;
> >  			m = propagation_next(m, parent)) {
> > -		struct mount *topper;
> > -		struct mount *child = __lookup_mnt(&m->mnt,
> > -						mnt->mnt_mountpoint);
> > -		/*
> > -		 * umount the child only if the child has no children
> > -		 * and the child is marked safe to unmount.
> > +		struct mount *topper, *child;
> > +
> > +		/* Tucked mount must drop umount propagation events on
> > +		 * its **root dentry**.
> > +		 * The tucked mount did not exist when that child came
> > +		 * into existence. It never received that mount propagation.
> > +		 * Hence it should never entertain the umount propagation
> > +		 * aswell.
> >  		 */
> > +		if (IS_MNT_TUCKED(m) && list_is_singular(&mnt->mnt_mounts))
> > +			continue;
> > +
> > +
> > +		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
> > +
> >  		if (!child || !IS_MNT_MARKED(child))
> >  			continue;
> > +
> >  		CLEAR_MNT_MARK(child);
> >  
> > -		/* If there is exactly one mount covering all of child
> > -		 * replace child with that mount.
> > -		 */
> > -		topper = find_topper(child);
> > -		if (topper)
> > -			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
> > -					      topper);
> > +		if (IS_MNT_TUCKED(child) &&
> > +			(list_is_singular(&child->mnt_mounts))) {
> > +			topper = find_topper(child);
> > +			if (topper) {
> > +				mnt_change_mountpoint(child->mnt_parent,
> > +					child->mnt_mp, topper);
> > +				CLEAR_MNT_TUCKED(child); /*lets be precise*/
> > +			}
> > +		}
> >  
> >  		if (list_empty(&child->mnt_mounts)) {
> >  			list_del_init(&child->mnt_child);
> >  			child->mnt.mnt_flags |= MNT_UMOUNT;
> >  			list_move_tail(&child->mnt_list, &mnt->mnt_list);
> >  		}
> > +#if 0
> > +	       	else {
> > +			mntput(child); /* mark it for deletion. It will
> > +				       	  be deleted whenever it looses
> > +					  all its remaining references.
> > +					  TODO: some more thought
> > +					  needed, please validate */
> > +		}
> > +#endif
> >  	}
> > +
> > +	/*
> > +	 * This explicit umount operation is exposing the parent.
> > +	 * In case the parent was a 'tucked' mount, it cannot be so
> > +	 * anymore.
> > +	 */
> > +	CLEAR_MNT_TUCKED(parent);
> >  }
> >  
> >  /*
> > diff --git a/fs/pnode.h b/fs/pnode.h
> > index dc87e65..9ebd1a8 100644
> > --- a/fs/pnode.h
> > +++ b/fs/pnode.h
> > @@ -18,8 +18,11 @@
> >  #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
> >  #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
> >  #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
> > +#define SET_MNT_TUCKED(m) ((m)->mnt.mnt_flags |= MNT_TUCKED)
> >  #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
> > +#define CLEAR_MNT_TUCKED(m) ((m)->mnt.mnt_flags &= ~MNT_TUCKED)
> >  #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
> > +#define IS_MNT_TUCKED(m) ((m)->mnt.mnt_flags & MNT_TUCKED)
> >  
> >  #define CL_EXPIRE    		0x01
> >  #define CL_SLAVE     		0x02
> > diff --git a/include/linux/mount.h b/include/linux/mount.h
> > index 8e0352a..41674e7 100644
> > --- a/include/linux/mount.h
> > +++ b/include/linux/mount.h
> > @@ -62,6 +62,7 @@
> >  #define MNT_SYNC_UMOUNT		0x2000000
> >  #define MNT_MARKED		0x4000000
> >  #define MNT_UMOUNT		0x8000000
> > +#define MNT_TUCKED		0x10000000
> >  
> >  struct vfsmount {
> >  	struct dentry *mnt_root;	/* root of the mounted tree */

-- 
Ram Pai

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order
  2017-05-17  5:54                             ` [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order Eric W. Biederman
  2017-05-17  5:55                               ` [REVIEW][PATCH 2/2] mnt: Make propagate_umount less slow for overlapping mount propagation trees Eric W. Biederman
  2017-05-24 20:42                               ` [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order Ram Pai
@ 2017-05-30  6:07                               ` Ram Pai
  2017-05-30 15:07                                 ` Eric W. Biederman
  2 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-05-30  6:07 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

On Wed, May 17, 2017 at 12:54:34AM -0500, Eric W. Biederman wrote:
> 
> While investigating some poor umount performance I realized that in
> the case of overlapping mount trees where some of the mounts are locked
> the code has been failing to unmount all of the mounts it should
> have been unmounting.
> 
> This failure to unmount all of the necessary
> mounts can be reproduced with:
> 
> $ cat locked_mounts_test.sh
> 
> mount -t tmpfs test-base /mnt
> mount --make-shared /mnt
> mkdir -p /mnt/b
> 
> mount -t tmpfs test1 /mnt/b
> mount --make-shared /mnt/b
> mkdir -p /mnt/b/10
> 
> mount -t tmpfs test2 /mnt/b/10
> mount --make-shared /mnt/b/10
> mkdir -p /mnt/b/10/20
> 
> mount --rbind /mnt/b /mnt/b/10/20
> 
> unshare -Urm --propagation unchaged /bin/sh -c 'sleep 5; if [ $(grep test /proc/self/mountinfo | wc -l) -eq 1 ] ; then echo SUCCESS ; else echo FAILURE ; fi'
> sleep 1
> umount -l /mnt/b
> wait %%
> 
> $ unshare -Urm ./locked_mounts_test.sh
> 
> This failure is corrected by removing the prepass that marks mounts
> that may be umounted.
> 
> A first pass is added that umounts mounts if possible and if not sets
> mount mark if they could be unmounted if they weren't locked and adds
> them to a list to umount possibilities.  This first pass reconsiders
> the mounts parent if it is on the list of umount possibilities, ensuring
> that information of umoutability will pass from child to mount parent.
> 
> A second pass then walks through all mounts that are umounted and processes
> their children unmounting them or marking them for reparenting.
> 
> A last pass cleans up the state on the mounts that could not be umounted
> and if applicable reparents them to their first parent that remained
> mounted.
> 
> While a bit longer than the old code this code is much more robust
> as it allows information to flow up from the leaves and down
> from the trunk making the order in which mounts are encountered
> in the umount propgation tree irrelevant.

Eric,

	I tried multiple times to understand the algorithm, but failed
	to understand the reasoning behind each of the steps. Hence
	I can't tell if the algorithm is correct or wrong.

	I know you are trying to optimize the current algorithm,
	but what is the key insight that you are trying to leverage
       	to optimize it? That probably might help me analyze the
	algorithm.
	

	You walk the propagation tree, and for each element in the
	propagation tree you try to unmount its entire mount-tree.
	(not sure if this operation is correct, since I know, I had
	 given an example in the past where this can go wrong).
	And later if you find that the unmount is successful, you try
	to walk up and see if the parent can also be unmounted (don't know
	why this is needed).

Sorry, but if you can help with some key insights, it will help.
RP

> 
> Cc: stable@vger.kernel.org
> Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  fs/mount.h     |   2 +-
>  fs/namespace.c |   2 +-
>  fs/pnode.c     | 144 ++++++++++++++++++++++++++++++++++-----------------------
>  3 files changed, 88 insertions(+), 60 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index ede5a1d5cf99..de45d9e76748 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -58,7 +58,7 @@ struct mount {
>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>  	struct mountpoint *mnt_mp;	/* where is it mounted */
>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
> -	struct list_head mnt_reparent;	/* reparent list entry */
> +	struct list_head mnt_umounting; /* list entry for umount propagation */
>  #ifdef CONFIG_FSNOTIFY
>  	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
>  	__u32 mnt_fsnotify_mask;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 51e49866e1fe..5e3dcbeb1de5 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -236,7 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
>  		INIT_LIST_HEAD(&mnt->mnt_slave);
>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
> -		INIT_LIST_HEAD(&mnt->mnt_reparent);
> +		INIT_LIST_HEAD(&mnt->mnt_umounting);
>  		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
>  	}
>  	return mnt;
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 52aca0a118ff..fbaca7df2eb0 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -413,86 +413,95 @@ void propagate_mount_unlock(struct mount *mnt)
>  	}
>  }
> 
> -/*
> - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
> - */
> -static void mark_umount_candidates(struct mount *mnt)
> +static void umount_one(struct mount *mnt, struct list_head *to_umount)
>  {
> -	struct mount *parent = mnt->mnt_parent;
> -	struct mount *m;
> -
> -	BUG_ON(parent == mnt);
> -
> -	for (m = propagation_next(parent, parent); m;
> -			m = propagation_next(m, parent)) {
> -		struct mount *child = __lookup_mnt(&m->mnt,
> -						mnt->mnt_mountpoint);
> -		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
> -			continue;
> -		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
> -			SET_MNT_MARK(child);
> -		}
> -	}
> +	CLEAR_MNT_MARK(mnt);
> +	mnt->mnt.mnt_flags |= MNT_UMOUNT;
> +	list_del_init(&mnt->mnt_child);
> +	list_del_init(&mnt->mnt_umounting);
> +	list_move_tail(&mnt->mnt_list, to_umount);
>  }
> 
>  /*
>   * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
>   * parent propagates to.
>   */
> -static void __propagate_umount(struct mount *mnt, struct list_head *to_reparent)
> +static bool __propagate_umount(struct mount *mnt,
> +			       struct list_head *to_umount,
> +			       struct list_head *to_restore)
>  {
> -	struct mount *parent = mnt->mnt_parent;
> -	struct mount *m;
> +	bool progress = false;
> +	struct mount *child;
> 
> -	BUG_ON(parent == mnt);
> +	/*
> +	 * The state of the parent won't change if this mount is
> +	 * already unmounted or marked as without children.
> +	 */
> +	if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
> +		goto out;
> 
> -	for (m = propagation_next(parent, parent); m;
> -			m = propagation_next(m, parent)) {
> -		struct mount *topper;
> -		struct mount *child = __lookup_mnt(&m->mnt,
> -						mnt->mnt_mountpoint);
> -		/*
> -		 * umount the child only if the child has no children
> -		 * and the child is marked safe to unmount.
> -		 */
> -		if (!child || !IS_MNT_MARKED(child))
> +	/* Verify topper is the only grandchild that has not been
> +	 * speculatively unmounted.
> +	 */
> +	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
> +		if (child->mnt_mountpoint == mnt->mnt.mnt_root)
>  			continue;
> -		CLEAR_MNT_MARK(child);
> +		if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
> +			continue;
> +		/* Found a mounted child */
> +		goto children;
> +	}
> 
> -		/* If there is exactly one mount covering all of child
> -		 * replace child with that mount.
> -		 */
> -		topper = find_topper(child);
> -		if (topper)
> -			list_add_tail(&topper->mnt_reparent, to_reparent);
> +	/* Mark mounts that can be unmounted if not locked */
> +	SET_MNT_MARK(mnt);
> +	progress = true;
> 
> -		if (topper || list_empty(&child->mnt_mounts)) {
> -			list_del_init(&child->mnt_child);
> -			list_del_init(&child->mnt_reparent);
> -			child->mnt.mnt_flags |= MNT_UMOUNT;
> -			list_move_tail(&child->mnt_list, &mnt->mnt_list);
> +	/* If a mount is without children and not locked umount it. */
> +	if (!IS_MNT_LOCKED(mnt)) {
> +		umount_one(mnt, to_umount);
> +	} else {
> +children:
> +		list_move_tail(&mnt->mnt_umounting, to_restore);
> +	}
> +out:
> +	return progress;
> +}
> +
> +static void umount_list(struct list_head *to_umount,
> +			struct list_head *to_restore)
> +{
> +	struct mount *mnt, *child, *tmp;
> +	list_for_each_entry(mnt, to_umount, mnt_list) {
> +		list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
> +			/* topper? */
> +			if (child->mnt_mountpoint == mnt->mnt.mnt_root)
> +				list_move_tail(&child->mnt_umounting, to_restore);
> +			else
> +				umount_one(child, to_umount);
>  		}
>  	}
>  }
> 
> -static void reparent_mounts(struct list_head *to_reparent)
> +static void restore_mounts(struct list_head *to_restore)
>  {
> -	while (!list_empty(to_reparent)) {
> +	/* Restore mounts to a clean working state */
> +	while (!list_empty(to_restore)) {
>  		struct mount *mnt, *parent;
>  		struct mountpoint *mp;
> 
> -		mnt = list_first_entry(to_reparent, struct mount, mnt_reparent);
> -		list_del_init(&mnt->mnt_reparent);
> +		mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
> +		CLEAR_MNT_MARK(mnt);
> +		list_del_init(&mnt->mnt_umounting);
> 
> -		/* Where should this mount be reparented to? */
> +		/* Should this mount be reparented? */
>  		mp = mnt->mnt_mp;
>  		parent = mnt->mnt_parent;
>  		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
>  			mp = parent->mnt_mp;
>  			parent = parent->mnt_parent;
>  		}
> -
> -		mnt_change_mountpoint(parent, mp, mnt);
> +		if (parent != mnt->mnt_parent)
> +			mnt_change_mountpoint(parent, mp, mnt);
>  	}
>  }
> 
> @@ -506,15 +515,34 @@ static void reparent_mounts(struct list_head *to_reparent)
>  int propagate_umount(struct list_head *list)
>  {
>  	struct mount *mnt;
> -	LIST_HEAD(to_reparent);
> +	LIST_HEAD(to_restore);
> +	LIST_HEAD(to_umount);
> 
> -	list_for_each_entry_reverse(mnt, list, mnt_list)
> -		mark_umount_candidates(mnt);
> +	list_for_each_entry(mnt, list, mnt_list) {
> +		struct mount *parent = mnt->mnt_parent;
> +		struct mount *m;
> 
> -	list_for_each_entry(mnt, list, mnt_list)
> -		__propagate_umount(mnt, &to_reparent);
> +		for (m = propagation_next(parent, parent); m;
> +		     m = propagation_next(m, parent)) {
> +			struct mount *child = __lookup_mnt(&m->mnt,
> +							   mnt->mnt_mountpoint);
> +			if (!child)
> +				continue;
> +
> +			/* Check the child and parents while progress is made */
> +			while (__propagate_umount(child,
> +						  &to_umount, &to_restore)) {
> +				/* Is the parent a umount candidate? */
> +				child = child->mnt_parent;
> +				if (list_empty(&child->mnt_umounting))
> +					break;
> +			}
> +		}
> +	}
> 
> -	reparent_mounts(&to_reparent);
> +	umount_list(&to_umount, &to_restore);
> +	restore_mounts(&to_restore);
> +	list_splice_tail(&to_umount, list);
> 
>  	return 0;
>  }
> -- 
> 2.10.1

-- 
Ram Pai

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order
  2017-05-30  6:07                               ` Ram Pai
@ 2017-05-30 15:07                                 ` Eric W. Biederman
  2017-06-07  9:54                                   ` Ram Pai
  0 siblings, 1 reply; 63+ messages in thread
From: Eric W. Biederman @ 2017-05-30 15:07 UTC (permalink / raw)
  To: Ram Pai; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

Ram Pai <linuxram@us.ibm.com> writes:

> On Wed, May 17, 2017 at 12:54:34AM -0500, Eric W. Biederman wrote:
>> 
>> While investigating some poor umount performance I realized that in
>> the case of overlapping mount trees where some of the mounts are locked
>> the code has been failing to unmount all of the mounts it should
>> have been unmounting.
>> 
>> This failure to unmount all of the necessary
>> mounts can be reproduced with:
>> 
>> $ cat locked_mounts_test.sh
>> 
>> mount -t tmpfs test-base /mnt
>> mount --make-shared /mnt
>> mkdir -p /mnt/b
>> 
>> mount -t tmpfs test1 /mnt/b
>> mount --make-shared /mnt/b
>> mkdir -p /mnt/b/10
>> 
>> mount -t tmpfs test2 /mnt/b/10
>> mount --make-shared /mnt/b/10
>> mkdir -p /mnt/b/10/20
>> 
>> mount --rbind /mnt/b /mnt/b/10/20
>> 
>> unshare -Urm --propagation unchaged /bin/sh -c 'sleep 5; if [ $(grep test /proc/self/mountinfo | wc -l) -eq 1 ] ; then echo SUCCESS ; else echo FAILURE ; fi'
>> sleep 1
>> umount -l /mnt/b
>> wait %%
>> 
>> $ unshare -Urm ./locked_mounts_test.sh
>> 
>> This failure is corrected by removing the prepass that marks mounts
>> that may be umounted.
>> 
>> A first pass is added that umounts mounts if possible and if not sets
>> mount mark if they could be unmounted if they weren't locked and adds
>> them to a list to umount possibilities.  This first pass reconsiders
>> the mounts parent if it is on the list of umount possibilities, ensuring
>> that information of umoutability will pass from child to mount parent.
>> 
>> A second pass then walks through all mounts that are umounted and processes
>> their children unmounting them or marking them for reparenting.
>> 
>> A last pass cleans up the state on the mounts that could not be umounted
>> and if applicable reparents them to their first parent that remained
>> mounted.
>> 
>> While a bit longer than the old code this code is much more robust
>> as it allows information to flow up from the leaves and down
>> from the trunk making the order in which mounts are encountered
>> in the umount propgation tree irrelevant.
>
> Eric,
>
> 	I tried multiple time to understand the algorithm, but failed
> 	to understand the reasoning behind each of the steps. Hence
> 	I can't tell if the algorithm is correct or wrong.
>
> 	I know you are trying to optimize the current algorithm,
> 	but what is the key insight that you are trying to leverage
>        	to optimize it? That probably might help me analyze the
> 	algorithm.
> 	
>
> 	You walk the propogation tree, and for each element in the
> 	propagation-tree you try to unmount its entire mount-tree.
> 	(not sure if this operation is correct, since I know, I had
> 	 given an example in the past where this can go wrong).

I think you are referring to when I tried to propagate the entire mount
tree and was not following the individual propagation trees for
each mount.  That left some mounts mounted which, if we had
followed the individual propagation trees, would have been umounted.

This code does not do anything like that; it simply follows the
individual umount propagation trees.

> 	And later if you find that the unmount is successful, you try
> 	to walk up and see if the parent can also be unmounted(dont know
> 	why this is needed).

> Sorry, but if you can help with some key insights, it will help.

The first insight is that the parent-child relationship that exists
between mounts in the set of mounts removed by MNT_DETACH is not
necessarily the same parent-child relationship between the mounts
they are propagated to.

Which leads to the second insight: we cannot guarantee that, while
the mount propagation tree is being walked during umount, the leaves
are being walked first.

Without tucked mounts and without locked mounts, that information is
enough to say the original mount propagation code for umount was
incorrect, as it assumed that the leaves would be walked first.

I believe the test case I have given above is an example of that.  It
has locked mounts in it as well which made everything doubly ugly.

To handle the case where the code may visit the parent before the child:
if something is mounted on top of a mount we wish to unmount, it is
added to the to_restore list instead of the to_umount list.

If a mount does not have children and it is unmounted, __propagate_umount
returns true.  The code then looks at the parent mount, and if it is on
the to_restore list it tries to unmount the parent again.

That is the leaf to root propagation.



There is also root-to-leaf propagation in umount_list to handle the case of
locked mounts.  Locked mounts come about because a more privileged user
propagated them into your mount namespace as a set, and the unprivileged
user is not allowed to break up the set lest they see something under a
mount they should not see.

When a mount could be unmounted if it were not locked to its parent,
it is marked and placed on the to_restore list.


When examining children umount_one removes mounts from their parents
mnt_mounts list.

Children that fully cover a mount (toppers) are ignored.
Children that if not locked would be unmounted are ignored
   as those children become unmountable if their parent
   is unmountable.

   This allows a tree of mounts that is locked together to
   be unmounted if the root is unmountable.


Does that help?

Eric





> RP
>
>> 
>> Cc: stable@vger.kernel.org
>> Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>>  fs/mount.h     |   2 +-
>>  fs/namespace.c |   2 +-
>>  fs/pnode.c     | 144 ++++++++++++++++++++++++++++++++++-----------------------
>>  3 files changed, 88 insertions(+), 60 deletions(-)
>> 
>> diff --git a/fs/mount.h b/fs/mount.h
>> index ede5a1d5cf99..de45d9e76748 100644
>> --- a/fs/mount.h
>> +++ b/fs/mount.h
>> @@ -58,7 +58,7 @@ struct mount {
>>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>>  	struct mountpoint *mnt_mp;	/* where is it mounted */
>>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
>> -	struct list_head mnt_reparent;	/* reparent list entry */
>> +	struct list_head mnt_umounting; /* list entry for umount propagation */
>>  #ifdef CONFIG_FSNOTIFY
>>  	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
>>  	__u32 mnt_fsnotify_mask;
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 51e49866e1fe..5e3dcbeb1de5 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -236,7 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
>>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
>>  		INIT_LIST_HEAD(&mnt->mnt_slave);
>>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
>> -		INIT_LIST_HEAD(&mnt->mnt_reparent);
>> +		INIT_LIST_HEAD(&mnt->mnt_umounting);
>>  		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
>>  	}
>>  	return mnt;
>> diff --git a/fs/pnode.c b/fs/pnode.c
>> index 52aca0a118ff..fbaca7df2eb0 100644
>> --- a/fs/pnode.c
>> +++ b/fs/pnode.c
>> @@ -413,86 +413,95 @@ void propagate_mount_unlock(struct mount *mnt)
>>  	}
>>  }
>> 
>> -/*
>> - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
>> - */
>> -static void mark_umount_candidates(struct mount *mnt)
>> +static void umount_one(struct mount *mnt, struct list_head *to_umount)
>>  {
>> -	struct mount *parent = mnt->mnt_parent;
>> -	struct mount *m;
>> -
>> -	BUG_ON(parent == mnt);
>> -
>> -	for (m = propagation_next(parent, parent); m;
>> -			m = propagation_next(m, parent)) {
>> -		struct mount *child = __lookup_mnt(&m->mnt,
>> -						mnt->mnt_mountpoint);
>> -		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
>> -			continue;
>> -		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
>> -			SET_MNT_MARK(child);
>> -		}
>> -	}
>> +	CLEAR_MNT_MARK(mnt);
>> +	mnt->mnt.mnt_flags |= MNT_UMOUNT;
>> +	list_del_init(&mnt->mnt_child);
>> +	list_del_init(&mnt->mnt_umounting);
>> +	list_move_tail(&mnt->mnt_list, to_umount);
>>  }
>> 
>>  /*
>>   * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
>>   * parent propagates to.
>>   */
>> -static void __propagate_umount(struct mount *mnt, struct list_head *to_reparent)
>> +static bool __propagate_umount(struct mount *mnt,
>> +			       struct list_head *to_umount,
>> +			       struct list_head *to_restore)
>>  {
>> -	struct mount *parent = mnt->mnt_parent;
>> -	struct mount *m;
>> +	bool progress = false;
>> +	struct mount *child;
>> 
>> -	BUG_ON(parent == mnt);
>> +	/*
>> +	 * The state of the parent won't change if this mount is
>> +	 * already unmounted or marked as without children.
>> +	 */
>> +	if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
>> +		goto out;
>> 
>> -	for (m = propagation_next(parent, parent); m;
>> -			m = propagation_next(m, parent)) {
>> -		struct mount *topper;
>> -		struct mount *child = __lookup_mnt(&m->mnt,
>> -						mnt->mnt_mountpoint);
>> -		/*
>> -		 * umount the child only if the child has no children
>> -		 * and the child is marked safe to unmount.
>> -		 */
>> -		if (!child || !IS_MNT_MARKED(child))
>> +	/* Verify topper is the only grandchild that has not been
>> +	 * speculatively unmounted.
>> +	 */
>> +	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
>> +		if (child->mnt_mountpoint == mnt->mnt.mnt_root)
>>  			continue;
>> -		CLEAR_MNT_MARK(child);
>> +		if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
>> +			continue;
>> +		/* Found a mounted child */
>> +		goto children;
>> +	}
>> 
>> -		/* If there is exactly one mount covering all of child
>> -		 * replace child with that mount.
>> -		 */
>> -		topper = find_topper(child);
>> -		if (topper)
>> -			list_add_tail(&topper->mnt_reparent, to_reparent);
>> +	/* Mark mounts that can be unmounted if not locked */
>> +	SET_MNT_MARK(mnt);
>> +	progress = true;
>> 
>> -		if (topper || list_empty(&child->mnt_mounts)) {
>> -			list_del_init(&child->mnt_child);
>> -			list_del_init(&child->mnt_reparent);
>> -			child->mnt.mnt_flags |= MNT_UMOUNT;
>> -			list_move_tail(&child->mnt_list, &mnt->mnt_list);
>> +	/* If a mount is without children and not locked umount it. */
>> +	if (!IS_MNT_LOCKED(mnt)) {
>> +		umount_one(mnt, to_umount);
>> +	} else {
>> +children:
>> +		list_move_tail(&mnt->mnt_umounting, to_restore);
>> +	}
>> +out:
>> +	return progress;
>> +}
>> +
>> +static void umount_list(struct list_head *to_umount,
>> +			struct list_head *to_restore)
>> +{
>> +	struct mount *mnt, *child, *tmp;
>> +	list_for_each_entry(mnt, to_umount, mnt_list) {
>> +		list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
>> +			/* topper? */
>> +			if (child->mnt_mountpoint == mnt->mnt.mnt_root)
>> +				list_move_tail(&child->mnt_umounting, to_restore);
>> +			else
>> +				umount_one(child, to_umount);
>>  		}
>>  	}
>>  }
>> 
>> -static void reparent_mounts(struct list_head *to_reparent)
>> +static void restore_mounts(struct list_head *to_restore)
>>  {
>> -	while (!list_empty(to_reparent)) {
>> +	/* Restore mounts to a clean working state */
>> +	while (!list_empty(to_restore)) {
>>  		struct mount *mnt, *parent;
>>  		struct mountpoint *mp;
>> 
>> -		mnt = list_first_entry(to_reparent, struct mount, mnt_reparent);
>> -		list_del_init(&mnt->mnt_reparent);
>> +		mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
>> +		CLEAR_MNT_MARK(mnt);
>> +		list_del_init(&mnt->mnt_umounting);
>> 
>> -		/* Where should this mount be reparented to? */
>> +		/* Should this mount be reparented? */
>>  		mp = mnt->mnt_mp;
>>  		parent = mnt->mnt_parent;
>>  		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
>>  			mp = parent->mnt_mp;
>>  			parent = parent->mnt_parent;
>>  		}
>> -
>> -		mnt_change_mountpoint(parent, mp, mnt);
>> +		if (parent != mnt->mnt_parent)
>> +			mnt_change_mountpoint(parent, mp, mnt);
>>  	}
>>  }
>> 
>> @@ -506,15 +515,34 @@ static void reparent_mounts(struct list_head *to_reparent)
>>  int propagate_umount(struct list_head *list)
>>  {
>>  	struct mount *mnt;
>> -	LIST_HEAD(to_reparent);
>> +	LIST_HEAD(to_restore);
>> +	LIST_HEAD(to_umount);
>> 
>> -	list_for_each_entry_reverse(mnt, list, mnt_list)
>> -		mark_umount_candidates(mnt);
>> +	list_for_each_entry(mnt, list, mnt_list) {
>> +		struct mount *parent = mnt->mnt_parent;
>> +		struct mount *m;
>> 
>> -	list_for_each_entry(mnt, list, mnt_list)
>> -		__propagate_umount(mnt, &to_reparent);
>> +		for (m = propagation_next(parent, parent); m;
>> +		     m = propagation_next(m, parent)) {
>> +			struct mount *child = __lookup_mnt(&m->mnt,
>> +							   mnt->mnt_mountpoint);
>> +			if (!child)
>> +				continue;
>> +
>> +			/* Check the child and parents while progress is made */
>> +			while (__propagate_umount(child,
>> +						  &to_umount, &to_restore)) {
>> +				/* Is the parent a umount candidate? */
>> +				child = child->mnt_parent;
>> +				if (list_empty(&child->mnt_umounting))
>> +					break;
>> +			}
>> +		}
>> +	}
>> 
>> -	reparent_mounts(&to_reparent);
>> +	umount_list(&to_umount, &to_restore);
>> +	restore_mounts(&to_restore);
>> +	list_splice_tail(&to_umount, list);
>> 
>>  	return 0;
>>  }
>> -- 
>> 2.10.1

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order
  2017-05-30 15:07                                 ` Eric W. Biederman
@ 2017-06-07  9:54                                   ` Ram Pai
  2017-06-07 13:09                                     ` Eric W. Biederman
  0 siblings, 1 reply; 63+ messages in thread
From: Ram Pai @ 2017-06-07  9:54 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

On Tue, May 30, 2017 at 10:07:49AM -0500, Eric W. Biederman wrote:
> Ram Pai <linuxram@us.ibm.com> writes:
> 
> > On Wed, May 17, 2017 at 12:54:34AM -0500, Eric W. Biederman wrote:
> >> 
> >> While investigating some poor umount performance I realized that in
> >> the case of overlapping mount trees where some of the mounts are locked
> >> the code has been failing to unmount all of the mounts it should
> >> have been unmounting.
> >> 
> >> This failure to unmount all of the necessary
> >> mounts can be reproduced with:
> >> 
> >> $ cat locked_mounts_test.sh
> >> 
> >> mount -t tmpfs test-base /mnt
> >> mount --make-shared /mnt
> >> mkdir -p /mnt/b
> >> 
> >> mount -t tmpfs test1 /mnt/b
> >> mount --make-shared /mnt/b
> >> mkdir -p /mnt/b/10
> >> 
> >> mount -t tmpfs test2 /mnt/b/10
> >> mount --make-shared /mnt/b/10
> >> mkdir -p /mnt/b/10/20
> >> 
> >> mount --rbind /mnt/b /mnt/b/10/20
> >> 
> >> unshare -Urm --propagation unchaged /bin/sh -c 'sleep 5; if [ $(grep test /proc/self/mountinfo | wc -l) -eq 1 ] ; then echo SUCCESS ; else echo FAILURE ; fi'
> >> sleep 1
> >> umount -l /mnt/b
> >> wait %%
> >> 
> >> $ unshare -Urm ./locked_mounts_test.sh
> >> 
> >> This failure is corrected by removing the prepass that marks mounts
> >> that may be umounted.
> >> 
> >> A first pass is added that umounts mounts if possible and if not sets
> >> mount mark if they could be unmounted if they weren't locked and adds
> >> them to a list to umount possibilities.  This first pass reconsiders
> >> the mounts parent if it is on the list of umount possibilities, ensuring
> >> that information of umoutability will pass from child to mount parent.
> >> 
> >> A second pass then walks through all mounts that are umounted and processes
> >> their children unmounting them or marking them for reparenting.
> >> 
> >> A last pass cleans up the state on the mounts that could not be umounted
> >> and if applicable reparents them to their first parent that remained
> >> mounted.
> >> 
> >> While a bit longer than the old code this code is much more robust
> >> as it allows information to flow up from the leaves and down
> >> from the trunk making the order in which mounts are encountered
> >> in the umount propgation tree irrelevant.
> >
> > Eric,
> >
> > 	I tried multiple time to understand the algorithm, but failed
> > 	to understand the reasoning behind each of the steps. Hence
> > 	I can't tell if the algorithm is correct or wrong.
> >
> > 	I know you are trying to optimize the current algorithm,
> > 	but what is the key insight that you are trying to leverage
> >        	to optimize it? That probably might help me analyze the
> > 	algorithm.
> > 	
> >
> > 	You walk the propogation tree, and for each element in the
> > 	propagation-tree you try to unmount its entire mount-tree.
> > 	(not sure if this operation is correct, since I know, I had
> > 	 given an example in the past where this can go wrong).
> 
> I think you are refering to when I tried to propgate the entire tree
> mount and was not following the individual propagation trees for
> each mount.  Which left some mounts mounted that if we had
> followed the individual propgation trees would have been umounted.
> 
> This code does not do anything like that it simply follows the
> individual umount propgation trees.
> 
> > 	And later if you find that the unmount is successful, you try
> > 	to walk up and see if the parent can also be unmounted(dont know
> > 	why this is needed).
> 
> > Sorry, but if you can help with some key insights, it will help.
> 
> The first insight is that the parent child relationship that happens
> between mounts in the set of mounts removed by MNT_DETACH is not
> necessarily the same parent child relationship between the mounts
> the are propagated to.
> 
> Which leads to the second insight that we can not guarantee that during
> umount when the mount propgation tree is being walked we can not
> gaurantee that the leaves are being walked first.
> 

If you agree that "(A) tucked mounts do/should NOT receive propagation
events on their root dentry", then I strongly think -- short of
asserting -- that the propagation tree will always reach the child first
and not the parent.  I will further verify my thoughts before
elevating my strong-thinking to an assertion.

But if you do not agree with (A), then yes, we could reach either the
parent or the child first, and we will badly need your algorithm.

BTW: This is the same disagreement that we had earlier regarding
the semantics of tucked mounts.

Assuming that you may not agree with (A), I looked through your
explanation and the code/algorithm, and the steps make sense.

The one thing that was not clear to me was --  is it ok to umount
child whose ancestor is MNT_LOCKED?  Your algorithm seems
to allow that.



> Without tucked mounts, without locked mounts that information is enough
> to say the original mount propgation code for umount was incorrect as it
> assumed that the leaves would be walked first.
> 
> I believe the test case I have given above is an example of that.  It
> has locked mounts in it as well which made everything doubly ugly.
> 
> To handle the case that the code may visit the parent before the child
> if something is mounted on top of a mount we wish to unmount it is
> added to the to_restore list instead of the to_umount list.
> 
> If a mount does not have children and it is unmounted __propgate_umount
> returns true.  The code then looks at the parent mount and it is on
> the to_restore list it tries to unmount the parent again.
> 
> That is the leaf to root propagation.
> 
> 
> 
> There is also root to leaf propgation in umount_list to handle the case of
> locked mounts.  Locked mounts come about because a more privileged user
> propagated them into your mount namespace as a set, and the unprivileged
> user is not allowed to break up the set lest they see something under a
> mount they should not see.
> 
> When a mount that could be unmounted if it was not locked to it's parent
> it is marked and placed on the to_restore list.
> 
> 
> When examining children umount_one removes mounts from their parents
> mnt_mounts list.
> 
> Children that fully cover a mount (toppers) are ignored.
> Children that if not locked would be unmounted are ignored
>    as those children become unmountable if their parent
>    is unmountable.

I find that children can get unmounted even when one of their ancestors
is locked.  An example case: the child is a leaf and one of its
ancestors is locked, but is not part of any related propagation tree.


> 
>    This allows a tree of mounts that is locked together to
>    be unmounted if the root is unmountable.
> 
> 
> Does that help?

Yes, your explanation helped.

Sorry it took some time to get to this, 
RP

> 
> Eric
> >> 
> >> Cc: stable@vger.kernel.org
> >> Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
> >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> >> ---
> >>  fs/mount.h     |   2 +-
> >>  fs/namespace.c |   2 +-
> >>  fs/pnode.c     | 144 ++++++++++++++++++++++++++++++++++-----------------------
> >>  3 files changed, 88 insertions(+), 60 deletions(-)
> >> 
> >> diff --git a/fs/mount.h b/fs/mount.h
> >> index ede5a1d5cf99..de45d9e76748 100644
> >> --- a/fs/mount.h
> >> +++ b/fs/mount.h
> >> @@ -58,7 +58,7 @@ struct mount {
> >>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
> >>  	struct mountpoint *mnt_mp;	/* where is it mounted */
> >>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
> >> -	struct list_head mnt_reparent;	/* reparent list entry */
> >> +	struct list_head mnt_umounting; /* list entry for umount propagation */
> >>  #ifdef CONFIG_FSNOTIFY
> >>  	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
> >>  	__u32 mnt_fsnotify_mask;
> >> diff --git a/fs/namespace.c b/fs/namespace.c
> >> index 51e49866e1fe..5e3dcbeb1de5 100644
> >> --- a/fs/namespace.c
> >> +++ b/fs/namespace.c
> >> @@ -236,7 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
> >>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
> >>  		INIT_LIST_HEAD(&mnt->mnt_slave);
> >>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
> >> -		INIT_LIST_HEAD(&mnt->mnt_reparent);
> >> +		INIT_LIST_HEAD(&mnt->mnt_umounting);
> >>  		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
> >>  	}
> >>  	return mnt;
> >> diff --git a/fs/pnode.c b/fs/pnode.c
> >> index 52aca0a118ff..fbaca7df2eb0 100644
> >> --- a/fs/pnode.c
> >> +++ b/fs/pnode.c
> >> @@ -413,86 +413,95 @@ void propagate_mount_unlock(struct mount *mnt)
> >>  	}
> >>  }
> >> 
> >> -/*
> >> - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
> >> - */
> >> -static void mark_umount_candidates(struct mount *mnt)
> >> +static void umount_one(struct mount *mnt, struct list_head *to_umount)
> >>  {
> >> -	struct mount *parent = mnt->mnt_parent;
> >> -	struct mount *m;
> >> -
> >> -	BUG_ON(parent == mnt);
> >> -
> >> -	for (m = propagation_next(parent, parent); m;
> >> -			m = propagation_next(m, parent)) {
> >> -		struct mount *child = __lookup_mnt(&m->mnt,
> >> -						mnt->mnt_mountpoint);
> >> -		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
> >> -			continue;
> >> -		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
> >> -			SET_MNT_MARK(child);
> >> -		}
> >> -	}
> >> +	CLEAR_MNT_MARK(mnt);
> >> +	mnt->mnt.mnt_flags |= MNT_UMOUNT;
> >> +	list_del_init(&mnt->mnt_child);
> >> +	list_del_init(&mnt->mnt_umounting);
> >> +	list_move_tail(&mnt->mnt_list, to_umount);
> >>  }
> >> 
> >>  /*
> >>   * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
> >>   * parent propagates to.
> >>   */
> >> -static void __propagate_umount(struct mount *mnt, struct list_head *to_reparent)
> >> +static bool __propagate_umount(struct mount *mnt,
> >> +			       struct list_head *to_umount,
> >> +			       struct list_head *to_restore)
> >>  {
> >> -	struct mount *parent = mnt->mnt_parent;
> >> -	struct mount *m;
> >> +	bool progress = false;
> >> +	struct mount *child;
> >> 
> >> -	BUG_ON(parent == mnt);
> >> +	/*
> >> +	 * The state of the parent won't change if this mount is
> >> +	 * already unmounted or marked as without children.
> >> +	 */
> >> +	if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
> >> +		goto out;
> >> 
> >> -	for (m = propagation_next(parent, parent); m;
> >> -			m = propagation_next(m, parent)) {
> >> -		struct mount *topper;
> >> -		struct mount *child = __lookup_mnt(&m->mnt,
> >> -						mnt->mnt_mountpoint);
> >> -		/*
> >> -		 * umount the child only if the child has no children
> >> -		 * and the child is marked safe to unmount.
> >> -		 */
> >> -		if (!child || !IS_MNT_MARKED(child))
> >> +	/* Verify topper is the only grandchild that has not been
> >> +	 * speculatively unmounted.
> >> +	 */
> >> +	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
> >> +		if (child->mnt_mountpoint == mnt->mnt.mnt_root)
> >>  			continue;
> >> -		CLEAR_MNT_MARK(child);
> >> +		if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
> >> +			continue;
> >> +		/* Found a mounted child */
> >> +		goto children;
> >> +	}
> >> 
> >> -		/* If there is exactly one mount covering all of child
> >> -		 * replace child with that mount.
> >> -		 */
> >> -		topper = find_topper(child);
> >> -		if (topper)
> >> -			list_add_tail(&topper->mnt_reparent, to_reparent);
> >> +	/* Mark mounts that can be unmounted if not locked */
> >> +	SET_MNT_MARK(mnt);
> >> +	progress = true;
> >> 
> >> -		if (topper || list_empty(&child->mnt_mounts)) {
> >> -			list_del_init(&child->mnt_child);
> >> -			list_del_init(&child->mnt_reparent);
> >> -			child->mnt.mnt_flags |= MNT_UMOUNT;
> >> -			list_move_tail(&child->mnt_list, &mnt->mnt_list);
> >> +	/* If a mount is without children and not locked umount it. */
> >> +	if (!IS_MNT_LOCKED(mnt)) {
> >> +		umount_one(mnt, to_umount);
> >> +	} else {
> >> +children:
> >> +		list_move_tail(&mnt->mnt_umounting, to_restore);
> >> +	}
> >> +out:
> >> +	return progress;
> >> +}
> >> +
> >> +static void umount_list(struct list_head *to_umount,
> >> +			struct list_head *to_restore)
> >> +{
> >> +	struct mount *mnt, *child, *tmp;
> >> +	list_for_each_entry(mnt, to_umount, mnt_list) {
> >> +		list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
> >> +			/* topper? */
> >> +			if (child->mnt_mountpoint == mnt->mnt.mnt_root)
> >> +				list_move_tail(&child->mnt_umounting, to_restore);
> >> +			else
> >> +				umount_one(child, to_umount);
> >>  		}
> >>  	}
> >>  }
> >> 
> >> -static void reparent_mounts(struct list_head *to_reparent)
> >> +static void restore_mounts(struct list_head *to_restore)
> >>  {
> >> -	while (!list_empty(to_reparent)) {
> >> +	/* Restore mounts to a clean working state */
> >> +	while (!list_empty(to_restore)) {
> >>  		struct mount *mnt, *parent;
> >>  		struct mountpoint *mp;
> >> 
> >> -		mnt = list_first_entry(to_reparent, struct mount, mnt_reparent);
> >> -		list_del_init(&mnt->mnt_reparent);
> >> +		mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
> >> +		CLEAR_MNT_MARK(mnt);
> >> +		list_del_init(&mnt->mnt_umounting);
> >> 
> >> -		/* Where should this mount be reparented to? */
> >> +		/* Should this mount be reparented? */
> >>  		mp = mnt->mnt_mp;
> >>  		parent = mnt->mnt_parent;
> >>  		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
> >>  			mp = parent->mnt_mp;
> >>  			parent = parent->mnt_parent;
> >>  		}
> >> -
> >> -		mnt_change_mountpoint(parent, mp, mnt);
> >> +		if (parent != mnt->mnt_parent)
> >> +			mnt_change_mountpoint(parent, mp, mnt);
> >>  	}
> >>  }
> >> 
> >> @@ -506,15 +515,34 @@ static void reparent_mounts(struct list_head *to_reparent)
> >>  int propagate_umount(struct list_head *list)
> >>  {
> >>  	struct mount *mnt;
> >> -	LIST_HEAD(to_reparent);
> >> +	LIST_HEAD(to_restore);
> >> +	LIST_HEAD(to_umount);
> >> 
> >> -	list_for_each_entry_reverse(mnt, list, mnt_list)
> >> -		mark_umount_candidates(mnt);
> >> +	list_for_each_entry(mnt, list, mnt_list) {
> >> +		struct mount *parent = mnt->mnt_parent;
> >> +		struct mount *m;
> >> 
> >> -	list_for_each_entry(mnt, list, mnt_list)
> >> -		__propagate_umount(mnt, &to_reparent);
> >> +		for (m = propagation_next(parent, parent); m;
> >> +		     m = propagation_next(m, parent)) {
> >> +			struct mount *child = __lookup_mnt(&m->mnt,
> >> +							   mnt->mnt_mountpoint);
> >> +			if (!child)
> >> +				continue;
> >> +
> >> +			/* Check the child and parents while progress is made */
> >> +			while (__propagate_umount(child,
> >> +						  &to_umount, &to_restore)) {
> >> +				/* Is the parent a umount candidate? */
> >> +				child = child->mnt_parent;
> >> +				if (list_empty(&child->mnt_umounting))
> >> +					break;
> >> +			}
> >> +		}
> >> +	}
> >> 
> >> -	reparent_mounts(&to_reparent);
> >> +	umount_list(&to_umount, &to_restore);
> >> +	restore_mounts(&to_restore);
> >> +	list_splice_tail(&to_umount, list);
> >> 
> >>  	return 0;
> >>  }
> >> -- 
> >> 2.10.1

-- 
Ram Pai

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order
  2017-06-07  9:54                                   ` Ram Pai
@ 2017-06-07 13:09                                     ` Eric W. Biederman
  0 siblings, 0 replies; 63+ messages in thread
From: Eric W. Biederman @ 2017-06-07 13:09 UTC (permalink / raw)
  To: Ram Pai; +Cc: Andrei Vagin, Al Viro, linux-fsdevel

Ram Pai <linuxram@us.ibm.com> writes:

> On Tue, May 30, 2017 at 10:07:49AM -0500, Eric W. Biederman wrote:
>> Ram Pai <linuxram@us.ibm.com> writes:
>> 
>> > On Wed, May 17, 2017 at 12:54:34AM -0500, Eric W. Biederman wrote:
>> >> 
>> >> While investigating some poor umount performance I realized that in
>> >> the case of overlapping mount trees where some of the mounts are locked
>> >> the code has been failing to unmount all of the mounts it should
>> >> have been unmounting.
>> >> 
>> >> This failure to unmount all of the necessary
>> >> mounts can be reproduced with:
>> >> 
>> >> $ cat locked_mounts_test.sh
>> >> 
>> >> mount -t tmpfs test-base /mnt
>> >> mount --make-shared /mnt
>> >> mkdir -p /mnt/b
>> >> 
>> >> mount -t tmpfs test1 /mnt/b
>> >> mount --make-shared /mnt/b
>> >> mkdir -p /mnt/b/10
>> >> 
>> >> mount -t tmpfs test2 /mnt/b/10
>> >> mount --make-shared /mnt/b/10
>> >> mkdir -p /mnt/b/10/20
>> >> 
>> >> mount --rbind /mnt/b /mnt/b/10/20
>> >> 
>> >> unshare -Urm --propagation unchanged /bin/sh -c 'sleep 5; if [ $(grep test /proc/self/mountinfo | wc -l) -eq 1 ] ; then echo SUCCESS ; else echo FAILURE ; fi'
>> >> sleep 1
>> >> umount -l /mnt/b
>> >> wait %%
>> >> 
>> >> $ unshare -Urm ./locked_mounts_test.sh
>> >> 
>> >> This failure is corrected by removing the prepass that marks mounts
>> >> that may be umounted.
>> >> 
>> >> A first pass is added that umounts mounts if possible and, if not, sets
>> >> the mount mark if they could be unmounted were they not locked, and adds
>> >> them to a list of umount possibilities.  This first pass reconsiders a
>> >> mount's parent if the parent is on the list of umount possibilities, ensuring
>> >> that information about unmountability will pass from child to mount parent.
>> >> 
>> >> A second pass then walks through all mounts that are umounted and processes
>> >> their children unmounting them or marking them for reparenting.
>> >> 
>> >> A last pass cleans up the state on the mounts that could not be umounted
>> >> and if applicable reparents them to their first parent that remained
>> >> mounted.
>> >> 
>> >> While a bit longer than the old code, this code is much more robust
>> >> as it allows information to flow up from the leaves and down
>> >> from the trunk, making the order in which mounts are encountered
>> >> in the umount propagation tree irrelevant.
>> >
>> > Eric,
>> >
>> > 	I tried multiple time to understand the algorithm, but failed
>> > 	to understand the reasoning behind each of the steps. Hence
>> > 	I can't tell if the algorithm is correct or wrong.
>> >
>> > 	I know you are trying to optimize the current algorithm,
>> > 	but what is the key insight that you are trying to leverage
>> >        	to optimize it? That probably might help me analyze the
>> > 	algorithm.
>> > 	
>> >
>> > 	You walk the propogation tree, and for each element in the
>> > 	propagation-tree you try to unmount its entire mount-tree.
>> > 	(not sure if this operation is correct, since I know, I had
>> > 	 given an example in the past where this can go wrong).
>> 
>> I think you are referring to when I tried to propagate the entire tree
>> mount and was not following the individual propagation trees for
>> each mount.  That left some mounts mounted that would have been
>> umounted if we had followed the individual propagation trees.
>> 
>> This code does not do anything like that; it simply follows the
>> individual umount propagation trees.
>> 
>> > 	And later if you find that the unmount is successful, you try
>> > 	to walk up and see if the parent can also be unmounted(dont know
>> > 	why this is needed).
>> 
>> > Sorry, but if you can help with some key insights, it will help.
>> 
>> The first insight is that the parent child relationship that happens
>> between mounts in the set of mounts removed by MNT_DETACH is not
>> necessarily the same parent child relationship between the mounts
>> the are propagated to.
>> 
>> Which leads to the second insight: we can not guarantee that, while
>> the mount propagation tree is being walked during umount, the
>> leaves are being walked first.
>> 
>
> If you agree that " (A) tucked mounts do/should NOT receive propagation
> events on its root dentry ", then I strongly think, short of asserting,
> that the propagation tree will always reach the child first
> and not the parent. I will further verify my thoughts before
> elevating my strong-thinking to an assertion.

What led to the implementation of these mounts as tucked mounts rather
than side/shadow mounts is Al Viro's assertion that these are normal
mounts.  As normal mounts I tend to think they should be treated
normally.

My experience to date suggests that, except for artificially created
pathological cases, the possible ways we can treat tucked mounts
do not appear to matter.

So I do not yet agree to (A).

I don't think we have ever actually implemented (A).  We have
implemented something very similar, but the act of untucking a mount
resulted in what was at the root of a dentry receiving events during
the same umount MNT_DETACH instance.

All that is important for the optimizations in patch 2/2 is that
resolving the underlying mount is a separate and final pass from
dealing with mount propagation.  So your (A) could be compatible with
the code in patch 1/2.

> But if you do not agree with (A) then yes we could reach the
> parent or the child first and we will badly need your algorithm.

Please note my example does not involve any tucked mounts.  I see this
issue with a very ordinary set of mounts and mount propagation.

So I think we have rough agreement that my algorithm in patch 1/2 is
needed.

> BTW: This is the same disagreement that we had earlier regarding
> the semantics of tucked mounts.
>
> Assuming that you may not agree with (A), I looked through your
> explanation and the code/algorithm, and the steps make sense.

> The one thing that was not clear to me was --  is it ok to umount
> child whose ancestor is MNT_LOCKED?  Your algorithm seems
> to allow that.

If you have a tree of mounts, with the root of the tree not locked onto
its mountpoint and the rest of the mounts in the tree locked onto their
mountpoints, then yes, it is ok to unmount the tree (just not the
individual pieces).

While user space has access to the mounts it is necessary to keep the
pieces locked together.

MNT_LOCKED guards against seeing the mountpoint or any of the files or
directories under the mountpoint.

MNT_LOCKED is a mechanism for dealing with privilege differences between
mount namespaces.

>> Without tucked mounts and without locked mounts, that information is enough
>> to say the original mount propagation code for umount was incorrect, as it
>> assumed that the leaves would be walked first.
>> 
>> I believe the test case I have given above is an example of that.  It
>> has locked mounts in it as well which made everything doubly ugly.
>> 
>> To handle the case that the code may visit the parent before the child
>> if something is mounted on top of a mount we wish to unmount it is
>> added to the to_restore list instead of the to_umount list.
>> 
>> If a mount does not have children and it is unmounted, __propagate_umount
>> returns true.  The code then looks at the parent mount and, if it is on
>> the to_restore list, it tries to unmount the parent again.
>> 
>> That is the leaf to root propagation.
>> 
>> 
>> 
>> There is also root to leaf propagation in umount_list to handle the case of
>> locked mounts.  Locked mounts come about because a more privileged user
>> propagated them into your mount namespace as a set, and the unprivileged
>> user is not allowed to break up the set lest they see something under a
>> mount they should not see.
>> 
>> When a mount could be unmounted if it were not locked to its parent,
>> it is marked and placed on the to_restore list.
>> 
>> 
>> When examining children umount_one removes mounts from their parents
>> mnt_mounts list.
>> 
>> Children that fully cover a mount (toppers) are ignored.
>> Children that if not locked would be unmounted are ignored
>>    as those children become unmountable if their parent
>>    is unmountable.
>
> I find that a child can get unmounted even when one of its ancestors is
> locked.  An example case is -- if the child is a leaf and one of its
> ancestors is locked, but is not part of any related propagation tree.

I am not quite certain what you are asserting.  Locking is an operation
to hide the contents of a mountpoint.  So a leaf that is not locked to
its parent is perfectly fine to unmount.

>> 
>>    This allows a tree of mounts that is locked together to
>>    be unmounted if the root is unmountable.
>> 
>> 
>> Does that help?
>
> Yes, your explanation helped,
>
> Sorry it took some time to get to this,

Thank you for making the time to get to this and have the conversation.
These are tough issues to dig through, and we aren't having the easiest
of conversations.

Eric

^ permalink raw reply	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2017-06-07 13:16 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-31  4:10 [PATCH] Fix a race in put_mountpoint Krister Johansen
2016-12-31  6:17 ` Al Viro
2017-01-03  0:51   ` Eric W. Biederman
2017-01-03  1:48     ` Al Viro
2017-01-03  3:17       ` Eric W. Biederman
2017-01-03  4:00         ` Al Viro
2017-01-04  3:52           ` Eric W. Biederman
2017-01-04  3:53             ` [PATCH] mnt: Protect the mountpoint hashtable with mount_lock Eric W. Biederman
2017-01-04 21:04               ` [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts Eric W. Biederman
2017-01-07  5:06                 ` Al Viro
2017-01-11  0:10                   ` Eric W. Biederman
2017-01-11  4:11                     ` Al Viro
2017-01-11 16:03                       ` Eric W. Biederman
2017-01-11 16:18                         ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Eric W. Biederman
2017-01-11 16:19                           ` [REVIEW][PATCH 2/2] mnt: Tuck mounts under others instead of creating shadow/side mounts Eric W. Biederman
2017-01-12  5:45                             ` Al Viro
2017-01-20  7:20                               ` Eric W. Biederman
2017-01-20  7:26                               ` [PATCH v5] " Eric W. Biederman
2017-01-21  3:58                                 ` Ram Pai
2017-01-21  4:15                                   ` Eric W. Biederman
2017-01-23 19:02                                     ` Ram Pai
2017-01-24  0:16                                       ` Eric W. Biederman
2017-02-03 10:54                                         ` Eric W. Biederman
2017-02-03 17:10                                           ` Ram Pai
2017-02-03 18:26                                             ` Eric W. Biederman
2017-02-03 20:28                                               ` Ram Pai
2017-02-03 20:58                                                 ` Eric W. Biederman
2017-02-06  3:25                                                   ` Andrei Vagin
2017-02-06 21:40                                                     ` Ram Pai
2017-02-07  6:35                                                       ` Andrei Vagin
2017-01-12  5:30                           ` [REVIEW][PATCH 1/2] mnt: Fix propagate_mount_busy to notice all cases of busy mounts Al Viro
2017-01-20  7:18                             ` Eric W. Biederman
2017-01-13 20:32                           ` Andrei Vagin
2017-01-18 19:20                             ` Andrei Vagin
2017-01-20 23:18                           ` Ram Pai
2017-01-23  8:15                             ` Eric W. Biederman
2017-01-23 17:04                               ` Ram Pai
2017-01-12  5:03                         ` [REVIEW][PATCH] mnt: Tuck mounts under others instead of creating shadow/side mounts Al Viro
2017-05-14  2:15                 ` Andrei Vagin
2017-05-14  4:05                   ` Eric W. Biederman
2017-05-14  9:26                     ` Eric W. Biederman
2017-05-15 18:27                       ` Andrei Vagin
2017-05-15 19:42                         ` Eric W. Biederman
2017-05-15 20:10                           ` [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass Eric W. Biederman
2017-05-15 23:12                             ` Andrei Vagin
2017-05-16  5:42                             ` [PATCH] test: check a case when a mount is propagated between exiting mounts Andrei Vagin
2017-05-17  5:54                             ` [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order Eric W. Biederman
2017-05-17  5:55                               ` [REVIEW][PATCH 2/2] mnt: Make propagate_umount less slow for overlapping mount propagation trees Eric W. Biederman
2017-05-17 22:48                                 ` Andrei Vagin
2017-05-17 23:26                                   ` Eric W. Biederman
2017-05-18  0:51                                     ` Andrei Vagin
2017-05-24 20:42                               ` [REVIEW][PATCH 1/2] mnt: In propgate_umount handle visiting mounts in any order Ram Pai
2017-05-24 21:54                                 ` Eric W. Biederman
2017-05-24 22:35                                   ` Ram Pai
2017-05-30  6:07                               ` Ram Pai
2017-05-30 15:07                                 ` Eric W. Biederman
2017-06-07  9:54                                   ` Ram Pai
2017-06-07 13:09                                     ` Eric W. Biederman
2017-05-22  8:15                             ` [REVIEW][PATCH] mnt: In umount propagation reparent in a separate pass Ram Pai
2017-05-22 18:33                               ` Eric W. Biederman
2017-05-22 22:34                                 ` Ram Pai
2017-05-23 13:58                                   ` Eric W. Biederman
2017-01-06  7:00               ` [PATCH] mnt: Protect the mountpoint hashtable with mount_lock Krister Johansen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.