* [PATCH 2/6] fs/namespace.c: shift put_mountpoint() to callers of unhash_mnt()
2019-07-06 0:22 ` [PATCH 1/6] __detach_mounts(): lookup_mountpoint() can't return ERR_PTR() anymore Al Viro
@ 2019-07-06 0:22 ` Al Viro
2019-07-06 0:22 ` [PATCH 3/6] Teach shrink_dcache_parent() to cope with mixed-filesystem shrink lists Al Viro
` (3 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Al Viro @ 2019-07-06 0:22 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Linus Torvalds, linux-kernel
From: Al Viro <viro@zeniv.linux.org.uk>
make unhash_mnt() return the mountpoint to be dropped, let callers
deal with it.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/namespace.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 746e3fd1f430..b7059a4f07e3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -795,15 +795,17 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
/*
* vfsmount lock must be held for write
*/
-static void unhash_mnt(struct mount *mnt)
+static struct mountpoint *unhash_mnt(struct mount *mnt)
{
+ struct mountpoint *mp;
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
hlist_del_init_rcu(&mnt->mnt_hash);
hlist_del_init(&mnt->mnt_mp_list);
- put_mountpoint(mnt->mnt_mp);
+ mp = mnt->mnt_mp;
mnt->mnt_mp = NULL;
+ return mp;
}
/*
@@ -813,7 +815,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
{
old_path->dentry = mnt->mnt_mountpoint;
old_path->mnt = &mnt->mnt_parent->mnt;
- unhash_mnt(mnt);
+ put_mountpoint(unhash_mnt(mnt));
}
/*
@@ -823,7 +825,7 @@ static void umount_mnt(struct mount *mnt)
{
/* old mountpoint will be dropped when we can do that */
mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
- unhash_mnt(mnt);
+ put_mountpoint(unhash_mnt(mnt));
}
/*
--
2.11.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 3/6] Teach shrink_dcache_parent() to cope with mixed-filesystem shrink lists
2019-07-06 0:22 ` [PATCH 1/6] __detach_mounts(): lookup_mountpoint() can't return ERR_PTR() anymore Al Viro
2019-07-06 0:22 ` [PATCH 2/6] fs/namespace.c: shift put_mountpoint() to callers of unhash_mnt() Al Viro
@ 2019-07-06 0:22 ` Al Viro
2019-07-06 0:22 ` [PATCH 4/6] make struct mountpoint bear the dentry reference to mountpoint, not struct mount Al Viro
` (2 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Al Viro @ 2019-07-06 0:22 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Linus Torvalds, linux-kernel
From: Al Viro <viro@zeniv.linux.org.uk>
Currently, running into a shrink list that contains dentries from different
filesystems can cause several unpleasant things for shrink_dcache_parent()
and for umount(2).
The first problem is that there's a window during shrink_dentry_list() between
__dentry_kill() takes a victim out and dropping reference to its parent. During
that window the parent looks like a genuine busy dentry. shrink_dcache_parent()
(or, worse yet, shrink_dcache_for_umount()) coming at that time will see no
eviction candidates and no indication that it needs to wait for some
shrink_dentry_list() to proceed further.
That applies for any shrink list that might intersect with the subtree we are
trying to shrink; the only reason it does not blow on umount(2) in the mainline
is that we unregister the memory shrinker before hitting shrink_dcache_for_umount().
Another problem happens if something in a mixed-filesystem shrink list gets
be stuck in e.g. iput(), getting umount of unrelated fs to spin waiting for
the stuck shrinker to get around to our dentries.
Solution:
1) have shrink_dentry_list() decrement the parent's refcount and
make sure it's on a shrink list (ours unless it already had been on some
other) before calling __dentry_kill(). That eliminates the window when
shrink_dcache_parent() would've blown past the entire subtree without
noticing anything with zero refcount not on shrink lists.
2) when shrink_dcache_parent() has found no eviction candidates,
but some dentries are still sitting on shrink lists, rather than
repeating the scan in hope that shrinkers have progressed, scan looking
for something on shrink lists with zero refcount. If such a thing is
found, grab rcu_read_lock() and stop the scan, with caller locking
it for eviction, dropping out of RCU and doing __dentry_kill(), with
the same treatment for parent as shrink_dentry_list() would do.
Note that right now mixed-filesystem shrink lists do not occur, so this
is not a mainline bug. Howevere, there's a bunch of uses for such
beasts (e.g. the "try and evict everything we can out of given page"
patches; there are potential uses in mount-related code, considerably
simplifying the life in fs/namespace.c, etc.)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/dcache.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
fs/internal.h | 2 ++
2 files changed, 83 insertions(+), 17 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index c435398f2c81..d8732cf2e302 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -861,6 +861,32 @@ void dput(struct dentry *dentry)
}
EXPORT_SYMBOL(dput);
+static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+__must_hold(&dentry->d_lock)
+{
+ if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+ /* let the owner of the list it's on deal with it */
+ --dentry->d_lockref.count;
+ } else {
+ if (dentry->d_flags & DCACHE_LRU_LIST)
+ d_lru_del(dentry);
+ if (!--dentry->d_lockref.count)
+ d_shrink_add(dentry, list);
+ }
+}
+
+void dput_to_list(struct dentry *dentry, struct list_head *list)
+{
+ rcu_read_lock();
+ if (likely(fast_dput(dentry))) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+ if (!retain_dentry(dentry))
+ __dput_to_list(dentry, list);
+ spin_unlock(&dentry->d_lock);
+}
/* This must be called with d_lock held */
static inline void __dget_dlock(struct dentry *dentry)
@@ -1067,7 +1093,7 @@ static bool shrink_lock_dentry(struct dentry *dentry)
return false;
}
-static void shrink_dentry_list(struct list_head *list)
+void shrink_dentry_list(struct list_head *list)
{
while (!list_empty(list)) {
struct dentry *dentry, *parent;
@@ -1089,18 +1115,9 @@ static void shrink_dentry_list(struct list_head *list)
rcu_read_unlock();
d_shrink_del(dentry);
parent = dentry->d_parent;
+ if (parent != dentry)
+ __dput_to_list(parent, list);
__dentry_kill(dentry);
- if (parent == dentry)
- continue;
- /*
- * We need to prune ancestors too. This is necessary to prevent
- * quadratic behavior of shrink_dcache_parent(), but is also
- * expected to be beneficial in reducing dentry cache
- * fragmentation.
- */
- dentry = parent;
- while (dentry && !lockref_put_or_lock(&dentry->d_lockref))
- dentry = dentry_kill(dentry);
}
}
@@ -1445,8 +1462,11 @@ int d_set_mounted(struct dentry *dentry)
struct select_data {
struct dentry *start;
+ union {
+ long found;
+ struct dentry *victim;
+ };
struct list_head dispose;
- int found;
};
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
@@ -1478,6 +1498,37 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
return ret;
}
+static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
+{
+ struct select_data *data = _data;
+ enum d_walk_ret ret = D_WALK_CONTINUE;
+
+ if (data->start == dentry)
+ goto out;
+
+ if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+ if (!dentry->d_lockref.count) {
+ rcu_read_lock();
+ data->victim = dentry;
+ return D_WALK_QUIT;
+ }
+ } else {
+ if (dentry->d_flags & DCACHE_LRU_LIST)
+ d_lru_del(dentry);
+ if (!dentry->d_lockref.count)
+ d_shrink_add(dentry, &data->dispose);
+ }
+ /*
+ * We can return to the caller if we have found some (this
+ * ensures forward progress). We'll be coming back to find
+ * the rest.
+ */
+ if (!list_empty(&data->dispose))
+ ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
+out:
+ return ret;
+}
+
/**
* shrink_dcache_parent - prune dcache
* @parent: parent of entries to prune
@@ -1487,12 +1538,9 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
void shrink_dcache_parent(struct dentry *parent)
{
for (;;) {
- struct select_data data;
+ struct select_data data = {.start = parent};
INIT_LIST_HEAD(&data.dispose);
- data.start = parent;
- data.found = 0;
-
d_walk(parent, &data, select_collect);
if (!list_empty(&data.dispose)) {
@@ -1503,6 +1551,22 @@ void shrink_dcache_parent(struct dentry *parent)
cond_resched();
if (!data.found)
break;
+ data.victim = NULL;
+ d_walk(parent, &data, select_collect2);
+ if (data.victim) {
+ struct dentry *parent;
+ if (!shrink_lock_dentry(data.victim)) {
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ parent = data.victim->d_parent;
+ if (parent != data.victim)
+ __dput_to_list(parent, &data.dispose);
+ __dentry_kill(data.victim);
+ }
+ }
+ if (!list_empty(&data.dispose))
+ shrink_dentry_list(&data.dispose);
}
}
EXPORT_SYMBOL(shrink_dcache_parent);
diff --git a/fs/internal.h b/fs/internal.h
index a48ef81be37d..dc317abe31b5 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -156,6 +156,8 @@ extern int d_set_mounted(struct dentry *dentry);
extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
extern struct dentry *d_alloc_cursor(struct dentry *);
extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
+extern void dput_to_list(struct dentry *, struct list_head *);
+extern void shrink_dentry_list(struct list_head *);
/*
* read_write.c
--
2.11.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 4/6] make struct mountpoint bear the dentry reference to mountpoint, not struct mount
2019-07-06 0:22 ` [PATCH 1/6] __detach_mounts(): lookup_mountpoint() can't return ERR_PTR() anymore Al Viro
2019-07-06 0:22 ` [PATCH 2/6] fs/namespace.c: shift put_mountpoint() to callers of unhash_mnt() Al Viro
2019-07-06 0:22 ` [PATCH 3/6] Teach shrink_dcache_parent() to cope with mixed-filesystem shrink lists Al Viro
@ 2019-07-06 0:22 ` Al Viro
2019-07-07 21:17 ` Linus Torvalds
2019-07-06 0:22 ` [PATCH 5/6] get rid of detach_mnt() Al Viro
2019-07-06 0:22 ` [PATCH 6/6] switch the remnants of releasing the mountpoint away from fs_pin Al Viro
4 siblings, 1 reply; 10+ messages in thread
From: Al Viro @ 2019-07-06 0:22 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Linus Torvalds, linux-kernel
From: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/mount.h | 1 -
fs/namespace.c | 66 +++++++++++++++++++++++++---------------------------------
2 files changed, 28 insertions(+), 39 deletions(-)
diff --git a/fs/mount.h b/fs/mount.h
index 6250de544760..84aa8cdf4971 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -69,7 +69,6 @@ struct mount {
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
struct fs_pin mnt_umount;
- struct dentry *mnt_ex_mountpoint;
} __randomize_layout;
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
diff --git a/fs/namespace.c b/fs/namespace.c
index b7059a4f07e3..911675de2a70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -69,6 +69,8 @@ static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);
+static HLIST_HEAD(unmounted); /* protected by namespace_sem */
+static LIST_HEAD(ex_mountpoints);
/* /sys/fs */
struct kobject *fs_kobj;
@@ -172,7 +174,6 @@ unsigned int mnt_get_count(struct mount *mnt)
static void drop_mountpoint(struct fs_pin *p)
{
struct mount *m = container_of(p, struct mount, mnt_umount);
- dput(m->mnt_ex_mountpoint);
pin_remove(p);
mntput(&m->mnt);
}
@@ -739,7 +740,7 @@ static struct mountpoint *get_mountpoint(struct dentry *dentry)
/* Add the new mountpoint to the hash table */
read_seqlock_excl(&mount_lock);
- new->m_dentry = dentry;
+ new->m_dentry = dget(dentry);
new->m_count = 1;
hlist_add_head(&new->m_hash, mp_hash(dentry));
INIT_HLIST_HEAD(&new->m_list);
@@ -752,7 +753,7 @@ static struct mountpoint *get_mountpoint(struct dentry *dentry)
return mp;
}
-static void put_mountpoint(struct mountpoint *mp)
+static void put_mountpoint(struct mountpoint *mp, struct list_head *list)
{
if (!--mp->m_count) {
struct dentry *dentry = mp->m_dentry;
@@ -760,6 +761,9 @@ static void put_mountpoint(struct mountpoint *mp)
spin_lock(&dentry->d_lock);
dentry->d_flags &= ~DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
+ if (!list)
+ list = &ex_mountpoints;
+ dput_to_list(dentry, list);
hlist_del(&mp->m_hash);
kfree(mp);
}
@@ -813,19 +817,17 @@ static struct mountpoint *unhash_mnt(struct mount *mnt)
*/
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
- old_path->dentry = mnt->mnt_mountpoint;
+ old_path->dentry = dget(mnt->mnt_mountpoint);
old_path->mnt = &mnt->mnt_parent->mnt;
- put_mountpoint(unhash_mnt(mnt));
+ put_mountpoint(unhash_mnt(mnt), NULL);
}
/*
* vfsmount lock must be held for write
*/
-static void umount_mnt(struct mount *mnt)
+static void umount_mnt(struct mount *mnt, struct list_head *list)
{
- /* old mountpoint will be dropped when we can do that */
- mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
- put_mountpoint(unhash_mnt(mnt));
+ put_mountpoint(unhash_mnt(mnt), list);
}
/*
@@ -837,7 +839,7 @@ void mnt_set_mountpoint(struct mount *mnt,
{
mp->m_count++;
mnt_add_count(mnt, 1); /* essentially, that's mntget */
- child_mnt->mnt_mountpoint = dget(mp->m_dentry);
+ child_mnt->mnt_mountpoint = mp->m_dentry;
child_mnt->mnt_parent = mnt;
child_mnt->mnt_mp = mp;
hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
@@ -864,7 +866,6 @@ static void attach_mnt(struct mount *mnt,
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
struct mountpoint *old_mp = mnt->mnt_mp;
- struct dentry *old_mountpoint = mnt->mnt_mountpoint;
struct mount *old_parent = mnt->mnt_parent;
list_del_init(&mnt->mnt_child);
@@ -873,23 +874,7 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
attach_mnt(mnt, parent, mp);
- put_mountpoint(old_mp);
-
- /*
- * Safely avoid even the suggestion this code might sleep or
- * lock the mount hash by taking advantage of the knowledge that
- * mnt_change_mountpoint will not release the final reference
- * to a mountpoint.
- *
- * During mounting, the mount passed in as the parent mount will
- * continue to use the old mountpoint and during unmounting, the
- * old mountpoint will continue to exist until namespace_unlock,
- * which happens well after mnt_change_mountpoint.
- */
- spin_lock(&old_mountpoint->d_lock);
- old_mountpoint->d_lockref.count--;
- spin_unlock(&old_mountpoint->d_lock);
-
+ put_mountpoint(old_mp, NULL);
mnt_add_count(old_parent, -1);
}
@@ -1142,6 +1127,8 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
static void mntput_no_expire(struct mount *mnt)
{
+ LIST_HEAD(list);
+
rcu_read_lock();
if (likely(READ_ONCE(mnt->mnt_ns))) {
/*
@@ -1182,10 +1169,11 @@ static void mntput_no_expire(struct mount *mnt)
if (unlikely(!list_empty(&mnt->mnt_mounts))) {
struct mount *p, *tmp;
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
- umount_mnt(p);
+ umount_mnt(p, &list);
}
}
unlock_mount_hash();
+ shrink_dentry_list(&list);
if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
struct task_struct *task = current;
@@ -1371,16 +1359,18 @@ int may_umount(struct vfsmount *mnt)
EXPORT_SYMBOL(may_umount);
-static HLIST_HEAD(unmounted); /* protected by namespace_sem */
-
static void namespace_unlock(void)
{
struct hlist_head head;
+ LIST_HEAD(list);
hlist_move_list(&unmounted, &head);
+ list_splice_init(&ex_mountpoints, &list);
up_write(&namespace_sem);
+ shrink_dentry_list(&list);
+
if (likely(hlist_empty(&head)))
return;
@@ -1481,7 +1471,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
/* Don't forget about p */
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
} else {
- umount_mnt(p);
+ umount_mnt(p, NULL);
}
}
change_mnt_propagation(p, MS_PRIVATE);
@@ -1635,11 +1625,11 @@ void __detach_mounts(struct dentry *dentry)
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
- umount_mnt(mnt);
+ umount_mnt(mnt, NULL);
}
else umount_tree(mnt, UMOUNT_CONNECTED);
}
- put_mountpoint(mp);
+ put_mountpoint(mp, NULL);
out_unlock:
unlock_mount_hash();
namespace_unlock();
@@ -2110,7 +2100,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
child->mnt.mnt_flags &= ~MNT_LOCKED;
commit_tree(child);
}
- put_mountpoint(smp);
+ put_mountpoint(smp, NULL);
unlock_mount_hash();
return 0;
@@ -2127,7 +2117,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
ns->pending_mounts = 0;
read_seqlock_excl(&mount_lock);
- put_mountpoint(smp);
+ put_mountpoint(smp, NULL);
read_sequnlock_excl(&mount_lock);
return err;
@@ -2167,7 +2157,7 @@ static void unlock_mount(struct mountpoint *where)
struct dentry *dentry = where->m_dentry;
read_seqlock_excl(&mount_lock);
- put_mountpoint(where);
+ put_mountpoint(where, NULL);
read_sequnlock_excl(&mount_lock);
namespace_unlock();
@@ -3663,7 +3653,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
touch_mnt_namespace(current->nsproxy->mnt_ns);
/* A moved mount should not expire automatically */
list_del_init(&new_mnt->mnt_expire);
- put_mountpoint(root_mp);
+ put_mountpoint(root_mp, NULL);
unlock_mount_hash();
chroot_fs_refs(&root, &new);
error = 0;
--
2.11.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH 4/6] make struct mountpoint bear the dentry reference to mountpoint, not struct mount
2019-07-06 0:22 ` [PATCH 4/6] make struct mountpoint bear the dentry reference to mountpoint, not struct mount Al Viro
@ 2019-07-07 21:17 ` Linus Torvalds
2019-07-07 21:40 ` Al Viro
0 siblings, 1 reply; 10+ messages in thread
From: Linus Torvalds @ 2019-07-07 21:17 UTC (permalink / raw)
To: Al Viro; +Cc: linux-fsdevel, Linux List Kernel Mailing
On Fri, Jul 5, 2019 at 5:22 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> +static HLIST_HEAD(unmounted); /* protected by namespace_sem */
> +static LIST_HEAD(ex_mountpoints);
What protects the ex_mountpoints list?
It looks like it's the mount_lock, but why isn't that documented?
It sure isn't namespace_sem from the comment above.
Linus
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 4/6] make struct mountpoint bear the dentry reference to mountpoint, not struct mount
2019-07-07 21:17 ` Linus Torvalds
@ 2019-07-07 21:40 ` Al Viro
2019-07-07 22:41 ` Linus Torvalds
0 siblings, 1 reply; 10+ messages in thread
From: Al Viro @ 2019-07-07 21:40 UTC (permalink / raw)
To: Linus Torvalds; +Cc: linux-fsdevel, Linux List Kernel Mailing
On Sun, Jul 07, 2019 at 02:17:38PM -0700, Linus Torvalds wrote:
> On Fri, Jul 5, 2019 at 5:22 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > +static HLIST_HEAD(unmounted); /* protected by namespace_sem */
> > +static LIST_HEAD(ex_mountpoints);
>
> What protects the ex_mountpoints list?
>
> It looks like it's the mount_lock, but why isn't that documented?
>
> It sure isn't namespace_sem from the comment above.
It is namespace_sem. Of all put_mountpoint() callers only the one
from mntput_no_expire() (disposing of stuck MNT_LOCKed children)
is not under namespace_sem; all such are told to use ex_mountpoints
for disposal. See
+ if (!list)
+ list = &ex_mountpoints;
+ dput_to_list(dentry, list);
in there. Only one call site gets non-default disposal list -
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
- umount_mnt(p);
+ umount_mnt(p, &list);
}
in mntput_no_expire() passes a local list to umount_mnt() (which passes it
to put_mountpoint()).
And namespace_unlock() empties ex_mountpoints before dropping namespace_sem -
the contents gets moved to a local list, which is fed to shrink_dentry_list()
as soon as we drop namespace_sem.
Protection of the disposal list is up to the callers of put_mountpoint();
for ex_mountpoints it's namespace_sem, for the one in mntput_no_expire()
we don't need any exclusion whatsoever - no other thread can access it.
IOW, the comment re namespace_sem applies to ex_mountpoints as well.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 4/6] make struct mountpoint bear the dentry reference to mountpoint, not struct mount
2019-07-07 21:40 ` Al Viro
@ 2019-07-07 22:41 ` Linus Torvalds
0 siblings, 0 replies; 10+ messages in thread
From: Linus Torvalds @ 2019-07-07 22:41 UTC (permalink / raw)
To: Al Viro; +Cc: linux-fsdevel, Linux List Kernel Mailing
On Sun, Jul 7, 2019 at 2:40 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> It is namespace_sem. Of all put_mountpoint() callers only the one
> from mntput_no_expire() (disposing of stuck MNT_LOCKed children)
> is not under namespace_sem;
>
> list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
> - umount_mnt(p);
> + umount_mnt(p, &list);
> }
> in mntput_no_expire() passes a local list to umount_mnt() (which passes it
> to put_mountpoint()).
Ahh. Ok. This would be better with a comment. Maybe a separate helper
function with that comment and the special case of passing in NULL (or
maybe not pass in NULL at all, but pass in ex_mountpoints?).
Different locking requirements depending on argument values is very
confusing and easily overlooked..
Linus
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 5/6] get rid of detach_mnt()
2019-07-06 0:22 ` [PATCH 1/6] __detach_mounts(): lookup_mountpoint() can't return ERR_PTR() anymore Al Viro
` (2 preceding siblings ...)
2019-07-06 0:22 ` [PATCH 4/6] make struct mountpoint bear the dentry reference to mountpoint, not struct mount Al Viro
@ 2019-07-06 0:22 ` Al Viro
2019-07-06 0:22 ` [PATCH 6/6] switch the remnants of releasing the mountpoint away from fs_pin Al Viro
4 siblings, 0 replies; 10+ messages in thread
From: Al Viro @ 2019-07-06 0:22 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Linus Torvalds, linux-kernel
From: Al Viro <viro@zeniv.linux.org.uk>
Lift getting the original mount (dentry is actually not needed at all)
of the mountpoint into the callers - to do_move_mount() and pivot_root()
level. That simplifies the cleanup in those and allows to get saner
arguments for attach_mnt_recursive().
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/namespace.c | 62 ++++++++++++++++++++++++++--------------------------------
1 file changed, 28 insertions(+), 34 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 911675de2a70..326a9ab591bc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -815,16 +815,6 @@ static struct mountpoint *unhash_mnt(struct mount *mnt)
/*
* vfsmount lock must be held for write
*/
-static void detach_mnt(struct mount *mnt, struct path *old_path)
-{
- old_path->dentry = dget(mnt->mnt_mountpoint);
- old_path->mnt = &mnt->mnt_parent->mnt;
- put_mountpoint(unhash_mnt(mnt), NULL);
-}
-
-/*
- * vfsmount lock must be held for write
- */
static void umount_mnt(struct mount *mnt, struct list_head *list)
{
put_mountpoint(unhash_mnt(mnt), list);
@@ -2037,7 +2027,7 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
static int attach_recursive_mnt(struct mount *source_mnt,
struct mount *dest_mnt,
struct mountpoint *dest_mp,
- struct path *parent_path)
+ bool moving)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
HLIST_HEAD(tree_list);
@@ -2055,7 +2045,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
return PTR_ERR(smp);
/* Is there space to add these mounts to the mount namespace? */
- if (!parent_path) {
+ if (!moving) {
err = count_mounts(ns, source_mnt);
if (err)
goto out;
@@ -2074,8 +2064,8 @@ static int attach_recursive_mnt(struct mount *source_mnt,
} else {
lock_mount_hash();
}
- if (parent_path) {
- detach_mnt(source_mnt, parent_path);
+ if (moving) {
+ unhash_mnt(source_mnt);
attach_mnt(source_mnt, dest_mnt, dest_mp);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
@@ -2173,7 +2163,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
d_is_dir(mnt->mnt.mnt_root))
return -ENOTDIR;
- return attach_recursive_mnt(mnt, p, mp, NULL);
+ return attach_recursive_mnt(mnt, p, mp, false);
}
/*
@@ -2566,11 +2556,11 @@ static bool check_for_nsfs_mounts(struct mount *subtree)
static int do_move_mount(struct path *old_path, struct path *new_path)
{
- struct path parent_path = {.mnt = NULL, .dentry = NULL};
struct mnt_namespace *ns;
struct mount *p;
struct mount *old;
- struct mountpoint *mp;
+ struct mount *parent;
+ struct mountpoint *mp, *old_mp;
int err;
bool attached;
@@ -2580,7 +2570,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
old = real_mount(old_path->mnt);
p = real_mount(new_path->mnt);
+ parent = old->mnt_parent;
attached = mnt_has_parent(old);
+ old_mp = old->mnt_mp;
ns = old->mnt_ns;
err = -EINVAL;
@@ -2608,7 +2600,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
/*
* Don't move a mount residing in a shared parent.
*/
- if (attached && IS_MNT_SHARED(old->mnt_parent))
+ if (attached && IS_MNT_SHARED(parent))
goto out;
/*
* Don't move a mount tree containing unbindable mounts to a destination
@@ -2624,18 +2616,21 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
goto out;
err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
- attached ? &parent_path : NULL);
+ attached);
if (err)
goto out;
/* if the mount is moved, it should no longer be expire
* automatically */
list_del_init(&old->mnt_expire);
+ if (attached)
+ put_mountpoint(old_mp, NULL);
out:
unlock_mount(mp);
if (!err) {
- path_put(&parent_path);
- if (!attached)
+ if (attached)
+ mntput_no_expire(parent);
+ else
free_mnt_ns(ns);
}
return err;
@@ -3578,8 +3573,8 @@ EXPORT_SYMBOL(path_is_under);
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
const char __user *, put_old)
{
- struct path new, old, parent_path, root_parent, root;
- struct mount *new_mnt, *root_mnt, *old_mnt;
+ struct path new, old, root;
+ struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
struct mountpoint *old_mp, *root_mp;
int error;
@@ -3608,9 +3603,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
new_mnt = real_mount(new.mnt);
root_mnt = real_mount(root.mnt);
old_mnt = real_mount(old.mnt);
+ ex_parent = new_mnt->mnt_parent;
+ root_parent = root_mnt->mnt_parent;
if (IS_MNT_SHARED(old_mnt) ||
- IS_MNT_SHARED(new_mnt->mnt_parent) ||
- IS_MNT_SHARED(root_mnt->mnt_parent))
+ IS_MNT_SHARED(ex_parent) ||
+ IS_MNT_SHARED(root_parent))
goto out4;
if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
goto out4;
@@ -3627,7 +3624,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
goto out4; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
goto out4; /* not attached */
- root_mp = root_mnt->mnt_mp;
if (new.mnt->mnt_root != new.dentry)
goto out4; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
@@ -3638,10 +3634,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* make certain new is below the root */
if (!is_path_reachable(new_mnt, new.dentry, &root))
goto out4;
- root_mp->m_count++; /* pin it so it won't go away */
lock_mount_hash();
- detach_mnt(new_mnt, &parent_path);
- detach_mnt(root_mnt, &root_parent);
+ put_mountpoint(unhash_mnt(new_mnt), NULL);
+ root_mp = unhash_mnt(root_mnt);
if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
new_mnt->mnt.mnt_flags |= MNT_LOCKED;
root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
@@ -3649,7 +3644,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* mount old root on put_old */
attach_mnt(root_mnt, old_mnt, old_mp);
/* mount new_root on / */
- attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
+ attach_mnt(new_mnt, root_parent, root_mp);
+ mnt_add_count(root_parent, -1);
touch_mnt_namespace(current->nsproxy->mnt_ns);
/* A moved mount should not expire automatically */
list_del_init(&new_mnt->mnt_expire);
@@ -3659,10 +3655,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
error = 0;
out4:
unlock_mount(old_mp);
- if (!error) {
- path_put(&root_parent);
- path_put(&parent_path);
- }
+ if (!error)
+ mntput_no_expire(ex_parent);
out3:
path_put(&root);
out2:
--
2.11.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 6/6] switch the remnants of releasing the mountpoint away from fs_pin
2019-07-06 0:22 ` [PATCH 1/6] __detach_mounts(): lookup_mountpoint() can't return ERR_PTR() anymore Al Viro
` (3 preceding siblings ...)
2019-07-06 0:22 ` [PATCH 5/6] get rid of detach_mnt() Al Viro
@ 2019-07-06 0:22 ` Al Viro
4 siblings, 0 replies; 10+ messages in thread
From: Al Viro @ 2019-07-06 0:22 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Linus Torvalds, linux-kernel
From: Al Viro <viro@zeniv.linux.org.uk>
We used to need rather convoluted ordering trickery to guarantee
that dput() of ex-mountpoints happens before the final mntput()
of the same. Since we don't need that anymore, there's no point
playing with fs_pin for that.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/fs_pin.c | 10 ++--------
fs/mount.h | 7 +++++--
fs/namespace.c | 37 +++++++++++++++++++------------------
include/linux/fs_pin.h | 1 -
4 files changed, 26 insertions(+), 29 deletions(-)
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index a6497cf8ae53..47ef3c71ce90 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -19,20 +19,14 @@ void pin_remove(struct fs_pin *pin)
spin_unlock_irq(&pin->wait.lock);
}
-void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
+void pin_insert(struct fs_pin *pin, struct vfsmount *m)
{
spin_lock(&pin_lock);
- if (p)
- hlist_add_head(&pin->s_list, p);
+ hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
spin_unlock(&pin_lock);
}
-void pin_insert(struct fs_pin *pin, struct vfsmount *m)
-{
- pin_insert_group(pin, m, &m->mnt_sb->s_pins);
-}
-
void pin_kill(struct fs_pin *p)
{
wait_queue_entry_t wait;
diff --git a/fs/mount.h b/fs/mount.h
index 84aa8cdf4971..711a4093e475 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,7 +58,10 @@ struct mount {
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
struct mountpoint *mnt_mp; /* where is it mounted */
- struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
+ union {
+ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
+ struct hlist_node mnt_umount;
+ };
struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
@@ -68,7 +71,7 @@ struct mount {
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
- struct fs_pin mnt_umount;
+ struct hlist_head mnt_stuck_children;
} __randomize_layout;
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
diff --git a/fs/namespace.c b/fs/namespace.c
index 326a9ab591bc..a5d0eac9749d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -171,13 +171,6 @@ unsigned int mnt_get_count(struct mount *mnt)
#endif
}
-static void drop_mountpoint(struct fs_pin *p)
-{
- struct mount *m = container_of(p, struct mount, mnt_umount);
- pin_remove(p);
- mntput(&m->mnt);
-}
-
static struct mount *alloc_vfsmnt(const char *name)
{
struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -215,7 +208,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
INIT_LIST_HEAD(&mnt->mnt_umounting);
- init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
+ INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
}
return mnt;
@@ -1079,19 +1072,22 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
static void cleanup_mnt(struct mount *mnt)
{
+ struct hlist_node *p;
+ struct mount *m;
/*
- * This probably indicates that somebody messed
- * up a mnt_want/drop_write() pair. If this
- * happens, the filesystem was probably unable
- * to make r/w->r/o transitions.
- */
- /*
+ * The warning here probably indicates that somebody messed
+ * up a mnt_want/drop_write() pair. If this happens, the
+ * filesystem was probably unable to make r/w->r/o transitions.
* The locking used to deal with mnt_count decrement provides barriers,
* so mnt_get_writers() below is safe.
*/
WARN_ON(mnt_get_writers(mnt));
if (unlikely(mnt->mnt_pins.first))
mnt_pin_kill(mnt);
+ hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
+ hlist_del(&m->mnt_umount);
+ mntput(&m->mnt);
+ }
fsnotify_vfsmount_delete(&mnt->mnt);
dput(mnt->mnt.mnt_root);
deactivate_super(mnt->mnt.mnt_sb);
@@ -1160,6 +1156,7 @@ static void mntput_no_expire(struct mount *mnt)
struct mount *p, *tmp;
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
umount_mnt(p, &list);
+ hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
}
}
unlock_mount_hash();
@@ -1352,6 +1349,8 @@ EXPORT_SYMBOL(may_umount);
static void namespace_unlock(void)
{
struct hlist_head head;
+ struct hlist_node *p;
+ struct mount *m;
LIST_HEAD(list);
hlist_move_list(&unmounted, &head);
@@ -1366,7 +1365,10 @@ static void namespace_unlock(void)
synchronize_rcu_expedited();
- group_pin_kill(&head);
+ hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
+ hlist_del(&m->mnt_umount);
+ mntput(&m->mnt);
+ }
}
static inline void namespace_lock(void)
@@ -1453,8 +1455,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
disconnect = disconnect_mount(p, how);
- pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
- disconnect ? &unmounted : NULL);
if (mnt_has_parent(p)) {
mnt_add_count(p->mnt_parent, -1);
if (!disconnect) {
@@ -1462,6 +1462,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
} else {
umount_mnt(p, NULL);
+ hlist_add_head(&p->mnt_umount, &unmounted);
}
}
change_mnt_propagation(p, MS_PRIVATE);
@@ -1614,8 +1615,8 @@ void __detach_mounts(struct dentry *dentry)
while (!hlist_empty(&mp->m_list)) {
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
- hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
umount_mnt(mnt, NULL);
+ hlist_add_head(&mnt->mnt_umount, &unmounted);
}
else umount_tree(mnt, UMOUNT_CONNECTED);
}
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
index 7cab74d66f85..bdd09fd2520c 100644
--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -20,6 +20,5 @@ static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
}
void pin_remove(struct fs_pin *);
-void pin_insert_group(struct fs_pin *, struct vfsmount *, struct hlist_head *);
void pin_insert(struct fs_pin *, struct vfsmount *);
void pin_kill(struct fs_pin *);
--
2.11.0
^ permalink raw reply related [flat|nested] 10+ messages in thread