* [PATCH v2] proc/mounts: add cursor
@ 2020-04-09 14:16 Miklos Szeredi
2020-04-09 16:22 ` Aurélien Aptel
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Miklos Szeredi @ 2020-04-09 14:16 UTC (permalink / raw)
To: Al Viro; +Cc: Karel Zak, linux-fsdevel
From: Miklos Szeredi <mszeredi@redhat.com>
If mounts are deleted after a read(2) call on /proc/self/mounts (or its
kin), the subsequent read(2) could miss a mount that comes after the
deleted one in the list. This is because the file position is interpreted
as the number mount entries from the start of the list.
E.g. first read gets entries #0 to #9; the seq file index will be 10. Then
entry #5 is deleted, resulting in #10 becoming #9 and #11 becoming #10,
etc... The next read will continue from entry #10, and #9 is missed.
Solve this by adding a cursor entry for each open instance. Taking the
global namespace_sem for write seems excessive, since we are only dealing
with a per-namespace list. Instead add a per-namespace spinlock and use
that together with namespace_sem taken for read to protect against
concurrent modification of the mount list. This may reduce parallelism of
is_local_mountpoint(), but it's hardly a big contention point. We could
also use RCU freeing of cursors to make traversal not need additional
locks, if that turns out to be necessary.
Reported-by: Karel Zak <kzak@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
Differences from v1:
- removed unnecessary code that wanted to handle lseeks
- fixed double entry at the start of a read
fs/mount.h | 12 +++++--
fs/namespace.c | 76 ++++++++++++++++++++++++++++++++++++++------------
fs/proc_namespace.c | 4 +-
include/linux/mount.h | 4 +-
4 files changed, 74 insertions(+), 22 deletions(-)
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -9,7 +9,13 @@ struct mnt_namespace {
atomic_t count;
struct ns_common ns;
struct mount * root;
+ /*
+ * Traversal and modification of .list is protected by either
+ * - taking namespace_sem for write, OR
+ * - taking namespace_sem for read AND taking .ns_lock.
+ */
struct list_head list;
+ spinlock_t ns_lock;
struct user_namespace *user_ns;
struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */
@@ -133,9 +139,7 @@ struct proc_mounts {
struct mnt_namespace *ns;
struct path root;
int (*show)(struct seq_file *, struct vfsmount *);
- void *cached_mount;
- u64 cached_event;
- loff_t cached_index;
+ struct mount cursor;
};
extern const struct seq_operations mounts_op;
@@ -153,3 +157,5 @@ static inline bool is_anon_ns(struct mnt
{
return ns->seq == 0;
}
+
+extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -648,6 +648,30 @@ struct vfsmount *lookup_mnt(const struct
return m;
}
+static inline void lock_ns_list(struct mnt_namespace *ns)
+{
+ spin_lock(&ns->ns_lock);
+}
+
+static inline void unlock_ns_list(struct mnt_namespace *ns)
+{
+ spin_unlock(&ns->ns_lock);
+}
+
+static inline bool mnt_is_cursor(struct mount *mnt)
+{
+ return mnt->mnt.mnt_flags & MNT_CURSOR;
+}
+
+static struct mount *mnt_skip_cursors(struct mnt_namespace *ns,
+ struct mount *mnt)
+{
+ list_for_each_entry_from(mnt, &ns->list, mnt_list)
+ if (!mnt_is_cursor(mnt))
+ return mnt;
+ return NULL;
+}
+
/*
* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
* current mount namespace.
@@ -673,11 +697,15 @@ bool __is_local_mountpoint(struct dentry
goto out;
down_read(&namespace_sem);
+ lock_ns_list(ns);
list_for_each_entry(mnt, &ns->list, mnt_list) {
+ if (mnt_is_cursor(mnt))
+ continue;
is_covered = (mnt->mnt_mountpoint == dentry);
if (is_covered)
break;
}
+ unlock_ns_list(ns);
up_read(&namespace_sem);
out:
return is_covered;
@@ -1249,31 +1277,30 @@ struct vfsmount *mnt_clone_internal(cons
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_mounts *p = m->private;
+ struct mount *mnt;
down_read(&namespace_sem);
- if (p->cached_event == p->ns->event) {
- void *v = p->cached_mount;
- if (*pos == p->cached_index)
- return v;
- if (*pos == p->cached_index + 1) {
- v = seq_list_next(v, &p->ns->list, &p->cached_index);
- return p->cached_mount = v;
- }
- }
+ lock_ns_list(p->ns);
+ if (!*pos)
+ list_move(&p->cursor.mnt_list, &p->ns->list);
+ mnt = mnt_skip_cursors(p->ns, &p->cursor);
+ unlock_ns_list(p->ns);
- p->cached_event = p->ns->event;
- p->cached_mount = seq_list_start(&p->ns->list, *pos);
- p->cached_index = *pos;
- return p->cached_mount;
+ return mnt;
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct proc_mounts *p = m->private;
+ struct mount *mnt = v;
- p->cached_mount = seq_list_next(v, &p->ns->list, pos);
- p->cached_index = *pos;
- return p->cached_mount;
+ lock_ns_list(p->ns);
+ list_move(&p->cursor.mnt_list, &mnt->mnt_list);
+ mnt = mnt_skip_cursors(p->ns, &p->cursor);
+ unlock_ns_list(p->ns);
+ ++*pos;
+
+ return mnt;
}
static void m_stop(struct seq_file *m, void *v)
@@ -1284,7 +1311,7 @@ static void m_stop(struct seq_file *m, v
static int m_show(struct seq_file *m, void *v)
{
struct proc_mounts *p = m->private;
- struct mount *r = list_entry(v, struct mount, mnt_list);
+ struct mount *r = v;
return p->show(m, &r->mnt);
}
@@ -1294,6 +1321,15 @@ const struct seq_operations mounts_op =
.stop = m_stop,
.show = m_show,
};
+
+void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
+{
+ down_read(&namespace_sem);
+ lock_ns_list(ns);
+ list_del(&cursor->mnt_list);
+ unlock_ns_list(ns);
+ up_read(&namespace_sem);
+}
#endif /* CONFIG_PROC_FS */
/**
@@ -3202,6 +3238,7 @@ static struct mnt_namespace *alloc_mnt_n
atomic_set(&new_ns->count, 1);
INIT_LIST_HEAD(&new_ns->list);
init_waitqueue_head(&new_ns->poll);
+ spin_lock_init(&new_ns->ns_lock);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
return new_ns;
@@ -3842,10 +3879,14 @@ static bool mnt_already_visible(struct m
bool visible = false;
down_read(&namespace_sem);
+ lock_ns_list(ns);
list_for_each_entry(mnt, &ns->list, mnt_list) {
struct mount *child;
int mnt_flags;
+ if (mnt_is_cursor(mnt))
+ continue;
+
if (mnt->mnt.mnt_sb->s_type != sb->s_type)
continue;
@@ -3893,6 +3934,7 @@ static bool mnt_already_visible(struct m
next: ;
}
found:
+ unlock_ns_list(ns);
up_read(&namespace_sem);
return visible;
}
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -279,7 +279,8 @@ static int mounts_open_common(struct ino
p->ns = ns;
p->root = root;
p->show = show;
- p->cached_event = ~0ULL;
+ INIT_LIST_HEAD(&p->cursor.mnt_list);
+ p->cursor.mnt.mnt_flags = MNT_CURSOR;
return 0;
@@ -296,6 +297,7 @@ static int mounts_release(struct inode *
struct seq_file *m = file->private_data;
struct proc_mounts *p = m->private;
path_put(&p->root);
+ mnt_cursor_del(p->ns, &p->cursor);
put_mnt_ns(p->ns);
return seq_release_private(inode, file);
}
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -50,7 +50,8 @@ struct fs_context;
#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
- MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
+ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
+ MNT_CURSOR)
#define MNT_INTERNAL 0x4000
@@ -64,6 +65,7 @@ struct fs_context;
#define MNT_SYNC_UMOUNT 0x2000000
#define MNT_MARKED 0x4000000
#define MNT_UMOUNT 0x8000000
+#define MNT_CURSOR 0x10000000
struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] proc/mounts: add cursor
2020-04-09 14:16 [PATCH v2] proc/mounts: add cursor Miklos Szeredi
@ 2020-04-09 16:22 ` Aurélien Aptel
2020-04-09 16:50 ` Al Viro
2020-04-09 18:45 ` Matthew Wilcox
2 siblings, 0 replies; 7+ messages in thread
From: Aurélien Aptel @ 2020-04-09 16:22 UTC (permalink / raw)
To: Miklos Szeredi, Al Viro; +Cc: Karel Zak, linux-fsdevel
Miklos Szeredi <miklos@szeredi.hu> writes:
>
> +static inline void lock_ns_list(struct mnt_namespace *ns)
> +{
> + spin_lock(&ns->ns_lock);
> +}
> +
> +static inline void unlock_ns_list(struct mnt_namespace *ns)
> +{
> + spin_unlock(&ns->ns_lock);
> +}
> +
You could add __acquires sparse annotations for those functions. See:
https://www.kernel.org/doc/html/v5.6/dev-tools/sparse.html#using-sparse-for-lock-checking
--
Aurélien Aptel / SUSE Labs Samba Team
GPG: 1839 CB5F 9F5B FB9B AA97 8C99 03C8 A49B 521B D5D3
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg, DE
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah HRB 247165 (AG München)
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] proc/mounts: add cursor
2020-04-09 14:16 [PATCH v2] proc/mounts: add cursor Miklos Szeredi
2020-04-09 16:22 ` Aurélien Aptel
@ 2020-04-09 16:50 ` Al Viro
2020-04-09 16:54 ` Al Viro
2020-04-09 18:45 ` Matthew Wilcox
2 siblings, 1 reply; 7+ messages in thread
From: Al Viro @ 2020-04-09 16:50 UTC (permalink / raw)
To: Miklos Szeredi; +Cc: Karel Zak, linux-fsdevel
On Thu, Apr 09, 2020 at 04:16:19PM +0200, Miklos Szeredi wrote:
> Solve this by adding a cursor entry for each open instance. Taking the
> global namespace_sem for write seems excessive, since we are only dealing
> with a per-namespace list. Instead add a per-namespace spinlock and use
> that together with namespace_sem taken for read to protect against
> concurrent modification of the mount list. This may reduce parallelism of
> is_local_mountpoint(), but it's hardly a big contention point. We could
> also use RCU freeing of cursors to make traversal not need additional
> locks, if that turns out to be neceesary.
Umm... That can do more than reduction of parallelism - longer lists take
longer to scan and moving cursors dirties cachelines in a bunch of struct
mount instances. And I'm not convinced that your locking in m_next() is
correct.
What's to stop umount_tree() from removing the next entry from the list
just as your m_next() tries to move the cursor? I don't see any common
locks for those two...
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] proc/mounts: add cursor
2020-04-09 16:50 ` Al Viro
@ 2020-04-09 16:54 ` Al Viro
2020-04-09 18:30 ` Al Viro
0 siblings, 1 reply; 7+ messages in thread
From: Al Viro @ 2020-04-09 16:54 UTC (permalink / raw)
To: Miklos Szeredi; +Cc: Karel Zak, linux-fsdevel
On Thu, Apr 09, 2020 at 05:50:48PM +0100, Al Viro wrote:
> On Thu, Apr 09, 2020 at 04:16:19PM +0200, Miklos Szeredi wrote:
> > Solve this by adding a cursor entry for each open instance. Taking the
> > global namespace_sem for write seems excessive, since we are only dealing
> > with a per-namespace list. Instead add a per-namespace spinlock and use
> > that together with namespace_sem taken for read to protect against
> > concurrent modification of the mount list. This may reduce parallelism of
> > is_local_mountpoint(), but it's hardly a big contention point. We could
> > also use RCU freeing of cursors to make traversal not need additional
> > locks, if that turns out to be neceesary.
>
> Umm... That can do more than reduction of parallelism - longer lists take
> longer to scan and moving cursors dirties cachelines in a bunch of struct
> mount instances. And I'm not convinced that your locking in m_next() is
> correct.
>
> What's to stop umount_tree() from removing the next entry from the list
> just as your m_next() tries to move the cursor? I don't see any common
> locks for those two...
Ah, you still have namespace_sem taken (shared) by m_start(). Nevermind
that one, then... Let me get through mnt_list users and see if I can
catch anything.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] proc/mounts: add cursor
2020-04-09 16:54 ` Al Viro
@ 2020-04-09 18:30 ` Al Viro
2020-04-09 19:36 ` Miklos Szeredi
0 siblings, 1 reply; 7+ messages in thread
From: Al Viro @ 2020-04-09 18:30 UTC (permalink / raw)
To: Miklos Szeredi; +Cc: Karel Zak, linux-fsdevel
On Thu, Apr 09, 2020 at 05:54:46PM +0100, Al Viro wrote:
> On Thu, Apr 09, 2020 at 05:50:48PM +0100, Al Viro wrote:
> > On Thu, Apr 09, 2020 at 04:16:19PM +0200, Miklos Szeredi wrote:
> > > Solve this by adding a cursor entry for each open instance. Taking the
> > > global namespace_sem for write seems excessive, since we are only dealing
> > > with a per-namespace list. Instead add a per-namespace spinlock and use
> > > that together with namespace_sem taken for read to protect against
> > > concurrent modification of the mount list. This may reduce parallelism of
> > > is_local_mountpoint(), but it's hardly a big contention point. We could
> > > also use RCU freeing of cursors to make traversal not need additional
> > > locks, if that turns out to be neceesary.
> >
> > Umm... That can do more than reduction of parallelism - longer lists take
> > longer to scan and moving cursors dirties cachelines in a bunch of struct
> > mount instances. And I'm not convinced that your locking in m_next() is
> > correct.
> >
> > What's to stop umount_tree() from removing the next entry from the list
> > just as your m_next() tries to move the cursor? I don't see any common
> > locks for those two...
>
> Ah, you still have namespace_sem taken (shared) by m_start(). Nevermind
> that one, then... Let me get through mnt_list users and see if I can
> catch anything.
OK... Locking is safe, but it's not obvious. And your changes do make it
scarier. There are several kinds of lists that can be threaded through
->mnt_list and your code depends upon never having those suckers appear
in e.g. anon namespace ->list. It is true (AFAICS), but...
Another fun question is ns->mounts rules - it used to be "the number of
entries in ns->list", now it's "the number of non-cursor entries there".
Incidentally, we might have a problem with that logics wrt count_mount().
Sigh... The damn thing has grown much too convoluted over years ;-/
I'm still not happy with that patch; at the very least it needs a lot more
detailed analysis to go along with it.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] proc/mounts: add cursor
2020-04-09 14:16 [PATCH v2] proc/mounts: add cursor Miklos Szeredi
2020-04-09 16:22 ` Aurélien Aptel
2020-04-09 16:50 ` Al Viro
@ 2020-04-09 18:45 ` Matthew Wilcox
2 siblings, 0 replies; 7+ messages in thread
From: Matthew Wilcox @ 2020-04-09 18:45 UTC (permalink / raw)
To: Miklos Szeredi; +Cc: Al Viro, Karel Zak, linux-fsdevel, Kent Overstreet
On Thu, Apr 09, 2020 at 04:16:19PM +0200, Miklos Szeredi wrote:
> Solve this by adding a cursor entry for each open instance. Taking the
> global namespace_sem for write seems excessive, since we are only dealing
> with a per-namespace list. Instead add a per-namespace spinlock and use
> that together with namespace_sem taken for read to protect against
> concurrent modification of the mount list. This may reduce parallelism of
It occurs to me that this is another place where something like Kent's
SIX locks would fit the bill. To recap, that was a mutex with Shared,
Intent-to-upgrade and eXclusive states. eXclusive and Shared are like
write and read states in our rwsem. The Intent state would allow other
Shared users to start, but no more Intent users. You'd have to upgrade
to eXclusive state to actually modify the list, so you might have to
wait for Shared users to finish the section. Might not work out well
in practice for this user, but thought it was worth mentioning.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] proc/mounts: add cursor
2020-04-09 18:30 ` Al Viro
@ 2020-04-09 19:36 ` Miklos Szeredi
0 siblings, 0 replies; 7+ messages in thread
From: Miklos Szeredi @ 2020-04-09 19:36 UTC (permalink / raw)
To: Al Viro; +Cc: Karel Zak, linux-fsdevel
On Thu, Apr 9, 2020 at 8:30 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Thu, Apr 09, 2020 at 05:54:46PM +0100, Al Viro wrote:
> > On Thu, Apr 09, 2020 at 05:50:48PM +0100, Al Viro wrote:
> > > On Thu, Apr 09, 2020 at 04:16:19PM +0200, Miklos Szeredi wrote:
> > > > Solve this by adding a cursor entry for each open instance. Taking the
> > > > global namespace_sem for write seems excessive, since we are only dealing
> > > > with a per-namespace list. Instead add a per-namespace spinlock and use
> > > > that together with namespace_sem taken for read to protect against
> > > > concurrent modification of the mount list. This may reduce parallelism of
> > > > is_local_mountpoint(), but it's hardly a big contention point. We could
> > > > also use RCU freeing of cursors to make traversal not need additional
> > > > locks, if that turns out to be neceesary.
> > >
> > > Umm... That can do more than reduction of parallelism - longer lists take
> > > longer to scan and moving cursors dirties cachelines in a bunch of struct
> > > mount instances. And I'm not convinced that your locking in m_next() is
> > > correct.
> > >
> > > What's to stop umount_tree() from removing the next entry from the list
> > > just as your m_next() tries to move the cursor? I don't see any common
> > > locks for those two...
> >
> > Ah, you still have namespace_sem taken (shared) by m_start(). Nevermind
> > that one, then... Let me get through mnt_list users and see if I can
> > catch anything.
>
> OK... Locking is safe, but it's not obvious. And your changes do make it
> scarier. There are several kinds of lists that can be threaded through
> ->mnt_list and your code depends upon never having those suckers appear
> in e.g. anon namespace ->list. It is true (AFAICS), but...
See analysis below.
> Another fun question is ns->mounts rules - it used to be "the number of
> entries in ns->list", now it's "the number of non-cursor entries there".
> Incidentally, we might have a problem with that logics wrt count_mount().
Nope, count_mount() iterates through the mount tree, not through mnt_ns->list.
> Sigh... The damn thing has grown much too convoluted over years ;-/
>
> I'm still not happy with that patch; at the very least it needs a lot more
> detailed analysis to go along with it.
Functions touching mnt_list:
In pnode.c:
umount_one:
umount_list:
propagate_umount: both of the above are indirectly called from this.
The only caller is umount_tree(), which has lots of different call
paths, but in each one has namespace_sem taken for write:
do_move_mount
attach_recursive_mnt
umount_tree
do_loopback
graft_tree
attach_recursive_mnt
umount_tree
do_new_mount_fc
do_add_mount
graft_tree
attach_recursive_mnt
umount_tree
finish_automount
do_add_mount
graft_tree
attach_recursive_mnt
umount_tree
do_umount
shrink_submounts
umount_tree
namespace.c:
__is_local_mountpoint: takes namespace_sem for read
commit_tree: has namespace_sem for write (only caller being
attach_recursive_mnt, see above for call paths).
m_start:
m_next:
m_show: all have namespace_sem for read
umount_tree: all callers have namespace_sem for write (see above for call paths)
do_umount: has namespace_sem for write
copy_tree: all members are newly allocated
iterate_mounts: operates on private copy built by collect_mounts()
open_detached_copy: takes namespace_sem for write
copy_mnt_ns: takes namespace_sem for write
mount_subtree: adds onto a newly allocated mnt_namespace
sys_fsmount: ditto
init_mount_tree: ditto
mnt_already_visible: takes namespace_sem for read
Patch adds ns_lock locking to all places that only have namespace_sem
for read. So everyone is still excluded: those taking namespace_sem
for write against everyone else obviously, and those taking
namespace_sem for read because they take ns_lock.
Thanks,
Miklos
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2020-04-09 19:36 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-09 14:16 [PATCH v2] proc/mounts: add cursor Miklos Szeredi
2020-04-09 16:22 ` Aurélien Aptel
2020-04-09 16:50 ` Al Viro
2020-04-09 16:54 ` Al Viro
2020-04-09 18:30 ` Al Viro
2020-04-09 19:36 ` Miklos Szeredi
2020-04-09 18:45 ` Matthew Wilcox
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.