Linux-Fsdevel Archive on lore.kernel.org
 help / Atom feed
* [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC
@ 2019-02-06 19:53 Salman Qazi
  2019-02-06 20:13 ` Eric Dumazet
  2019-02-07  4:14 ` Al Viro
  0 siblings, 2 replies; 5+ messages in thread
From: Salman Qazi @ 2019-02-06 19:53 UTC (permalink / raw)
  To: Alexander Viro, Eric Biederman, Eric Dumazet, linux-fsdevel; +Cc: Salman Qazi

Prior to this patch, the kernel can spend a lot of time with
this stack trace:

[<ffffffffbe5491e3>] __wait_rcu_gp+0x93/0xe0
[<ffffffffbe549418>] synchronize_sched+0x48/0x60
[<ffffffffbe7ae5b3>] kern_unmount+0x3a/0x46
[<ffffffffbe847c02>] mq_put_mnt+0x15/0x17
[<ffffffffbe8481af>] put_ipc_ns+0x36/0x8b

This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
This is done by implementing an asynchronous version of kern_unmount.

Since mntput() sleeps, it needs to be deferred to a work queue.

Additionally, the callers of mq_put_mnt appear to be safe having
it behave asynchronously.  In particular, put_ipc_ns calls
mq_clear_sbinfo which renders the inode inaccessible for the purposes of
mqueue_create by making s_fs_info NULL.  This appears
to be the thing that prevents access while free_ipc_ns is taking place.
So, the unmount should be able to proceed lazily.

Tested: Ran the following program:

    int main(void)
    {
            int pid;
            int status;
            int i;

            for (i = 0; i < 1000; i++) {
                    pid = fork();
                    if (!pid) {
                            assert(!unshare(CLONE_NEWUSER|
                                      CLONE_NEWIPC|CLONE_NEWNS));
                            return 0;
                    }

                    assert(waitpid(pid, &status, 0) == pid);
            }
    }

Before:

$ time ./unshare2

real    0m9.784s
user    0m0.428s
sys     0m0.000s

After:

$ time ./unshare2

real    0m0.368s
user    0m0.226s
sys     0m0.122s

Signed-off-by: Salman Qazi <sqazi@google.com>
---
 fs/namespace.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  1 +
 ipc/mqueue.c       |  2 +-
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index a677b59efd74..caa51ca81605 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3323,6 +3323,47 @@ void kern_unmount(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(kern_unmount);
 
+struct async_unmount_cb {
+	struct vfsmount *mnt;
+	struct work_struct work;
+	struct rcu_head rcu_head;
+};
+
+static void kern_unmount_work(struct work_struct *work)
+{
+	struct async_unmount_cb *cb = container_of(work,
+			struct async_unmount_cb, work);
+
+	mntput(cb->mnt);
+	kfree(cb);
+}
+
+static void kern_unmount_rcu_cb(struct rcu_head *rcu_head)
+{
+	struct async_unmount_cb *cb = container_of(rcu_head,
+			struct async_unmount_cb, rcu_head);
+
+	INIT_WORK(&cb->work, kern_unmount_work);
+	schedule_work(&cb->work);
+
+}
+
+void kern_unmount_async(struct vfsmount *mnt)
+{
+	/* release long term mount so mount point can be released */
+	if (!IS_ERR_OR_NULL(mnt)) {
+		struct async_unmount_cb *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+
+		if (cb) {
+			real_mount(mnt)->mnt_ns = NULL;
+			cb->mnt = mnt;
+			call_rcu(&cb->rcu_head, kern_unmount_rcu_cb);
+		} else {
+			kern_unmount(mnt);
+		}
+	}
+}
+
 bool our_mnt(struct vfsmount *mnt)
 {
 	return check_mnt(real_mount(mnt));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29d8e2cfed0e..8865997a8722 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *);
 extern int unregister_filesystem(struct file_system_type *);
 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
 #define kern_mount(type) kern_mount_data(type, NULL)
+extern void kern_unmount_async(struct vfsmount *mnt);
 extern void kern_unmount(struct vfsmount *mnt);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c595bed7bfcb..a8c2465ac0cb 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns)
 
 void mq_put_mnt(struct ipc_namespace *ns)
 {
-	kern_unmount(ns->mq_mnt);
+	kern_unmount_async(ns->mq_mnt);
 }
 
 static int __init init_mqueue_fs(void)
-- 
2.20.1.611.gfbb209baf1-goog


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC
  2019-02-06 19:53 [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC Salman Qazi
@ 2019-02-06 20:13 ` Eric Dumazet
  2019-02-07  4:14 ` Al Viro
  1 sibling, 0 replies; 5+ messages in thread
From: Eric Dumazet @ 2019-02-06 20:13 UTC (permalink / raw)
  To: Salman Qazi; +Cc: Alexander Viro, Eric Biederman, linux-fsdevel, LKML

On Wed, Feb 6, 2019 at 11:54 AM Salman Qazi <sqazi@google.com> wrote:
>
> Prior to this patch, the kernel can spend a lot of time with
> this stack trace:
>
> [<ffffffffbe5491e3>] __wait_rcu_gp+0x93/0xe0
> [<ffffffffbe549418>] synchronize_sched+0x48/0x60
> [<ffffffffbe7ae5b3>] kern_unmount+0x3a/0x46
> [<ffffffffbe847c02>] mq_put_mnt+0x15/0x17
> [<ffffffffbe8481af>] put_ipc_ns+0x36/0x8b
>
> This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
> This is done by implementing an asynchronous version of kern_unmount.
>
> Since mntput() sleeps, it needs to be deferred to a work queue.
>
> Additionally, the callers of mq_put_mnt appear to be safe having
> it behave asynchronously.  In particular, put_ipc_ns calls
> mq_clear_sbinfo which renders the inode inaccessible for the purposes of
> mqueue_create by making s_fs_info NULL.  This appears
> to be the thing that prevents access while free_ipc_ns is taking place.
> So, the unmount should be able to proceed lazily.
>
> Tested: Ran the following program:
>
>     int main(void)
>     {
>             int pid;
>             int status;
>             int i;
>
>             for (i = 0; i < 1000; i++) {
>                     pid = fork();
>                     if (!pid) {
>                             assert(!unshare(CLONE_NEWUSER|
>                                       CLONE_NEWIPC|CLONE_NEWNS));
>                             return 0;
>                     }
>
>                     assert(waitpid(pid, &status, 0) == pid);
>             }
>     }
>
> Before:
>
> $ time ./unshare2
>
> real    0m9.784s
> user    0m0.428s
> sys     0m0.000s
>
> After:
>
> $ time ./unshare2
>
> real    0m0.368s
> user    0m0.226s
> sys     0m0.122s
>
> Signed-off-by: Salman Qazi <sqazi@google.com>

Reviewed-by: Eric Dumazet <edumazet@google.com>

> ---
>  fs/namespace.c     | 41 +++++++++++++++++++++++++++++++++++++++++
>  include/linux/fs.h |  1 +
>  ipc/mqueue.c       |  2 +-
>  3 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index a677b59efd74..caa51ca81605 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -3323,6 +3323,47 @@ void kern_unmount(struct vfsmount *mnt)
>  }
>  EXPORT_SYMBOL(kern_unmount);
>
> +struct async_unmount_cb {
> +       struct vfsmount *mnt;
> +       struct work_struct work;
> +       struct rcu_head rcu_head;
> +};
> +
> +static void kern_unmount_work(struct work_struct *work)
> +{
> +       struct async_unmount_cb *cb = container_of(work,
> +                       struct async_unmount_cb, work);
> +
> +       mntput(cb->mnt);
> +       kfree(cb);
> +}
> +
> +static void kern_unmount_rcu_cb(struct rcu_head *rcu_head)
> +{
> +       struct async_unmount_cb *cb = container_of(rcu_head,
> +                       struct async_unmount_cb, rcu_head);
> +
> +       INIT_WORK(&cb->work, kern_unmount_work);
> +       schedule_work(&cb->work);
> +
> +}
> +
> +void kern_unmount_async(struct vfsmount *mnt)
> +{
> +       /* release long term mount so mount point can be released */
> +       if (!IS_ERR_OR_NULL(mnt)) {
> +               struct async_unmount_cb *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
> +
> +               if (cb) {
> +                       real_mount(mnt)->mnt_ns = NULL;
> +                       cb->mnt = mnt;
> +                       call_rcu(&cb->rcu_head, kern_unmount_rcu_cb);
> +               } else {
> +                       kern_unmount(mnt);
> +               }
> +       }
> +}
> +
>  bool our_mnt(struct vfsmount *mnt)
>  {
>         return check_mnt(real_mount(mnt));
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 29d8e2cfed0e..8865997a8722 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *);
>  extern int unregister_filesystem(struct file_system_type *);
>  extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
>  #define kern_mount(type) kern_mount_data(type, NULL)
> +extern void kern_unmount_async(struct vfsmount *mnt);
>  extern void kern_unmount(struct vfsmount *mnt);
>  extern int may_umount_tree(struct vfsmount *);
>  extern int may_umount(struct vfsmount *);
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index c595bed7bfcb..a8c2465ac0cb 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns)
>
>  void mq_put_mnt(struct ipc_namespace *ns)
>  {
> -       kern_unmount(ns->mq_mnt);
> +       kern_unmount_async(ns->mq_mnt);
>  }
>
>  static int __init init_mqueue_fs(void)
> --
> 2.20.1.611.gfbb209baf1-goog
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC
  2019-02-06 19:53 [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC Salman Qazi
  2019-02-06 20:13 ` Eric Dumazet
@ 2019-02-07  4:14 ` Al Viro
  2019-02-07 18:43   ` Salman Qazi
  1 sibling, 1 reply; 5+ messages in thread
From: Al Viro @ 2019-02-07  4:14 UTC (permalink / raw)
  To: Salman Qazi; +Cc: Eric Biederman, Eric Dumazet, linux-fsdevel

On Wed, Feb 06, 2019 at 11:53:54AM -0800, Salman Qazi wrote:

> This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
> This is done by implementing an asynchronous version of kern_unmount.
> 
> Since mntput() sleeps, it needs to be deferred to a work queue.
> 
> Additionally, the callers of mq_put_mnt appear to be safe having
> it behave asynchronously.  In particular, put_ipc_ns calls
> mq_clear_sbinfo which renders the inode inaccessible for the purposes of
> mqueue_create by making s_fs_info NULL.  This appears
> to be the thing that prevents access while free_ipc_ns is taking place.
> So, the unmount should be able to proceed lazily.

Ugh...  I really doubt that it's correct.  The caller is
                mq_put_mnt(ns);
                free_ipc_ns(ns);
and we have
static void mqueue_evict_inode(struct inode *inode)
{

...

        ipc_ns = get_ns_from_inode(inode);

with

static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
{
        struct ipc_namespace *ns;

        spin_lock(&mq_lock);
        ns = __get_ns_from_inode(inode);
        spin_unlock(&mq_lock);
        return ns;
}

and

static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
{
        return get_ipc_ns(inode->i_sb->s_fs_info);
}

with ->s_fs_info being the ipc_namespace we are freeing after mq_put_mnt()

Are you saying that get_ipc_ns() after free_ipc_ns() is safe?  Because
->evict_inode() *IS* called on umount.  What happens to your patch if
there was a regular file left on that filesystem?

Smells like a memory corruptor...

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC
  2019-02-07  4:14 ` Al Viro
@ 2019-02-07 18:43   ` Salman Qazi
  2019-02-13 21:07     ` Salman Qazi
  0 siblings, 1 reply; 5+ messages in thread
From: Salman Qazi @ 2019-02-07 18:43 UTC (permalink / raw)
  To: Al Viro; +Cc: Eric Biederman, Eric Dumazet, linux-fsdevel

On Wed, Feb 6, 2019 at 8:14 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Wed, Feb 06, 2019 at 11:53:54AM -0800, Salman Qazi wrote:
>
> > This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
> > This is done by implementing an asynchronous version of kern_unmount.
> >
> > Since mntput() sleeps, it needs to be deferred to a work queue.
> >
> > Additionally, the callers of mq_put_mnt appear to be safe having
> > it behave asynchronously.  In particular, put_ipc_ns calls
> > mq_clear_sbinfo which renders the inode inaccessible for the purposes of
> > mqueue_create by making s_fs_info NULL.  This appears
> > to be the thing that prevents access while free_ipc_ns is taking place.
> > So, the unmount should be able to proceed lazily.
>
> Ugh...  I really doubt that it's correct.  The caller is
>                 mq_put_mnt(ns);
>                 free_ipc_ns(ns);
> and we have
> static void mqueue_evict_inode(struct inode *inode)
> {
>
> ...
>
>         ipc_ns = get_ns_from_inode(inode);
>
> with
>
> static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
> {
>         struct ipc_namespace *ns;
>
>         spin_lock(&mq_lock);
>         ns = __get_ns_from_inode(inode);
>         spin_unlock(&mq_lock);
>         return ns;
> }
>
> and
>
> static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
> {
>         return get_ipc_ns(inode->i_sb->s_fs_info);
> }
>
> with ->s_fs_info being the ipc_namespace we are freeing after mq_put_mnt()
>
> Are you saying that get_ipc_ns() after free_ipc_ns() is safe?  Because
> ->evict_inode() *IS* called on umount.  What happens to your patch if
> there was a regular file left on that filesystem?
>
> Smells like a memory corruptor...

Actually, the full context in the caller is

        if (refcount_dec_and_lock(&ns->count, &mq_lock)) {
                mq_clear_sbinfo(ns);
                spin_unlock(&mq_lock);
                mq_put_mnt(ns);
                free_ipc_ns(ns);
        }

And

void mq_clear_sbinfo(struct ipc_namespace *ns)
{
        ns->mq_mnt->mnt_sb->s_fs_info = NULL;
}

Therefore, s_fs_info should be NULL before we proceed to unmount.  So,
as far as I know, it should not be possible to find the ipc_namespace
from the mount.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC
  2019-02-07 18:43   ` Salman Qazi
@ 2019-02-13 21:07     ` Salman Qazi
  0 siblings, 0 replies; 5+ messages in thread
From: Salman Qazi @ 2019-02-13 21:07 UTC (permalink / raw)
  To: Al Viro
  Cc: Eric Biederman, Eric Dumazet, linux-fsdevel, Linux Kernel Mailing List

Do you have any additional concerns?

On Thu, Feb 7, 2019 at 10:43 AM Salman Qazi <sqazi@google.com> wrote:
>
> On Wed, Feb 6, 2019 at 8:14 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > On Wed, Feb 06, 2019 at 11:53:54AM -0800, Salman Qazi wrote:
> >
> > > This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
> > > This is done by implementing an asynchronous version of kern_unmount.
> > >
> > > Since mntput() sleeps, it needs to be deferred to a work queue.
> > >
> > > Additionally, the callers of mq_put_mnt appear to be safe having
> > > it behave asynchronously.  In particular, put_ipc_ns calls
> > > mq_clear_sbinfo which renders the inode inaccessible for the purposes of
> > > mqueue_create by making s_fs_info NULL.  This appears
> > > to be the thing that prevents access while free_ipc_ns is taking place.
> > > So, the unmount should be able to proceed lazily.
> >
> > Ugh...  I really doubt that it's correct.  The caller is
> >                 mq_put_mnt(ns);
> >                 free_ipc_ns(ns);
> > and we have
> > static void mqueue_evict_inode(struct inode *inode)
> > {
> >
> > ...
> >
> >         ipc_ns = get_ns_from_inode(inode);
> >
> > with
> >
> > static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
> > {
> >         struct ipc_namespace *ns;
> >
> >         spin_lock(&mq_lock);
> >         ns = __get_ns_from_inode(inode);
> >         spin_unlock(&mq_lock);
> >         return ns;
> > }
> >
> > and
> >
> > static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
> > {
> >         return get_ipc_ns(inode->i_sb->s_fs_info);
> > }
> >
> > with ->s_fs_info being the ipc_namespace we are freeing after mq_put_mnt()
> >
> > Are you saying that get_ipc_ns() after free_ipc_ns() is safe?  Because
> > ->evict_inode() *IS* called on umount.  What happens to your patch if
> > there was a regular file left on that filesystem?
> >
> > Smells like a memory corruptor...
>
> Actually, the full context in the caller is
>
>         if (refcount_dec_and_lock(&ns->count, &mq_lock)) {
>                 mq_clear_sbinfo(ns);
>                 spin_unlock(&mq_lock);
>                 mq_put_mnt(ns);
>                 free_ipc_ns(ns);
>         }
>
> And
>
> void mq_clear_sbinfo(struct ipc_namespace *ns)
> {
>         ns->mq_mnt->mnt_sb->s_fs_info = NULL;
> }
>
> Therefore, s_fs_info should be NULL before we proceed to unmount.  So,
> as far as I know, it should not be possible to find the ipc_namespace
> from the mount.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, back to index

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-06 19:53 [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC Salman Qazi
2019-02-06 20:13 ` Eric Dumazet
2019-02-07  4:14 ` Al Viro
2019-02-07 18:43   ` Salman Qazi
2019-02-13 21:07     ` Salman Qazi

Linux-Fsdevel Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-fsdevel/0 linux-fsdevel/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-fsdevel linux-fsdevel/ https://lore.kernel.org/linux-fsdevel \
		linux-fsdevel@vger.kernel.org linux-fsdevel@archiver.kernel.org
	public-inbox-index linux-fsdevel


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-fsdevel


AGPL code for this site: git clone https://public-inbox.org/ public-inbox