All of lore.kernel.org
 help / color / mirror / Atom feed
From: Salman Qazi <sqazi@google.com>
To: Al Viro <viro@zeniv.linux.org.uk>,
	Eric Biederman <ebiederm@xmission.com>,
	Eric Dumazet <edumazet@google.com>,
	linux-fsdevel@vger.kernel.org,
	LKML <linux-kernel@vger.kernel.org>
Cc: Salman Qazi <sqazi@google.com>
Subject: [PATCH RESENT] fs, ipc: Use an asynchronous version of kern_unmount in IPC
Date: Mon,  4 Mar 2019 11:48:59 -0800	[thread overview]
Message-ID: <20190304194859.229604-1-sqazi@google.com> (raw)
In-Reply-To: <CAKUOC8Xvda0KPkck3Tunksm8gg2aCi8Zf1EWb7VVa_y9rv6j3g@mail.gmail.com>

Prior to this patch, the kernel can spend a lot of time with
this stack trace:

[<ffffffffbe5491e3>] __wait_rcu_gp+0x93/0xe0
[<ffffffffbe549418>] synchronize_sched+0x48/0x60
[<ffffffffbe7ae5b3>] kern_unmount+0x3a/0x46
[<ffffffffbe847c02>] mq_put_mnt+0x15/0x17
[<ffffffffbe8481af>] put_ipc_ns+0x36/0x8b

This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
This is done by implementing an asynchronous version of kern_unmount.

Since mntput() sleeps, it needs to be deferred to a work queue.

Additionally, the callers of mq_put_mnt appear to be safe having
it behave asynchronously.  In particular, put_ipc_ns calls
mq_clear_sbinfo which renders the inode inaccessible for the purposes of
mqueue_create by making s_fs_info NULL.  This appears
to be the thing that prevents access while free_ipc_ns is taking place.
So, the unmount should be able to proceed lazily.

Tested: Ran the following program:

    int main(void)
    {
            int pid;
            int status;
            int i;

            for (i = 0; i < 1000; i++) {
                    pid = fork();
                    if (!pid) {
                            assert(!unshare(CLONE_NEWUSER|
                                      CLONE_NEWIPC|CLONE_NEWNS));
                            return 0;
                    }

                    assert(waitpid(pid, &status, 0) == pid);
            }
    }

Before:

$ time ./unshare2

real    0m9.784s
user    0m0.428s
sys     0m0.000s

After:

$ time ./unshare2

real    0m0.368s
user    0m0.226s
sys     0m0.122s

Signed-off-by: Salman Qazi <sqazi@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 fs/namespace.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  1 +
 ipc/mqueue.c       |  2 +-
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 678ef175d63a..e60b473c3bbc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3321,6 +3321,47 @@ void kern_unmount(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(kern_unmount);
 
+struct async_unmount_cb {
+	struct vfsmount *mnt;
+	struct work_struct work;
+	struct rcu_head rcu_head;
+};
+
+static void kern_unmount_work(struct work_struct *work)
+{
+	struct async_unmount_cb *cb = container_of(work,
+			struct async_unmount_cb, work);
+
+	mntput(cb->mnt);
+	kfree(cb);
+}
+
+static void kern_unmount_rcu_cb(struct rcu_head *rcu_head)
+{
+	struct async_unmount_cb *cb = container_of(rcu_head,
+			struct async_unmount_cb, rcu_head);
+
+	INIT_WORK(&cb->work, kern_unmount_work);
+	schedule_work(&cb->work);
+
+}
+
+void kern_unmount_async(struct vfsmount *mnt)
+{
+	/* release long term mount so mount point can be released */
+	if (!IS_ERR_OR_NULL(mnt)) {
+		struct async_unmount_cb *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+
+		if (cb) {
+			real_mount(mnt)->mnt_ns = NULL;
+			cb->mnt = mnt;
+			call_rcu(&cb->rcu_head, kern_unmount_rcu_cb);
+		} else {
+			kern_unmount(mnt);
+		}
+	}
+}
+
 bool our_mnt(struct vfsmount *mnt)
 {
 	return check_mnt(real_mount(mnt));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29d8e2cfed0e..8865997a8722 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *);
 extern int unregister_filesystem(struct file_system_type *);
 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
 #define kern_mount(type) kern_mount_data(type, NULL)
+extern void kern_unmount_async(struct vfsmount *mnt);
 extern void kern_unmount(struct vfsmount *mnt);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c595bed7bfcb..a8c2465ac0cb 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns)
 
 void mq_put_mnt(struct ipc_namespace *ns)
 {
-	kern_unmount(ns->mq_mnt);
+	kern_unmount_async(ns->mq_mnt);
 }
 
 static int __init init_mqueue_fs(void)
-- 
2.21.0.352.gf09ad66450-goog


      parent reply	other threads:[~2019-03-04 19:49 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-02-06 19:53 [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC Salman Qazi
2019-02-06 20:13 ` Eric Dumazet
2019-02-07  4:14 ` Al Viro
2019-02-07 18:43   ` Salman Qazi
2019-02-13 21:07     ` Salman Qazi
2019-03-04 19:48     ` Salman Qazi [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190304194859.229604-1-sqazi@google.com \
    --to=sqazi@google.com \
    --cc=ebiederm@xmission.com \
    --cc=edumazet@google.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.