All of lore.kernel.org
 help / color / mirror / Atom feed
From: Kirill Tkhai <ktkhai@virtuozzo.com>
To: viro@zeniv.linux.org.uk, adobriyan@gmail.com,
	davem@davemloft.net, ebiederm@xmission.com,
	akpm@linux-foundation.org, christian.brauner@ubuntu.com,
	areber@redhat.com, serge@hallyn.com,
	linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	ktkhai@virtuozzo.com
Subject: [PATCH 09/23] ns: Introduce ns_idr to be able to iterate all allocated namespaces in the system
Date: Thu, 30 Jul 2020 15:00:08 +0300	[thread overview]
Message-ID: <159611040870.535980.13460189038999722608.stgit@localhost.localdomain> (raw)
In-Reply-To: <159611007271.535980.15362304262237658692.stgit@localhost.localdomain>

This patch introduces a new IDR and functions to add, remove and iterate
over registered namespaces in the system. It will be used to list namespaces
in /proc/namespaces/... in the next patches.

The IDR is protected by ns_lock, which is chosen to be a spinlock (not a
mutex) to allow calling ns_idr_unregister() from put_xxx_ns() methods,
which may be called from (say) softirq context. A spinlock allows us to
avoid introducing a kwork on top of put_xxx_ns() just to call mutex_lock().

We introduce a new IDR because there is no appropriate existing structure
to reuse instead. The closest one, proc_inum_ida, does not fit our goals:
it is an IDA, and converting it to an IDR would add significant overhead for
proc entries, which have no use for the IDR functionality (pointers).

Read access to ns_idr is made lockless (see ns_get_next()). This is done
for better parallelism and better performance from the start. One new
requirement for this is that namespace memory must be freed no earlier than
one RCU grace period after ns_idr_unregister(). Some namespace types already
do this (say, net); the rest will be converted to use kfree_rcu()/etc. where
they become linked to the IDR. See the next patches in this series for details.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 fs/nsfs.c                 |   76 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ns_common.h |   10 ++++++
 include/linux/proc_ns.h   |   11 ++++---
 3 files changed, 92 insertions(+), 5 deletions(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 800c1d0eb0d0..ee4be67d3a0b 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -11,10 +11,13 @@
 #include <linux/user_namespace.h>
 #include <linux/nsfs.h>
 #include <linux/uaccess.h>
+#include <linux/idr.h>
 
 #include "internal.h"
 
 static struct vfsmount *nsfs_mnt;
+static DEFINE_SPINLOCK(ns_lock);
+static DEFINE_IDR(ns_idr);
 
 static long ns_ioctl(struct file *filp, unsigned int ioctl,
 			unsigned long arg);
@@ -304,3 +307,76 @@ void __init nsfs_init(void)
 		panic("can't set nsfs up\n");
 	nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
 }
+
+/*
+ * Add a newly created ns to ns_idr. The ns must be fully
+ * initialized since it becomes available for ns_get_next()
+ * right after we exit this function.
+ */
+int ns_idr_register(struct ns_common *ns)
+{
+	int ret, id = ns->inum - PROC_NS_MIN_INO;
+
+	if (WARN_ON(id < 0))
+		return -EINVAL;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irq(&ns_lock);
+	ret = idr_alloc(&ns_idr, ns, id, id + 1, GFP_ATOMIC);
+	spin_unlock_irq(&ns_lock);
+	idr_preload_end();
+
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Remove a dead ns from ns_idr. Note that ns memory must
+ * be freed no earlier than one RCU grace period after
+ * this function, since ns_get_next() uses RCU to iterate the IDR.
+ */
+void ns_idr_unregister(struct ns_common *ns)
+{
+	int id = ns->inum - PROC_NS_MIN_INO;
+	unsigned long flags;
+
+	if (WARN_ON(id < 0))
+		return;
+
+	spin_lock_irqsave(&ns_lock, flags);
+	idr_remove(&ns_idr, id);
+	spin_unlock_irqrestore(&ns_lock, flags);
+}
+
+/*
+ * This returns ns with inum greater than @id or NULL.
+ * @id is updated to refer to the ns inum.
+ */
+struct ns_common *ns_get_next(unsigned int *id)
+{
+	struct ns_common *ns;
+
+	if (*id < PROC_NS_MIN_INO - 1)
+		*id = PROC_NS_MIN_INO - 1;
+
+	*id += 1;
+	*id -= PROC_NS_MIN_INO;
+
+	rcu_read_lock();
+	do {
+		ns = idr_get_next(&ns_idr, id);
+		if (!ns)
+			break;
+		if (!refcount_inc_not_zero(&ns->count)) {
+			ns = NULL;
+			*id += 1;
+		}
+	} while (!ns);
+	rcu_read_unlock();
+
+	if (ns) {
+		*id += PROC_NS_MIN_INO;
+		WARN_ON(*id != ns->inum);
+	}
+
+	return ns;
+}
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 27db02ebdf36..5f460e97151a 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -4,6 +4,12 @@
 
 struct proc_ns_operations;
 
+/*
+ * Common part of all namespaces. Note that we link namespaces
+ * into an IDR, and they are dereferenced via RCU. So, namespace
+ * memory may be freed no earlier than one RCU grace period after
+ * the final .count put. See ns_get_next() for the details.
+ */
 struct ns_common {
 	atomic_long_t stashed;
 	const struct proc_ns_operations *ops;
@@ -11,4 +17,8 @@ struct ns_common {
 	refcount_t count;
 };
 
+extern int ns_idr_register(struct ns_common *ns);
+extern void ns_idr_unregister(struct ns_common *ns);
+extern struct ns_common *ns_get_next(unsigned int *id);
+
 #endif
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 75807ecef880..906e6ebb43e4 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -40,12 +40,13 @@ extern const struct proc_ns_operations timens_for_children_operations;
  */
 enum {
 	PROC_ROOT_INO		= 1,
-	PROC_IPC_INIT_INO	= 0xEFFFFFFFU,
-	PROC_UTS_INIT_INO	= 0xEFFFFFFEU,
-	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
-	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
-	PROC_CGROUP_INIT_INO	= 0xEFFFFFFBU,
 	PROC_TIME_INIT_INO	= 0xEFFFFFFAU,
+	PROC_NS_MIN_INO		= PROC_TIME_INIT_INO,
+	PROC_CGROUP_INIT_INO	= 0xEFFFFFFBU,
+	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
+	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
+	PROC_UTS_INIT_INO	= 0xEFFFFFFEU,
+	PROC_IPC_INIT_INO	= 0xEFFFFFFFU,
 };
 
 #ifdef CONFIG_PROC_FS



  parent reply	other threads:[~2020-07-30 12:00 UTC|newest]

Thread overview: 80+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-30 11:59 [PATCH 00/23] proc: Introduce /proc/namespaces/ directory to expose namespaces lineary Kirill Tkhai
2020-07-30 11:59 ` [PATCH 01/23] ns: Add common refcount into ns_common add use it as counter for net_ns Kirill Tkhai
2020-07-30 13:35   ` Christian Brauner
2020-07-30 14:07     ` Kirill Tkhai
2020-07-30 15:59       ` Christian Brauner
2020-07-30 14:30   ` Christian Brauner
2020-07-30 14:34     ` Kirill Tkhai
2020-07-30 14:39       ` Christian Brauner
2020-07-30 11:59 ` [PATCH 02/23] uts: Use generic ns_common::count Kirill Tkhai
2020-07-30 14:30   ` Christian Brauner
2020-07-30 11:59 ` [PATCH 03/23] ipc: " Kirill Tkhai
2020-07-30 14:32   ` Christian Brauner
2020-07-30 11:59 ` [PATCH 04/23] pid: " Kirill Tkhai
2020-07-30 14:37   ` Christian Brauner
2020-07-30 11:59 ` [PATCH 05/23] user: " Kirill Tkhai
2020-07-30 14:46   ` Christian Brauner
2020-07-30 11:59 ` [PATCH 06/23] mnt: " Kirill Tkhai
2020-07-30 14:49   ` Christian Brauner
2020-07-30 11:59 ` [PATCH 07/23] cgroup: " Kirill Tkhai
2020-07-30 14:50   ` Christian Brauner
2020-07-30 12:00 ` [PATCH 08/23] time: " Kirill Tkhai
2020-07-30 14:52   ` Christian Brauner
2020-07-30 12:00 ` Kirill Tkhai [this message]
2020-07-30 12:23   ` [PATCH 09/23] ns: Introduce ns_idr to be able to iterate all allocated namespaces in the system Matthew Wilcox
2020-07-30 13:32     ` Kirill Tkhai
2020-07-30 13:56       ` Matthew Wilcox
2020-07-30 14:12         ` Kirill Tkhai
2020-07-30 14:15           ` Matthew Wilcox
2020-07-30 14:20             ` Kirill Tkhai
2020-07-30 12:00 ` [PATCH 10/23] fs: Rename fs/proc/namespaces.c into fs/proc/task_namespaces.c Kirill Tkhai
2020-07-30 12:00 ` [PATCH 11/23] fs: Add /proc/namespaces/ directory Kirill Tkhai
2020-07-30 12:18   ` Alexey Dobriyan
2020-07-30 13:22     ` Kirill Tkhai
2020-07-30 13:26   ` Christian Brauner
2020-07-30 14:30     ` Kirill Tkhai
2020-07-30 20:47   ` kernel test robot
2020-07-30 20:47     ` kernel test robot
2020-07-30 22:20   ` kernel test robot
2020-07-30 22:20     ` kernel test robot
2020-08-05  8:17   ` kernel test robot
2020-08-05  8:17     ` kernel test robot
2020-08-05  8:17   ` [RFC PATCH] fs: namespaces_dentry_operations can be static kernel test robot
2020-08-05  8:17     ` kernel test robot
2020-07-30 12:00 ` [PATCH 12/23] user: Free user_ns one RCU grace period after final counter put Kirill Tkhai
2020-07-30 12:00 ` [PATCH 13/23] user: Add user namespaces into ns_idr Kirill Tkhai
2020-07-30 12:00 ` [PATCH 14/23] net: Add net " Kirill Tkhai
2020-07-30 12:00 ` [PATCH 15/23] pid: Eextract child_reaper check from pidns_for_children_get() Kirill Tkhai
2020-07-30 12:00 ` [PATCH 16/23] proc_ns_operations: Add can_get method Kirill Tkhai
2020-07-30 12:00 ` [PATCH 17/23] pid: Add pid namespaces into ns_idr Kirill Tkhai
2020-07-30 12:00 ` [PATCH 18/23] uts: Free uts namespace one RCU grace period after final counter put Kirill Tkhai
2020-07-30 12:01 ` [PATCH 19/23] uts: Add uts namespaces into ns_idr Kirill Tkhai
2020-07-30 12:01 ` [PATCH 20/23] ipc: Add ipc " Kirill Tkhai
2020-07-30 12:01 ` [PATCH 21/23] mnt: Add mount " Kirill Tkhai
2020-07-30 12:01 ` [PATCH 22/23] cgroup: Add cgroup " Kirill Tkhai
2020-07-30 12:01 ` [PATCH 23/23] time: Add time " Kirill Tkhai
2020-07-30 13:08 ` [PATCH 00/23] proc: Introduce /proc/namespaces/ directory to expose namespaces lineary Christian Brauner
2020-07-30 13:38   ` Christian Brauner
2020-07-30 14:34 ` Eric W. Biederman
2020-07-30 14:42   ` Christian Brauner
2020-07-30 15:01   ` Kirill Tkhai
2020-07-30 22:13     ` Eric W. Biederman
2020-07-31  8:48       ` Pavel Tikhomirov
2020-08-03 10:03       ` Kirill Tkhai
2020-08-03 10:51         ` Alexey Dobriyan
2020-08-06  8:05         ` Andrei Vagin
2020-08-07  8:47           ` Kirill Tkhai
2020-08-10 17:34             ` Andrei Vagin
2020-08-11 10:23               ` Kirill Tkhai
2020-08-12 17:53                 ` Andrei Vagin
2020-08-13  8:12                   ` Kirill Tkhai
2020-08-14  1:16                     ` Andrei Vagin
2020-08-14 15:11                       ` Kirill Tkhai
2020-08-14 19:21                         ` Andrei Vagin
2020-08-17 14:05                           ` Kirill Tkhai
2020-08-17 15:48                             ` Eric W. Biederman
2020-08-17 17:47                               ` Christian Brauner
2020-08-17 18:53                                 ` Eric W. Biederman
2020-08-04  5:43     ` Andrei Vagin
2020-08-04 12:11       ` Pavel Tikhomirov
2020-08-04 14:47       ` Kirill Tkhai

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=159611040870.535980.13460189038999722608.stgit@localhost.localdomain \
    --to=ktkhai@virtuozzo.com \
    --cc=adobriyan@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=areber@redhat.com \
    --cc=christian.brauner@ubuntu.com \
    --cc=davem@davemloft.net \
    --cc=ebiederm@xmission.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=serge@hallyn.com \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.