Linux-Fsdevel Archive on lore.kernel.org
 help / color / Atom feed
From: Kirill Tkhai <ktkhai@virtuozzo.com>
To: viro@zeniv.linux.org.uk, adobriyan@gmail.com,
	davem@davemloft.net, ebiederm@xmission.com,
	akpm@linux-foundation.org, christian.brauner@ubuntu.com,
	areber@redhat.com, serge@hallyn.com,
	linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	ktkhai@virtuozzo.com
Subject: [PATCH 09/23] ns: Introduce ns_idr to be able to iterate all allocated namespaces in the system
Date: Thu, 30 Jul 2020 15:00:08 +0300
Message-ID: <159611040870.535980.13460189038999722608.stgit@localhost.localdomain> (raw)
In-Reply-To: <159611007271.535980.15362304262237658692.stgit@localhost.localdomain>

This patch introduces a new IDR and functions to add/remove and iterate
registered namespaces in the system. It will be used to list namespaces
in /proc/namespaces/... in next patches.

The IDR is protected by ns_idr, and it's choosen to be a spinlock (not
mutex) to allow calling ns_idr_unregister() from put_xxx_ns() methods,
which may be called from (say) softirq context. Spinlock allows us
to avoid introduction of kwork on top of put_xxx_ns() to call mutex_lock().

We introduce a new IDR, because there is no appropriate items to reuse
instead of this. The closest proc_inum_ida does not fit our goals:
it is IDA and its convertation to IDR will bring a big overhead by proc
entries, which are not interested in IDR functionality (pointers).

Read access to ns_idr is made lockless (see ns_get_next()). This is made
for better parallelism and better performance from start. One new requirement
to do this is that namespace memory must be freed one RCU grace period
after ns_idr_unregister(). Some namespaces types already does this (say, net),
the rest will be converted to use kfree_rcu()/etc, where they become
linked to the IDR. See next patches in this series for the details.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 fs/nsfs.c                 |   76 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ns_common.h |   10 ++++++
 include/linux/proc_ns.h   |   11 ++++---
 3 files changed, 92 insertions(+), 5 deletions(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 800c1d0eb0d0..ee4be67d3a0b 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -11,10 +11,13 @@
 #include <linux/user_namespace.h>
 #include <linux/nsfs.h>
 #include <linux/uaccess.h>
+#include <linux/idr.h>
 
 #include "internal.h"
 
 static struct vfsmount *nsfs_mnt;
+static DEFINE_SPINLOCK(ns_lock);
+static DEFINE_IDR(ns_idr);
 
 static long ns_ioctl(struct file *filp, unsigned int ioctl,
 			unsigned long arg);
@@ -304,3 +307,76 @@ void __init nsfs_init(void)
 		panic("can't set nsfs up\n");
 	nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
 }
+
+/*
+ * Add a newly created ns to ns_idr. The ns must be fully
+ * initialized since it becomes available for ns_get_next()
+ * right after we exit this function.
+ */
+int ns_idr_register(struct ns_common *ns)
+{
+	int ret, id = ns->inum - PROC_NS_MIN_INO;
+
+	if (WARN_ON(id < 0))
+		return -EINVAL;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irq(&ns_lock);
+	ret = idr_alloc(&ns_idr, ns, id, id + 1, GFP_ATOMIC);
+	spin_unlock_irq(&ns_lock);
+	idr_preload_end();
+
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Remove a dead ns from ns_idr. Note, that ns memory must
+ * be freed not earlier then one RCU grace period after
+ * this function, since ns_get_next() uses RCU to iterate the IDR.
+ */
+void ns_idr_unregister(struct ns_common *ns)
+{
+	int id = ns->inum - PROC_NS_MIN_INO;
+	unsigned long flags;
+
+	if (WARN_ON(id < 0))
+		return;
+
+	spin_lock_irqsave(&ns_lock, flags);
+	idr_remove(&ns_idr, id);
+	spin_unlock_irqrestore(&ns_lock, flags);
+}
+
+/*
+ * This returns ns with inum greater than @id or NULL.
+ * @id is updated to refer the ns inum.
+ */
+struct ns_common *ns_get_next(unsigned int *id)
+{
+	struct ns_common *ns;
+
+	if (*id < PROC_NS_MIN_INO - 1)
+		*id = PROC_NS_MIN_INO - 1;
+
+	*id += 1;
+	*id -= PROC_NS_MIN_INO;
+
+	rcu_read_lock();
+	do {
+		ns = idr_get_next(&ns_idr, id);
+		if (!ns)
+			break;
+		if (!refcount_inc_not_zero(&ns->count)) {
+			ns = NULL;
+			*id += 1;
+		}
+	} while (!ns);
+	rcu_read_unlock();
+
+	if (ns) {
+		*id += PROC_NS_MIN_INO;
+		WARN_ON(*id != ns->inum);
+	}
+
+	return ns;
+}
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 27db02ebdf36..5f460e97151a 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -4,6 +4,12 @@
 
 struct proc_ns_operations;
 
+/*
+ * Common part of all namespaces. Note, that we link namespaces
+ * into IDR, and they are dereferenced via RCU. So, a namespace
+ * memory is allowed to be freed one RCU grace period after final
+ * .count put. See ns_get_next() for the details.
+ */
 struct ns_common {
 	atomic_long_t stashed;
 	const struct proc_ns_operations *ops;
@@ -11,4 +17,8 @@ struct ns_common {
 	refcount_t count;
 };
 
+extern int ns_idr_register(struct ns_common *ns);
+extern void ns_idr_unregister(struct ns_common *ns);
+extern struct ns_common *ns_get_next(unsigned int *id);
+
 #endif
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 75807ecef880..906e6ebb43e4 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -40,12 +40,13 @@ extern const struct proc_ns_operations timens_for_children_operations;
  */
 enum {
 	PROC_ROOT_INO		= 1,
-	PROC_IPC_INIT_INO	= 0xEFFFFFFFU,
-	PROC_UTS_INIT_INO	= 0xEFFFFFFEU,
-	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
-	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
-	PROC_CGROUP_INIT_INO	= 0xEFFFFFFBU,
 	PROC_TIME_INIT_INO	= 0xEFFFFFFAU,
+	PROC_NS_MIN_INO		= PROC_TIME_INIT_INO,
+	PROC_CGROUP_INIT_INO	= 0xEFFFFFFBU,
+	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
+	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
+	PROC_UTS_INIT_INO	= 0xEFFFFFFEU,
+	PROC_IPC_INIT_INO	= 0xEFFFFFFFU,
 };
 
 #ifdef CONFIG_PROC_FS



  parent reply index

Thread overview: 76+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-30 11:59 [PATCH 00/23] proc: Introduce /proc/namespaces/ directory to expose namespaces lineary Kirill Tkhai
2020-07-30 11:59 ` [PATCH 01/23] ns: Add common refcount into ns_common add use it as counter for net_ns Kirill Tkhai
2020-07-30 13:35   ` Christian Brauner
2020-07-30 14:07     ` Kirill Tkhai
2020-07-30 15:59       ` Christian Brauner
2020-07-30 14:30   ` Christian Brauner
2020-07-30 14:34     ` Kirill Tkhai
2020-07-30 14:39       ` Christian Brauner
2020-07-30 11:59 ` [PATCH 02/23] uts: Use generic ns_common::count Kirill Tkhai
2020-07-30 14:30   ` Christian Brauner
2020-07-30 11:59 ` [PATCH 03/23] ipc: " Kirill Tkhai
2020-07-30 14:32   ` Christian Brauner
2020-07-30 11:59 ` [PATCH 04/23] pid: " Kirill Tkhai
2020-07-30 14:37   ` Christian Brauner
2020-07-30 11:59 ` [PATCH 05/23] user: " Kirill Tkhai
2020-07-30 14:46   ` Christian Brauner
2020-07-30 11:59 ` [PATCH 06/23] mnt: " Kirill Tkhai
2020-07-30 14:49   ` Christian Brauner
2020-07-30 11:59 ` [PATCH 07/23] cgroup: " Kirill Tkhai
2020-07-30 14:50   ` Christian Brauner
2020-07-30 12:00 ` [PATCH 08/23] time: " Kirill Tkhai
2020-07-30 14:52   ` Christian Brauner
2020-07-30 12:00 ` Kirill Tkhai [this message]
2020-07-30 12:23   ` [PATCH 09/23] ns: Introduce ns_idr to be able to iterate all allocated namespaces in the system Matthew Wilcox
2020-07-30 13:32     ` Kirill Tkhai
2020-07-30 13:56       ` Matthew Wilcox
2020-07-30 14:12         ` Kirill Tkhai
2020-07-30 14:15           ` Matthew Wilcox
2020-07-30 14:20             ` Kirill Tkhai
2020-07-30 12:00 ` [PATCH 10/23] fs: Rename fs/proc/namespaces.c into fs/proc/task_namespaces.c Kirill Tkhai
2020-07-30 12:00 ` [PATCH 11/23] fs: Add /proc/namespaces/ directory Kirill Tkhai
2020-07-30 12:18   ` Alexey Dobriyan
2020-07-30 13:22     ` Kirill Tkhai
2020-07-30 13:26   ` Christian Brauner
2020-07-30 14:30     ` Kirill Tkhai
2020-07-30 20:47   ` kernel test robot
2020-07-30 22:20   ` kernel test robot
2020-08-05  8:17   ` kernel test robot
2020-08-05  8:17   ` [RFC PATCH] fs: namespaces_dentry_operations can be static kernel test robot
2020-07-30 12:00 ` [PATCH 12/23] user: Free user_ns one RCU grace period after final counter put Kirill Tkhai
2020-07-30 12:00 ` [PATCH 13/23] user: Add user namespaces into ns_idr Kirill Tkhai
2020-07-30 12:00 ` [PATCH 14/23] net: Add net " Kirill Tkhai
2020-07-30 12:00 ` [PATCH 15/23] pid: Eextract child_reaper check from pidns_for_children_get() Kirill Tkhai
2020-07-30 12:00 ` [PATCH 16/23] proc_ns_operations: Add can_get method Kirill Tkhai
2020-07-30 12:00 ` [PATCH 17/23] pid: Add pid namespaces into ns_idr Kirill Tkhai
2020-07-30 12:00 ` [PATCH 18/23] uts: Free uts namespace one RCU grace period after final counter put Kirill Tkhai
2020-07-30 12:01 ` [PATCH 19/23] uts: Add uts namespaces into ns_idr Kirill Tkhai
2020-07-30 12:01 ` [PATCH 20/23] ipc: Add ipc " Kirill Tkhai
2020-07-30 12:01 ` [PATCH 21/23] mnt: Add mount " Kirill Tkhai
2020-07-30 12:01 ` [PATCH 22/23] cgroup: Add cgroup " Kirill Tkhai
2020-07-30 12:01 ` [PATCH 23/23] time: Add time " Kirill Tkhai
2020-07-30 13:08 ` [PATCH 00/23] proc: Introduce /proc/namespaces/ directory to expose namespaces lineary Christian Brauner
2020-07-30 13:38   ` Christian Brauner
2020-07-30 14:34 ` Eric W. Biederman
2020-07-30 14:42   ` Christian Brauner
2020-07-30 15:01   ` Kirill Tkhai
2020-07-30 22:13     ` Eric W. Biederman
2020-07-31  8:48       ` Pavel Tikhomirov
2020-08-03 10:03       ` Kirill Tkhai
2020-08-03 10:51         ` Alexey Dobriyan
2020-08-06  8:05         ` Andrei Vagin
2020-08-07  8:47           ` Kirill Tkhai
2020-08-10 17:34             ` Andrei Vagin
2020-08-11 10:23               ` Kirill Tkhai
2020-08-12 17:53                 ` Andrei Vagin
2020-08-13  8:12                   ` Kirill Tkhai
2020-08-14  1:16                     ` Andrei Vagin
2020-08-14 15:11                       ` Kirill Tkhai
2020-08-14 19:21                         ` Andrei Vagin
2020-08-17 14:05                           ` Kirill Tkhai
2020-08-17 15:48                             ` Eric W. Biederman
2020-08-17 17:47                               ` Christian Brauner
2020-08-17 18:53                                 ` Eric W. Biederman
2020-08-04  5:43     ` Andrei Vagin
2020-08-04 12:11       ` Pavel Tikhomirov
2020-08-04 14:47       ` Kirill Tkhai

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=159611040870.535980.13460189038999722608.stgit@localhost.localdomain \
    --to=ktkhai@virtuozzo.com \
    --cc=adobriyan@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=areber@redhat.com \
    --cc=christian.brauner@ubuntu.com \
    --cc=davem@davemloft.net \
    --cc=ebiederm@xmission.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=serge@hallyn.com \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-Fsdevel Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-fsdevel/0 linux-fsdevel/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-fsdevel linux-fsdevel/ https://lore.kernel.org/linux-fsdevel \
		linux-fsdevel@vger.kernel.org
	public-inbox-index linux-fsdevel

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-fsdevel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git