containers.lists.linux.dev archive mirror
 help / color / mirror / Atom feed
* [PATCH v4 0/2] ipc: Store mq and ipc sysctls in the ipc namespace
@ 2022-02-14 18:18 Alexey Gladkov
  2022-02-14 18:18 ` [PATCH v4 1/2] ipc: Store mqueue " Alexey Gladkov
                   ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-02-14 18:18 UTC (permalink / raw)
  To: LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

This patchset changes the implementation of mq and ipc sysctls. It moves the
sysctls inside the ipc namespace. This will allow us to manage these sysctls
inside the user namespace.

--

Alexey Gladkov (2):
  ipc: Store mqueue sysctls in the ipc namespace
  ipc: Store ipc sysctls in the ipc namespace

 include/linux/ipc_namespace.h |  37 ++++++-
 ipc/ipc_sysctl.c              | 189 ++++++++++++++++++++++------------
 ipc/mq_sysctl.c               | 121 ++++++++++++----------
 ipc/mqueue.c                  |  10 +-
 ipc/namespace.c               |  10 ++
 5 files changed, 235 insertions(+), 132 deletions(-)

-- 
2.33.0


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH v4 1/2] ipc: Store mqueue sysctls in the ipc namespace
  2022-02-14 18:18 [PATCH v4 0/2] ipc: Store mq and ipc sysctls in the ipc namespace Alexey Gladkov
@ 2022-02-14 18:18 ` Alexey Gladkov
  2022-02-14 18:18 ` [PATCH v4 2/2] ipc: Store ipc " Alexey Gladkov
  2022-03-23 20:24 ` [GIT PULL] ipc: Bind to the ipc namespace at open time Eric W. Biederman
  2 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-02-14 18:18 UTC (permalink / raw)
  To: LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin,
	kernel test robot

Right now, the mqueue sysctls take ipc namespaces into account in a
rather hacky way. This works in most cases, but does not respect the
user namespace.

Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/*
parametres. This poses a problem in the rootless containers.

To solve this I changed the implementation of the mqueue sysctls just
like some other sysctls.

So far, the changes do not provide additional access to files. This will
be done in a future patch.

v3:
* Don't implemenet set_permissions to keep the current behavior.

v2:
* Fixed compilation problem if CONFIG_POSIX_MQUEUE_SYSCTL is not
  specified.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 include/linux/ipc_namespace.h |  16 +++--
 ipc/mq_sysctl.c               | 121 ++++++++++++++++++----------------
 ipc/mqueue.c                  |  10 ++-
 ipc/namespace.c               |   6 ++
 4 files changed, 88 insertions(+), 65 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index b75395ec8d52..fa787d97d60a 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -10,6 +10,7 @@
 #include <linux/ns_common.h>
 #include <linux/refcount.h>
 #include <linux/rhashtable-types.h>
+#include <linux/sysctl.h>
 
 struct user_namespace;
 
@@ -63,6 +64,9 @@ struct ipc_namespace {
 	unsigned int    mq_msg_default;
 	unsigned int    mq_msgsize_default;
 
+	struct ctl_table_set	mq_set;
+	struct ctl_table_header	*mq_sysctls;
+
 	/* user_ns which owns the ipc ns */
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
@@ -169,14 +173,18 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
 
 #ifdef CONFIG_POSIX_MQUEUE_SYSCTL
 
-struct ctl_table_header;
-extern struct ctl_table_header *mq_register_sysctl_table(void);
+void retire_mq_sysctls(struct ipc_namespace *ns);
+bool setup_mq_sysctls(struct ipc_namespace *ns);
 
 #else /* CONFIG_POSIX_MQUEUE_SYSCTL */
 
-static inline struct ctl_table_header *mq_register_sysctl_table(void)
+static inline void retire_mq_sysctls(struct ipc_namespace *ns)
 {
-	return NULL;
+}
+
+static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
+{
+	return true;
 }
 
 #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 72a92a08c848..fbf6a8b93a26 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -9,39 +9,9 @@
 #include <linux/ipc_namespace.h>
 #include <linux/sysctl.h>
 
-#ifdef CONFIG_PROC_SYSCTL
-static void *get_mq(struct ctl_table *table)
-{
-	char *which = table->data;
-	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
-	which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
-	return which;
-}
-
-static int proc_mq_dointvec(struct ctl_table *table, int write,
-			    void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table mq_table;
-	memcpy(&mq_table, table, sizeof(mq_table));
-	mq_table.data = get_mq(table);
-
-	return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
-}
-
-static int proc_mq_dointvec_minmax(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table mq_table;
-	memcpy(&mq_table, table, sizeof(mq_table));
-	mq_table.data = get_mq(table);
-
-	return proc_dointvec_minmax(&mq_table, write, buffer,
-					lenp, ppos);
-}
-#else
-#define proc_mq_dointvec NULL
-#define proc_mq_dointvec_minmax NULL
-#endif
+#include <linux/stat.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
 
 static int msg_max_limit_min = MIN_MSGMAX;
 static int msg_max_limit_max = HARD_MSGMAX;
@@ -55,14 +25,14 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_queues_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "msg_max",
 		.data		= &init_ipc_ns.mq_msg_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
 	},
@@ -71,7 +41,7 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msgsize_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
 	},
@@ -80,7 +50,7 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msg_default,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
 	},
@@ -89,32 +59,73 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msgsize_default,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
 	},
 	{}
 };
 
-static struct ctl_table mq_sysctl_dir[] = {
-	{
-		.procname	= "mqueue",
-		.mode		= 0555,
-		.child		= mq_sysctls,
-	},
-	{}
-};
+static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
+{
+	return &current->nsproxy->ipc_ns->mq_set;
+}
 
-static struct ctl_table mq_sysctl_root[] = {
-	{
-		.procname	= "fs",
-		.mode		= 0555,
-		.child		= mq_sysctl_dir,
-	},
-	{}
+static int set_is_seen(struct ctl_table_set *set)
+{
+	return &current->nsproxy->ipc_ns->mq_set == set;
+}
+
+static struct ctl_table_root set_root = {
+	.lookup = set_lookup,
 };
 
-struct ctl_table_header *mq_register_sysctl_table(void)
+bool setup_mq_sysctls(struct ipc_namespace *ns)
 {
-	return register_sysctl_table(mq_sysctl_root);
+	struct ctl_table *tbl;
+
+	setup_sysctl_set(&ns->mq_set, &set_root, set_is_seen);
+
+	tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
+	if (tbl) {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) {
+			if (tbl[i].data == &init_ipc_ns.mq_queues_max)
+				tbl[i].data = &ns->mq_queues_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msg_max)
+				tbl[i].data = &ns->mq_msg_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max)
+				tbl[i].data = &ns->mq_msgsize_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msg_default)
+				tbl[i].data = &ns->mq_msg_default;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default)
+				tbl[i].data = &ns->mq_msgsize_default;
+			else
+				tbl[i].data = NULL;
+		}
+
+		ns->mq_sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", tbl);
+	}
+	if (!ns->mq_sysctls) {
+		kfree(tbl);
+		retire_sysctl_set(&ns->mq_set);
+		return false;
+	}
+
+	return true;
+}
+
+void retire_mq_sysctls(struct ipc_namespace *ns)
+{
+	struct ctl_table *tbl;
+
+	tbl = ns->mq_sysctls->ctl_table_arg;
+	unregister_sysctl_table(ns->mq_sysctls);
+	retire_sysctl_set(&ns->mq_set);
+	kfree(tbl);
 }
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5becca9be867..1b4a3be71636 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -163,8 +163,6 @@ static void remove_notification(struct mqueue_inode_info *info);
 
 static struct kmem_cache *mqueue_inode_cachep;
 
-static struct ctl_table_header *mq_sysctl_table;
-
 static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
 {
 	return container_of(inode, struct mqueue_inode_info, vfs_inode);
@@ -1713,8 +1711,10 @@ static int __init init_mqueue_fs(void)
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
-	/* ignore failures - they are not fatal */
-	mq_sysctl_table = mq_register_sysctl_table();
+	if (!setup_mq_sysctls(&init_ipc_ns)) {
+		pr_warn("sysctl registration failed\n");
+		return -ENOMEM;
+	}
 
 	error = register_filesystem(&mqueue_fs_type);
 	if (error)
@@ -1731,8 +1731,6 @@ static int __init init_mqueue_fs(void)
 out_filesystem:
 	unregister_filesystem(&mqueue_fs_type);
 out_sysctl:
-	if (mq_sysctl_table)
-		unregister_sysctl_table(mq_sysctl_table);
 	kmem_cache_destroy(mqueue_inode_cachep);
 	return error;
 }
diff --git a/ipc/namespace.c b/ipc/namespace.c
index ae83f0f2651b..f760243ca685 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -59,6 +59,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	if (err)
 		goto fail_put;
 
+	err = -ENOMEM;
+	if (!setup_mq_sysctls(ns))
+		goto fail_put;
+
 	sem_init_ns(ns);
 	msg_init_ns(ns);
 	shm_init_ns(ns);
@@ -125,6 +129,8 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	msg_exit_ns(ns);
 	shm_exit_ns(ns);
 
+	retire_mq_sysctls(ns);
+
 	dec_ipc_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	ns_free_inum(&ns->ns);
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v4 2/2] ipc: Store ipc sysctls in the ipc namespace
  2022-02-14 18:18 [PATCH v4 0/2] ipc: Store mq and ipc sysctls in the ipc namespace Alexey Gladkov
  2022-02-14 18:18 ` [PATCH v4 1/2] ipc: Store mqueue " Alexey Gladkov
@ 2022-02-14 18:18 ` Alexey Gladkov
  2022-03-23 20:24 ` [GIT PULL] ipc: Bind to the ipc namespace at open time Eric W. Biederman
  2 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-02-14 18:18 UTC (permalink / raw)
  To: LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

The ipc sysctls are not available for modification inside the user
namespace. Following the mqueue sysctls, we changed the implementation
to be more userns friendly.

So far, the changes do not provide additional access to files. This
will be done in a future patch.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 include/linux/ipc_namespace.h |  21 ++++
 ipc/ipc_sysctl.c              | 189 ++++++++++++++++++++++------------
 ipc/namespace.c               |   4 +
 3 files changed, 147 insertions(+), 67 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index fa787d97d60a..e3e8c8662b49 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -67,6 +67,9 @@ struct ipc_namespace {
 	struct ctl_table_set	mq_set;
 	struct ctl_table_header	*mq_sysctls;
 
+	struct ctl_table_set	ipc_set;
+	struct ctl_table_header	*ipc_sysctls;
+
 	/* user_ns which owns the ipc ns */
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
@@ -188,4 +191,22 @@ static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
 }
 
 #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
+
+#ifdef CONFIG_SYSVIPC_SYSCTL
+
+bool setup_ipc_sysctls(struct ipc_namespace *ns);
+void retire_ipc_sysctls(struct ipc_namespace *ns);
+
+#else /* CONFIG_SYSVIPC_SYSCTL */
+
+static inline void retire_ipc_sysctls(struct ipc_namespace *ns)
+{
+}
+
+static inline bool setup_ipc_sysctls(struct ipc_namespace *ns)
+{
+	return true;
+}
+
+#endif /* CONFIG_SYSVIPC_SYSCTL */
 #endif
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index f101c171753f..15210ac47e9e 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -13,43 +13,22 @@
 #include <linux/capability.h>
 #include <linux/ipc_namespace.h>
 #include <linux/msg.h>
+#include <linux/slab.h>
 #include "util.h"
 
-static void *get_ipc(struct ctl_table *table)
-{
-	char *which = table->data;
-	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
-	which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
-	return which;
-}
-
-static int proc_ipc_dointvec(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table ipc_table;
-
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-	ipc_table.data = get_ipc(table);
-
-	return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
-}
-
-static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write,
+static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
+	struct ipc_namespace *ns = table->extra1;
 	struct ctl_table ipc_table;
+	int err;
 
 	memcpy(&ipc_table, table, sizeof(ipc_table));
-	ipc_table.data = get_ipc(table);
 
-	return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
-}
+	ipc_table.extra1 = SYSCTL_ZERO;
+	ipc_table.extra2 = SYSCTL_ONE;
 
-static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
-	int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	err = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
 
 	if (err < 0)
 		return err;
@@ -58,17 +37,6 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
 	return err;
 }
 
-static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table ipc_table;
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-	ipc_table.data = get_ipc(table);
-
-	return proc_doulongvec_minmax(&ipc_table, write, buffer,
-					lenp, ppos);
-}
-
 static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -87,11 +55,17 @@ static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
 static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 	void *buffer, size_t *lenp, loff_t *ppos)
 {
+	struct ipc_namespace *ns = table->extra1;
+	struct ctl_table ipc_table;
 	int ret, semmni;
-	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+
+	memcpy(&ipc_table, table, sizeof(ipc_table));
+
+	ipc_table.extra1 = NULL;
+	ipc_table.extra2 = NULL;
 
 	semmni = ns->sem_ctls[3];
-	ret = proc_ipc_dointvec(table, write, buffer, lenp, ppos);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (!ret)
 		ret = sem_check_semmni(current->nsproxy->ipc_ns);
@@ -108,12 +82,18 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table,
 		int write, void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct user_namespace *user_ns = current->nsproxy->ipc_ns->user_ns;
+	struct ipc_namespace *ns = table->extra1;
+	struct ctl_table ipc_table;
 
-	if (write && !checkpoint_restore_ns_capable(user_ns))
+	if (write && !checkpoint_restore_ns_capable(ns->user_ns))
 		return -EPERM;
 
-	return proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	memcpy(&ipc_table, table, sizeof(ipc_table));
+
+	ipc_table.extra1 = SYSCTL_ZERO;
+	ipc_table.extra2 = SYSCTL_INT_MAX;
+
+	return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -121,27 +101,27 @@ int ipc_mni = IPCMNI;
 int ipc_mni_shift = IPCMNI_SHIFT;
 int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
 
-static struct ctl_table ipc_kern_table[] = {
+static struct ctl_table ipc_sysctls[] = {
 	{
 		.procname	= "shmmax",
 		.data		= &init_ipc_ns.shm_ctlmax,
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlmax),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "shmall",
 		.data		= &init_ipc_ns.shm_ctlall,
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlall),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "shmmni",
 		.data		= &init_ipc_ns.shm_ctlmni,
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlmni),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ipc_mni,
 	},
@@ -151,15 +131,13 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "msgmax",
 		.data		= &init_ipc_ns.msg_ctlmax,
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmax),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_INT_MAX,
 	},
@@ -168,7 +146,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.data		= &init_ipc_ns.msg_ctlmni,
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmni),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ipc_mni,
 	},
@@ -186,7 +164,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.data		= &init_ipc_ns.msg_ctlmnb,
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmnb),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_INT_MAX,
 	},
@@ -204,8 +182,6 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
 		.mode		= 0666,
 		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "msg_next_id",
@@ -213,8 +189,6 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
 		.mode		= 0666,
 		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "shm_next_id",
@@ -222,25 +196,106 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
 		.mode		= 0666,
 		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_INT_MAX,
 	},
 #endif
 	{}
 };
 
-static struct ctl_table ipc_root_table[] = {
-	{
-		.procname	= "kernel",
-		.mode		= 0555,
-		.child		= ipc_kern_table,
-	},
-	{}
+static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
+{
+	return &current->nsproxy->ipc_ns->ipc_set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+	return &current->nsproxy->ipc_ns->ipc_set == set;
+}
+
+static struct ctl_table_root set_root = {
+	.lookup = set_lookup,
 };
 
+bool setup_ipc_sysctls(struct ipc_namespace *ns)
+{
+	struct ctl_table *tbl;
+
+	setup_sysctl_set(&ns->ipc_set, &set_root, set_is_seen);
+
+	tbl = kmemdup(ipc_sysctls, sizeof(ipc_sysctls), GFP_KERNEL);
+	if (tbl) {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) {
+			if (tbl[i].data == &init_ipc_ns.shm_ctlmax) {
+				tbl[i].data = &ns->shm_ctlmax;
+
+			} else if (tbl[i].data == &init_ipc_ns.shm_ctlall) {
+				tbl[i].data = &ns->shm_ctlall;
+
+			} else if (tbl[i].data == &init_ipc_ns.shm_ctlmni) {
+				tbl[i].data = &ns->shm_ctlmni;
+
+			} else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) {
+				tbl[i].data = &ns->shm_rmid_forced;
+				tbl[i].extra1 = ns;
+
+			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) {
+				tbl[i].data = &ns->msg_ctlmax;
+
+			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmni) {
+				tbl[i].data = &ns->msg_ctlmni;
+
+			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb) {
+				tbl[i].data = &ns->msg_ctlmnb;
+
+			} else if (tbl[i].data == &init_ipc_ns.sem_ctls) {
+				tbl[i].data = &ns->sem_ctls;
+				tbl[i].extra1 = ns;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) {
+				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
+				tbl[i].extra1 = ns;
+
+			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) {
+				tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id;
+				tbl[i].extra1 = ns;
+
+			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) {
+				tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id;
+				tbl[i].extra1 = ns;
+#endif
+			} else {
+				tbl[i].data = NULL;
+			}
+		}
+
+		ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl);
+	}
+	if (!ns->ipc_sysctls) {
+		kfree(tbl);
+		retire_sysctl_set(&ns->ipc_set);
+		return false;
+	}
+
+	return true;
+}
+
+void retire_ipc_sysctls(struct ipc_namespace *ns)
+{
+	struct ctl_table *tbl;
+
+	tbl = ns->ipc_sysctls->ctl_table_arg;
+	unregister_sysctl_table(ns->ipc_sysctls);
+	retire_sysctl_set(&ns->ipc_set);
+	kfree(tbl);
+}
+
 static int __init ipc_sysctl_init(void)
 {
-	register_sysctl_table(ipc_root_table);
+	if (!setup_ipc_sysctls(&init_ipc_ns)) {
+		pr_warn("ipc sysctl registration failed\n");
+		return -ENOMEM;
+	}
 	return 0;
 }
 
diff --git a/ipc/namespace.c b/ipc/namespace.c
index f760243ca685..754f3237194a 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -63,6 +63,9 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	if (!setup_mq_sysctls(ns))
 		goto fail_put;
 
+	if (!setup_ipc_sysctls(ns))
+		goto fail_put;
+
 	sem_init_ns(ns);
 	msg_init_ns(ns);
 	shm_init_ns(ns);
@@ -130,6 +133,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	shm_exit_ns(ns);
 
 	retire_mq_sysctls(ns);
+	retire_ipc_sysctls(ns);
 
 	dec_ipc_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [GIT PULL] ipc: Bind to the ipc namespace at open time.
  2022-02-14 18:18 [PATCH v4 0/2] ipc: Store mq and ipc sysctls in the ipc namespace Alexey Gladkov
  2022-02-14 18:18 ` [PATCH v4 1/2] ipc: Store mqueue " Alexey Gladkov
  2022-02-14 18:18 ` [PATCH v4 2/2] ipc: Store ipc " Alexey Gladkov
@ 2022-03-23 20:24 ` Eric W. Biederman
  2022-03-24 18:12   ` Linus Torvalds
  2 siblings, 1 reply; 38+ messages in thread
From: Eric W. Biederman @ 2022-03-23 20:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: LKML, Linux Containers, Alexander Mikhalitsyn, Andrew Morton,
	Christian Brauner, Daniel Walsh, Davidlohr Bueso, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin,
	Alexey Gladkov


Linus,

Please pull the per-namespace-ipc-sysctls-for-v5.18 tag from the git tree:

  git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git per-namespace-ipc-sysctls-for-v5.18
  HEAD: 1f5c135ee509e89e0cc274333a65f73c62cb16e5 ipc: Store ipc sysctls in the ipc namespace


The per ipc namespace sysctls have been imperfect since they were
implemented.  Instead of binding to the ipc namespace of the opener of
the file the code bound to the ipc namespace of the writer of the
file.

This short series of changes addresses that old deficiency in the
code.

Alexey Gladkov (2):
      ipc: Store mqueue sysctls in the ipc namespace
      ipc: Store ipc sysctls in the ipc namespace

 include/linux/ipc_namespace.h |  37 ++++++++-
 ipc/ipc_sysctl.c              | 189 +++++++++++++++++++++++++++---------------
 ipc/mq_sysctl.c               | 121 +++++++++++++++------------
 ipc/mqueue.c                  |  10 +--
 ipc/namespace.c               |  10 +++
 5 files changed, 235 insertions(+), 132 deletions(-)

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>


Eric

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [GIT PULL] ipc: Bind to the ipc namespace at open time.
  2022-03-23 20:24 ` [GIT PULL] ipc: Bind to the ipc namespace at open time Eric W. Biederman
@ 2022-03-24 18:12   ` Linus Torvalds
  2022-03-24 21:48     ` Eric W. Biederman
                       ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: Linus Torvalds @ 2022-03-24 18:12 UTC (permalink / raw)
  To: Eric W. Biederman, Alexey Gladkov
  Cc: LKML, Linux Containers, Alexander Mikhalitsyn, Andrew Morton,
	Christian Brauner, Daniel Walsh, Davidlohr Bueso, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

On Wed, Mar 23, 2022 at 1:24 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Please pull the per-namespace-ipc-sysctls-for-v5.18 tag from the git tree:

Ugh.

I pulled this. Then I stared at it for a long time.

And then I decided that this is too ugly to live.

I'm sorry. I think Alexey has probably spent a fair amount of time on
it, but I really think the sysctl code needs to be cleaned up way more
than this.

The old code was horribly hacky too, but that setup_ipc_sysctls() (and
setup_mq_sysctls()) thing that copies the whole sysctls table, and
then walks it entry by entry to modify it, is just too ugly for words.

And then it hides things in .extra1, and because it does that it can't
use the normal "extra1 and extra2 contains the limits" so then at
write() time it copies it into a local one AGAIN only to set the
limits back so that it can call the normal routines again.

Not ok.

Yes, yes, the old code did some similar things - to set the 'data'
pointer. That was disgusting too. Don't get me wrong - the existing
code was nasty too. But this took nasty code, and doubled down on it.

I really think this is a fundamental problem, and needs a more
fundamental fix than adding more and more of these kinds of nasty
hacks.

And yes, that fundamental fix is almost certainly to pass in 'struct
cred *' to all those sysctl helper functions.

Then, when you do the actual 'sysctl()' system calls, you pass in
'current_cred()".

And the /proc users would pass in file->f_cred.

And yes, that patch might be quite messy, because we have quite a lot
of those random .proc_handler users.

But *most* of them by far (at least in random code) use the standard
proc_dointvec_minmax() and friends, and won't even notice.

And then the ones that are about namespace issues will have to
continue to do the nasty "make a copy and update the data pointer",
but *MAYBE* we could also introduce the notion of an "offset" to those
proc_dointvec_minmax() things to help them out (and at least avoid the
"make a copy" thing).

Anyway, I really think we must not make that sysctl code even uglier
than it is today, and I think we need to move towards a model that
actually makes sense. And that "pass in the right cred" is the only
sensible model I can see.

I haven't tried to create such a patch, and maybe Alexey already tried
to do something like that and it turned out to be too ugly for words
and that's why these nasty patches happened.

But at least for now, I can't with a good conscience pull this.

Sorry,
                   Linus

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [GIT PULL] ipc: Bind to the ipc namespace at open time.
  2022-03-24 18:12   ` Linus Torvalds
@ 2022-03-24 21:48     ` Eric W. Biederman
  2022-03-24 22:16       ` Linus Torvalds
  2022-03-25 12:10     ` Alexey Gladkov
  2022-04-22 12:53     ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
  2 siblings, 1 reply; 38+ messages in thread
From: Eric W. Biederman @ 2022-03-24 21:48 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alexey Gladkov, LKML, Linux Containers, Alexander Mikhalitsyn,
	Andrew Morton, Christian Brauner, Daniel Walsh, Davidlohr Bueso,
	Kirill Tkhai, Manfred Spraul, Serge Hallyn, Varad Gautam,
	Vasily Averin

Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Wed, Mar 23, 2022 at 1:24 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Please pull the per-namespace-ipc-sysctls-for-v5.18 tag from the git tree:
>
> Ugh.
>
> I pulled this. Then I stared at it for a long time.
>
> And then I decided that this is too ugly to live.
>
> I'm sorry. I think Alexey has probably spent a fair amount of time on
> it, but I really think the sysctl code needs to be cleaned up way more
> than this.
>
> The old code was horribly hacky too, but that setup_ipc_sysctls() (and
> setup_mq_sysctls()) thing that copies the whole sysctls table, and
> then walks it entry by entry to modify it, is just too ugly for words.

I am not convinced when it comes to the .data pointers it is too
ugly to live.  The only realistic alternative is doing something
with the offset to adjust the pointers to the per namespace values.

Fundamentally the copy is needed because in the general case (which is
present in the networking stack) we only show a subset of the sysctls in
anything other than the initial namespace.  That is necessary as some
sysctls have not been shown to be safe for unprivileged users to write.

So there does need to be a separate array of ctl_table entries per
namespace, and the data pointers in those entries need to be adjusted to
the per namespace values.

> And then it hides things in .extra1, and because it does that it can't
> use the normal "extra1 and extra2 contains the limits" so then at
> write() time it copies it into a local one AGAIN only to set the
> limits back so that it can call the normal routines again.
>
> Not ok.

I agree.  That is a bit ugly, and I missed it when I was looking
at the code.

The mq_sysctl.c changes look reasonable to me.  There are no strange
games being played with .extra1 or .extra2.  All I see happening
in mq_sysctl.c is boiler plate to make things work.

Looking at ipc_sysctl.c I agree playing games with .extra1 and .extra2
is ugly and error prone.  Worse I noticed when reading it through that
proc_ipc_sem_dointvec passes sem_check_semmni current->nsproxy->ipc_ns
instead of passing the computed ns value.  A bug but not a regression.

I went through the code and we don't need to play games with .extra1
to get the namespace value all we need to call container_of with
the .data member.  That takes a little extra boiler plate especially
for the checkpoint_restore case.

The checkpoint_restore sysctls arguably need to be done differently so
that the permissions can be checked at open time instead of at write
time.  That would eliminate the need to play games with foobar.

>
> Anyway, I really think we must not make that sysctl code even uglier
> than it is today, and I think we need to move towards a model that
> actually makes sense. And that "pass in the right cred" is the only
> sensible model I can see.

I don't see how passing in struct cred does anything helpful.  The
namespace can not be found in struct cred.

Now this set of changes is a setup to allow implementing .set_ownership
and .permission methods so that we can allow the namespace root to write
to sysctls it is safe for the namespace root to be able to write to.
Even there I don't see how passing in a struct cred would help.

Given that there are no regressions I don't see it even being helpful
to not merge this code.

I did look and there are some cleanup pretty significant cleanup
opportunities, by using container_of on .data, which avoids the need
to stuff a namespace parameter in .extra1.

There are also significant cleanup opportunities in implementing a
.permission method that will allow the checkpoint_restart sysctls
to perform all of their permission checks at open time, and not
need any other special code.

But all of these cleanups I see are moving forward from the current
point so I don't see why we would not want to merge the code as is
and then get the tested versions of my cleanups below.



ipc/ipc_sysctl.c | 81 +++++++++++++++++++++++++-------------------------------
 1 file changed, 36 insertions(+), 45 deletions(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 15210ac47e9e..e2209d48db04 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -19,16 +19,11 @@
 static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct ipc_namespace *ns = table->extra1;
-	struct ctl_table ipc_table;
+	struct ipc_namespace *ns =
+		container_of(table->data, struct ipc_namespace, shm_rmid_forced);
 	int err;
 
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-
-	ipc_table.extra1 = SYSCTL_ZERO;
-	ipc_table.extra2 = SYSCTL_ONE;
-
-	err = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
+	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (err < 0)
 		return err;
@@ -55,20 +50,15 @@ static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
 static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 	void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct ipc_namespace *ns = table->extra1;
-	struct ctl_table ipc_table;
+	struct ipc_namespace *ns =
+		container_of(table->data, struct ipc_namespace, sem_ctls);
 	int ret, semmni;
 
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-
-	ipc_table.extra1 = NULL;
-	ipc_table.extra2 = NULL;
-
 	semmni = ns->sem_ctls[3];
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (!ret)
-		ret = sem_check_semmni(current->nsproxy->ipc_ns);
+		ret = sem_check_semmni(ns);
 
 	/*
 	 * Reset the semmni value if an error happens.
@@ -78,24 +68,6 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 	return ret;
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
-static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table,
-		int write, void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ipc_namespace *ns = table->extra1;
-	struct ctl_table ipc_table;
-
-	if (write && !checkpoint_restore_ns_capable(ns->user_ns))
-		return -EPERM;
-
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-
-	ipc_table.extra1 = SYSCTL_ZERO;
-	ipc_table.extra2 = SYSCTL_INT_MAX;
-
-	return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
-}
-#endif
 
 int ipc_mni = IPCMNI;
 int ipc_mni_shift = IPCMNI_SHIFT;
@@ -131,6 +103,8 @@ static struct ctl_table ipc_sysctls[] = {
 		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "msgmax",
@@ -180,22 +154,28 @@ static struct ctl_table ipc_sysctls[] = {
 		.procname	= "sem_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
-		.mode		= 0666,
-		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "msg_next_id",
 		.data		= &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
-		.mode		= 0666,
-		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "shm_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
-		.mode		= 0666,
-		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 #endif
 	{}
@@ -211,8 +191,24 @@ static int set_is_seen(struct ctl_table_set *set)
 	return &current->nsproxy->ipc_ns->ipc_set == set;
 }
 
+static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table)
+{
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+	int mode = table->mode;
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) ||
+	     (table->data == &ns->ids[IPC_MSG_IDS].next_id) ||
+	     (table->data == &ns->ids[IPC_SHM_IDS].next_id)) &&
+	    checkpoint_restore_ns_capable(ns->user_ns))
+		mode = 0666;
+#endif
+	return mode;
+}
+
 static struct ctl_table_root set_root = {
 	.lookup = set_lookup,
+	.permissions = ipc_permissions,
 };
 
 bool setup_ipc_sysctls(struct ipc_namespace *ns)
@@ -237,7 +233,6 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 
 			} else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) {
 				tbl[i].data = &ns->shm_rmid_forced;
-				tbl[i].extra1 = ns;
 
 			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) {
 				tbl[i].data = &ns->msg_ctlmax;
@@ -250,19 +245,15 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 
 			} else if (tbl[i].data == &init_ipc_ns.sem_ctls) {
 				tbl[i].data = &ns->sem_ctls;
-				tbl[i].extra1 = ns;
 #ifdef CONFIG_CHECKPOINT_RESTORE
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
-				tbl[i].extra1 = ns;
 
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id;
-				tbl[i].extra1 = ns;
 
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id;
-				tbl[i].extra1 = ns;
 #endif
 			} else {
 				tbl[i].data = NULL;



Eric

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [GIT PULL] ipc: Bind to the ipc namespace at open time.
  2022-03-24 21:48     ` Eric W. Biederman
@ 2022-03-24 22:16       ` Linus Torvalds
  0 siblings, 0 replies; 38+ messages in thread
From: Linus Torvalds @ 2022-03-24 22:16 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alexey Gladkov, LKML, Linux Containers, Alexander Mikhalitsyn,
	Andrew Morton, Christian Brauner, Daniel Walsh, Davidlohr Bueso,
	Kirill Tkhai, Manfred Spraul, Serge Hallyn, Varad Gautam,
	Vasily Averin

On Thu, Mar 24, 2022 at 2:49 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> I don't see how passing in struct cred does anything helpful.  The
> namespace can not be found in struct cred.

Duh. For some reason I thought we had it there, but yeah, that only
contains the user-namespace.

The rest is in tsk->nsproxy.

So it would have to be squirrelled away in some other way.

> But all of these cleanups I see are moving forward from the current
> point so I don't see why we would not want to merge the code as is
> and then get the tested versions of my cleanups below.

What part of "that code is too ugly to live" did not end up registering.

Maybe it can be cleaned up. But it had better be cleaned up before I pull it.

                Linus

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [GIT PULL] ipc: Bind to the ipc namespace at open time.
  2022-03-24 18:12   ` Linus Torvalds
  2022-03-24 21:48     ` Eric W. Biederman
@ 2022-03-25 12:10     ` Alexey Gladkov
  2022-04-22 12:53     ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
  2 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-03-25 12:10 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric W. Biederman, LKML, Linux Containers, Alexander Mikhalitsyn,
	Andrew Morton, Christian Brauner, Daniel Walsh, Davidlohr Bueso,
	Kirill Tkhai, Manfred Spraul, Serge Hallyn, Varad Gautam,
	Vasily Averin

On Thu, Mar 24, 2022 at 11:12:21AM -0700, Linus Torvalds wrote:
> On Wed, Mar 23, 2022 at 1:24 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >
> > Please pull the per-namespace-ipc-sysctls-for-v5.18 tag from the git tree:
> 
> Ugh.
> 
> I pulled this. Then I stared at it for a long time.
> 
> And then I decided that this is too ugly to live.
> 
> I'm sorry. I think Alexey has probably spent a fair amount of time on
> it, but I really think the sysctl code needs to be cleaned up way more
> than this.

Apparently it's my fault that the purpose of these changes is not clear. I
did this refactoring not for the sake of refactoring as such.

In my original patch [1], I was trying to fix the situation where the user
cannot change the /proc/sys/fs/mqueue/* options inside rootless container.

But then I split the changes into refactoring which fixes the hack and
permission changes which I wanted to discuss and propose later.

[1] https://lore.kernel.org/lkml/0f0408bb7fbf3187966a9bf19a98642a5d9669fe.1641225760.git.legion@kernel.org/

> The old code was horribly hacky too, but that setup_ipc_sysctls() (and
> setup_mq_sysctls()) thing that copies the whole sysctls table, and
> then walks it entry by entry to modify it, is just too ugly for words.
> 
> And then it hides things in .extra1, and because it does that it can't
> use the normal "extra1 and extra2 contains the limits" so then at
> write() time it copies it into a local one AGAIN only to set the
> limits back so that it can call the normal routines again.
> 
> Not ok.
> 
> Yes, yes, the old code did some similar things - to set the 'data'
> pointer. That was disgusting too. Don't get me wrong - the existing
> code was nasty too. But this took nasty code, and doubled down on it.
> 
> I really think this is a fundamental problem, and needs a more
> fundamental fix than adding more and more of these kinds of nasty
> hacks.
> 
> And yes, that fundamental fix is almost certainly to pass in 'struct
> cred *' to all those sysctl helper functions.
> 
> Then, when you do the actual 'sysctl()' system calls, you pass in
> 'current_cred()".
> 
> And the /proc users would pass in file->f_cred.
> 
> And yes, that patch might be quite messy, because we have quite a lot
> of those random .proc_handler users.
> 
> But *most* of them by far (at least in random code) use the standard
> proc_dointvec_minmax() and friends, and won't even notice.
> 
> And then the ones that are about namespace issues will have to
> continue to do the nasty "make a copy and update the data pointer",
> but *MAYBE* we could also introduce the notion of an "offset" to those
> proc_dointvec_minmax() things to help them out (and at least avoid the
> "make a copy" thing).
> 
> Anyway, I really think we must not make that sysctl code even uglier
> than it is today, and I think we need to move towards a model that
> actually makes sense. And that "pass in the right cred" is the only
> sensible model I can see.
> 
> I haven't tried to create such a patch, and maybe Alexey already tried
> to do something like that and it turned out to be too ugly for words
> and that's why these nasty patches happened.
> 
> But at least for now, I can't with a good conscience pull this.
> 
> Sorry,
>                    Linus

OK. I'll try to come up with some other solution.

-- 
Rgrds, legion


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace
  2022-03-24 18:12   ` Linus Torvalds
  2022-03-24 21:48     ` Eric W. Biederman
  2022-03-25 12:10     ` Alexey Gladkov
@ 2022-04-22 12:53     ` Alexey Gladkov
  2022-04-22 12:53       ` [PATCH v1 1/4] " Alexey Gladkov
                         ` (4 more replies)
  2 siblings, 5 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-04-22 12:53 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Kirill Tkhai, Linux Containers,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

These patches are made on top of branch:

git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git per-namespace-ipc-sysctls-for-v5.18

After discussion, I took into account Eric Biederman's fixes. With these
changes, the hacks of passing namespace through .extra1 are not required.

With the current design of sysctl dynamic memory allocation is
necessary.

Yes, Linus, these changes are not the refactoring you were talking
about, but I plan to try to do such a refactoring in the my next
patchset.

--

Alexey Gladkov (4):
  ipc: Remove extra1 field abuse to pass ipc namespace
  ipc: Use proper ipc namespace
  ipc: Check permissions for checkpoint_restart sysctls at open time
  ipc: Remove extra braces

 ipc/ipc_sysctl.c | 108 +++++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 59 deletions(-)

-- 
2.33.2


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH v1 1/4] ipc: Remove extra1 field abuse to pass ipc namespace
  2022-04-22 12:53     ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
@ 2022-04-22 12:53       ` Alexey Gladkov
  2022-05-02 16:07         ` Eric W. Biederman
  2022-04-22 12:53       ` [PATCH v1 2/4] ipc: Use proper " Alexey Gladkov
                         ` (3 subsequent siblings)
  4 siblings, 1 reply; 38+ messages in thread
From: Alexey Gladkov @ 2022-04-22 12:53 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Kirill Tkhai, Linux Containers,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

Eric Biederman pointed out that using .extra1 to pass ipc namespace
looks like an ugly hack and there is a better solution.

Link: https://lore.kernel.org/lkml/87czib9g38.fsf@email.froward.int.ebiederm.org/
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 ipc/ipc_sysctl.c | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 15210ac47e9e..eb7ba8e0a355 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -19,16 +19,11 @@
 static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct ipc_namespace *ns = table->extra1;
-	struct ctl_table ipc_table;
+	struct ipc_namespace *ns =
+		container_of(table->data, struct ipc_namespace, shm_rmid_forced);
 	int err;
 
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-
-	ipc_table.extra1 = SYSCTL_ZERO;
-	ipc_table.extra2 = SYSCTL_ONE;
-
-	err = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
+	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (err < 0)
 		return err;
@@ -55,20 +50,15 @@ static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
 static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 	void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct ipc_namespace *ns = table->extra1;
-	struct ctl_table ipc_table;
+	struct ipc_namespace *ns =
+		container_of(table->data, struct ipc_namespace, sem_ctls);
 	int ret, semmni;
 
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-
-	ipc_table.extra1 = NULL;
-	ipc_table.extra2 = NULL;
-
 	semmni = ns->sem_ctls[3];
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (!ret)
-		ret = sem_check_semmni(current->nsproxy->ipc_ns);
+		ret = sem_check_semmni(ns);
 
 	/*
 	 * Reset the semmni value if an error happens.
@@ -131,6 +121,8 @@ static struct ctl_table ipc_sysctls[] = {
 		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "msgmax",
@@ -237,7 +229,6 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 
 			} else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) {
 				tbl[i].data = &ns->shm_rmid_forced;
-				tbl[i].extra1 = ns;
 
 			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) {
 				tbl[i].data = &ns->msg_ctlmax;
@@ -250,7 +241,6 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 
 			} else if (tbl[i].data == &init_ipc_ns.sem_ctls) {
 				tbl[i].data = &ns->sem_ctls;
-				tbl[i].extra1 = ns;
 #ifdef CONFIG_CHECKPOINT_RESTORE
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
-- 
2.33.2


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 2/4] ipc: Use proper ipc namespace
  2022-04-22 12:53     ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
  2022-04-22 12:53       ` [PATCH v1 1/4] " Alexey Gladkov
@ 2022-04-22 12:53       ` Alexey Gladkov
  2022-05-02 16:09         ` Eric W. Biederman
  2022-04-22 12:53       ` [PATCH v1 3/4] ipc: Check permissions for checkpoint_restart sysctls at open time Alexey Gladkov
                         ` (2 subsequent siblings)
  4 siblings, 1 reply; 38+ messages in thread
From: Alexey Gladkov @ 2022-04-22 12:53 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Kirill Tkhai, Linux Containers,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

As Eric Biederman pointed out, changing the namespace broke checkpoint
restore. I have reverted my previous changes.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 ipc/ipc_sysctl.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index eb7ba8e0a355..ff99d0305a5b 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -72,7 +72,7 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table,
 		int write, void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct ipc_namespace *ns = table->extra1;
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
 	struct ctl_table ipc_table;
 
 	if (write && !checkpoint_restore_ns_capable(ns->user_ns))
@@ -244,15 +244,12 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 #ifdef CONFIG_CHECKPOINT_RESTORE
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
-				tbl[i].extra1 = ns;
 
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id;
-				tbl[i].extra1 = ns;
 
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id;
-				tbl[i].extra1 = ns;
 #endif
 			} else {
 				tbl[i].data = NULL;
-- 
2.33.2


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 3/4] ipc: Check permissions for checkpoint_restart sysctls at open time
  2022-04-22 12:53     ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
  2022-04-22 12:53       ` [PATCH v1 1/4] " Alexey Gladkov
  2022-04-22 12:53       ` [PATCH v1 2/4] ipc: Use proper " Alexey Gladkov
@ 2022-04-22 12:53       ` Alexey Gladkov
  2022-04-22 12:53       ` [PATCH v1 4/4] ipc: Remove extra braces Alexey Gladkov
  2022-04-22 20:44       ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Linus Torvalds
  4 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-04-22 12:53 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Kirill Tkhai, Linux Containers,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

As Eric Biederman pointed out, it is possible not to use a custom
proc_handler and check permissions for every write, but to use a
.permission handler. That will allow the checkpoint_restart sysctls to
perform all of their permission checks at open time, and not need any
other special code.

Link: https://lore.kernel.org/lkml/87czib9g38.fsf@email.froward.int.ebiederm.org/
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 ipc/ipc_sysctl.c | 54 ++++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index ff99d0305a5b..5a58598d48c8 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -68,25 +68,6 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 	return ret;
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
-static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table,
-		int write, void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
-	struct ctl_table ipc_table;
-
-	if (write && !checkpoint_restore_ns_capable(ns->user_ns))
-		return -EPERM;
-
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-
-	ipc_table.extra1 = SYSCTL_ZERO;
-	ipc_table.extra2 = SYSCTL_INT_MAX;
-
-	return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
-}
-#endif
-
 int ipc_mni = IPCMNI;
 int ipc_mni_shift = IPCMNI_SHIFT;
 int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
@@ -172,22 +153,28 @@ static struct ctl_table ipc_sysctls[] = {
 		.procname	= "sem_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
-		.mode		= 0666,
-		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "msg_next_id",
 		.data		= &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
-		.mode		= 0666,
-		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "shm_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
-		.mode		= 0666,
-		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 #endif
 	{}
@@ -203,8 +190,25 @@ static int set_is_seen(struct ctl_table_set *set)
 	return &current->nsproxy->ipc_ns->ipc_set == set;
 }
 
+static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table)
+{
+	int mode = table->mode;
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+
+	if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) ||
+	     (table->data == &ns->ids[IPC_MSG_IDS].next_id) ||
+	     (table->data == &ns->ids[IPC_SHM_IDS].next_id)) &&
+	    checkpoint_restore_ns_capable(ns->user_ns))
+		mode = 0666;
+#endif
+	return mode;
+}
+
 static struct ctl_table_root set_root = {
 	.lookup = set_lookup,
+	.permissions = ipc_permissions,
 };
 
 bool setup_ipc_sysctls(struct ipc_namespace *ns)
-- 
2.33.2


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 4/4] ipc: Remove extra braces
  2022-04-22 12:53     ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
                         ` (2 preceding siblings ...)
  2022-04-22 12:53       ` [PATCH v1 3/4] ipc: Check permissions for checkpoint_restart sysctls at open time Alexey Gladkov
@ 2022-04-22 12:53       ` Alexey Gladkov
  2022-04-22 20:44       ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Linus Torvalds
  4 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-04-22 12:53 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Kirill Tkhai, Linux Containers,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

Fix coding style. In the previous commit, I added braces because,
in addition to changing .data, .extra1 also changed. Now this is not
needed.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 ipc/ipc_sysctl.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 5a58598d48c8..ef313ecfb53a 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -222,42 +222,41 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 		int i;
 
 		for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) {
-			if (tbl[i].data == &init_ipc_ns.shm_ctlmax) {
+			if (tbl[i].data == &init_ipc_ns.shm_ctlmax)
 				tbl[i].data = &ns->shm_ctlmax;
 
-			} else if (tbl[i].data == &init_ipc_ns.shm_ctlall) {
+			else if (tbl[i].data == &init_ipc_ns.shm_ctlall)
 				tbl[i].data = &ns->shm_ctlall;
 
-			} else if (tbl[i].data == &init_ipc_ns.shm_ctlmni) {
+			else if (tbl[i].data == &init_ipc_ns.shm_ctlmni)
 				tbl[i].data = &ns->shm_ctlmni;
 
-			} else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) {
+			else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced)
 				tbl[i].data = &ns->shm_rmid_forced;
 
-			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) {
+			else if (tbl[i].data == &init_ipc_ns.msg_ctlmax)
 				tbl[i].data = &ns->msg_ctlmax;
 
-			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmni) {
+			else if (tbl[i].data == &init_ipc_ns.msg_ctlmni)
 				tbl[i].data = &ns->msg_ctlmni;
 
-			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb) {
+			else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb)
 				tbl[i].data = &ns->msg_ctlmnb;
 
-			} else if (tbl[i].data == &init_ipc_ns.sem_ctls) {
+			else if (tbl[i].data == &init_ipc_ns.sem_ctls)
 				tbl[i].data = &ns->sem_ctls;
 #ifdef CONFIG_CHECKPOINT_RESTORE
-			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) {
+			else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id)
 				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
 
-			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) {
+			else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id)
 				tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id;
 
-			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) {
+			else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id)
 				tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id;
 #endif
-			} else {
+			else
 				tbl[i].data = NULL;
-			}
 		}
 
 		ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl);
-- 
2.33.2


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace
  2022-04-22 12:53     ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
                         ` (3 preceding siblings ...)
  2022-04-22 12:53       ` [PATCH v1 4/4] ipc: Remove extra braces Alexey Gladkov
@ 2022-04-22 20:44       ` Linus Torvalds
  2022-05-04  3:42         ` Philip Rhoades
  2022-06-01 13:20         ` [RFC PATCH 0/4] API extension for handling sysctl Alexey Gladkov
  4 siblings, 2 replies; 38+ messages in thread
From: Linus Torvalds @ 2022-04-22 20:44 UTC (permalink / raw)
  To: Alexey Gladkov
  Cc: LKML, Eric W . Biederman, Alexander Mikhalitsyn, Andrew Morton,
	Christian Brauner, Daniel Walsh, Davidlohr Bueso, Kirill Tkhai,
	Linux Containers, Manfred Spraul, Serge Hallyn, Varad Gautam,
	Vasily Averin

On Fri, Apr 22, 2022 at 5:53 AM Alexey Gladkov <legion@kernel.org> wrote:
>
> Yes, Linus, these changes are not the refactoring you were talking
> about, but I plan to try to do such a refactoring in the my next
> patchset.

Heh. Ok, I'm not saying these patches are pretty, and looking up the
namespace thing is a bit subtle, but it's certainly prettier than the
existing odd "create a new ctl_table entry because of field abuse".

So this certainly looks like an improvement from a quick look.

              Linus

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 1/4] ipc: Remove extra1 field abuse to pass ipc namespace
  2022-04-22 12:53       ` [PATCH v1 1/4] " Alexey Gladkov
@ 2022-05-02 16:07         ` Eric W. Biederman
  0 siblings, 0 replies; 38+ messages in thread
From: Eric W. Biederman @ 2022-05-02 16:07 UTC (permalink / raw)
  To: Alexey Gladkov
  Cc: LKML, Linus Torvalds, Alexander Mikhalitsyn, Andrew Morton,
	Christian Brauner, Daniel Walsh, Davidlohr Bueso, Kirill Tkhai,
	Linux Containers, Manfred Spraul, Serge Hallyn, Varad Gautam,
	Vasily Averin

Alexey Gladkov <legion@kernel.org> writes:

> Eric Biederman pointed out that using .extra1 to pass ipc namespace
> looks like an ugly hack and there is a better solution.
>
> Link: https://lore.kernel.org/lkml/87czib9g38.fsf@email.froward.int.ebiederm.org/
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
> Signed-off-by: Alexey Gladkov <legion@kernel.org>
> ---
>  ipc/ipc_sysctl.c | 26 ++++++++------------------
>  1 file changed, 8 insertions(+), 18 deletions(-)
>
> diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
> index 15210ac47e9e..eb7ba8e0a355 100644
> @@ -55,20 +50,15 @@ static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
>  static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
>  	void *buffer, size_t *lenp, loff_t *ppos)
>  {
> -	struct ipc_namespace *ns = table->extra1;
> -	struct ctl_table ipc_table;
> +	struct ipc_namespace *ns =
> +		container_of(table->data, struct ipc_namespace, sem_ctls);
>  	int ret, semmni;
>  
> -	memcpy(&ipc_table, table, sizeof(ipc_table));
> -
> -	ipc_table.extra1 = NULL;
> -	ipc_table.extra2 = NULL;
> -
>  	semmni = ns->sem_ctls[3];
>  	ret = proc_dointvec(table, write, buffer, lenp, ppos);
>  
>  	if (!ret)
> -		ret = sem_check_semmni(current->nsproxy->ipc_ns);
> +		ret = sem_check_semmni(ns);
		^^^^^^^^^^^^^^^^^^^^^^^^^^^

Can you break this one line change into a separate patch?

It is a bug fix so that the entire function uses the same
ns value.  I expect the change would read easier if the
change was separate.

>  
>  	/*
>  	 * Reset the semmni value if an error happens.

Eric

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 2/4] ipc: Use proper ipc namespace
  2022-04-22 12:53       ` [PATCH v1 2/4] ipc: Use proper " Alexey Gladkov
@ 2022-05-02 16:09         ` Eric W. Biederman
  2022-05-03 13:39           ` Alexey Gladkov
  0 siblings, 1 reply; 38+ messages in thread
From: Eric W. Biederman @ 2022-05-02 16:09 UTC (permalink / raw)
  To: Alexey Gladkov
  Cc: LKML, Linus Torvalds, Alexander Mikhalitsyn, Andrew Morton,
	Christian Brauner, Daniel Walsh, Davidlohr Bueso, Kirill Tkhai,
	Linux Containers, Manfred Spraul, Serge Hallyn, Varad Gautam,
	Vasily Averin

Alexey Gladkov <legion@kernel.org> writes:

> As Eric Biederman pointed out, changing the namespace broke checkpoint
> restore. I have reverted my previous changes.

Can you remind me the bug that is being fixed here?

I am probably just going to fast to see it, but it would be good to have
it described in the commit comment.

Thanks,
Eric

>
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
> Signed-off-by: Alexey Gladkov <legion@kernel.org>
> ---
>  ipc/ipc_sysctl.c | 5 +----
>  1 file changed, 1 insertion(+), 4 deletions(-)
>
> diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
> index eb7ba8e0a355..ff99d0305a5b 100644
> --- a/ipc/ipc_sysctl.c
> +++ b/ipc/ipc_sysctl.c
> @@ -72,7 +72,7 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
>  static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table,
>  		int write, void *buffer, size_t *lenp, loff_t *ppos)
>  {
> -	struct ipc_namespace *ns = table->extra1;
> +	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
>  	struct ctl_table ipc_table;
>  
>  	if (write && !checkpoint_restore_ns_capable(ns->user_ns))
> @@ -244,15 +244,12 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
>  #ifdef CONFIG_CHECKPOINT_RESTORE
>  			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) {
>  				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
> -				tbl[i].extra1 = ns;
>  
>  			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) {
>  				tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id;
> -				tbl[i].extra1 = ns;
>  
>  			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) {
>  				tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id;
> -				tbl[i].extra1 = ns;
>  #endif
>  			} else {
>  				tbl[i].data = NULL;

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 2/4] ipc: Use proper ipc namespace
  2022-05-02 16:09         ` Eric W. Biederman
@ 2022-05-03 13:39           ` Alexey Gladkov
  2022-05-03 13:39             ` [PATCH v2 1/4] ipc: Use the same namespace to modify and validate Alexey Gladkov
                               ` (3 more replies)
  0 siblings, 4 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-05-03 13:39 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Kirill Tkhai, Linux Containers,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

On Mon, May 02, 2022 at 11:09:22AM -0500, Eric W. Biederman wrote:
> Alexey Gladkov <legion@kernel.org> writes:
> 
> > As Eric Biederman pointed out, changing the namespace broke checkpoint
> > restore. I have reverted my previous changes.
> 
> Can you remind me the bug that is being fixed here?
> 
> I am probably just going to fast to see it, but it would be good to have
> it described in the commit comment.

It was preparation for the "ipc: Check permissions for checkpoint_restart
sysctls at open time". I wanted to split the commit and make it more readable,
but it looks like I just confused everyone. This change should be part of the
next commit.

Here is a new version of the patchset where I moved the bugfix to a separate
commit as you requested in the next email.

Does that look better?

--

Alexey Gladkov (4):
  ipc: Use the same namespace to modify and validate
  ipc: Remove extra1 field abuse to pass ipc namespace
  ipc: Check permissions for checkpoint_restart sysctls at open time
  ipc: Remove extra braces

 ipc/ipc_sysctl.c | 108 +++++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 59 deletions(-)

-- 
2.33.3


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH v2 1/4] ipc: Use the same namespace to modify and validate
  2022-05-03 13:39           ` Alexey Gladkov
@ 2022-05-03 13:39             ` Alexey Gladkov
  2022-05-03 13:39             ` [PATCH v2 2/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
                               ` (2 subsequent siblings)
  3 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-05-03 13:39 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Kirill Tkhai, Linux Containers,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

In the 1f5c135ee509 ("ipc: Store ipc sysctls in the ipc namespace") I
missed that in addition to the modification of sem_ctls[3], the change
is validated. This validation must occur in the same namespace.

Link: https://lore.kernel.org/lkml/875ymnvryb.fsf@email.froward.int.ebiederm.org/
Fixes: 1f5c135ee509 ("ipc: Store ipc sysctls in the ipc namespace")
Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 ipc/ipc_sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 15210ac47e9e..d1d5204cf589 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -68,7 +68,7 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (!ret)
-		ret = sem_check_semmni(current->nsproxy->ipc_ns);
+		ret = sem_check_semmni(ns);
 
 	/*
 	 * Reset the semmni value if an error happens.
-- 
2.33.3


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v2 2/4] ipc: Remove extra1 field abuse to pass ipc namespace
  2022-05-03 13:39           ` Alexey Gladkov
  2022-05-03 13:39             ` [PATCH v2 1/4] ipc: Use the same namespace to modify and validate Alexey Gladkov
@ 2022-05-03 13:39             ` Alexey Gladkov
  2022-05-03 13:39             ` [PATCH v2 3/4] ipc: Check permissions for checkpoint_restart sysctls at open time Alexey Gladkov
  2022-05-03 13:39             ` [PATCH v2 4/4] ipc: Remove extra braces Alexey Gladkov
  3 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-05-03 13:39 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Kirill Tkhai, Linux Containers,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

Eric Biederman pointed out that using .extra1 to pass ipc namespace
looks like an ugly hack and there is a better solution. We can get the
ipc_namespace using the .data field.

Link: https://lore.kernel.org/lkml/87czib9g38.fsf@email.froward.int.ebiederm.org/
Fixes: 1f5c135ee509 ("ipc: Store ipc sysctls in the ipc namespace")
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 ipc/ipc_sysctl.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index d1d5204cf589..eb7ba8e0a355 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -19,16 +19,11 @@
 static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct ipc_namespace *ns = table->extra1;
-	struct ctl_table ipc_table;
+	struct ipc_namespace *ns =
+		container_of(table->data, struct ipc_namespace, shm_rmid_forced);
 	int err;
 
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-
-	ipc_table.extra1 = SYSCTL_ZERO;
-	ipc_table.extra2 = SYSCTL_ONE;
-
-	err = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
+	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (err < 0)
 		return err;
@@ -55,15 +50,10 @@ static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
 static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 	void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct ipc_namespace *ns = table->extra1;
-	struct ctl_table ipc_table;
+	struct ipc_namespace *ns =
+		container_of(table->data, struct ipc_namespace, sem_ctls);
 	int ret, semmni;
 
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-
-	ipc_table.extra1 = NULL;
-	ipc_table.extra2 = NULL;
-
 	semmni = ns->sem_ctls[3];
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
@@ -131,6 +121,8 @@ static struct ctl_table ipc_sysctls[] = {
 		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "msgmax",
@@ -237,7 +229,6 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 
 			} else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) {
 				tbl[i].data = &ns->shm_rmid_forced;
-				tbl[i].extra1 = ns;
 
 			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) {
 				tbl[i].data = &ns->msg_ctlmax;
@@ -250,7 +241,6 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 
 			} else if (tbl[i].data == &init_ipc_ns.sem_ctls) {
 				tbl[i].data = &ns->sem_ctls;
-				tbl[i].extra1 = ns;
 #ifdef CONFIG_CHECKPOINT_RESTORE
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
-- 
2.33.3


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v2 3/4] ipc: Check permissions for checkpoint_restart sysctls at open time
  2022-05-03 13:39           ` Alexey Gladkov
  2022-05-03 13:39             ` [PATCH v2 1/4] ipc: Use the same namespace to modify and validate Alexey Gladkov
  2022-05-03 13:39             ` [PATCH v2 2/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
@ 2022-05-03 13:39             ` Alexey Gladkov
  2022-05-03 13:39             ` [PATCH v2 4/4] ipc: Remove extra braces Alexey Gladkov
  3 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-05-03 13:39 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Kirill Tkhai, Linux Containers,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

As Eric Biederman pointed out, it is possible not to use a custom
proc_handler and check permissions for every write, but to use a
.permission handler. That will allow the checkpoint_restart sysctls to
perform all of their permission checks at open time, and not need any
other special code.

Link: https://lore.kernel.org/lkml/87czib9g38.fsf@email.froward.int.ebiederm.org/
Fixes: 1f5c135ee509 ("ipc: Store ipc sysctls in the ipc namespace")
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 ipc/ipc_sysctl.c | 57 ++++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index eb7ba8e0a355..5a58598d48c8 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -68,25 +68,6 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 	return ret;
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
-static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table,
-		int write, void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ipc_namespace *ns = table->extra1;
-	struct ctl_table ipc_table;
-
-	if (write && !checkpoint_restore_ns_capable(ns->user_ns))
-		return -EPERM;
-
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-
-	ipc_table.extra1 = SYSCTL_ZERO;
-	ipc_table.extra2 = SYSCTL_INT_MAX;
-
-	return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
-}
-#endif
-
 int ipc_mni = IPCMNI;
 int ipc_mni_shift = IPCMNI_SHIFT;
 int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
@@ -172,22 +153,28 @@ static struct ctl_table ipc_sysctls[] = {
 		.procname	= "sem_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
-		.mode		= 0666,
-		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "msg_next_id",
 		.data		= &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
-		.mode		= 0666,
-		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "shm_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
-		.mode		= 0666,
-		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 #endif
 	{}
@@ -203,8 +190,25 @@ static int set_is_seen(struct ctl_table_set *set)
 	return &current->nsproxy->ipc_ns->ipc_set == set;
 }
 
+static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table)
+{
+	int mode = table->mode;
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+
+	if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) ||
+	     (table->data == &ns->ids[IPC_MSG_IDS].next_id) ||
+	     (table->data == &ns->ids[IPC_SHM_IDS].next_id)) &&
+	    checkpoint_restore_ns_capable(ns->user_ns))
+		mode = 0666;
+#endif
+	return mode;
+}
+
 static struct ctl_table_root set_root = {
 	.lookup = set_lookup,
+	.permissions = ipc_permissions,
 };
 
 bool setup_ipc_sysctls(struct ipc_namespace *ns)
@@ -244,15 +248,12 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 #ifdef CONFIG_CHECKPOINT_RESTORE
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
-				tbl[i].extra1 = ns;
 
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id;
-				tbl[i].extra1 = ns;
 
 			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) {
 				tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id;
-				tbl[i].extra1 = ns;
 #endif
 			} else {
 				tbl[i].data = NULL;
-- 
2.33.3


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v2 4/4] ipc: Remove extra braces
  2022-05-03 13:39           ` Alexey Gladkov
                               ` (2 preceding siblings ...)
  2022-05-03 13:39             ` [PATCH v2 3/4] ipc: Check permissions for checkpoint_restart sysctls at open time Alexey Gladkov
@ 2022-05-03 13:39             ` Alexey Gladkov
  3 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-05-03 13:39 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Kirill Tkhai, Linux Containers,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

Fix coding style. In the previous commit, I added braces because,
in addition to changing .data, .extra1 also changed. Now this is not
needed.

Fixes: 1f5c135ee509 ("ipc: Store ipc sysctls in the ipc namespace")
Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 ipc/ipc_sysctl.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 5a58598d48c8..ef313ecfb53a 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -222,42 +222,41 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 		int i;
 
 		for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) {
-			if (tbl[i].data == &init_ipc_ns.shm_ctlmax) {
+			if (tbl[i].data == &init_ipc_ns.shm_ctlmax)
 				tbl[i].data = &ns->shm_ctlmax;
 
-			} else if (tbl[i].data == &init_ipc_ns.shm_ctlall) {
+			else if (tbl[i].data == &init_ipc_ns.shm_ctlall)
 				tbl[i].data = &ns->shm_ctlall;
 
-			} else if (tbl[i].data == &init_ipc_ns.shm_ctlmni) {
+			else if (tbl[i].data == &init_ipc_ns.shm_ctlmni)
 				tbl[i].data = &ns->shm_ctlmni;
 
-			} else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) {
+			else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced)
 				tbl[i].data = &ns->shm_rmid_forced;
 
-			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) {
+			else if (tbl[i].data == &init_ipc_ns.msg_ctlmax)
 				tbl[i].data = &ns->msg_ctlmax;
 
-			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmni) {
+			else if (tbl[i].data == &init_ipc_ns.msg_ctlmni)
 				tbl[i].data = &ns->msg_ctlmni;
 
-			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb) {
+			else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb)
 				tbl[i].data = &ns->msg_ctlmnb;
 
-			} else if (tbl[i].data == &init_ipc_ns.sem_ctls) {
+			else if (tbl[i].data == &init_ipc_ns.sem_ctls)
 				tbl[i].data = &ns->sem_ctls;
 #ifdef CONFIG_CHECKPOINT_RESTORE
-			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) {
+			else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id)
 				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
 
-			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) {
+			else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id)
 				tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id;
 
-			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) {
+			else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id)
 				tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id;
 #endif
-			} else {
+			else
 				tbl[i].data = NULL;
-			}
 		}
 
 		ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl);
-- 
2.33.3


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace
  2022-04-22 20:44       ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Linus Torvalds
@ 2022-05-04  3:42         ` Philip Rhoades
  2022-06-01 13:20         ` [RFC PATCH 0/4] API extension for handling sysctl Alexey Gladkov
  1 sibling, 0 replies; 38+ messages in thread
From: Philip Rhoades @ 2022-05-04  3:42 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alexey Gladkov, LKML, Eric W . Biederman, Alexander Mikhalitsyn,
	Andrew Morton, Christian Brauner, Daniel Walsh, Davidlohr Bueso,
	Kirill Tkhai, Linux Containers, Manfred Spraul, Serge Hallyn,
	Varad Gautam, Vasily Averin

People,


On 2022-04-23 06:44, Linus Torvalds wrote:
> On Fri, Apr 22, 2022 at 5:53 AM Alexey Gladkov <legion@kernel.org> 
> wrote:
>> 
>> Yes, Linus, these changes are not the refactoring you were talking
>> about, but I plan to try to do such a refactoring in the my next
>> patchset.
> 
> Heh. Ok, I'm not saying these patches are pretty, and looking up the
> namespace thing is a bit subtle, but it's certainly prettier than the
> existing odd "create a new ctl_table entry because of field abuse".
> 
> So this certainly looks like an improvement from a quick look.
> 
>               Linus


Thanks for that little glimpse into what happens with kernel development 
- interesting!

Phil.
-- 
Philip Rhoades

PO Box 896
Cowra  NSW  2794
Australia
E-mail:  phil@pricom.com.au

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [RFC PATCH 0/4] API extension for handling sysctl
  2022-04-22 20:44       ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Linus Torvalds
  2022-05-04  3:42         ` Philip Rhoades
@ 2022-06-01 13:20         ` Alexey Gladkov
  2022-06-01 13:20           ` [RFC PATCH 1/4] sysctl: " Alexey Gladkov
                             ` (4 more replies)
  1 sibling, 5 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-06-01 13:20 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Andrew Morton, Christian Brauner, Iurii Zaikin, Kees Cook,
	Linux Containers, linux-fsdevel, Luis Chamberlain, Vasily Averin

On Fri, Apr 22, 2022 at 01:44:50PM -0700, Linus Torvalds wrote:
> On Fri, Apr 22, 2022 at 5:53 AM Alexey Gladkov <legion@kernel.org> wrote:
> >
> > Yes, Linus, these changes are not the refactoring you were talking
> > about, but I plan to try to do such a refactoring in the my next
> > patchset.
> 
> Heh. Ok, I'm not saying these patches are pretty, and looking up the
> namespace thing is a bit subtle, but it's certainly prettier than the
> existing odd "create a new ctl_table entry because of field abuse".

As I promised, here is one of the possible options for how to get rid of dynamic
memory allocation.

We can slightly extend the API and thus be able to save data at the time the
file is opened. This will not only eliminate the need to allocate memory, but
also provide access to file struct and f_cred.

I made an RFC because I'm not sure that I did the permissions check for
ipc_sysctl. I also did not change all the places where this API can be applied
to make the patch smaller. As in the case of /proc/sys/kernel/printk where
CAP_SYS_ADMIN is checked[1] for the current process at the time of write.

I made a patchset on top of:

git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-next

Because there are my previous changes.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/printk/sysctl.c#n17

--

Alexey Gladkov (4):
  sysctl: API extension for handling sysctl
  sysctl: ipc: Do not use dynamic memory
  sysctl: userns: Do not use dynamic memory
  sysctl: mqueue: Do not use dynamic memory

 fs/proc/proc_sysctl.c          |  71 ++++++++--
 include/linux/ipc_namespace.h  |  35 -----
 include/linux/sysctl.h         |  20 ++-
 include/linux/user_namespace.h |   6 -
 ipc/ipc_sysctl.c               | 236 +++++++++++++++++----------------
 ipc/mq_sysctl.c                | 138 ++++++++++---------
 ipc/mqueue.c                   |   5 -
 ipc/namespace.c                |  10 --
 kernel/ucount.c                | 116 +++++++---------
 kernel/user_namespace.c        |  10 +-
 10 files changed, 323 insertions(+), 324 deletions(-)

-- 
2.33.3


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [RFC PATCH 1/4] sysctl: API extension for handling sysctl
  2022-06-01 13:20         ` [RFC PATCH 0/4] API extension for handling sysctl Alexey Gladkov
@ 2022-06-01 13:20           ` Alexey Gladkov
  2022-06-01 19:19             ` Matthew Wilcox
  2022-06-01 13:20           ` [RFC PATCH 2/4] sysctl: ipc: Do not use dynamic memory Alexey Gladkov
                             ` (3 subsequent siblings)
  4 siblings, 1 reply; 38+ messages in thread
From: Alexey Gladkov @ 2022-06-01 13:20 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Andrew Morton, Christian Brauner, Iurii Zaikin, Kees Cook,
	Linux Containers, linux-fsdevel, Luis Chamberlain, Vasily Averin

This adds additional optional functions for handling open, read, and
write operations that can be customized for each sysctl file. It also
creates ctl_context that persists from opening to closing the file in
the /proc/sys.

The context allows us to store dynamic information at the time the file
is opened. This eliminates the need to duplicate ctl_table in order to
dynamically change .data, .extra1 or .extra2.

This API extends the existing one and does not require any changes to
already existing sysctl handlers.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 fs/proc/proc_sysctl.c  | 71 +++++++++++++++++++++++++++++++++++-------
 include/linux/sysctl.h | 20 ++++++++++--
 2 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 7d9cfc730bd4..d3d43e738f01 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -560,6 +560,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	struct ctl_fops *fops = table->ctl_fops;
 	size_t count = iov_iter_count(iter);
 	char *kbuf;
 	ssize_t error;
@@ -577,7 +578,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
 
 	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
 	error = -EINVAL;
-	if (!table->proc_handler)
+	if (!table->proc_handler && !fops)
 		goto out;
 
 	/* don't even try if the size is too large */
@@ -600,8 +601,20 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
 	if (error)
 		goto out_free_buf;
 
-	/* careful: calling conventions are nasty here */
-	error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
+	if (fops) {
+		struct ctl_context *ctx = iocb->ki_filp->private_data;
+
+		if (write && fops->write)
+			error = fops->write(ctx, iocb->ki_filp, kbuf, &count, &iocb->ki_pos);
+		else if (!write && fops->read)
+			error = fops->read(ctx, iocb->ki_filp, kbuf, &count, &iocb->ki_pos);
+		else
+			error = -EINVAL;
+	} else {
+		/* careful: calling conventions are nasty here */
+		error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
+	}
+
 	if (error)
 		goto out_free_buf;
 
@@ -634,17 +647,50 @@ static int proc_sys_open(struct inode *inode, struct file *filp)
 {
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	struct ctl_context *ctx;
+	int ret = 0;
 
 	/* sysctl was unregistered */
 	if (IS_ERR(head))
 		return PTR_ERR(head);
 
-	if (table->poll)
-		filp->private_data = proc_sys_poll_event(table->poll);
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->table = table;
+	filp->private_data = ctx;
+
+	if (table->ctl_fops && table->ctl_fops->open)
+		ret = table->ctl_fops->open(ctx, inode, filp);
+
+	if (!ret && table->poll)
+		ctx->poll_event = proc_sys_poll_event(table->poll);
 
 	sysctl_head_finish(head);
 
-	return 0;
+	return ret;
+}
+
+static int proc_sys_release(struct inode *inode, struct file *filp)
+{
+	struct ctl_table_header *head = grab_header(inode);
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	struct ctl_context *ctx = filp->private_data;
+	int ret = 0;
+
+	if (IS_ERR(head))
+		return PTR_ERR(head);
+
+	if (table->ctl_fops && table->ctl_fops->release)
+		ret = table->ctl_fops->release(ctx, inode, filp);
+
+	sysctl_head_finish(head);
+
+	kfree(ctx);
+	filp->private_data =  NULL;
+
+	return ret;
 }
 
 static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
@@ -653,23 +699,23 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 	__poll_t ret = DEFAULT_POLLMASK;
-	unsigned long event;
+	struct ctl_context *ctx;
 
 	/* sysctl was unregistered */
 	if (IS_ERR(head))
 		return EPOLLERR | EPOLLHUP;
 
-	if (!table->proc_handler)
+	if (!table->proc_handler && !table->ctl_fops)
 		goto out;
 
 	if (!table->poll)
 		goto out;
 
-	event = (unsigned long)filp->private_data;
+	ctx = filp->private_data;
 	poll_wait(filp, &table->poll->wait, wait);
 
-	if (event != atomic_read(&table->poll->event)) {
-		filp->private_data = proc_sys_poll_event(table->poll);
+	if (ctx->poll_event != atomic_read(&table->poll->event)) {
+		ctx->poll_event = proc_sys_poll_event(table->poll);
 		ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
 	}
 
@@ -866,6 +912,7 @@ static int proc_sys_getattr(struct user_namespace *mnt_userns,
 
 static const struct file_operations proc_sys_file_operations = {
 	.open		= proc_sys_open,
+	.release	= proc_sys_release,
 	.poll		= proc_sys_poll,
 	.read_iter	= proc_sys_read,
 	.write_iter	= proc_sys_write,
@@ -1153,7 +1200,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table)
 			else
 				err |= sysctl_check_table_array(path, table);
 		}
-		if (!table->proc_handler)
+		if (!table->proc_handler && !table->ctl_fops)
 			err |= sysctl_err(path, table, "No proc_handler");
 
 		if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 6353d6db69b2..ca5657c9fcb2 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -116,9 +116,9 @@ struct ctl_table_poll {
 	wait_queue_head_t wait;
 };
 
-static inline void *proc_sys_poll_event(struct ctl_table_poll *poll)
+static inline unsigned long proc_sys_poll_event(struct ctl_table_poll *poll)
 {
-	return (void *)(unsigned long)atomic_read(&poll->event);
+	return (unsigned long)atomic_read(&poll->event);
 }
 
 #define __CTL_TABLE_POLL_INITIALIZER(name) {				\
@@ -128,6 +128,21 @@ static inline void *proc_sys_poll_event(struct ctl_table_poll *poll)
 #define DEFINE_CTL_TABLE_POLL(name)					\
 	struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)
 
+struct ctl_context {
+	struct ctl_table *table;
+	unsigned long poll_event;
+	void *ctl_data;
+};
+
+struct inode;
+
+struct ctl_fops {
+	int (*open) (struct ctl_context *, struct inode *, struct file *);
+	int (*release) (struct ctl_context *, struct inode *, struct file *);
+	ssize_t (*read) (struct ctl_context *, struct file *, char *, size_t *, loff_t *);
+	ssize_t (*write) (struct ctl_context *, struct file *, char *, size_t *, loff_t *);
+};
+
 /* A sysctl table is an array of struct ctl_table: */
 struct ctl_table {
 	const char *procname;		/* Text ID for /proc/sys, or zero */
@@ -139,6 +154,7 @@ struct ctl_table {
 	struct ctl_table_poll *poll;
 	void *extra1;
 	void *extra2;
+	struct ctl_fops *ctl_fops;
 } __randomize_layout;
 
 struct ctl_node {
-- 
2.33.3


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [RFC PATCH 2/4] sysctl: ipc: Do not use dynamic memory
  2022-06-01 13:20         ` [RFC PATCH 0/4] API extension for handling sysctl Alexey Gladkov
  2022-06-01 13:20           ` [RFC PATCH 1/4] sysctl: " Alexey Gladkov
@ 2022-06-01 13:20           ` Alexey Gladkov
  2022-06-01 16:45             ` Linus Torvalds
  2022-06-01 13:20           ` [RFC PATCH 3/4] sysctl: userns: " Alexey Gladkov
                             ` (2 subsequent siblings)
  4 siblings, 1 reply; 38+ messages in thread
From: Alexey Gladkov @ 2022-06-01 13:20 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Andrew Morton, Christian Brauner, Iurii Zaikin, Kees Cook,
	Linux Containers, linux-fsdevel, Luis Chamberlain, Vasily Averin

Dynamic memory allocation is needed to modify .data and specify the per
namespace parameter. The new sysctl API is allowed to get rid of the
need for such modification.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 include/linux/ipc_namespace.h |  18 ---
 ipc/ipc_sysctl.c              | 236 +++++++++++++++++-----------------
 ipc/namespace.c               |   4 -
 3 files changed, 121 insertions(+), 137 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e3e8c8662b49..51c2c247c447 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -191,22 +191,4 @@ static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
 }
 
 #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
-
-#ifdef CONFIG_SYSVIPC_SYSCTL
-
-bool setup_ipc_sysctls(struct ipc_namespace *ns);
-void retire_ipc_sysctls(struct ipc_namespace *ns);
-
-#else /* CONFIG_SYSVIPC_SYSCTL */
-
-static inline void retire_ipc_sysctls(struct ipc_namespace *ns)
-{
-}
-
-static inline bool setup_ipc_sysctls(struct ipc_namespace *ns)
-{
-	return true;
-}
-
-#endif /* CONFIG_SYSVIPC_SYSCTL */
 #endif
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index ef313ecfb53a..833b670c38f3 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -68,26 +68,94 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 	return ret;
 }
 
+static inline void *data_from_ns(struct ctl_context *ctx, struct ctl_table *table);
+
+static int ipc_sys_open(struct ctl_context *ctx, struct inode *inode, struct file *file)
+{
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+
+	// For now, we only allow changes in init_user_ns.
+	if (ns->user_ns != &init_user_ns)
+		return -EPERM;
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	int index = (ctx->table - ipc_sysctls);
+
+	switch (index) {
+		case IPC_SYSCTL_SEM_NEXT_ID:
+		case IPC_SYSCTL_MSG_NEXT_ID:
+		case IPC_SYSCTL_SHM_NEXT_ID:
+			if (!checkpoint_restore_ns_capable(ns->user_ns))
+				return -EPERM;
+			break;
+	}
+#endif
+	ctx->ctl_data = ns;
+	return 0;
+}
+
+static ssize_t ipc_sys_read(struct ctl_context *ctx, struct file *file,
+		     char *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table table = *ctx->table;
+	table.data = data_from_ns(ctx, ctx->table);
+	return table.proc_handler(&table, 0, buffer, lenp, ppos);
+}
+
+static ssize_t ipc_sys_write(struct ctl_context *ctx, struct file *file,
+		      char *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table table = *ctx->table;
+	table.data = data_from_ns(ctx, ctx->table);
+	return table.proc_handler(&table, 1, buffer, lenp, ppos);
+}
+
+static struct ctl_fops ipc_sys_fops = {
+	.open	= ipc_sys_open,
+	.read	= ipc_sys_read,
+	.write	= ipc_sys_write,
+};
+
 int ipc_mni = IPCMNI;
 int ipc_mni_shift = IPCMNI_SHIFT;
 int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
 
+enum {
+	IPC_SYSCTL_SHMMAX,
+	IPC_SYSCTL_SHMALL,
+	IPC_SYSCTL_SHMMNI,
+	IPC_SYSCTL_SHM_RMID_FORCED,
+	IPC_SYSCTL_MSGMAX,
+	IPC_SYSCTL_MSGMNI,
+	IPC_SYSCTL_AUTO_MSGMNI,
+	IPC_SYSCTL_MSGMNB,
+	IPC_SYSCTL_SEM,
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	IPC_SYSCTL_SEM_NEXT_ID,
+	IPC_SYSCTL_MSG_NEXT_ID,
+	IPC_SYSCTL_SHM_NEXT_ID,
+#endif
+	IPC_SYSCTL_COUNTS
+};
+
 static struct ctl_table ipc_sysctls[] = {
-	{
+	[IPC_SYSCTL_SHMMAX] = {
 		.procname	= "shmmax",
 		.data		= &init_ipc_ns.shm_ctlmax,
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlmax),
 		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
+		.ctl_fops	= &ipc_sys_fops,
 	},
-	{
+	[IPC_SYSCTL_SHMALL] = {
 		.procname	= "shmall",
 		.data		= &init_ipc_ns.shm_ctlall,
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlall),
 		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
+		.proc_handler   = proc_doulongvec_minmax,
+		.ctl_fops	= &ipc_sys_fops,
 	},
-	{
+	[IPC_SYSCTL_SHMMNI] = {
 		.procname	= "shmmni",
 		.data		= &init_ipc_ns.shm_ctlmni,
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlmni),
@@ -95,8 +163,9 @@ static struct ctl_table ipc_sysctls[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ipc_mni,
+		.ctl_fops	= &ipc_sys_fops,
 	},
-	{
+	[IPC_SYSCTL_SHM_RMID_FORCED] = {
 		.procname	= "shm_rmid_forced",
 		.data		= &init_ipc_ns.shm_rmid_forced,
 		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
@@ -104,8 +173,9 @@ static struct ctl_table ipc_sysctls[] = {
 		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
+		.ctl_fops	= &ipc_sys_fops,
 	},
-	{
+	[IPC_SYSCTL_MSGMAX] = {
 		.procname	= "msgmax",
 		.data		= &init_ipc_ns.msg_ctlmax,
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmax),
@@ -113,8 +183,9 @@ static struct ctl_table ipc_sysctls[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_INT_MAX,
+		.ctl_fops	= &ipc_sys_fops,
 	},
-	{
+	[IPC_SYSCTL_MSGMNI] = {
 		.procname	= "msgmni",
 		.data		= &init_ipc_ns.msg_ctlmni,
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmni),
@@ -122,8 +193,9 @@ static struct ctl_table ipc_sysctls[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ipc_mni,
+		.ctl_fops	= &ipc_sys_fops,
 	},
-	{
+	[IPC_SYSCTL_AUTO_MSGMNI] = {
 		.procname	= "auto_msgmni",
 		.data		= NULL,
 		.maxlen		= sizeof(int),
@@ -131,8 +203,9 @@ static struct ctl_table ipc_sysctls[] = {
 		.proc_handler	= proc_ipc_auto_msgmni,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
+		.ctl_fops	= &ipc_sys_fops,
 	},
-	{
+	[IPC_SYSCTL_MSGMNB] = {
 		.procname	=  "msgmnb",
 		.data		= &init_ipc_ns.msg_ctlmnb,
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmnb),
@@ -140,152 +213,85 @@ static struct ctl_table ipc_sysctls[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_INT_MAX,
+		.ctl_fops	= &ipc_sys_fops,
 	},
-	{
+	[IPC_SYSCTL_SEM] = {
 		.procname	= "sem",
 		.data		= &init_ipc_ns.sem_ctls,
 		.maxlen		= 4*sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_sem_dointvec,
+		.ctl_fops	= &ipc_sys_fops,
 	},
 #ifdef CONFIG_CHECKPOINT_RESTORE
-	{
+	[IPC_SYSCTL_SEM_NEXT_ID] = {
 		.procname	= "sem_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
-		.mode		= 0444,
+		.mode		= 0666,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_INT_MAX,
+		.ctl_fops	= &ipc_sys_fops,
 	},
-	{
+	[IPC_SYSCTL_MSG_NEXT_ID] = {
 		.procname	= "msg_next_id",
 		.data		= &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
-		.mode		= 0444,
+		.mode		= 0666,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_INT_MAX,
+		.ctl_fops	= &ipc_sys_fops,
 	},
-	{
+	[IPC_SYSCTL_SHM_NEXT_ID] = {
 		.procname	= "shm_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
-		.mode		= 0444,
+		.mode		= 0666,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_INT_MAX,
+		.ctl_fops	= &ipc_sys_fops,
 	},
 #endif
-	{}
+	[IPC_SYSCTL_COUNTS] = {}
 };
 
-static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
+static inline void *data_from_ns(struct ctl_context *ctx, struct ctl_table *table)
 {
-	return &current->nsproxy->ipc_ns->ipc_set;
-}
-
-static int set_is_seen(struct ctl_table_set *set)
-{
-	return &current->nsproxy->ipc_ns->ipc_set == set;
-}
-
-static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table)
-{
-	int mode = table->mode;
-
+	struct ipc_namespace *ns = ctx->ctl_data;
+
+	switch (ctx->table - ipc_sysctls) {
+		case IPC_SYSCTL_SHMMAX:			return &ns->shm_ctlmax;
+		case IPC_SYSCTL_SHMALL:			return &ns->shm_ctlall;
+		case IPC_SYSCTL_SHMMNI:			return &ns->shm_ctlmni;
+		case IPC_SYSCTL_SHM_RMID_FORCED:	return &ns->shm_rmid_forced;
+		case IPC_SYSCTL_MSGMAX:			return &ns->msg_ctlmax;
+		case IPC_SYSCTL_MSGMNI:			return &ns->msg_ctlmni;
+		case IPC_SYSCTL_MSGMNB:			return &ns->msg_ctlmnb;
+		case IPC_SYSCTL_SEM:			return &ns->sem_ctls;
 #ifdef CONFIG_CHECKPOINT_RESTORE
-	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
-
-	if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) ||
-	     (table->data == &ns->ids[IPC_MSG_IDS].next_id) ||
-	     (table->data == &ns->ids[IPC_SHM_IDS].next_id)) &&
-	    checkpoint_restore_ns_capable(ns->user_ns))
-		mode = 0666;
+		case IPC_SYSCTL_SEM_NEXT_ID:		return &ns->ids[IPC_SEM_IDS].next_id;
+		case IPC_SYSCTL_MSG_NEXT_ID:		return &ns->ids[IPC_MSG_IDS].next_id;
+		case IPC_SYSCTL_SHM_NEXT_ID:		return &ns->ids[IPC_SHM_IDS].next_id;
 #endif
-	return mode;
-}
-
-static struct ctl_table_root set_root = {
-	.lookup = set_lookup,
-	.permissions = ipc_permissions,
-};
-
-bool setup_ipc_sysctls(struct ipc_namespace *ns)
-{
-	struct ctl_table *tbl;
-
-	setup_sysctl_set(&ns->ipc_set, &set_root, set_is_seen);
-
-	tbl = kmemdup(ipc_sysctls, sizeof(ipc_sysctls), GFP_KERNEL);
-	if (tbl) {
-		int i;
-
-		for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) {
-			if (tbl[i].data == &init_ipc_ns.shm_ctlmax)
-				tbl[i].data = &ns->shm_ctlmax;
-
-			else if (tbl[i].data == &init_ipc_ns.shm_ctlall)
-				tbl[i].data = &ns->shm_ctlall;
-
-			else if (tbl[i].data == &init_ipc_ns.shm_ctlmni)
-				tbl[i].data = &ns->shm_ctlmni;
-
-			else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced)
-				tbl[i].data = &ns->shm_rmid_forced;
-
-			else if (tbl[i].data == &init_ipc_ns.msg_ctlmax)
-				tbl[i].data = &ns->msg_ctlmax;
-
-			else if (tbl[i].data == &init_ipc_ns.msg_ctlmni)
-				tbl[i].data = &ns->msg_ctlmni;
-
-			else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb)
-				tbl[i].data = &ns->msg_ctlmnb;
-
-			else if (tbl[i].data == &init_ipc_ns.sem_ctls)
-				tbl[i].data = &ns->sem_ctls;
-#ifdef CONFIG_CHECKPOINT_RESTORE
-			else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id)
-				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
-
-			else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id)
-				tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id;
-
-			else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id)
-				tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id;
-#endif
-			else
-				tbl[i].data = NULL;
-		}
-
-		ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl);
-	}
-	if (!ns->ipc_sysctls) {
-		kfree(tbl);
-		retire_sysctl_set(&ns->ipc_set);
-		return false;
 	}
-
-	return true;
+	return NULL;
 }
 
-void retire_ipc_sysctls(struct ipc_namespace *ns)
-{
-	struct ctl_table *tbl;
-
-	tbl = ns->ipc_sysctls->ctl_table_arg;
-	unregister_sysctl_table(ns->ipc_sysctls);
-	retire_sysctl_set(&ns->ipc_set);
-	kfree(tbl);
-}
+static struct ctl_table ipc_root_table[] = {
+	{
+		.procname       = "kernel",
+		.mode           = 0555,
+		.child          = ipc_sysctls,
+	},
+	{}
+};
 
 static int __init ipc_sysctl_init(void)
 {
-	if (!setup_ipc_sysctls(&init_ipc_ns)) {
-		pr_warn("ipc sysctl registration failed\n");
-		return -ENOMEM;
-	}
+	register_sysctl_table(ipc_root_table);
 	return 0;
 }
 
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 754f3237194a..f760243ca685 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -63,9 +63,6 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	if (!setup_mq_sysctls(ns))
 		goto fail_put;
 
-	if (!setup_ipc_sysctls(ns))
-		goto fail_put;
-
 	sem_init_ns(ns);
 	msg_init_ns(ns);
 	shm_init_ns(ns);
@@ -133,7 +130,6 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	shm_exit_ns(ns);
 
 	retire_mq_sysctls(ns);
-	retire_ipc_sysctls(ns);
 
 	dec_ipc_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
-- 
2.33.3


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [RFC PATCH 3/4] sysctl: userns: Do not use dynamic memory
  2022-06-01 13:20         ` [RFC PATCH 0/4] API extension for handling sysctl Alexey Gladkov
  2022-06-01 13:20           ` [RFC PATCH 1/4] sysctl: " Alexey Gladkov
  2022-06-01 13:20           ` [RFC PATCH 2/4] sysctl: ipc: Do not use dynamic memory Alexey Gladkov
@ 2022-06-01 13:20           ` Alexey Gladkov
  2022-06-01 13:20           ` [RFC PATCH 4/4] sysctl: mqueue: " Alexey Gladkov
  2022-06-09 16:45           ` [RFC PATCH 0/4] API extension for handling sysctl Luis Chamberlain
  4 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-06-01 13:20 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Andrew Morton, Christian Brauner, Iurii Zaikin, Kees Cook,
	Linux Containers, linux-fsdevel, Luis Chamberlain, Vasily Averin

Dynamic memory allocation is needed to modify .data and specify the
per namespace parameter. The new sysctl API is allowed to get rid of
the need for such modification.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 include/linux/user_namespace.h |   6 --
 kernel/ucount.c                | 116 +++++++++++++--------------------
 kernel/user_namespace.c        |  10 +--
 3 files changed, 46 insertions(+), 86 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09bec02c4..7b134516e5cb 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -95,10 +95,6 @@ struct user_namespace {
 	struct key		*persistent_keyring_register;
 #endif
 	struct work_struct	work;
-#ifdef CONFIG_SYSCTL
-	struct ctl_table_set	set;
-	struct ctl_table_header *sysctls;
-#endif
 	struct ucounts		*ucounts;
 	long ucount_max[UCOUNT_COUNTS];
 	long rlimit_max[UCOUNT_RLIMIT_COUNTS];
@@ -116,8 +112,6 @@ struct ucounts {
 extern struct user_namespace init_user_ns;
 extern struct ucounts init_ucounts;
 
-bool setup_userns_sysctls(struct user_namespace *ns);
-void retire_userns_sysctls(struct user_namespace *ns);
 struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
 void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
 struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
diff --git a/kernel/ucount.c b/kernel/ucount.c
index ee8e57fd6f90..4a5072671847 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -7,6 +7,7 @@
 #include <linux/hash.h>
 #include <linux/kmemleak.h>
 #include <linux/user_namespace.h>
+#include <linux/fs.h>
 
 struct ucounts init_ucounts = {
 	.ns    = &init_user_ns,
@@ -26,38 +27,20 @@ static DEFINE_SPINLOCK(ucounts_lock);
 
 
 #ifdef CONFIG_SYSCTL
-static struct ctl_table_set *
-set_lookup(struct ctl_table_root *root)
-{
-	return &current_user_ns()->set;
-}
-
-static int set_is_seen(struct ctl_table_set *set)
-{
-	return &current_user_ns()->set == set;
-}
-
-static int set_permissions(struct ctl_table_header *head,
-				  struct ctl_table *table)
-{
-	struct user_namespace *user_ns =
-		container_of(head->set, struct user_namespace, set);
-	int mode;
-
-	/* Allow users with CAP_SYS_RESOURCE unrestrained access */
-	if (ns_capable(user_ns, CAP_SYS_RESOURCE))
-		mode = (table->mode & S_IRWXU) >> 6;
-	else
-	/* Allow all others at most read-only access */
-		mode = table->mode & S_IROTH;
-	return (mode << 6) | (mode << 3) | mode;
-}
-
-static struct ctl_table_root set_root = {
-	.lookup = set_lookup,
-	.permissions = set_permissions,
+static int user_sys_open(struct ctl_context *ctx, struct inode *inode,
+		        struct file *file);
+static ssize_t user_sys_read(struct ctl_context *ctx, struct file *file,
+			    char *buffer, size_t *lenp, loff_t *ppos);
+static ssize_t user_sys_write(struct ctl_context *ctx, struct file *file,
+			     char *buffer, size_t *lenp, loff_t *ppos);
+
+static struct ctl_fops user_sys_fops = {
+	.open	= user_sys_open,
+	.read	= user_sys_read,
+	.write	= user_sys_write,
 };
 
+static long ue_dummy = 0;
 static long ue_zero = 0;
 static long ue_int_max = INT_MAX;
 
@@ -66,9 +49,11 @@ static long ue_int_max = INT_MAX;
 		.procname	= name,				\
 		.maxlen		= sizeof(long),			\
 		.mode		= 0644,				\
+		.data		= &ue_dummy,			\
 		.proc_handler	= proc_doulongvec_minmax,	\
 		.extra1		= &ue_zero,			\
 		.extra2		= &ue_int_max,			\
+		.ctl_fops	= &user_sys_fops,		\
 	}
 static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_user_namespaces"),
@@ -89,44 +74,43 @@ static struct ctl_table user_table[] = {
 #endif
 	{ }
 };
-#endif /* CONFIG_SYSCTL */
 
-bool setup_userns_sysctls(struct user_namespace *ns)
+static int user_sys_open(struct ctl_context *ctx, struct inode *inode, struct file *file)
 {
-#ifdef CONFIG_SYSCTL
-	struct ctl_table *tbl;
-
-	BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1);
-	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
-	tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
-	if (tbl) {
-		int i;
-		for (i = 0; i < UCOUNT_COUNTS; i++) {
-			tbl[i].data = &ns->ucount_max[i];
-		}
-		ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
-	}
-	if (!ns->sysctls) {
-		kfree(tbl);
-		retire_sysctl_set(&ns->set);
-		return false;
-	}
-#endif
-	return true;
+	/* Allow users with CAP_SYS_RESOURCE unrestrained access */
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !ns_capable(file->f_cred->user_ns, CAP_SYS_RESOURCE))
+		return -EPERM;
+	return 0;
 }
 
-void retire_userns_sysctls(struct user_namespace *ns)
+static ssize_t user_sys_read(struct ctl_context *ctx, struct file *file,
+		     char *buffer, size_t *lenp, loff_t *ppos)
 {
-#ifdef CONFIG_SYSCTL
-	struct ctl_table *tbl;
+	struct ctl_table table = *ctx->table;
+	table.data = &file->f_cred->user_ns->ucount_max[ctx->table - user_table];
+	return table.proc_handler(&table, 0, buffer, lenp, ppos);
+}
 
-	tbl = ns->sysctls->ctl_table_arg;
-	unregister_sysctl_table(ns->sysctls);
-	retire_sysctl_set(&ns->set);
-	kfree(tbl);
-#endif
+static ssize_t user_sys_write(struct ctl_context *ctx, struct file *file,
+		      char *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table table = *ctx->table;
+	table.data = &file->f_cred->user_ns->ucount_max[ctx->table - user_table];
+	return table.proc_handler(&table, 1, buffer, lenp, ppos);
 }
 
+static struct ctl_table user_root_table[] = {
+	{
+		.procname       = "user",
+		.mode           = 0555,
+		.child          = user_table,
+	},
+	{}
+};
+
+#endif /* CONFIG_SYSCTL */
+
 static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
 {
 	struct ucounts *ucounts;
@@ -357,17 +341,7 @@ bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigne
 static __init int user_namespace_sysctl_init(void)
 {
 #ifdef CONFIG_SYSCTL
-	static struct ctl_table_header *user_header;
-	static struct ctl_table empty[1];
-	/*
-	 * It is necessary to register the user directory in the
-	 * default set so that registrations in the child sets work
-	 * properly.
-	 */
-	user_header = register_sysctl("user", empty);
-	kmemleak_ignore(user_header);
-	BUG_ON(!user_header);
-	BUG_ON(!setup_userns_sysctls(&init_user_ns));
+	register_sysctl_table(user_root_table);
 #endif
 	hlist_add_ucounts(&init_ucounts);
 	inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 981bb2d10d83..c0e707bc9a31 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -149,17 +149,10 @@ int create_user_ns(struct cred *new)
 	INIT_LIST_HEAD(&ns->keyring_name_list);
 	init_rwsem(&ns->keyring_sem);
 #endif
-	ret = -ENOMEM;
-	if (!setup_userns_sysctls(ns))
-		goto fail_keyring;
 
 	set_cred_user_ns(new, ns);
 	return 0;
-fail_keyring:
-#ifdef CONFIG_PERSISTENT_KEYRINGS
-	key_put(ns->persistent_keyring_register);
-#endif
-	ns_free_inum(&ns->ns);
+
 fail_free:
 	kmem_cache_free(user_ns_cachep, ns);
 fail_dec:
@@ -208,7 +201,6 @@ static void free_user_ns(struct work_struct *work)
 			kfree(ns->projid_map.forward);
 			kfree(ns->projid_map.reverse);
 		}
-		retire_userns_sysctls(ns);
 		key_free_user_ns(ns);
 		ns_free_inum(&ns->ns);
 		kmem_cache_free(user_ns_cachep, ns);
-- 
2.33.3


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [RFC PATCH 4/4] sysctl: mqueue: Do not use dynamic memory
  2022-06-01 13:20         ` [RFC PATCH 0/4] API extension for handling sysctl Alexey Gladkov
                             ` (2 preceding siblings ...)
  2022-06-01 13:20           ` [RFC PATCH 3/4] sysctl: userns: " Alexey Gladkov
@ 2022-06-01 13:20           ` Alexey Gladkov
  2022-06-09 16:45           ` [RFC PATCH 0/4] API extension for handling sysctl Luis Chamberlain
  4 siblings, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-06-01 13:20 UTC (permalink / raw)
  To: LKML, Eric W . Biederman, Linus Torvalds
  Cc: Andrew Morton, Christian Brauner, Iurii Zaikin, Kees Cook,
	Linux Containers, linux-fsdevel, Luis Chamberlain, Vasily Averin

Dynamic memory allocation is needed to modify .data and specify the
per namespace parameter. The new sysctl API is allowed to get rid of
the need for such modification.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 include/linux/ipc_namespace.h |  17 -----
 ipc/mq_sysctl.c               | 138 +++++++++++++++++++---------------
 ipc/mqueue.c                  |   5 --
 ipc/namespace.c               |   6 --
 4 files changed, 79 insertions(+), 87 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 51c2c247c447..d20753093a2c 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -174,21 +174,4 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
 }
 #endif
 
-#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
-
-void retire_mq_sysctls(struct ipc_namespace *ns);
-bool setup_mq_sysctls(struct ipc_namespace *ns);
-
-#else /* CONFIG_POSIX_MQUEUE_SYSCTL */
-
-static inline void retire_mq_sysctls(struct ipc_namespace *ns)
-{
-}
-
-static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
-{
-	return true;
-}
-
-#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
 #endif
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index fbf6a8b93a26..08ff7dfb721c 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -13,6 +13,45 @@
 #include <linux/capability.h>
 #include <linux/slab.h>
 
+static inline void *data_from_ns(struct ctl_context *ctx, struct ctl_table *table);
+
+static int mq_sys_open(struct ctl_context *ctx, struct inode *inode, struct file *file)
+{
+	ctx->ctl_data = current->nsproxy->ipc_ns;
+	return 0;
+}
+
+static ssize_t mq_sys_read(struct ctl_context *ctx, struct file *file,
+		     char *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table table = *ctx->table;
+	table.data = data_from_ns(ctx, ctx->table);
+	return table.proc_handler(&table, 0, buffer, lenp, ppos);
+}
+
+static ssize_t mq_sys_write(struct ctl_context *ctx, struct file *file,
+		      char *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table table = *ctx->table;
+	table.data = data_from_ns(ctx, ctx->table);
+	return table.proc_handler(&table, 1, buffer, lenp, ppos);
+}
+
+static struct ctl_fops mq_sys_fops = {
+	.open	= mq_sys_open,
+	.read	= mq_sys_read,
+	.write	= mq_sys_write,
+};
+
+enum {
+	MQ_SYSCTL_QUEUES_MAX,
+	MQ_SYSCTL_MSG_MAX,
+	MQ_SYSCTL_MSGSIZE_MAX,
+	MQ_SYSCTL_MSG_DEFAULT,
+	MQ_SYSCTL_MSGSIZE_DEFAULT,
+	MQ_SYSCTL_COUNTS
+};
+
 static int msg_max_limit_min = MIN_MSGMAX;
 static int msg_max_limit_max = HARD_MSGMAX;
 
@@ -20,14 +59,15 @@ static int msg_maxsize_limit_min = MIN_MSGSIZEMAX;
 static int msg_maxsize_limit_max = HARD_MSGSIZEMAX;
 
 static struct ctl_table mq_sysctls[] = {
-	{
+	[MQ_SYSCTL_QUEUES_MAX] = {
 		.procname	= "queues_max",
 		.data		= &init_ipc_ns.mq_queues_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
+		.ctl_fops	= &mq_sys_fops,
 	},
-	{
+	[MQ_SYSCTL_MSG_MAX] = {
 		.procname	= "msg_max",
 		.data		= &init_ipc_ns.mq_msg_max,
 		.maxlen		= sizeof(int),
@@ -35,8 +75,9 @@ static struct ctl_table mq_sysctls[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
+		.ctl_fops	= &mq_sys_fops,
 	},
-	{
+	[MQ_SYSCTL_MSGSIZE_MAX] = {
 		.procname	= "msgsize_max",
 		.data		= &init_ipc_ns.mq_msgsize_max,
 		.maxlen		= sizeof(int),
@@ -44,8 +85,9 @@ static struct ctl_table mq_sysctls[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
+		.ctl_fops	= &mq_sys_fops,
 	},
-	{
+	[MQ_SYSCTL_MSG_DEFAULT] = {
 		.procname	= "msg_default",
 		.data		= &init_ipc_ns.mq_msg_default,
 		.maxlen		= sizeof(int),
@@ -53,8 +95,9 @@ static struct ctl_table mq_sysctls[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
+		.ctl_fops	= &mq_sys_fops,
 	},
-	{
+	[MQ_SYSCTL_MSGSIZE_DEFAULT] = {
 		.procname	= "msgsize_default",
 		.data		= &init_ipc_ns.mq_msgsize_default,
 		.maxlen		= sizeof(int),
@@ -62,70 +105,47 @@ static struct ctl_table mq_sysctls[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
+		.ctl_fops	= &mq_sys_fops,
 	},
 	{}
 };
 
-static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
+static inline void *data_from_ns(struct ctl_context *ctx, struct ctl_table *table)
 {
-	return &current->nsproxy->ipc_ns->mq_set;
+	struct ipc_namespace *ns = ctx->ctl_data;
+
+	switch (ctx->table - mq_sysctls) {
+		case MQ_SYSCTL_QUEUES_MAX:      return &ns->mq_queues_max;
+		case MQ_SYSCTL_MSG_MAX:         return &ns->mq_msg_max;
+		case MQ_SYSCTL_MSGSIZE_MAX:     return &ns->mq_msgsize_max;
+		case MQ_SYSCTL_MSG_DEFAULT:     return &ns->mq_msg_default;
+		case MQ_SYSCTL_MSGSIZE_DEFAULT: return &ns->mq_msgsize_default;
+	}
+	return NULL;
 }
 
-static int set_is_seen(struct ctl_table_set *set)
-{
-	return &current->nsproxy->ipc_ns->mq_set == set;
-}
+static struct ctl_table mq_sysctl_dir[] = {
+	{
+		.procname       = "mqueue",
+		.mode           = 0555,
+		.child          = mq_sysctls,
+	},
+	{}
+};
 
-static struct ctl_table_root set_root = {
-	.lookup = set_lookup,
+static struct ctl_table mq_sysctl_root[] = {
+	{
+		.procname       = "fs",
+		.mode           = 0555,
+		.child          = mq_sysctl_dir,
+	},
+	{}
 };
 
-bool setup_mq_sysctls(struct ipc_namespace *ns)
+static int __init mq_sysctl_init(void)
 {
-	struct ctl_table *tbl;
-
-	setup_sysctl_set(&ns->mq_set, &set_root, set_is_seen);
-
-	tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
-	if (tbl) {
-		int i;
-
-		for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) {
-			if (tbl[i].data == &init_ipc_ns.mq_queues_max)
-				tbl[i].data = &ns->mq_queues_max;
-
-			else if (tbl[i].data == &init_ipc_ns.mq_msg_max)
-				tbl[i].data = &ns->mq_msg_max;
-
-			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max)
-				tbl[i].data = &ns->mq_msgsize_max;
-
-			else if (tbl[i].data == &init_ipc_ns.mq_msg_default)
-				tbl[i].data = &ns->mq_msg_default;
-
-			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default)
-				tbl[i].data = &ns->mq_msgsize_default;
-			else
-				tbl[i].data = NULL;
-		}
-
-		ns->mq_sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", tbl);
-	}
-	if (!ns->mq_sysctls) {
-		kfree(tbl);
-		retire_sysctl_set(&ns->mq_set);
-		return false;
-	}
-
-	return true;
+	register_sysctl_table(mq_sysctl_root);
+	return 0;
 }
 
-void retire_mq_sysctls(struct ipc_namespace *ns)
-{
-	struct ctl_table *tbl;
-
-	tbl = ns->mq_sysctls->ctl_table_arg;
-	unregister_sysctl_table(ns->mq_sysctls);
-	retire_sysctl_set(&ns->mq_set);
-	kfree(tbl);
-}
+device_initcall(mq_sysctl_init);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c0f24cc9f619..ffb79a24d70b 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1711,11 +1711,6 @@ static int __init init_mqueue_fs(void)
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
-	if (!setup_mq_sysctls(&init_ipc_ns)) {
-		pr_warn("sysctl registration failed\n");
-		return -ENOMEM;
-	}
-
 	error = register_filesystem(&mqueue_fs_type);
 	if (error)
 		goto out_sysctl;
diff --git a/ipc/namespace.c b/ipc/namespace.c
index f760243ca685..ae83f0f2651b 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -59,10 +59,6 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	if (err)
 		goto fail_put;
 
-	err = -ENOMEM;
-	if (!setup_mq_sysctls(ns))
-		goto fail_put;
-
 	sem_init_ns(ns);
 	msg_init_ns(ns);
 	shm_init_ns(ns);
@@ -129,8 +125,6 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	msg_exit_ns(ns);
 	shm_exit_ns(ns);
 
-	retire_mq_sysctls(ns);
-
 	dec_ipc_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	ns_free_inum(&ns->ns);
-- 
2.33.3


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 2/4] sysctl: ipc: Do not use dynamic memory
  2022-06-01 13:20           ` [RFC PATCH 2/4] sysctl: ipc: Do not use dynamic memory Alexey Gladkov
@ 2022-06-01 16:45             ` Linus Torvalds
  2022-06-01 18:24               ` Alexey Gladkov
  0 siblings, 1 reply; 38+ messages in thread
From: Linus Torvalds @ 2022-06-01 16:45 UTC (permalink / raw)
  To: Alexey Gladkov
  Cc: LKML, Eric W . Biederman, Andrew Morton, Christian Brauner,
	Iurii Zaikin, Kees Cook, Linux Containers, linux-fsdevel,
	Luis Chamberlain, Vasily Averin

On Wed, Jun 1, 2022 at 6:20 AM Alexey Gladkov <legion@kernel.org> wrote:
>
> Dynamic memory allocation is needed to modify .data and specify the per
> namespace parameter. The new sysctl API is allowed to get rid of the
> need for such modification.

Ok, this is looking better. That said, a few comments:

>
> diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
> index ef313ecfb53a..833b670c38f3 100644
> --- a/ipc/ipc_sysctl.c
> +++ b/ipc/ipc_sysctl.c
> @@ -68,26 +68,94 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
>         return ret;
>  }
>
> +static inline void *data_from_ns(struct ctl_context *ctx, struct ctl_table *table);
> +
> +static int ipc_sys_open(struct ctl_context *ctx, struct inode *inode, struct file *file)
> +{
> +       struct ipc_namespace *ns = current->nsproxy->ipc_ns;
> +
> +       // For now, we only allow changes in init_user_ns.
> +       if (ns->user_ns != &init_user_ns)
> +               return -EPERM;
> +
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> +       int index = (ctx->table - ipc_sysctls);
> +
> +       switch (index) {
> +               case IPC_SYSCTL_SEM_NEXT_ID:
> +               case IPC_SYSCTL_MSG_NEXT_ID:
[...]

I don't think you actually even compile-tested this, because you're
using these IPC_SYSCTL_SEM_NEXT_ID etc enums before you even declared
them later in the same file.

> +static ssize_t ipc_sys_read(struct ctl_context *ctx, struct file *file,
> +                    char *buffer, size_t *lenp, loff_t *ppos)
> +{
> +       struct ctl_table table = *ctx->table;
> +       table.data = data_from_ns(ctx, ctx->table);
> +       return table.proc_handler(&table, 0, buffer, lenp, ppos);
> +}

Can we please fix the names and the types of this new 'ctx' structure?

Yes, yes, I know the old legacy "sysctl table" is horribly named, and
uses "ctl_table".

But let's just write it out. It's not some random control table for
anything. It's a very odd and specific thing: "sysctl". Let's use the
full name.

Also, Please just make that "ctl_data" member in that "ctl_context"
struct not just have a real name, but a real type. Make it literally
be

    struct ipc_namespace *ipc_ns;

and if we end up with other things wanting other pointers, just add a
new one (or make a union if we care about the size of that allocation,
which I don't see any reason we'd do when it's literally just like a
couple of pointers in size).

There is no reason to have some pseudo-generic "void *ctl_data" that
makes it ambiguous and allows for type confusion and isn't
self-documenting. I'd rather have a properly typed pointer that is
just initialized to NULL and is not always used or needed, but always
has a clear case for *what* it would be used for.

Yes, yes, we have f_private etc for things that are really very very
generic and have arbitrary users. But 'sysctl' is not that kind of
truly generic use.

I wish we didn't have that silly "create a temporary ctl_table entry"
either, and I wish it was properly named. But it's not worth the
pointless churn to fix old bad interfaces. But the new ones should
have better names, and try to avoid those bad old decisions.

But yeah, I think this all is a step in the right direction. And maybe
some of those cases and old 'ctl_table' things can be migrated to just
using individual read() functions entirely. The whole 'ctl_table'
model was broken, and came from the bad old days with an actual
'sysctl()' system call.

Because I think it would be lovely if people would move away from the
'sysctl table' approach entirely for cases where that makes sense, and
these guys that already need special handling are very much in that
situation.

          Linus

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 2/4] sysctl: ipc: Do not use dynamic memory
  2022-06-01 16:45             ` Linus Torvalds
@ 2022-06-01 18:24               ` Alexey Gladkov
  2022-06-01 18:34                 ` Linus Torvalds
  0 siblings, 1 reply; 38+ messages in thread
From: Alexey Gladkov @ 2022-06-01 18:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: LKML, Eric W . Biederman, Andrew Morton, Christian Brauner,
	Iurii Zaikin, Kees Cook, Linux Containers, linux-fsdevel,
	Luis Chamberlain, Vasily Averin

On Wed, Jun 01, 2022 at 09:45:15AM -0700, Linus Torvalds wrote:
> On Wed, Jun 1, 2022 at 6:20 AM Alexey Gladkov <legion@kernel.org> wrote:
> >
> > Dynamic memory allocation is needed to modify .data and specify the per
> > namespace parameter. The new sysctl API is allowed to get rid of the
> > need for such modification.
> 
> Ok, this is looking better. That said, a few comments:
> 
> >
> > diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
> > index ef313ecfb53a..833b670c38f3 100644
> > --- a/ipc/ipc_sysctl.c
> > +++ b/ipc/ipc_sysctl.c
> > @@ -68,26 +68,94 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
> >         return ret;
> >  }
> >
> > +static inline void *data_from_ns(struct ctl_context *ctx, struct ctl_table *table);
> > +
> > +static int ipc_sys_open(struct ctl_context *ctx, struct inode *inode, struct file *file)
> > +{
> > +       struct ipc_namespace *ns = current->nsproxy->ipc_ns;
> > +
> > +       // For now, we only allow changes in init_user_ns.
> > +       if (ns->user_ns != &init_user_ns)
> > +               return -EPERM;
> > +
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
> > +       int index = (ctx->table - ipc_sysctls);
> > +
> > +       switch (index) {
> > +               case IPC_SYSCTL_SEM_NEXT_ID:
> > +               case IPC_SYSCTL_MSG_NEXT_ID

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/bpf/cgroup.c#n1392:
> [...]
> 
> I don't think you actually even compile-tested this, because you're
> using these IPC_SYSCTL_SEM_NEXT_ID etc enums before you even declared
> them later in the same file.

I did it but without CONFIG_CHECKPOINT_RESTORE.

This is where I'm not sure who can write to ipc sysctls inside
ipc_namespace.

> > +static ssize_t ipc_sys_read(struct ctl_context *ctx, struct file *file,
> > +                    char *buffer, size_t *lenp, loff_t *ppos)
> > +{
> > +       struct ctl_table table = *ctx->table;
> > +       table.data = data_from_ns(ctx, ctx->table);
> > +       return table.proc_handler(&table, 0, buffer, lenp, ppos);
> > +}
> 
> Can we please fix the names and the types of this new 'ctx' structure?
> 
> Yes, yes, I know the old legacy "sysctl table" is horribly named, and
> uses "ctl_table".

Sure.

> But let's just write it out. It's not some random control table for
> anything. It's a very odd and specific thing: "sysctl". Let's use the
> full name.
> 
> Also, Please just make that "ctl_data" member in that "ctl_context"
> struct not just have a real name, but a real type. Make it literally
> be
> 
>     struct ipc_namespace *ipc_ns;
> 
> and if we end up with other things wanting other pointers, just add a
> new one (or make a union if we care about the size of that allocation,
> which I don't see any reason we'd do when it's literally just like a
> couple of pointers in size).
> 
> There is no reason to have some pseudo-generic "void *ctl_data" that
> makes it ambiguous and allows for type confusion and isn't
> self-documenting. I'd rather have a properly typed pointer that is
> just initialized to NULL and is not always used or needed, but always
> has a clear case for *what* it would be used for.
> 
> Yes, yes, we have f_private etc for things that are really very very
> generic and have arbitrary users. But 'sysctl' is not that kind of
> truly generic use.

Yep. I made ctl_data in the same way as f_private. My idea is that if
someone needs to store more than one pointer, they can put a struct there.
But it turned out that at least now, apart from ipc_namespace, nothing is
needed.

> I wish we didn't have that silly "create a temporary ctl_table entry"
> either, and I wish it was properly named. But it's not worth the
> pointless churn to fix old bad interfaces. But the new ones should
> have better names, and try to avoid those bad old decisions.

Currently temporary ctl_table is the main strategy for handling sysctl
entries.

Perhaps it will be possible to get rid of this if we add another
get_data() that would return what is currently placed in .data in
ctl_table. I mean make getting .data dynamic.

> But yeah, I think this all is a step in the right direction. And maybe
> some of those cases and old 'ctl_table' things can be migrated to just
> using individual read() functions entirely. The whole 'ctl_table'
> model was broken, and came from the bad old days with an actual
> 'sysctl()' system call.

I'm not sure how to get rid of ctl_table since net sysctls are heavily
dependent on it.

I was wondering if it's possible to get rid of ctl_table but if it's not
possible to rewrite everything to some kind of new magic API, then keeping
two of them would be a nightmare.

Another problem is that ctl_table is being used by __cgroup_bpf_run_filter_sysctl.

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/bpf/cgroup.c#n1392

> Because I think it would be lovely if people would move away from the
> 'sysctl table' approach entirely for cases where that makes sense, and
> these guys that already need special handling are very much in that
> situation.

Since you think that these patches are a step in the right direction, then
I will prepare the first version with your comments in mind.

-- 
Rgrds, legion


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 2/4] sysctl: ipc: Do not use dynamic memory
  2022-06-01 18:24               ` Alexey Gladkov
@ 2022-06-01 18:34                 ` Linus Torvalds
  2022-06-01 19:05                   ` Alexey Gladkov
  2022-06-09 18:51                   ` Luis Chamberlain
  0 siblings, 2 replies; 38+ messages in thread
From: Linus Torvalds @ 2022-06-01 18:34 UTC (permalink / raw)
  To: Alexey Gladkov
  Cc: LKML, Eric W . Biederman, Andrew Morton, Christian Brauner,
	Iurii Zaikin, Kees Cook, Linux Containers, linux-fsdevel,
	Luis Chamberlain, Vasily Averin

On Wed, Jun 1, 2022 at 11:25 AM Alexey Gladkov <legion@kernel.org> wrote:
>
> I'm not sure how to get rid of ctl_table since net sysctls are heavily
> dependent on it.

I don't actually think it's worth getting rid of entirely, because
there's just a lot of simple cases where it "JustWorks(tm)" and having
just that table entry describe all the semantics is not wrong at all.

The name may suck, but hey, it's not a big deal. Changing it now would
be more pain than it's worth.

No, I was more thinking that things that already need more
infrastructure than that simple static ctl_table entry might be better
off trying to migrate to your new "proper read op" model, and having
more of that dynamic behavior in the read op.

The whole "create dynamic ctl_table entries on the fly" model works,
but it's kind of ugly.

Anyway, I think all of this is "I think there is more room for cleanup
in this area", and maybe we'll never have enough motivation to
actually do that.

Your patches seem to fix the extant issue with the ipc namespace, and
the truly disgusting parts (although maybe there are other truly
disgusting things hiding - I didn't go look for them).

                      Linus

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 2/4] sysctl: ipc: Do not use dynamic memory
  2022-06-01 18:34                 ` Linus Torvalds
@ 2022-06-01 19:05                   ` Alexey Gladkov
  2022-06-09 18:51                   ` Luis Chamberlain
  1 sibling, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-06-01 19:05 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: LKML, Eric W . Biederman, Andrew Morton, Christian Brauner,
	Iurii Zaikin, Kees Cook, Linux Containers, linux-fsdevel,
	Luis Chamberlain, Vasily Averin

On Wed, Jun 01, 2022 at 11:34:18AM -0700, Linus Torvalds wrote:
> On Wed, Jun 1, 2022 at 11:25 AM Alexey Gladkov <legion@kernel.org> wrote:
> >
> > I'm not sure how to get rid of ctl_table since net sysctls are heavily
> > dependent on it.
> 
> I don't actually think it's worth getting rid of entirely, because
> there's just a lot of simple cases where it "JustWorks(tm)" and having
> just that table entry describe all the semantics is not wrong at all.
> 
> The name may suck, but hey, it's not a big deal. Changing it now would
> be more pain than it's worth.
> 
> No, I was more thinking that things that already need more
> infrastructure than that simple static ctl_table entry might be better
> off trying to migrate to your new "proper read op" model, and having
> more of that dynamic behavior in the read op.

This was part of my plan. I wanted to step by step try migrating other
sysctls to use open/read/write where it makes sense.

To be honest, it was Eric Biederman who came up with the idea to separate
open, read and write. I am very grateful to him.

> The whole "create dynamic ctl_table entries on the fly" model works,
> but it's kind of ugly.
> 
> Anyway, I think all of this is "I think there is more room for cleanup
> in this area", and maybe we'll never have enough motivation to
> actually do that.
> 
> Your patches seem to fix the extant issue with the ipc namespace, and
> the truly disgusting parts (although maybe there are other truly
> disgusting things hiding - I didn't go look for them).

I also hope to try and fix the f_cred issue.

-- 
Rgrds, legion


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 1/4] sysctl: API extension for handling sysctl
  2022-06-01 13:20           ` [RFC PATCH 1/4] sysctl: " Alexey Gladkov
@ 2022-06-01 19:19             ` Matthew Wilcox
  2022-06-01 19:23               ` Linus Torvalds
  2022-06-01 19:32               ` Alexey Gladkov
  0 siblings, 2 replies; 38+ messages in thread
From: Matthew Wilcox @ 2022-06-01 19:19 UTC (permalink / raw)
  To: Alexey Gladkov
  Cc: LKML, Eric W . Biederman, Linus Torvalds, Andrew Morton,
	Christian Brauner, Iurii Zaikin, Kees Cook, Linux Containers,
	linux-fsdevel, Luis Chamberlain, Vasily Averin

On Wed, Jun 01, 2022 at 03:20:29PM +0200, Alexey Gladkov wrote:
> +struct ctl_fops {
> +	int (*open) (struct ctl_context *, struct inode *, struct file *);
> +	int (*release) (struct ctl_context *, struct inode *, struct file *);
> +	ssize_t (*read) (struct ctl_context *, struct file *, char *, size_t *, loff_t *);
> +	ssize_t (*write) (struct ctl_context *, struct file *, char *, size_t *, loff_t *);
> +};

Why not pass the iocb in ->read and ->write?  We're still regretting not
doing that with file_operations.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 1/4] sysctl: API extension for handling sysctl
  2022-06-01 19:19             ` Matthew Wilcox
@ 2022-06-01 19:23               ` Linus Torvalds
  2022-06-01 19:25                 ` Matthew Wilcox
  2022-06-01 19:32               ` Alexey Gladkov
  1 sibling, 1 reply; 38+ messages in thread
From: Linus Torvalds @ 2022-06-01 19:23 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Alexey Gladkov, LKML, Eric W . Biederman, Andrew Morton,
	Christian Brauner, Iurii Zaikin, Kees Cook, Linux Containers,
	linux-fsdevel, Luis Chamberlain, Vasily Averin

On Wed, Jun 1, 2022 at 12:19 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> Why not pass the iocb in ->read and ->write?  We're still regretting not
> doing that with file_operations.

No, all the actual "io" is done by the caller.

There is no way in hell I want the sysctl callbacks to actually
possibly do user space accesses etc.

They get a kernel buffer that has already been set up. There is no
iocb or iovec left for them.

(That also means that they can take whatever locks they need,
including spinlocks, because there's not going to be any random user
accesses or complex pipe buffer lookups or whatever).

                Linus

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 1/4] sysctl: API extension for handling sysctl
  2022-06-01 19:23               ` Linus Torvalds
@ 2022-06-01 19:25                 ` Matthew Wilcox
  2022-06-01 19:31                   ` Linus Torvalds
  0 siblings, 1 reply; 38+ messages in thread
From: Matthew Wilcox @ 2022-06-01 19:25 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alexey Gladkov, LKML, Eric W . Biederman, Andrew Morton,
	Christian Brauner, Iurii Zaikin, Kees Cook, Linux Containers,
	linux-fsdevel, Luis Chamberlain, Vasily Averin

On Wed, Jun 01, 2022 at 12:23:06PM -0700, Linus Torvalds wrote:
> On Wed, Jun 1, 2022 at 12:19 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > Why not pass the iocb in ->read and ->write?  We're still regretting not
> > doing that with file_operations.
> 
> No, all the actual "io" is done by the caller.
> 
> There is no way in hell I want the sysctl callbacks to actually
> possibly do user space accesses etc.
> 
> They get a kernel buffer that has already been set up. There is no
> iocb or iovec left for them.

I wasn't suggesting the iovec.  Just the iocb, instead of passing in the
ki_filp and the ki_pos.

> (That also means that they can take whatever locks they need,
> including spinlocks, because there's not going to be any random user
> accesses or complex pipe buffer lookups or whatever).
> 
>                 Linus

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 1/4] sysctl: API extension for handling sysctl
  2022-06-01 19:25                 ` Matthew Wilcox
@ 2022-06-01 19:31                   ` Linus Torvalds
  0 siblings, 0 replies; 38+ messages in thread
From: Linus Torvalds @ 2022-06-01 19:31 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Alexey Gladkov, LKML, Eric W . Biederman, Andrew Morton,
	Christian Brauner, Iurii Zaikin, Kees Cook, Linux Containers,
	linux-fsdevel, Luis Chamberlain, Vasily Averin

On Wed, Jun 1, 2022 at 12:25 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> I wasn't suggesting the iovec.  Just the iocb, instead of passing in the
> ki_filp and the ki_pos.

I guess that would work, but would it actually help anything?

I think it's a lot more obvious when it just looks mostly like a
"read/write()" system call (ok, pread/pwrite since you get the pos,
but still).

There is nothing that could possibly be relevant in the kiocb. All
those fields are about odd async or atomicity issues (or "report
completion" issues) that simply cannot possibly be valid for a sysctl
value.

And if they are, that sysctl value is doing something really really
really wrong. So we absolutely don't want to encourage that.

This is not a "do IO" interface. This is a "read or write value" interface.

               Linus

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 1/4] sysctl: API extension for handling sysctl
  2022-06-01 19:19             ` Matthew Wilcox
  2022-06-01 19:23               ` Linus Torvalds
@ 2022-06-01 19:32               ` Alexey Gladkov
  1 sibling, 0 replies; 38+ messages in thread
From: Alexey Gladkov @ 2022-06-01 19:32 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: LKML, Eric W . Biederman, Linus Torvalds, Andrew Morton,
	Christian Brauner, Iurii Zaikin, Kees Cook, Linux Containers,
	linux-fsdevel, Luis Chamberlain, Vasily Averin

On Wed, Jun 01, 2022 at 08:19:14PM +0100, Matthew Wilcox wrote:
> On Wed, Jun 01, 2022 at 03:20:29PM +0200, Alexey Gladkov wrote:
> > +struct ctl_fops {
> > +	int (*open) (struct ctl_context *, struct inode *, struct file *);
> > +	int (*release) (struct ctl_context *, struct inode *, struct file *);
> > +	ssize_t (*read) (struct ctl_context *, struct file *, char *, size_t *, loff_t *);
> > +	ssize_t (*write) (struct ctl_context *, struct file *, char *, size_t *, loff_t *);
> > +};
> 
> Why not pass the iocb in ->read and ->write?  We're still regretting not
> doing that with file_operations.

Because buf and len can be modified in BPF_CGROUP_RUN_PROG_SYSCTL.
We need to pass the result of this hook to read/write callback.

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/proc/proc_sysctl.c#n605
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/bpf/cgroup.c#n1441

-- 
Rgrds, legion


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 0/4] API extension for handling sysctl
  2022-06-01 13:20         ` [RFC PATCH 0/4] API extension for handling sysctl Alexey Gladkov
                             ` (3 preceding siblings ...)
  2022-06-01 13:20           ` [RFC PATCH 4/4] sysctl: mqueue: " Alexey Gladkov
@ 2022-06-09 16:45           ` Luis Chamberlain
  4 siblings, 0 replies; 38+ messages in thread
From: Luis Chamberlain @ 2022-06-09 16:45 UTC (permalink / raw)
  To: Alexey Gladkov
  Cc: LKML, Eric W . Biederman, Linus Torvalds, Andrew Morton,
	Christian Brauner, Iurii Zaikin, Kees Cook, Linux Containers,
	linux-fsdevel, Vasily Averin

On Wed, Jun 01, 2022 at 03:20:28PM +0200, Alexey Gladkov wrote:
> On Fri, Apr 22, 2022 at 01:44:50PM -0700, Linus Torvalds wrote:
> > On Fri, Apr 22, 2022 at 5:53 AM Alexey Gladkov <legion@kernel.org> wrote:
> > >
> > > Yes, Linus, these changes are not the refactoring you were talking
> > > about, but I plan to try to do such a refactoring in the my next
> > > patchset.
> > 
> > Heh. Ok, I'm not saying these patches are pretty, and looking up the
> > namespace thing is a bit subtle, but it's certainly prettier than the
> > existing odd "create a new ctl_table entry because of field abuse".
> 
> As I promised, here is one of the possible options for how to get rid of dynamic
> memory allocation.
> 
> We can slightly extend the API and thus be able to save data at the time the
> file is opened. This will not only eliminate the need to allocate memory, but
> also provide access to file struct and f_cred.
> 
> I made an RFC because I'm not sure that I did the permissions check for
> ipc_sysctl. I also did not change all the places where this API can be applied
> to make the patch smaller. As in the case of /proc/sys/kernel/printk where
> CAP_SYS_ADMIN is checked[1] for the current process at the time of write.

Thanks for all this, can you also add respective selftests extensions
for this on lib/test_sysctl.c and tools/testing/selftests/sysctl/sysctl.sh ?

  Luis

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 2/4] sysctl: ipc: Do not use dynamic memory
  2022-06-01 18:34                 ` Linus Torvalds
  2022-06-01 19:05                   ` Alexey Gladkov
@ 2022-06-09 18:51                   ` Luis Chamberlain
  1 sibling, 0 replies; 38+ messages in thread
From: Luis Chamberlain @ 2022-06-09 18:51 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alexey Gladkov, LKML, Eric W . Biederman, Andrew Morton,
	Christian Brauner, Iurii Zaikin, Kees Cook, Linux Containers,
	linux-fsdevel, Vasily Averin, Muchun Song, Jia He, Pan Xinhui,
	Thomas Huth, ChuckLever

On Wed, Jun 01, 2022 at 11:34:18AM -0700, Linus Torvalds wrote:
> On Wed, Jun 1, 2022 at 11:25 AM Alexey Gladkov <legion@kernel.org> wrote:
> >
> > I'm not sure how to get rid of ctl_table since net sysctls are heavily
> > dependent on it.
> 
> Anyway, I think all of this is "I think there is more room for cleanup
> in this area", and maybe we'll never have enough motivation to
> actually do that.

That's a good summary of the cleanup with sysctls so I appreciate the nudges
in the right directions.

  Luis

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2022-06-09 18:51 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-14 18:18 [PATCH v4 0/2] ipc: Store mq and ipc sysctls in the ipc namespace Alexey Gladkov
2022-02-14 18:18 ` [PATCH v4 1/2] ipc: Store mqueue " Alexey Gladkov
2022-02-14 18:18 ` [PATCH v4 2/2] ipc: Store ipc " Alexey Gladkov
2022-03-23 20:24 ` [GIT PULL] ipc: Bind to the ipc namespace at open time Eric W. Biederman
2022-03-24 18:12   ` Linus Torvalds
2022-03-24 21:48     ` Eric W. Biederman
2022-03-24 22:16       ` Linus Torvalds
2022-03-25 12:10     ` Alexey Gladkov
2022-04-22 12:53     ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
2022-04-22 12:53       ` [PATCH v1 1/4] " Alexey Gladkov
2022-05-02 16:07         ` Eric W. Biederman
2022-04-22 12:53       ` [PATCH v1 2/4] ipc: Use proper " Alexey Gladkov
2022-05-02 16:09         ` Eric W. Biederman
2022-05-03 13:39           ` Alexey Gladkov
2022-05-03 13:39             ` [PATCH v2 1/4] ipc: Use the same namespace to modify and validate Alexey Gladkov
2022-05-03 13:39             ` [PATCH v2 2/4] ipc: Remove extra1 field abuse to pass ipc namespace Alexey Gladkov
2022-05-03 13:39             ` [PATCH v2 3/4] ipc: Check permissions for checkpoint_restart sysctls at open time Alexey Gladkov
2022-05-03 13:39             ` [PATCH v2 4/4] ipc: Remove extra braces Alexey Gladkov
2022-04-22 12:53       ` [PATCH v1 3/4] ipc: Check permissions for checkpoint_restart sysctls at open time Alexey Gladkov
2022-04-22 12:53       ` [PATCH v1 4/4] ipc: Remove extra braces Alexey Gladkov
2022-04-22 20:44       ` [PATCH v1 0/4] ipc: Remove extra1 field abuse to pass ipc namespace Linus Torvalds
2022-05-04  3:42         ` Philip Rhoades
2022-06-01 13:20         ` [RFC PATCH 0/4] API extension for handling sysctl Alexey Gladkov
2022-06-01 13:20           ` [RFC PATCH 1/4] sysctl: " Alexey Gladkov
2022-06-01 19:19             ` Matthew Wilcox
2022-06-01 19:23               ` Linus Torvalds
2022-06-01 19:25                 ` Matthew Wilcox
2022-06-01 19:31                   ` Linus Torvalds
2022-06-01 19:32               ` Alexey Gladkov
2022-06-01 13:20           ` [RFC PATCH 2/4] sysctl: ipc: Do not use dynamic memory Alexey Gladkov
2022-06-01 16:45             ` Linus Torvalds
2022-06-01 18:24               ` Alexey Gladkov
2022-06-01 18:34                 ` Linus Torvalds
2022-06-01 19:05                   ` Alexey Gladkov
2022-06-09 18:51                   ` Luis Chamberlain
2022-06-01 13:20           ` [RFC PATCH 3/4] sysctl: userns: " Alexey Gladkov
2022-06-01 13:20           ` [RFC PATCH 4/4] sysctl: mqueue: " Alexey Gladkov
2022-06-09 16:45           ` [RFC PATCH 0/4] API extension for handling sysctl Luis Chamberlain

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).