All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1] ipc: Store mqueue sysctls in the ipc namespace
@ 2022-01-03 16:04 Alexey Gladkov
  2022-01-03 20:41   ` kernel test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Alexey Gladkov @ 2022-01-03 16:04 UTC (permalink / raw)
  To: LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

Right now, the mqueue sysctls take ipc namespaces into account in a
rather hacky way. This works in most cases, but does not respect the
user namespace.

Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/*
parametres. This poses a problem in the rootless containers.

To solve this I changed the implementation of the mqueue sysctls just
like some other sysctls.

Before this change:

$ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
tee: /proc/sys/fs/mqueue/msg_max: Permission denied
5

After this change:

$ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
5

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 include/linux/ipc_namespace.h |  20 ++---
 ipc/mq_sysctl.c               | 140 +++++++++++++++++++++-------------
 ipc/mqueue.c                  |  10 +--
 ipc/namespace.c               |   6 ++
 4 files changed, 103 insertions(+), 73 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index b75395ec8d52..8ecb965905f5 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -10,6 +10,7 @@
 #include <linux/ns_common.h>
 #include <linux/refcount.h>
 #include <linux/rhashtable-types.h>
+#include <linux/sysctl.h>
 
 struct user_namespace;
 
@@ -63,6 +64,11 @@ struct ipc_namespace {
 	unsigned int    mq_msg_default;
 	unsigned int    mq_msgsize_default;
 
+#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
+	struct ctl_table_set	set;
+	struct ctl_table_header *sysctls;
+#endif
+
 	/* user_ns which owns the ipc ns */
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
@@ -167,17 +173,7 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
 }
 #endif
 
-#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
-
-struct ctl_table_header;
-extern struct ctl_table_header *mq_register_sysctl_table(void);
-
-#else /* CONFIG_POSIX_MQUEUE_SYSCTL */
-
-static inline struct ctl_table_header *mq_register_sysctl_table(void)
-{
-	return NULL;
-}
+void retire_mq_sysctls(struct ipc_namespace *ns);
+bool setup_mq_sysctls(struct ipc_namespace *ns);
 
-#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
 #endif
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 72a92a08c848..f9826700f08c 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -9,39 +9,9 @@
 #include <linux/ipc_namespace.h>
 #include <linux/sysctl.h>
 
-#ifdef CONFIG_PROC_SYSCTL
-static void *get_mq(struct ctl_table *table)
-{
-	char *which = table->data;
-	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
-	which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
-	return which;
-}
-
-static int proc_mq_dointvec(struct ctl_table *table, int write,
-			    void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table mq_table;
-	memcpy(&mq_table, table, sizeof(mq_table));
-	mq_table.data = get_mq(table);
-
-	return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
-}
-
-static int proc_mq_dointvec_minmax(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table mq_table;
-	memcpy(&mq_table, table, sizeof(mq_table));
-	mq_table.data = get_mq(table);
-
-	return proc_dointvec_minmax(&mq_table, write, buffer,
-					lenp, ppos);
-}
-#else
-#define proc_mq_dointvec NULL
-#define proc_mq_dointvec_minmax NULL
-#endif
+#include <linux/stat.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
 
 static int msg_max_limit_min = MIN_MSGMAX;
 static int msg_max_limit_max = HARD_MSGMAX;
@@ -55,14 +25,14 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_queues_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "msg_max",
 		.data		= &init_ipc_ns.mq_msg_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
 	},
@@ -71,7 +41,7 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msgsize_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
 	},
@@ -80,7 +50,7 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msg_default,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
 	},
@@ -89,32 +59,92 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msgsize_default,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
 	},
 	{}
 };
 
-static struct ctl_table mq_sysctl_dir[] = {
-	{
-		.procname	= "mqueue",
-		.mode		= 0555,
-		.child		= mq_sysctls,
-	},
-	{}
-};
+static struct ctl_table_set *
+set_lookup(struct ctl_table_root *root)
+{
+	return &current->nsproxy->ipc_ns->set;
+}
 
-static struct ctl_table mq_sysctl_root[] = {
-	{
-		.procname	= "fs",
-		.mode		= 0555,
-		.child		= mq_sysctl_dir,
-	},
-	{}
+static int set_is_seen(struct ctl_table_set *set)
+{
+	return &current->nsproxy->ipc_ns->set == set;
+}
+
+static int set_permissions(struct ctl_table_header *head, struct ctl_table *table)
+{
+	struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, set);
+	int mode;
+
+	/* Allow users with CAP_SYS_RESOURCE unrestrained access */
+	if (ns_capable(ns->user_ns, CAP_SYS_RESOURCE))
+		mode = (table->mode & S_IRWXU) >> 6;
+	else
+	/* Allow all others at most read-only access */
+		mode = table->mode & S_IROTH;
+	return (mode << 6) | (mode << 3) | mode;
+}
+
+static struct ctl_table_root set_root = {
+	.lookup = set_lookup,
+	.permissions = set_permissions,
 };
 
-struct ctl_table_header *mq_register_sysctl_table(void)
+bool setup_mq_sysctls(struct ipc_namespace *ns)
+{
+#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
+	struct ctl_table *tbl;
+
+	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
+
+	tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
+	if (tbl) {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) {
+			if (tbl[i].data == &init_ipc_ns.mq_queues_max)
+				tbl[i].data = &ns->mq_queues_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msg_max)
+				tbl[i].data = &ns->mq_msg_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max)
+				tbl[i].data = &ns->mq_msgsize_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msg_default)
+				tbl[i].data = &ns->mq_msg_default;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default)
+				tbl[i].data = &ns->mq_msgsize_default;
+			else
+				tbl[i].data = NULL;
+		}
+
+		ns->sysctls = __register_sysctl_table(&ns->set, "fs/mqueue", tbl);
+	}
+	if (!ns->sysctls) {
+		kfree(tbl);
+		retire_sysctl_set(&ns->set);
+		return false;
+	}
+#endif
+	return true;
+}
+
+void retire_mq_sysctls(struct ipc_namespace *ns)
 {
-	return register_sysctl_table(mq_sysctl_root);
+#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
+	struct ctl_table *tbl;
+
+	tbl = ns->sysctls->ctl_table_arg;
+	unregister_sysctl_table(ns->sysctls);
+	retire_sysctl_set(&ns->set);
+	kfree(tbl);
+#endif
 }
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5becca9be867..1b4a3be71636 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -163,8 +163,6 @@ static void remove_notification(struct mqueue_inode_info *info);
 
 static struct kmem_cache *mqueue_inode_cachep;
 
-static struct ctl_table_header *mq_sysctl_table;
-
 static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
 {
 	return container_of(inode, struct mqueue_inode_info, vfs_inode);
@@ -1713,8 +1711,10 @@ static int __init init_mqueue_fs(void)
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
-	/* ignore failures - they are not fatal */
-	mq_sysctl_table = mq_register_sysctl_table();
+	if (!setup_mq_sysctls(&init_ipc_ns)) {
+		pr_warn("sysctl registration failed\n");
+		return -ENOMEM;
+	}
 
 	error = register_filesystem(&mqueue_fs_type);
 	if (error)
@@ -1731,8 +1731,6 @@ static int __init init_mqueue_fs(void)
 out_filesystem:
 	unregister_filesystem(&mqueue_fs_type);
 out_sysctl:
-	if (mq_sysctl_table)
-		unregister_sysctl_table(mq_sysctl_table);
 	kmem_cache_destroy(mqueue_inode_cachep);
 	return error;
 }
diff --git a/ipc/namespace.c b/ipc/namespace.c
index ae83f0f2651b..f760243ca685 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -59,6 +59,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	if (err)
 		goto fail_put;
 
+	err = -ENOMEM;
+	if (!setup_mq_sysctls(ns))
+		goto fail_put;
+
 	sem_init_ns(ns);
 	msg_init_ns(ns);
 	shm_init_ns(ns);
@@ -125,6 +129,8 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	msg_exit_ns(ns);
 	shm_exit_ns(ns);
 
+	retire_mq_sysctls(ns);
+
 	dec_ipc_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	ns_free_inum(&ns->ns);
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] ipc: Store mqueue sysctls in the ipc namespace
  2022-01-03 16:04 [PATCH v1] ipc: Store mqueue sysctls in the ipc namespace Alexey Gladkov
@ 2022-01-03 20:41   ` kernel test robot
  2022-01-04  5:28   ` kernel test robot
  2022-01-04 11:51 ` [PATCH v2] " Alexey Gladkov
  2 siblings, 0 replies; 18+ messages in thread
From: kernel test robot @ 2022-01-03 20:41 UTC (permalink / raw)
  To: Alexey Gladkov, LKML, Linux Containers
  Cc: kbuild-all, Alexander Mikhalitsyn, Andrew Morton,
	Linux Memory Management List, Christian Brauner, Daniel Walsh,
	Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai

Hi Alexey,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on hnaz-mm/master linus/master v5.16-rc8 next-20211224]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Alexey-Gladkov/ipc-Store-mqueue-sysctls-in-the-ipc-namespace/20220104-000523
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 136057256686de39cc3a07c2e39ef6bc43003ff6
config: arm64-buildonly-randconfig-r005-20220103 (https://download.01.org/0day-ci/archive/20220104/202201040410.Hhzhq6t0-lkp@intel.com/config)
compiler: aarch64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/637324916f39ec562ac383bfbc22ee9fcbfcb1c0
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Alexey-Gladkov/ipc-Store-mqueue-sysctls-in-the-ipc-namespace/20220104-000523
        git checkout 637324916f39ec562ac383bfbc22ee9fcbfcb1c0
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=arm64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   aarch64-linux-ld: Unexpected GOT/PLT entries detected!
   aarch64-linux-ld: Unexpected run-time procedure linkages detected!
   aarch64-linux-ld: ID map text too big or misaligned
   aarch64-linux-ld: ipc/mqueue.o: in function `init_mqueue_fs':
>> mqueue.c:(.init.text+0x64): undefined reference to `setup_mq_sysctls'
   aarch64-linux-ld: ipc/namespace.o: in function `free_ipc':
>> namespace.c:(.text+0xb0): undefined reference to `retire_mq_sysctls'
   aarch64-linux-ld: ipc/namespace.o: in function `copy_ipcs':
>> namespace.c:(.text+0x4d8): undefined reference to `setup_mq_sysctls'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] ipc: Store mqueue sysctls in the ipc namespace
@ 2022-01-03 20:41   ` kernel test robot
  0 siblings, 0 replies; 18+ messages in thread
From: kernel test robot @ 2022-01-03 20:41 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2394 bytes --]

Hi Alexey,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on hnaz-mm/master linus/master v5.16-rc8 next-20211224]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Alexey-Gladkov/ipc-Store-mqueue-sysctls-in-the-ipc-namespace/20220104-000523
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 136057256686de39cc3a07c2e39ef6bc43003ff6
config: arm64-buildonly-randconfig-r005-20220103 (https://download.01.org/0day-ci/archive/20220104/202201040410.Hhzhq6t0-lkp(a)intel.com/config)
compiler: aarch64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/637324916f39ec562ac383bfbc22ee9fcbfcb1c0
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Alexey-Gladkov/ipc-Store-mqueue-sysctls-in-the-ipc-namespace/20220104-000523
        git checkout 637324916f39ec562ac383bfbc22ee9fcbfcb1c0
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=arm64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   aarch64-linux-ld: Unexpected GOT/PLT entries detected!
   aarch64-linux-ld: Unexpected run-time procedure linkages detected!
   aarch64-linux-ld: ID map text too big or misaligned
   aarch64-linux-ld: ipc/mqueue.o: in function `init_mqueue_fs':
>> mqueue.c:(.init.text+0x64): undefined reference to `setup_mq_sysctls'
   aarch64-linux-ld: ipc/namespace.o: in function `free_ipc':
>> namespace.c:(.text+0xb0): undefined reference to `retire_mq_sysctls'
   aarch64-linux-ld: ipc/namespace.o: in function `copy_ipcs':
>> namespace.c:(.text+0x4d8): undefined reference to `setup_mq_sysctls'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] ipc: Store mqueue sysctls in the ipc namespace
  2022-01-03 16:04 [PATCH v1] ipc: Store mqueue sysctls in the ipc namespace Alexey Gladkov
@ 2022-01-04  5:28   ` kernel test robot
  2022-01-04  5:28   ` kernel test robot
  2022-01-04 11:51 ` [PATCH v2] " Alexey Gladkov
  2 siblings, 0 replies; 18+ messages in thread
From: kernel test robot @ 2022-01-04  5:28 UTC (permalink / raw)
  To: Alexey Gladkov, LKML, Linux Containers
  Cc: kbuild-all, Alexander Mikhalitsyn, Andrew Morton,
	Linux Memory Management List, Christian Brauner, Daniel Walsh,
	Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai

Hi Alexey,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on hnaz-mm/master linus/master v5.16-rc8 next-20211224]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Alexey-Gladkov/ipc-Store-mqueue-sysctls-in-the-ipc-namespace/20220104-000523
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 136057256686de39cc3a07c2e39ef6bc43003ff6
config: microblaze-buildonly-randconfig-r003-20220103 (https://download.01.org/0day-ci/archive/20220104/202201041311.fGbv03Y7-lkp@intel.com/config)
compiler: microblaze-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/637324916f39ec562ac383bfbc22ee9fcbfcb1c0
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Alexey-Gladkov/ipc-Store-mqueue-sysctls-in-the-ipc-namespace/20220104-000523
        git checkout 637324916f39ec562ac383bfbc22ee9fcbfcb1c0
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=microblaze SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   microblaze-linux-ld: ipc/mqueue.o: in function `init_mqueue_fs':
>> (.init.text+0x4c): undefined reference to `setup_mq_sysctls'
   microblaze-linux-ld: ipc/namespace.o: in function `free_ipc':
>> (.text+0x294): undefined reference to `retire_mq_sysctls'
   microblaze-linux-ld: ipc/namespace.o: in function `copy_ipcs':
>> (.text+0x1114): undefined reference to `setup_mq_sysctls'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] ipc: Store mqueue sysctls in the ipc namespace
@ 2022-01-04  5:28   ` kernel test robot
  0 siblings, 0 replies; 18+ messages in thread
From: kernel test robot @ 2022-01-04  5:28 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2199 bytes --]

Hi Alexey,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on hnaz-mm/master linus/master v5.16-rc8 next-20211224]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Alexey-Gladkov/ipc-Store-mqueue-sysctls-in-the-ipc-namespace/20220104-000523
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 136057256686de39cc3a07c2e39ef6bc43003ff6
config: microblaze-buildonly-randconfig-r003-20220103 (https://download.01.org/0day-ci/archive/20220104/202201041311.fGbv03Y7-lkp(a)intel.com/config)
compiler: microblaze-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/637324916f39ec562ac383bfbc22ee9fcbfcb1c0
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Alexey-Gladkov/ipc-Store-mqueue-sysctls-in-the-ipc-namespace/20220104-000523
        git checkout 637324916f39ec562ac383bfbc22ee9fcbfcb1c0
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=microblaze SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   microblaze-linux-ld: ipc/mqueue.o: in function `init_mqueue_fs':
>> (.init.text+0x4c): undefined reference to `setup_mq_sysctls'
   microblaze-linux-ld: ipc/namespace.o: in function `free_ipc':
>> (.text+0x294): undefined reference to `retire_mq_sysctls'
   microblaze-linux-ld: ipc/namespace.o: in function `copy_ipcs':
>> (.text+0x1114): undefined reference to `setup_mq_sysctls'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v2] ipc: Store mqueue sysctls in the ipc namespace
  2022-01-03 16:04 [PATCH v1] ipc: Store mqueue sysctls in the ipc namespace Alexey Gladkov
  2022-01-03 20:41   ` kernel test robot
  2022-01-04  5:28   ` kernel test robot
@ 2022-01-04 11:51 ` Alexey Gladkov
  2022-01-04 18:13   ` Manfred Spraul
  2022-01-10 16:26   ` Eric W. Biederman
  2 siblings, 2 replies; 18+ messages in thread
From: Alexey Gladkov @ 2022-01-04 11:51 UTC (permalink / raw)
  To: LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin,
	kernel test robot

Right now, the mqueue sysctls take ipc namespaces into account in a
rather hacky way. This works in most cases, but does not respect the
user namespace.

Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/*
parametres. This poses a problem in the rootless containers.

To solve this I changed the implementation of the mqueue sysctls just
like some other sysctls.

Before this change:

$ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
tee: /proc/sys/fs/mqueue/msg_max: Permission denied
5

After this change:

$ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
5

v2:
* Fixed compilation problem if CONFIG_POSIX_MQUEUE_SYSCTL is not
  specified.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 include/linux/ipc_namespace.h |  18 ++++-
 ipc/mq_sysctl.c               | 137 ++++++++++++++++++++--------------
 ipc/mqueue.c                  |  10 +--
 ipc/namespace.c               |   6 ++
 4 files changed, 106 insertions(+), 65 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index b75395ec8d52..bcedc86a6f1d 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -10,6 +10,7 @@
 #include <linux/ns_common.h>
 #include <linux/refcount.h>
 #include <linux/rhashtable-types.h>
+#include <linux/sysctl.h>
 
 struct user_namespace;
 
@@ -63,6 +64,11 @@ struct ipc_namespace {
 	unsigned int    mq_msg_default;
 	unsigned int    mq_msgsize_default;
 
+#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
+	struct ctl_table_set	set;
+	struct ctl_table_header *sysctls;
+#endif
+
 	/* user_ns which owns the ipc ns */
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
@@ -169,14 +175,18 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
 
 #ifdef CONFIG_POSIX_MQUEUE_SYSCTL
 
-struct ctl_table_header;
-extern struct ctl_table_header *mq_register_sysctl_table(void);
+void retire_mq_sysctls(struct ipc_namespace *ns);
+bool setup_mq_sysctls(struct ipc_namespace *ns);
 
 #else /* CONFIG_POSIX_MQUEUE_SYSCTL */
 
-static inline struct ctl_table_header *mq_register_sysctl_table(void)
+static inline void retire_mq_sysctls(struct ipc_namespace *ns)
 {
-	return NULL;
+}
+
+static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
+{
+	return true;
 }
 
 #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 72a92a08c848..56afb0503296 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -9,39 +9,9 @@
 #include <linux/ipc_namespace.h>
 #include <linux/sysctl.h>
 
-#ifdef CONFIG_PROC_SYSCTL
-static void *get_mq(struct ctl_table *table)
-{
-	char *which = table->data;
-	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
-	which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
-	return which;
-}
-
-static int proc_mq_dointvec(struct ctl_table *table, int write,
-			    void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table mq_table;
-	memcpy(&mq_table, table, sizeof(mq_table));
-	mq_table.data = get_mq(table);
-
-	return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
-}
-
-static int proc_mq_dointvec_minmax(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table mq_table;
-	memcpy(&mq_table, table, sizeof(mq_table));
-	mq_table.data = get_mq(table);
-
-	return proc_dointvec_minmax(&mq_table, write, buffer,
-					lenp, ppos);
-}
-#else
-#define proc_mq_dointvec NULL
-#define proc_mq_dointvec_minmax NULL
-#endif
+#include <linux/stat.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
 
 static int msg_max_limit_min = MIN_MSGMAX;
 static int msg_max_limit_max = HARD_MSGMAX;
@@ -55,14 +25,14 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_queues_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "msg_max",
 		.data		= &init_ipc_ns.mq_msg_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
 	},
@@ -71,7 +41,7 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msgsize_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
 	},
@@ -80,7 +50,7 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msg_default,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
 	},
@@ -89,32 +59,89 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msgsize_default,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
 	},
 	{}
 };
 
-static struct ctl_table mq_sysctl_dir[] = {
-	{
-		.procname	= "mqueue",
-		.mode		= 0555,
-		.child		= mq_sysctls,
-	},
-	{}
-};
+static struct ctl_table_set *
+set_lookup(struct ctl_table_root *root)
+{
+	return &current->nsproxy->ipc_ns->set;
+}
 
-static struct ctl_table mq_sysctl_root[] = {
-	{
-		.procname	= "fs",
-		.mode		= 0555,
-		.child		= mq_sysctl_dir,
-	},
-	{}
+static int set_is_seen(struct ctl_table_set *set)
+{
+	return &current->nsproxy->ipc_ns->set == set;
+}
+
+static int set_permissions(struct ctl_table_header *head, struct ctl_table *table)
+{
+	struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, set);
+	int mode;
+
+	/* Allow users with CAP_SYS_RESOURCE unrestrained access */
+	if (ns_capable(ns->user_ns, CAP_SYS_RESOURCE))
+		mode = (table->mode & S_IRWXU) >> 6;
+	else
+	/* Allow all others at most read-only access */
+		mode = table->mode & S_IROTH;
+	return (mode << 6) | (mode << 3) | mode;
+}
+
+static struct ctl_table_root set_root = {
+	.lookup = set_lookup,
+	.permissions = set_permissions,
 };
 
-struct ctl_table_header *mq_register_sysctl_table(void)
+bool setup_mq_sysctls(struct ipc_namespace *ns)
 {
-	return register_sysctl_table(mq_sysctl_root);
+	struct ctl_table *tbl;
+
+	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
+
+	tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
+	if (tbl) {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) {
+			if (tbl[i].data == &init_ipc_ns.mq_queues_max)
+				tbl[i].data = &ns->mq_queues_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msg_max)
+				tbl[i].data = &ns->mq_msg_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max)
+				tbl[i].data = &ns->mq_msgsize_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msg_default)
+				tbl[i].data = &ns->mq_msg_default;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default)
+				tbl[i].data = &ns->mq_msgsize_default;
+			else
+				tbl[i].data = NULL;
+		}
+
+		ns->sysctls = __register_sysctl_table(&ns->set, "fs/mqueue", tbl);
+	}
+	if (!ns->sysctls) {
+		kfree(tbl);
+		retire_sysctl_set(&ns->set);
+		return false;
+	}
+
+	return true;
+}
+
+void retire_mq_sysctls(struct ipc_namespace *ns)
+{
+	struct ctl_table *tbl;
+
+	tbl = ns->sysctls->ctl_table_arg;
+	unregister_sysctl_table(ns->sysctls);
+	retire_sysctl_set(&ns->set);
+	kfree(tbl);
 }
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5becca9be867..1b4a3be71636 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -163,8 +163,6 @@ static void remove_notification(struct mqueue_inode_info *info);
 
 static struct kmem_cache *mqueue_inode_cachep;
 
-static struct ctl_table_header *mq_sysctl_table;
-
 static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
 {
 	return container_of(inode, struct mqueue_inode_info, vfs_inode);
@@ -1713,8 +1711,10 @@ static int __init init_mqueue_fs(void)
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
-	/* ignore failures - they are not fatal */
-	mq_sysctl_table = mq_register_sysctl_table();
+	if (!setup_mq_sysctls(&init_ipc_ns)) {
+		pr_warn("sysctl registration failed\n");
+		return -ENOMEM;
+	}
 
 	error = register_filesystem(&mqueue_fs_type);
 	if (error)
@@ -1731,8 +1731,6 @@ static int __init init_mqueue_fs(void)
 out_filesystem:
 	unregister_filesystem(&mqueue_fs_type);
 out_sysctl:
-	if (mq_sysctl_table)
-		unregister_sysctl_table(mq_sysctl_table);
 	kmem_cache_destroy(mqueue_inode_cachep);
 	return error;
 }
diff --git a/ipc/namespace.c b/ipc/namespace.c
index ae83f0f2651b..f760243ca685 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -59,6 +59,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	if (err)
 		goto fail_put;
 
+	err = -ENOMEM;
+	if (!setup_mq_sysctls(ns))
+		goto fail_put;
+
 	sem_init_ns(ns);
 	msg_init_ns(ns);
 	shm_init_ns(ns);
@@ -125,6 +129,8 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	msg_exit_ns(ns);
 	shm_exit_ns(ns);
 
+	retire_mq_sysctls(ns);
+
 	dec_ipc_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	ns_free_inum(&ns->ns);
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ipc: Store mqueue sysctls in the ipc namespace
  2022-01-04 11:51 ` [PATCH v2] " Alexey Gladkov
@ 2022-01-04 18:13   ` Manfred Spraul
  2022-01-04 18:42     ` Eric W. Biederman
  2022-01-10 16:26   ` Eric W. Biederman
  1 sibling, 1 reply; 18+ messages in thread
From: Manfred Spraul @ 2022-01-04 18:13 UTC (permalink / raw)
  To: Alexey Gladkov, LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Serge Hallyn, Varad Gautam, Vasily Averin, kernel test robot

Hi Alexey,

On 1/4/22 12:51, Alexey Gladkov wrote:
> Right now, the mqueue sysctls take ipc namespaces into account in a
> rather hacky way. This works in most cases, but does not respect the
> user namespace.
>
> Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/*
> parametres. This poses a problem in the rootless containers.
>
> To solve this I changed the implementation of the mqueue sysctls just
> like some other sysctls.
>
> Before this change:
>
> $ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
> tee: /proc/sys/fs/mqueue/msg_max: Permission denied
> 5

Could you crosscheck that all (relevant) allocations in ipc/mqueue.c use 
GFP_KERNEL_ACCOUNT?

We should not allow normal users to use up all memory.

Otherwise:
The idea is good, the limits do not really prevent using up all memory, 
_ACCOUNT is the better approach.
And with _ACCOUNT, it doesn't hurt that the namespace root is able to 
set limits.


--

     Manfred


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ipc: Store mqueue sysctls in the ipc namespace
  2022-01-04 18:13   ` Manfred Spraul
@ 2022-01-04 18:42     ` Eric W. Biederman
  2022-01-04 20:48       ` Manfred Spraul
  0 siblings, 1 reply; 18+ messages in thread
From: Eric W. Biederman @ 2022-01-04 18:42 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: Alexey Gladkov, LKML, Linux Containers, Alexander Mikhalitsyn,
	Andrew Morton, Christian Brauner, Daniel Walsh, Davidlohr Bueso,
	Kirill Tkhai, Serge Hallyn, Varad Gautam, Vasily Averin,
	kernel test robot


Manfred Spraul <manfred@colorfullife.com> writes:
>
 Hi Alexey,
>
> On 1/4/22 12:51, Alexey Gladkov wrote:
>> Right now, the mqueue sysctls take ipc namespaces into account in a
>> rather hacky way. This works in most cases, but does not respect the
>> user namespace.
>>
>> Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/*
>> parametres. This poses a problem in the rootless containers.
>>
>> To solve this I changed the implementation of the mqueue sysctls just
>> like some other sysctls.
>>
>> Before this change:
>>
>> $ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
>> tee: /proc/sys/fs/mqueue/msg_max: Permission denied
>> 5
>
> Could you crosscheck that all (relevant) allocations in ipc/mqueue.c
> use GFP_KERNEL_ACCOUNT?

They are not.

> We should not allow normal users to use up all memory.
>
> Otherwise:
> The idea is good, the limits do not really prevent using up all
> memory, _ACCOUNT is the better approach.
> And with _ACCOUNT, it doesn't hurt that the namespace root is able to
> set limits.

Saying the cgroup kernel memory limit is the only thing that works, and
that is always better is silly.


First the cgroup kernel memory limits noted with ACCOUNT are not
acceptable on several kernel hot paths because they are so expensive.

Further the memory cgroup kernel memory limit is not always delegated to
non-root users, which precludes using the memory cgroup kernel memory
limit in many situations.


The RLIMIT_MQUEUE limit definitely works, and as I read the kernel
source correct it defaults to MQ_BYTES_MAX aka 819200.  A limit of
800KiB should prevent using up all of system memory, except on very low
memory machines.


So please let's not confuse apples and oranges, and let's use the tools
in the kernel where they work, and not set them up in contest with each
other.

Rlimits with generous but real limits in general are good at catching
when a program misbehaves.  The cgroups are better at setting a total
memory cap.  In this case the rlimit cap is low enough it simply should
not matter.

What has been fixed with the ucount rlimits is that (baring
implementation bugs) it is now not possible to create a user namespace
and escape your rlimits by using multiple users.

Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ipc: Store mqueue sysctls in the ipc namespace
  2022-01-04 18:42     ` Eric W. Biederman
@ 2022-01-04 20:48       ` Manfred Spraul
  2022-01-04 22:17         ` Eric W. Biederman
  0 siblings, 1 reply; 18+ messages in thread
From: Manfred Spraul @ 2022-01-04 20:48 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alexey Gladkov, LKML, Linux Containers, Alexander Mikhalitsyn,
	Andrew Morton, Christian Brauner, Daniel Walsh, Davidlohr Bueso,
	Kirill Tkhai, Serge Hallyn, Varad Gautam, Vasily Averin,
	kernel test robot

Hello Eric,

On 1/4/22 19:42, Eric W. Biederman wrote:
> Manfred Spraul <manfred@colorfullife.com> writes:
>   Hi Alexey,
>> On 1/4/22 12:51, Alexey Gladkov wrote:
>>> Right now, the mqueue sysctls take ipc namespaces into account in a
>>> rather hacky way. This works in most cases, but does not respect the
>>> user namespace.
>>>
>>> Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/*
>>> parametres. This poses a problem in the rootless containers.
>>>
>>> To solve this I changed the implementation of the mqueue sysctls just
>>> like some other sysctls.
>>>
>>> Before this change:
>>>
>>> $ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
>>> tee: /proc/sys/fs/mqueue/msg_max: Permission denied
>>> 5
>> Could you crosscheck that all (relevant) allocations in ipc/mqueue.c
>> use GFP_KERNEL_ACCOUNT?
> They are not.
>
>> We should not allow normal users to use up all memory.
>>
>> Otherwise:
>> The idea is good, the limits do not really prevent using up all
>> memory, _ACCOUNT is the better approach.
>> And with _ACCOUNT, it doesn't hurt that the namespace root is able to
>> set limits.
> Saying the cgroup kernel memory limit is the only thing that works, and
> that is always better is silly.
>
>
> First the cgroup kernel memory limits noted with ACCOUNT are not
> acceptable on several kernel hot paths because they are so expensive.

I was not aware that ACCOUNT allocations are very expensive.

OTHO adding ACCOUNT resolved various out of memory crashes for IIRC 
ipc/sem.c and/or ipc/msg.c. But we also do not have an RLIMIT for 
ipc/sem.c or ipc/msg.c

Let me rephrase my question:

When we allow non-root users to write to /proc/sys/fs/mqueue/msg_max, 
are there any _relevant_ allocations that bypass _all_ limits?

As you write, we have RLIMIT_MSGQUEUE.

And several allocations for ipc/mqueue already use ACCOUNT:

- the messages themselves, via load_msg()/alloc_msg().

- the inodes, via mqueue_inode_cachep().


> Further the memory cgroup kernel memory limit is not always delegated to
> non-root users, which precludes using the memory cgroup kernel memory
> limit in many situations.
>
>
> The RLIMIT_MQUEUE limit definitely works, and as I read the kernel
> source correct it defaults to MQ_BYTES_MAX aka 819200.  A limit of
> 800KiB should prevent using up all of system memory, except on very low
> memory machines.

I'd agree that 800 kB is not relevant. But we need to be certain that 
there are no loopholes.

I do not see anything relevant, e.g. 0-byte messages should be covered 
by mq_maxmsg. But perhaps I overlook something.

> So please let's not confuse apples and oranges, and let's use the tools
> in the kernel where they work, and not set them up in contest with each
> other.
>
> Rlimits with generous but real limits in general are good at catching
> when a program misbehaves.  The cgroups are better at setting a total
> memory cap.  In this case the rlimit cap is low enough it simply should
> not matter.
>
> What has been fixed with the ucount rlimits is that (baring
> implementation bugs) it is now not possible to create a user namespace
> and escape your rlimits by using multiple users.
I'll try to check the patch in detail in the next few days.


--

     Manfred


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ipc: Store mqueue sysctls in the ipc namespace
  2022-01-04 20:48       ` Manfred Spraul
@ 2022-01-04 22:17         ` Eric W. Biederman
  2023-09-08 15:47           ` Daniel Walsh
  0 siblings, 1 reply; 18+ messages in thread
From: Eric W. Biederman @ 2022-01-04 22:17 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: Alexey Gladkov, LKML, Linux Containers, Alexander Mikhalitsyn,
	Andrew Morton, Christian Brauner, Daniel Walsh, Davidlohr Bueso,
	Kirill Tkhai, Serge Hallyn, Varad Gautam, Vasily Averin,
	kernel test robot

Manfred Spraul <manfred@colorfullife.com> writes:

> Hello Eric,
>
> On 1/4/22 19:42, Eric W. Biederman wrote:
>> Manfred Spraul <manfred@colorfullife.com> writes:
>>   Hi Alexey,
>>> On 1/4/22 12:51, Alexey Gladkov wrote:
>>>> Right now, the mqueue sysctls take ipc namespaces into account in a
>>>> rather hacky way. This works in most cases, but does not respect the
>>>> user namespace.
>>>>
>>>> Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/*
>>>> parametres. This poses a problem in the rootless containers.
>>>>
>>>> To solve this I changed the implementation of the mqueue sysctls just
>>>> like some other sysctls.
>>>>
>>>> Before this change:
>>>>
>>>> $ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
>>>> tee: /proc/sys/fs/mqueue/msg_max: Permission denied
>>>> 5
>>> Could you crosscheck that all (relevant) allocations in ipc/mqueue.c
>>> use GFP_KERNEL_ACCOUNT?
>> They are not.
>>
>>> We should not allow normal users to use up all memory.
>>>
>>> Otherwise:
>>> The idea is good, the limits do not really prevent using up all
>>> memory, _ACCOUNT is the better approach.
>>> And with _ACCOUNT, it doesn't hurt that the namespace root is able to
>>> set limits.
>> Saying the cgroup kernel memory limit is the only thing that works, and
>> that is always better is silly.
>>
>>
>> First the cgroup kernel memory limits noted with ACCOUNT are not
>> acceptable on several kernel hot paths because they are so expensive.
>
> I was not aware that ACCOUNT allocations are very expensive.

I think it is a bit like refcount_t vs atomic_t.  Usually it doesn't
matter but there are cases where it does, and so you need to be aware
before adding ACCOUNT.

> OTHO adding ACCOUNT resolved various out of memory crashes for IIRC ipc/sem.c
> and/or ipc/msg.c. But we also do not have an RLIMIT for ipc/sem.c or ipc/msg.c
>
> Let me rephrase my question:
>
> When we allow non-root users to write to /proc/sys/fs/mqueue/msg_max, are there
> any _relevant_ allocations that bypass _all_ limits?

With respect to writing msg_max.  All of the allocations where the
sysctl value is concerned go through mqueue_get_inode.  The interesting
call is mq_open.  Where attributes can be specified today that override
the sysctl.  And nothing can override HARD_MSGMAX and HARD_MSGSIZEMAX.

So I don't think we introduce anything new if we allow userspace to
write to the limits.

> As you write, we have RLIMIT_MSGQUEUE.
>
> And several allocations for ipc/mqueue already use ACCOUNT:
>
> - the messages themselves, via load_msg()/alloc_msg().
>
> - the inodes, via mqueue_inode_cachep().

Yes.  I don't think ACCOUNT is evil or bad, just different.

>> Further the memory cgroup kernel memory limit is not always delegated to
>> non-root users, which precludes using the memory cgroup kernel memory
>> limit in many situations.
>>
>>
>> The RLIMIT_MQUEUE limit definitely works, and as I read the kernel
>> source correct it defaults to MQ_BYTES_MAX aka 819200.  A limit of
>> 800KiB should prevent using up all of system memory, except on very low
>> memory machines.
>
> I'd agree that 800 kB is not relevant. But we need to be certain that there are
> no loopholes.
>
> I do not see anything relevant, e.g. 0-byte messages should be covered by
> mq_maxmsg. But perhaps I overlook something.
>
>> So please let's not confuse apples and oranges, and let's use the tools
>> in the kernel where they work, and not set them up in contest with each
>> other.
>>
>> Rlimits with generous but real limits in general are good at catching
>> when a program misbehaves.  The cgroups are better at setting a total
>> memory cap.  In this case the rlimit cap is low enough it simply should
>> not matter.
>>
>> What has been fixed with the ucount rlimits is that (baring
>> implementation bugs) it is now not possible to create a user namespace
>> and escape your rlimits by using multiple users.
> I'll try to check the patch in detail in the next few days.

Thank you.  It is always useful to look over things closely and ensure
that unprivileged users can't do something unfortunate, before we find
them doing something unfortunate.

So far I have just skimmed the patch but from 10,000 feet it looks good.

Eric


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ipc: Store mqueue sysctls in the ipc namespace
  2022-01-04 11:51 ` [PATCH v2] " Alexey Gladkov
  2022-01-04 18:13   ` Manfred Spraul
@ 2022-01-10 16:26   ` Eric W. Biederman
  2022-01-21 13:08     ` [RFC PATCH v3 0/4] ipc: Store mq and ipc " Alexey Gladkov
  1 sibling, 1 reply; 18+ messages in thread
From: Eric W. Biederman @ 2022-01-10 16:26 UTC (permalink / raw)
  To: Alexey Gladkov
  Cc: LKML, Linux Containers, Alexander Mikhalitsyn, Andrew Morton,
	Christian Brauner, Daniel Walsh, Davidlohr Bueso, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin,
	kernel test robot

Alexey Gladkov <legion@kernel.org> writes:

> Right now, the mqueue sysctls take ipc namespaces into account in a
> rather hacky way. This works in most cases, but does not respect the
> user namespace.
>
> Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/*
> parametres. This poses a problem in the rootless containers.
>
> To solve this I changed the implementation of the mqueue sysctls just
> like some other sysctls.
>
> Before this change:
>
> $ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
> tee: /proc/sys/fs/mqueue/msg_max: Permission denied
> 5
>
> After this change:
>
> $ echo 5 | unshare -r -U -i tee /proc/sys/fs/mqueue/msg_max
> 5
>
> v2:
> * Fixed compilation problem if CONFIG_POSIX_MQUEUE_SYSCTL is not
>   specified.

Can you split this in two patches?

The first that converts mq_sysctl.c and ipc_sysctl.c to live in
a per ipc namespace sysctl set.  That will just be a bug-fix/cleanup
patch.  

The second that modifies the permissions to allow root in the ipc
namespace to change the parameters.  With that second patch
we can have the discussion about when it is valid to allow
the user namespace root that created the ipc namespace to be able
to set the sysctls.

A few more comments below.


> Reported-by: kernel test robot <lkp@intel.com>
> Signed-off-by: Alexey Gladkov <legion@kernel.org>
> ---
>  include/linux/ipc_namespace.h |  18 ++++-
>  ipc/mq_sysctl.c               | 137 ++++++++++++++++++++--------------
>  ipc/mqueue.c                  |  10 +--
>  ipc/namespace.c               |   6 ++
>  4 files changed, 106 insertions(+), 65 deletions(-)
>
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index b75395ec8d52..bcedc86a6f1d 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -10,6 +10,7 @@
>  #include <linux/ns_common.h>
>  #include <linux/refcount.h>
>  #include <linux/rhashtable-types.h>
> +#include <linux/sysctl.h>
>  
>  struct user_namespace;
>  
> @@ -63,6 +64,11 @@ struct ipc_namespace {
>  	unsigned int    mq_msg_default;
>  	unsigned int    mq_msgsize_default;
>  
> +#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
> +	struct ctl_table_set	set;
> +	struct ctl_table_header *sysctls;
> +#endif
> +

Updating the code to handle all of the ipc sysctls should
remove the need for the #ifdef here.

>  	/* user_ns which owns the ipc ns */
>  	struct user_namespace *user_ns;
>  	struct ucounts *ucounts;
> @@ -169,14 +175,18 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
>  
>  #ifdef CONFIG_POSIX_MQUEUE_SYSCTL
>  
> -struct ctl_table_header;
> -extern struct ctl_table_header *mq_register_sysctl_table(void);
> +void retire_mq_sysctls(struct ipc_namespace *ns);
> +bool setup_mq_sysctls(struct ipc_namespace *ns);
>  
>  #else /* CONFIG_POSIX_MQUEUE_SYSCTL */
>  
> -static inline struct ctl_table_header *mq_register_sysctl_table(void)
> +static inline void retire_mq_sysctls(struct ipc_namespace *ns)
>  {
> -	return NULL;
> +}
> +
> +static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
> +{
> +	return true;
>  }
>  
>  #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
> diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
> index 72a92a08c848..56afb0503296 100644
> --- a/ipc/mq_sysctl.c
> +++ b/ipc/mq_sysctl.c
> @@ -9,39 +9,9 @@
>  #include <linux/ipc_namespace.h>
>  #include <linux/sysctl.h>
>  
> -#ifdef CONFIG_PROC_SYSCTL
> -static void *get_mq(struct ctl_table *table)
> -{
> -	char *which = table->data;
> -	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
> -	which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
> -	return which;
> -}
> -
> -static int proc_mq_dointvec(struct ctl_table *table, int write,
> -			    void *buffer, size_t *lenp, loff_t *ppos)
> -{
> -	struct ctl_table mq_table;
> -	memcpy(&mq_table, table, sizeof(mq_table));
> -	mq_table.data = get_mq(table);
> -
> -	return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
> -}
> -
> -static int proc_mq_dointvec_minmax(struct ctl_table *table, int write,
> -		void *buffer, size_t *lenp, loff_t *ppos)
> -{
> -	struct ctl_table mq_table;
> -	memcpy(&mq_table, table, sizeof(mq_table));
> -	mq_table.data = get_mq(table);
> -
> -	return proc_dointvec_minmax(&mq_table, write, buffer,
> -					lenp, ppos);
> -}
> -#else
> -#define proc_mq_dointvec NULL
> -#define proc_mq_dointvec_minmax NULL
> -#endif
> +#include <linux/stat.h>
> +#include <linux/capability.h>
> +#include <linux/slab.h>
>  
>  static int msg_max_limit_min = MIN_MSGMAX;
>  static int msg_max_limit_max = HARD_MSGMAX;
> @@ -55,14 +25,14 @@ static struct ctl_table mq_sysctls[] = {
>  		.data		= &init_ipc_ns.mq_queues_max,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
> -		.proc_handler	= proc_mq_dointvec,
> +		.proc_handler	= proc_dointvec,
>  	},
>  	{
>  		.procname	= "msg_max",
>  		.data		= &init_ipc_ns.mq_msg_max,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
> -		.proc_handler	= proc_mq_dointvec_minmax,
> +		.proc_handler	= proc_dointvec_minmax,
>  		.extra1		= &msg_max_limit_min,
>  		.extra2		= &msg_max_limit_max,
>  	},
> @@ -71,7 +41,7 @@ static struct ctl_table mq_sysctls[] = {
>  		.data		= &init_ipc_ns.mq_msgsize_max,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
> -		.proc_handler	= proc_mq_dointvec_minmax,
> +		.proc_handler	= proc_dointvec_minmax,
>  		.extra1		= &msg_maxsize_limit_min,
>  		.extra2		= &msg_maxsize_limit_max,
>  	},
> @@ -80,7 +50,7 @@ static struct ctl_table mq_sysctls[] = {
>  		.data		= &init_ipc_ns.mq_msg_default,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
> -		.proc_handler	= proc_mq_dointvec_minmax,
> +		.proc_handler	= proc_dointvec_minmax,
>  		.extra1		= &msg_max_limit_min,
>  		.extra2		= &msg_max_limit_max,
>  	},
> @@ -89,32 +59,89 @@ static struct ctl_table mq_sysctls[] = {
>  		.data		= &init_ipc_ns.mq_msgsize_default,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
> -		.proc_handler	= proc_mq_dointvec_minmax,
> +		.proc_handler	= proc_dointvec_minmax,
>  		.extra1		= &msg_maxsize_limit_min,
>  		.extra2		= &msg_maxsize_limit_max,
>  	},
>  	{}
>  };
>  
> -static struct ctl_table mq_sysctl_dir[] = {
> -	{
> -		.procname	= "mqueue",
> -		.mode		= 0555,
> -		.child		= mq_sysctls,
> -	},
> -	{}
> -};
> +static struct ctl_table_set *
> +set_lookup(struct ctl_table_root *root)
> +{
> +	return &current->nsproxy->ipc_ns->set;
> +}
>  
> -static struct ctl_table mq_sysctl_root[] = {
> -	{
> -		.procname	= "fs",
> -		.mode		= 0555,
> -		.child		= mq_sysctl_dir,
> -	},
> -	{}
> +static int set_is_seen(struct ctl_table_set *set)
> +{
> +	return &current->nsproxy->ipc_ns->set == set;
> +}
> +
> +static int set_permissions(struct ctl_table_header *head, struct ctl_table *table)
> +{
> +	struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, set);
> +	int mode;
> +
> +	/* Allow users with CAP_SYS_RESOURCE unrestrained access */
> +	if (ns_capable(ns->user_ns, CAP_SYS_RESOURCE))
> +		mode = (table->mode & S_IRWXU) >> 6;
> +	else
> +	/* Allow all others at most read-only access */
> +		mode = table->mode & S_IROTH;
> +	return (mode << 6) | (mode << 3) | mode;
> +}

As a cleanup/bug-fix  please don't implemenet set_permissions.  If you
don't the default permissions that we use today will apply.

> +static struct ctl_table_root set_root = {
> +	.lookup = set_lookup,
> +	.permissions = set_permissions,
>  };
>  
> -struct ctl_table_header *mq_register_sysctl_table(void)
> +bool setup_mq_sysctls(struct ipc_namespace *ns)
>  {
> -	return register_sysctl_table(mq_sysctl_root);
> +	struct ctl_table *tbl;
> +
> +	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
> +
> +	tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
> +	if (tbl) {
> +		int i;
> +
> +		for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) {
> +			if (tbl[i].data == &init_ipc_ns.mq_queues_max)
> +				tbl[i].data = &ns->mq_queues_max;
> +
> +			else if (tbl[i].data == &init_ipc_ns.mq_msg_max)
> +				tbl[i].data = &ns->mq_msg_max;
> +
> +			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max)
> +				tbl[i].data = &ns->mq_msgsize_max;
> +
> +			else if (tbl[i].data == &init_ipc_ns.mq_msg_default)
> +				tbl[i].data = &ns->mq_msg_default;
> +
> +			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default)
> +				tbl[i].data = &ns->mq_msgsize_default;
> +			else
> +				tbl[i].data = NULL;
> +		}
> +
> +		ns->sysctls = __register_sysctl_table(&ns->set, "fs/mqueue", tbl);
> +	}
> +	if (!ns->sysctls) {
> +		kfree(tbl);
> +		retire_sysctl_set(&ns->set);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +void retire_mq_sysctls(struct ipc_namespace *ns)
> +{
> +	struct ctl_table *tbl;
> +
> +	tbl = ns->sysctls->ctl_table_arg;
> +	unregister_sysctl_table(ns->sysctls);
> +	retire_sysctl_set(&ns->set);
> +	kfree(tbl);
>  }
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index 5becca9be867..1b4a3be71636 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -163,8 +163,6 @@ static void remove_notification(struct mqueue_inode_info *info);
>  
>  static struct kmem_cache *mqueue_inode_cachep;
>  
> -static struct ctl_table_header *mq_sysctl_table;
> -
>  static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
>  {
>  	return container_of(inode, struct mqueue_inode_info, vfs_inode);
> @@ -1713,8 +1711,10 @@ static int __init init_mqueue_fs(void)
>  	if (mqueue_inode_cachep == NULL)
>  		return -ENOMEM;
>  
> -	/* ignore failures - they are not fatal */
> -	mq_sysctl_table = mq_register_sysctl_table();
> +	if (!setup_mq_sysctls(&init_ipc_ns)) {
> +		pr_warn("sysctl registration failed\n");
> +		return -ENOMEM;
> +	}
>  
>  	error = register_filesystem(&mqueue_fs_type);
>  	if (error)
> @@ -1731,8 +1731,6 @@ static int __init init_mqueue_fs(void)
>  out_filesystem:
>  	unregister_filesystem(&mqueue_fs_type);
>  out_sysctl:
> -	if (mq_sysctl_table)
> -		unregister_sysctl_table(mq_sysctl_table);
>  	kmem_cache_destroy(mqueue_inode_cachep);
>  	return error;
>  }
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index ae83f0f2651b..f760243ca685 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -59,6 +59,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
>  	if (err)
>  		goto fail_put;
>  
> +	err = -ENOMEM;
> +	if (!setup_mq_sysctls(ns))
> +		goto fail_put;
> +
Please make this handle all ipc sysctls not just the mq sysctls.

>  	sem_init_ns(ns);
>  	msg_init_ns(ns);
>  	shm_init_ns(ns);
> @@ -125,6 +129,8 @@ static void free_ipc_ns(struct ipc_namespace *ns)
>  	msg_exit_ns(ns);
>  	shm_exit_ns(ns);
>  
> +	retire_mq_sysctls(ns);
> +
>  	dec_ipc_namespaces(ns->ucounts);
>  	put_user_ns(ns->user_ns);
>  	ns_free_inum(&ns->ns);

Eric

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC PATCH v3 0/4] ipc: Store mq and ipc sysctls in the ipc namespace
  2022-01-10 16:26   ` Eric W. Biederman
@ 2022-01-21 13:08     ` Alexey Gladkov
  2022-01-21 13:08       ` [RFC PATCH v3 1/4] ipc: Store mqueue " Alexey Gladkov
                         ` (3 more replies)
  0 siblings, 4 replies; 18+ messages in thread
From: Alexey Gladkov @ 2022-01-21 13:08 UTC (permalink / raw)
  To: LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

This patchset changes the implementation of mq and ipc sysctls. It moves the
sysctls inside the ipc namespace.This will allow us to manage these sysctls
inside the user namespace. The implementation also removes helpers duplication
between mq and ipc sysctls.

--

Alexey Gladkov (4):
  ipc: Store mqueue sysctls in the ipc namespace
  ipc: Store ipc sysctls in the ipc namespace
  ipc: Merge ipc_sysctl and mq_sysctl
  ipc: Allow to modify ipc/mq sysctls if CAP_SYS_RESOURCE is present

 include/linux/ipc_namespace.h |  24 ++-
 ipc/Makefile                  |   7 +-
 ipc/ipc_sysctl.c              | 318 +++++++++++++++++++++++++++-------
 ipc/mq_sysctl.c               | 120 -------------
 ipc/mqueue.c                  |   7 -
 ipc/namespace.c               |   6 +
 ipc/util.h                    |   4 +-
 7 files changed, 273 insertions(+), 213 deletions(-)
 delete mode 100644 ipc/mq_sysctl.c

-- 
2.33.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC PATCH v3 1/4] ipc: Store mqueue sysctls in the ipc namespace
  2022-01-21 13:08     ` [RFC PATCH v3 0/4] ipc: Store mq and ipc " Alexey Gladkov
@ 2022-01-21 13:08       ` Alexey Gladkov
  2022-01-21 13:08       ` [RFC PATCH v3 2/4] ipc: Store ipc " Alexey Gladkov
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 18+ messages in thread
From: Alexey Gladkov @ 2022-01-21 13:08 UTC (permalink / raw)
  To: LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin,
	kernel test robot

Right now, the mqueue sysctls take ipc namespaces into account in a
rather hacky way. This works in most cases, but does not respect the
user namespace.

Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/*
parametres. This poses a problem in the rootless containers.

To solve this I changed the implementation of the mqueue sysctls just
like some other sysctls.

So far, the changes do not provide additional access to files. This will
be done in a future patch.

v3:
* Don't implemenet set_permissions to keep the current behavior.

v2:
* Fixed compilation problem if CONFIG_POSIX_MQUEUE_SYSCTL is not
  specified.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 include/linux/ipc_namespace.h |  16 +++--
 ipc/mq_sysctl.c               | 121 ++++++++++++++++++----------------
 ipc/mqueue.c                  |  10 ++-
 ipc/namespace.c               |   6 ++
 4 files changed, 88 insertions(+), 65 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index b75395ec8d52..fa787d97d60a 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -10,6 +10,7 @@
 #include <linux/ns_common.h>
 #include <linux/refcount.h>
 #include <linux/rhashtable-types.h>
+#include <linux/sysctl.h>
 
 struct user_namespace;
 
@@ -63,6 +64,9 @@ struct ipc_namespace {
 	unsigned int    mq_msg_default;
 	unsigned int    mq_msgsize_default;
 
+	struct ctl_table_set	mq_set;
+	struct ctl_table_header	*mq_sysctls;
+
 	/* user_ns which owns the ipc ns */
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
@@ -169,14 +173,18 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
 
 #ifdef CONFIG_POSIX_MQUEUE_SYSCTL
 
-struct ctl_table_header;
-extern struct ctl_table_header *mq_register_sysctl_table(void);
+void retire_mq_sysctls(struct ipc_namespace *ns);
+bool setup_mq_sysctls(struct ipc_namespace *ns);
 
 #else /* CONFIG_POSIX_MQUEUE_SYSCTL */
 
-static inline struct ctl_table_header *mq_register_sysctl_table(void)
+static inline void retire_mq_sysctls(struct ipc_namespace *ns)
 {
-	return NULL;
+}
+
+static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
+{
+	return true;
 }
 
 #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 72a92a08c848..fbf6a8b93a26 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -9,39 +9,9 @@
 #include <linux/ipc_namespace.h>
 #include <linux/sysctl.h>
 
-#ifdef CONFIG_PROC_SYSCTL
-static void *get_mq(struct ctl_table *table)
-{
-	char *which = table->data;
-	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
-	which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
-	return which;
-}
-
-static int proc_mq_dointvec(struct ctl_table *table, int write,
-			    void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table mq_table;
-	memcpy(&mq_table, table, sizeof(mq_table));
-	mq_table.data = get_mq(table);
-
-	return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
-}
-
-static int proc_mq_dointvec_minmax(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table mq_table;
-	memcpy(&mq_table, table, sizeof(mq_table));
-	mq_table.data = get_mq(table);
-
-	return proc_dointvec_minmax(&mq_table, write, buffer,
-					lenp, ppos);
-}
-#else
-#define proc_mq_dointvec NULL
-#define proc_mq_dointvec_minmax NULL
-#endif
+#include <linux/stat.h>
+#include <linux/capability.h>
+#include <linux/slab.h>
 
 static int msg_max_limit_min = MIN_MSGMAX;
 static int msg_max_limit_max = HARD_MSGMAX;
@@ -55,14 +25,14 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_queues_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "msg_max",
 		.data		= &init_ipc_ns.mq_msg_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
 	},
@@ -71,7 +41,7 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msgsize_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
 	},
@@ -80,7 +50,7 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msg_default,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
 	},
@@ -89,32 +59,73 @@ static struct ctl_table mq_sysctls[] = {
 		.data		= &init_ipc_ns.mq_msgsize_default,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_mq_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
 	},
 	{}
 };
 
-static struct ctl_table mq_sysctl_dir[] = {
-	{
-		.procname	= "mqueue",
-		.mode		= 0555,
-		.child		= mq_sysctls,
-	},
-	{}
-};
+static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
+{
+	return &current->nsproxy->ipc_ns->mq_set;
+}
 
-static struct ctl_table mq_sysctl_root[] = {
-	{
-		.procname	= "fs",
-		.mode		= 0555,
-		.child		= mq_sysctl_dir,
-	},
-	{}
+static int set_is_seen(struct ctl_table_set *set)
+{
+	return &current->nsproxy->ipc_ns->mq_set == set;
+}
+
+static struct ctl_table_root set_root = {
+	.lookup = set_lookup,
 };
 
-struct ctl_table_header *mq_register_sysctl_table(void)
+bool setup_mq_sysctls(struct ipc_namespace *ns)
 {
-	return register_sysctl_table(mq_sysctl_root);
+	struct ctl_table *tbl;
+
+	setup_sysctl_set(&ns->mq_set, &set_root, set_is_seen);
+
+	tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
+	if (tbl) {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) {
+			if (tbl[i].data == &init_ipc_ns.mq_queues_max)
+				tbl[i].data = &ns->mq_queues_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msg_max)
+				tbl[i].data = &ns->mq_msg_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max)
+				tbl[i].data = &ns->mq_msgsize_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msg_default)
+				tbl[i].data = &ns->mq_msg_default;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default)
+				tbl[i].data = &ns->mq_msgsize_default;
+			else
+				tbl[i].data = NULL;
+		}
+
+		ns->mq_sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", tbl);
+	}
+	if (!ns->mq_sysctls) {
+		kfree(tbl);
+		retire_sysctl_set(&ns->mq_set);
+		return false;
+	}
+
+	return true;
+}
+
+void retire_mq_sysctls(struct ipc_namespace *ns)
+{
+	struct ctl_table *tbl;
+
+	tbl = ns->mq_sysctls->ctl_table_arg;
+	unregister_sysctl_table(ns->mq_sysctls);
+	retire_sysctl_set(&ns->mq_set);
+	kfree(tbl);
 }
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5becca9be867..1b4a3be71636 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -163,8 +163,6 @@ static void remove_notification(struct mqueue_inode_info *info);
 
 static struct kmem_cache *mqueue_inode_cachep;
 
-static struct ctl_table_header *mq_sysctl_table;
-
 static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
 {
 	return container_of(inode, struct mqueue_inode_info, vfs_inode);
@@ -1713,8 +1711,10 @@ static int __init init_mqueue_fs(void)
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
-	/* ignore failures - they are not fatal */
-	mq_sysctl_table = mq_register_sysctl_table();
+	if (!setup_mq_sysctls(&init_ipc_ns)) {
+		pr_warn("sysctl registration failed\n");
+		return -ENOMEM;
+	}
 
 	error = register_filesystem(&mqueue_fs_type);
 	if (error)
@@ -1731,8 +1731,6 @@ static int __init init_mqueue_fs(void)
 out_filesystem:
 	unregister_filesystem(&mqueue_fs_type);
 out_sysctl:
-	if (mq_sysctl_table)
-		unregister_sysctl_table(mq_sysctl_table);
 	kmem_cache_destroy(mqueue_inode_cachep);
 	return error;
 }
diff --git a/ipc/namespace.c b/ipc/namespace.c
index ae83f0f2651b..f760243ca685 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -59,6 +59,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	if (err)
 		goto fail_put;
 
+	err = -ENOMEM;
+	if (!setup_mq_sysctls(ns))
+		goto fail_put;
+
 	sem_init_ns(ns);
 	msg_init_ns(ns);
 	shm_init_ns(ns);
@@ -125,6 +129,8 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	msg_exit_ns(ns);
 	shm_exit_ns(ns);
 
+	retire_mq_sysctls(ns);
+
 	dec_ipc_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	ns_free_inum(&ns->ns);
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC PATCH v3 2/4] ipc: Store ipc sysctls in the ipc namespace
  2022-01-21 13:08     ` [RFC PATCH v3 0/4] ipc: Store mq and ipc " Alexey Gladkov
  2022-01-21 13:08       ` [RFC PATCH v3 1/4] ipc: Store mqueue " Alexey Gladkov
@ 2022-01-21 13:08       ` Alexey Gladkov
  2022-01-21 13:08       ` [RFC PATCH v3 3/4] ipc: Merge ipc_sysctl and mq_sysctl Alexey Gladkov
  2022-01-21 13:08       ` [RFC PATCH v3 4/4] ipc: Allow to modify ipc/mq sysctls if CAP_SYS_RESOURCE is present Alexey Gladkov
  3 siblings, 0 replies; 18+ messages in thread
From: Alexey Gladkov @ 2022-01-21 13:08 UTC (permalink / raw)
  To: LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

The ipc sysctls are not available for modification inside the user
namespace. Following the mqueue sysctls, we changed the implementation
to be more userns friendly.

So far, the changes do not provide additional access to files. This
will be done in a future patch.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 include/linux/ipc_namespace.h |  21 ++++
 ipc/ipc_sysctl.c              | 189 ++++++++++++++++++++++------------
 ipc/namespace.c               |   4 +
 3 files changed, 147 insertions(+), 67 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index fa787d97d60a..94af746704fa 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -67,6 +67,9 @@ struct ipc_namespace {
 	struct ctl_table_set	mq_set;
 	struct ctl_table_header	*mq_sysctls;
 
+	struct ctl_table_set	set;
+	struct ctl_table_header	*ipc_sysctls;
+
 	/* user_ns which owns the ipc ns */
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
@@ -188,4 +191,22 @@ static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
 }
 
 #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
+
+#ifdef CONFIG_SYSVIPC_SYSCTL
+
+bool setup_ipc_sysctls(struct ipc_namespace *ns);
+void retire_ipc_sysctls(struct ipc_namespace *ns);
+
+#else /* CONFIG_SYSVIPC_SYSCTL */
+
+static inline void retire_ipc_sysctls(struct ipc_namespace *ns)
+{
+}
+
+static inline bool setup_ipc_sysctls(struct ipc_namespace *ns)
+{
+	return true;
+}
+
+#endif /* CONFIG_SYSVIPC_SYSCTL */
 #endif
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index f101c171753f..dd87ba12f5e3 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -13,43 +13,22 @@
 #include <linux/capability.h>
 #include <linux/ipc_namespace.h>
 #include <linux/msg.h>
+#include <linux/slab.h>
 #include "util.h"
 
-static void *get_ipc(struct ctl_table *table)
-{
-	char *which = table->data;
-	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
-	which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
-	return which;
-}
-
-static int proc_ipc_dointvec(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table ipc_table;
-
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-	ipc_table.data = get_ipc(table);
-
-	return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
-}
-
-static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write,
+static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
+	struct ipc_namespace *ns = table->extra1;
 	struct ctl_table ipc_table;
+	int err;
 
 	memcpy(&ipc_table, table, sizeof(ipc_table));
-	ipc_table.data = get_ipc(table);
 
-	return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
-}
+	ipc_table.extra1 = SYSCTL_ZERO;
+	ipc_table.extra2 = SYSCTL_ONE;
 
-static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
-	int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	err = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
 
 	if (err < 0)
 		return err;
@@ -58,17 +37,6 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
 	return err;
 }
 
-static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table ipc_table;
-	memcpy(&ipc_table, table, sizeof(ipc_table));
-	ipc_table.data = get_ipc(table);
-
-	return proc_doulongvec_minmax(&ipc_table, write, buffer,
-					lenp, ppos);
-}
-
 static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -87,11 +55,17 @@ static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
 static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 	void *buffer, size_t *lenp, loff_t *ppos)
 {
+	struct ipc_namespace *ns = table->extra1;
+	struct ctl_table ipc_table;
 	int ret, semmni;
-	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+
+	memcpy(&ipc_table, table, sizeof(ipc_table));
+
+	ipc_table.extra1 = NULL;
+	ipc_table.extra2 = NULL;
 
 	semmni = ns->sem_ctls[3];
-	ret = proc_ipc_dointvec(table, write, buffer, lenp, ppos);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
 	if (!ret)
 		ret = sem_check_semmni(current->nsproxy->ipc_ns);
@@ -108,12 +82,18 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table,
 		int write, void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct user_namespace *user_ns = current->nsproxy->ipc_ns->user_ns;
+	struct ipc_namespace *ns = table->extra1;
+	struct ctl_table ipc_table;
 
-	if (write && !checkpoint_restore_ns_capable(user_ns))
+	if (write && !checkpoint_restore_ns_capable(ns->user_ns))
 		return -EPERM;
 
-	return proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	memcpy(&ipc_table, table, sizeof(ipc_table));
+
+	ipc_table.extra1 = SYSCTL_ZERO;
+	ipc_table.extra2 = SYSCTL_INT_MAX;
+
+	return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -121,27 +101,27 @@ int ipc_mni = IPCMNI;
 int ipc_mni_shift = IPCMNI_SHIFT;
 int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
 
-static struct ctl_table ipc_kern_table[] = {
+static struct ctl_table ipc_sysctls[] = {
 	{
 		.procname	= "shmmax",
 		.data		= &init_ipc_ns.shm_ctlmax,
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlmax),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "shmall",
 		.data		= &init_ipc_ns.shm_ctlall,
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlall),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "shmmni",
 		.data		= &init_ipc_ns.shm_ctlmni,
 		.maxlen		= sizeof(init_ipc_ns.shm_ctlmni),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ipc_mni,
 	},
@@ -151,15 +131,13 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
 		.mode		= 0644,
 		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
 	},
 	{
 		.procname	= "msgmax",
 		.data		= &init_ipc_ns.msg_ctlmax,
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmax),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_INT_MAX,
 	},
@@ -168,7 +146,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.data		= &init_ipc_ns.msg_ctlmni,
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmni),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &ipc_mni,
 	},
@@ -186,7 +164,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.data		= &init_ipc_ns.msg_ctlmnb,
 		.maxlen		= sizeof(init_ipc_ns.msg_ctlmnb),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec_minmax,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_INT_MAX,
 	},
@@ -204,8 +182,6 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
 		.mode		= 0666,
 		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "msg_next_id",
@@ -213,8 +189,6 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
 		.mode		= 0666,
 		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "shm_next_id",
@@ -222,25 +196,106 @@ static struct ctl_table ipc_kern_table[] = {
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
 		.mode		= 0666,
 		.proc_handler	= proc_ipc_dointvec_minmax_checkpoint_restore,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_INT_MAX,
 	},
 #endif
 	{}
 };
 
-static struct ctl_table ipc_root_table[] = {
-	{
-		.procname	= "kernel",
-		.mode		= 0555,
-		.child		= ipc_kern_table,
-	},
-	{}
+static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
+{
+	return &current->nsproxy->ipc_ns->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+	return &current->nsproxy->ipc_ns->set == set;
+}
+
+static struct ctl_table_root set_root = {
+	.lookup = set_lookup,
 };
 
+bool setup_ipc_sysctls(struct ipc_namespace *ns)
+{
+	struct ctl_table *tbl;
+
+	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
+
+	tbl = kmemdup(ipc_sysctls, sizeof(ipc_sysctls), GFP_KERNEL);
+	if (tbl) {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) {
+			if (tbl[i].data == &init_ipc_ns.shm_ctlmax) {
+				tbl[i].data = &ns->shm_ctlmax;
+
+			} else if (tbl[i].data == &init_ipc_ns.shm_ctlall) {
+				tbl[i].data = &ns->shm_ctlall;
+
+			} else if (tbl[i].data == &init_ipc_ns.shm_ctlmni) {
+				tbl[i].data = &ns->shm_ctlmni;
+
+			} else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) {
+				tbl[i].data = &ns->shm_rmid_forced;
+				tbl[i].extra1 = ns;
+
+			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) {
+				tbl[i].data = &ns->msg_ctlmax;
+
+			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmni) {
+				tbl[i].data = &ns->msg_ctlmni;
+
+			} else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb) {
+				tbl[i].data = &ns->msg_ctlmnb;
+
+			} else if (tbl[i].data == &init_ipc_ns.sem_ctls) {
+				tbl[i].data = &ns->sem_ctls;
+				tbl[i].extra1 = ns;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) {
+				tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id;
+				tbl[i].extra1 = ns;
+
+			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) {
+				tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id;
+				tbl[i].extra1 = ns;
+
+			} else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) {
+				tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id;
+				tbl[i].extra1 = ns;
+#endif
+			} else {
+				tbl[i].data = NULL;
+			}
+		}
+
+		ns->ipc_sysctls = __register_sysctl_table(&ns->set, "kernel", tbl);
+	}
+	if (!ns->ipc_sysctls) {
+		kfree(tbl);
+		retire_sysctl_set(&ns->set);
+		return false;
+	}
+
+	return true;
+}
+
+void retire_ipc_sysctls(struct ipc_namespace *ns)
+{
+	struct ctl_table *tbl;
+
+	tbl = ns->ipc_sysctls->ctl_table_arg;
+	unregister_sysctl_table(ns->ipc_sysctls);
+	retire_sysctl_set(&ns->set);
+	kfree(tbl);
+}
+
 static int __init ipc_sysctl_init(void)
 {
-	register_sysctl_table(ipc_root_table);
+	if (!setup_ipc_sysctls(&init_ipc_ns)) {
+		pr_warn("ipc sysctl registration failed\n");
+		return -ENOMEM;
+	}
 	return 0;
 }
 
diff --git a/ipc/namespace.c b/ipc/namespace.c
index f760243ca685..754f3237194a 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -63,6 +63,9 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	if (!setup_mq_sysctls(ns))
 		goto fail_put;
 
+	if (!setup_ipc_sysctls(ns))
+		goto fail_put;
+
 	sem_init_ns(ns);
 	msg_init_ns(ns);
 	shm_init_ns(ns);
@@ -130,6 +133,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	shm_exit_ns(ns);
 
 	retire_mq_sysctls(ns);
+	retire_ipc_sysctls(ns);
 
 	dec_ipc_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC PATCH v3 3/4] ipc: Merge ipc_sysctl and mq_sysctl
  2022-01-21 13:08     ` [RFC PATCH v3 0/4] ipc: Store mq and ipc " Alexey Gladkov
  2022-01-21 13:08       ` [RFC PATCH v3 1/4] ipc: Store mqueue " Alexey Gladkov
  2022-01-21 13:08       ` [RFC PATCH v3 2/4] ipc: Store ipc " Alexey Gladkov
@ 2022-01-21 13:08       ` Alexey Gladkov
  2022-01-21 13:08       ` [RFC PATCH v3 4/4] ipc: Allow to modify ipc/mq sysctls if CAP_SYS_RESOURCE is present Alexey Gladkov
  3 siblings, 0 replies; 18+ messages in thread
From: Alexey Gladkov @ 2022-01-21 13:08 UTC (permalink / raw)
  To: LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

Both mq_sysctl and ipc_sysctl are in the ipc namespace and both use
identical helpers so they can be merged into a single source file.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 include/linux/ipc_namespace.h |  41 ++---------
 ipc/Makefile                  |   7 +-
 ipc/ipc_sysctl.c              | 131 ++++++++++++++++++++++++++++++++--
 ipc/mq_sysctl.c               | 131 ----------------------------------
 ipc/mqueue.c                  |   5 --
 ipc/namespace.c               |   4 --
 ipc/util.h                    |   4 +-
 7 files changed, 132 insertions(+), 191 deletions(-)
 delete mode 100644 ipc/mq_sysctl.c

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 94af746704fa..461540d1cac4 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -64,10 +64,8 @@ struct ipc_namespace {
 	unsigned int    mq_msg_default;
 	unsigned int    mq_msgsize_default;
 
-	struct ctl_table_set	mq_set;
-	struct ctl_table_header	*mq_sysctls;
-
 	struct ctl_table_set	set;
+	struct ctl_table_header	*mq_sysctls;
 	struct ctl_table_header	*ipc_sysctls;
 
 	/* user_ns which owns the ipc ns */
@@ -88,8 +86,6 @@ extern void shm_destroy_orphaned(struct ipc_namespace *ns);
 static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
 #endif /* CONFIG_SYSVIPC */
 
-#ifdef CONFIG_POSIX_MQUEUE
-extern int mq_init_ns(struct ipc_namespace *ns);
 /*
  * POSIX Message Queue default values:
  *
@@ -123,6 +119,9 @@ extern int mq_init_ns(struct ipc_namespace *ns);
 #define DFLT_MSGSIZE		     8192U
 #define DFLT_MSGSIZEMAX		     8192
 #define HARD_MSGSIZEMAX	    (16*1024*1024)
+
+#ifdef CONFIG_POSIX_MQUEUE
+extern int mq_init_ns(struct ipc_namespace *ns);
 #else
 static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
 #endif
@@ -174,39 +173,7 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
 }
 #endif
 
-#ifdef CONFIG_POSIX_MQUEUE_SYSCTL
-
-void retire_mq_sysctls(struct ipc_namespace *ns);
-bool setup_mq_sysctls(struct ipc_namespace *ns);
-
-#else /* CONFIG_POSIX_MQUEUE_SYSCTL */
-
-static inline void retire_mq_sysctls(struct ipc_namespace *ns)
-{
-}
-
-static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
-{
-	return true;
-}
-
-#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
-
-#ifdef CONFIG_SYSVIPC_SYSCTL
-
 bool setup_ipc_sysctls(struct ipc_namespace *ns);
 void retire_ipc_sysctls(struct ipc_namespace *ns);
 
-#else /* CONFIG_SYSVIPC_SYSCTL */
-
-static inline void retire_ipc_sysctls(struct ipc_namespace *ns)
-{
-}
-
-static inline bool setup_ipc_sysctls(struct ipc_namespace *ns)
-{
-	return true;
-}
-
-#endif /* CONFIG_SYSVIPC_SYSCTL */
 #endif
diff --git a/ipc/Makefile b/ipc/Makefile
index c2558c430f51..f79eab42a4dc 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -4,9 +4,6 @@
 #
 
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
-obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o syscall.o
-obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o
-obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o
+obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o syscall.o ipc_sysctl.o
+obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o ipc_sysctl.o
 obj-$(CONFIG_IPC_NS) += namespace.o
-obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o
-
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index dd87ba12f5e3..9fc8e3e75be7 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -201,6 +201,59 @@ static struct ctl_table ipc_sysctls[] = {
 	{}
 };
 
+static int msg_max_limit_min = MIN_MSGMAX;
+static int msg_max_limit_max = HARD_MSGMAX;
+
+static int msg_maxsize_limit_min = MIN_MSGSIZEMAX;
+static int msg_maxsize_limit_max = HARD_MSGSIZEMAX;
+
+static struct ctl_table mq_sysctls[] = {
+	{
+		.procname	= "queues_max",
+		.data		= &init_ipc_ns.mq_queues_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "msg_max",
+		.data		= &init_ipc_ns.mq_msg_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &msg_max_limit_min,
+		.extra2		= &msg_max_limit_max,
+	},
+	{
+		.procname	= "msgsize_max",
+		.data		= &init_ipc_ns.mq_msgsize_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &msg_maxsize_limit_min,
+		.extra2		= &msg_maxsize_limit_max,
+	},
+	{
+		.procname	= "msg_default",
+		.data		= &init_ipc_ns.mq_msg_default,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &msg_max_limit_min,
+		.extra2		= &msg_max_limit_max,
+	},
+	{
+		.procname	= "msgsize_default",
+		.data		= &init_ipc_ns.mq_msgsize_default,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &msg_maxsize_limit_min,
+		.extra2		= &msg_maxsize_limit_max,
+	},
+	{}
+};
+
 static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
 {
 	return &current->nsproxy->ipc_ns->set;
@@ -215,12 +268,10 @@ static struct ctl_table_root set_root = {
 	.lookup = set_lookup,
 };
 
-bool setup_ipc_sysctls(struct ipc_namespace *ns)
+static bool register_ipc_sysctl_table(struct ipc_namespace *ns)
 {
 	struct ctl_table *tbl;
 
-	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
-
 	tbl = kmemdup(ipc_sysctls, sizeof(ipc_sysctls), GFP_KERNEL);
 	if (tbl) {
 		int i;
@@ -273,23 +324,91 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 	}
 	if (!ns->ipc_sysctls) {
 		kfree(tbl);
-		retire_sysctl_set(&ns->set);
 		return false;
 	}
 
 	return true;
 }
 
-void retire_ipc_sysctls(struct ipc_namespace *ns)
+static void unregister_ipc_sysctl_table(struct ipc_namespace *ns)
 {
 	struct ctl_table *tbl;
 
 	tbl = ns->ipc_sysctls->ctl_table_arg;
 	unregister_sysctl_table(ns->ipc_sysctls);
-	retire_sysctl_set(&ns->set);
 	kfree(tbl);
 }
 
+static bool register_mqueue_sysctl_table(struct ipc_namespace *ns)
+{
+	struct ctl_table *tbl;
+
+	tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
+	if (tbl) {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) {
+			if (tbl[i].data == &init_ipc_ns.mq_queues_max)
+				tbl[i].data = &ns->mq_queues_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msg_max)
+				tbl[i].data = &ns->mq_msg_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max)
+				tbl[i].data = &ns->mq_msgsize_max;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msg_default)
+				tbl[i].data = &ns->mq_msg_default;
+
+			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default)
+				tbl[i].data = &ns->mq_msgsize_default;
+			else
+				tbl[i].data = NULL;
+		}
+
+		ns->mq_sysctls = __register_sysctl_table(&ns->set, "fs/mqueue", tbl);
+	}
+	if (!ns->mq_sysctls) {
+		kfree(tbl);
+		return false;
+	}
+
+	return true;
+}
+
+static void unregister_mqueue_sysctl_table(struct ipc_namespace *ns)
+{
+	struct ctl_table *tbl;
+
+	tbl = ns->mq_sysctls->ctl_table_arg;
+	unregister_sysctl_table(ns->mq_sysctls);
+	kfree(tbl);
+}
+
+bool setup_ipc_sysctls(struct ipc_namespace *ns)
+{
+	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
+
+	if (IS_ENABLED(CONFIG_SYSVIPC_SYSCTL))
+		register_ipc_sysctl_table(ns);
+
+	if (IS_ENABLED(CONFIG_POSIX_MQUEUE_SYSCTL))
+		register_mqueue_sysctl_table(ns);
+
+	return true;
+}
+
+void retire_ipc_sysctls(struct ipc_namespace *ns)
+{
+	if (IS_ENABLED(CONFIG_SYSVIPC_SYSCTL))
+		unregister_ipc_sysctl_table(ns);
+
+	if (IS_ENABLED(CONFIG_POSIX_MQUEUE_SYSCTL))
+		unregister_mqueue_sysctl_table(ns);
+
+	retire_sysctl_set(&ns->set);
+}
+
 static int __init ipc_sysctl_init(void)
 {
 	if (!setup_ipc_sysctls(&init_ipc_ns)) {
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
deleted file mode 100644
index fbf6a8b93a26..000000000000
--- a/ipc/mq_sysctl.c
+++ /dev/null
@@ -1,131 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 2007 IBM Corporation
- *
- *  Author: Cedric Le Goater <clg@fr.ibm.com>
- */
-
-#include <linux/nsproxy.h>
-#include <linux/ipc_namespace.h>
-#include <linux/sysctl.h>
-
-#include <linux/stat.h>
-#include <linux/capability.h>
-#include <linux/slab.h>
-
-static int msg_max_limit_min = MIN_MSGMAX;
-static int msg_max_limit_max = HARD_MSGMAX;
-
-static int msg_maxsize_limit_min = MIN_MSGSIZEMAX;
-static int msg_maxsize_limit_max = HARD_MSGSIZEMAX;
-
-static struct ctl_table mq_sysctls[] = {
-	{
-		.procname	= "queues_max",
-		.data		= &init_ipc_ns.mq_queues_max,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "msg_max",
-		.data		= &init_ipc_ns.mq_msg_max,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &msg_max_limit_min,
-		.extra2		= &msg_max_limit_max,
-	},
-	{
-		.procname	= "msgsize_max",
-		.data		= &init_ipc_ns.mq_msgsize_max,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &msg_maxsize_limit_min,
-		.extra2		= &msg_maxsize_limit_max,
-	},
-	{
-		.procname	= "msg_default",
-		.data		= &init_ipc_ns.mq_msg_default,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &msg_max_limit_min,
-		.extra2		= &msg_max_limit_max,
-	},
-	{
-		.procname	= "msgsize_default",
-		.data		= &init_ipc_ns.mq_msgsize_default,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &msg_maxsize_limit_min,
-		.extra2		= &msg_maxsize_limit_max,
-	},
-	{}
-};
-
-static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
-{
-	return &current->nsproxy->ipc_ns->mq_set;
-}
-
-static int set_is_seen(struct ctl_table_set *set)
-{
-	return &current->nsproxy->ipc_ns->mq_set == set;
-}
-
-static struct ctl_table_root set_root = {
-	.lookup = set_lookup,
-};
-
-bool setup_mq_sysctls(struct ipc_namespace *ns)
-{
-	struct ctl_table *tbl;
-
-	setup_sysctl_set(&ns->mq_set, &set_root, set_is_seen);
-
-	tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL);
-	if (tbl) {
-		int i;
-
-		for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) {
-			if (tbl[i].data == &init_ipc_ns.mq_queues_max)
-				tbl[i].data = &ns->mq_queues_max;
-
-			else if (tbl[i].data == &init_ipc_ns.mq_msg_max)
-				tbl[i].data = &ns->mq_msg_max;
-
-			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max)
-				tbl[i].data = &ns->mq_msgsize_max;
-
-			else if (tbl[i].data == &init_ipc_ns.mq_msg_default)
-				tbl[i].data = &ns->mq_msg_default;
-
-			else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default)
-				tbl[i].data = &ns->mq_msgsize_default;
-			else
-				tbl[i].data = NULL;
-		}
-
-		ns->mq_sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", tbl);
-	}
-	if (!ns->mq_sysctls) {
-		kfree(tbl);
-		retire_sysctl_set(&ns->mq_set);
-		return false;
-	}
-
-	return true;
-}
-
-void retire_mq_sysctls(struct ipc_namespace *ns)
-{
-	struct ctl_table *tbl;
-
-	tbl = ns->mq_sysctls->ctl_table_arg;
-	unregister_sysctl_table(ns->mq_sysctls);
-	retire_sysctl_set(&ns->mq_set);
-	kfree(tbl);
-}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 1b4a3be71636..f08e9f8db195 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1711,11 +1711,6 @@ static int __init init_mqueue_fs(void)
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
-	if (!setup_mq_sysctls(&init_ipc_ns)) {
-		pr_warn("sysctl registration failed\n");
-		return -ENOMEM;
-	}
-
 	error = register_filesystem(&mqueue_fs_type);
 	if (error)
 		goto out_sysctl;
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 754f3237194a..e18b6b5c2a46 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -60,9 +60,6 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 		goto fail_put;
 
 	err = -ENOMEM;
-	if (!setup_mq_sysctls(ns))
-		goto fail_put;
-
 	if (!setup_ipc_sysctls(ns))
 		goto fail_put;
 
@@ -132,7 +129,6 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	msg_exit_ns(ns);
 	shm_exit_ns(ns);
 
-	retire_mq_sysctls(ns);
 	retire_ipc_sysctls(ns);
 
 	dec_ipc_namespaces(ns->ucounts);
diff --git a/ipc/util.h b/ipc/util.h
index 2dd7ce0416d8..e88e486e9048 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -31,18 +31,16 @@
 #define IPCMNI			(1 << IPCMNI_SHIFT)
 #define IPCMNI_EXTEND		(1 << IPCMNI_EXTEND_SHIFT)
 
-#ifdef CONFIG_SYSVIPC_SYSCTL
 extern int ipc_mni;
 extern int ipc_mni_shift;
 extern int ipc_min_cycle;
 
+#ifdef CONFIG_SYSVIPC_SYSCTL
 #define ipcmni_seq_shift()	ipc_mni_shift
 #define IPCMNI_IDX_MASK		((1 << ipc_mni_shift) - 1)
 
 #else /* CONFIG_SYSVIPC_SYSCTL */
 
-#define ipc_mni			IPCMNI
-#define ipc_min_cycle		((int)RADIX_TREE_MAP_SIZE)
 #define ipcmni_seq_shift()	IPCMNI_SHIFT
 #define IPCMNI_IDX_MASK		((1 << IPCMNI_SHIFT) - 1)
 #endif /* CONFIG_SYSVIPC_SYSCTL */
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC PATCH v3 4/4] ipc: Allow to modify ipc/mq sysctls if CAP_SYS_RESOURCE is present
  2022-01-21 13:08     ` [RFC PATCH v3 0/4] ipc: Store mq and ipc " Alexey Gladkov
                         ` (2 preceding siblings ...)
  2022-01-21 13:08       ` [RFC PATCH v3 3/4] ipc: Merge ipc_sysctl and mq_sysctl Alexey Gladkov
@ 2022-01-21 13:08       ` Alexey Gladkov
  3 siblings, 0 replies; 18+ messages in thread
From: Alexey Gladkov @ 2022-01-21 13:08 UTC (permalink / raw)
  To: LKML, Linux Containers
  Cc: Alexander Mikhalitsyn, Andrew Morton, Christian Brauner,
	Daniel Walsh, Davidlohr Bueso, Eric W . Biederman, Kirill Tkhai,
	Manfred Spraul, Serge Hallyn, Varad Gautam, Vasily Averin

The mq_overview(7) says that mq sysctls are available for modification
to a privileged process (CAP_SYS_RESOURCE). Right now, within userns, a
privileged process cannot modify these files. Once the mq and ipc
sysctls have been moved to the ipc namespace we can grant access to
these files.

mqueue sysctls
--------------

The mq sysctls are protected by an upper limit that cannot be exceeded
on the system:

For /proc/sys/fs/mqueue/msg_max the upper limit is HARD_MSGMAX.
For /proc/sys/fs/mqueue/msgsize_max the upper limit is HARD_MSGSIZEMAX.

Also RLIMIT_MSGQUEUE limits all queues used by the process. This limit
is also tied to userns.

ipc sysctls
-----------

The implementation has no specific limits for the per-process maximum
number of shared memory segments. Only SHM_LOCK and SHM_HUGETLB limited
by RLIMIT_MEMLOCK which is also tied to userns.

This patch is RPC only and should not be applied without a security
discussion.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 ipc/ipc_sysctl.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 9fc8e3e75be7..f1d1c83656f9 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -254,6 +254,21 @@ static struct ctl_table mq_sysctls[] = {
 	{}
 };
 
+static int set_permissions(struct ctl_table_header *head, struct ctl_table *table)
+{
+	struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, set);
+	int mode;
+
+	/* Allow users with CAP_SYS_RESOURCE unrestrained access */
+	if (ns_capable(ns->user_ns, CAP_SYS_RESOURCE))
+		mode = (table->mode & S_IRWXU) >> 6;
+	else
+	/* Allow all others at most read-only access */
+		mode = table->mode & S_IROTH;
+
+	return (mode << 6) | (mode << 3) | mode;
+}
+
 static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
 {
 	return &current->nsproxy->ipc_ns->set;
@@ -266,6 +281,7 @@ static int set_is_seen(struct ctl_table_set *set)
 
 static struct ctl_table_root set_root = {
 	.lookup = set_lookup,
+	.permissions = set_permissions,
 };
 
 static bool register_ipc_sysctl_table(struct ipc_namespace *ns)
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ipc: Store mqueue sysctls in the ipc namespace
  2022-01-04 22:17         ` Eric W. Biederman
@ 2023-09-08 15:47           ` Daniel Walsh
  2023-09-10 18:51             ` Andrew Morton
  0 siblings, 1 reply; 18+ messages in thread
From: Daniel Walsh @ 2023-09-08 15:47 UTC (permalink / raw)
  To: Eric W. Biederman, Manfred Spraul
  Cc: Alexey Gladkov, LKML, Linux Containers, Alexander Mikhalitsyn,
	Andrew Morton, Christian Brauner, Davidlohr Bueso, Kirill Tkhai,
	Serge Hallyn, Varad Gautam, Vasily Averin, kernel test robot

On 1/4/22 17:17, Eric W. Biederman wrote:
> parametres. This poses a problem in the rootless containers.

Did this ever go into the Linux kernel?


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ipc: Store mqueue sysctls in the ipc namespace
  2023-09-08 15:47           ` Daniel Walsh
@ 2023-09-10 18:51             ` Andrew Morton
  0 siblings, 0 replies; 18+ messages in thread
From: Andrew Morton @ 2023-09-10 18:51 UTC (permalink / raw)
  To: dwalsh
  Cc: Eric W. Biederman, Manfred Spraul, Alexey Gladkov, LKML,
	Linux Containers, Alexander Mikhalitsyn, Christian Brauner,
	Davidlohr Bueso, Kirill Tkhai, Serge Hallyn, Varad Gautam,
	Vasily Averin, kernel test robot

On Fri, 8 Sep 2023 11:47:21 -0400 Daniel Walsh <dwalsh@redhat.com> wrote:

> On 1/4/22 17:17, Eric W. Biederman wrote:
> > parametres. This poses a problem in the rootless containers.
> 
> Did this ever go into the Linux kernel?

yup.

commit dc55e35f9e810f23dd69cfdc91a3d636023f57a2
Author: Alexey Gladkov <legion@kernel.org>
Date:   Mon Feb 14 19:18:14 2022 +0100

    ipc: Store mqueue sysctls in the ipc namespace


with a couple of later fixups, c579d60f0d0cd87552f64fdebe68b5d941d20309
and 12b677f2c697d61e5ddbcb6c1650050a39392f54

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2023-09-10 18:51 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-03 16:04 [PATCH v1] ipc: Store mqueue sysctls in the ipc namespace Alexey Gladkov
2022-01-03 20:41 ` kernel test robot
2022-01-03 20:41   ` kernel test robot
2022-01-04  5:28 ` kernel test robot
2022-01-04  5:28   ` kernel test robot
2022-01-04 11:51 ` [PATCH v2] " Alexey Gladkov
2022-01-04 18:13   ` Manfred Spraul
2022-01-04 18:42     ` Eric W. Biederman
2022-01-04 20:48       ` Manfred Spraul
2022-01-04 22:17         ` Eric W. Biederman
2023-09-08 15:47           ` Daniel Walsh
2023-09-10 18:51             ` Andrew Morton
2022-01-10 16:26   ` Eric W. Biederman
2022-01-21 13:08     ` [RFC PATCH v3 0/4] ipc: Store mq and ipc " Alexey Gladkov
2022-01-21 13:08       ` [RFC PATCH v3 1/4] ipc: Store mqueue " Alexey Gladkov
2022-01-21 13:08       ` [RFC PATCH v3 2/4] ipc: Store ipc " Alexey Gladkov
2022-01-21 13:08       ` [RFC PATCH v3 3/4] ipc: Merge ipc_sysctl and mq_sysctl Alexey Gladkov
2022-01-21 13:08       ` [RFC PATCH v3 4/4] ipc: Allow to modify ipc/mq sysctls if CAP_SYS_RESOURCE is present Alexey Gladkov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.