All of lore.kernel.org
 help / color / mirror / Atom feed
From: Nikolay Borisov <kernel-6AxghH7DbtA@public.gmane.org>
To: ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
	Nikolay Borisov <kernel-6AxghH7DbtA@public.gmane.org>,
	operations-/eCPMmvKun9pLGFMi4vTTA@public.gmane.org
Subject: [PATCH 3/4] userns/inotify: Initial implementation of inotify per-userns
Date: Wed, 13 Jul 2016 15:14:12 +0300	[thread overview]
Message-ID: <1468412053-30130-4-git-send-email-kernel@kyup.com> (raw)
In-Reply-To: <1468412053-30130-1-git-send-email-kernel-6AxghH7DbtA@public.gmane.org>

So here is the first version of the hierarchical inotify limits. Changes
include:
 * Added 2 new sysctls:
    - inotify_reserved_user_instances and inotify_reserved_user_watches. These
    control the distribution of instances/watches down the hierarchy. For example, if we
    have an instances/watches limit of 1024/256 and the reserved instances/watches are set
    to 128/32, then at every level of the hierarchy the instances/watches limits are reduced
    by 128/32, so at userns level 1 (e.g. init_user_ns->level_1_user_ns) each user would
    have 896/224 respectively. Currently the defaults are calculated so that at least 8 levels
    of nesting are allowed. These can be set only by the global root user.

 * Changed the core userns code to support adding per-userns/per-user counters;
 these are kept in the nsuser_state structure.

 * Add necessary functionality to inotify to make use of the newly added
 userns infrastructure.

 * Moved the initialization of the inotify_max_user_instances/watches to
 user_namespaces_init so that it's initialised by the time inotify is
 bootstrapped.

Signed-off-by: Nikolay Borisov <kernel-6AxghH7DbtA@public.gmane.org>
---
 fs/notify/inotify/inotify.h      |   2 +
 fs/notify/inotify/inotify_user.c |  93 +++++++++++++++++++++++++++++++++-
 include/linux/fsnotify_backend.h |   3 ++
 include/linux/user_namespace.h   |  45 +++++++++++++++++
 kernel/user_namespace.c          | 106 ++++++++++++++++++++++++++++++++++++++-
 5 files changed, 246 insertions(+), 3 deletions(-)

diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index ed855ef6f077..8ead0a1a3cdb 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -1,6 +1,8 @@
 #include <linux/fsnotify_backend.h>
 #include <linux/inotify.h>
 #include <linux/slab.h> /* struct kmem_cache */
+#include <linux/page_counter.h>
+#include <linux/user_namespace.h>
 
 struct inotify_event_info {
 	struct fsnotify_event fse;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index b8d08d0d0a4d..076a9990eff4 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -48,6 +48,8 @@
 static int inotify_max_user_instances __read_mostly;
 static int inotify_max_queued_events __read_mostly;
 static int inotify_max_user_watches __read_mostly;
+int inotify_reserved_user_instances __read_mostly;
+int inotify_reserved_user_watches   __read_mostly;
 
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 
@@ -82,10 +84,96 @@ struct ctl_table inotify_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero
 	},
+	{
+		.procname	= "reserved_user_instances",
+		.data		= &inotify_reserved_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "reserved_user_watches",
+		.data		= &inotify_reserved_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
 	{ }
 };
 #endif /* CONFIG_SYSCTL */
 
+/*
+ * Set up both inotify counters of @state: chain them to @parent's counters
+ * (no parent for init_user_ns) and apply the limits stored in @ns.
+ */
+static inline void __init_counters(struct nsuser_state *state,
+				   struct nsuser_state *parent,
+				   struct user_namespace *ns)
+{
+	struct page_counter *up_watches = NULL;
+	struct page_counter *up_instances = NULL;
+
+	if (ns != &init_user_ns) {
+		up_watches = &parent->inotify_watches;
+		up_instances = &parent->inotify_instances;
+	}
+	page_counter_init(&state->inotify_watches, up_watches);
+	page_counter_init(&state->inotify_instances, up_instances);
+	page_counter_limit(&state->inotify_watches, ns->inotify_max_user_watches);
+	page_counter_limit(&state->inotify_instances, ns->inotify_max_user_instances);
+}
+
+/* Charge one inotify instance to @uid in @ns, creating the per-user state
+ * on first use. Returns 0, -ENOMEM if the state cannot be allocated, or
+ * -EMFILE if the charge is refused.
+ */
+static noinline int inotify_init_state(struct user_namespace *ns, kuid_t uid)
+{
+	struct nsuser_state *state, *new, *parent_state = NULL;
+	struct page_counter *cnt;
+
+	/* Only the hash accesses need the lock; a state is assumed to stay
+	 * alive for as long as its namespace does. */
+	spin_lock_bh(&nsuser_state_lock);
+	state = get_nsuser_state(ns, uid);
+	if (!state && ns != &init_user_ns)
+		parent_state = get_nsuser_state(ns->parent, ns->owner);
+	spin_unlock_bh(&nsuser_state_lock);
+
+	if (!state) {
+		BUG_ON(ns != &init_user_ns && !parent_state);
+
+		new = kzalloc(sizeof(*new), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+
+		new->uid = uid;
+		new->ns = ns;
+		__init_counters(new, parent_state, ns);
+
+		/* The lock was dropped for the allocation, so another task
+		 * may have inserted this (ns, uid) meanwhile: look again. */
+		spin_lock_bh(&nsuser_state_lock);
+		state = get_nsuser_state(ns, uid);
+		if (!state) {
+			hash_add(nsstate_hash, &new->node, __kuid_val(uid));
+			state = new;
+		}
+		spin_unlock_bh(&nsuser_state_lock);
+		if (state != new)
+			kfree(new);
+	}
+
+	/* The first instance is subject to the limit like any other */
+	if (!page_counter_try_charge(&state->inotify_instances, 1, &cnt))
+		return -EMFILE;
+
+	return 0;
+}
+
+
 static inline __u32 inotify_arg_to_mask(u32 arg)
 {
 	__u32 mask;
@@ -819,8 +907,9 @@ static int __init inotify_user_setup(void)
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
 
 	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
+	/* These reserves should allow for 8 levels of nesting in userns */
+	inotify_reserved_user_instances = 32;
+	inotify_reserved_user_watches = 1024;
 
 	return 0;
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 29f917517299..eb83a10afac7 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -170,6 +170,9 @@ struct fsnotify_group {
 			spinlock_t	idr_lock;
 			struct idr      idr;
 			struct user_struct      *user;
+			struct user_namespace *userns;
+			kuid_t uid; /* id in the userns this group is
+				      associated with */
 		} inotify_data;
 #endif
 #ifdef CONFIG_FANOTIFY
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 8297e5b341d8..3116a2df1cee 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -6,6 +6,9 @@
 #include <linux/ns_common.h>
 #include <linux/sched.h>
 #include <linux/err.h>
+#include <linux/hashtable.h>
+#include <linux/spinlock.h>
+#include <linux/page_counter.h>
 
 #define UID_GID_MAP_MAX_EXTENTS 5
 
@@ -22,6 +25,21 @@ struct uid_gid_map {	/* 64 bytes -- 1 cache line */
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
+#define NSSTATE_HASHTABLE_BITS 10
+extern DECLARE_HASHTABLE(nsstate_hash, NSSTATE_HASHTABLE_BITS);
+extern spinlock_t nsuser_state_lock;
+
+/* Generic struct to hold various per-user/per-namespace state */
+struct nsuser_state {
+	struct hlist_node node; /* entry in nsstate_hash, hashed by uid value */
+	void *ns; /* ns in which uid is valid */
+	kuid_t uid;
+#ifdef CONFIG_INOTIFY_USER
+	struct page_counter inotify_watches; /* How many inotify watches does this user have? */
+	struct page_counter inotify_instances; /* How many inotify devs does this user have opened? */
+#endif
+};
+
 struct user_namespace {
 	struct uid_gid_map	uid_map;
 	struct uid_gid_map	gid_map;
@@ -39,11 +57,28 @@ struct user_namespace {
 	struct key		*persistent_keyring_register;
 	struct rw_semaphore	persistent_keyring_register_sem;
 #endif
+
+#ifdef CONFIG_INOTIFY_USER
+	int inotify_max_user_instances;
+	int inotify_max_user_watches;
+#endif
 };
 
 extern struct user_namespace init_user_ns;
 
 #ifdef CONFIG_USER_NS
+/* Find the state of (@ns, @uid) or NULL; caller holds nsuser_state_lock. */
+static inline struct nsuser_state *get_nsuser_state(struct user_namespace *ns,
+						    kuid_t uid)
+{
+	struct nsuser_state *state;
+
+	lockdep_assert_held(&nsuser_state_lock); /* works on UP, too */
+	hash_for_each_possible(nsstate_hash, state, node, __kuid_val(uid))
+		if (state->ns == ns && uid_eq(state->uid, uid))
+			return state;
+	return NULL;
+}
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
@@ -74,6 +109,16 @@ extern int proc_setgroups_show(struct seq_file *m, void *v);
 extern bool userns_may_setgroups(const struct user_namespace *ns);
 #else
 
+static inline struct nsuser_state *get_nsuser_state(struct user_namespace *ns,
+						    kuid_t uid)
+{
+	struct nsuser_state *state; /* bucket is chosen by uid, as in hash_add() */
+	hash_for_each_possible(nsstate_hash, state, node, __kuid_val(uid))
+		if (uid_eq(uid, state->uid) && state->ns == ns)
+			return state;
+	return NULL;
+}
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	return &init_user_ns;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc211930c..cb51e3607d2d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -22,10 +22,20 @@
 #include <linux/ctype.h>
 #include <linux/projid.h>
 #include <linux/fs_struct.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
 
 static struct kmem_cache *user_ns_cachep __read_mostly;
 static DEFINE_MUTEX(userns_state_mutex);
 
+DEFINE_HASHTABLE(nsstate_hash, NSSTATE_HASHTABLE_BITS);
+DEFINE_SPINLOCK(nsuser_state_lock);
+
+#ifdef CONFIG_INOTIFY_USER
+extern int inotify_reserved_user_instances;
+extern int inotify_reserved_user_watches;
+#endif
+
 static bool new_idmap_permitted(const struct file *file,
 				struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *map);
@@ -60,10 +70,13 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
 int create_user_ns(struct cred *new)
 {
 	struct user_namespace *ns, *parent_ns = new->user_ns;
+	struct nsuser_state *state, *parent_state;
 	kuid_t owner = new->euid;
 	kgid_t group = new->egid;
 	int ret;
-
+#ifdef CONFIG_INOTIFY_USER
+	int tmp;
+#endif
 	if (parent_ns->level > 32)
 		return -EUSERS;
 
@@ -88,9 +101,16 @@ int create_user_ns(struct cred *new)
 	if (!ns)
 		return -ENOMEM;
 
+	state = kmalloc(sizeof(struct nsuser_state), GFP_KERNEL);
+	if (!state) {
+		kmem_cache_free(user_ns_cachep, ns);
+		return -ENOMEM;
+	}
+
 	ret = ns_alloc_inum(&ns->ns);
 	if (ret) {
 		kmem_cache_free(user_ns_cachep, ns);
+		kfree(state);
 		return ret;
 	}
 	ns->ns.ops = &userns_operations;
@@ -101,6 +121,13 @@ int create_user_ns(struct cred *new)
 	ns->level = parent_ns->level + 1;
 	ns->owner = owner;
 	ns->group = group;
+#ifdef CONFIG_INOTIFY_USER
+	tmp = parent_ns->inotify_max_user_instances - inotify_reserved_user_instances;
+	ns->inotify_max_user_instances = max(0, tmp);
+
+	tmp = parent_ns->inotify_max_user_watches - inotify_reserved_user_watches;
+	ns->inotify_max_user_watches = max(0, tmp);
+#endif
 
 	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
 	mutex_lock(&userns_state_mutex);
@@ -112,8 +139,63 @@ int create_user_ns(struct cred *new)
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 	init_rwsem(&ns->persistent_keyring_register_sem);
 #endif
+
+	spin_lock_bh(&nsuser_state_lock);
+	parent_state = get_nsuser_state(parent_ns, owner);
+	spin_unlock_bh(&nsuser_state_lock);
+	if (!parent_state) {
+		struct nsuser_state *grandfather_state;
+
+		spin_lock_bh(&nsuser_state_lock);
+		/* init_user_ns doesn't have a parent */
+		if (parent_ns == &init_user_ns)
+			grandfather_state = get_nsuser_state(parent_ns, parent_ns->owner);
+		else
+			grandfather_state = get_nsuser_state(parent_ns->parent, parent_ns->owner);
+		spin_unlock_bh(&nsuser_state_lock);
+
+		state->uid = owner;
+		state->ns = parent_ns;
+
+#ifdef CONFIG_INOTIFY_USER
+		page_counter_init(&state->inotify_watches,
+				  &grandfather_state->inotify_watches);
+		page_counter_init(&state->inotify_instances,
+				  &grandfather_state->inotify_instances);
+		page_counter_limit(&state->inotify_watches,
+				   parent_ns->inotify_max_user_watches);
+		page_counter_limit(&state->inotify_instances,
+				   parent_ns->inotify_max_user_instances);
+#endif
+
+		spin_lock_bh(&nsuser_state_lock);
+		hash_add(nsstate_hash, &state->node, __kuid_val(owner));
+		spin_unlock_bh(&nsuser_state_lock);
+	}
+
 	return 0;
 }
+/* Drop every per-user state that belongs to @ns. All processes of the
+ * namespace should be dead by now, so no counter may still hold a charge.
+ * Called from free_user_ns() with nsuser_state_lock held.
+ */
+static void free_nsuser_state(struct user_namespace *ns)
+{
+	int bkt;
+	struct hlist_node *tmp;
+	struct nsuser_state *state;
+
+	hash_for_each_safe(nsstate_hash, bkt, tmp, state, node) {
+		if (state->ns != ns)
+			continue;
+#ifdef CONFIG_INOTIFY_USER	/* the counters exist only with inotify */
+		BUG_ON(page_counter_read(&state->inotify_instances));
+		BUG_ON(page_counter_read(&state->inotify_watches));
+#endif
+		hash_del(&state->node);
+		kfree(state);
+	}
+}
 
 int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
 {
@@ -141,6 +223,10 @@ void free_user_ns(struct user_namespace *ns)
 
 	do {
 		parent = ns->parent;
+
+		spin_lock_bh(&nsuser_state_lock);
+		free_nsuser_state(ns);
+		spin_unlock_bh(&nsuser_state_lock);
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 		key_put(ns->persistent_keyring_register);
 #endif
@@ -1000,7 +1086,25 @@ const struct proc_ns_operations userns_operations = {
 
 static __init int user_namespaces_init(void)
 {
+	struct nsuser_state *root_state = kmalloc(sizeof(struct nsuser_state),
+						  GFP_KERNEL);
+
+	init_user_ns.inotify_max_user_instances = 256;
+	init_user_ns.inotify_max_user_watches = 8192;
+
+#ifdef CONFIG_INOTIFY_USE
+	page_counter_init(&root_state->inotify_watches, NULL);
+	page_counter_init(&root_state->inotify_instances, NULL);
+	page_counter_limit(&root_state->inotify_watches,
+			   init_user_ns.inotify_max_user_watches);
+	page_counter_limit(&root_state->inotify_instances,
+					   init_user_ns.inotify_max_user_instances);
+#endif
+	root_state->uid = GLOBAL_ROOT_UID;
+	root_state->ns = &init_user_ns;
+	hash_add(nsstate_hash, &root_state->node, __kuid_val(GLOBAL_ROOT_UID));
 	user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+
 	return 0;
 }
 subsys_initcall(user_namespaces_init);
-- 
2.5.0

  parent reply	other threads:[~2016-07-13 12:14 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-07-13 12:14 [RFC PATCH 0/4 v3] Inotify limits per usernamespace Nikolay Borisov
     [not found] ` <1468412053-30130-1-git-send-email-kernel-6AxghH7DbtA@public.gmane.org>
2016-07-13 12:14   ` [PATCH 1/4] hashtable: Add __HASHTABLE_INITIALIZER Nikolay Borisov
2016-07-13 12:14   ` [PATCH 2/4] misc: Rename the HASH_SIZE macro Nikolay Borisov
2016-07-13 12:14   ` Nikolay Borisov [this message]
2016-07-13 12:14   ` [PATCH 4/4] inotify: Convert to using new userns infrastructure Nikolay Borisov
2016-07-20  0:41   ` [RFC PATCH 0/4 v3] Inotify limits per usernamespace Eric W. Biederman
  -- strict thread matches above, loose matches on Subject: below --
2016-06-29 13:37 [RFC PATCH 0/4 v2] " Nikolay Borisov
     [not found] ` <1467207425-22072-1-git-send-email-kernel-6AxghH7DbtA@public.gmane.org>
2016-06-29 13:37   ` [PATCH 3/4] userns/inotify: Initial implementation of inotify per-userns Nikolay Borisov
     [not found]     ` <1467207425-22072-4-git-send-email-kernel-6AxghH7DbtA@public.gmane.org>
2016-07-06 17:29       ` Eric W. Biederman
     [not found]         ` <87mvluekun.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2016-07-07 13:40           ` Nikolay Borisov
     [not found]             ` <577E5BC2.1000208-6AxghH7DbtA@public.gmane.org>
2016-07-07 15:27               ` Eric W. Biederman
     [not found]                 ` <87inwh31v6.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2016-07-08 11:43                   ` Nikolay Borisov
     [not found]                     ` <577F91C9.9060903-6AxghH7DbtA@public.gmane.org>
2016-07-08 15:08                       ` Eric W. Biederman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1468412053-30130-4-git-send-email-kernel@kyup.com \
    --to=kernel-6axghh7dbta@public.gmane.org \
    --cc=containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org \
    --cc=ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org \
    --cc=operations-/eCPMmvKun9pLGFMi4vTTA@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.