All of lore.kernel.org
 help / color / mirror / Atom feed
* userns: targeted capabilities v3
@ 2011-01-10 21:11 Serge E. Hallyn
  2011-01-10 21:13 ` [PATCH 1/7] Add a user_namespace as creator/owner of uts_namespace Serge E. Hallyn
                   ` (7 more replies)
  0 siblings, 8 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:11 UTC (permalink / raw)
  To: LSM
  Cc: James Morris, Kees Cook, containers, kernel list,
	Eric W. Biederman, Alexey Dobriyan, Michael Kerrisk

Following is the next version of my user namespace patchset.

The core of the set is patch 2, originally conceived and
implemented by Eric Biederman.  The concept is to target
capabilities at user namespaces.  A task's capabilities are
now contextualized as follows (previously, capabilities had
no context):

1. For a task in the initial user namespace, the calculated
capabilities (pi, pe, pp) are available to act upon any
user namespace.

2. For a task in a child user namespace, the calculated
capabilities are available to act only on its own or any
descendent user namespace.  It has no capabilities to any
parent or unrelated user namespaces.

3. If a user A creates a new user namespace, that user has
all capabilities to that new user namespace and any of its
descendents.  (Contrast this with a user namespace created
by another user B in the same user namespace, to which this
user A has only his calculated capabilities)

All existing 'capable' checks are automatically converted to
checks against the initial user namespace.  The rest of the
patches begin to enable capabilities in child user namespaces
to setuid, setgid, set hostnames, kill tasks, and do ptrace.

My next step would be to re-introduce a part of a several year
old patchset which assigns a userns to a superblock (and hence
to inodes), and grants 'user other' permissions to any task
whose uid does not map to the target userns.  (By default, this
will be all but the initial userns)

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH 1/7] Add a user_namespace as creator/owner of uts_namespace
       [not found] ` <20110110211135.GA22446-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
@ 2011-01-10 21:13   ` Serge E. Hallyn
  2011-01-10 21:13   ` [PATCH 2/7] security: Make capabilities relative to the user namespace Serge E. Hallyn
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

copy_process() handles CLONE_NEWUSER before the rest of the
namespaces.  So in the case of clone(CLONE_NEWUSER|CLONE_NEWUTS)
the new uts namespace will have the new user namespace as its
owner.  That is what we want, since we want root in that new
userns to be able to have privilege over it.

Signed-off-by: Serge E. Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>
---
 include/linux/utsname.h |    3 +++
 init/version.c          |    2 ++
 kernel/nsproxy.c        |    3 +++
 kernel/user.c           |    8 ++++++--
 kernel/utsname.c        |    4 ++++
 5 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 69f3997..85171be 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -37,9 +37,12 @@ struct new_utsname {
 #include <linux/nsproxy.h>
 #include <linux/err.h>
 
+struct user_namespace;
+
 struct uts_namespace {
 	struct kref kref;
 	struct new_utsname name;
+	struct user_namespace *user_ns;
 };
 extern struct uts_namespace init_uts_ns;
 
diff --git a/init/version.c b/init/version.c
index adff586..97bb86f 100644
--- a/init/version.c
+++ b/init/version.c
@@ -21,6 +21,7 @@ extern int version_string(LINUX_VERSION_CODE);
 int version_string(LINUX_VERSION_CODE);
 #endif
 
+extern struct user_namespace init_user_ns;
 struct uts_namespace init_uts_ns = {
 	.kref = {
 		.refcount	= ATOMIC_INIT(2),
@@ -33,6 +34,7 @@ struct uts_namespace init_uts_ns = {
 		.machine	= UTS_MACHINE,
 		.domainname	= UTS_DOMAINNAME,
 	},
+	.user_ns = &init_user_ns,
 };
 EXPORT_SYMBOL_GPL(init_uts_ns);
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c0..5a22dcf 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -74,6 +74,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		err = PTR_ERR(new_nsp->uts_ns);
 		goto out_uts;
 	}
+	put_user_ns(new_nsp->uts_ns->user_ns);
+	new_nsp->uts_ns->user_ns = task_cred_xxx(tsk, user)->user_ns;
+	get_user_ns(new_nsp->uts_ns->user_ns);
 
 	new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
 	if (IS_ERR(new_nsp->ipc_ns)) {
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca..9e03e9c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
 #include <linux/module.h>
 #include <linux/user_namespace.h>
 
+/*
+ * userns count is 1 for root user, 1 for init_uts_ns,
+ * and 1 for... ?
+ */
 struct user_namespace init_user_ns = {
 	.kref = {
-		.refcount	= ATOMIC_INIT(2),
+		.refcount	= ATOMIC_INIT(3),
 	},
 	.creator = &root_user,
 };
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
  */
 static DEFINE_SPINLOCK(uidhash_lock);
 
-/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
+/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
 struct user_struct root_user = {
 	.__count	= ATOMIC_INIT(2),
 	.processes	= ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b..a7b3a8d 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/user_namespace.h>
 
 static struct uts_namespace *create_uts_ns(void)
 {
@@ -40,6 +41,8 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
 
 	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+	ns->user_ns = old_ns->user_ns;
+	get_user_ns(ns->user_ns);
 	up_read(&uts_sem);
 	return ns;
 }
@@ -71,5 +74,6 @@ void free_uts_ns(struct kref *kref)
 	struct uts_namespace *ns;
 
 	ns = container_of(kref, struct uts_namespace, kref);
+	put_user_ns(ns->user_ns);
 	kfree(ns);
 }
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 1/7] Add a user_namespace as creator/owner of uts_namespace
  2011-01-10 21:11 userns: targeted capabilities v3 Serge E. Hallyn
@ 2011-01-10 21:13 ` Serge E. Hallyn
  2011-01-10 21:13 ` [PATCH 2/7] security: Make capabilities relative to the user namespace Serge E. Hallyn
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: LSM, James Morris, Kees Cook, containers, kernel list,
	Eric W. Biederman, Alexey Dobriyan, Michael Kerrisk

copy_process() handles CLONE_NEWUSER before the rest of the
namespaces.  So in the case of clone(CLONE_NEWUSER|CLONE_NEWUTS)
the new uts namespace will have the new user namespace as its
owner.  That is what we want, since we want root in that new
userns to be able to have privilege over it.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 include/linux/utsname.h |    3 +++
 init/version.c          |    2 ++
 kernel/nsproxy.c        |    3 +++
 kernel/user.c           |    8 ++++++--
 kernel/utsname.c        |    4 ++++
 5 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 69f3997..85171be 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -37,9 +37,12 @@ struct new_utsname {
 #include <linux/nsproxy.h>
 #include <linux/err.h>
 
+struct user_namespace;
+
 struct uts_namespace {
 	struct kref kref;
 	struct new_utsname name;
+	struct user_namespace *user_ns;
 };
 extern struct uts_namespace init_uts_ns;
 
diff --git a/init/version.c b/init/version.c
index adff586..97bb86f 100644
--- a/init/version.c
+++ b/init/version.c
@@ -21,6 +21,7 @@ extern int version_string(LINUX_VERSION_CODE);
 int version_string(LINUX_VERSION_CODE);
 #endif
 
+extern struct user_namespace init_user_ns;
 struct uts_namespace init_uts_ns = {
 	.kref = {
 		.refcount	= ATOMIC_INIT(2),
@@ -33,6 +34,7 @@ struct uts_namespace init_uts_ns = {
 		.machine	= UTS_MACHINE,
 		.domainname	= UTS_DOMAINNAME,
 	},
+	.user_ns = &init_user_ns,
 };
 EXPORT_SYMBOL_GPL(init_uts_ns);
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c0..5a22dcf 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -74,6 +74,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		err = PTR_ERR(new_nsp->uts_ns);
 		goto out_uts;
 	}
+	put_user_ns(new_nsp->uts_ns->user_ns);
+	new_nsp->uts_ns->user_ns = task_cred_xxx(tsk, user)->user_ns;
+	get_user_ns(new_nsp->uts_ns->user_ns);
 
 	new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
 	if (IS_ERR(new_nsp->ipc_ns)) {
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca..9e03e9c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
 #include <linux/module.h>
 #include <linux/user_namespace.h>
 
+/*
+ * userns count is 1 for root user, 1 for init_uts_ns,
+ * and 1 for... ?
+ */
 struct user_namespace init_user_ns = {
 	.kref = {
-		.refcount	= ATOMIC_INIT(2),
+		.refcount	= ATOMIC_INIT(3),
 	},
 	.creator = &root_user,
 };
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
  */
 static DEFINE_SPINLOCK(uidhash_lock);
 
-/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
+/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
 struct user_struct root_user = {
 	.__count	= ATOMIC_INIT(2),
 	.processes	= ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b..a7b3a8d 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/user_namespace.h>
 
 static struct uts_namespace *create_uts_ns(void)
 {
@@ -40,6 +41,8 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
 
 	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+	ns->user_ns = old_ns->user_ns;
+	get_user_ns(ns->user_ns);
 	up_read(&uts_sem);
 	return ns;
 }
@@ -71,5 +74,6 @@ void free_uts_ns(struct kref *kref)
 	struct uts_namespace *ns;
 
 	ns = container_of(kref, struct uts_namespace, kref);
+	put_user_ns(ns->user_ns);
 	kfree(ns);
 }
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 2/7] security: Make capabilities relative to the user namespace.
       [not found] ` <20110110211135.GA22446-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
  2011-01-10 21:13   ` [PATCH 1/7] Add a user_namespace as creator/owner of uts_namespace Serge E. Hallyn
@ 2011-01-10 21:13   ` Serge E. Hallyn
  2011-01-10 21:13   ` [PATCH 3/7] allow sethostname in a container Serge E. Hallyn
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk


- Introduce ns_capable to test for a capability in a non-default
  user namespace.
- Teach cap_capable to handle capabilities in a non-default
  user namespace.

The motivation is to get to the unprivileged creation of new
namespaces.  It looks like this gets us 90% of the way there, with
only potential uid confusion issues left.

I still need to handle getting all caps after creation but otherwise I
think I have a good starter patch that achieves all of your goals.

Changelog:
	11/05/2010: [serge] add apparmor
	12/14/2010: [serge] fix capabilities to created user namespaces
	Without this, if user serge creates a user_ns, he won't have
	capabilities to the user_ns he created.  THis is because we
	were first checking whether his effective caps had the caps
	he needed and returning -EPERM if not, and THEN checking whether
	he was the creator.  Reverse those checks.
	12/16/2010: [serge] security_real_capable needs ns argument in !security case

Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Serge E. Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>
---
 include/linux/capability.h |    7 +++++--
 include/linux/security.h   |   22 ++++++++++++----------
 kernel/capability.c        |   22 ++++++++++++++++++++--
 security/apparmor/lsm.c    |    5 +++--
 security/commoncap.c       |   40 +++++++++++++++++++++++++++++++++-------
 security/security.c        |   12 ++++++------
 security/selinux/hooks.c   |   14 +++++++++-----
 7 files changed, 88 insertions(+), 34 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index fb16a36..7b3be11 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -544,7 +544,7 @@ extern const kernel_cap_t __cap_init_eff_set;
  *
  * Note that this does not set PF_SUPERPRIV on the task.
  */
-#define has_capability(t, cap) (security_real_capable((t), (cap)) == 0)
+#define has_capability(t, cap) (security_real_capable((t), &init_user_ns, (cap)) == 0)
 
 /**
  * has_capability_noaudit - Determine if a task has a superior capability available (unaudited)
@@ -558,9 +558,12 @@ extern const kernel_cap_t __cap_init_eff_set;
  * Note that this does not set PF_SUPERPRIV on the task.
  */
 #define has_capability_noaudit(t, cap) \
-	(security_real_capable_noaudit((t), (cap)) == 0)
+	(security_real_capable_noaudit((t), &init_user_ns, (cap)) == 0)
 
+struct user_namespace;
+extern struct user_namespace init_user_ns;
 extern int capable(int cap);
+extern int ns_capable(struct user_namespace *ns, int cap);
 
 /* audit system wants to get cap info from files as well */
 struct dentry;
diff --git a/include/linux/security.h b/include/linux/security.h
index c642bb8..f31bffd 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -46,13 +46,14 @@
 
 struct ctl_table;
 struct audit_krule;
+struct user_namespace;
 
 /*
  * These functions are in security/capability.c and are used
  * as the default capabilities functions
  */
 extern int cap_capable(struct task_struct *tsk, const struct cred *cred,
-		       int cap, int audit);
+		       struct user_namespace *ns, int cap, int audit);
 extern int cap_settime(struct timespec *ts, struct timezone *tz);
 extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
@@ -1254,6 +1255,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	credentials.
  *	@tsk contains the task_struct for the process.
  *	@cred contains the credentials to use.
+ *      @ns contains the user namespace we want the capability in
  *	@cap contains the capability <include/linux/capability.h>.
  *	@audit: Whether to write an audit message or not
  *	Return 0 if the capability is granted for @tsk.
@@ -1382,7 +1384,7 @@ struct security_operations {
 		       const kernel_cap_t *inheritable,
 		       const kernel_cap_t *permitted);
 	int (*capable) (struct task_struct *tsk, const struct cred *cred,
-			int cap, int audit);
+			struct user_namespace *ns, int cap, int audit);
 	int (*sysctl) (struct ctl_table *table, int op);
 	int (*quotactl) (int cmds, int type, int id, struct super_block *sb);
 	int (*quota_on) (struct dentry *dentry);
@@ -1662,9 +1664,9 @@ int security_capset(struct cred *new, const struct cred *old,
 		    const kernel_cap_t *effective,
 		    const kernel_cap_t *inheritable,
 		    const kernel_cap_t *permitted);
-int security_capable(int cap);
-int security_real_capable(struct task_struct *tsk, int cap);
-int security_real_capable_noaudit(struct task_struct *tsk, int cap);
+int security_capable(struct user_namespace *ns, int cap);
+int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap);
+int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap);
 int security_sysctl(struct ctl_table *table, int op);
 int security_quotactl(int cmds, int type, int id, struct super_block *sb);
 int security_quota_on(struct dentry *dentry);
@@ -1858,26 +1860,26 @@ static inline int security_capset(struct cred *new,
 
 static inline int security_capable(int cap)
 {
-	return cap_capable(current, current_cred(), cap, SECURITY_CAP_AUDIT);
+	return cap_capable(current, current_cred(), &init_user_ns, cap, SECURITY_CAP_AUDIT);
 }
 
-static inline int security_real_capable(struct task_struct *tsk, int cap)
+static inline int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap)
 {
 	int ret;
 
 	rcu_read_lock();
-	ret = cap_capable(tsk, __task_cred(tsk), cap, SECURITY_CAP_AUDIT);
+	ret = cap_capable(tsk, __task_cred(tsk), ns, cap, SECURITY_CAP_AUDIT);
 	rcu_read_unlock();
 	return ret;
 }
 
 static inline
-int security_real_capable_noaudit(struct task_struct *tsk, int cap)
+int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap)
 {
 	int ret;
 
 	rcu_read_lock();
-	ret = cap_capable(tsk, __task_cred(tsk), cap,
+	ret = cap_capable(tsk, __task_cred(tsk), ns, cap,
 			       SECURITY_CAP_NOAUDIT);
 	rcu_read_unlock();
 	return ret;
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303..744dd6e 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 
 /*
@@ -301,15 +302,32 @@ error:
  */
 int capable(int cap)
 {
+	return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns:  The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+int ns_capable(struct user_namespace *ns, int cap)
+{
 	if (unlikely(!cap_valid(cap))) {
 		printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
 		BUG();
 	}
 
-	if (security_capable(cap) == 0) {
+	if (security_capable(ns, cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
 	return 0;
 }
-EXPORT_SYMBOL(capable);
+EXPORT_SYMBOL(ns_capable);
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index b7106f1..b37c2cd 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -22,6 +22,7 @@
 #include <linux/ctype.h>
 #include <linux/sysctl.h>
 #include <linux/audit.h>
+#include <linux/user_namespace.h>
 #include <net/sock.h>
 
 #include "include/apparmor.h"
@@ -136,11 +137,11 @@ static int apparmor_capget(struct task_struct *target, kernel_cap_t *effective,
 }
 
 static int apparmor_capable(struct task_struct *task, const struct cred *cred,
-			    int cap, int audit)
+			    struct user_namespace *ns, int cap, int audit)
 {
 	struct aa_profile *profile;
 	/* cap_capable returns 0 on success, else -EPERM */
-	int error = cap_capable(task, cred, cap, audit);
+	int error = cap_capable(task, cred, ns, cap, audit);
 	if (!error) {
 		profile = aa_cred_profile(cred);
 		if (!unconfined(profile))
diff --git a/security/commoncap.c b/security/commoncap.c
index 64c2ed9..1e9dd00 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/prctl.h>
 #include <linux/securebits.h>
+#include <linux/user_namespace.h>
 
 /*
  * If a non-root user executes a setuid-root binary in
@@ -68,6 +69,7 @@ EXPORT_SYMBOL(cap_netlink_recv);
  * cap_capable - Determine whether a task has a particular effective capability
  * @tsk: The task to query
  * @cred: The credentials to use
+ * @ns:  The user namespace in which we need the capability
  * @cap: The capability to check for
  * @audit: Whether to write an audit message or not
  *
@@ -79,10 +81,32 @@ EXPORT_SYMBOL(cap_netlink_recv);
  * cap_has_capability() returns 0 when a task has a capability, but the
  * kernel's capable() and has_capability() returns 1 for this case.
  */
-int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap,
-		int audit)
+int cap_capable(struct task_struct *tsk, const struct cred *cred,
+		struct user_namespace *targ_ns, int cap, int audit)
 {
-	return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
+	for (;;) {
+		/* The creator of the user namespace has all caps. */
+		if (targ_ns->creator == cred->user)
+			return 0;
+
+		/* Do we have the necessary capabilities? */
+		if (targ_ns == cred->user->user_ns)
+			return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
+
+		/* Have we tried all of the parent namespaces? */
+		if (targ_ns == &init_user_ns)
+			return -EPERM;
+
+		/* If you have the capability in a parent user ns you have it
+		 * in the over all children user namespaces as well, so see
+		 * if this process has the capability in the parent user
+		 * namespace.
+		 */
+		targ_ns = targ_ns->creator->user_ns;
+	}
+
+	/* We never get here */
+	return -EPERM;
 }
 
 /**
@@ -177,7 +201,8 @@ static inline int cap_inh_is_capped(void)
 	/* they are so limited unless the current task has the CAP_SETPCAP
 	 * capability
 	 */
-	if (cap_capable(current, current_cred(), CAP_SETPCAP,
+	if (cap_capable(current, current_cred(),
+			current_cred()->user->user_ns, CAP_SETPCAP,
 			SECURITY_CAP_AUDIT) == 0)
 		return 0;
 	return 1;
@@ -829,7 +854,8 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		     & (new->securebits ^ arg2))			/*[1]*/
 		    || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
 		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
-		    || (cap_capable(current, current_cred(), CAP_SETPCAP,
+		    || (cap_capable(current, current_cred(),
+				    current_cred()->user->user_ns, CAP_SETPCAP,
 				    SECURITY_CAP_AUDIT) != 0)		/*[4]*/
 			/*
 			 * [1] no changing of bits that are locked
@@ -894,7 +920,7 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
 {
 	int cap_sys_admin = 0;
 
-	if (cap_capable(current, current_cred(), CAP_SYS_ADMIN,
+	if (cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_ADMIN,
 			SECURITY_CAP_NOAUDIT) == 0)
 		cap_sys_admin = 1;
 	return __vm_enough_memory(mm, pages, cap_sys_admin);
@@ -921,7 +947,7 @@ int cap_file_mmap(struct file *file, unsigned long reqprot,
 	int ret = 0;
 
 	if (addr < dac_mmap_min_addr) {
-		ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO,
+		ret = cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_RAWIO,
 				  SECURITY_CAP_AUDIT);
 		/* set PF_SUPERPRIV if it turns out we allow the low mmap */
 		if (ret == 0)
diff --git a/security/security.c b/security/security.c
index 739e403..0042770 100644
--- a/security/security.c
+++ b/security/security.c
@@ -154,30 +154,30 @@ int security_capset(struct cred *new, const struct cred *old,
 				    effective, inheritable, permitted);
 }
 
-int security_capable(int cap)
+int security_capable(struct user_namespace *ns, int cap)
 {
-	return security_ops->capable(current, current_cred(), cap,
+	return security_ops->capable(current, current_cred(), ns, cap,
 				     SECURITY_CAP_AUDIT);
 }
 
-int security_real_capable(struct task_struct *tsk, int cap)
+int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap)
 {
 	const struct cred *cred;
 	int ret;
 
 	cred = get_task_cred(tsk);
-	ret = security_ops->capable(tsk, cred, cap, SECURITY_CAP_AUDIT);
+	ret = security_ops->capable(tsk, cred, ns, cap, SECURITY_CAP_AUDIT);
 	put_cred(cred);
 	return ret;
 }
 
-int security_real_capable_noaudit(struct task_struct *tsk, int cap)
+int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap)
 {
 	const struct cred *cred;
 	int ret;
 
 	cred = get_task_cred(tsk);
-	ret = security_ops->capable(tsk, cred, cap, SECURITY_CAP_NOAUDIT);
+	ret = security_ops->capable(tsk, cred, ns, cap, SECURITY_CAP_NOAUDIT);
 	put_cred(cred);
 	return ret;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index e276eb4..e1f8385 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -77,6 +77,7 @@
 #include <linux/mutex.h>
 #include <linux/posix-timers.h>
 #include <linux/syslog.h>
+#include <linux/user_namespace.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -1423,6 +1424,7 @@ static int current_has_perm(const struct task_struct *tsk,
 /* Check whether a task is allowed to use a capability. */
 static int task_has_capability(struct task_struct *tsk,
 			       const struct cred *cred,
+			       struct user_namespace *ns,
 			       int cap, int audit)
 {
 	struct common_audit_data ad;
@@ -1851,15 +1853,15 @@ static int selinux_capset(struct cred *new, const struct cred *old,
  */
 
 static int selinux_capable(struct task_struct *tsk, const struct cred *cred,
-			   int cap, int audit)
+			   struct user_namespace *ns, int cap, int audit)
 {
 	int rc;
 
-	rc = cap_capable(tsk, cred, cap, audit);
+	rc = cap_capable(tsk, cred, ns, cap, audit);
 	if (rc)
 		return rc;
 
-	return task_has_capability(tsk, cred, cap, audit);
+	return task_has_capability(tsk, cred, ns, cap, audit);
 }
 
 static int selinux_sysctl_get_sid(ctl_table *table, u16 tclass, u32 *sid)
@@ -2012,7 +2014,8 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
 {
 	int rc, cap_sys_admin = 0;
 
-	rc = selinux_capable(current, current_cred(), CAP_SYS_ADMIN,
+	rc = selinux_capable(current, current_cred(),
+			     &init_user_ns, CAP_SYS_ADMIN,
 			     SECURITY_CAP_NOAUDIT);
 	if (rc == 0)
 		cap_sys_admin = 1;
@@ -2829,7 +2832,8 @@ static int selinux_inode_getsecurity(const struct inode *inode, const char *name
 	 * and lack of permission just means that we fall back to the
 	 * in-core context value, not a denial.
 	 */
-	error = selinux_capable(current, current_cred(), CAP_MAC_ADMIN,
+	error = selinux_capable(current, current_cred(),
+				&init_user_ns, CAP_MAC_ADMIN,
 				SECURITY_CAP_NOAUDIT);
 	if (!error)
 		error = security_sid_to_context_force(isec->sid, &context,
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 2/7] security: Make capabilities relative to the user namespace.
  2011-01-10 21:11 userns: targeted capabilities v3 Serge E. Hallyn
  2011-01-10 21:13 ` [PATCH 1/7] Add a user_namespace as creator/owner of uts_namespace Serge E. Hallyn
@ 2011-01-10 21:13 ` Serge E. Hallyn
  2011-01-10 21:13 ` [PATCH 3/7] allow sethostname in a container Serge E. Hallyn
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: LSM, James Morris, Kees Cook, containers, kernel list,
	Eric W. Biederman, Alexey Dobriyan, Michael Kerrisk


- Introduce ns_capable to test for a capability in a non-default
  user namespace.
- Teach cap_capable to handle capabilities in a non-default
  user namespace.

The motivation is to get to the unprivileged creation of new
namespaces.  It looks like this gets us 90% of the way there, with
only potential uid confusion issues left.

I still need to handle getting all caps after creation but otherwise I
think I have a good starter patch that achieves all of your goals.

Changelog:
	11/05/2010: [serge] add apparmor
	12/14/2010: [serge] fix capabilities to created user namespaces
	Without this, if user serge creates a user_ns, he won't have
	capabilities to the user_ns he created.  THis is because we
	were first checking whether his effective caps had the caps
	he needed and returning -EPERM if not, and THEN checking whether
	he was the creator.  Reverse those checks.
	12/16/2010: [serge] security_real_capable needs ns argument in !security case

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 include/linux/capability.h |    7 +++++--
 include/linux/security.h   |   22 ++++++++++++----------
 kernel/capability.c        |   22 ++++++++++++++++++++--
 security/apparmor/lsm.c    |    5 +++--
 security/commoncap.c       |   40 +++++++++++++++++++++++++++++++++-------
 security/security.c        |   12 ++++++------
 security/selinux/hooks.c   |   14 +++++++++-----
 7 files changed, 88 insertions(+), 34 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index fb16a36..7b3be11 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -544,7 +544,7 @@ extern const kernel_cap_t __cap_init_eff_set;
  *
  * Note that this does not set PF_SUPERPRIV on the task.
  */
-#define has_capability(t, cap) (security_real_capable((t), (cap)) == 0)
+#define has_capability(t, cap) (security_real_capable((t), &init_user_ns, (cap)) == 0)
 
 /**
  * has_capability_noaudit - Determine if a task has a superior capability available (unaudited)
@@ -558,9 +558,12 @@ extern const kernel_cap_t __cap_init_eff_set;
  * Note that this does not set PF_SUPERPRIV on the task.
  */
 #define has_capability_noaudit(t, cap) \
-	(security_real_capable_noaudit((t), (cap)) == 0)
+	(security_real_capable_noaudit((t), &init_user_ns, (cap)) == 0)
 
+struct user_namespace;
+extern struct user_namespace init_user_ns;
 extern int capable(int cap);
+extern int ns_capable(struct user_namespace *ns, int cap);
 
 /* audit system wants to get cap info from files as well */
 struct dentry;
diff --git a/include/linux/security.h b/include/linux/security.h
index c642bb8..f31bffd 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -46,13 +46,14 @@
 
 struct ctl_table;
 struct audit_krule;
+struct user_namespace;
 
 /*
  * These functions are in security/capability.c and are used
  * as the default capabilities functions
  */
 extern int cap_capable(struct task_struct *tsk, const struct cred *cred,
-		       int cap, int audit);
+		       struct user_namespace *ns, int cap, int audit);
 extern int cap_settime(struct timespec *ts, struct timezone *tz);
 extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
@@ -1254,6 +1255,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	credentials.
  *	@tsk contains the task_struct for the process.
  *	@cred contains the credentials to use.
+ *      @ns contains the user namespace we want the capability in
  *	@cap contains the capability <include/linux/capability.h>.
  *	@audit: Whether to write an audit message or not
  *	Return 0 if the capability is granted for @tsk.
@@ -1382,7 +1384,7 @@ struct security_operations {
 		       const kernel_cap_t *inheritable,
 		       const kernel_cap_t *permitted);
 	int (*capable) (struct task_struct *tsk, const struct cred *cred,
-			int cap, int audit);
+			struct user_namespace *ns, int cap, int audit);
 	int (*sysctl) (struct ctl_table *table, int op);
 	int (*quotactl) (int cmds, int type, int id, struct super_block *sb);
 	int (*quota_on) (struct dentry *dentry);
@@ -1662,9 +1664,9 @@ int security_capset(struct cred *new, const struct cred *old,
 		    const kernel_cap_t *effective,
 		    const kernel_cap_t *inheritable,
 		    const kernel_cap_t *permitted);
-int security_capable(int cap);
-int security_real_capable(struct task_struct *tsk, int cap);
-int security_real_capable_noaudit(struct task_struct *tsk, int cap);
+int security_capable(struct user_namespace *ns, int cap);
+int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap);
+int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap);
 int security_sysctl(struct ctl_table *table, int op);
 int security_quotactl(int cmds, int type, int id, struct super_block *sb);
 int security_quota_on(struct dentry *dentry);
@@ -1858,26 +1860,26 @@ static inline int security_capset(struct cred *new,
 
 static inline int security_capable(int cap)
 {
-	return cap_capable(current, current_cred(), cap, SECURITY_CAP_AUDIT);
+	return cap_capable(current, current_cred(), &init_user_ns, cap, SECURITY_CAP_AUDIT);
 }
 
-static inline int security_real_capable(struct task_struct *tsk, int cap)
+static inline int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap)
 {
 	int ret;
 
 	rcu_read_lock();
-	ret = cap_capable(tsk, __task_cred(tsk), cap, SECURITY_CAP_AUDIT);
+	ret = cap_capable(tsk, __task_cred(tsk), ns, cap, SECURITY_CAP_AUDIT);
 	rcu_read_unlock();
 	return ret;
 }
 
 static inline
-int security_real_capable_noaudit(struct task_struct *tsk, int cap)
+int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap)
 {
 	int ret;
 
 	rcu_read_lock();
-	ret = cap_capable(tsk, __task_cred(tsk), cap,
+	ret = cap_capable(tsk, __task_cred(tsk), ns, cap,
 			       SECURITY_CAP_NOAUDIT);
 	rcu_read_unlock();
 	return ret;
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303..744dd6e 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 
 /*
@@ -301,15 +302,32 @@ error:
  */
 int capable(int cap)
 {
+	return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns:  The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+int ns_capable(struct user_namespace *ns, int cap)
+{
 	if (unlikely(!cap_valid(cap))) {
 		printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
 		BUG();
 	}
 
-	if (security_capable(cap) == 0) {
+	if (security_capable(ns, cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
 	return 0;
 }
-EXPORT_SYMBOL(capable);
+EXPORT_SYMBOL(ns_capable);
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index b7106f1..b37c2cd 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -22,6 +22,7 @@
 #include <linux/ctype.h>
 #include <linux/sysctl.h>
 #include <linux/audit.h>
+#include <linux/user_namespace.h>
 #include <net/sock.h>
 
 #include "include/apparmor.h"
@@ -136,11 +137,11 @@ static int apparmor_capget(struct task_struct *target, kernel_cap_t *effective,
 }
 
 static int apparmor_capable(struct task_struct *task, const struct cred *cred,
-			    int cap, int audit)
+			    struct user_namespace *ns, int cap, int audit)
 {
 	struct aa_profile *profile;
 	/* cap_capable returns 0 on success, else -EPERM */
-	int error = cap_capable(task, cred, cap, audit);
+	int error = cap_capable(task, cred, ns, cap, audit);
 	if (!error) {
 		profile = aa_cred_profile(cred);
 		if (!unconfined(profile))
diff --git a/security/commoncap.c b/security/commoncap.c
index 64c2ed9..1e9dd00 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/prctl.h>
 #include <linux/securebits.h>
+#include <linux/user_namespace.h>
 
 /*
  * If a non-root user executes a setuid-root binary in
@@ -68,6 +69,7 @@ EXPORT_SYMBOL(cap_netlink_recv);
  * cap_capable - Determine whether a task has a particular effective capability
  * @tsk: The task to query
  * @cred: The credentials to use
+ * @ns:  The user namespace in which we need the capability
  * @cap: The capability to check for
  * @audit: Whether to write an audit message or not
  *
@@ -79,10 +81,32 @@ EXPORT_SYMBOL(cap_netlink_recv);
  * cap_has_capability() returns 0 when a task has a capability, but the
  * kernel's capable() and has_capability() returns 1 for this case.
  */
-int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap,
-		int audit)
+int cap_capable(struct task_struct *tsk, const struct cred *cred,
+		struct user_namespace *targ_ns, int cap, int audit)
 {
-	return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
+	for (;;) {
+		/* The creator of the user namespace has all caps. */
+		if (targ_ns->creator == cred->user)
+			return 0;
+
+		/* Do we have the necessary capabilities? */
+		if (targ_ns == cred->user->user_ns)
+			return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
+
+		/* Have we tried all of the parent namespaces? */
+		if (targ_ns == &init_user_ns)
+			return -EPERM;
+
+		/* If you have the capability in a parent user ns you have it
+		 * in the over all children user namespaces as well, so see
+		 * if this process has the capability in the parent user
+		 * namespace.
+		 */
+		targ_ns = targ_ns->creator->user_ns;
+	}
+
+	/* We never get here */
+	return -EPERM;
 }
 
 /**
@@ -177,7 +201,8 @@ static inline int cap_inh_is_capped(void)
 	/* they are so limited unless the current task has the CAP_SETPCAP
 	 * capability
 	 */
-	if (cap_capable(current, current_cred(), CAP_SETPCAP,
+	if (cap_capable(current, current_cred(),
+			current_cred()->user->user_ns, CAP_SETPCAP,
 			SECURITY_CAP_AUDIT) == 0)
 		return 0;
 	return 1;
@@ -829,7 +854,8 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		     & (new->securebits ^ arg2))			/*[1]*/
 		    || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
 		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
-		    || (cap_capable(current, current_cred(), CAP_SETPCAP,
+		    || (cap_capable(current, current_cred(),
+				    current_cred()->user->user_ns, CAP_SETPCAP,
 				    SECURITY_CAP_AUDIT) != 0)		/*[4]*/
 			/*
 			 * [1] no changing of bits that are locked
@@ -894,7 +920,7 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
 {
 	int cap_sys_admin = 0;
 
-	if (cap_capable(current, current_cred(), CAP_SYS_ADMIN,
+	if (cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_ADMIN,
 			SECURITY_CAP_NOAUDIT) == 0)
 		cap_sys_admin = 1;
 	return __vm_enough_memory(mm, pages, cap_sys_admin);
@@ -921,7 +947,7 @@ int cap_file_mmap(struct file *file, unsigned long reqprot,
 	int ret = 0;
 
 	if (addr < dac_mmap_min_addr) {
-		ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO,
+		ret = cap_capable(current, current_cred(), &init_user_ns, CAP_SYS_RAWIO,
 				  SECURITY_CAP_AUDIT);
 		/* set PF_SUPERPRIV if it turns out we allow the low mmap */
 		if (ret == 0)
diff --git a/security/security.c b/security/security.c
index 739e403..0042770 100644
--- a/security/security.c
+++ b/security/security.c
@@ -154,30 +154,30 @@ int security_capset(struct cred *new, const struct cred *old,
 				    effective, inheritable, permitted);
 }
 
-int security_capable(int cap)
+int security_capable(struct user_namespace *ns, int cap)
 {
-	return security_ops->capable(current, current_cred(), cap,
+	return security_ops->capable(current, current_cred(), ns, cap,
 				     SECURITY_CAP_AUDIT);
 }
 
-int security_real_capable(struct task_struct *tsk, int cap)
+int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap)
 {
 	const struct cred *cred;
 	int ret;
 
 	cred = get_task_cred(tsk);
-	ret = security_ops->capable(tsk, cred, cap, SECURITY_CAP_AUDIT);
+	ret = security_ops->capable(tsk, cred, ns, cap, SECURITY_CAP_AUDIT);
 	put_cred(cred);
 	return ret;
 }
 
-int security_real_capable_noaudit(struct task_struct *tsk, int cap)
+int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap)
 {
 	const struct cred *cred;
 	int ret;
 
 	cred = get_task_cred(tsk);
-	ret = security_ops->capable(tsk, cred, cap, SECURITY_CAP_NOAUDIT);
+	ret = security_ops->capable(tsk, cred, ns, cap, SECURITY_CAP_NOAUDIT);
 	put_cred(cred);
 	return ret;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index e276eb4..e1f8385 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -77,6 +77,7 @@
 #include <linux/mutex.h>
 #include <linux/posix-timers.h>
 #include <linux/syslog.h>
+#include <linux/user_namespace.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -1423,6 +1424,7 @@ static int current_has_perm(const struct task_struct *tsk,
 /* Check whether a task is allowed to use a capability. */
 static int task_has_capability(struct task_struct *tsk,
 			       const struct cred *cred,
+			       struct user_namespace *ns,
 			       int cap, int audit)
 {
 	struct common_audit_data ad;
@@ -1851,15 +1853,15 @@ static int selinux_capset(struct cred *new, const struct cred *old,
  */
 
 static int selinux_capable(struct task_struct *tsk, const struct cred *cred,
-			   int cap, int audit)
+			   struct user_namespace *ns, int cap, int audit)
 {
 	int rc;
 
-	rc = cap_capable(tsk, cred, cap, audit);
+	rc = cap_capable(tsk, cred, ns, cap, audit);
 	if (rc)
 		return rc;
 
-	return task_has_capability(tsk, cred, cap, audit);
+	return task_has_capability(tsk, cred, ns, cap, audit);
 }
 
 static int selinux_sysctl_get_sid(ctl_table *table, u16 tclass, u32 *sid)
@@ -2012,7 +2014,8 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
 {
 	int rc, cap_sys_admin = 0;
 
-	rc = selinux_capable(current, current_cred(), CAP_SYS_ADMIN,
+	rc = selinux_capable(current, current_cred(),
+			     &init_user_ns, CAP_SYS_ADMIN,
 			     SECURITY_CAP_NOAUDIT);
 	if (rc == 0)
 		cap_sys_admin = 1;
@@ -2829,7 +2832,8 @@ static int selinux_inode_getsecurity(const struct inode *inode, const char *name
 	 * and lack of permission just means that we fall back to the
 	 * in-core context value, not a denial.
 	 */
-	error = selinux_capable(current, current_cred(), CAP_MAC_ADMIN,
+	error = selinux_capable(current, current_cred(),
+				&init_user_ns, CAP_MAC_ADMIN,
 				SECURITY_CAP_NOAUDIT);
 	if (!error)
 		error = security_sid_to_context_force(isec->sid, &context,
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 3/7] allow sethostname in a container
       [not found] ` <20110110211135.GA22446-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
  2011-01-10 21:13   ` [PATCH 1/7] Add a user_namespace as creator/owner of uts_namespace Serge E. Hallyn
  2011-01-10 21:13   ` [PATCH 2/7] security: Make capabilities relative to the user namespace Serge E. Hallyn
@ 2011-01-10 21:13   ` Serge E. Hallyn
  2011-01-10 21:13   ` [PATCH 4/7] allow killing tasks in your own or child userns Serge E. Hallyn
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk


Signed-off-by: Serge E. Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>
---
 kernel/sys.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/sys.c b/kernel/sys.c
index 2745dcd..9b9b03b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1171,7 +1171,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 	int errno;
 	char tmp[__NEW_UTS_LEN];
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 	if (len < 0 || len > __NEW_UTS_LEN)
 		return -EINVAL;
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 3/7] allow sethostname in a container
  2011-01-10 21:11 userns: targeted capabilities v3 Serge E. Hallyn
  2011-01-10 21:13 ` [PATCH 1/7] Add a user_namespace as creator/owner of uts_namespace Serge E. Hallyn
  2011-01-10 21:13 ` [PATCH 2/7] security: Make capabilities relative to the user namespace Serge E. Hallyn
@ 2011-01-10 21:13 ` Serge E. Hallyn
  2011-01-10 21:13 ` [PATCH 4/7] allow killing tasks in your own or child userns Serge E. Hallyn
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: LSM, James Morris, Kees Cook, containers, kernel list,
	Eric W. Biederman, Alexey Dobriyan, Michael Kerrisk


Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 kernel/sys.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/sys.c b/kernel/sys.c
index 2745dcd..9b9b03b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1171,7 +1171,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 	int errno;
 	char tmp[__NEW_UTS_LEN];
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 	if (len < 0 || len > __NEW_UTS_LEN)
 		return -EINVAL;
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 4/7] allow killing tasks in your own or child userns
       [not found] ` <20110110211135.GA22446-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
                     ` (2 preceding siblings ...)
  2011-01-10 21:13   ` [PATCH 3/7] allow sethostname in a container Serge E. Hallyn
@ 2011-01-10 21:13   ` Serge E. Hallyn
  2011-01-10 21:13   ` [PATCH 5/7] Allow ptrace from non-init user namespaces Serge E. Hallyn
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

Changelog:
	Dec 8: Fixed bug in my check_kill_permission pointed out by
	       Eric Biederman.
	Dec 13: Apply Eric's suggestion to pass target task into kill_ok_by_cred()
	        for clarity
	Dec 31: address comment by Eric Biederman:
		don't need cred/tcred in check_kill_permission.
	Jan 1: use const cred struct.

Signed-off-by: Serge E. Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>
Reviewed-by: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
---
 kernel/signal.c |   36 ++++++++++++++++++++++++++++--------
 1 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..6a12eae 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -636,13 +636,39 @@ static inline bool si_fromuser(const struct siginfo *info)
 }
 
 /*
+ * called with RCU read lock from check_kill_permission()
+ */
+static inline int kill_ok_by_cred(struct task_struct *t)
+{
+	const struct cred *cred = current_cred();
+	const struct cred *tcred = __task_cred(t);
+
+	if (cred->user->user_ns != tcred->user->user_ns) {
+		/* userids are not equivalent - either you have the
+		   capability to the target user ns or you don't */
+		if (ns_capable(tcred->user->user_ns, CAP_KILL))
+			return 1;
+		return 0;
+	}
+
+	/* same user namespace - usual credentials checks apply */
+	if ((cred->euid ^ tcred->suid) &&
+	    (cred->euid ^ tcred->uid) &&
+	    (cred->uid  ^ tcred->suid) &&
+	    (cred->uid  ^ tcred->uid) &&
+	    !ns_capable(tcred->user->user_ns, CAP_KILL))
+		return 0;
+
+	return 1;
+}
+
+/*
  * Bad permissions for sending the signal
  * - the caller must hold the RCU read lock
  */
 static int check_kill_permission(int sig, struct siginfo *info,
 				 struct task_struct *t)
 {
-	const struct cred *cred, *tcred;
 	struct pid *sid;
 	int error;
 
@@ -656,14 +682,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (error)
 		return error;
 
-	cred = current_cred();
-	tcred = __task_cred(t);
 	if (!same_thread_group(current, t) &&
-	    (cred->euid ^ tcred->suid) &&
-	    (cred->euid ^ tcred->uid) &&
-	    (cred->uid  ^ tcred->suid) &&
-	    (cred->uid  ^ tcred->uid) &&
-	    !capable(CAP_KILL)) {
+	    !kill_ok_by_cred(t)) {
 		switch (sig) {
 		case SIGCONT:
 			sid = task_session(t);
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-10 21:11 userns: targeted capabilities v3 Serge E. Hallyn
                   ` (2 preceding siblings ...)
  2011-01-10 21:13 ` [PATCH 3/7] allow sethostname in a container Serge E. Hallyn
@ 2011-01-10 21:13 ` Serge E. Hallyn
       [not found]   ` <20110110211334.GD22564-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
  2011-01-10 21:13 ` [PATCH 5/7] Allow ptrace from non-init user namespaces Serge E. Hallyn
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: LSM, James Morris, Kees Cook, containers, kernel list,
	Eric W. Biederman, Alexey Dobriyan, Michael Kerrisk

Changelog:
	Dec 8: Fixed bug in my check_kill_permission pointed out by
	       Eric Biederman.
	Dec 13: Apply Eric's suggestion to pass target task into kill_ok_by_cred()
	        for clarity
	Dec 31: address comment by Eric Biederman:
		don't need cred/tcred in check_kill_permission.
	Jan 1: use const cred struct.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c |   36 ++++++++++++++++++++++++++++--------
 1 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..6a12eae 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -636,13 +636,39 @@ static inline bool si_fromuser(const struct siginfo *info)
 }
 
 /*
+ * called with RCU read lock from check_kill_permission()
+ */
+static inline int kill_ok_by_cred(struct task_struct *t)
+{
+	const struct cred *cred = current_cred();
+	const struct cred *tcred = __task_cred(t);
+
+	if (cred->user->user_ns != tcred->user->user_ns) {
+		/* userids are not equivalent - either you have the
+		   capability to the target user ns or you don't */
+		if (ns_capable(tcred->user->user_ns, CAP_KILL))
+			return 1;
+		return 0;
+	}
+
+	/* same user namespace - usual credentials checks apply */
+	if ((cred->euid ^ tcred->suid) &&
+	    (cred->euid ^ tcred->uid) &&
+	    (cred->uid  ^ tcred->suid) &&
+	    (cred->uid  ^ tcred->uid) &&
+	    !ns_capable(tcred->user->user_ns, CAP_KILL))
+		return 0;
+
+	return 1;
+}
+
+/*
  * Bad permissions for sending the signal
  * - the caller must hold the RCU read lock
  */
 static int check_kill_permission(int sig, struct siginfo *info,
 				 struct task_struct *t)
 {
-	const struct cred *cred, *tcred;
 	struct pid *sid;
 	int error;
 
@@ -656,14 +682,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (error)
 		return error;
 
-	cred = current_cred();
-	tcred = __task_cred(t);
 	if (!same_thread_group(current, t) &&
-	    (cred->euid ^ tcred->suid) &&
-	    (cred->euid ^ tcred->uid) &&
-	    (cred->uid  ^ tcred->suid) &&
-	    (cred->uid  ^ tcred->uid) &&
-	    !capable(CAP_KILL)) {
+	    !kill_ok_by_cred(t)) {
 		switch (sig) {
 		case SIGCONT:
 			sid = task_session(t);
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 5/7] Allow ptrace from non-init user namespaces
       [not found] ` <20110110211135.GA22446-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
                     ` (3 preceding siblings ...)
  2011-01-10 21:13   ` [PATCH 4/7] allow killing tasks in your own or child userns Serge E. Hallyn
@ 2011-01-10 21:13   ` Serge E. Hallyn
  2011-01-10 21:14   ` [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c Serge E. Hallyn
  2011-01-10 21:14   ` [PATCH 7/7] user namespaces: convert several capable() calls Serge E. Hallyn
  6 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

ptrace is allowed to tasks in the same user namespace according to
the usual rules (i.e. the same rules as for two tasks in the init
user namespace).  ptrace is also allowed to a user namespace to
which the current task the has CAP_SYS_PTRACE capability.

Changelog:
	Dec 31: Address feedback by Eric:
		. Correct ptrace uid check
		. Rename may_ptrace_ns to ptrace_capable
		. Also fix the cap_ptrace checks.
	Jan 1: Use const cred struct

Signed-off-by: Serge E. Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>
---
 include/linux/capability.h     |    2 +
 include/linux/user_namespace.h |    9 +++++++
 kernel/ptrace.c                |   40 +++++++++++++++++++++++----------
 kernel/user_namespace.c        |   16 +++++++++++++
 security/commoncap.c           |   48 +++++++++++++++++++++++++++++++++------
 5 files changed, 95 insertions(+), 20 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7b3be11..501b8c9 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -546,6 +546,8 @@ extern const kernel_cap_t __cap_init_eff_set;
  */
 #define has_capability(t, cap) (security_real_capable((t), &init_user_ns, (cap)) == 0)
 
+#define has_ns_capability(t, ns, cap) (security_real_capable((t), (ns), (cap)) == 0)
+
 /**
  * has_capability_noaudit - Determine if a task has a superior capability available (unaudited)
  * @t: The task in question
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 8178156..91c4f10 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -39,6 +39,9 @@ static inline void put_user_ns(struct user_namespace *ns)
 uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid);
 gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid);
 
+int same_or_ancestor_user_ns(struct task_struct *task,
+	struct task_struct *victim);
+
 #else
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -66,6 +69,12 @@ static inline gid_t user_ns_map_gid(struct user_namespace *to,
 	return gid;
 }
 
+static inline int same_or_ancestor_user_ns(struct task_struct *task,
+	struct task_struct *victim)
+{
+	return 1;
+}
+
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 99bbaa3..88e3fb3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -116,6 +116,19 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	return ret;
 }
 
+static inline int ptrace_capable(struct task_struct *t)
+{
+	struct user_namespace *ns;
+	int ret;
+
+	rcu_read_lock();
+	ns = task_cred_xxx(t, user)->user_ns;
+	ret = ns_capable(ns, CAP_SYS_PTRACE);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
 	const struct cred *cred = current_cred(), *tcred;
@@ -134,21 +147,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 		return 0;
 	rcu_read_lock();
 	tcred = __task_cred(task);
-	if ((cred->uid != tcred->euid ||
-	     cred->uid != tcred->suid ||
-	     cred->uid != tcred->uid  ||
-	     cred->gid != tcred->egid ||
-	     cred->gid != tcred->sgid ||
-	     cred->gid != tcred->gid) &&
-	    !capable(CAP_SYS_PTRACE)) {
-		rcu_read_unlock();
-		return -EPERM;
-	}
+	if (cred->user->user_ns == tcred->user->user_ns &&
+	    (cred->uid == tcred->euid &&
+	     cred->uid == tcred->suid &&
+	     cred->uid == tcred->uid  &&
+	     cred->gid == tcred->egid &&
+	     cred->gid == tcred->sgid &&
+	     cred->gid == tcred->gid))
+		goto ok;
+	if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+		goto ok;
+	rcu_read_unlock();
+	return -EPERM;
+ok:
 	rcu_read_unlock();
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
-	if (!dumpable && !capable(CAP_SYS_PTRACE))
+	if (!dumpable && !ptrace_capable(task))
 		return -EPERM;
 
 	return security_ptrace_access_check(task, mode);
@@ -198,7 +214,7 @@ int ptrace_attach(struct task_struct *task)
 		goto unlock_tasklist;
 
 	task->ptrace = PT_PTRACED;
-	if (capable(CAP_SYS_PTRACE))
+	if (ptrace_capable(task))
 		task->ptrace |= PT_PTRACE_CAP;
 
 	__ptrace_link(task, current);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2591583..4b70999 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -126,3 +126,19 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
 	/* No useful relationship so no mapping */
 	return overflowgid;
 }
+
+int same_or_ancestor_user_ns(struct task_struct *task,
+	struct task_struct *victim)
+{
+	struct user_namespace *u1 = task_cred_xxx(task, user)->user_ns;
+	struct user_namespace *u2 = task_cred_xxx(victim, user)->user_ns;
+	for (;;) {
+		if (u1 == u2)
+			return 1;
+		if (u1 == &init_user_ns)
+			return 0;
+		u1 = u1->creator->user_ns;
+	}
+	/* We never get here */
+	return 0;
+}
diff --git a/security/commoncap.c b/security/commoncap.c
index 1e9dd00..a061636 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -130,18 +130,34 @@ int cap_settime(struct timespec *ts, struct timezone *tz)
  * @child: The process to be accessed
  * @mode: The mode of attachment.
  *
+ * If we are in the same or an ancestor user_ns and have all the target
+ * task's capabilities, then ptrace access is allowed.
+ * If we have the ptrace capability to the target user_ns, then ptrace
+ * access is allowed.
+ * Else denied.
+ *
  * Determine whether a process may access another, returning 0 if permission
  * granted, -ve if denied.
  */
 int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
 {
 	int ret = 0;
+	const struct cred *cred, *tcred;
 
 	rcu_read_lock();
-	if (!cap_issubset(__task_cred(child)->cap_permitted,
-			  current_cred()->cap_permitted) &&
-	    !capable(CAP_SYS_PTRACE))
-		ret = -EPERM;
+	cred = current_cred();
+	tcred = __task_cred(child);
+	/*
+	 * The ancestor user_ns check may be gratuitous, as I think
+	 * we've already guaranteed that in kernel/ptrace.c.
+	 */
+	if (same_or_ancestor_user_ns(current, child) &&
+	    cap_issubset(tcred->cap_permitted, cred->cap_permitted))
+		goto out;
+	if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+		goto out;
+	ret = -EPERM;
+out:
 	rcu_read_unlock();
 	return ret;
 }
@@ -150,18 +166,34 @@ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
  * cap_ptrace_traceme - Determine whether another process may trace the current
  * @parent: The task proposed to be the tracer
  *
+ * If parent is in the same or an ancestor user_ns and has all current's
+ * capabilities, then ptrace access is allowed.
+ * If parent has the ptrace capability to current's user_ns, then ptrace
+ * access is allowed.
+ * Else denied.
+ *
  * Determine whether the nominated task is permitted to trace the current
  * process, returning 0 if permission is granted, -ve if denied.
  */
 int cap_ptrace_traceme(struct task_struct *parent)
 {
 	int ret = 0;
+	const struct cred *cred, *tcred;
 
 	rcu_read_lock();
-	if (!cap_issubset(current_cred()->cap_permitted,
-			  __task_cred(parent)->cap_permitted) &&
-	    !has_capability(parent, CAP_SYS_PTRACE))
-		ret = -EPERM;
+	cred = __task_cred(parent);
+	tcred = current_cred();
+	/*
+	 * The ancestor user_ns check may be gratuitous, as I think
+	 * we've already guaranteed that in kernel/ptrace.c.
+	 */
+	if (same_or_ancestor_user_ns(parent, current) &&
+	    cap_issubset(tcred->cap_permitted, cred->cap_permitted))
+		goto out;
+	if (has_ns_capability(parent, tcred->user->user_ns, CAP_SYS_PTRACE))
+		goto out;
+	ret = -EPERM;
+out:
 	rcu_read_unlock();
 	return ret;
 }
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 5/7] Allow ptrace from non-init user namespaces
  2011-01-10 21:11 userns: targeted capabilities v3 Serge E. Hallyn
                   ` (3 preceding siblings ...)
  2011-01-10 21:13 ` [PATCH 4/7] allow killing tasks in your own or child userns Serge E. Hallyn
@ 2011-01-10 21:13 ` Serge E. Hallyn
  2011-01-10 21:14 ` [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c Serge E. Hallyn
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: LSM, James Morris, Kees Cook, containers, kernel list,
	Eric W. Biederman, Alexey Dobriyan, Michael Kerrisk

ptrace is allowed to tasks in the same user namespace according to
the usual rules (i.e. the same rules as for two tasks in the init
user namespace).  ptrace is also allowed to a user namespace to
which the current task the has CAP_SYS_PTRACE capability.

Changelog:
	Dec 31: Address feedback by Eric:
		. Correct ptrace uid check
		. Rename may_ptrace_ns to ptrace_capable
		. Also fix the cap_ptrace checks.
	Jan 1: Use const cred struct

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 include/linux/capability.h     |    2 +
 include/linux/user_namespace.h |    9 +++++++
 kernel/ptrace.c                |   40 +++++++++++++++++++++++----------
 kernel/user_namespace.c        |   16 +++++++++++++
 security/commoncap.c           |   48 +++++++++++++++++++++++++++++++++------
 5 files changed, 95 insertions(+), 20 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7b3be11..501b8c9 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -546,6 +546,8 @@ extern const kernel_cap_t __cap_init_eff_set;
  */
 #define has_capability(t, cap) (security_real_capable((t), &init_user_ns, (cap)) == 0)
 
+#define has_ns_capability(t, ns, cap) (security_real_capable((t), (ns), (cap)) == 0)
+
 /**
  * has_capability_noaudit - Determine if a task has a superior capability available (unaudited)
  * @t: The task in question
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 8178156..91c4f10 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -39,6 +39,9 @@ static inline void put_user_ns(struct user_namespace *ns)
 uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid);
 gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid);
 
+int same_or_ancestor_user_ns(struct task_struct *task,
+	struct task_struct *victim);
+
 #else
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -66,6 +69,12 @@ static inline gid_t user_ns_map_gid(struct user_namespace *to,
 	return gid;
 }
 
+static inline int same_or_ancestor_user_ns(struct task_struct *task,
+	struct task_struct *victim)
+{
+	return 1;
+}
+
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 99bbaa3..88e3fb3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -116,6 +116,19 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	return ret;
 }
 
+static inline int ptrace_capable(struct task_struct *t)
+{
+	struct user_namespace *ns;
+	int ret;
+
+	rcu_read_lock();
+	ns = task_cred_xxx(t, user)->user_ns;
+	ret = ns_capable(ns, CAP_SYS_PTRACE);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
 	const struct cred *cred = current_cred(), *tcred;
@@ -134,21 +147,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 		return 0;
 	rcu_read_lock();
 	tcred = __task_cred(task);
-	if ((cred->uid != tcred->euid ||
-	     cred->uid != tcred->suid ||
-	     cred->uid != tcred->uid  ||
-	     cred->gid != tcred->egid ||
-	     cred->gid != tcred->sgid ||
-	     cred->gid != tcred->gid) &&
-	    !capable(CAP_SYS_PTRACE)) {
-		rcu_read_unlock();
-		return -EPERM;
-	}
+	if (cred->user->user_ns == tcred->user->user_ns &&
+	    (cred->uid == tcred->euid &&
+	     cred->uid == tcred->suid &&
+	     cred->uid == tcred->uid  &&
+	     cred->gid == tcred->egid &&
+	     cred->gid == tcred->sgid &&
+	     cred->gid == tcred->gid))
+		goto ok;
+	if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+		goto ok;
+	rcu_read_unlock();
+	return -EPERM;
+ok:
 	rcu_read_unlock();
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
-	if (!dumpable && !capable(CAP_SYS_PTRACE))
+	if (!dumpable && !ptrace_capable(task))
 		return -EPERM;
 
 	return security_ptrace_access_check(task, mode);
@@ -198,7 +214,7 @@ int ptrace_attach(struct task_struct *task)
 		goto unlock_tasklist;
 
 	task->ptrace = PT_PTRACED;
-	if (capable(CAP_SYS_PTRACE))
+	if (ptrace_capable(task))
 		task->ptrace |= PT_PTRACE_CAP;
 
 	__ptrace_link(task, current);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2591583..4b70999 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -126,3 +126,19 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
 	/* No useful relationship so no mapping */
 	return overflowgid;
 }
+
+int same_or_ancestor_user_ns(struct task_struct *task,
+	struct task_struct *victim)
+{
+	struct user_namespace *u1 = task_cred_xxx(task, user)->user_ns;
+	struct user_namespace *u2 = task_cred_xxx(victim, user)->user_ns;
+	for (;;) {
+		if (u1 == u2)
+			return 1;
+		if (u1 == &init_user_ns)
+			return 0;
+		u1 = u1->creator->user_ns;
+	}
+	/* We never get here */
+	return 0;
+}
diff --git a/security/commoncap.c b/security/commoncap.c
index 1e9dd00..a061636 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -130,18 +130,34 @@ int cap_settime(struct timespec *ts, struct timezone *tz)
  * @child: The process to be accessed
  * @mode: The mode of attachment.
  *
+ * If we are in the same or an ancestor user_ns and have all the target
+ * task's capabilities, then ptrace access is allowed.
+ * If we have the ptrace capability to the target user_ns, then ptrace
+ * access is allowed.
+ * Else denied.
+ *
  * Determine whether a process may access another, returning 0 if permission
  * granted, -ve if denied.
  */
 int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
 {
 	int ret = 0;
+	const struct cred *cred, *tcred;
 
 	rcu_read_lock();
-	if (!cap_issubset(__task_cred(child)->cap_permitted,
-			  current_cred()->cap_permitted) &&
-	    !capable(CAP_SYS_PTRACE))
-		ret = -EPERM;
+	cred = current_cred();
+	tcred = __task_cred(child);
+	/*
+	 * The ancestor user_ns check may be gratuitous, as I think
+	 * we've already guaranteed that in kernel/ptrace.c.
+	 */
+	if (same_or_ancestor_user_ns(current, child) &&
+	    cap_issubset(tcred->cap_permitted, cred->cap_permitted))
+		goto out;
+	if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+		goto out;
+	ret = -EPERM;
+out:
 	rcu_read_unlock();
 	return ret;
 }
@@ -150,18 +166,34 @@ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
  * cap_ptrace_traceme - Determine whether another process may trace the current
  * @parent: The task proposed to be the tracer
  *
+ * If parent is in the same or an ancestor user_ns and has all current's
+ * capabilities, then ptrace access is allowed.
+ * If parent has the ptrace capability to current's user_ns, then ptrace
+ * access is allowed.
+ * Else denied.
+ *
  * Determine whether the nominated task is permitted to trace the current
  * process, returning 0 if permission is granted, -ve if denied.
  */
 int cap_ptrace_traceme(struct task_struct *parent)
 {
 	int ret = 0;
+	const struct cred *cred, *tcred;
 
 	rcu_read_lock();
-	if (!cap_issubset(current_cred()->cap_permitted,
-			  __task_cred(parent)->cap_permitted) &&
-	    !has_capability(parent, CAP_SYS_PTRACE))
-		ret = -EPERM;
+	cred = __task_cred(parent);
+	tcred = current_cred();
+	/*
+	 * The ancestor user_ns check may be gratuitous, as I think
+	 * we've already guaranteed that in kernel/ptrace.c.
+	 */
+	if (same_or_ancestor_user_ns(parent, current) &&
+	    cap_issubset(tcred->cap_permitted, cred->cap_permitted))
+		goto out;
+	if (has_ns_capability(parent, tcred->user->user_ns, CAP_SYS_PTRACE))
+		goto out;
+	ret = -EPERM;
+out:
 	rcu_read_unlock();
 	return ret;
 }
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c
       [not found] ` <20110110211135.GA22446-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
                     ` (4 preceding siblings ...)
  2011-01-10 21:13   ` [PATCH 5/7] Allow ptrace from non-init user namespaces Serge E. Hallyn
@ 2011-01-10 21:14   ` Serge E. Hallyn
  2011-01-10 21:14   ` [PATCH 7/7] user namespaces: convert several capable() calls Serge E. Hallyn
  6 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:14 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

This allows setuid/setgid in containers.  It also fixes some
corner cases where kernel logic foregoes capability checks when
uids are equivalent.  The latter will need to be done throughout
the whole kernel.

Signed-off-by: Serge E. Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>
---
 kernel/sys.c |   35 ++++++++++++++++++++---------------
 1 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/kernel/sys.c b/kernel/sys.c
index 9b9b03b..2278e87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -125,8 +125,10 @@ static int set_one_prio(struct task_struct *p, int niceval, int error)
 	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
 	int no_nice;
 
-	if (pcred->uid  != cred->euid &&
-	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
+	if (pcred->user->user_ns != cred->user->user_ns &&
+	    pcred->uid  != cred->euid &&
+	    pcred->euid != cred->euid &&
+	    !ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) {
 		error = -EPERM;
 		goto out;
 	}
@@ -496,7 +498,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 	if (rgid != (gid_t) -1) {
 		if (old->gid == rgid ||
 		    old->egid == rgid ||
-		    capable(CAP_SETGID))
+		    ns_capable(current_user_ns(), CAP_SETGID))
 			new->gid = rgid;
 		else
 			goto error;
@@ -505,7 +507,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 		if (old->gid == egid ||
 		    old->egid == egid ||
 		    old->sgid == egid ||
-		    capable(CAP_SETGID))
+		    ns_capable(current_user_ns(), CAP_SETGID))
 			new->egid = egid;
 		else
 			goto error;
@@ -540,7 +542,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (capable(CAP_SETGID))
+	if (ns_capable(current_user_ns(), CAP_SETGID))
 		new->gid = new->egid = new->sgid = new->fsgid = gid;
 	else if (gid == old->gid || gid == old->sgid)
 		new->egid = new->fsgid = gid;
@@ -607,7 +609,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 		new->uid = ruid;
 		if (old->uid != ruid &&
 		    old->euid != ruid &&
-		    !capable(CAP_SETUID))
+		    !ns_capable(current_user_ns(), CAP_SETUID))
 			goto error;
 	}
 
@@ -616,7 +618,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 		if (old->uid != euid &&
 		    old->euid != euid &&
 		    old->suid != euid &&
-		    !capable(CAP_SETUID))
+		    !ns_capable(current_user_ns(), CAP_SETUID))
 			goto error;
 	}
 
@@ -664,7 +666,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (capable(CAP_SETUID)) {
+	if (ns_capable(current_user_ns(), CAP_SETUID)) {
 		new->suid = new->uid = uid;
 		if (uid != old->uid) {
 			retval = set_user(new);
@@ -706,7 +708,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (!capable(CAP_SETUID)) {
+	if (!ns_capable(current_user_ns(), CAP_SETUID)) {
 		if (ruid != (uid_t) -1 && ruid != old->uid &&
 		    ruid != old->euid  && ruid != old->suid)
 			goto error;
@@ -770,7 +772,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (!capable(CAP_SETGID)) {
+	if (!ns_capable(current_user_ns(), CAP_SETGID)) {
 		if (rgid != (gid_t) -1 && rgid != old->gid &&
 		    rgid != old->egid  && rgid != old->sgid)
 			goto error;
@@ -830,7 +832,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
 
 	if (uid == old->uid  || uid == old->euid  ||
 	    uid == old->suid || uid == old->fsuid ||
-	    capable(CAP_SETUID)) {
+	    ns_capable(current_user_ns(), CAP_SETUID)) {
 		if (uid != old_fsuid) {
 			new->fsuid = uid;
 			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -863,7 +865,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
 
 	if (gid == old->gid  || gid == old->egid  ||
 	    gid == old->sgid || gid == old->fsgid ||
-	    capable(CAP_SETGID)) {
+	    ns_capable(current_user_ns(), CAP_SETGID)) {
 		if (gid != old_fsgid) {
 			new->fsgid = gid;
 			goto change_okay;
@@ -1220,7 +1222,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 	int errno;
 	char tmp[__NEW_UTS_LEN];
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 	if (len < 0 || len > __NEW_UTS_LEN)
 		return -EINVAL;
@@ -1335,6 +1337,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
 	rlim = tsk->signal->rlim + resource;
 	task_lock(tsk->group_leader);
 	if (new_rlim) {
+		/* Keep the capable check against init_user_ns until
+		   cgroups can contain all limits */
 		if (new_rlim->rlim_max > rlim->rlim_max &&
 				!capable(CAP_SYS_RESOURCE))
 			retval = -EPERM;
@@ -1379,13 +1383,14 @@ static int check_prlimit_permission(struct task_struct *task)
 	const struct cred *cred = current_cred(), *tcred;
 
 	tcred = __task_cred(task);
-	if ((cred->uid != tcred->euid ||
+	if ((cred->user->user_ns != tcred->user->user_ns ||
+	     cred->uid != tcred->euid ||
 	     cred->uid != tcred->suid ||
 	     cred->uid != tcred->uid  ||
 	     cred->gid != tcred->egid ||
 	     cred->gid != tcred->sgid ||
 	     cred->gid != tcred->gid) &&
-	     !capable(CAP_SYS_RESOURCE)) {
+	     !ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) {
 		return -EPERM;
 	}
 
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c
  2011-01-10 21:11 userns: targeted capabilities v3 Serge E. Hallyn
                   ` (4 preceding siblings ...)
  2011-01-10 21:13 ` [PATCH 5/7] Allow ptrace from non-init user namespaces Serge E. Hallyn
@ 2011-01-10 21:14 ` Serge E. Hallyn
       [not found]   ` <20110110211406.GF22564-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
  2011-01-10 21:59   ` Bastian Blank
       [not found] ` <20110110211135.GA22446-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
  2011-01-10 21:14 ` Serge E. Hallyn
  7 siblings, 2 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:14 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: LSM, James Morris, Kees Cook, containers, kernel list,
	Eric W. Biederman, Alexey Dobriyan, Michael Kerrisk

This allows setuid/setgid in containers.  It also fixes some
corner cases where kernel logic foregoes capability checks when
uids are equivalent.  The latter will need to be done throughout
the whole kernel.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 kernel/sys.c |   35 ++++++++++++++++++++---------------
 1 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/kernel/sys.c b/kernel/sys.c
index 9b9b03b..2278e87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -125,8 +125,10 @@ static int set_one_prio(struct task_struct *p, int niceval, int error)
 	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
 	int no_nice;
 
-	if (pcred->uid  != cred->euid &&
-	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
+	if (pcred->user->user_ns != cred->user->user_ns &&
+	    pcred->uid  != cred->euid &&
+	    pcred->euid != cred->euid &&
+	    !ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) {
 		error = -EPERM;
 		goto out;
 	}
@@ -496,7 +498,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 	if (rgid != (gid_t) -1) {
 		if (old->gid == rgid ||
 		    old->egid == rgid ||
-		    capable(CAP_SETGID))
+		    ns_capable(current_user_ns(), CAP_SETGID))
 			new->gid = rgid;
 		else
 			goto error;
@@ -505,7 +507,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 		if (old->gid == egid ||
 		    old->egid == egid ||
 		    old->sgid == egid ||
-		    capable(CAP_SETGID))
+		    ns_capable(current_user_ns(), CAP_SETGID))
 			new->egid = egid;
 		else
 			goto error;
@@ -540,7 +542,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (capable(CAP_SETGID))
+	if (ns_capable(current_user_ns(), CAP_SETGID))
 		new->gid = new->egid = new->sgid = new->fsgid = gid;
 	else if (gid == old->gid || gid == old->sgid)
 		new->egid = new->fsgid = gid;
@@ -607,7 +609,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 		new->uid = ruid;
 		if (old->uid != ruid &&
 		    old->euid != ruid &&
-		    !capable(CAP_SETUID))
+		    !ns_capable(current_user_ns(), CAP_SETUID))
 			goto error;
 	}
 
@@ -616,7 +618,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 		if (old->uid != euid &&
 		    old->euid != euid &&
 		    old->suid != euid &&
-		    !capable(CAP_SETUID))
+		    !ns_capable(current_user_ns(), CAP_SETUID))
 			goto error;
 	}
 
@@ -664,7 +666,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (capable(CAP_SETUID)) {
+	if (ns_capable(current_user_ns(), CAP_SETUID)) {
 		new->suid = new->uid = uid;
 		if (uid != old->uid) {
 			retval = set_user(new);
@@ -706,7 +708,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (!capable(CAP_SETUID)) {
+	if (!ns_capable(current_user_ns(), CAP_SETUID)) {
 		if (ruid != (uid_t) -1 && ruid != old->uid &&
 		    ruid != old->euid  && ruid != old->suid)
 			goto error;
@@ -770,7 +772,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
 	old = current_cred();
 
 	retval = -EPERM;
-	if (!capable(CAP_SETGID)) {
+	if (!ns_capable(current_user_ns(), CAP_SETGID)) {
 		if (rgid != (gid_t) -1 && rgid != old->gid &&
 		    rgid != old->egid  && rgid != old->sgid)
 			goto error;
@@ -830,7 +832,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
 
 	if (uid == old->uid  || uid == old->euid  ||
 	    uid == old->suid || uid == old->fsuid ||
-	    capable(CAP_SETUID)) {
+	    ns_capable(current_user_ns(), CAP_SETUID)) {
 		if (uid != old_fsuid) {
 			new->fsuid = uid;
 			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -863,7 +865,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
 
 	if (gid == old->gid  || gid == old->egid  ||
 	    gid == old->sgid || gid == old->fsgid ||
-	    capable(CAP_SETGID)) {
+	    ns_capable(current_user_ns(), CAP_SETGID)) {
 		if (gid != old_fsgid) {
 			new->fsgid = gid;
 			goto change_okay;
@@ -1220,7 +1222,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 	int errno;
 	char tmp[__NEW_UTS_LEN];
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 	if (len < 0 || len > __NEW_UTS_LEN)
 		return -EINVAL;
@@ -1335,6 +1337,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
 	rlim = tsk->signal->rlim + resource;
 	task_lock(tsk->group_leader);
 	if (new_rlim) {
+		/* Keep the capable check against init_user_ns until
+		   cgroups can contain all limits */
 		if (new_rlim->rlim_max > rlim->rlim_max &&
 				!capable(CAP_SYS_RESOURCE))
 			retval = -EPERM;
@@ -1379,13 +1383,14 @@ static int check_prlimit_permission(struct task_struct *task)
 	const struct cred *cred = current_cred(), *tcred;
 
 	tcred = __task_cred(task);
-	if ((cred->uid != tcred->euid ||
+	if ((cred->user->user_ns != tcred->user->user_ns ||
+	     cred->uid != tcred->euid ||
 	     cred->uid != tcred->suid ||
 	     cred->uid != tcred->uid  ||
 	     cred->gid != tcred->egid ||
 	     cred->gid != tcred->sgid ||
 	     cred->gid != tcred->gid) &&
-	     !capable(CAP_SYS_RESOURCE)) {
+	     !ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) {
 		return -EPERM;
 	}
 
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 7/7] user namespaces: convert several capable() calls
       [not found] ` <20110110211135.GA22446-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
                     ` (5 preceding siblings ...)
  2011-01-10 21:14   ` [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c Serge E. Hallyn
@ 2011-01-10 21:14   ` Serge E. Hallyn
  6 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:14 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

CAP_IPC_OWNER and CAP_IPC_LOCK can be checked against current_user_ns(),
because the resource comes from current's own ipc namespace.

setuid/setgid are to uids in own namespace, so again checks can be
against current_user_ns().

Signed-off-by: Serge E. Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>
---
 ipc/shm.c             |    2 +-
 ipc/util.c            |    5 +++--
 kernel/futex.c        |   11 ++++++++++-
 kernel/futex_compat.c |   11 ++++++++++-
 kernel/groups.c       |    2 +-
 kernel/sched.c        |   25 ++++++++++++++++++++++---
 kernel/uid16.c        |    2 +-
 7 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/ipc/shm.c b/ipc/shm.c
index 7d3bb22..13891f8 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -773,7 +773,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
 
 		audit_ipc_obj(&(shp->shm_perm));
 
-		if (!capable(CAP_IPC_LOCK)) {
+		if (!ns_capable(current_user_ns(), CAP_IPC_LOCK)) {
 			uid_t euid = current_euid();
 			err = -EPERM;
 			if (euid != shp->shm_perm.uid &&
diff --git a/ipc/util.c b/ipc/util.c
index 69a0cc1..0e832b9 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -627,7 +627,7 @@ int ipcperms (struct kern_ipc_perm *ipcp, short flag)
 		granted_mode >>= 3;
 	/* is there some bit set in requested_mode but not in granted_mode? */
 	if ((requested_mode & ~granted_mode & 0007) && 
-	    !capable(CAP_IPC_OWNER))
+	    !ns_capable(current->cred->user->user_ns, CAP_IPC_OWNER))
 		return -1;
 
 	return security_ipc_permission(ipcp, flag);
@@ -800,7 +800,8 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd,
 
 	euid = current_euid();
 	if (euid == ipcp->cuid ||
-	    euid == ipcp->uid  || capable(CAP_SYS_ADMIN))
+	    euid == ipcp->uid  ||
+	    ns_capable(current->cred->user->user_ns, CAP_SYS_ADMIN))
 		return ipcp;
 
 	err = -EPERM;
diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92..1025fd7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2387,10 +2387,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 			goto err_unlock;
 		ret = -EPERM;
 		pcred = __task_cred(p);
+		/* If victim is in different user_ns, then uids are not
+		   comparable, so we must have CAP_SYS_PTRACE */
+		if (cred->user->user_ns != pcred->user->user_ns) {
+			if (ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+				goto ok;
+			goto err_unlock;
+		}
+		/* If victim is in same user_ns, then uids are comparable */
 		if (cred->euid != pcred->euid &&
 		    cred->euid != pcred->uid &&
-		    !capable(CAP_SYS_PTRACE))
+		    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
 			goto err_unlock;
+ok:
 		head = p->robust_list;
 		rcu_read_unlock();
 	}
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac..f84cb9a 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 			goto err_unlock;
 		ret = -EPERM;
 		pcred = __task_cred(p);
+		/* If victim is in different user_ns, then uids are not
+		   comparable, so we must have CAP_SYS_PTRACE */
+		if (cred->user->user_ns != pcred->user->user_ns) {
+			if (ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+				goto ok;
+			goto err_unlock;
+		}
+		/* If victim is in same user_ns, then uids are comparable */
 		if (cred->euid != pcred->euid &&
 		    cred->euid != pcred->uid &&
-		    !capable(CAP_SYS_PTRACE))
+		    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
 			goto err_unlock;
+ok:
 		head = p->compat_robust_list;
 		rcu_read_unlock();
 	}
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f..335586a 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
 	struct group_info *group_info;
 	int retval;
 
-	if (!capable(CAP_SETGID))
+	if (!ns_capable(current_user_ns(), CAP_SETGID))
 		return -EPERM;
 	if ((unsigned)gidsetsize > NGROUPS_MAX)
 		return -EINVAL;
diff --git a/kernel/sched.c b/kernel/sched.c
index a0eb094..1078fe3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4761,8 +4761,11 @@ static bool check_same_owner(struct task_struct *p)
 
 	rcu_read_lock();
 	pcred = __task_cred(p);
-	match = (cred->euid == pcred->euid ||
-		 cred->euid == pcred->uid);
+	if (cred->user->user_ns != pcred->user->user_ns)
+		match = false;
+	match = (cred->user->user_ns == pcred->user->user_ns &&
+			(cred->euid == pcred->euid ||
+			 cred->euid == pcred->uid));
 	rcu_read_unlock();
 	return match;
 }
@@ -5058,6 +5061,22 @@ out_unlock:
 	return retval;
 }
 
+/* TODO: Create a common helper to consolidate sched_capable and
+ * ptrace_capable, silly.
+ */
+static inline int sched_capable(struct task_struct *t, int cap)
+{
+	struct user_namespace *ns;
+	int ret;
+
+	rcu_read_lock();
+	ns = task_cred_xxx(t, user)->user_ns;
+	ret = ns_capable(ns, cap);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
@@ -5087,7 +5106,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		goto out_free_cpus_allowed;
 	}
 	retval = -EPERM;
-	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+	if (!check_same_owner(p) && !sched_capable(p, CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p);
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 4192098..8a70480 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
 	struct group_info *group_info;
 	int retval;
 
-	if (!capable(CAP_SETGID))
+	if (!ns_capable(current_user_ns(), CAP_SETGID))
 		return -EPERM;
 	if ((unsigned)gidsetsize > NGROUPS_MAX)
 		return -EINVAL;
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 7/7] user namespaces: convert several capable() calls
  2011-01-10 21:11 userns: targeted capabilities v3 Serge E. Hallyn
                   ` (6 preceding siblings ...)
       [not found] ` <20110110211135.GA22446-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
@ 2011-01-10 21:14 ` Serge E. Hallyn
  7 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:14 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: LSM, James Morris, Kees Cook, containers, kernel list,
	Eric W. Biederman, Alexey Dobriyan, Michael Kerrisk

CAP_IPC_OWNER and CAP_IPC_LOCK can be checked against current_user_ns(),
because the resource comes from current's own ipc namespace.

setuid/setgid are to uids in own namespace, so again checks can be
against current_user_ns().

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 ipc/shm.c             |    2 +-
 ipc/util.c            |    5 +++--
 kernel/futex.c        |   11 ++++++++++-
 kernel/futex_compat.c |   11 ++++++++++-
 kernel/groups.c       |    2 +-
 kernel/sched.c        |   25 ++++++++++++++++++++++---
 kernel/uid16.c        |    2 +-
 7 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/ipc/shm.c b/ipc/shm.c
index 7d3bb22..13891f8 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -773,7 +773,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
 
 		audit_ipc_obj(&(shp->shm_perm));
 
-		if (!capable(CAP_IPC_LOCK)) {
+		if (!ns_capable(current_user_ns(), CAP_IPC_LOCK)) {
 			uid_t euid = current_euid();
 			err = -EPERM;
 			if (euid != shp->shm_perm.uid &&
diff --git a/ipc/util.c b/ipc/util.c
index 69a0cc1..0e832b9 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -627,7 +627,7 @@ int ipcperms (struct kern_ipc_perm *ipcp, short flag)
 		granted_mode >>= 3;
 	/* is there some bit set in requested_mode but not in granted_mode? */
 	if ((requested_mode & ~granted_mode & 0007) && 
-	    !capable(CAP_IPC_OWNER))
+	    !ns_capable(current->cred->user->user_ns, CAP_IPC_OWNER))
 		return -1;
 
 	return security_ipc_permission(ipcp, flag);
@@ -800,7 +800,8 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd,
 
 	euid = current_euid();
 	if (euid == ipcp->cuid ||
-	    euid == ipcp->uid  || capable(CAP_SYS_ADMIN))
+	    euid == ipcp->uid  ||
+	    ns_capable(current->cred->user->user_ns, CAP_SYS_ADMIN))
 		return ipcp;
 
 	err = -EPERM;
diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92..1025fd7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2387,10 +2387,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 			goto err_unlock;
 		ret = -EPERM;
 		pcred = __task_cred(p);
+		/* If victim is in different user_ns, then uids are not
+		   comparable, so we must have CAP_SYS_PTRACE */
+		if (cred->user->user_ns != pcred->user->user_ns) {
+			if (ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+				goto ok;
+			goto err_unlock;
+		}
+		/* If victim is in same user_ns, then uids are comparable */
 		if (cred->euid != pcred->euid &&
 		    cred->euid != pcred->uid &&
-		    !capable(CAP_SYS_PTRACE))
+		    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
 			goto err_unlock;
+ok:
 		head = p->robust_list;
 		rcu_read_unlock();
 	}
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac..f84cb9a 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 			goto err_unlock;
 		ret = -EPERM;
 		pcred = __task_cred(p);
+		/* If victim is in different user_ns, then uids are not
+		   comparable, so we must have CAP_SYS_PTRACE */
+		if (cred->user->user_ns != pcred->user->user_ns) {
+			if (ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+				goto ok;
+			goto err_unlock;
+		}
+		/* If victim is in same user_ns, then uids are comparable */
 		if (cred->euid != pcred->euid &&
 		    cred->euid != pcred->uid &&
-		    !capable(CAP_SYS_PTRACE))
+		    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
 			goto err_unlock;
+ok:
 		head = p->compat_robust_list;
 		rcu_read_unlock();
 	}
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f..335586a 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
 	struct group_info *group_info;
 	int retval;
 
-	if (!capable(CAP_SETGID))
+	if (!ns_capable(current_user_ns(), CAP_SETGID))
 		return -EPERM;
 	if ((unsigned)gidsetsize > NGROUPS_MAX)
 		return -EINVAL;
diff --git a/kernel/sched.c b/kernel/sched.c
index a0eb094..1078fe3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4761,8 +4761,11 @@ static bool check_same_owner(struct task_struct *p)
 
 	rcu_read_lock();
 	pcred = __task_cred(p);
-	match = (cred->euid == pcred->euid ||
-		 cred->euid == pcred->uid);
+	if (cred->user->user_ns != pcred->user->user_ns)
+		match = false;
+	match = (cred->user->user_ns == pcred->user->user_ns &&
+			(cred->euid == pcred->euid ||
+			 cred->euid == pcred->uid));
 	rcu_read_unlock();
 	return match;
 }
@@ -5058,6 +5061,22 @@ out_unlock:
 	return retval;
 }
 
+/* TODO: Create a common helper to consolidate sched_capable and
+ * ptrace_capable, silly.
+ */
+static inline int sched_capable(struct task_struct *t, int cap)
+{
+	struct user_namespace *ns;
+	int ret;
+
+	rcu_read_lock();
+	ns = task_cred_xxx(t, user)->user_ns;
+	ret = ns_capable(ns, cap);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
@@ -5087,7 +5106,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		goto out_free_cpus_allowed;
 	}
 	retval = -EPERM;
-	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+	if (!check_same_owner(p) && !sched_capable(p, CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p);
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 4192098..8a70480 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
 	struct group_info *group_info;
 	int retval;
 
-	if (!capable(CAP_SETGID))
+	if (!ns_capable(current_user_ns(), CAP_SETGID))
 		return -EPERM;
 	if ((unsigned)gidsetsize > NGROUPS_MAX)
 		return -EINVAL;
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-10 21:13 ` [PATCH 4/7] allow killing tasks in your own or child userns Serge E. Hallyn
@ 2011-01-10 21:52       ` Bastian Blank
  0 siblings, 0 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-10 21:52 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

On Mon, Jan 10, 2011 at 09:13:34PM +0000, Serge E. Hallyn wrote:
> +	const struct cred *cred = current_cred();
> +	const struct cred *tcred = __task_cred(t);
> +
> +	if (cred->user->user_ns != tcred->user->user_ns) {
> +		/* userids are not equivalent - either you have the
> +		   capability to the target user ns or you don't */
> +		if (ns_capable(tcred->user->user_ns, CAP_KILL))
> +			return 1;
> +		return 0;
> +	}
> +
> +	/* same user namespace - usual credentials checks apply */
> +	if ((cred->euid ^ tcred->suid) &&
> +	    (cred->euid ^ tcred->uid) &&
> +	    (cred->uid  ^ tcred->suid) &&
> +	    (cred->uid  ^ tcred->uid) &&
> +	    !ns_capable(tcred->user->user_ns, CAP_KILL))
> +		return 0;
> +
> +	return 1;

Isn't that equal to this?

	if (ns_capable(tcred->user->user_ns, CAP_KILL))
		return 1;

	if (cred->user->user_ns == tcred->user->user_ns &&
	    (cred->euid == tcred->suid ||
	     cred->euid == tcred->uid ||
	     cred->uid == tcred->suid ||
	     cred->uid == tcred->uid))
		return 1;

	return 0;

I would consider this much easier to read.

Bastian

-- 
I'm a soldier, not a diplomat.  I can only tell the truth.
		-- Kirk, "Errand of Mercy", stardate 3198.9

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
@ 2011-01-10 21:52       ` Bastian Blank
  0 siblings, 0 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-10 21:52 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers, kernel list, LSM, Eric W. Biederman, Kees Cook,
	Alexey Dobriyan, Michael Kerrisk

On Mon, Jan 10, 2011 at 09:13:34PM +0000, Serge E. Hallyn wrote:
> +	const struct cred *cred = current_cred();
> +	const struct cred *tcred = __task_cred(t);
> +
> +	if (cred->user->user_ns != tcred->user->user_ns) {
> +		/* userids are not equivalent - either you have the
> +		   capability to the target user ns or you don't */
> +		if (ns_capable(tcred->user->user_ns, CAP_KILL))
> +			return 1;
> +		return 0;
> +	}
> +
> +	/* same user namespace - usual credentials checks apply */
> +	if ((cred->euid ^ tcred->suid) &&
> +	    (cred->euid ^ tcred->uid) &&
> +	    (cred->uid  ^ tcred->suid) &&
> +	    (cred->uid  ^ tcred->uid) &&
> +	    !ns_capable(tcred->user->user_ns, CAP_KILL))
> +		return 0;
> +
> +	return 1;

Isn't that equal to this?

	if (ns_capable(tcred->user->user_ns, CAP_KILL))
		return 1;

	if (cred->user->user_ns == tcred->user->user_ns &&
	    (cred->euid == tcred->suid ||
	     cred->euid == tcred->uid ||
	     cred->uid == tcred->suid ||
	     cred->uid == tcred->uid))
		return 1;

	return 0;

I would consider this much easier to read.

Bastian

-- 
I'm a soldier, not a diplomat.  I can only tell the truth.
		-- Kirk, "Errand of Mercy", stardate 3198.9

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c
       [not found]   ` <20110110211406.GF22564-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
@ 2011-01-10 21:59     ` Bastian Blank
  0 siblings, 0 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-10 21:59 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

On Mon, Jan 10, 2011 at 09:14:07PM +0000, Serge E. Hallyn wrote:
> -	if (pcred->uid  != cred->euid &&
> -	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
> +	if (pcred->user->user_ns != cred->user->user_ns &&
> +	    pcred->uid  != cred->euid &&
> +	    pcred->euid != cred->euid &&
> +	    !ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) {

I don't think this is correct. This would not error out if the both
userns are the same. Because the same patern (check uid if same userns,
otherwise only capability) shows up in several parts of the code, maybe
this should be factored out.

> @@ -496,7 +498,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
>  	if (rgid != (gid_t) -1) {
>  		if (old->gid == rgid ||
>  		    old->egid == rgid ||
> -		    capable(CAP_SETGID))
> +		    ns_capable(current_user_ns(), CAP_SETGID))

Would it not possible to add another function (nsown_capable?) that
checks against the own userns?

Bastian

-- 
Change is the essential process of all existence.
		-- Spock, "Let That Be Your Last Battlefield", stardate 5730.2

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c
  2011-01-10 21:14 ` [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c Serge E. Hallyn
       [not found]   ` <20110110211406.GF22564-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
@ 2011-01-10 21:59   ` Bastian Blank
  2011-01-10 22:56     ` Serge Hallyn
                       ` (2 more replies)
  1 sibling, 3 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-10 21:59 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers, kernel list, LSM, Eric W. Biederman, Kees Cook,
	Alexey Dobriyan, Michael Kerrisk

On Mon, Jan 10, 2011 at 09:14:07PM +0000, Serge E. Hallyn wrote:
> -	if (pcred->uid  != cred->euid &&
> -	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
> +	if (pcred->user->user_ns != cred->user->user_ns &&
> +	    pcred->uid  != cred->euid &&
> +	    pcred->euid != cred->euid &&
> +	    !ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) {

I don't think this is correct. This would not error out if the both
userns are the same. Because the same patern (check uid if same userns,
otherwise only capability) shows up in several parts of the code, maybe
this should be factored out.

> @@ -496,7 +498,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
>  	if (rgid != (gid_t) -1) {
>  		if (old->gid == rgid ||
>  		    old->egid == rgid ||
> -		    capable(CAP_SETGID))
> +		    ns_capable(current_user_ns(), CAP_SETGID))

Would it not possible to add another function (nsown_capable?) that
checks against the own userns?

Bastian

-- 
Change is the essential process of all existence.
		-- Spock, "Let That Be Your Last Battlefield", stardate 5730.2

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
       [not found]       ` <20110110215240.GA21351-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
@ 2011-01-10 22:51         ` Serge Hallyn
  0 siblings, 0 replies; 44+ messages in thread
From: Serge Hallyn @ 2011-01-10 22:51 UTC (permalink / raw)
  To: Bastian Blank, Serge E. Hallyn,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM

Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> On Mon, Jan 10, 2011 at 09:13:34PM +0000, Serge E. Hallyn wrote:
> > +	const struct cred *cred = current_cred();
> > +	const struct cred *tcred = __task_cred(t);
> > +
> > +	if (cred->user->user_ns != tcred->user->user_ns) {
> > +		/* userids are not equivalent - either you have the
> > +		   capability to the target user ns or you don't */
> > +		if (ns_capable(tcred->user->user_ns, CAP_KILL))
> > +			return 1;
> > +		return 0;
> > +	}
> > +
> > +	/* same user namespace - usual credentials checks apply */
> > +	if ((cred->euid ^ tcred->suid) &&
> > +	    (cred->euid ^ tcred->uid) &&
> > +	    (cred->uid  ^ tcred->suid) &&
> > +	    (cred->uid  ^ tcred->uid) &&
> > +	    !ns_capable(tcred->user->user_ns, CAP_KILL))
> > +		return 0;
> > +
> > +	return 1;
> 
> Isn't that equal to this?
> 
> 	if (ns_capable(tcred->user->user_ns, CAP_KILL))
> 		return 1;
> 
> 	if (cred->user->user_ns == tcred->user->user_ns &&
> 	    (cred->euid == tcred->suid ||
> 	     cred->euid == tcred->uid ||
> 	     cred->uid == tcred->suid ||
> 	     cred->uid == tcred->uid))
> 		return 1;
> 
> 	return 0;
> 
> I would consider this much easier to read.

Unfortunately, it's actually not equivalent.  when capable()
returns success, then it sets the current->flags |= PF_SUPERPRIV.
If permission is granted based on userids and the capability
isn't needed, then we don't want to needlessly set PF_SUPERPRIV.

That's why I'm going to such lengths to call capable() as a last
resort.

I'm definately open to any ideas that'll get the code cleaner.

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-10 21:52       ` Bastian Blank
  (?)
  (?)
@ 2011-01-10 22:51       ` Serge Hallyn
  2011-01-10 23:23         ` Bastian Blank
                           ` (2 more replies)
  -1 siblings, 3 replies; 44+ messages in thread
From: Serge Hallyn @ 2011-01-10 22:51 UTC (permalink / raw)
  To: Bastian Blank, Serge E. Hallyn, containers, kernel list, LSM,
	Eric W. Biederman, Kees Cook, Alexey Dobriyan, Michael Kerrisk

Quoting Bastian Blank (bastian@waldi.eu.org):
> On Mon, Jan 10, 2011 at 09:13:34PM +0000, Serge E. Hallyn wrote:
> > +	const struct cred *cred = current_cred();
> > +	const struct cred *tcred = __task_cred(t);
> > +
> > +	if (cred->user->user_ns != tcred->user->user_ns) {
> > +		/* userids are not equivalent - either you have the
> > +		   capability to the target user ns or you don't */
> > +		if (ns_capable(tcred->user->user_ns, CAP_KILL))
> > +			return 1;
> > +		return 0;
> > +	}
> > +
> > +	/* same user namespace - usual credentials checks apply */
> > +	if ((cred->euid ^ tcred->suid) &&
> > +	    (cred->euid ^ tcred->uid) &&
> > +	    (cred->uid  ^ tcred->suid) &&
> > +	    (cred->uid  ^ tcred->uid) &&
> > +	    !ns_capable(tcred->user->user_ns, CAP_KILL))
> > +		return 0;
> > +
> > +	return 1;
> 
> Isn't that equal to this?
> 
> 	if (ns_capable(tcred->user->user_ns, CAP_KILL))
> 		return 1;
> 
> 	if (cred->user->user_ns == tcred->user->user_ns &&
> 	    (cred->euid == tcred->suid ||
> 	     cred->euid == tcred->uid ||
> 	     cred->uid == tcred->suid ||
> 	     cred->uid == tcred->uid))
> 		return 1;
> 
> 	return 0;
> 
> I would consider this much easier to read.

Unfortunately, it's actually not equivalent.  when capable()
returns success, then it sets the current->flags |= PF_SUPERPRIV.
If permission is granted based on userids and the capability
isn't needed, then we don't want to needlessly set PF_SUPERPRIV.

That's why I'm going to such lengths to call capable() as a last
resort.

I'm definately open to any ideas that'll get the code cleaner.

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c
       [not found]     ` <20110110215937.GB21351-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
@ 2011-01-10 22:56       ` Serge Hallyn
  2011-01-11  5:27       ` Serge E. Hallyn
  1 sibling, 0 replies; 44+ messages in thread
From: Serge Hallyn @ 2011-01-10 22:56 UTC (permalink / raw)
  To: Bastian Blank, Serge E. Hallyn,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM

Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> On Mon, Jan 10, 2011 at 09:14:07PM +0000, Serge E. Hallyn wrote:
> > -	if (pcred->uid  != cred->euid &&
> > -	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
> > +	if (pcred->user->user_ns != cred->user->user_ns &&
> > +	    pcred->uid  != cred->euid &&
> > +	    pcred->euid != cred->euid &&
> > +	    !ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) {
> 
> I don't think this is correct. This would not error out if the both
> userns are the same. Because the same patern (check uid if same userns,
> otherwise only capability) shows up in several parts of the code, maybe
> this should be factored out.

Yes, I'm really not sure what I was thinking here.  Thanks!

> > @@ -496,7 +498,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
> >  	if (rgid != (gid_t) -1) {
> >  		if (old->gid == rgid ||
> >  		    old->egid == rgid ||
> > -		    capable(CAP_SETGID))
> > +		    ns_capable(current_user_ns(), CAP_SETGID))
> 
> Would it not possible to add another function (nsown_capable?) that
> checks against the own userns?

Good idea, I'll add that one.

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c
  2011-01-10 21:59   ` Bastian Blank
@ 2011-01-10 22:56     ` Serge Hallyn
       [not found]     ` <20110110215937.GB21351-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
  2011-01-11  5:27     ` Serge E. Hallyn
  2 siblings, 0 replies; 44+ messages in thread
From: Serge Hallyn @ 2011-01-10 22:56 UTC (permalink / raw)
  To: Bastian Blank, Serge E. Hallyn, containers, kernel list, LSM,
	Eric W. Biederman, Kees Cook, Alexey Dobriyan, Michael Kerrisk

Quoting Bastian Blank (bastian@waldi.eu.org):
> On Mon, Jan 10, 2011 at 09:14:07PM +0000, Serge E. Hallyn wrote:
> > -	if (pcred->uid  != cred->euid &&
> > -	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
> > +	if (pcred->user->user_ns != cred->user->user_ns &&
> > +	    pcred->uid  != cred->euid &&
> > +	    pcred->euid != cred->euid &&
> > +	    !ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) {
> 
> I don't think this is correct. This would not error out if the both
> userns are the same. Because the same patern (check uid if same userns,
> otherwise only capability) shows up in several parts of the code, maybe
> this should be factored out.

Yes, I'm really not sure what I was thinking here.  Thanks!

> > @@ -496,7 +498,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
> >  	if (rgid != (gid_t) -1) {
> >  		if (old->gid == rgid ||
> >  		    old->egid == rgid ||
> > -		    capable(CAP_SETGID))
> > +		    ns_capable(current_user_ns(), CAP_SETGID))
> 
> Would it not possible to add another function (nsown_capable?) that
> checks against the own userns?

Good idea, I'll add that one.

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-10 22:51       ` Serge Hallyn
@ 2011-01-10 23:23         ` Bastian Blank
  2011-01-10 23:23         ` Bastian Blank
  2011-01-11  4:22           ` Oren Laadan
  2 siblings, 0 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-10 23:23 UTC (permalink / raw)
  To: Serge Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Michael Kerrisk, Kees Cook,
	Alexey Dobriyan

On Mon, Jan 10, 2011 at 04:51:51PM -0600, Serge Hallyn wrote:
> Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> > Isn't that equal to this?
> > 
> > 	if (ns_capable(tcred->user->user_ns, CAP_KILL))
> > 		return 1;
> > 
> > 	if (cred->user->user_ns == tcred->user->user_ns &&
> > 	    (cred->euid == tcred->suid ||
> > 	     cred->euid == tcred->uid ||
> > 	     cred->uid == tcred->suid ||
> > 	     cred->uid == tcred->uid))
> > 		return 1;
> > 
> > 	return 0;
> > 
> > I would consider this much easier to read.
> 
> Unfortunately, it's actually not equivalent.  when capable()
> returns success, then it sets the current->flags |= PF_SUPERPRIV.
> If permission is granted based on userids and the capability
> isn't needed, then we don't want to needlessly set PF_SUPERPRIV.

Well, then switch the two if-clauses.

What is this flag used for anyway? I only see it used in the accounting
stuff, and if every user can get it, it is not longer useful.

Bastian

-- 
Bones: "The man's DEAD, Jim!"

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-10 22:51       ` Serge Hallyn
  2011-01-10 23:23         ` Bastian Blank
@ 2011-01-10 23:23         ` Bastian Blank
       [not found]           ` <20110110232335.GA27029-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
  2011-01-11  1:31           ` Serge E. Hallyn
  2011-01-11  4:22           ` Oren Laadan
  2 siblings, 2 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-10 23:23 UTC (permalink / raw)
  To: Serge Hallyn
  Cc: Serge E. Hallyn, containers, kernel list, LSM, Eric W. Biederman,
	Kees Cook, Alexey Dobriyan, Michael Kerrisk

On Mon, Jan 10, 2011 at 04:51:51PM -0600, Serge Hallyn wrote:
> Quoting Bastian Blank (bastian@waldi.eu.org):
> > Isn't that equal to this?
> > 
> > 	if (ns_capable(tcred->user->user_ns, CAP_KILL))
> > 		return 1;
> > 
> > 	if (cred->user->user_ns == tcred->user->user_ns &&
> > 	    (cred->euid == tcred->suid ||
> > 	     cred->euid == tcred->uid ||
> > 	     cred->uid == tcred->suid ||
> > 	     cred->uid == tcred->uid))
> > 		return 1;
> > 
> > 	return 0;
> > 
> > I would consider this much easier to read.
> 
> Unfortunately, it's actually not equivalent.  when capable()
> returns success, then it sets the current->flags |= PF_SUPERPRIV.
> If permission is granted based on userids and the capability
> isn't needed, then we don't want to needlessly set PF_SUPERPRIV.

Well, then switch the two if-clauses.

What is this flag used for anyway? I only see it used in the accounting
stuff, and if every user can get it, it is not longer useful.

Bastian

-- 
Bones: "The man's DEAD, Jim!"

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
       [not found]           ` <20110110232335.GA27029-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
@ 2011-01-11  1:31             ` Serge E. Hallyn
  0 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-11  1:31 UTC (permalink / raw)
  To: Bastian Blank, Serge Hallyn, Serge E. Hallyn,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> On Mon, Jan 10, 2011 at 04:51:51PM -0600, Serge Hallyn wrote:
> > Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> > > Isn't that equal to this?
> > > 
> > > 	if (ns_capable(tcred->user->user_ns, CAP_KILL))
> > > 		return 1;
> > > 
> > > 	if (cred->user->user_ns == tcred->user->user_ns &&
> > > 	    (cred->euid == tcred->suid ||
> > > 	     cred->euid == tcred->uid ||
> > > 	     cred->uid == tcred->suid ||
> > > 	     cred->uid == tcred->uid))
> > > 		return 1;
> > > 
> > > 	return 0;
> > > 
> > > I would consider this much easier to read.
> > 
> > Unfortunately, it's actually not equivalent.  when capable()
> > returns success, then it sets the current->flags |= PF_SUPERPRIV.
> > If permission is granted based on userids and the capability
> > isn't needed, then we don't want to needlessly set PF_SUPERPRIV.
> 
> Well, then switch the two if-clauses.

hup, will do, much nicer, thanks.

> What is this flag used for anyway? I only see it used in the accounting
> stuff, and if every user can get it, it is not longer useful.

hm, I'm not sure...  maybe noone is using it!

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-10 23:23         ` Bastian Blank
       [not found]           ` <20110110232335.GA27029-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
@ 2011-01-11  1:31           ` Serge E. Hallyn
       [not found]             ` <20110111013152.GA23860-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
  2011-01-14 14:50             ` Bastian Blank
  1 sibling, 2 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-11  1:31 UTC (permalink / raw)
  To: Bastian Blank, Serge Hallyn, Serge E. Hallyn, containers,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

Quoting Bastian Blank (bastian@waldi.eu.org):
> On Mon, Jan 10, 2011 at 04:51:51PM -0600, Serge Hallyn wrote:
> > Quoting Bastian Blank (bastian@waldi.eu.org):
> > > Isn't that equal to this?
> > > 
> > > 	if (ns_capable(tcred->user->user_ns, CAP_KILL))
> > > 		return 1;
> > > 
> > > 	if (cred->user->user_ns == tcred->user->user_ns &&
> > > 	    (cred->euid == tcred->suid ||
> > > 	     cred->euid == tcred->uid ||
> > > 	     cred->uid == tcred->suid ||
> > > 	     cred->uid == tcred->uid))
> > > 		return 1;
> > > 
> > > 	return 0;
> > > 
> > > I would consider this much easier to read.
> > 
> > Unfortunately, it's actually not equivalent.  when capable()
> > returns success, then it sets the current->flags |= PF_SUPERPRIV.
> > If permission is granted based on userids and the capability
> > isn't needed, then we don't want to needlessly set PF_SUPERPRIV.
> 
> Well, then switch the two if-clauses.

hup, will do, much nicer, thanks.

> What is this flag used for anyway? I only see it used in the accounting
> stuff, and if every user can get it, it is not longer useful.

hm, I'm not sure...  maybe noone is using it!

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-10 22:51       ` Serge Hallyn
@ 2011-01-11  4:22           ` Oren Laadan
  2011-01-10 23:23         ` Bastian Blank
  2011-01-11  4:22           ` Oren Laadan
  2 siblings, 0 replies; 44+ messages in thread
From: Oren Laadan @ 2011-01-11  4:22 UTC (permalink / raw)
  To: Serge Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, Bastian Blank, LSM, Eric W. Biederman,
	Michael Kerrisk, Kees Cook, Alexey Dobriyan



On 01/10/2011 05:51 PM, Serge Hallyn wrote:
> Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
>> On Mon, Jan 10, 2011 at 09:13:34PM +0000, Serge E. Hallyn wrote:
>>> +	const struct cred *cred = current_cred();
>>> +	const struct cred *tcred = __task_cred(t);
>>> +
>>> +	if (cred->user->user_ns != tcred->user->user_ns) {
>>> +		/* userids are not equivalent - either you have the
>>> +		   capability to the target user ns or you don't */
>>> +		if (ns_capable(tcred->user->user_ns, CAP_KILL))
>>> +			return 1;
>>> +		return 0;
>>> +	}
>>> +
>>> +	/* same user namespace - usual credentials checks apply */
>>> +	if ((cred->euid ^ tcred->suid) &&
>>> +	    (cred->euid ^ tcred->uid) &&
>>> +	    (cred->uid  ^ tcred->suid) &&
>>> +	    (cred->uid  ^ tcred->uid) &&
>>> +	    !ns_capable(tcred->user->user_ns, CAP_KILL))
>>> +		return 0;
>>> +
>>> +	return 1;
>>
>> Isn't that equal to this?
>>
>> 	if (ns_capable(tcred->user->user_ns, CAP_KILL))
>> 		return 1;
>>
>> 	if (cred->user->user_ns == tcred->user->user_ns &&
>> 	    (cred->euid == tcred->suid ||
>> 	     cred->euid == tcred->uid ||
>> 	     cred->uid == tcred->suid ||
>> 	     cred->uid == tcred->uid))
>> 		return 1;
>>
>> 	return 0;
>>
>> I would consider this much easier to read.
> 
> Unfortunately, it's actually not equivalent.  when capable()
> returns success, then it sets the current->flags |= PF_SUPERPRIV.
> If permission is granted based on userids and the capability
> isn't needed, then we don't want to needlessly set PF_SUPERPRIV.

A bit off-topic: does this means that c/r needs to save and 
restore this process flag ?
> 
> That's why I'm going to such lengths to call capable() as a last
> resort.

IMHO, worth a one line comment in the code ...

Oren.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
@ 2011-01-11  4:22           ` Oren Laadan
  0 siblings, 0 replies; 44+ messages in thread
From: Oren Laadan @ 2011-01-11  4:22 UTC (permalink / raw)
  To: Serge Hallyn
  Cc: Bastian Blank, Serge E. Hallyn, containers, kernel list, LSM,
	Eric W. Biederman, Kees Cook, Alexey Dobriyan, Michael Kerrisk



On 01/10/2011 05:51 PM, Serge Hallyn wrote:
> Quoting Bastian Blank (bastian@waldi.eu.org):
>> On Mon, Jan 10, 2011 at 09:13:34PM +0000, Serge E. Hallyn wrote:
>>> +	const struct cred *cred = current_cred();
>>> +	const struct cred *tcred = __task_cred(t);
>>> +
>>> +	if (cred->user->user_ns != tcred->user->user_ns) {
>>> +		/* userids are not equivalent - either you have the
>>> +		   capability to the target user ns or you don't */
>>> +		if (ns_capable(tcred->user->user_ns, CAP_KILL))
>>> +			return 1;
>>> +		return 0;
>>> +	}
>>> +
>>> +	/* same user namespace - usual credentials checks apply */
>>> +	if ((cred->euid ^ tcred->suid) &&
>>> +	    (cred->euid ^ tcred->uid) &&
>>> +	    (cred->uid  ^ tcred->suid) &&
>>> +	    (cred->uid  ^ tcred->uid) &&
>>> +	    !ns_capable(tcred->user->user_ns, CAP_KILL))
>>> +		return 0;
>>> +
>>> +	return 1;
>>
>> Isn't that equal to this?
>>
>> 	if (ns_capable(tcred->user->user_ns, CAP_KILL))
>> 		return 1;
>>
>> 	if (cred->user->user_ns == tcred->user->user_ns &&
>> 	    (cred->euid == tcred->suid ||
>> 	     cred->euid == tcred->uid ||
>> 	     cred->uid == tcred->suid ||
>> 	     cred->uid == tcred->uid))
>> 		return 1;
>>
>> 	return 0;
>>
>> I would consider this much easier to read.
> 
> Unfortunately, it's actually not equivalent.  when capable()
> returns success, then it sets the current->flags |= PF_SUPERPRIV.
> If permission is granted based on userids and the capability
> isn't needed, then we don't want to needlessly set PF_SUPERPRIV.

A bit off-topic: does this means that c/r needs to save and 
restore this process flag ?
> 
> That's why I'm going to such lengths to call capable() as a last
> resort.

IMHO, worth a one line comment in the code ...

Oren.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
       [not found]           ` <4D2BDAF4.9040908-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2011-01-11  4:32             ` Serge E. Hallyn
  0 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-11  4:32 UTC (permalink / raw)
  To: Oren Laadan
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Bastian Blank, kernel list, LSM, Eric W. Biederman,
	Michael Kerrisk, Kees Cook, Alexey Dobriyan

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
...
> > If permission is granted based on userids and the capability
> > isn't needed, then we don't want to needlessly set PF_SUPERPRIV.
> 
> A bit off-topic: does this means that c/r needs to save and 
> restore this process flag ?

It should, yeah.  (Until we decide to nuke the flag)

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-11  4:22           ` Oren Laadan
  (?)
  (?)
@ 2011-01-11  4:32           ` Serge E. Hallyn
  -1 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-11  4:32 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Serge Hallyn, Bastian Blank, Serge E. Hallyn, containers,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

Quoting Oren Laadan (orenl@cs.columbia.edu):
...
> > If permission is granted based on userids and the capability
> > isn't needed, then we don't want to needlessly set PF_SUPERPRIV.
> 
> A bit off-topic: does this means that c/r needs to save and 
> restore this process flag ?

It should, yeah.  (Until we decide to nuke the flag)

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c
       [not found]     ` <20110110215937.GB21351-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
  2011-01-10 22:56       ` Serge Hallyn
@ 2011-01-11  5:27       ` Serge E. Hallyn
  1 sibling, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-11  5:27 UTC (permalink / raw)
  To: Bastian Blank, Serge E. Hallyn,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM

Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> On Mon, Jan 10, 2011 at 09:14:07PM +0000, Serge E. Hallyn wrote:
> > -	if (pcred->uid  != cred->euid &&
> > -	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
> > +	if (pcred->user->user_ns != cred->user->user_ns &&
> > +	    pcred->uid  != cred->euid &&
> > +	    pcred->euid != cred->euid &&
> > +	    !ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) {
> 
> I don't think this is correct. This would not error out if the both
> userns are the same. Because the same patern (check uid if same userns,
> otherwise only capability) shows up in several parts of the code, maybe
> this should be factored out.

Yeah, I'd really like to factor this out because it shows up everywhere
and I have to think about it every time I look at it.  But each time it
shows up, the uids being compared slightly change.  There must be some
clever way of doing it, hopefully it'll fall out soon.

Eric's ns_capable() has already simplified this quite a bit - which is
part of why I've sometimes not been thinking about it right, it's now
simpler than it used to be :)

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c
  2011-01-10 21:59   ` Bastian Blank
  2011-01-10 22:56     ` Serge Hallyn
       [not found]     ` <20110110215937.GB21351-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
@ 2011-01-11  5:27     ` Serge E. Hallyn
       [not found]       ` <20110111052759.GA24970-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
  2011-01-14 15:02       ` Bastian Blank
  2 siblings, 2 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-11  5:27 UTC (permalink / raw)
  To: Bastian Blank, Serge E. Hallyn, containers, kernel list, LSM,
	Eric W. Biederman, Kees Cook, Alexey Dobriyan, Michael Kerrisk

Quoting Bastian Blank (bastian@waldi.eu.org):
> On Mon, Jan 10, 2011 at 09:14:07PM +0000, Serge E. Hallyn wrote:
> > -	if (pcred->uid  != cred->euid &&
> > -	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
> > +	if (pcred->user->user_ns != cred->user->user_ns &&
> > +	    pcred->uid  != cred->euid &&
> > +	    pcred->euid != cred->euid &&
> > +	    !ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) {
> 
> I don't think this is correct. This would not error out if the both
> userns are the same. Because the same patern (check uid if same userns,
> otherwise only capability) shows up in several parts of the code, maybe
> this should be factored out.

Yeah, I'd really like to factor this out because it shows up everywhere
and I have to think about it every time I look at it.  But each time it
shows up, the uids being compared slightly change.  There must be some
clever way of doing it, hopefully it'll fall out soon.

Eric's ns_capable() has already simplified this quite a bit - which is
part of why I've sometimes not been thinking about it right, it's now
simpler than it used to be :)

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
       [not found]             ` <20110111013152.GA23860-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
@ 2011-01-14 14:50               ` Bastian Blank
  0 siblings, 0 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-14 14:50 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, kernel list, LSM

On Tue, Jan 11, 2011 at 01:31:52AM +0000, Serge E. Hallyn wrote:
> Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> > What is this flag used for anyway? I only see it used in the accounting
> > stuff, and if every user can get it, it is not longer useful.
> hm, I'm not sure...  maybe noone is using it!

This flag is from pre-git.

The only information is:
| #define ASU                0x02    /* ... used super-user privileges */

However with your patches (or at least the goal), everyone is super-user
in derived namespaces.

Bastian

-- 
Where there's no emotion, there's no motive for violence.
		-- Spock, "Dagger of the Mind", stardate 2715.1

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-11  1:31           ` Serge E. Hallyn
       [not found]             ` <20110111013152.GA23860-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
@ 2011-01-14 14:50             ` Bastian Blank
       [not found]               ` <20110114145001.GB20945-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
  2011-01-15  0:31               ` Serge E. Hallyn
  1 sibling, 2 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-14 14:50 UTC (permalink / raw)
  To: containers, kernel list, LSM

On Tue, Jan 11, 2011 at 01:31:52AM +0000, Serge E. Hallyn wrote:
> Quoting Bastian Blank (bastian@waldi.eu.org):
> > What is this flag used for anyway? I only see it used in the accounting
> > stuff, and if every user can get it, it is not longer useful.
> hm, I'm not sure...  maybe noone is using it!

This flag is from pre-git.

The only information is:
| #define ASU                0x02    /* ... used super-user privileges */

However with your patches (or at least the goal), everyone is super-user
in derived namespaces.

Bastian

-- 
Where there's no emotion, there's no motive for violence.
		-- Spock, "Dagger of the Mind", stardate 2715.1

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c
       [not found]       ` <20110111052759.GA24970-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
@ 2011-01-14 15:02         ` Bastian Blank
  0 siblings, 0 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-14 15:02 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

On Tue, Jan 11, 2011 at 05:27:59AM +0000, Serge E. Hallyn wrote:
> Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> > On Mon, Jan 10, 2011 at 09:14:07PM +0000, Serge E. Hallyn wrote:
> > > -	if (pcred->uid  != cred->euid &&
> > > -	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
> > > +	if (pcred->user->user_ns != cred->user->user_ns &&
> > > +	    pcred->uid  != cred->euid &&
> > > +	    pcred->euid != cred->euid &&
> > > +	    !ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) {
> > 
> > I don't think this is correct. This would not error out if the both
> > userns are the same. Because the same patern (check uid if same userns,
> > otherwise only capability) shows up in several parts of the code, maybe
> > this should be factored out.
> 
> Yeah, I'd really like to factor this out because it shows up everywhere
> and I have to think about it every time I look at it.  But each time it
> shows up, the uids being compared slightly change.  There must be some
> clever way of doing it, hopefully it'll fall out soon.

Well, then make mostly identical (_inline_) functions in one location
(include/linux/cred.h comes in mind).  You can ask later why they have
to be different.

You are scaling the complexity up. So you need to make it somehow
manageable, and even slightly different versions in one place are much
easier to handle than the same in many different places.

kill_ok_by_cred would be:
cred_check_euid_suid(struct task_struct *p, X capable)

set_one_prio_perm would be:
cred_check_euid_euid(struct task_struct *p, X capable)

Bastian

-- 
	"Life and death are seldom logical."
	"But attaining a desired goal always is."
		-- McCoy and Spock, "The Galileo Seven", stardate 2821.7

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c
  2011-01-11  5:27     ` Serge E. Hallyn
       [not found]       ` <20110111052759.GA24970-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
@ 2011-01-14 15:02       ` Bastian Blank
  1 sibling, 0 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-14 15:02 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers, kernel list, LSM, Eric W. Biederman, Kees Cook,
	Alexey Dobriyan, Michael Kerrisk

On Tue, Jan 11, 2011 at 05:27:59AM +0000, Serge E. Hallyn wrote:
> Quoting Bastian Blank (bastian@waldi.eu.org):
> > On Mon, Jan 10, 2011 at 09:14:07PM +0000, Serge E. Hallyn wrote:
> > > -	if (pcred->uid  != cred->euid &&
> > > -	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
> > > +	if (pcred->user->user_ns != cred->user->user_ns &&
> > > +	    pcred->uid  != cred->euid &&
> > > +	    pcred->euid != cred->euid &&
> > > +	    !ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) {
> > 
> > I don't think this is correct. This would not error out if the both
> > userns are the same. Because the same patern (check uid if same userns,
> > otherwise only capability) shows up in several parts of the code, maybe
> > this should be factored out.
> 
> Yeah, I'd really like to factor this out because it shows up everywhere
> and I have to think about it every time I look at it.  But each time it
> shows up, the uids being compared slightly change.  There must be some
> clever way of doing it, hopefully it'll fall out soon.

Well, then make mostly identical (_inline_) functions in one location
(include/linux/cred.h comes in mind).  You can ask later why they have
to be different.

You are scaling the complexity up. So you need to make it somehow
manageable, and even slightly different versions in one place are much
easier to handle than the same in many different places.

kill_ok_by_cred would be:
cred_check_euid_suid(struct task_struct *p, X capable)

set_one_prio_perm would be:
cred_check_euid_euid(struct task_struct *p, X capable)

Bastian

-- 
	"Life and death are seldom logical."
	"But attaining a desired goal always is."
		-- McCoy and Spock, "The Galileo Seven", stardate 2821.7

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
       [not found]               ` <20110114145001.GB20945-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
@ 2011-01-15  0:31                 ` Serge E. Hallyn
  0 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-15  0:31 UTC (permalink / raw)
  To: Bastian Blank,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM

Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> On Tue, Jan 11, 2011 at 01:31:52AM +0000, Serge E. Hallyn wrote:
> > Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> > > What is this flag used for anyway? I only see it used in the accounting
> > > stuff, and if every user can get it, it is not longer useful.
> > hm, I'm not sure...  maybe noone is using it!
> 
> This flag is from pre-git.
> 
> The only information is:
> | #define ASU                0x02    /* ... used super-user privileges */
> 
> However with your patches (or at least the goal), everyone is super-user
> in derived namespaces.

No, a task just sitting in a derived ns won't necessarily need/use
super-user privileges...  (and, if we ever get far enough along, it
won't even necessarily have CAP_SYS_ADMIN/etc targeted to the parent
userns, bc it won't need those to do the unshares).

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-14 14:50             ` Bastian Blank
       [not found]               ` <20110114145001.GB20945-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
@ 2011-01-15  0:31               ` Serge E. Hallyn
  2011-01-15 11:30                 ` Bastian Blank
       [not found]                 ` <20110115003114.GA24569-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
  1 sibling, 2 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-15  0:31 UTC (permalink / raw)
  To: Bastian Blank, containers, kernel list, LSM

Quoting Bastian Blank (bastian@waldi.eu.org):
> On Tue, Jan 11, 2011 at 01:31:52AM +0000, Serge E. Hallyn wrote:
> > Quoting Bastian Blank (bastian@waldi.eu.org):
> > > What is this flag used for anyway? I only see it used in the accounting
> > > stuff, and if every user can get it, it is not longer useful.
> > hm, I'm not sure...  maybe noone is using it!
> 
> This flag is from pre-git.
> 
> The only information is:
> | #define ASU                0x02    /* ... used super-user privileges */
> 
> However with your patches (or at least the goal), everyone is super-user
> in derived namespaces.

No, a task just sitting in a derived ns won't necessarily need/use
super-user privileges...  (and, if we ever get far enough along, it
won't even necessarily have CAP_SYS_ADMIN/etc targeted to the parent
userns, bc it won't need those to do the unshares).

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
       [not found]                 ` <20110115003114.GA24569-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
@ 2011-01-15 11:30                   ` Bastian Blank
  0 siblings, 0 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-15 11:30 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: LSM, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, kernel list

On Sat, Jan 15, 2011 at 12:31:14AM +0000, Serge E. Hallyn wrote:
> Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> > On Tue, Jan 11, 2011 at 01:31:52AM +0000, Serge E. Hallyn wrote:
> > > Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> > > > What is this flag used for anyway? I only see it used in the accounting
> > > > stuff, and if every user can get it, it is not longer useful.
> > > hm, I'm not sure...  maybe noone is using it!
> > However with your patches (or at least the goal), everyone is super-user
> > in derived namespaces.
> 
> No, a task just sitting in a derived ns won't necessarily need/use
> super-user privileges...  (and, if we ever get far enough along, it
> won't even necessarily have CAP_SYS_ADMIN/etc targeted to the parent
> userns, bc it won't need those to do the unshares).

The goal, as I understand it, is that everyone can create derived user
namespaces. However the creator have automatically all the capabilities
in the derived namespace. So he can use them to gain this super-user
flag. Even killing a tasks in the derived namespace needs the capability
already.

Bastian

-- 
Those who hate and fight must stop themselves -- otherwise it is not stopped.
		-- Spock, "Day of the Dove", stardate unknown

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-15  0:31               ` Serge E. Hallyn
@ 2011-01-15 11:30                 ` Bastian Blank
       [not found]                   ` <20110115113025.GB18682-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
  2011-01-15 14:12                   ` Serge E. Hallyn
       [not found]                 ` <20110115003114.GA24569-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
  1 sibling, 2 replies; 44+ messages in thread
From: Bastian Blank @ 2011-01-15 11:30 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers, kernel list, LSM

On Sat, Jan 15, 2011 at 12:31:14AM +0000, Serge E. Hallyn wrote:
> Quoting Bastian Blank (bastian@waldi.eu.org):
> > On Tue, Jan 11, 2011 at 01:31:52AM +0000, Serge E. Hallyn wrote:
> > > Quoting Bastian Blank (bastian@waldi.eu.org):
> > > > What is this flag used for anyway? I only see it used in the accounting
> > > > stuff, and if every user can get it, it is not longer useful.
> > > hm, I'm not sure...  maybe noone is using it!
> > However with your patches (or at least the goal), everyone is super-user
> > in derived namespaces.
> 
> No, a task just sitting in a derived ns won't necessarily need/use
> super-user privileges...  (and, if we ever get far enough along, it
> won't even necessarily have CAP_SYS_ADMIN/etc targeted to the parent
> userns, bc it won't need those to do the unshares).

The goal, as I understand it, is that everyone can create derived user
namespaces. However the creator have automatically all the capabilities
in the derived namespace. So he can use them to gain this super-user
flag. Even killing a tasks in the derived namespace needs the capability
already.

Bastian

-- 
Those who hate and fight must stop themselves -- otherwise it is not stopped.
		-- Spock, "Day of the Dove", stardate unknown

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
       [not found]                   ` <20110115113025.GB18682-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
@ 2011-01-15 14:12                     ` Serge E. Hallyn
  0 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-15 14:12 UTC (permalink / raw)
  To: Bastian Blank, Serge E. Hallyn,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, LSM

Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> On Sat, Jan 15, 2011 at 12:31:14AM +0000, Serge E. Hallyn wrote:
> > Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> > > On Tue, Jan 11, 2011 at 01:31:52AM +0000, Serge E. Hallyn wrote:
> > > > Quoting Bastian Blank (bastian-yyjItF7Rl6lg9hUCZPvPmw@public.gmane.org):
> > > > > What is this flag used for anyway? I only see it used in the accounting
> > > > > stuff, and if every user can get it, it is not longer useful.
> > > > hm, I'm not sure...  maybe noone is using it!
> > > However with your patches (or at least the goal), everyone is super-user
> > > in derived namespaces.
> > 
> > No, a task just sitting in a derived ns won't necessarily need/use
> > super-user privileges...  (and, if we ever get far enough along, it
> > won't even necessarily have CAP_SYS_ADMIN/etc targeted to the parent
> > userns, bc it won't need those to do the unshares).
> 
> The goal, as I understand it, is that everyone can create derived user
> namespaces. However the creator have automatically all the capabilities
> in the derived namespace.

Yes.  While he is root in that namespace.  But then he can drop
privileges and spawn new tasks, which have no capabilities in any
namespace.

Even if he didn't do that, the PF_SUPERPRIV flag doesn't say that
you have capabilities, but that you used them.  So if I star a task
as root which never does anything but ls /root, then the flag should
not be set.

> So he can use them to gain this super-user
> flag. Even killing a tasks in the derived namespace needs the capability
> already.

-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 4/7] allow killing tasks in your own or child userns
  2011-01-15 11:30                 ` Bastian Blank
       [not found]                   ` <20110115113025.GB18682-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
@ 2011-01-15 14:12                   ` Serge E. Hallyn
  1 sibling, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-15 14:12 UTC (permalink / raw)
  To: Bastian Blank, Serge E. Hallyn, containers, kernel list, LSM

Quoting Bastian Blank (bastian@waldi.eu.org):
> On Sat, Jan 15, 2011 at 12:31:14AM +0000, Serge E. Hallyn wrote:
> > Quoting Bastian Blank (bastian@waldi.eu.org):
> > > On Tue, Jan 11, 2011 at 01:31:52AM +0000, Serge E. Hallyn wrote:
> > > > Quoting Bastian Blank (bastian@waldi.eu.org):
> > > > > What is this flag used for anyway? I only see it used in the accounting
> > > > > stuff, and if every user can get it, it is not longer useful.
> > > > hm, I'm not sure...  maybe noone is using it!
> > > However with your patches (or at least the goal), everyone is super-user
> > > in derived namespaces.
> > 
> > No, a task just sitting in a derived ns won't necessarily need/use
> > super-user privileges...  (and, if we ever get far enough along, it
> > won't even necessarily have CAP_SYS_ADMIN/etc targeted to the parent
> > userns, bc it won't need those to do the unshares).
> 
> The goal, as I understand it, is that everyone can create derived user
> namespaces. However the creator have automatically all the capabilities
> in the derived namespace.

Yes.  While he is root in that namespace.  But then he can drop
privileges and spawn new tasks, which have no capabilities in any
namespace.

Even if he didn't do that, the PF_SUPERPRIV flag doesn't say that
you have capabilities, but that you used them.  So if I star a task
as root which never does anything but ls /root, then the flag should
not be set.

> So he can use them to gain this super-user
> flag. Even killing a tasks in the derived namespace needs the capability
> already.

-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

* userns: targeted capabilities v3
@ 2011-01-10 21:11 Serge E. Hallyn
  0 siblings, 0 replies; 44+ messages in thread
From: Serge E. Hallyn @ 2011-01-10 21:11 UTC (permalink / raw)
  To: LSM
  Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	kernel list, Eric W. Biederman, Kees Cook, Alexey Dobriyan,
	Michael Kerrisk

Following is the next version of my user namespace patchset.

The core of the set is patch 2, originally conceived and
implemented by Eric Biederman.  The concept is to target
capabilities at user namespaces.  A task's capabilities are
now contextualized as follows (previously, capabilities had
no context):

1. For a task in the initial user namespace, the calculated
capabilities (pi, pe, pp) are available to act upon any
user namespace.

2. For a task in a child user namespace, the calculated
capabilities are available to act only on its own or any
descendent user namespace.  It has no capabilities to any
parent or unrelated user namespaces.

3. If a user A creates a new user namespace, that user has
all capabilities to that new user namespace and any of its
descendents.  (Contrast this with a user namespace created
by another user B in the same user namespace, to which this
user A has only his calculated capabilities)

All existing 'capable' checks are automatically converted to
checks against the initial user namespace.  The rest of the
patches begin to enable capabilities in child user namespaces
to setuid, setgid, set hostnames, kill tasks, and do ptrace.

My next step would be to re-introduce a part of a several year
old patchset which assigns a userns to a superblock (and hence
to inodes), and grants 'user other' permissions to any task
whose uid does not map to the target userns.  (By default, this
will be all but the initial userns)

thanks,
-serge

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2011-01-15 14:12 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-10 21:11 userns: targeted capabilities v3 Serge E. Hallyn
2011-01-10 21:13 ` [PATCH 1/7] Add a user_namespace as creator/owner of uts_namespace Serge E. Hallyn
2011-01-10 21:13 ` [PATCH 2/7] security: Make capabilities relative to the user namespace Serge E. Hallyn
2011-01-10 21:13 ` [PATCH 3/7] allow sethostname in a container Serge E. Hallyn
2011-01-10 21:13 ` [PATCH 4/7] allow killing tasks in your own or child userns Serge E. Hallyn
     [not found]   ` <20110110211334.GD22564-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
2011-01-10 21:52     ` Bastian Blank
2011-01-10 21:52       ` Bastian Blank
     [not found]       ` <20110110215240.GA21351-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
2011-01-10 22:51         ` Serge Hallyn
2011-01-10 22:51       ` Serge Hallyn
2011-01-10 23:23         ` Bastian Blank
2011-01-10 23:23         ` Bastian Blank
     [not found]           ` <20110110232335.GA27029-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
2011-01-11  1:31             ` Serge E. Hallyn
2011-01-11  1:31           ` Serge E. Hallyn
     [not found]             ` <20110111013152.GA23860-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
2011-01-14 14:50               ` Bastian Blank
2011-01-14 14:50             ` Bastian Blank
     [not found]               ` <20110114145001.GB20945-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
2011-01-15  0:31                 ` Serge E. Hallyn
2011-01-15  0:31               ` Serge E. Hallyn
2011-01-15 11:30                 ` Bastian Blank
     [not found]                   ` <20110115113025.GB18682-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
2011-01-15 14:12                     ` Serge E. Hallyn
2011-01-15 14:12                   ` Serge E. Hallyn
     [not found]                 ` <20110115003114.GA24569-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
2011-01-15 11:30                   ` Bastian Blank
2011-01-11  4:22         ` Oren Laadan
2011-01-11  4:22           ` Oren Laadan
     [not found]           ` <4D2BDAF4.9040908-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2011-01-11  4:32             ` Serge E. Hallyn
2011-01-11  4:32           ` Serge E. Hallyn
2011-01-10 21:13 ` [PATCH 5/7] Allow ptrace from non-init user namespaces Serge E. Hallyn
2011-01-10 21:14 ` [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c Serge E. Hallyn
     [not found]   ` <20110110211406.GF22564-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
2011-01-10 21:59     ` Bastian Blank
2011-01-10 21:59   ` Bastian Blank
2011-01-10 22:56     ` Serge Hallyn
     [not found]     ` <20110110215937.GB21351-0IJIQSrh9RL9UF0aPl6fsj8Kkb2uy4ct@public.gmane.org>
2011-01-10 22:56       ` Serge Hallyn
2011-01-11  5:27       ` Serge E. Hallyn
2011-01-11  5:27     ` Serge E. Hallyn
     [not found]       ` <20110111052759.GA24970-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
2011-01-14 15:02         ` Bastian Blank
2011-01-14 15:02       ` Bastian Blank
     [not found] ` <20110110211135.GA22446-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
2011-01-10 21:13   ` [PATCH 1/7] Add a user_namespace as creator/owner of uts_namespace Serge E. Hallyn
2011-01-10 21:13   ` [PATCH 2/7] security: Make capabilities relative to the user namespace Serge E. Hallyn
2011-01-10 21:13   ` [PATCH 3/7] allow sethostname in a container Serge E. Hallyn
2011-01-10 21:13   ` [PATCH 4/7] allow killing tasks in your own or child userns Serge E. Hallyn
2011-01-10 21:13   ` [PATCH 5/7] Allow ptrace from non-init user namespaces Serge E. Hallyn
2011-01-10 21:14   ` [PATCH 6/7] user namespaces: convert all capable checks in kernel/sys.c Serge E. Hallyn
2011-01-10 21:14   ` [PATCH 7/7] user namespaces: convert several capable() calls Serge E. Hallyn
2011-01-10 21:14 ` Serge E. Hallyn
  -- strict thread matches above, loose matches on Subject: below --
2011-01-10 21:11 userns: targeted capabilities v3 Serge E. Hallyn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.