All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/9] credentials c/r: Introduction
@ 2009-05-29 22:32 Serge E. Hallyn
  2009-05-29 22:32 ` [PATCH 1/9] cred: #include init.h in cred.h Serge E. Hallyn
                   ` (7 more replies)
  0 siblings, 8 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-05-29 22:32 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, Alexey Dobriyan, Andrew Morgan, David Howells,
	linux-security-module

This is the next, hopefully final, version of the patchset to
start supporting credentials in Oren's c/r set.  It addresses
feedback by Alexey, is ported on top of v16, supports c/r of
securebits, and adds support for credentials on ipc objects.

Next I intend to support the LSM ->security field on tasks,
files, and ipc objects.

thanks,
-serge

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/9] cred: #include init.h in cred.h
  2009-05-29 22:32 [PATCH 0/9] credentials c/r: Introduction Serge E. Hallyn
@ 2009-05-29 22:32 ` Serge E. Hallyn
  2009-05-29 22:32 ` [PATCH 2/9] groups: move code to kernel/groups.c Serge E. Hallyn
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-05-29 22:32 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, Alexey Dobriyan, Andrew Morgan, David Howells,
	linux-security-module

cred.h can't be included as first header because it uses __init and
doesn't include init.h which is enough to break compilation on at least
ia64.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 include/linux/cred.h |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 3282ee4..4fa9996 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -13,6 +13,7 @@
 #define _LINUX_CRED_H
 
 #include <linux/capability.h>
+#include <linux/init.h>
 #include <linux/key.h>
 #include <asm/atomic.h>
 
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/9] groups: move code to kernel/groups.c
  2009-05-29 22:32 [PATCH 0/9] credentials c/r: Introduction Serge E. Hallyn
  2009-05-29 22:32 ` [PATCH 1/9] cred: #include init.h in cred.h Serge E. Hallyn
@ 2009-05-29 22:32 ` Serge E. Hallyn
  2009-05-29 22:33 ` [PATCH 3/9] cr: break out new_user_ns() Serge E. Hallyn
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-05-29 22:32 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, Alexey Dobriyan, Andrew Morgan, David Howells,
	linux-security-module

Move supplementary groups implementation to kernel/groups.c .
kernel/sys.c already accumulated quite a few random stuff.

Do strictly copy/paste + add required headers to compile.
Compile-tested on many configs and archs.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
---
 kernel/Makefile |    1 +
 kernel/groups.c |  288 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c    |  283 ------------------------------------------------------
 3 files changed, 289 insertions(+), 283 deletions(-)
 create mode 100644 kernel/groups.c

diff --git a/kernel/Makefile b/kernel/Makefile
index 6bc638d..4d4f741 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
 	    async.o
+obj-y += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 0000000..2b45b2e
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
+/*
+ * Supplementary group IDs
+ */
+#include <linux/cred.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <asm/uaccess.h>
+
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
+struct group_info *groups_alloc(int gidsetsize)
+{
+	struct group_info *group_info;
+	int nblocks;
+	int i;
+
+	nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
+	/* Make sure we always allocate at least one indirect block pointer */
+	nblocks = nblocks ? : 1;
+	group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
+	if (!group_info)
+		return NULL;
+	group_info->ngroups = gidsetsize;
+	group_info->nblocks = nblocks;
+	atomic_set(&group_info->usage, 1);
+
+	if (gidsetsize <= NGROUPS_SMALL)
+		group_info->blocks[0] = group_info->small_block;
+	else {
+		for (i = 0; i < nblocks; i++) {
+			gid_t *b;
+			b = (void *)__get_free_page(GFP_USER);
+			if (!b)
+				goto out_undo_partial_alloc;
+			group_info->blocks[i] = b;
+		}
+	}
+	return group_info;
+
+out_undo_partial_alloc:
+	while (--i >= 0) {
+		free_page((unsigned long)group_info->blocks[i]);
+	}
+	kfree(group_info);
+	return NULL;
+}
+
+EXPORT_SYMBOL(groups_alloc);
+
+void groups_free(struct group_info *group_info)
+{
+	if (group_info->blocks[0] != group_info->small_block) {
+		int i;
+		for (i = 0; i < group_info->nblocks; i++)
+			free_page((unsigned long)group_info->blocks[i]);
+	}
+	kfree(group_info);
+}
+
+EXPORT_SYMBOL(groups_free);
+
+/* export the group_info to a user-space array */
+static int groups_to_user(gid_t __user *grouplist,
+			  const struct group_info *group_info)
+{
+	int i;
+	unsigned int count = group_info->ngroups;
+
+	for (i = 0; i < group_info->nblocks; i++) {
+		unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
+		unsigned int len = cp_count * sizeof(*grouplist);
+
+		if (copy_to_user(grouplist, group_info->blocks[i], len))
+			return -EFAULT;
+
+		grouplist += NGROUPS_PER_BLOCK;
+		count -= cp_count;
+	}
+	return 0;
+}
+
+/* fill a group_info from a user-space array - it must be allocated already */
+static int groups_from_user(struct group_info *group_info,
+    gid_t __user *grouplist)
+{
+	int i;
+	unsigned int count = group_info->ngroups;
+
+	for (i = 0; i < group_info->nblocks; i++) {
+		unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
+		unsigned int len = cp_count * sizeof(*grouplist);
+
+		if (copy_from_user(group_info->blocks[i], grouplist, len))
+			return -EFAULT;
+
+		grouplist += NGROUPS_PER_BLOCK;
+		count -= cp_count;
+	}
+	return 0;
+}
+
+/* a simple Shell sort */
+static void groups_sort(struct group_info *group_info)
+{
+	int base, max, stride;
+	int gidsetsize = group_info->ngroups;
+
+	for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+		; /* nothing */
+	stride /= 3;
+
+	while (stride) {
+		max = gidsetsize - stride;
+		for (base = 0; base < max; base++) {
+			int left = base;
+			int right = left + stride;
+			gid_t tmp = GROUP_AT(group_info, right);
+
+			while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
+				GROUP_AT(group_info, right) =
+				    GROUP_AT(group_info, left);
+				right = left;
+				left -= stride;
+			}
+			GROUP_AT(group_info, right) = tmp;
+		}
+		stride /= 3;
+	}
+}
+
+/* a simple bsearch */
+int groups_search(const struct group_info *group_info, gid_t grp)
+{
+	unsigned int left, right;
+
+	if (!group_info)
+		return 0;
+
+	left = 0;
+	right = group_info->ngroups;
+	while (left < right) {
+		unsigned int mid = (left+right)/2;
+		int cmp = grp - GROUP_AT(group_info, mid);
+		if (cmp > 0)
+			left = mid + 1;
+		else if (cmp < 0)
+			right = mid;
+		else
+			return 1;
+	}
+	return 0;
+}
+
+/**
+ * set_groups - Change a group subscription in a set of credentials
+ * @new: The newly prepared set of credentials to alter
+ * @group_info: The group list to install
+ *
+ * Validate a group subscription and, if valid, insert it into a set
+ * of credentials.
+ */
+int set_groups(struct cred *new, struct group_info *group_info)
+{
+	int retval;
+
+	retval = security_task_setgroups(group_info);
+	if (retval)
+		return retval;
+
+	put_group_info(new->group_info);
+	groups_sort(group_info);
+	get_group_info(group_info);
+	new->group_info = group_info;
+	return 0;
+}
+
+EXPORT_SYMBOL(set_groups);
+
+/**
+ * set_current_groups - Change current's group subscription
+ * @group_info: The group list to impose
+ *
+ * Validate a group subscription and, if valid, impose it upon current's task
+ * security record.
+ */
+int set_current_groups(struct group_info *group_info)
+{
+	struct cred *new;
+	int ret;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = set_groups(new, group_info);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret;
+	}
+
+	return commit_creds(new);
+}
+
+EXPORT_SYMBOL(set_current_groups);
+
+SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+	const struct cred *cred = current_cred();
+	int i;
+
+	if (gidsetsize < 0)
+		return -EINVAL;
+
+	/* no need to grab task_lock here; it cannot change */
+	i = cred->group_info->ngroups;
+	if (gidsetsize) {
+		if (i > gidsetsize) {
+			i = -EINVAL;
+			goto out;
+		}
+		if (groups_to_user(grouplist, cred->group_info)) {
+			i = -EFAULT;
+			goto out;
+		}
+	}
+out:
+	return i;
+}
+
+/*
+ *	SMP: Our groups are copy-on-write. We can set them safely
+ *	without another task interfering.
+ */
+
+SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+	struct group_info *group_info;
+	int retval;
+
+	if (!capable(CAP_SETGID))
+		return -EPERM;
+	if ((unsigned)gidsetsize > NGROUPS_MAX)
+		return -EINVAL;
+
+	group_info = groups_alloc(gidsetsize);
+	if (!group_info)
+		return -ENOMEM;
+	retval = groups_from_user(group_info, grouplist);
+	if (retval) {
+		put_group_info(group_info);
+		return retval;
+	}
+
+	retval = set_current_groups(group_info);
+	put_group_info(group_info);
+
+	return retval;
+}
+
+/*
+ * Check whether we're fsgid/egid or in the supplemental group..
+ */
+int in_group_p(gid_t grp)
+{
+	const struct cred *cred = current_cred();
+	int retval = 1;
+
+	if (grp != cred->fsgid)
+		retval = groups_search(cred->group_info, grp);
+	return retval;
+}
+
+EXPORT_SYMBOL(in_group_p);
+
+int in_egroup_p(gid_t grp)
+{
+	const struct cred *cred = current_cred();
+	int retval = 1;
+
+	if (grp != cred->egid)
+		retval = groups_search(cred->group_info, grp);
+	return retval;
+}
+
+EXPORT_SYMBOL(in_egroup_p);
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf..4edcf51 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1112,289 +1112,6 @@ out:
 	return err;
 }
 
-/*
- * Supplementary group IDs
- */
-
-/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
-
-struct group_info *groups_alloc(int gidsetsize)
-{
-	struct group_info *group_info;
-	int nblocks;
-	int i;
-
-	nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
-	/* Make sure we always allocate at least one indirect block pointer */
-	nblocks = nblocks ? : 1;
-	group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
-	if (!group_info)
-		return NULL;
-	group_info->ngroups = gidsetsize;
-	group_info->nblocks = nblocks;
-	atomic_set(&group_info->usage, 1);
-
-	if (gidsetsize <= NGROUPS_SMALL)
-		group_info->blocks[0] = group_info->small_block;
-	else {
-		for (i = 0; i < nblocks; i++) {
-			gid_t *b;
-			b = (void *)__get_free_page(GFP_USER);
-			if (!b)
-				goto out_undo_partial_alloc;
-			group_info->blocks[i] = b;
-		}
-	}
-	return group_info;
-
-out_undo_partial_alloc:
-	while (--i >= 0) {
-		free_page((unsigned long)group_info->blocks[i]);
-	}
-	kfree(group_info);
-	return NULL;
-}
-
-EXPORT_SYMBOL(groups_alloc);
-
-void groups_free(struct group_info *group_info)
-{
-	if (group_info->blocks[0] != group_info->small_block) {
-		int i;
-		for (i = 0; i < group_info->nblocks; i++)
-			free_page((unsigned long)group_info->blocks[i]);
-	}
-	kfree(group_info);
-}
-
-EXPORT_SYMBOL(groups_free);
-
-/* export the group_info to a user-space array */
-static int groups_to_user(gid_t __user *grouplist,
-			  const struct group_info *group_info)
-{
-	int i;
-	unsigned int count = group_info->ngroups;
-
-	for (i = 0; i < group_info->nblocks; i++) {
-		unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
-		unsigned int len = cp_count * sizeof(*grouplist);
-
-		if (copy_to_user(grouplist, group_info->blocks[i], len))
-			return -EFAULT;
-
-		grouplist += NGROUPS_PER_BLOCK;
-		count -= cp_count;
-	}
-	return 0;
-}
-
-/* fill a group_info from a user-space array - it must be allocated already */
-static int groups_from_user(struct group_info *group_info,
-    gid_t __user *grouplist)
-{
-	int i;
-	unsigned int count = group_info->ngroups;
-
-	for (i = 0; i < group_info->nblocks; i++) {
-		unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
-		unsigned int len = cp_count * sizeof(*grouplist);
-
-		if (copy_from_user(group_info->blocks[i], grouplist, len))
-			return -EFAULT;
-
-		grouplist += NGROUPS_PER_BLOCK;
-		count -= cp_count;
-	}
-	return 0;
-}
-
-/* a simple Shell sort */
-static void groups_sort(struct group_info *group_info)
-{
-	int base, max, stride;
-	int gidsetsize = group_info->ngroups;
-
-	for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
-		; /* nothing */
-	stride /= 3;
-
-	while (stride) {
-		max = gidsetsize - stride;
-		for (base = 0; base < max; base++) {
-			int left = base;
-			int right = left + stride;
-			gid_t tmp = GROUP_AT(group_info, right);
-
-			while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
-				GROUP_AT(group_info, right) =
-				    GROUP_AT(group_info, left);
-				right = left;
-				left -= stride;
-			}
-			GROUP_AT(group_info, right) = tmp;
-		}
-		stride /= 3;
-	}
-}
-
-/* a simple bsearch */
-int groups_search(const struct group_info *group_info, gid_t grp)
-{
-	unsigned int left, right;
-
-	if (!group_info)
-		return 0;
-
-	left = 0;
-	right = group_info->ngroups;
-	while (left < right) {
-		unsigned int mid = (left+right)/2;
-		int cmp = grp - GROUP_AT(group_info, mid);
-		if (cmp > 0)
-			left = mid + 1;
-		else if (cmp < 0)
-			right = mid;
-		else
-			return 1;
-	}
-	return 0;
-}
-
-/**
- * set_groups - Change a group subscription in a set of credentials
- * @new: The newly prepared set of credentials to alter
- * @group_info: The group list to install
- *
- * Validate a group subscription and, if valid, insert it into a set
- * of credentials.
- */
-int set_groups(struct cred *new, struct group_info *group_info)
-{
-	int retval;
-
-	retval = security_task_setgroups(group_info);
-	if (retval)
-		return retval;
-
-	put_group_info(new->group_info);
-	groups_sort(group_info);
-	get_group_info(group_info);
-	new->group_info = group_info;
-	return 0;
-}
-
-EXPORT_SYMBOL(set_groups);
-
-/**
- * set_current_groups - Change current's group subscription
- * @group_info: The group list to impose
- *
- * Validate a group subscription and, if valid, impose it upon current's task
- * security record.
- */
-int set_current_groups(struct group_info *group_info)
-{
-	struct cred *new;
-	int ret;
-
-	new = prepare_creds();
-	if (!new)
-		return -ENOMEM;
-
-	ret = set_groups(new, group_info);
-	if (ret < 0) {
-		abort_creds(new);
-		return ret;
-	}
-
-	return commit_creds(new);
-}
-
-EXPORT_SYMBOL(set_current_groups);
-
-SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
-{
-	const struct cred *cred = current_cred();
-	int i;
-
-	if (gidsetsize < 0)
-		return -EINVAL;
-
-	/* no need to grab task_lock here; it cannot change */
-	i = cred->group_info->ngroups;
-	if (gidsetsize) {
-		if (i > gidsetsize) {
-			i = -EINVAL;
-			goto out;
-		}
-		if (groups_to_user(grouplist, cred->group_info)) {
-			i = -EFAULT;
-			goto out;
-		}
-	}
-out:
-	return i;
-}
-
-/*
- *	SMP: Our groups are copy-on-write. We can set them safely
- *	without another task interfering.
- */
- 
-SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
-{
-	struct group_info *group_info;
-	int retval;
-
-	if (!capable(CAP_SETGID))
-		return -EPERM;
-	if ((unsigned)gidsetsize > NGROUPS_MAX)
-		return -EINVAL;
-
-	group_info = groups_alloc(gidsetsize);
-	if (!group_info)
-		return -ENOMEM;
-	retval = groups_from_user(group_info, grouplist);
-	if (retval) {
-		put_group_info(group_info);
-		return retval;
-	}
-
-	retval = set_current_groups(group_info);
-	put_group_info(group_info);
-
-	return retval;
-}
-
-/*
- * Check whether we're fsgid/egid or in the supplemental group..
- */
-int in_group_p(gid_t grp)
-{
-	const struct cred *cred = current_cred();
-	int retval = 1;
-
-	if (grp != cred->fsgid)
-		retval = groups_search(cred->group_info, grp);
-	return retval;
-}
-
-EXPORT_SYMBOL(in_group_p);
-
-int in_egroup_p(gid_t grp)
-{
-	const struct cred *cred = current_cred();
-	int retval = 1;
-
-	if (grp != cred->egid)
-		retval = groups_search(cred->group_info, grp);
-	return retval;
-}
-
-EXPORT_SYMBOL(in_egroup_p);
-
 DECLARE_RWSEM(uts_sem);
 
 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 3/9] cr: break out new_user_ns()
  2009-05-29 22:32 [PATCH 0/9] credentials c/r: Introduction Serge E. Hallyn
  2009-05-29 22:32 ` [PATCH 1/9] cred: #include init.h in cred.h Serge E. Hallyn
  2009-05-29 22:32 ` [PATCH 2/9] groups: move code to kernel/groups.c Serge E. Hallyn
@ 2009-05-29 22:33 ` Serge E. Hallyn
  2009-05-29 22:33 ` [PATCH 4/9] cr: split core function out of some set*{u,g}id functions Serge E. Hallyn
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-05-29 22:33 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, Alexey Dobriyan, Andrew Morgan, David Howells,
	linux-security-module

Break out the core function which checks privilege and (if
allowed) creates a new user namespace, with the passed-in
creating user_struct.  Note that a user_namespace, unlike
other namespace pointers, is not stored in the nsproxy.
Rather it is purely a property of user_structs.

This will let us keep the task restore code simpler.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
 include/linux/user_namespace.h |    8 ++++++
 kernel/user_namespace.c        |   53 ++++++++++++++++++++++++++++------------
 2 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index cc4f453..a2b82d5 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -20,6 +20,8 @@ extern struct user_namespace init_user_ns;
 
 #ifdef CONFIG_USER_NS
 
+struct user_namespace *new_user_ns(struct user_struct *creator,
+				   struct user_struct **newroot);
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	if (ns)
@@ -38,6 +40,12 @@ static inline void put_user_ns(struct user_namespace *ns)
 
 #else
 
+static inline struct user_namespace *new_user_ns(struct user_struct *creator,
+				   struct user_struct **newroot)
+{
+	return -EINVAL;
+}
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	return &init_user_ns;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8..e624b0f 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -11,15 +11,8 @@
 #include <linux/user_namespace.h>
 #include <linux/cred.h>
 
-/*
- * Create a new user namespace, deriving the creator from the user in the
- * passed credentials, and replacing that user with the new root user for the
- * new namespace.
- *
- * This is called by copy_creds(), which will finish setting the target task's
- * credentials.
- */
-int create_user_ns(struct cred *new)
+static struct user_namespace *_new_user_ns(struct user_struct *creator,
+				   struct user_struct **newroot)
 {
 	struct user_namespace *ns;
 	struct user_struct *root_user;
@@ -27,7 +20,7 @@ int create_user_ns(struct cred *new)
 
 	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
 	if (!ns)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	kref_init(&ns->kref);
 
@@ -38,12 +31,43 @@ int create_user_ns(struct cred *new)
 	root_user = alloc_uid(ns, 0);
 	if (!root_user) {
 		kfree(ns);
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
 	/* set the new root user in the credentials under preparation */
-	ns->creator = new->user;
-	new->user = root_user;
+	ns->creator = creator;
+
+	/* alloc_uid() incremented the userns refcount.  Just set it to 1 */
+	kref_set(&ns->kref, 1);
+
+	*newroot = root_user;
+	return ns;
+}
+
+struct user_namespace *new_user_ns(struct user_struct *creator,
+				   struct user_struct **newroot)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+	return _new_user_ns(creator, newroot);
+}
+
+/*
+ * Create a new user namespace, deriving the creator from the user in the
+ * passed credentials, and replacing that user with the new root user for the
+ * new namespace.
+ *
+ * This is called by copy_creds(), which will finish setting the target task's
+ * credentials.
+ */
+int create_user_ns(struct cred *new)
+{
+	struct user_namespace *ns;
+
+	ns = new_user_ns(new->user, &new->user);
+	if (IS_ERR(ns))
+		return PTR_ERR(ns);
+
 	new->uid = new->euid = new->suid = new->fsuid = 0;
 	new->gid = new->egid = new->sgid = new->fsgid = 0;
 	put_group_info(new->group_info);
@@ -54,9 +78,6 @@ int create_user_ns(struct cred *new)
 #endif
 	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
 
-	/* alloc_uid() incremented the userns refcount.  Just set it to 1 */
-	kref_set(&ns->kref, 1);
-
 	return 0;
 }
 
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 4/9] cr: split core function out of some set*{u,g}id functions
  2009-05-29 22:32 [PATCH 0/9] credentials c/r: Introduction Serge E. Hallyn
                   ` (2 preceding siblings ...)
  2009-05-29 22:33 ` [PATCH 3/9] cr: break out new_user_ns() Serge E. Hallyn
@ 2009-05-29 22:33 ` Serge E. Hallyn
  2009-05-29 22:33 ` [PATCH 5/9] cr: capabilities: define checkpoint and restore fns Serge E. Hallyn
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-05-29 22:33 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, Alexey Dobriyan, Andrew Morgan, David Howells,
	linux-security-module

When restarting tasks, we want to be able to change xuid and
xgid in a struct cred, and do so with security checks.  Break
the core functionality of set{fs,res}{u,g}id into cred_setX
which performs the access checks based on current_cred(),
but performs the requested change on a passed-in cred.

This will allow us to securely construct struct creds based
on a checkpoint image, constrained by the caller's permissions,
and apply them to the caller at the end of sys_restart().

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
 include/linux/cred.h |    8 +++
 kernel/cred.c        |  114 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c         |  134 ++++++++------------------------------------------
 3 files changed, 143 insertions(+), 113 deletions(-)

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4fa9996..2ffffbe 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -21,6 +21,9 @@ struct user_struct;
 struct cred;
 struct inode;
 
+/* defined in sys.c, used in cred_setresuid */
+extern int set_user(struct cred *new);
+
 /*
  * COW Supplementary groups list
  */
@@ -344,4 +347,9 @@ do {						\
 	*(_fsgid) = __cred->fsgid;		\
 } while(0)
 
+int cred_setresuid(struct cred *new, uid_t ruid, uid_t euid, uid_t suid);
+int cred_setresgid(struct cred *new, gid_t rgid, gid_t egid, gid_t sgid);
+int cred_setfsuid(struct cred *new, uid_t uid, uid_t *old_fsuid);
+int cred_setfsgid(struct cred *new, gid_t gid, gid_t *old_fsgid);
+
 #endif /* _LINUX_CRED_H */
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a03918..a017399 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -589,3 +589,117 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 	return security_kernel_create_files_as(new, inode);
 }
 EXPORT_SYMBOL(set_create_files_as);
+
+int cred_setresuid(struct cred *new, uid_t ruid, uid_t euid, uid_t suid)
+{
+	int retval;
+	const struct cred *old;
+
+	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
+	if (retval)
+		return retval;
+	old = current_cred();
+
+	if (!capable(CAP_SETUID)) {
+		if (ruid != (uid_t) -1 && ruid != old->uid &&
+		    ruid != old->euid  && ruid != old->suid)
+			return -EPERM;
+		if (euid != (uid_t) -1 && euid != old->uid &&
+		    euid != old->euid  && euid != old->suid)
+			return -EPERM;
+		if (suid != (uid_t) -1 && suid != old->uid &&
+		    suid != old->euid  && suid != old->suid)
+			return -EPERM;
+	}
+
+	if (ruid != (uid_t) -1) {
+		new->uid = ruid;
+		if (ruid != old->uid) {
+			retval = set_user(new);
+			if (retval < 0)
+				return retval;
+		}
+	}
+	if (euid != (uid_t) -1)
+		new->euid = euid;
+	if (suid != (uid_t) -1)
+		new->suid = suid;
+	new->fsuid = new->euid;
+
+	return security_task_fix_setuid(new, old, LSM_SETID_RES);
+}
+
+int cred_setresgid(struct cred *new, gid_t rgid, gid_t egid,
+			gid_t sgid)
+{
+	const struct cred *old = current_cred();
+	int retval;
+
+	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
+	if (retval)
+		return retval;
+
+	if (!capable(CAP_SETGID)) {
+		if (rgid != (gid_t) -1 && rgid != old->gid &&
+		    rgid != old->egid  && rgid != old->sgid)
+			return -EPERM;
+		if (egid != (gid_t) -1 && egid != old->gid &&
+		    egid != old->egid  && egid != old->sgid)
+			return -EPERM;
+		if (sgid != (gid_t) -1 && sgid != old->gid &&
+		    sgid != old->egid  && sgid != old->sgid)
+			return -EPERM;
+	}
+
+	if (rgid != (gid_t) -1)
+		new->gid = rgid;
+	if (egid != (gid_t) -1)
+		new->egid = egid;
+	if (sgid != (gid_t) -1)
+		new->sgid = sgid;
+	new->fsgid = new->egid;
+	return 0;
+}
+
+int cred_setfsuid(struct cred *new, uid_t uid, uid_t *old_fsuid)
+{
+	const struct cred *old;
+
+	old = current_cred();
+	*old_fsuid = old->fsuid;
+
+	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
+		return -EPERM;
+
+	if (uid == old->uid  || uid == old->euid  ||
+	    uid == old->suid || uid == old->fsuid ||
+	    capable(CAP_SETUID)) {
+		if (uid != *old_fsuid) {
+			new->fsuid = uid;
+			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
+				return 0;
+		}
+	}
+	return -EPERM;
+}
+
+int cred_setfsgid(struct cred *new, gid_t gid, gid_t *old_fsgid)
+{
+	const struct cred *old;
+
+	old = current_cred();
+	*old_fsgid = old->fsgid;
+
+	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
+		return -EPERM;
+
+	if (gid == old->gid  || gid == old->egid  ||
+	    gid == old->sgid || gid == old->fsgid ||
+	    capable(CAP_SETGID)) {
+		if (gid != *old_fsgid) {
+			new->fsgid = gid;
+			return 0;
+		}
+	}
+	return -EPERM;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index 4edcf51..0cedec0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -558,11 +558,12 @@ error:
 /*
  * change the user struct in a credentials set to match the new UID
  */
-static int set_user(struct cred *new)
+int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(current_user_ns(), new->uid);
+	/* is this ok? */
+	new_user = alloc_uid(new->user->user_ns, new->uid);
 	if (!new_user)
 		return -EAGAIN;
 
@@ -703,14 +704,12 @@ error:
 	return retval;
 }
 
-
 /*
  * This function implements a generic ability to update ruid, euid,
  * and suid.  This allows you to implement the 4.4 compatible seteuid().
  */
 SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 {
-	const struct cred *old;
 	struct cred *new;
 	int retval;
 
@@ -718,45 +717,10 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 	if (!new)
 		return -ENOMEM;
 
-	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
-	if (retval)
-		goto error;
-	old = current_cred();
-
-	retval = -EPERM;
-	if (!capable(CAP_SETUID)) {
-		if (ruid != (uid_t) -1 && ruid != old->uid &&
-		    ruid != old->euid  && ruid != old->suid)
-			goto error;
-		if (euid != (uid_t) -1 && euid != old->uid &&
-		    euid != old->euid  && euid != old->suid)
-			goto error;
-		if (suid != (uid_t) -1 && suid != old->uid &&
-		    suid != old->euid  && suid != old->suid)
-			goto error;
-	}
-
-	if (ruid != (uid_t) -1) {
-		new->uid = ruid;
-		if (ruid != old->uid) {
-			retval = set_user(new);
-			if (retval < 0)
-				goto error;
-		}
-	}
-	if (euid != (uid_t) -1)
-		new->euid = euid;
-	if (suid != (uid_t) -1)
-		new->suid = suid;
-	new->fsuid = new->euid;
-
-	retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
-	if (retval < 0)
-		goto error;
-
-	return commit_creds(new);
+	retval = cred_setresuid(new, ruid, euid, suid);
+	if (retval == 0)
+		return commit_creds(new);
 
-error:
 	abort_creds(new);
 	return retval;
 }
@@ -778,43 +742,17 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u
  */
 SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
 {
-	const struct cred *old;
 	struct cred *new;
 	int retval;
 
 	new = prepare_creds();
 	if (!new)
 		return -ENOMEM;
-	old = current_cred();
 
-	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
-	if (retval)
-		goto error;
+	retval = cred_setresgid(new, rgid, egid, sgid);
+	if (retval == 0)
+		return commit_creds(new);
 
-	retval = -EPERM;
-	if (!capable(CAP_SETGID)) {
-		if (rgid != (gid_t) -1 && rgid != old->gid &&
-		    rgid != old->egid  && rgid != old->sgid)
-			goto error;
-		if (egid != (gid_t) -1 && egid != old->gid &&
-		    egid != old->egid  && egid != old->sgid)
-			goto error;
-		if (sgid != (gid_t) -1 && sgid != old->gid &&
-		    sgid != old->egid  && sgid != old->sgid)
-			goto error;
-	}
-
-	if (rgid != (gid_t) -1)
-		new->gid = rgid;
-	if (egid != (gid_t) -1)
-		new->egid = egid;
-	if (sgid != (gid_t) -1)
-		new->sgid = sgid;
-	new->fsgid = new->egid;
-
-	return commit_creds(new);
-
-error:
 	abort_creds(new);
 	return retval;
 }
@@ -831,7 +769,6 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __u
 	return retval;
 }
 
-
 /*
  * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
  * is used for "access()" and for the NFS daemon (letting nfsd stay at
@@ -840,35 +777,20 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __u
  */
 SYSCALL_DEFINE1(setfsuid, uid_t, uid)
 {
-	const struct cred *old;
 	struct cred *new;
 	uid_t old_fsuid;
+	int retval;
 
 	new = prepare_creds();
 	if (!new)
 		return current_fsuid();
-	old = current_cred();
-	old_fsuid = old->fsuid;
-
-	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
-		goto error;
-
-	if (uid == old->uid  || uid == old->euid  ||
-	    uid == old->suid || uid == old->fsuid ||
-	    capable(CAP_SETUID)) {
-		if (uid != old_fsuid) {
-			new->fsuid = uid;
-			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
-				goto change_okay;
-		}
-	}
 
-error:
-	abort_creds(new);
-	return old_fsuid;
+	retval = cred_setfsuid(new, uid, &old_fsuid);
+	if (retval == 0)
+		commit_creds(new);
+	else
+		abort_creds(new);
 
-change_okay:
-	commit_creds(new);
 	return old_fsuid;
 }
 
@@ -877,34 +799,20 @@ change_okay:
  */
 SYSCALL_DEFINE1(setfsgid, gid_t, gid)
 {
-	const struct cred *old;
 	struct cred *new;
 	gid_t old_fsgid;
+	int retval;
 
 	new = prepare_creds();
 	if (!new)
 		return current_fsgid();
-	old = current_cred();
-	old_fsgid = old->fsgid;
-
-	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
-		goto error;
-
-	if (gid == old->gid  || gid == old->egid  ||
-	    gid == old->sgid || gid == old->fsgid ||
-	    capable(CAP_SETGID)) {
-		if (gid != old_fsgid) {
-			new->fsgid = gid;
-			goto change_okay;
-		}
-	}
 
-error:
-	abort_creds(new);
-	return old_fsgid;
+	retval = cred_setfsgid(new, gid, &old_fsgid);
+	if (retval == 0)
+		commit_creds(new);
+	else
+		abort_creds(new);
 
-change_okay:
-	commit_creds(new);
 	return old_fsgid;
 }
 
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-05-29 22:32 [PATCH 0/9] credentials c/r: Introduction Serge E. Hallyn
                   ` (3 preceding siblings ...)
  2009-05-29 22:33 ` [PATCH 4/9] cr: split core function out of some set*{u,g}id functions Serge E. Hallyn
@ 2009-05-29 22:33 ` Serge E. Hallyn
  2009-05-31 20:26   ` Andrew G. Morgan
  2009-05-29 22:33 ` [PATCH 6/9] cr: checkpoint and restore task credentials Serge E. Hallyn
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-05-29 22:33 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, Alexey Dobriyan, Andrew Morgan, David Howells,
	linux-security-module

An application checkpoint image will store capability sets
(and the bounding set) as __u64s.  Define checkpoint and
restart functions to translate between those and kernel_cap_t's.

Define a common function do_capset_tocred() which applies capability
set changes to a passed-in struct cred.

The restore function uses do_capset_tocred() to apply the restored
capabilities to the struct cred being crafted, subject to the
current task's (task executing sys_restart()) permissions.

Changelog:
	May 28: add helpers to c/r securebits

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
 include/linux/capability.h |    7 +++
 kernel/capability.c        |   94 +++++++++++++++++++++++++++++++++++++------
 security/commoncap.c       |   51 ++++++++++++++++--------
 3 files changed, 122 insertions(+), 30 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index c302110..b3853ca 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -536,6 +536,13 @@ extern const kernel_cap_t __cap_empty_set;
 extern const kernel_cap_t __cap_full_set;
 extern const kernel_cap_t __cap_init_eff_set;
 
+extern void checkpoint_save_cap(__u64 *dest, kernel_cap_t src);
+struct cred;
+extern int checkpoint_restore_cap(__u64 e, __u64 i, __u64 p, __u64 x,
+				struct cred *cred);
+extern void checkpoint_save_securebits(unsigned *, unsigned);
+extern int checkpoint_restore_securebits(unsigned, struct cred *);
+
 /**
  * has_capability - Determine if a task has a superior capability available
  * @t: The task in question
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041..d2c9bb3 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -217,6 +217,45 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 	return ret;
 }
 
+static int do_capset_tocred(kernel_cap_t *effective, kernel_cap_t *inheritable,
+			kernel_cap_t *permitted, struct cred *new)
+{
+	int ret;
+
+	ret = security_capset(new, current_cred(),
+			      effective, inheritable, permitted);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * for checkpoint-restart, do we want to wait until end of restart?
+	 * not sure we care */
+	audit_log_capset(current->pid, new, current_cred());
+
+	return 0;
+}
+
+static int do_capset(kernel_cap_t *effective, kernel_cap_t *inheritable,
+			kernel_cap_t *permitted)
+{
+	struct cred *new;
+	int ret;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = do_capset_tocred(effective, inheritable, permitted, new);
+	if (ret < 0)
+		goto error;
+
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return ret;
+}
+
 /**
  * sys_capset - set capabilities for a process or (*) a group of processes
  * @header: pointer to struct that contains capability version and
@@ -240,7 +279,6 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
-	struct cred *new;
 	int ret;
 	pid_t pid;
 
@@ -271,22 +309,52 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 		i++;
 	}
 
-	new = prepare_creds();
-	if (!new)
-		return -ENOMEM;
+	return do_capset(&effective, &inheritable, &permitted);
 
-	ret = security_capset(new, current_cred(),
-			      &effective, &inheritable, &permitted);
-	if (ret < 0)
-		goto error;
+}
 
-	audit_log_capset(pid, new, current_cred());
 
-	return commit_creds(new);
+void checkpoint_save_cap(__u64 *dest, kernel_cap_t src)
+{
+	*dest = src.cap[0] | (src.cap[1] << sizeof(__u32));
+}
 
-error:
-	abort_creds(new);
-	return ret;
+static void do_capbset_drop(struct cred *cred, int cap)
+{
+	cap_lower(cred->cap_bset, cap);
+}
+
+int checkpoint_restore_cap(__u64 newe, __u64 newi, __u64 newp, __u64 newx,
+			struct cred *cred)
+{
+	kernel_cap_t effective, inheritable, permitted, bset;
+	int may_dropbcap = capable(CAP_SETPCAP);
+	int ret, i;
+
+	effective.cap[0] = newe;
+	effective.cap[1] = (newe >> sizeof(__u32));
+	inheritable.cap[0] = newi;
+	inheritable.cap[1] = (newi >> sizeof(__u32));
+	permitted.cap[0] = newp;
+	permitted.cap[1] = (newp >> sizeof(__u32));
+	bset.cap[0] = newx;
+	bset.cap[1] = (newx >> sizeof(__u32));
+
+	ret = do_capset_tocred(&effective, &inheritable, &permitted, cred);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < CAP_LAST_CAP; i++) {
+		if (cap_raised(bset, i))
+			continue;
+		if (!cap_raised(current_cred()->cap_bset, i))
+			continue;
+		if (!may_dropbcap)
+			return -EPERM;
+		do_capbset_drop(cred, i);
+	}
+
+	return 0;
 }
 
 /**
diff --git a/security/commoncap.c b/security/commoncap.c
index beac025..8054a07 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -804,6 +804,28 @@ int cap_task_setnice (struct task_struct *p, int nice)
 }
 #endif
 
+int cap_set_securebits(struct cred *new, unsigned securebits)
+{
+	if ((((new->securebits & SECURE_ALL_LOCKS) >> 1)
+	     & (new->securebits ^ securebits))				/*[1]*/
+	    || ((new->securebits & SECURE_ALL_LOCKS & ~securebits))	/*[2]*/
+	    || (securebits & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
+	    || (cap_capable(current, current_cred(), CAP_SETPCAP,
+			    SECURITY_CAP_AUDIT) != 0)			/*[4]*/
+		/*
+		 * [1] no changing of bits that are locked
+		 * [2] no unlocking of locks
+		 * [3] no setting of unsupported bits
+		 * [4] doing anything requires privilege (go read about
+		 *     the "sendmail capabilities bug")
+		 */
+	    )
+		/* cannot change a locked bit */
+		return -EPERM;
+	new->securebits = securebits;
+	return 0;
+}
+
 /**
  * cap_task_prctl - Implement process control functions for this security module
  * @option: The process control function requested
@@ -861,24 +883,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 	 * capability-based-privilege environment.
 	 */
 	case PR_SET_SECUREBITS:
-		error = -EPERM;
-		if ((((new->securebits & SECURE_ALL_LOCKS) >> 1)
-		     & (new->securebits ^ arg2))			/*[1]*/
-		    || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
-		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
-		    || (cap_capable(current, current_cred(), CAP_SETPCAP,
-				    SECURITY_CAP_AUDIT) != 0)		/*[4]*/
-			/*
-			 * [1] no changing of bits that are locked
-			 * [2] no unlocking of locks
-			 * [3] no setting of unsupported bits
-			 * [4] doing anything requires privilege (go read about
-			 *     the "sendmail capabilities bug")
-			 */
-		    )
-			/* cannot change a locked bit */
+		error = cap_set_securebits(new, arg2);
+		if (error)
 			goto error;
-		new->securebits = arg2;
 		goto changed;
 
 	case PR_GET_SECUREBITS:
@@ -921,6 +928,16 @@ error:
 	return error;
 }
 
+void checkpoint_save_securebits(unsigned *b, unsigned cred_securebits)
+{
+	*b = cred_securebits;
+}
+
+int checkpoint_restore_securebits(unsigned b, struct cred *cred)
+{
+	return cap_set_securebits(cred, b);
+}
+
 /**
  * cap_syslog - Determine whether syslog function is permitted
  * @type: Function requested
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 6/9] cr: checkpoint and restore task credentials
  2009-05-29 22:32 [PATCH 0/9] credentials c/r: Introduction Serge E. Hallyn
                   ` (4 preceding siblings ...)
  2009-05-29 22:33 ` [PATCH 5/9] cr: capabilities: define checkpoint and restore fns Serge E. Hallyn
@ 2009-05-29 22:33 ` Serge E. Hallyn
       [not found] ` <20090529223229.GA14536-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-29 22:34 ` [PATCH 9/9] cr: ipc: reset kern_ipc_perms Serge E. Hallyn
  7 siblings, 0 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-05-29 22:33 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, Alexey Dobriyan, Andrew Morgan, David Howells,
	linux-security-module

This patch adds the checkpointing and restart of credentials
(uids, gids, and capabilities) to Oren's c/r patchset (on top
of v14).  It goes to great pains to re-use (and define when
needed) common helpers, in order to make sure that as security
code is modified, the cr code will be updated.  Some of the
helpers should still be moved (i.e. _creds() functions should
be in kernel/cred.c).

When building the credentials for the restarted process, I
1. create a new struct cred as a copy of the running task's
cred (using prepare_cred())
2. always authorize any changes to the new struct cred
based on the permissions of current_cred() (not the current
transient state of the new cred).

While this may mean that certain transient_cred1->transient_cred2
states are allowed which otherwise wouldn't be allowed, the
fact remains that current_cred() is allowed to transition to
transient_cred2.

The reconstructed creds are applied to the task at the very
end of the sys_restart call.  This ensures that any objects which
need to be re-created (file, socket, etc) are re-created using
the creds of the task calling sys_restart - preventing an unpriv
user from creating a privileged object, and ensuring that a
root task can restart a process which had started out privileged,
created some privileged objects, then dropped its privilege.

With these patches, the root user can restart checkpoint images
(created by either hallyn or root) of user hallyn's tasks,
resulting in a program owned by hallyn.

Plenty of bugs to be found, no doubt.

Changelog:
	May 28: 1. Restore securebits
		2. Address Alexey's comments: move prototypes out of
		   sched.h, validate ngroups < NGROUPS_MAX, validate
		   groups are sorted, and get rid of ckpt_hdr_cred->version.
		3. remove bogus unused flag RESTORE_CREATE_USERNS
	May 26: Move group, user, userns, creds c/r functions out
		of checkpoint/process.c and into the appropriate files.
	May 26: Define struct ckpt_hdr_task_creds and move task cred
		objref c/r into {checkpoint_restore}_task_shared().
	May 26: Take cred refs around checkpoint_write_creds()
	May 20: Remove the limit on number of groups in groupinfo
		at checkpoint time
	May 20: Remove the depth limit on empty user namespaces
	May 20: Better document checkpoint_user
	May 18: fix more refcounting: if (userns 5, uid 0) had
		no active tasks or child user_namespaces, then
		it shouldn't exist at restart or it, its namespace,
		and its whole chain of creators will be leaked.
	May 14: fix some refcounting:
		1. a new user_ns needs a ref to remain pinned
		   by its root user
		2. current_user_ns needs an extra ref bc objhash
		   drops two on restart
		3. cred needs a ref for the real credentials bc
		   commit_creds eats one ref.
	May 13: folded in fix to userns refcounting.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
 checkpoint/objhash.c             |  118 +++++++++++++++++++++++++++++
 checkpoint/process.c             |  152 +++++++++++++++++++++++++++++++++++++-
 include/linux/checkpoint.h       |   11 +++
 include/linux/checkpoint_hdr.h   |   61 +++++++++++++++
 include/linux/checkpoint_types.h |    2 +-
 include/linux/cred.h             |   13 +++
 include/linux/user_namespace.h   |    6 ++
 kernel/cred.c                    |  117 +++++++++++++++++++++++++++++
 kernel/groups.c                  |   59 +++++++++++++++
 kernel/user.c                    |  147 ++++++++++++++++++++++++++++++++++++
 kernel/user_namespace.c          |   86 +++++++++++++++++++++
 11 files changed, 769 insertions(+), 3 deletions(-)

diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 045a920..20866c5 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -17,6 +17,7 @@
 #include <linux/fdtable.h>
 #include <linux/sched.h>
 #include <linux/ipc_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -177,6 +178,71 @@ static int obj_ipc_ns_users(void *ptr)
 	return atomic_read(&((struct ipc_namespace *) ptr)->count);
 }
 
+static int obj_cred_grab(void *ptr)
+{
+	get_cred((struct cred *) ptr);
+	return 0;
+}
+
+static void obj_cred_drop(void *ptr)
+{
+	put_cred((struct cred *) ptr);
+}
+
+static int obj_cred_users(void *ptr)
+{
+	return atomic_read(&((struct cred *) ptr)->usage);
+}
+
+static int obj_user_grab(void *ptr)
+{
+	struct user_struct *u = ptr;
+	(void) get_uid(u);
+	return 0;
+}
+
+static void obj_user_drop(void *ptr)
+{
+	free_uid((struct user_struct *) ptr);
+}
+
+static int obj_user_users(void *ptr)
+{
+	return atomic_read(&((struct user_struct *) ptr)->__count);
+}
+
+static int obj_userns_grab(void *ptr)
+{
+	get_user_ns((struct user_namespace *) ptr);
+	return 0;
+}
+
+static void obj_userns_drop(void *ptr)
+{
+	put_user_ns((struct user_namespace *) ptr);
+}
+
+static int obj_user_ns_users(void *ptr)
+{
+	return atomic_read(&((struct user_namespace *) ptr)->kref.refcount);
+}
+
+static int obj_groupinfo_grab(void *ptr)
+{
+	get_group_info((struct group_info *) ptr);
+	return 0;
+}
+
+static void obj_groupinfo_drop(void *ptr)
+{
+	put_group_info((struct group_info *) ptr);
+}
+
+static int obj_groupinfo_users(void *ptr)
+{
+	return atomic_read(&((struct group_info *) ptr)->usage);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -254,6 +320,46 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.checkpoint = checkpoint_bad,
 		.restore = restore_bad,
 	},
+	/* user_ns object */
+	{
+		.obj_name = "USER_NS",
+		.obj_type = CKPT_OBJ_USER_NS,
+		.ref_drop = obj_userns_drop,
+		.ref_grab = obj_userns_grab,
+		.ref_users = obj_user_ns_users,
+		.checkpoint = checkpoint_userns,
+		.restore = restore_userns,
+	},
+	/* struct cred */
+	{
+		.obj_name = "CRED",
+		.obj_type = CKPT_OBJ_CRED,
+		.ref_drop = obj_cred_drop,
+		.ref_grab = obj_cred_grab,
+		.ref_users = obj_cred_users,
+		.checkpoint = checkpoint_cred,
+		.restore = restore_cred,
+	},
+	/* user object */
+	{
+		.obj_name = "USER",
+		.obj_type = CKPT_OBJ_USER,
+		.ref_drop = obj_user_drop,
+		.ref_grab = obj_user_grab,
+		.ref_users = obj_user_users,
+		.checkpoint = checkpoint_user,
+		.restore = restore_user,
+	},
+	/* struct groupinfo */
+	{
+		.obj_name = "GROUPINFO",
+		.obj_type = CKPT_OBJ_GROUPINFO,
+		.ref_drop = obj_groupinfo_drop,
+		.ref_grab = obj_groupinfo_grab,
+		.ref_users = obj_groupinfo_users,
+		.checkpoint = checkpoint_groupinfo,
+		.restore = restore_groupinfo,
+	},
 };
 
 
@@ -323,6 +429,18 @@ static struct ckpt_obj *obj_find_by_ptr(struct ckpt_ctx *ctx, void *ptr)
 	return NULL;
 }
 
+/*
+ * look up an obj and return objref if in hash, else
+ * return 0.  Used during checkpoint.
+ */
+int obj_lookup(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct ckpt_obj *obj = obj_find_by_ptr(ctx, ptr);
+	if (obj)
+		return obj->objref;
+	return 0;
+}
+
 static struct ckpt_obj *obj_find_by_objref(struct ckpt_ctx *ctx, int objref)
 {
 	struct hlist_head *h;
diff --git a/checkpoint/process.c b/checkpoint/process.c
index b604a85..5c59cc0 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -17,6 +17,7 @@
 #include <linux/futex.h>
 #include <linux/poll.h>
 #include <linux/utsname.h>
+#include <linux/user_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 #include <linux/syscalls.h>
@@ -25,6 +26,26 @@
  * Checkpoint
  */
 
+int checkpoint_groupinfo(struct ckpt_ctx *ctx, void *ptr)
+{
+	return checkpoint_write_groupinfo(ctx, (struct group_info *)ptr);
+}
+
+int checkpoint_userns(struct ckpt_ctx *ctx, void *ptr)
+{
+	return checkpoint_write_userns(ctx, (struct user_namespace *) ptr);
+}
+
+int checkpoint_user(struct ckpt_ctx *ctx, void *ptr)
+{
+	return checkpoint_write_user(ctx, (struct user_struct *)ptr);
+}
+
+int checkpoint_cred(struct ckpt_ctx *ctx, void *ptr)
+{
+	return checkpoint_write_cred(ctx, (struct cred *) ptr);
+}
+
 /* dump the task_struct of a given task */
 static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
 {
@@ -161,6 +182,46 @@ static int checkpoint_task_ns(struct ckpt_ctx *ctx, struct task_struct *t)
 	return ret;
 }
 
+static int checkpoint_task_creds(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	int realcred_ref, ecred_ref;
+	struct cred *rcred, *ecred;
+	struct ckpt_hdr_task_creds *h;
+	int ret;
+
+	rcred = get_cred(t->real_cred);
+	ecred = get_cred(t->cred);
+
+	realcred_ref = checkpoint_obj(ctx, rcred, CKPT_OBJ_CRED);
+	if (realcred_ref < 0) {
+		ret = realcred_ref;
+		goto error;
+	}
+
+	ecred_ref = checkpoint_obj(ctx, ecred, CKPT_OBJ_CRED);
+	if (ecred_ref < 0) {
+		ret = ecred_ref;
+		goto error;
+	}
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_CREDS);
+	if (!h) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	h->cred_ref = realcred_ref;
+	h->ecred_ref = ecred_ref;
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+error:
+	put_cred(rcred);
+	put_cred(ecred);
+	return ret;
+
+}
+
 static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 {
 	struct ckpt_hdr_task_objs *h;
@@ -176,7 +237,9 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 	 * restored when it gets to restore, e.g. its memory.
 	 */
 
-	ret = checkpoint_task_ns(ctx, t);
+	ret = checkpoint_task_creds(ctx, t);
+	if (!ret)
+		ret = checkpoint_task_ns(ctx, t);
 	if (ret < 0)
 		return ret;
 
@@ -348,6 +411,26 @@ int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
  * Restart
  */
 
+void *restore_groupinfo(struct ckpt_ctx *ctx)
+{
+	return (void *) restore_read_groupinfo(ctx);
+}
+
+void *restore_userns(struct ckpt_ctx *ctx)
+{
+	return (void *) restore_read_userns(ctx);
+}
+
+void *restore_user(struct ckpt_ctx *ctx)
+{
+	return (void *) restore_read_user(ctx);
+}
+
+void *restore_cred(struct ckpt_ctx *ctx)
+{
+	return (void *) restore_read_cred(ctx);
+}
+
 /* read the task_struct into the current task */
 static int restore_task_struct(struct ckpt_ctx *ctx)
 {
@@ -365,8 +448,12 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
 
 	memset(t->comm, 0, TASK_COMM_LEN);
 	ret = _ckpt_read_string(ctx, t->comm, h->task_comm_len);
+	if (ret < 0)
+		goto out;
 
 	/* FIXME: restore remaining relevant task_struct fields */
+
+	ret = 0;
  out:
 	ckpt_hdr_put(ctx, h);
 	return ret;
@@ -514,6 +601,34 @@ static int restore_task_ns(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+static int restore_task_creds(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_creds *h;
+	struct cred *realcred, *ecred;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_CREDS);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	realcred = ckpt_obj_fetch(ctx, h->cred_ref, CKPT_OBJ_CRED);
+	if (IS_ERR(realcred)) {
+		ret = PTR_ERR(realcred);
+		goto out;
+	}
+	ecred = ckpt_obj_fetch(ctx, h->ecred_ref, CKPT_OBJ_CRED);
+	if (IS_ERR(ecred)) {
+		ret = PTR_ERR(ecred);
+		goto out;
+	}
+	ctx->realcred = realcred;
+	ctx->ecred = ecred;
+
+out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
 static int restore_task_objs(struct ckpt_ctx *ctx)
 {
 	struct ckpt_hdr_task_objs *h;
@@ -524,7 +639,9 @@ static int restore_task_objs(struct ckpt_ctx *ctx)
 	 * and because shared objects are restored before they are
 	 * referenced. See comment in checkpoint_task_objs.
 	 */
-	ret = restore_task_ns(ctx);
+	ret = restore_task_creds(ctx);
+	if (!ret)
+		ret = restore_task_ns(ctx);
 	if (ret < 0)
 		return ret;
 
@@ -542,6 +659,33 @@ static int restore_task_objs(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+static int restore_creds(struct ckpt_ctx *ctx)
+{
+	int ret;
+	const struct cred *old;
+	struct cred *rcred, *ecred;
+
+	rcred = ctx->realcred;
+	ecred = ctx->ecred;
+
+	/* commit_creds will take one ref for the eff creds, but
+	 * expects us to hold a ref for the obj creds, so take a
+	 * ref here */
+	get_cred(rcred);
+	ret = commit_creds(rcred);
+	if (ret)
+		return ret;
+
+	if (ecred == rcred)
+		return 0;
+
+	old =  override_creds(ecred); /* override_creds otoh takes new ref */
+	put_cred(old);
+
+	ctx->realcred = ctx->ecred = NULL;
+	return 0;
+}
+
 int restore_restart_block(struct ckpt_ctx *ctx)
 {
 	struct ckpt_hdr_restart_block *h;
@@ -670,6 +814,10 @@ int restore_task(struct ckpt_ctx *ctx)
 		goto out;
 	ret = restore_cpu(ctx);
 	ckpt_debug("cpu %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = restore_creds(ctx);
+	ckpt_debug("creds: ret %d\n", ret);
  out:
 	return ret;
 }
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index ec117f0..812a444 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -53,6 +53,7 @@ extern void *ckpt_obj_fetch(struct ckpt_ctx *ctx, int objref,
 			    enum obj_type type);
 extern int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
 			       enum obj_type type, int *first);
+extern int obj_lookup(struct ckpt_ctx *ctx, void *ptr);
 extern int ckpt_obj_insert(struct ckpt_ctx *ctx, void *ptr, int objref,
 			   enum obj_type type);
 
@@ -117,6 +118,16 @@ extern int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
 extern int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
 			       struct ckpt_hdr_file *h);
 
+/* credentials */
+int checkpoint_groupinfo(struct ckpt_ctx *ctx, void *ptr);
+int checkpoint_userns(struct ckpt_ctx *ctx, void *ptr);
+int checkpoint_user(struct ckpt_ctx *ctx, void *ptr);
+int checkpoint_cred(struct ckpt_ctx *ctx, void *ptr);
+void *restore_groupinfo(struct ckpt_ctx *ctx);
+void *restore_userns(struct ckpt_ctx *ctx);
+void *restore_user(struct ckpt_ctx *ctx);
+void *restore_cred(struct ckpt_ctx *ctx);
+
 /* memory */
 extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 57d29c1..e38867e 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -56,6 +56,11 @@ enum {
 	CKPT_HDR_NS,
 	CKPT_HDR_UTS_NS,
 	CKPT_HDR_IPC_NS,
+	CKPT_HDR_USER_NS,
+	CKPT_HDR_CRED,
+	CKPT_HDR_USER,
+	CKPT_HDR_GROUPINFO,
+	CKPT_HDR_TASK_CREDS,
 
 	/* 201-299: reserved for arch-dependent */
 
@@ -104,6 +109,10 @@ enum obj_type {
 	CKPT_OBJ_NS,
 	CKPT_OBJ_UTS_NS,
 	CKPT_OBJ_IPC_NS,
+	CKPT_OBJ_USER_NS,
+	CKPT_OBJ_CRED,
+	CKPT_OBJ_USER,
+	CKPT_OBJ_GROUPINFO,
 	CKPT_OBJ_MAX
 };
 
@@ -163,9 +172,61 @@ struct ckpt_hdr_task {
 	__u32 exit_code;
 	__u32 exit_signal;
 
+#ifdef CONFIG_AUDITSYSCALL
+	/* would audit want to track the checkpointed ids,
+	   or (more likely) who actually restarted? */
+#endif
+
 	__u32 task_comm_len;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_task_creds {
+	struct ckpt_hdr h;
+	__s32 cred_ref;
+	__s32 ecred_ref;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_cred {
+	struct ckpt_hdr h;
+	__u32 uid, suid, euid, fsuid;
+	__u32 gid, sgid, egid, fsgid;
+	__u64 cap_i, cap_p, cap_e;
+	__u64 cap_x;  /* bounding set ('X') */
+	__s32 user_ref;
+	__s32 groupinfo_ref;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_groupinfo {
+	struct ckpt_hdr h;
+	__u32 ngroups;
+	/*
+	 * This is followed by ngroups __u32s
+	 */
+	__u32 groups[0];
+} __attribute__((aligned(8)));
+
+/*
+ * todo - keyrings and LSM
+ * These may be better done with userspace help though
+ */
+struct ckpt_hdr_user_struct {
+	struct ckpt_hdr h;
+	__u32 uid;
+	__s32 userns_ref;
+} __attribute__((aligned(8)));
+
+/*
+ * The user-struct mostly tracks system resource usage.
+ * Most of it's contents therefore will simply be set
+ * correctly as restart opens resources
+ */
+#define CKPT_USERNS_INIT 1
+struct ckpt_hdr_user_ns {
+	struct ckpt_hdr h;
+	__u32 flags;
+	__s32 creator_ref;
+} __attribute__((aligned(8)));
+
 /* namespaces */
 struct ckpt_hdr_task_ns {
 	struct ckpt_hdr h;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 45a0f80..ffa94b3 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -15,7 +15,6 @@
 
 #define CHECKPOINT_SUBTREE	0x1
 
-
 #ifdef __KERNEL__
 struct ckpt_ctx;
 struct ckpt_hdr_file;
@@ -70,6 +69,7 @@ struct ckpt_ctx {
 	atomic_t tasks_count;		/* sync of tasks: used to coordinate */
 	struct completion complete;	/* container root and other tasks on */
 	wait_queue_head_t waitq;	/* start, end, and restart ordering */
+	struct cred *realcred, *ecred;	/* tmp storage for cred at restart */
 };
 
 /* ckpt_ctx: kflags */
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 2ffffbe..e3269d5 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -77,6 +77,14 @@ extern int groups_search(const struct group_info *, gid_t);
 extern int in_group_p(gid_t);
 extern int in_egroup_p(gid_t);
 
+#ifdef CONFIG_CHECKPOINT
+struct ckpt_ctx;
+int checkpoint_write_groupinfo(struct ckpt_ctx *, struct group_info *);
+struct group_info *restore_read_groupinfo(struct ckpt_ctx *);
+int checkpoint_write_user(struct ckpt_ctx *, struct user_struct *);
+struct user_struct *restore_read_user(struct ckpt_ctx *);
+#endif
+
 /*
  * The common credentials for a thread group
  * - shared by CLONE_THREAD
@@ -352,4 +360,9 @@ int cred_setresgid(struct cred *new, gid_t rgid, gid_t egid, gid_t sgid);
 int cred_setfsuid(struct cred *new, uid_t uid, uid_t *old_fsuid);
 int cred_setfsgid(struct cred *new, gid_t gid, gid_t *old_fsgid);
 
+#ifdef CONFIG_CHECKPOINT
+int checkpoint_write_cred(struct ckpt_ctx *, const struct cred *);
+struct cred *restore_read_cred(struct ckpt_ctx *);
+#endif
+
 #endif /* _LINUX_CRED_H */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index a2b82d5..3eeee40 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -62,4 +62,10 @@ static inline void put_user_ns(struct user_namespace *ns)
 
 #endif
 
+#ifdef CONFIG_CHECKPOINT
+struct ckpt_ctx;
+int checkpoint_write_userns(struct ckpt_ctx *, struct user_namespace *);
+struct user_namespace *restore_read_userns(struct ckpt_ctx *);
+#endif
+
 #endif /* _LINUX_USER_H */
diff --git a/kernel/cred.c b/kernel/cred.c
index a017399..70dbc78 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,7 @@
 #include <linux/init_task.h>
 #include <linux/security.h>
 #include <linux/cn_proc.h>
+#include <linux/checkpoint.h>
 #include "cred-internals.h"
 
 static struct kmem_cache *cred_jar;
@@ -703,3 +704,119 @@ int cred_setfsgid(struct cred *new, gid_t gid, gid_t *old_fsgid)
 	}
 	return -EPERM;
 }
+
+#ifdef CONFIG_CHECKPOINT
+int checkpoint_write_cred(struct ckpt_ctx *ctx, const struct cred *cred)
+{
+	int ret;
+	int groupinfo_ref, user_ref;
+	struct ckpt_hdr_cred *h;
+
+	groupinfo_ref = checkpoint_obj(ctx, cred->group_info,
+					CKPT_OBJ_GROUPINFO);
+	if (groupinfo_ref < 0)
+		return groupinfo_ref;
+	user_ref = checkpoint_obj(ctx, cred->user, CKPT_OBJ_USER);
+	if (user_ref < 0)
+		return user_ref;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_CRED);
+	if (!h)
+		return -ENOMEM;
+
+	h->uid = cred->uid;
+	h->suid = cred->suid;
+	h->euid = cred->euid;
+	h->fsuid = cred->fsuid;
+
+	h->gid = cred->gid;
+	h->sgid = cred->sgid;
+	h->egid = cred->egid;
+	h->fsgid = cred->fsgid;
+
+	checkpoint_save_cap(&h->cap_i, cred->cap_inheritable);
+	checkpoint_save_cap(&h->cap_p, cred->cap_permitted);
+	checkpoint_save_cap(&h->cap_e, cred->cap_effective);
+	checkpoint_save_cap(&h->cap_x, cred->cap_bset);
+
+	h->user_ref = user_ref;
+	h->groupinfo_ref = groupinfo_ref;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+struct cred *restore_read_cred(struct ckpt_ctx *ctx)
+{
+	struct cred *cred;
+	struct ckpt_hdr_cred *h;
+	struct user_struct *user;
+	struct group_info *groupinfo;
+	int ret = -EINVAL;
+	uid_t olduid;
+	gid_t oldgid;
+	int i;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_CRED);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+
+	cred = prepare_creds();
+	if (!cred)
+		goto error;
+
+
+	/* Do we care if the target user and target group were compatible?
+	 * Probably.  But then, we can't do any setuid without CAP_SETUID,
+	 * so we must have been privileged to abuse it... */
+	groupinfo = ckpt_obj_fetch(ctx, h->groupinfo_ref, CKPT_OBJ_GROUPINFO);
+	if (IS_ERR(groupinfo))
+		goto err_putcred;
+	user = ckpt_obj_fetch(ctx, h->user_ref, CKPT_OBJ_USER);
+	if (IS_ERR(user))
+		goto err_putcred;
+
+	/*
+	 * TODO: this check should  go into the common helper in
+	 * kernel/sys.c, and should account for user namespaces
+	 */
+	if (!capable(CAP_SETGID))
+		for (i = 0; i < groupinfo->ngroups; i++) {
+			if (!in_egroup_p(GROUP_AT(groupinfo, i)))
+				goto err_putcred;
+		}
+	ret = set_groups(cred, groupinfo);
+	if (ret < 0)
+		goto err_putcred;
+	free_uid(cred->user);
+	cred->user = get_uid(user);
+	ret = cred_setresuid(cred, h->uid, h->euid, h->suid);
+	if (ret < 0)
+		goto err_putcred;
+	ret = cred_setfsuid(cred, h->fsuid, &olduid);
+	if (olduid != h->fsuid && ret < 0)
+		goto err_putcred;
+	ret = cred_setresgid(cred, h->gid, h->egid, h->sgid);
+	if (ret < 0)
+		goto err_putcred;
+	ret = cred_setfsgid(cred, h->fsgid, &oldgid);
+	if (oldgid != h->fsgid && ret < 0)
+		goto err_putcred;
+	ret = checkpoint_restore_cap(h->cap_e, h->cap_i, h->cap_p, h->cap_x,
+				cred);
+	if (ret)
+		goto err_putcred;
+
+	ckpt_hdr_put(ctx, h);
+	return cred;
+
+err_putcred:
+	abort_creds(cred);
+error:
+	ckpt_hdr_put(ctx, h);
+	return ERR_PTR(ret);
+}
+
+#endif
diff --git a/kernel/groups.c b/kernel/groups.c
index 2b45b2e..74db0ae 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,6 +6,7 @@
 #include <linux/slab.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/checkpoint.h>
 #include <asm/uaccess.h>
 
 /* init to 2 - one for init_task, one to ensure it is never freed */
@@ -286,3 +287,61 @@ int in_egroup_p(gid_t grp)
 }
 
 EXPORT_SYMBOL(in_egroup_p);
+
+#ifdef CONFIG_CHECKPOINT
+int checkpoint_write_groupinfo(struct ckpt_ctx *ctx, struct group_info *g)
+{
+	int ret, i, size;
+	struct ckpt_hdr_groupinfo *h;
+
+	size = sizeof(*h) + g->ngroups * sizeof(__u32);
+	h = ckpt_hdr_get_type(ctx, size, CKPT_HDR_GROUPINFO);
+	if (!h)
+		return -ENOMEM;
+
+	h->ngroups = g->ngroups;
+	for (i = 0; i < g->ngroups; i++)
+		h->groups[i] = GROUP_AT(g, i);
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/*
+ * TODO - switch to reading in smaller blocks?
+ */
+#define MAX_GROUPINFO_SIZE (sizeof(*h)+NGROUPS_MAX*sizeof(gid_t))
+struct group_info *restore_read_groupinfo(struct ckpt_ctx *ctx)
+{
+	struct group_info *g;
+	struct ckpt_hdr_groupinfo *h;
+	int i;
+
+	h = ckpt_read_buf_type(ctx, MAX_GROUPINFO_SIZE, CKPT_HDR_GROUPINFO);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+
+	g = ERR_PTR(-EINVAL);
+	if (h->ngroups > NGROUPS_MAX)
+		goto out;
+
+	for (i = 1; i < h->ngroups; i++)
+		if (h->groups[i-1] >= h->groups[i])
+			goto out;
+
+	g = groups_alloc(h->ngroups);
+	if (!g) {
+		g = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	for (i = 0; i < h->ngroups; i++)
+		GROUP_AT(g, i) = h->groups[i];
+
+out:
+	ckpt_hdr_put(ctx, h);
+	return g;
+}
+
+#endif
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba..97f13e2 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/user_namespace.h>
+#include <linux/checkpoint.h>
 #include "cred-internals.h"
 
 struct user_namespace init_user_ns = {
@@ -497,3 +498,149 @@ static int __init uid_cache_init(void)
 }
 
 module_init(uid_cache_init);
+
+#ifdef CONFIG_CHECKPOINT
+/*
+ * write the user struct
+ * TODO keyring will need to be dumped
+ *
+ * Here is what we're doing.  Remember a task can do clone(CLONE_NEWUSER)
+ * resulting in a cloned task in a new user namespace, with uid 0 in that
+ * new user_ns.  In that case, the parent's user (uid+user_ns) is the
+ * 'creator' of the new user_ns.
+ * Here, we call the user_ns of the ctx->root_task the 'root_ns'.  When we
+ * checkpoint a user-struct, we must store the chain of creators.  We
+ * must not do so recursively, this being the kernel.  In
+ * checkpoint_write_user() we walk and record in memory the list of creators up
+ * to either the latest user_struct which has already been saved, or the
+ * root_ns.  Then we walk that chain backward, writing out the user_ns and
+ * user_struct to the checkpoint image.
+ */
+#define UNSAVED_STRIDE 50
+int checkpoint_write_user(struct ckpt_ctx *ctx, struct user_struct *u)
+{
+	struct user_namespace *ns, *root_ns;
+	struct ckpt_hdr_user_struct *h;
+	int ns_objref;
+	int ret, i, unsaved_ns_nr = 0;
+	struct user_struct *save_u;
+	struct user_struct **unsaved_creators;
+	int step = 1, size;
+
+	/* if we've already saved the userns, then life is good */
+	ns_objref = obj_lookup(ctx, u->user_ns);
+	if (ns_objref)
+		goto write_user;
+
+	root_ns = task_cred_xxx(ctx->root_task, user)->user_ns;
+
+	if (u->user_ns == root_ns)
+		goto save_last_ns;
+
+	size = UNSAVED_STRIDE*sizeof(struct user_struct *);
+	unsaved_creators = kmalloc(size, GFP_KERNEL);
+	if (!unsaved_creators)
+		return -ENOMEM;
+	save_u = u;
+	do {
+		ns = save_u->user_ns;
+		save_u = ns->creator;
+		if (obj_lookup(ctx, save_u))
+			goto found;
+		unsaved_creators[unsaved_ns_nr++] = save_u;
+		if (unsaved_ns_nr == step * UNSAVED_STRIDE) {
+			step++;
+			size = step*UNSAVED_STRIDE*sizeof(struct user_struct *);
+			unsaved_creators = krealloc(unsaved_creators, size,
+							GFP_KERNEL);
+			if (!unsaved_creators)
+				return -ENOMEM;
+		}
+	} while (ns != root_ns);
+
+found:
+	for (i = unsaved_ns_nr-1; i >= 0; i--) {
+		ret = checkpoint_obj(ctx, unsaved_creators[i], CKPT_OBJ_USER);
+		if (ret < 0) {
+			kfree(unsaved_creators);
+			return ret;
+		}
+	}
+	kfree(unsaved_creators);
+
+save_last_ns:
+	ns_objref = checkpoint_obj(ctx, u->user_ns, CKPT_OBJ_USER_NS);
+	if (ns_objref < 0)
+		return ns_objref;
+
+write_user:
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_USER);
+	if (!h)
+		return -ENOMEM;
+
+	h->uid = u->uid;
+	h->userns_ref = ns_objref;
+
+	/* write out the user_struct */
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static int may_setuid(struct user_namespace *ns, uid_t uid)
+{
+	/*
+	 * this next check will one day become
+	 * if capable(CAP_SETUID, ns) return 1;
+	 * followed by uid_equiv(current_userns, current_uid, ns, uid)
+	 * instead of just uids.
+	 */
+	if (capable(CAP_SETUID))
+		return 1;
+
+	/*
+	 * this may be overly strict, but since we might end up
+	 * restarting a privileged program here, we do not want
+	 * someone with only CAP_SYS_ADMIN but no CAP_SETUID to
+	 * be able to create random userids even in a userns he
+	 * created.
+	 */
+	if (current_user()->user_ns != ns)
+		return 0;
+	if (current_uid() == uid ||
+		current_euid() == uid ||
+		current_suid() == uid)
+		return 1;
+	return 0;
+}
+
+struct user_struct *restore_read_user(struct ckpt_ctx *ctx)
+{
+	struct user_struct *u;
+	struct user_namespace *ns;
+	struct ckpt_hdr_user_struct *h;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_USER);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+
+	ns = ckpt_obj_fetch(ctx, h->userns_ref, CKPT_OBJ_USER_NS);
+	if (IS_ERR(ns)) {
+		u = ERR_PTR(PTR_ERR(ns));
+		goto out;
+	}
+
+	if (!may_setuid(ns, h->uid)) {
+		u = ERR_PTR(-EPERM);
+		goto out;
+	}
+	u = alloc_uid(ns, h->uid);
+	if (!u)
+		u = ERR_PTR(-EINVAL);
+
+out:
+	ckpt_hdr_put(ctx, h);
+	return u;
+}
+#endif
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e624b0f..857cb3d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/checkpoint.h>
 #include <linux/cred.h>
 
 static struct user_namespace *_new_user_ns(struct user_struct *creator,
@@ -103,3 +104,88 @@ void free_user_ns(struct kref *kref)
 	schedule_work(&ns->destroyer);
 }
 EXPORT_SYMBOL(free_user_ns);
+
+#ifdef CONFIG_CHECKPOINT
+/*
+ * checkpoint_write_userns() is only called from
+ * checkpoint_write_user().  When called, we always know that
+ * either:
+ *   1. This is the root_ns (user_ns of the ctx->root_task),
+ *	in which case we don't store a creator, but rather
+ *	set the CKPT_USERNS_INIT flag.
+ * or
+ *   2. The creator has already been written out to the
+ *	checkpoint image (and saved in the objhash)
+ */
+int checkpoint_write_userns(struct ckpt_ctx *ctx,
+				   struct user_namespace *ns)
+{
+	struct ckpt_hdr_user_ns *h;
+	int creator_ref = 0;
+	unsigned int flags = 0;
+	struct user_namespace *root_ns;
+	int ret;
+
+	root_ns = task_cred_xxx(ctx->root_task, user)->user_ns;
+	if (ns == root_ns)
+		flags = CKPT_USERNS_INIT;
+	else
+		creator_ref = obj_lookup(ctx, ns->creator);
+	if (!flags && !creator_ref)
+		return -EINVAL;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_USER_NS);
+	if (!h)
+		return -ENOMEM;
+	h->creator_ref = creator_ref;
+	h->flags = flags;
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+struct user_namespace *restore_read_userns(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_user_ns *h;
+	struct user_namespace *ns;
+	struct user_struct *new_root, *creator;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_USER_NS);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+	if (h->flags & ~CKPT_USERNS_INIT)  /* only 1 valid flag */
+		return ERR_PTR(-EINVAL);
+	if (h->flags & CKPT_USERNS_INIT) {
+		ckpt_hdr_put(ctx, h);
+		/* grab an extra ref bc objhash will drop an extra */
+		return get_user_ns(current_user_ns());
+	}
+	creator = ckpt_obj_fetch(ctx, h->creator_ref, CKPT_OBJ_USER);
+	ckpt_hdr_put(ctx, h);
+
+	if (IS_ERR(creator))
+		return ERR_PTR(-EINVAL);
+	ns = new_user_ns(creator, &new_root);
+
+	if (IS_ERR(ns))
+		return ns;
+
+	/* new_user_ns() doesn't bump creator's refcount */
+	get_uid(creator);
+
+	/* objhash will drop new_ns refcount, but new_root
+	 * should hold a ref */
+	get_user_ns(ns);
+
+	/*
+	 * Free the new root user.  If we actually needed it,
+	 * then it will show up later in the checkpoint image
+	 * The objhash will keep the userns pinned until then.
+	 */
+	free_uid(new_root);
+
+	return ns;
+}
+
+#endif
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 7/9] cr: restore file->f_cred
       [not found] ` <20090529223229.GA14536-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-29 22:33   ` Serge E. Hallyn
  2009-05-29 22:33   ` [PATCH 8/9] user namespaces: debug refcounts Serge E. Hallyn
  1 sibling, 0 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-05-29 22:33 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells,
	linux-security-module-u79uwXL29TY76Z2rM5mHXA, Alexey Dobriyan,
	Andrew Morgan

Restore a file's f_cred.  This is set to the cred of the task doing
the open, so often it will be the same as that of the restarted task.

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/files.c             |   16 ++++++++++++++--
 include/linux/checkpoint_hdr.h |    2 +-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/checkpoint/files.c b/checkpoint/files.c
index b264e40..cc813ed 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -154,7 +154,11 @@ int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
 	h->f_pos = file->f_pos;
 	h->f_version = file->f_version;
 
-	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
+	h->f_credref = checkpoint_obj(ctx, file->f_cred, CKPT_OBJ_CRED);
+	if (h->f_credref < 0)
+		return h->f_credref;
+
+	/* FIX: need also file->f_owner, etc */
 
 	return 0;
 }
@@ -377,8 +381,16 @@ int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
 			struct ckpt_hdr_file *h)
 {
 	int ret;
+	struct cred *cred;
+
+	/* FIX: need to restore owner etc */
 
-	/* FIX: need to restore uid, gid, owner etc */
+	/* restore the cred */
+	cred = ckpt_obj_fetch(ctx, h->f_credref, CKPT_OBJ_CRED);
+	if (IS_ERR(cred))
+		return PTR_ERR(cred);
+	put_cred(file->f_cred);
+	file->f_cred = get_cred(cred);
 
 	/* safe to set 1st arg (fd) to 0, as command is F_SETFL */
 	ret = vfs_fcntl(0, F_SETFL, h->f_flags & CKPT_SETFL_MASK, file);
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index e38867e..7f4972b 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -296,7 +296,7 @@ struct ckpt_hdr_file {
 	__u32 f_type;
 	__u32 f_mode;
 	__u32 f_flags;
-	__u32 _padding;
+	__s32 f_credref;
 	__u64 f_pos;
 	__u64 f_version;
 } __attribute__((aligned(8)));
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 8/9] user namespaces: debug refcounts
       [not found] ` <20090529223229.GA14536-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-29 22:33   ` [PATCH 7/9] cr: restore file->f_cred Serge E. Hallyn
@ 2009-05-29 22:33   ` Serge E. Hallyn
  2009-05-31 18:51     ` Alexey Dobriyan
  1 sibling, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-05-29 22:33 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells,
	linux-security-module-u79uwXL29TY76Z2rM5mHXA, Alexey Dobriyan,
	Andrew Morgan

Create /proc/userns, which prints out all user namespaces.  It
prints the address of the user_ns itself, the uid and userns address
of the user who created it, and the reference count.

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/user_namespace.h |    2 +
 kernel/user.c                  |    1 +
 kernel/user_namespace.c        |   84 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 0 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 3eeee40..4503224 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -14,8 +14,10 @@ struct user_namespace {
 	struct hlist_head	uidhash_table[UIDHASH_SZ];
 	struct user_struct	*creator;
 	struct work_struct	destroyer;
+	struct list_head	list;
 };
 
+extern spinlock_t usernslist_lock;
 extern struct user_namespace init_user_ns;
 
 #ifdef CONFIG_USER_NS
diff --git a/kernel/user.c b/kernel/user.c
index 97f13e2..1a9a44f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -24,6 +24,7 @@ struct user_namespace init_user_ns = {
 		.refcount	= ATOMIC_INIT(2),
 	},
 	.creator = &root_user,
+	.list = LIST_HEAD_INIT(init_user_ns.list),
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 857cb3d..e76b38f 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -11,6 +11,11 @@
 #include <linux/user_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/cred.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+DEFINE_SPINLOCK(usernslist_lock);
 
 static struct user_namespace *_new_user_ns(struct user_struct *creator,
 				   struct user_struct **newroot)
@@ -41,6 +46,9 @@ static struct user_namespace *_new_user_ns(struct user_struct *creator,
 	/* alloc_uid() incremented the userns refcount.  Just set it to 1 */
 	kref_set(&ns->kref, 1);
 
+	spin_lock(&usernslist_lock);
+	list_add_tail(&ns->list, &init_user_ns.list);
+	spin_unlock(&usernslist_lock);
 	*newroot = root_user;
 	return ns;
 }
@@ -91,6 +99,9 @@ static void free_user_ns_work(struct work_struct *work)
 {
 	struct user_namespace *ns =
 		container_of(work, struct user_namespace, destroyer);
+	spin_lock(&usernslist_lock);
+	list_del(&ns->list);
+	spin_unlock(&usernslist_lock);
 	free_uid(ns->creator);
 	kfree(ns);
 }
@@ -105,6 +116,79 @@ void free_user_ns(struct kref *kref)
 }
 EXPORT_SYMBOL(free_user_ns);
 
+#ifdef CONFIG_PROC_FS
+static int proc_userns_show(struct seq_file *m, void *v)
+{
+	struct user_namespace *ns = v;
+	seq_printf(m, "userns %p creator (uid %d ns %p) count %d\n",
+		(void *)ns, ns->creator->uid, (void *) ns->creator->user_ns,
+		atomic_read(&ns->kref.refcount));
+	return 0;
+}
+
+static void *proc_userns_start(struct seq_file *p, loff_t *_pos)
+{
+	loff_t pos = *_pos;
+	struct user_namespace *ns = &init_user_ns;
+	spin_lock(&usernslist_lock);
+	while (pos) {
+		pos--;
+		ns = list_entry(ns->list.next, struct user_namespace, list);
+		if (ns  == &init_user_ns)
+			return NULL;
+	}
+	return ns;
+}
+
+static void *proc_userns_next(struct seq_file *p, void *v, loff_t *_pos)
+{
+	struct user_namespace *ns = v;
+	(*_pos)++;
+	ns = list_entry(ns->list.next, struct user_namespace, list);
+	if (ns == &init_user_ns)
+		return NULL;
+	return ns;
+}
+
+static void proc_userns_stop(struct seq_file *p, void *v)
+{
+	spin_unlock(&usernslist_lock);
+}
+
+static const struct seq_operations proc_userns_ops;
+
+static int proc_userns_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &proc_userns_ops);
+}
+
+static const struct seq_operations proc_userns_ops = {
+	.start	= proc_userns_start,
+	.next	= proc_userns_next,
+	.stop	= proc_userns_stop,
+	.show	= proc_userns_show,
+};
+
+const struct file_operations proc_userns_fops = {
+	.open		= proc_userns_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static __init int user_ns_debug(void)
+{
+	struct proc_dir_entry *p;
+
+	p = proc_create("userns", 0, NULL, &proc_userns_fops);
+	if (!p)
+		panic("cannot create /proc/userns\n");
+	return 0;
+}
+
+__initcall(user_ns_debug);
+#endif
+
 #ifdef CONFIG_CHECKPOINT
 /*
  * checkpoint_write_userns() is only called from
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 9/9] cr: ipc: reset kern_ipc_perms
  2009-05-29 22:32 [PATCH 0/9] credentials c/r: Introduction Serge E. Hallyn
                   ` (6 preceding siblings ...)
       [not found] ` <20090529223229.GA14536-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-29 22:34 ` Serge E. Hallyn
  7 siblings, 0 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-05-29 22:34 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, Alexey Dobriyan, Andrew Morgan, David Howells,
	linux-security-module

Reset the checkpointed uid and gid info on ipc objects.

Right now, we return -EPERM if the user calling sys_restart() isn't
allowed to create an object with the checkpointed uid.  We may prefer
to simply use the caller's uid in that case - but that could lead to
subtle userspace bugs?  Unsure, so going for the stricter behavior.

TODO: restore kern_ipc_perms->security.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
 ipc/checkpoint.c |   33 +++++++++++++++++++++++++++++++--
 1 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/ipc/checkpoint.c b/ipc/checkpoint.c
index f621226..bc77743 100644
--- a/ipc/checkpoint.c
+++ b/ipc/checkpoint.c
@@ -119,6 +119,26 @@ int checkpoint_ipc_ns(struct ckpt_ctx *ctx, struct ipc_namespace *ipc_ns)
  * Restart
  */
 
+/*
+ * check whether current task may create ipc object with
+ * checkpointed uids and gids.
+ * Return 1 if ok, 0 if not.
+ */
+static int validate_created_perms(struct ckpt_hdr_ipc_perms *h)
+{
+	const struct cred *cred = current_cred();
+	uid_t uid = cred->uid, euid = cred->euid;
+
+	/* actually I don't know - is CAP_IPC_OWNER the right one? */
+	if (((h->uid != uid && h->uid == euid) ||
+			(h->cuid != uid && h->cuid != euid) ||
+			!in_group_p(h->cgid) ||
+			!in_group_p(h->gid)) &&
+			!capable(CAP_IPC_OWNER))
+		return 0;
+	return 1;
+}
+
 int restore_load_ipc_perms(struct ckpt_hdr_ipc_perms *h,
 			   struct kern_ipc_perm *perm)
 {
@@ -139,14 +159,23 @@ int restore_load_ipc_perms(struct ckpt_hdr_ipc_perms *h,
 
 	perm->id = h->id;
 	perm->key = h->key;
-#if 0 /* FIX: requires security checks */
+
+	if (!validate_created_perms(h))
+		return -EPERM;
 	perm->uid = h->uid;
 	perm->gid = h->gid;
 	perm->cuid = h->cuid;
 	perm->cgid = h->cgid;
-#endif
 	perm->mode = h->mode;
 	perm->seq = h->seq;
+	/*
+	 * Todo: restore perm->security.
+	 * At the moment it gets set by security_x_alloc() called through
+	 * ipcget()->ipcget_public()->ops-.getnew (->nequeue for instance)
+	 * We will want to ask the LSM to consider resetting the
+	 * checkpointed ->security, based on current_security(),
+	 * the checkpointed ->security, and the checkpoint file context.
+	 */
 
 	return 0;
 }
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 8/9] user namespaces: debug refcounts
  2009-05-29 22:33   ` [PATCH 8/9] user namespaces: debug refcounts Serge E. Hallyn
@ 2009-05-31 18:51     ` Alexey Dobriyan
  2009-06-01 19:02       ` Serge E. Hallyn
  0 siblings, 1 reply; 33+ messages in thread
From: Alexey Dobriyan @ 2009-05-31 18:51 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Oren Laadan, Linux Containers, Andrew Morgan, David Howells,
	linux-security-module

On Fri, May 29, 2009 at 05:33:52PM -0500, Serge E. Hallyn wrote:
> Create /proc/userns, which prints out all user namespaces.  It
> prints the address of the user_ns itself, the uid and userns address
> of the user who created it, and the reference count.

> +static int proc_userns_show(struct seq_file *m, void *v)
> +{
> +	struct user_namespace *ns = v;
> +	seq_printf(m, "userns %p creator (uid %d ns %p) count %d\n",
> +		(void *)ns, ns->creator->uid, (void *) ns->creator->user_ns,
> +		atomic_read(&ns->kref.refcount));
> +	return 0;
> +}

Kernel shouldn't expose location of kernel objects to userspace.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-05-29 22:33 ` [PATCH 5/9] cr: capabilities: define checkpoint and restore fns Serge E. Hallyn
@ 2009-05-31 20:26   ` Andrew G. Morgan
  2009-05-31 20:56     ` Alexey Dobriyan
                       ` (2 more replies)
  0 siblings, 3 replies; 33+ messages in thread
From: Andrew G. Morgan @ 2009-05-31 20:26 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

Serge,

I'm not sure I'm too happy with hard coding the 64-bitness of
capability sets. It may well be a very long time before we increase
their size, but couldn't you prepare for that with some reference to
the prevailing magic numbers for the current ABI representation?

Also, the use of 'error' as both a variable and a goto destination
looks a little confusing.

Cheers

Andrew


On Fri, May 29, 2009 at 3:33 PM, Serge E. Hallyn <serue@us.ibm.com> wrote:
> An application checkpoint image will store capability sets
> (and the bounding set) as __u64s.  Define checkpoint and
> restart functions to translate between those and kernel_cap_t's.
>
> Define a common function do_capset_tocred() which applies capability
> set changes to a passed-in struct cred.
>
> The restore function uses do_capset_tocred() to apply the restored
> capabilities to the struct cred being crafted, subject to the
> current task's (task executing sys_restart()) permissions.
>
> Changelog:
>        May 28: add helpers to c/r securebits
>
> Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
> ---
>  include/linux/capability.h |    7 +++
>  kernel/capability.c        |   94 +++++++++++++++++++++++++++++++++++++------
>  security/commoncap.c       |   51 ++++++++++++++++--------
>  3 files changed, 122 insertions(+), 30 deletions(-)
>
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index c302110..b3853ca 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -536,6 +536,13 @@ extern const kernel_cap_t __cap_empty_set;
>  extern const kernel_cap_t __cap_full_set;
>  extern const kernel_cap_t __cap_init_eff_set;
>
> +extern void checkpoint_save_cap(__u64 *dest, kernel_cap_t src);
> +struct cred;
> +extern int checkpoint_restore_cap(__u64 e, __u64 i, __u64 p, __u64 x,
> +                               struct cred *cred);
> +extern void checkpoint_save_securebits(unsigned *, unsigned);
> +extern int checkpoint_restore_securebits(unsigned, struct cred *);
> +
>  /**
>  * has_capability - Determine if a task has a superior capability available
>  * @t: The task in question
> diff --git a/kernel/capability.c b/kernel/capability.c
> index 4e17041..d2c9bb3 100644
> --- a/kernel/capability.c
> +++ b/kernel/capability.c
> @@ -217,6 +217,45 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
>        return ret;
>  }
>
> +static int do_capset_tocred(kernel_cap_t *effective, kernel_cap_t *inheritable,
> +                       kernel_cap_t *permitted, struct cred *new)
> +{
> +       int ret;
> +
> +       ret = security_capset(new, current_cred(),
> +                             effective, inheritable, permitted);
> +       if (ret < 0)
> +               return ret;
> +
> +       /*
> +        * for checkpoint-restart, do we want to wait until end of restart?
> +        * not sure we care */
> +       audit_log_capset(current->pid, new, current_cred());
> +
> +       return 0;
> +}
> +
> +static int do_capset(kernel_cap_t *effective, kernel_cap_t *inheritable,
> +                       kernel_cap_t *permitted)
> +{
> +       struct cred *new;
> +       int ret;
> +
> +       new = prepare_creds();
> +       if (!new)
> +               return -ENOMEM;
> +
> +       ret = do_capset_tocred(effective, inheritable, permitted, new);
> +       if (ret < 0)
> +               goto error;
> +
> +       return commit_creds(new);
> +
> +error:
> +       abort_creds(new);
> +       return ret;
> +}
> +
>  /**
>  * sys_capset - set capabilities for a process or (*) a group of processes
>  * @header: pointer to struct that contains capability version and
> @@ -240,7 +279,6 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
>        struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
>        unsigned i, tocopy;
>        kernel_cap_t inheritable, permitted, effective;
> -       struct cred *new;
>        int ret;
>        pid_t pid;
>
> @@ -271,22 +309,52 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
>                i++;
>        }
>
> -       new = prepare_creds();
> -       if (!new)
> -               return -ENOMEM;
> +       return do_capset(&effective, &inheritable, &permitted);
>
> -       ret = security_capset(new, current_cred(),
> -                             &effective, &inheritable, &permitted);
> -       if (ret < 0)
> -               goto error;
> +}
>
> -       audit_log_capset(pid, new, current_cred());
>
> -       return commit_creds(new);
> +void checkpoint_save_cap(__u64 *dest, kernel_cap_t src)
> +{
> +       *dest = src.cap[0] | (src.cap[1] << sizeof(__u32));
> +}
>
> -error:
> -       abort_creds(new);
> -       return ret;
> +static void do_capbset_drop(struct cred *cred, int cap)
> +{
> +       cap_lower(cred->cap_bset, cap);
> +}
> +
> +int checkpoint_restore_cap(__u64 newe, __u64 newi, __u64 newp, __u64 newx,
> +                       struct cred *cred)
> +{
> +       kernel_cap_t effective, inheritable, permitted, bset;
> +       int may_dropbcap = capable(CAP_SETPCAP);
> +       int ret, i;
> +
> +       effective.cap[0] = newe;
> +       effective.cap[1] = (newe >> sizeof(__u32));
> +       inheritable.cap[0] = newi;
> +       inheritable.cap[1] = (newi >> sizeof(__u32));
> +       permitted.cap[0] = newp;
> +       permitted.cap[1] = (newp >> sizeof(__u32));
> +       bset.cap[0] = newx;
> +       bset.cap[1] = (newx >> sizeof(__u32));
> +
> +       ret = do_capset_tocred(&effective, &inheritable, &permitted, cred);
> +       if (ret < 0)
> +               return ret;
> +
> +       for (i = 0; i < CAP_LAST_CAP; i++) {
> +               if (cap_raised(bset, i))
> +                       continue;
> +               if (!cap_raised(current_cred()->cap_bset, i))
> +                       continue;
> +               if (!may_dropbcap)
> +                       return -EPERM;
> +               do_capbset_drop(cred, i);
> +       }
> +
> +       return 0;
>  }
>
>  /**
> diff --git a/security/commoncap.c b/security/commoncap.c
> index beac025..8054a07 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -804,6 +804,28 @@ int cap_task_setnice (struct task_struct *p, int nice)
>  }
>  #endif
>
> +int cap_set_securebits(struct cred *new, unsigned securebits)
> +{
> +       if ((((new->securebits & SECURE_ALL_LOCKS) >> 1)
> +            & (new->securebits ^ securebits))                          /*[1]*/
> +           || ((new->securebits & SECURE_ALL_LOCKS & ~securebits))     /*[2]*/
> +           || (securebits & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))     /*[3]*/
> +           || (cap_capable(current, current_cred(), CAP_SETPCAP,
> +                           SECURITY_CAP_AUDIT) != 0)                   /*[4]*/
> +               /*
> +                * [1] no changing of bits that are locked
> +                * [2] no unlocking of locks
> +                * [3] no setting of unsupported bits
> +                * [4] doing anything requires privilege (go read about
> +                *     the "sendmail capabilities bug")
> +                */
> +           )
> +               /* cannot change a locked bit */
> +               return -EPERM;
> +       new->securebits = securebits;
> +       return 0;
> +}
> +
>  /**
>  * cap_task_prctl - Implement process control functions for this security module
>  * @option: The process control function requested
> @@ -861,24 +883,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
>         * capability-based-privilege environment.
>         */
>        case PR_SET_SECUREBITS:
> -               error = -EPERM;
> -               if ((((new->securebits & SECURE_ALL_LOCKS) >> 1)
> -                    & (new->securebits ^ arg2))                        /*[1]*/
> -                   || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))   /*[2]*/
> -                   || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))   /*[3]*/
> -                   || (cap_capable(current, current_cred(), CAP_SETPCAP,
> -                                   SECURITY_CAP_AUDIT) != 0)           /*[4]*/
> -                       /*
> -                        * [1] no changing of bits that are locked
> -                        * [2] no unlocking of locks
> -                        * [3] no setting of unsupported bits
> -                        * [4] doing anything requires privilege (go read about
> -                        *     the "sendmail capabilities bug")
> -                        */
> -                   )
> -                       /* cannot change a locked bit */
> +               error = cap_set_securebits(new, arg2);
> +               if (error)
>                        goto error;
> -               new->securebits = arg2;
>                goto changed;
>
>        case PR_GET_SECUREBITS:
> @@ -921,6 +928,16 @@ error:
>        return error;
>  }
>
> +void checkpoint_save_securebits(unsigned *b, unsigned cred_securebits)
> +{
> +       *b = cred_securebits;
> +}
> +
> +int checkpoint_restore_securebits(unsigned b, struct cred *cred)
> +{
> +       return cap_set_securebits(cred, b);
> +}
> +
>  /**
>  * cap_syslog - Determine whether syslog function is permitted
>  * @type: Function requested
> --
> 1.6.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-05-31 20:26   ` Andrew G. Morgan
@ 2009-05-31 20:56     ` Alexey Dobriyan
  2009-06-01  1:38     ` Serge E. Hallyn
  2009-06-01 15:49     ` Serge E. Hallyn
  2 siblings, 0 replies; 33+ messages in thread
From: Alexey Dobriyan @ 2009-05-31 20:56 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Serge E. Hallyn, Oren Laadan, Linux Containers, David Howells,
	linux-security-module

On Sun, May 31, 2009 at 01:26:21PM -0700, Andrew G. Morgan wrote:
> I'm not sure I'm too happy with hard coding the 64-bitness of
> capability sets. It may well be a very long time before we increase
> their size, but couldn't you prepare for that with some reference to
> the prevailing magic numbers for the current ABI representation?

Yes, please, use BUILD_BUG_ON or something

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-05-31 20:26   ` Andrew G. Morgan
  2009-05-31 20:56     ` Alexey Dobriyan
@ 2009-06-01  1:38     ` Serge E. Hallyn
  2009-06-01  2:18       ` Andrew G. Morgan
  2009-06-01 15:49     ` Serge E. Hallyn
  2 siblings, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-06-01  1:38 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Serge E. Hallyn, Oren Laadan, Linux Containers, Alexey Dobriyan,
	David Howells, linux-security-module

Quoting Andrew G. Morgan (morgan@kernel.org):
> Serge,
> 
> I'm not sure I'm too happy with hard coding the 64-bitness of
> capability sets. It may well be a very long time before we increase
> their size, but couldn't you prepare for that with some reference to
> the prevailing magic numbers for the current ABI representation?

Hmm, ok.  I figured since the c/r code was in capability.h it would
be obvious that going past 64-bit would mean a new checkpoint image
format.  I can see where that's silly...

I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
suffice?

> Also, the use of 'error' as both a variable and a goto destination
> looks a little confusing.

Ok will change.

Did you see any problems with the way I authorize a task's resetting
of capabilities at sys_restart()?

thanks,
-serge

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-01  1:38     ` Serge E. Hallyn
@ 2009-06-01  2:18       ` Andrew G. Morgan
  2009-06-01 13:35         ` Serge E. Hallyn
  0 siblings, 1 reply; 33+ messages in thread
From: Andrew G. Morgan @ 2009-06-01  2:18 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Serge E. Hallyn, Oren Laadan, Linux Containers, Alexey Dobriyan,
	David Howells, linux-security-module

On Sun, May 31, 2009 at 6:38 PM, Serge E. Hallyn <serge@hallyn.com> wrote:
>
> Quoting Andrew G. Morgan (morgan@kernel.org):
> > Serge,
> >
> > I'm not sure I'm too happy with hard coding the 64-bitness of
> > capability sets. It may well be a very long time before we increase
> > their size, but couldn't you prepare for that with some reference to
> > the prevailing magic numbers for the current ABI representation?
>
> Hmm, ok.  I figured since the c/r code was in capability.h it would
> be obvious that going past 64-bit would mean a new checkpoint image
> format.  I can see where that's silly...
>
> I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
> suffice?

I guess I'm not really well up on what the plans are for checkpoint
images. Is there some sort of version control/signature/checksum to
protect a kernel from loading an image that has been hacked to modify
the privilege it was running with when the checkpoint was created?

> > Also, the use of 'error' as both a variable and a goto destination
> > looks a little confusing.
>
> Ok will change.
>
> Did you see any problems with the way I authorize a task's resetting
> of capabilities at sys_restart()?

[See above.] Is there a mailing list or something I can lurk on to get
up to speed on what is being intended?

Thanks

Andrew

>
> thanks,
> -serge
> --
> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-01  2:18       ` Andrew G. Morgan
@ 2009-06-01 13:35         ` Serge E. Hallyn
  2009-06-01 15:46           ` Andrew G. Morgan
  0 siblings, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-06-01 13:35 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

Quoting Andrew G. Morgan (morgan@kernel.org):
> On Sun, May 31, 2009 at 6:38 PM, Serge E. Hallyn <serge@hallyn.com> wrote:
> >
> > Quoting Andrew G. Morgan (morgan@kernel.org):
> > > Serge,
> > >
> > > I'm not sure I'm too happy with hard coding the 64-bitness of
> > > capability sets. It may well be a very long time before we increase
> > > their size, but couldn't you prepare for that with some reference to
> > > the prevailing magic numbers for the current ABI representation?
> >
> > Hmm, ok.  I figured since the c/r code was in capability.h it would
> > be obvious that going past 64-bit would mean a new checkpoint image
> > format.  I can see where that's silly...
> >
> > I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
> > suffice?
> 
> I guess I'm not really well up on what the plans are for checkpoint
> images. Is there some sort of version control/signature/checksum to
> protect a kernel from loading an image that has been hacked to modify
> the privilege it was running with when the checkpoint was created?

No.  One day we expect there will be TPM-signing of checkpoint images,
but that will be up to userspace to properly exploit.  So if userspace
wants to enforce a certain flow control to prevent an unprivileged user
from modifying a checkpoint image (which of course it does), then it
should set up DAC and/or MAC to enforce that.

> > > Also, the use of 'error' as both a variable and a goto destination
> > > looks a little confusing.
> >
> > Ok will change.
> >
> > Did you see any problems with the way I authorize a task's resetting
> > of capabilities at sys_restart()?
> 
> [See above.] Is there a mailing list or something I can lurk on to get
> up to speed on what is being intended?

It mainly gets discussed on the containers list
(https://lists.linux-foundation.org/mailman/listinfo/containers) and
on freenode/#lxcontainers.

thanks,
-serge
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-01 13:35         ` Serge E. Hallyn
@ 2009-06-01 15:46           ` Andrew G. Morgan
  2009-06-01 22:18             ` Serge E. Hallyn
  0 siblings, 1 reply; 33+ messages in thread
From: Andrew G. Morgan @ 2009-06-01 15:46 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

On Mon, Jun 1, 2009 at 6:35 AM, Serge E. Hallyn <serue@us.ibm.com> wrote:
>> > I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
>> > suffice?

I can't speak for other subsystems, but it seems to me as if for the
capabilities, I'd want to create something like this in
include/linux/capabilities.h

typedef struct checkpoint_caps_s {
   /* what goes in here is the capability code's business */
} checkpoint_caps_t;

and then have a kernel-internal callable:

   caps_checkpoint_save(&caps_storage);
   caps_checkpoint_restore(&caps_storage);

kind if API. That way, the maintainer of the subsystem state gets to
make sure that the right things are saved/restored/sanity checked.
Breaking things out explicitly in "struct ckpt_hdr_cred" looks like it
will complicate the distribution of knowledge about what's going on
throughout the kernel.

Cheers

Andrew


>>
>> I guess I'm not really well up on what the plans are for checkpoint
>> images. Is there some sort of version control/signature/checksum to
>> protect a kernel from loading an image that has been hacked to modify
>> the privilege it was running with when the checkpoint was created?
>
> No.  One day we expect there will be TPM-signing of checkpoint images,
> but that will be up to userspace to properly exploit.  So if userspace
> wants to enforce a certain flow control to prevent an unprivileged user
> from modifying a checkpoint image (which of course it does), then it
> should set up DAC and/or MAC to enforce that.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-05-31 20:26   ` Andrew G. Morgan
  2009-05-31 20:56     ` Alexey Dobriyan
  2009-06-01  1:38     ` Serge E. Hallyn
@ 2009-06-01 15:49     ` Serge E. Hallyn
  2009-06-01 16:34       ` Oren Laadan
  2 siblings, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-06-01 15:49 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

Quoting Andrew G. Morgan (morgan@kernel.org):
> Serge,
> 
> I'm not sure I'm too happy with hard coding the 64-bitness of
> capability sets. It may well be a very long time before we increase
> their size, but couldn't you prepare for that with some reference to
> the prevailing magic numbers for the current ABI representation?

Is the appended, updated version of the patch ok?

> Also, the use of 'error' as both a variable and a goto destination
> looks a little confusing.

Ok, but that was in the original code.  I can send a separate
patch for mainline to change that, but it's not hurting
anything at the moment so not sure it's worth it?

-serge

From aa72f022fb5788ab46658e6eb94eaf18e8c6568a Mon Sep 17 00:00:00 2001
From: Serge E. Hallyn <serue@us.ibm.com>
Date: Mon, 11 May 2009 09:44:42 -0400
Subject: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns

An application checkpoint image will store capability sets
(and the bounding set) as __u64s.  Define checkpoint and
restart functions to translate between those and kernel_cap_t's.

Define a common function do_capset_tocred() which applies capability
set changes to a passed-in struct cred.

The restore function uses do_capset_tocred() to apply the restored
capabilities to the struct cred being crafted, subject to the
current task's (task executing sys_restart()) permissions.

Changelog:
	Jun 01: Add commented BUILD_BUG_ON() to point out that the
		current implementation depends on 64-bit capabilities.
		(Andrew Morgan and Alexey Dobriyan).
	May 28: add helpers to c/r securebits

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
 include/linux/capability.h |    7 +++
 kernel/capability.c        |  102 ++++++++++++++++++++++++++++++++++++++------
 security/commoncap.c       |   51 +++++++++++++++-------
 3 files changed, 130 insertions(+), 30 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index c302110..b3853ca 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -536,6 +536,13 @@ extern const kernel_cap_t __cap_empty_set;
 extern const kernel_cap_t __cap_full_set;
 extern const kernel_cap_t __cap_init_eff_set;
 
+extern void checkpoint_save_cap(__u64 *dest, kernel_cap_t src);
+struct cred;
+extern int checkpoint_restore_cap(__u64 e, __u64 i, __u64 p, __u64 x,
+				struct cred *cred);
+extern void checkpoint_save_securebits(unsigned *, unsigned);
+extern int checkpoint_restore_securebits(unsigned, struct cred *);
+
 /**
  * has_capability - Determine if a task has a superior capability available
  * @t: The task in question
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041..6bfd180 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -217,6 +217,45 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 	return ret;
 }
 
+static int do_capset_tocred(kernel_cap_t *effective, kernel_cap_t *inheritable,
+			kernel_cap_t *permitted, struct cred *new)
+{
+	int ret;
+
+	ret = security_capset(new, current_cred(),
+			      effective, inheritable, permitted);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * for checkpoint-restart, do we want to wait until end of restart?
+	 * not sure we care */
+	audit_log_capset(current->pid, new, current_cred());
+
+	return 0;
+}
+
+static int do_capset(kernel_cap_t *effective, kernel_cap_t *inheritable,
+			kernel_cap_t *permitted)
+{
+	struct cred *new;
+	int ret;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = do_capset_tocred(effective, inheritable, permitted, new);
+	if (ret < 0)
+		goto error;
+
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return ret;
+}
+
 /**
  * sys_capset - set capabilities for a process or (*) a group of processes
  * @header: pointer to struct that contains capability version and
@@ -240,7 +279,6 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
-	struct cred *new;
 	int ret;
 	pid_t pid;
 
@@ -271,22 +309,60 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 		i++;
 	}
 
-	new = prepare_creds();
-	if (!new)
-		return -ENOMEM;
+	return do_capset(&effective, &inheritable, &permitted);
 
-	ret = security_capset(new, current_cred(),
-			      &effective, &inheritable, &permitted);
-	if (ret < 0)
-		goto error;
+}
 
-	audit_log_capset(pid, new, current_cred());
+void checkpoint_save_cap(__u64 *dest, kernel_cap_t src)
+{
 
-	return commit_creds(new);
+	/*
+	 * If the following triggers, then capabilities no longer fit into
+	 * 64 bits.  The checkpoint image defined in include/linux/checkpoint_hdr.h
+	 * as well as the checkpoint_save_cap() and checkpoint_restore_cap()
+	 * helpers below need to be updated
+	 */
+	BUILD_BUG_ON(CAP_LAST_CAP >= 64);
 
-error:
-	abort_creds(new);
-	return ret;
+	*dest = src.cap[0] | (src.cap[1] << sizeof(__u32));
+}
+
+static void do_capbset_drop(struct cred *cred, int cap)
+{
+	cap_lower(cred->cap_bset, cap);
+}
+
+int checkpoint_restore_cap(__u64 newe, __u64 newi, __u64 newp, __u64 newx,
+			struct cred *cred)
+{
+	kernel_cap_t effective, inheritable, permitted, bset;
+	int may_dropbcap = capable(CAP_SETPCAP);
+	int ret, i;
+
+	effective.cap[0] = newe;
+	effective.cap[1] = (newe >> sizeof(__u32));
+	inheritable.cap[0] = newi;
+	inheritable.cap[1] = (newi >> sizeof(__u32));
+	permitted.cap[0] = newp;
+	permitted.cap[1] = (newp >> sizeof(__u32));
+	bset.cap[0] = newx;
+	bset.cap[1] = (newx >> sizeof(__u32));
+
+	ret = do_capset_tocred(&effective, &inheritable, &permitted, cred);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < CAP_LAST_CAP; i++) {
+		if (cap_raised(bset, i))
+			continue;
+		if (!cap_raised(current_cred()->cap_bset, i))
+			continue;
+		if (!may_dropbcap)
+			return -EPERM;
+		do_capbset_drop(cred, i);
+	}
+
+	return 0;
 }
 
 /**
diff --git a/security/commoncap.c b/security/commoncap.c
index beac025..8054a07 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -804,6 +804,28 @@ int cap_task_setnice (struct task_struct *p, int nice)
 }
 #endif
 
+int cap_set_securebits(struct cred *new, unsigned securebits)
+{
+	if ((((new->securebits & SECURE_ALL_LOCKS) >> 1)
+	     & (new->securebits ^ securebits))				/*[1]*/
+	    || ((new->securebits & SECURE_ALL_LOCKS & ~securebits))	/*[2]*/
+	    || (securebits & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
+	    || (cap_capable(current, current_cred(), CAP_SETPCAP,
+			    SECURITY_CAP_AUDIT) != 0)			/*[4]*/
+		/*
+		 * [1] no changing of bits that are locked
+		 * [2] no unlocking of locks
+		 * [3] no setting of unsupported bits
+		 * [4] doing anything requires privilege (go read about
+		 *     the "sendmail capabilities bug")
+		 */
+	    )
+		/* cannot change a locked bit */
+		return -EPERM;
+	new->securebits = securebits;
+	return 0;
+}
+
 /**
  * cap_task_prctl - Implement process control functions for this security module
  * @option: The process control function requested
@@ -861,24 +883,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 	 * capability-based-privilege environment.
 	 */
 	case PR_SET_SECUREBITS:
-		error = -EPERM;
-		if ((((new->securebits & SECURE_ALL_LOCKS) >> 1)
-		     & (new->securebits ^ arg2))			/*[1]*/
-		    || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
-		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
-		    || (cap_capable(current, current_cred(), CAP_SETPCAP,
-				    SECURITY_CAP_AUDIT) != 0)		/*[4]*/
-			/*
-			 * [1] no changing of bits that are locked
-			 * [2] no unlocking of locks
-			 * [3] no setting of unsupported bits
-			 * [4] doing anything requires privilege (go read about
-			 *     the "sendmail capabilities bug")
-			 */
-		    )
-			/* cannot change a locked bit */
+		error = cap_set_securebits(new, arg2);
+		if (error)
 			goto error;
-		new->securebits = arg2;
 		goto changed;
 
 	case PR_GET_SECUREBITS:
@@ -921,6 +928,16 @@ error:
 	return error;
 }
 
+void checkpoint_save_securebits(unsigned *b, unsigned cred_securebits)
+{
+	*b = cred_securebits;
+}
+
+int checkpoint_restore_securebits(unsigned b, struct cred *cred)
+{
+	return cap_set_securebits(cred, b);
+}
+
 /**
  * cap_syslog - Determine whether syslog function is permitted
  * @type: Function requested
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-01 15:49     ` Serge E. Hallyn
@ 2009-06-01 16:34       ` Oren Laadan
  0 siblings, 0 replies; 33+ messages in thread
From: Oren Laadan @ 2009-06-01 16:34 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Andrew G. Morgan, Linux Containers, Alexey Dobriyan,
	David Howells, linux-security-module



Serge E. Hallyn wrote:
> Quoting Andrew G. Morgan (morgan@kernel.org):
>> Serge,
>>
>> I'm not sure I'm too happy with hard coding the 64-bitness of
>> capability sets. It may well be a very long time before we increase
>> their size, but couldn't you prepare for that with some reference to
>> the prevailing magic numbers for the current ABI representation?
> 
> Is the appended, updated version of the patch ok?
> 
>> Also, the use of 'error' as both a variable and a goto destination
>> looks a little confusing.
> 
> Ok, but that was in the original code.  I can send a separate
> patch for mainline to change that, but it's not hurting
> anything at the moment so not sure it's worth it?
> 
> -serge
> 
> From aa72f022fb5788ab46658e6eb94eaf18e8c6568a Mon Sep 17 00:00:00 2001
> From: Serge E. Hallyn <serue@us.ibm.com>
> Date: Mon, 11 May 2009 09:44:42 -0400
> Subject: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
> 
> An application checkpoint image will store capability sets
> (and the bounding set) as __u64s.  Define checkpoint and
> restart functions to translate between those and kernel_cap_t's.
> 
> Define a common function do_capset_tocred() which applies capability
> set changes to a passed-in struct cred.
> 
> The restore function uses do_capset_tocred() to apply the restored
> capabilities to the struct cred being crafted, subject to the
> current task's (task executing sys_restart()) permissions.
> 
> Changelog:
> 	Jun 01: Add commented BUILD_BUG_ON() to point out that the
> 		current implementation depends on 64-bit capabilities.
> 		(Andrew Morgan and Alexey Dobriyan).
> 	May 28: add helpers to c/r securebits
> 
> Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
> ---
>  include/linux/capability.h |    7 +++
>  kernel/capability.c        |  102 ++++++++++++++++++++++++++++++++++++++------
>  security/commoncap.c       |   51 +++++++++++++++-------
>  3 files changed, 130 insertions(+), 30 deletions(-)
> 
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index c302110..b3853ca 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -536,6 +536,13 @@ extern const kernel_cap_t __cap_empty_set;
>  extern const kernel_cap_t __cap_full_set;
>  extern const kernel_cap_t __cap_init_eff_set;
>  
> +extern void checkpoint_save_cap(__u64 *dest, kernel_cap_t src);
> +struct cred;
> +extern int checkpoint_restore_cap(__u64 e, __u64 i, __u64 p, __u64 x,
> +				struct cred *cred);
> +extern void checkpoint_save_securebits(unsigned *, unsigned);
> +extern int checkpoint_restore_securebits(unsigned, struct cred *);

(nit) How about:
	checkpoint_capabilities()  or  checkpoint_cap_t()
	restore_capabilities()  or  restore_cap_t()
?  (also consistent with rest of the c/r code)

Oren.


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 8/9] user namespaces: debug refcounts
  2009-05-31 18:51     ` Alexey Dobriyan
@ 2009-06-01 19:02       ` Serge E. Hallyn
  0 siblings, 0 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-06-01 19:02 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Serge E. Hallyn, Oren Laadan, Linux Containers, Andrew Morgan,
	David Howells, linux-security-module

Quoting Alexey Dobriyan (adobriyan@gmail.com):
> On Fri, May 29, 2009 at 05:33:52PM -0500, Serge E. Hallyn wrote:
> > Create /proc/userns, which prints out all user namespaces.  It
> > prints the address of the user_ns itself, the uid and userns address
> > of the user who created it, and the reference count.
> 
> > +static int proc_userns_show(struct seq_file *m, void *v)
> > +{
> > +	struct user_namespace *ns = v;
> > +	seq_printf(m, "userns %p creator (uid %d ns %p) count %d\n",
> > +		(void *)ns, ns->creator->uid, (void *) ns->creator->user_ns,
> > +		atomic_read(&ns->kref.refcount));
> > +	return 0;
> > +}
> 
> Kernel shouldn't expose location of kernel objects to userspace.

This one was just so ppl could verify things were working as
promised.  I won't be sending it again.

(OTOH, noone noticed i wasn't actually calling the securebits
c/r helpers...  I'll be sending a new set of patches fixing that,
not including this patch, and hopefully addressing Andrew's and
Oren's latest replies.

thanks,
-serge

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-01 15:46           ` Andrew G. Morgan
@ 2009-06-01 22:18             ` Serge E. Hallyn
  2009-06-02 13:49               ` Andrew G. Morgan
  0 siblings, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-06-01 22:18 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

Quoting Andrew G. Morgan (morgan@kernel.org):
> On Mon, Jun 1, 2009 at 6:35 AM, Serge E. Hallyn <serue@us.ibm.com> wrote:
> >> > I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
> >> > suffice?
> 
> I can't speak for other subsystems, but it seems to me as if for the
> capabilities, I'd want to create something like this in
> include/linux/capabilities.h
> 
> typedef struct checkpoint_caps_s {
>    /* what goes in here is the capability code's business */
> } checkpoint_caps_t;

Sigh - Did a patch this way, but the problem is userspace needs to be
able to parse the checkpoint image, so it needs to know what this struct
looks like.  So if I put it the struct definition
include/linux/capability.h, I run into a whole new set of problems
trying to compile a userspace program to do a sys_restart().

So I went part-way to what you suggested in the patchset I'm about to
send out (please see patch 6/8).  I think the caps code does look
nicer in this new version.

thanks,
-serge

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-01 22:18             ` Serge E. Hallyn
@ 2009-06-02 13:49               ` Andrew G. Morgan
  2009-06-02 14:23                 ` Serge E. Hallyn
  0 siblings, 1 reply; 33+ messages in thread
From: Andrew G. Morgan @ 2009-06-02 13:49 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

On Mon, Jun 1, 2009 at 3:18 PM, Serge E. Hallyn <serue@us.ibm.com> wrote:
> Quoting Andrew G. Morgan (morgan@kernel.org):
>> On Mon, Jun 1, 2009 at 6:35 AM, Serge E. Hallyn <serue@us.ibm.com> wrote:
>> >> > I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
>> >> > suffice?
>>
>> I can't speak for other subsystems, but it seems to me as if for the
>> capabilities, I'd want to create something like this in
>> include/linux/capabilities.h
>>
>> typedef struct checkpoint_caps_s {
>>    /* what goes in here is the capability code's business */
>> } checkpoint_caps_t;
>
> Sigh - Did a patch this way, but the problem is userspace needs to be
> able to parse the checkpoint image, so it needs to know what this struct
> looks like.  So if I put it the struct definition
> include/linux/capability.h, I run into a whole new set of problems
> trying to compile a userspace program to do a sys_restart().

Does the user space app need to be able to modify the data in some
way? It seems like embedding a length with the structure or something
might simplify such a user space dependency.

> So I went part-way to what you suggested in the patchset I'm about to
> send out (please see patch 6/8).  I think the caps code does look
> nicer in this new version.

Better, but I remain concerned that the code looks hard to maintain
when structured this way.

Cheers

Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-02 13:49               ` Andrew G. Morgan
@ 2009-06-02 14:23                 ` Serge E. Hallyn
  2009-06-02 15:26                   ` Oren Laadan
  2009-06-02 15:49                   ` Andrew G. Morgan
  0 siblings, 2 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-06-02 14:23 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

Quoting Andrew G. Morgan (morgan@kernel.org):
> On Mon, Jun 1, 2009 at 3:18 PM, Serge E. Hallyn <serue@us.ibm.com> wrote:
> > Quoting Andrew G. Morgan (morgan@kernel.org):
> >> On Mon, Jun 1, 2009 at 6:35 AM, Serge E. Hallyn <serue@us.ibm.com> wrote:
> >> >> > I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
> >> >> > suffice?
> >>
> >> I can't speak for other subsystems, but it seems to me as if for the
> >> capabilities, I'd want to create something like this in
> >> include/linux/capabilities.h
> >>
> >> typedef struct checkpoint_caps_s {
> >>    /* what goes in here is the capability code's business */
> >> } checkpoint_caps_t;
> >
> > Sigh - Did a patch this way, but the problem is userspace needs to be
> > able to parse the checkpoint image, so it needs to know what this struct
> > looks like.  So if I put it the struct definition
> > include/linux/capability.h, I run into a whole new set of problems
> > trying to compile a userspace program to do a sys_restart().
> 
> Does the user space app need to be able to modify the data in some
> way? It seems like embedding a length with the structure or something
> might simplify such a user space dependency.

Hmm, I suppose I could do something like define struct ckpt_capabilities
in capabilities.h, then in checkpoint_hdr.h do

struct ckpt_capabilities;
struct ckpt_cap_dummy {
	__u64 dummies[9];
};

struct ckpt_hdr_cred {
	...
	union {
		struct ckpt_capabilities r;
		struct ckpt_cap_dummy d;
	} caps;
};

with a BUILD_BUG_ON to ensure that sizeof(r)==sizeof(d).  Ugly, but
should suit everyone?

> > So I went part-way to what you suggested in the patchset I'm about to
> > send out (please see patch 6/8).  I think the caps code does look
> > nicer in this new version.
> 
> Better, but I remain concerned that the code looks hard to maintain
> when structured this way.

Why exactly?  Just having the struct defined in checkpoint_hdr.h?  Or
is there something else I'm unwittingly doing?

thanks,
-serge
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-02 14:23                 ` Serge E. Hallyn
@ 2009-06-02 15:26                   ` Oren Laadan
  2009-06-02 15:49                   ` Andrew G. Morgan
  1 sibling, 0 replies; 33+ messages in thread
From: Oren Laadan @ 2009-06-02 15:26 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Andrew G. Morgan, Linux Containers, Alexey Dobriyan,
	David Howells, linux-security-module



Serge E. Hallyn wrote:
> Quoting Andrew G. Morgan (morgan@kernel.org):
>> On Mon, Jun 1, 2009 at 3:18 PM, Serge E. Hallyn <serue@us.ibm.com> wrote:
>>> Quoting Andrew G. Morgan (morgan@kernel.org):
>>>> On Mon, Jun 1, 2009 at 6:35 AM, Serge E. Hallyn <serue@us.ibm.com> wrote:
>>>>>>> I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
>>>>>>> suffice?
>>>> I can't speak for other subsystems, but it seems to me as if for the
>>>> capabilities, I'd want to create something like this in
>>>> include/linux/capabilities.h
>>>>
>>>> typedef struct checkpoint_caps_s {
>>>>    /* what goes in here is the capability code's business */
>>>> } checkpoint_caps_t;
>>> Sigh - Did a patch this way, but the problem is userspace needs to be
>>> able to parse the checkpoint image, so it needs to know what this struct
>>> looks like.  So if I put it the struct definition
>>> include/linux/capability.h, I run into a whole new set of problems
>>> trying to compile a userspace program to do a sys_restart().
>> Does the user space app need to be able to modify the data in some
>> way? It seems like embedding a length with the structure or something
>> might simplify such a user space dependency.

Userspace needs to be able not only to parse the checkpoint image
but also to understand the contents:

1) For analysis and debugging tool(s) that will give information
about a checkpoint image

2) For conversion tool(s) that will convert a checkpoint image from
an older to a newer kernel. The idea is to avoid accumulating endless
compatibility code in the kernel.

So yes, these userspace apps need to be able to look and perhaps
modify the data in some way.

However, this raises an interesting question of _how_ to do this even
in userspace. Suppose we want to convert from version X to version Y,
where struct ckpt_blah changed between the version.

What would be an effective way to allow userspace to include both the
old and the new ckpt_blah and have them named differently ?

Oren.

> 
> Hmm, I suppose I could do something like define struct ckpt_capabilities
> in capabilities.h, then in checkpoint_hdr.h do
> 
> struct ckpt_capabilities;
> struct ckpt_cap_dummy {
> 	__u64 dummies[9];
> };
> 
> struct ckpt_hdr_cred {
> 	...
> 	union {
> 		struct ckpt_capabilities r;
> 		struct ckpt_cap_dummy d;
> 	} caps;
> };
> 
> with a BUILD_BUG_ON to ensure that sizeof(r)==sizeof(d).  Ugly, but
> should suit everyone?
> 
>>> So I went part-way to what you suggested in the patchset I'm about to
>>> send out (please see patch 6/8).  I think the caps code does look
>>> nicer in this new version.
>> Better, but I remain concerned that the code looks hard to maintain
>> when structured this way.
> 
> Why exactly?  Just having the struct defined in checkpoint_hdr.h?  Or
> is there something else I'm unwittingly doing?
> 
> thanks,
> -serge
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-02 14:23                 ` Serge E. Hallyn
  2009-06-02 15:26                   ` Oren Laadan
@ 2009-06-02 15:49                   ` Andrew G. Morgan
  2009-06-02 17:15                     ` Serge E. Hallyn
  2009-06-03  0:05                     ` Oren Laadan
  1 sibling, 2 replies; 33+ messages in thread
From: Andrew G. Morgan @ 2009-06-02 15:49 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

[I'm sorry if I'm flogging a dead horse. I'm actually really excited
by this functionality! :-) ]

Comments inline...

On Tue, Jun 2, 2009 at 7:23 AM, Serge E. Hallyn <serue@us.ibm.com> wrote:
> Quoting Andrew G. Morgan (morgan@kernel.org):
>> On Mon, Jun 1, 2009 at 3:18 PM, Serge E. Hallyn <serue@us.ibm.com> wrote:
>> > Quoting Andrew G. Morgan (morgan@kernel.org):
>> >> On Mon, Jun 1, 2009 at 6:35 AM, Serge E. Hallyn <serue@us.ibm.com> wrote:
>> >> >> > I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
>> >> >> > suffice?
>> >>
>> >> I can't speak for other subsystems, but it seems to me as if for the
>> >> capabilities, I'd want to create something like this in
>> >> include/linux/capabilities.h
>> >>
>> >> typedef struct checkpoint_caps_s {
>> >>    /* what goes in here is the capability code's business */
>> >> } checkpoint_caps_t;
>> >
>> > Sigh - Did a patch this way, but the problem is userspace needs to be
>> > able to parse the checkpoint image, so it needs to know what this struct
>> > looks like.  So if I put it the struct definition
>> > include/linux/capability.h, I run into a whole new set of problems
>> > trying to compile a userspace program to do a sys_restart().
>>
>> Does the user space app need to be able to modify the data in some
>> way? It seems like embedding a length with the structure or something
>> might simplify such a user space dependency.
>
> Hmm, I suppose I could do something like define struct ckpt_capabilities
> in capabilities.h, then in checkpoint_hdr.h do
>
> struct ckpt_capabilities;
> struct ckpt_cap_dummy {
>        __u64 dummies[9];
> };
>
> struct ckpt_hdr_cred {
>        ...
>        union {
>                struct ckpt_capabilities r;
>                struct ckpt_cap_dummy d;
>        } caps;
> };

Yes, something like this, but perhaps:

    struct ckpt_caps_part_s {
       int length; /* = sizeof(struct ckpt_capabilities) */
       cap_ckpt_t data;
    } caps;

and then the generic checkpoint code would do:

   #include <linux/capabilities.h>
   caps.ckpt_capabilities_length = cap_checkpoint_save(&caps.data);
   [...]
   cap_checkpoint_restore(caps.length, &caps.caps.data);

and the capability code could opaquely deal with the details.

The reason I think this is more maintainable is that its clear (to the
capability code) what is being check-pointed and, conversely, for the
checkpoint code it is abstracted with the responsibility for detailed
state decisions elsewhere in the kernel.

I suspect I don't understand the user space code issue sufficiently.
But if, for some reason, the user space source code is unable to
include the definition of cap_ckpt_t, it should be clear that parsing
this type of data structure, given that offsets are embedded in it,
should be straightforward.

> with a BUILD_BUG_ON to ensure that sizeof(r)==sizeof(d).  Ugly, but
> should suit everyone?

could the checkpointing code check the return value for
cap_checkpoint_restore() and fail the restore if it returned an error?

Cheers

Andrew

>
>> > So I went part-way to what you suggested in the patchset I'm about to
>> > send out (please see patch 6/8).  I think the caps code does look
>> > nicer in this new version.
>>
>> Better, but I remain concerned that the code looks hard to maintain
>> when structured this way.
>
> Why exactly?  Just having the struct defined in checkpoint_hdr.h?  Or
> is there something else I'm unwittingly doing?
>
> thanks,
> -serge
> --
> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-02 15:49                   ` Andrew G. Morgan
@ 2009-06-02 17:15                     ` Serge E. Hallyn
  2009-06-03  0:05                     ` Oren Laadan
  1 sibling, 0 replies; 33+ messages in thread
From: Serge E. Hallyn @ 2009-06-02 17:15 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

Quoting Andrew G. Morgan (morgan@kernel.org):
> > with a BUILD_BUG_ON to ensure that sizeof(r)==sizeof(d).  Ugly, but
> > should suit everyone?
> 
> could the checkpointing code check the return value for
> cap_checkpoint_restore() and fail the restore if it returned an error?

Sorry things seem mixed up here.  Let's stick to the naming Oren
suggested (and i used in the latest set):

	checkpoint_capabilities() saves the credential's caps to the
		checkpoint image
	restore_capabilities() takes state from checkpoint file and
		sets a credential's caps accordingly if allowed.

restore_capabilities() returns an error now on failure (-EPERM or
-ENOMEM).  We might talk about it returning -EINVAL if capability
sets aren't valid, but then the kernel currently allows invalid
capabilities to be set anyway (hence CapPrm for root tasks is
generally 0xffffffffffffffff, not just filled with valid bits).

checkpoint_capabilities() doesn't need to return an error - if it
is called at all, it is called with enough space for the struct
it expects to write out.

So I don't understand what it is you're asking for above?

thanks,
-serge
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-02 15:49                   ` Andrew G. Morgan
  2009-06-02 17:15                     ` Serge E. Hallyn
@ 2009-06-03  0:05                     ` Oren Laadan
       [not found]                       ` <4A25BE4F.6000603-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  1 sibling, 1 reply; 33+ messages in thread
From: Oren Laadan @ 2009-06-03  0:05 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Serge E. Hallyn, Linux Containers, Alexey Dobriyan,
	David Howells, linux-security-module



Andrew G. Morgan wrote:
> [I'm sorry if I'm flogging a dead horse. I'm actually really excited
> by this functionality! :-) ]
> 
> Comments inline...
> 
> On Tue, Jun 2, 2009 at 7:23 AM, Serge E. Hallyn <serue@us.ibm.com> wrote:
>> Quoting Andrew G. Morgan (morgan@kernel.org):
>>> On Mon, Jun 1, 2009 at 3:18 PM, Serge E. Hallyn <serue@us.ibm.com> wrote:
>>>> Quoting Andrew G. Morgan (morgan@kernel.org):
>>>>> On Mon, Jun 1, 2009 at 6:35 AM, Serge E. Hallyn <serue@us.ibm.com> wrote:
>>>>>>>> I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
>>>>>>>> suffice?
>>>>> I can't speak for other subsystems, but it seems to me as if for the
>>>>> capabilities, I'd want to create something like this in
>>>>> include/linux/capabilities.h
>>>>>
>>>>> typedef struct checkpoint_caps_s {
>>>>>    /* what goes in here is the capability code's business */
>>>>> } checkpoint_caps_t;
>>>> Sigh - Did a patch this way, but the problem is userspace needs to be
>>>> able to parse the checkpoint image, so it needs to know what this struct
>>>> looks like.  So if I put it the struct definition
>>>> include/linux/capability.h, I run into a whole new set of problems
>>>> trying to compile a userspace program to do a sys_restart().
>>> Does the user space app need to be able to modify the data in some
>>> way? It seems like embedding a length with the structure or something
>>> might simplify such a user space dependency.
>> Hmm, I suppose I could do something like define struct ckpt_capabilities
>> in capabilities.h, then in checkpoint_hdr.h do
>>
>> struct ckpt_capabilities;
>> struct ckpt_cap_dummy {
>>        __u64 dummies[9];
>> };
>>
>> struct ckpt_hdr_cred {
>>        ...
>>        union {
>>                struct ckpt_capabilities r;
>>                struct ckpt_cap_dummy d;
>>        } caps;
>> };
> 
> Yes, something like this, but perhaps:
> 
>     struct ckpt_caps_part_s {
>        int length; /* = sizeof(struct ckpt_capabilities) */
>        cap_ckpt_t data;
>     } caps;
> 
> and then the generic checkpoint code would do:
> 
>    #include <linux/capabilities.h>
>    caps.ckpt_capabilities_length = cap_checkpoint_save(&caps.data);
>    [...]
>    cap_checkpoint_restore(caps.length, &caps.caps.data);
> 
> and the capability code could opaquely deal with the details.
> 
> The reason I think this is more maintainable is that its clear (to the
> capability code) what is being check-pointed and, conversely, for the
> checkpoint code it is abstracted with the responsibility for detailed
> state decisions elsewhere in the kernel.
> 
> I suspect I don't understand the user space code issue sufficiently.
> But if, for some reason, the user space source code is unable to
> include the definition of cap_ckpt_t, it should be clear that parsing
> this type of data structure, given that offsets are embedded in it,
> should be straightforward.
> 

As I said here:
https://lists.linux-foundation.org/pipermail/containers/2009-June/018288.html

Userspace needs to understand what's in the image to be able to
provide debug/info about the image, and to be able to convert
images from older kernel version to newer (or even vice versa if
one insists).

Oren.

>> with a BUILD_BUG_ON to ensure that sizeof(r)==sizeof(d).  Ugly, but
>> should suit everyone?
> 
> could the checkpointing code check the return value for
> cap_checkpoint_restore() and fail the restore if it returned an error?
> 
> Cheers
> 
> Andrew
> 
>>>> So I went part-way to what you suggested in the patchset I'm about to
>>>> send out (please see patch 6/8).  I think the caps code does look
>>>> nicer in this new version.
>>> Better, but I remain concerned that the code looks hard to maintain
>>> when structured this way.
>> Why exactly?  Just having the struct defined in checkpoint_hdr.h?  Or
>> is there something else I'm unwittingly doing?
>>
>> thanks,
>> -serge
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
       [not found]                       ` <4A25BE4F.6000603-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-06-03 15:03                         ` Andrew G. Morgan
  2009-06-03 16:45                           ` Serge E. Hallyn
  0 siblings, 1 reply; 33+ messages in thread
From: Andrew G. Morgan @ 2009-06-03 15:03 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Linux Containers, David Howells, Alexey Dobriyan,
	linux-security-module-u79uwXL29TY76Z2rM5mHXA

You guys are well on the road to having something functional. I do not
want to get in the way of that.

What I'm trying to question is the way the code is abstracted. My
sense is that its not very friendly to the subsystems being
checkpointed. I could offer an alternative patch for the capabilities
code, but if it breaks some invisible constraint I'd rather you tell
me about it first.

There are three sets of code in these changes:

  1. the capabilities code (but I think I'm arguing about that because
it is an example I'm familiar with and this probably applies to other
subsystems in the kernel)
  2. the glue code in the kernel that implements sys_restore() etc.
  3. the user space code

What seems to have happened with these patches is that the glue code
and the user space code have knowledge of each other and things are
written to make an API/ABI that they can agree on. (Part of this is
evidenced by the u64 reference to capabilities and non-use of the more
natural kernel_cap_t data that the kernel itself uses.)

It seems to me that a more natural abstraction is that 1 and 3 know
about each other (actually 3 knows about how 1 chooses to define
things), and 2 is a generic data packager interface.

This should be functionally equivalent, but my belief is an
implementation based on the current abstraction will be broken more
often because some subsystem (eg. 1) evolves its critical state and
the corresponding (2+3)'s relationship is too confusing/constraining
for the kernel subsystem developer to keep in sync (someone else's
problem). And subtle problems of ignored state in checkpoints can be
really hard to debug!

Bottom line. Don't wait for me to Ack this change. I intend to lurk
and if I see a way to "clean it up", I'll offer a patch, which you can
of course reject... But don't wait on me.

Cheers

Andrew

On Tue, Jun 2, 2009 at 5:05 PM, Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org> wrote:
>
>
> Andrew G. Morgan wrote:
>> [I'm sorry if I'm flogging a dead horse. I'm actually really excited
>> by this functionality! :-) ]
>>
>> Comments inline...
>>
>> On Tue, Jun 2, 2009 at 7:23 AM, Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> wrote:
>>> Quoting Andrew G. Morgan (morgan-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org):
>>>> On Mon, Jun 1, 2009 at 3:18 PM, Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> wrote:
>>>>> Quoting Andrew G. Morgan (morgan-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org):
>>>>>> On Mon, Jun 1, 2009 at 6:35 AM, Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> wrote:
>>>>>>>>> I'll put in a commented BUILD_BUG_ON like Alexey suggests - does that
>>>>>>>>> suffice?
>>>>>> I can't speak for other subsystems, but it seems to me as if for the
>>>>>> capabilities, I'd want to create something like this in
>>>>>> include/linux/capabilities.h
>>>>>>
>>>>>> typedef struct checkpoint_caps_s {
>>>>>>    /* what goes in here is the capability code's business */
>>>>>> } checkpoint_caps_t;
>>>>> Sigh - Did a patch this way, but the problem is userspace needs to be
>>>>> able to parse the checkpoint image, so it needs to know what this struct
>>>>> looks like.  So if I put it the struct definition
>>>>> include/linux/capability.h, I run into a whole new set of problems
>>>>> trying to compile a userspace program to do a sys_restart().
>>>> Does the user space app need to be able to modify the data in some
>>>> way? It seems like embedding a length with the structure or something
>>>> might simplify such a user space dependency.
>>> Hmm, I suppose I could do something like define struct ckpt_capabilities
>>> in capabilities.h, then in checkpoint_hdr.h do
>>>
>>> struct ckpt_capabilities;
>>> struct ckpt_cap_dummy {
>>>        __u64 dummies[9];
>>> };
>>>
>>> struct ckpt_hdr_cred {
>>>        ...
>>>        union {
>>>                struct ckpt_capabilities r;
>>>                struct ckpt_cap_dummy d;
>>>        } caps;
>>> };
>>
>> Yes, something like this, but perhaps:
>>
>>     struct ckpt_caps_part_s {
>>        int length; /* = sizeof(struct ckpt_capabilities) */
>>        cap_ckpt_t data;
>>     } caps;
>>
>> and then the generic checkpoint code would do:
>>
>>    #include <linux/capabilities.h>
>>    caps.ckpt_capabilities_length = cap_checkpoint_save(&caps.data);
>>    [...]
>>    cap_checkpoint_restore(caps.length, &caps.caps.data);
>>
>> and the capability code could opaquely deal with the details.
>>
>> The reason I think this is more maintainable is that its clear (to the
>> capability code) what is being check-pointed and, conversely, for the
>> checkpoint code it is abstracted with the responsibility for detailed
>> state decisions elsewhere in the kernel.
>>
>> I suspect I don't understand the user space code issue sufficiently.
>> But if, for some reason, the user space source code is unable to
>> include the definition of cap_ckpt_t, it should be clear that parsing
>> this type of data structure, given that offsets are embedded in it,
>> should be straightforward.
>>
>
> As I said here:
> https://lists.linux-foundation.org/pipermail/containers/2009-June/018288.html
>
> Userspace needs to understand what's in the image to be able to
> provide debug/info about the image, and to be able to convert
> images from older kernel version to newer (or even vice versa if
> one insists).
>
> Oren.
>
>>> with a BUILD_BUG_ON to ensure that sizeof(r)==sizeof(d).  Ugly, but
>>> should suit everyone?
>>
>> could the checkpointing code check the return value for
>> cap_checkpoint_restore() and fail the restore if it returned an error?
>>
>> Cheers
>>
>> Andrew
>>
>>>>> So I went part-way to what you suggested in the patchset I'm about to
>>>>> send out (please see patch 6/8).  I think the caps code does look
>>>>> nicer in this new version.
>>>> Better, but I remain concerned that the code looks hard to maintain
>>>> when structured this way.
>>> Why exactly?  Just having the struct defined in checkpoint_hdr.h?  Or
>>> is there something else I'm unwittingly doing?
>>>
>>> thanks,
>>> -serge
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
>>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-03 15:03                         ` Andrew G. Morgan
@ 2009-06-03 16:45                           ` Serge E. Hallyn
  2009-06-04 14:13                             ` Andrew G. Morgan
  0 siblings, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-06-03 16:45 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

Quoting Andrew G. Morgan (morgan@kernel.org):
> You guys are well on the road to having something functional. I do not
> want to get in the way of that.
> 
> What I'm trying to question is the way the code is abstracted. My
> sense is that its not very friendly to the subsystems being
> checkpointed. I could offer an alternative patch for the capabilities
> code, but if it breaks some invisible constraint I'd rather you tell
> me about it first.
> 
> There are three sets of code in these changes:
> 
>   1. the capabilities code (but I think I'm arguing about that because
> it is an example I'm familiar with and this probably applies to other
> subsystems in the kernel)
>   2. the glue code in the kernel that implements sys_restore() etc.
>   3. the user space code
> 
> What seems to have happened with these patches is that the glue code
> and the user space code have knowledge of each other and things are
> written to make an API/ABI that they can agree on. (Part of this is
> evidenced by the u64 reference to capabilities and non-use of the more
> natural kernel_cap_t data that the kernel itself uses.)
> 
> It seems to me that a more natural abstraction is that 1 and 3 know
> about each other (actually 3 knows about how 1 chooses to define
> things), and 2 is a generic data packager interface.

One reason (3) and (2) need to know about each is other is bc
userspace needs to read the checkpoint image enough to read
the task states, clone off suitable children, and then each
child calls sys_restart().  So to this end, there are certainly
details which userspace doesn't need to be able to parse.

The other reason is that userspace will expect to be able to
make changes to the checkpoint file.  For instance, if restarting
a process from one kernel version with 34 capabilities defined,
on a kernel version with 35 defined, then userspace may want to
fill in an extra cap.

But I strongly agree with your concern that subsystems will get
out of sync with the checkpoint code.

If we just move the struct ckpt_capabilities definition into
include/linux/capabilities.h, does that suffice?  The c/r
userspace could then do something like

cat > ckpthdr.sed << EOF
/^[^ \t]*struct ckpt[^\n]*{/,/}/p
EOF
cat /usr/include/linux/*.h /usr/include/asm/*.h | sed -n -f ckpthdr.sed \
	> checkpoint_headers.h

Now you mention using kernel_cap_t's...  we can go partway
down that road, but the inherent problem is serializing the
relevant data so it can be streamed to some file.  So I
think it's better if the subsystems are required to specify
a useful format (like struct ckpt_capabilities) and define
the checkpoint/restart helpers to serialize data into the
struct.  We can try and get cute with dual mirrored
struct defs, one which is defined in terms the caps code
can understand, and one defined in more arch-independent
terms (which is why we need __u64s and packed, for instance).
But that seems more fragile than having clear and simple
requirements for the $SUBYSTEM_checkpoint() and $SUBSYSTEM_restart()
helpers.

(I'll wait to hear whether you think I'm on the right track
before reworking my patch)

> This should be functionally equivalent, but my belief is an
> implementation based on the current abstraction will be broken more
> often because some subsystem (eg. 1) evolves its critical state and
> the corresponding (2+3)'s relationship is too confusing/constraining
> for the kernel subsystem developer to keep in sync (someone else's
> problem). And subtle problems of ignored state in checkpoints can be
> really hard to debug!
> 
> Bottom line. Don't wait for me to Ack this change. I intend to lurk
> and if I see a way to "clean it up", I'll offer a patch, which you can
> of course reject... But don't wait on me.

Thanks, it's much appreciated.  But please don't hold back,
as your points are imo valid, and we need to work out how to
address them.

-serge

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-03 16:45                           ` Serge E. Hallyn
@ 2009-06-04 14:13                             ` Andrew G. Morgan
  2009-06-05 19:41                               ` Serge E. Hallyn
  0 siblings, 1 reply; 33+ messages in thread
From: Andrew G. Morgan @ 2009-06-04 14:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

On Wed, Jun 3, 2009 at 9:45 AM, Serge E. Hallyn <serue@us.ibm.com> wrote:
> Quoting Andrew G. Morgan (morgan@kernel.org):
>> You guys are well on the road to having something functional. I do not
>> want to get in the way of that.
>>
>> What I'm trying to question is the way the code is abstracted. My
>> sense is that its not very friendly to the subsystems being
>> checkpointed. I could offer an alternative patch for the capabilities
>> code, but if it breaks some invisible constraint I'd rather you tell
>> me about it first.
>>
>> There are three sets of code in these changes:
>>
>>   1. the capabilities code (but I think I'm arguing about that because
>> it is an example I'm familiar with and this probably applies to other
>> subsystems in the kernel)
>>   2. the glue code in the kernel that implements sys_restore() etc.
>>   3. the user space code
>>
>> What seems to have happened with these patches is that the glue code
>> and the user space code have knowledge of each other and things are
>> written to make an API/ABI that they can agree on. (Part of this is
>> evidenced by the u64 reference to capabilities and non-use of the more
>> natural kernel_cap_t data that the kernel itself uses.)
>>
>> It seems to me that a more natural abstraction is that 1 and 3 know
>> about each other (actually 3 knows about how 1 chooses to define
>> things), and 2 is a generic data packager interface.
>
> One reason (3) and (2) need to know about each is other is bc
> userspace needs to read the checkpoint image enough to read
> the task states, clone off suitable children, and then each
> child calls sys_restart().  So to this end, there are certainly
> details which userspace doesn't need to be able to parse.
>
> The other reason is that userspace will expect to be able to
> make changes to the checkpoint file.  For instance, if restarting
> a process from one kernel version with 34 capabilities defined,
> on a kernel version with 35 defined, then userspace may want to
> fill in an extra cap.
>
> But I strongly agree with your concern that subsystems will get
> out of sync with the checkpoint code.
>
> If we just move the struct ckpt_capabilities definition into
> include/linux/capabilities.h, does that suffice?  The c/r
> userspace could then do something like
>
> cat > ckpthdr.sed << EOF
> /^[^ \t]*struct ckpt[^\n]*{/,/}/p
> EOF
> cat /usr/include/linux/*.h /usr/include/asm/*.h | sed -n -f ckpthdr.sed \
>        > checkpoint_headers.h
>
> Now you mention using kernel_cap_t's...  we can go partway
> down that road, but the inherent problem is serializing the
> relevant data so it can be streamed to some file.  So I
> think it's better if the subsystems are required to specify
> a useful format (like struct ckpt_capabilities) and define
> the checkpoint/restart helpers to serialize data into the
> struct.  We can try and get cute with dual mirrored
> struct defs, one which is defined in terms the caps code
> can understand, and one defined in more arch-independent
> terms (which is why we need __u64s and packed, for instance).
> But that seems more fragile than having clear and simple
> requirements for the $SUBYSTEM_checkpoint() and $SUBSYSTEM_restart()
> helpers.

I like this $SUBSYSTEM_checkpoint() etc. thing.

I like the ckpthdr.sed thing. I think a similar rule could be used to
generate the calls to the list of $SUBSYSTEM_checkpoint() functions.

For serialization, could a kernel "gcc -E checkpoint-headers.h >
this-kernel-checkpoint-file.h" build rule be enough?

Cheers

Andrew

>
> (I'll wait to hear whether you think I'm on the right track
> before reworking my patch)
>
>> This should be functionally equivalent, but my belief is an
>> implementation based on the current abstraction will be broken more
>> often because some subsystem (eg. 1) evolves its critical state and
>> the corresponding (2+3)'s relationship is too confusing/constraining
>> for the kernel subsystem developer to keep in sync (someone else's
>> problem). And subtle problems of ignored state in checkpoints can be
>> really hard to debug!
>>
>> Bottom line. Don't wait for me to Ack this change. I intend to lurk
>> and if I see a way to "clean it up", I'll offer a patch, which you can
>> of course reject... But don't wait on me.
>
> Thanks, it's much appreciated.  But please don't hold back,
> as your points are imo valid, and we need to work out how to
> address them.
>
> -serge
> --
> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-04 14:13                             ` Andrew G. Morgan
@ 2009-06-05 19:41                               ` Serge E. Hallyn
  2009-06-06 15:02                                 ` Andrew G. Morgan
  0 siblings, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-06-05 19:41 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

Quoting Andrew G. Morgan (morgan@kernel.org):
...
> > Now you mention using kernel_cap_t's...  we can go partway
> > down that road, but the inherent problem is serializing the
> > relevant data so it can be streamed to some file.  So I
> > think it's better if the subsystems are required to specify
> > a useful format (like struct ckpt_capabilities) and define
> > the checkpoint/restart helpers to serialize data into the
> > struct.  We can try and get cute with dual mirrored
> > struct defs, one which is defined in terms the caps code
> > can understand, and one defined in more arch-independent
> > terms (which is why we need __u64s and packed, for instance).
> > But that seems more fragile than having clear and simple
> > requirements for the $SUBYSTEM_checkpoint() and $SUBSYSTEM_restart()
> > helpers.
> 
> I like this $SUBSYSTEM_checkpoint() etc. thing.
> 
> I like the ckpthdr.sed thing. I think a similar rule could be used to
> generate the calls to the list of $SUBSYSTEM_checkpoint() functions.

Sorry, I don't follow.  Could you say a bit more about this?

> For serialization, could a kernel "gcc -E checkpoint-headers.h >
> this-kernel-checkpoint-file.h" build rule be enough?

Again, I don't follow.  Do you mean to turn something like kernel_cap_t
into something like struct ckpt_capabilities?

thanks,
-serge
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-05 19:41                               ` Serge E. Hallyn
@ 2009-06-06 15:02                                 ` Andrew G. Morgan
  2009-06-15  9:58                                   ` Alexey Dobriyan
  0 siblings, 1 reply; 33+ messages in thread
From: Andrew G. Morgan @ 2009-06-06 15:02 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Oren Laadan, Linux Containers, Alexey Dobriyan, David Howells,
	linux-security-module

On Fri, Jun 5, 2009 at 12:41 PM, Serge E. Hallyn <serue@us.ibm.com> wrote:
> Quoting Andrew G. Morgan (morgan@kernel.org):
> ...
>> > Now you mention using kernel_cap_t's...  we can go partway
>> > down that road, but the inherent problem is serializing the
>> > relevant data so it can be streamed to some file.  So I
>> > think it's better if the subsystems are required to specify
>> > a useful format (like struct ckpt_capabilities) and define
>> > the checkpoint/restart helpers to serialize data into the
>> > struct.  We can try and get cute with dual mirrored
>> > struct defs, one which is defined in terms the caps code
>> > can understand, and one defined in more arch-independent
>> > terms (which is why we need __u64s and packed, for instance).
>> > But that seems more fragile than having clear and simple
>> > requirements for the $SUBYSTEM_checkpoint() and $SUBSYSTEM_restart()
>> > helpers.
>>
>> I like this $SUBSYSTEM_checkpoint() etc. thing.
>>
>> I like the ckpthdr.sed thing. I think a similar rule could be used to
>> generate the calls to the list of $SUBSYSTEM_checkpoint() functions.
>
> Sorry, I don't follow.  Could you say a bit more about this?

See sketch below.

>
>> For serialization, could a kernel "gcc -E checkpoint-headers.h >
>> this-kernel-checkpoint-file.h" build rule be enough?
>
> Again, I don't follow.  Do you mean to turn something like kernel_cap_t
> into something like struct ckpt_capabilities?

Yes, but without manual effort.

To return the the 1,2,3 breakdown. I think it should be possible for 1
to provide code like this:

$SUBSYSTEM_header.h:
  struct ckpt_$SUBSYSTEM_s {
     kernel_foo_t a, b, c;
  };

$SUBSYSTEM_code.c:
  void $SUBSYSTEM_checkpoint(struct *storage_ptr)
  {
  }

  int $SUBSYSTEM_restore(struct *storage_ptr)
  {
  }

2 would be 'sed'(perl) generated:

temp_header.h:
  foreach $x (sort @SUBSYSTEMS) {
    print "#include <$SUBSYSTEM_header.h>\n";
  }
  print "extern full_checkpoint();\n";
  print "struct uber_ckpt_struct {\n";
  foreach $x (sort @SUBSYSTEMS) {
    print "  struct ckpt_$x_s $x;\n";
  }
  print <<EOT;
};

#ifdef __kernel_source__
full_checkpoint()
{
EOT
  foreach $x (sort @SUBSYSTEMS) {
    print "  $x_checkpoint();\n";
  }
  print <<EOT;
}
#endif
EOT

"ditto for restore, but pay attention to return codes".

Then, for '3', gcc -E temp_header.h | sed 'cut from line after 'extern
full_checkpoint'.

That means:

  1. gets to define its checkpoint state with its own conventions
  2. gets generated with some perl/sed/whatever - low maintenance
  3. gets a plain C serialization of the checkpointed state.

The kernel code subsystem code is king here. It defines its
checkpoints in whatever manner it feels is natural for the specific
subsystem. The glue code is autogenerated. User space has full access
to the data in a plain C format. The user space code is where things
are fragile, if the user space code wants to manipulate the
checkpointed data in some way, but I think there will always be at
least one place in the system for fragility and user space is the best
place for it.

Cheers

Andrew

>
> thanks,
> -serge
> --
> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/9] cr: capabilities: define checkpoint and restore fns
  2009-06-06 15:02                                 ` Andrew G. Morgan
@ 2009-06-15  9:58                                   ` Alexey Dobriyan
  0 siblings, 0 replies; 33+ messages in thread
From: Alexey Dobriyan @ 2009-06-15  9:58 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Serge E. Hallyn, Oren Laadan, Linux Containers, David Howells,
	linux-security-module

On Sat, Jun 06, 2009 at 08:02:28AM -0700, Andrew G. Morgan wrote:
> On Fri, Jun 5, 2009 at 12:41 PM, Serge E. Hallyn <serue@us.ibm.com> wrote:
> > Quoting Andrew G. Morgan (morgan@kernel.org):
> > ...
> >> > Now you mention using kernel_cap_t's...  we can go partway
> >> > down that road, but the inherent problem is serializing the
> >> > relevant data so it can be streamed to some file.  So I
> >> > think it's better if the subsystems are required to specify
> >> > a useful format (like struct ckpt_capabilities) and define
> >> > the checkpoint/restart helpers to serialize data into the
> >> > struct.  We can try and get cute with dual mirrored
> >> > struct defs, one which is defined in terms the caps code
> >> > can understand, and one defined in more arch-independent
> >> > terms (which is why we need __u64s and packed, for instance).
> >> > But that seems more fragile than having clear and simple
> >> > requirements for the $SUBYSTEM_checkpoint() and $SUBSYSTEM_restart()
> >> > helpers.
> >>
> >> I like this $SUBSYSTEM_checkpoint() etc. thing.
> >>
> >> I like the ckpthdr.sed thing. I think a similar rule could be used to
> >> generate the calls to the list of $SUBSYSTEM_checkpoint() functions.
> >
> > Sorry, I don't follow.  Could you say a bit more about this?
> 
> See sketch below.
> 
> >
> >> For serialization, could a kernel "gcc -E checkpoint-headers.h >
> >> this-kernel-checkpoint-file.h" build rule be enough?
> >
> > Again, I don't follow.  Do you mean to turn something like kernel_cap_t
> > into something like struct ckpt_capabilities?
> 
> Yes, but without manual effort.
> 
> To return the the 1,2,3 breakdown. I think it should be possible for 1
> to provide code like this:
> 
> $SUBSYSTEM_header.h:
>   struct ckpt_$SUBSYSTEM_s {
>      kernel_foo_t a, b, c;
>   };
> 
> $SUBSYSTEM_code.c:
>   void $SUBSYSTEM_checkpoint(struct *storage_ptr)
>   {
>   }
> 
>   int $SUBSYSTEM_restore(struct *storage_ptr)
>   {
>   }

This doesn't work. Kernel internals don't have clearly defined subsystem boundaries,
capabilities or not.

As for capabilities, I think code below is everything what's needed:

	typedef __u64 kstate_cap_t;

	struct kstate_image_cred {
			...
		kstate_cap_t    cap_inheritable;
	        kstate_cap_t    cap_permitted;
	        kstate_cap_t    cap_effective;
	        kstate_cap_t    cap_bset;
			...
	};

	static void dump_capabilities(kstate_cap_t *i, kernel_cap_t *cap)
	{
	        BUILD_BUG_ON(sizeof(kstate_cap_t) != sizeof(kernel_cap_t));
	        memcpy(i, cap, sizeof(kstate_cap_t));
	}

Kernel and userspace can do anything they want with such image easily.
--
To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2009-06-15  9:58 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-05-29 22:32 [PATCH 0/9] credentials c/r: Introduction Serge E. Hallyn
2009-05-29 22:32 ` [PATCH 1/9] cred: #include init.h in cred.h Serge E. Hallyn
2009-05-29 22:32 ` [PATCH 2/9] groups: move code to kernel/groups.c Serge E. Hallyn
2009-05-29 22:33 ` [PATCH 3/9] cr: break out new_user_ns() Serge E. Hallyn
2009-05-29 22:33 ` [PATCH 4/9] cr: split core function out of some set*{u,g}id functions Serge E. Hallyn
2009-05-29 22:33 ` [PATCH 5/9] cr: capabilities: define checkpoint and restore fns Serge E. Hallyn
2009-05-31 20:26   ` Andrew G. Morgan
2009-05-31 20:56     ` Alexey Dobriyan
2009-06-01  1:38     ` Serge E. Hallyn
2009-06-01  2:18       ` Andrew G. Morgan
2009-06-01 13:35         ` Serge E. Hallyn
2009-06-01 15:46           ` Andrew G. Morgan
2009-06-01 22:18             ` Serge E. Hallyn
2009-06-02 13:49               ` Andrew G. Morgan
2009-06-02 14:23                 ` Serge E. Hallyn
2009-06-02 15:26                   ` Oren Laadan
2009-06-02 15:49                   ` Andrew G. Morgan
2009-06-02 17:15                     ` Serge E. Hallyn
2009-06-03  0:05                     ` Oren Laadan
     [not found]                       ` <4A25BE4F.6000603-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-06-03 15:03                         ` Andrew G. Morgan
2009-06-03 16:45                           ` Serge E. Hallyn
2009-06-04 14:13                             ` Andrew G. Morgan
2009-06-05 19:41                               ` Serge E. Hallyn
2009-06-06 15:02                                 ` Andrew G. Morgan
2009-06-15  9:58                                   ` Alexey Dobriyan
2009-06-01 15:49     ` Serge E. Hallyn
2009-06-01 16:34       ` Oren Laadan
2009-05-29 22:33 ` [PATCH 6/9] cr: checkpoint and restore task credentials Serge E. Hallyn
     [not found] ` <20090529223229.GA14536-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-29 22:33   ` [PATCH 7/9] cr: restore file->f_cred Serge E. Hallyn
2009-05-29 22:33   ` [PATCH 8/9] user namespaces: debug refcounts Serge E. Hallyn
2009-05-31 18:51     ` Alexey Dobriyan
2009-06-01 19:02       ` Serge E. Hallyn
2009-05-29 22:34 ` [PATCH 9/9] cr: ipc: reset kern_ipc_perms Serge E. Hallyn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.