All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] System Wide Capability Bounding Set
@ 2011-01-05 22:25 Eric Paris
  2011-01-06 11:30 ` Tetsuo Handa
  2011-01-11 22:02 ` Serge E. Hallyn
  0 siblings, 2 replies; 28+ messages in thread
From: Eric Paris @ 2011-01-05 22:25 UTC (permalink / raw)
  To: linux-kernel, linux-security-module; +Cc: serue, sgrubb

Not so long ago the global capability bounding set was removed from the
kernel.  Instead we created a new per task capability bounding set which
was inherited by children.  This feature is quite reasonable if you want
to start some task and its descendants in a limited capability box but
it is completely useless if you want to make system wide changes.  This
is the reason we had to add the /proc/sys/kernel/modules_disabled
tunable even though CAP_SYS_MODULE controls the operation.  There is
absolutely no way to eliminate a capability from the system.  At first I
thought maybe we could do something smart, like, drop the capability in
question by init before anything else ran, thus it would be gone from
the bounding set of every process.  But this is not even possible!  All
one must do is cause the kernel to attempt to auto-load a module and
voilà, you win!  The kernel will upcall to userspace
(maybe /sbin/modprobe, maybe something root dropped there, or maybe root
rewrote what's called with /proc/sys/kernel/modprobe) from a kernel
thread which has a full capability bounding set.  Thus whatever gets
called has everything.  And you can't drop privs.  Period.  We just
can't do it.

This patch reintroduces the global bounding set.  It's global.  Period.
Unlike the old days not even init can put things back.  It's a one way
street.  Notice that it only applies at the exec boundary, so programs
running before the bounding set is lowered are still able to use those
caps, but they cannot be passed onto children.  This does allow us to
drop caps very early by init and never have them come back.  Sure kernel
threads may still have them, but they will not be able to pass them onto
child tasks (like modprobe)

Signed-off-by: Eric Paris <eparis@redhat.com>
---
I'd love to hear comments.....

 include/linux/capability.h |    1 
 include/linux/security.h   |    5 ++++
 include/linux/sysctl.h     |    3 ++
 kernel/sysctl.c            |   56 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/sysctl_binary.c     |    2 +
 security/commoncap.c       |   17 ++++++++++---
 6 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 90012b9..2aebcb1 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -224,6 +224,7 @@ struct cpu_vfs_cap_data {
 #define CAP_IPC_OWNER        15
 
 /* Insert and remove kernel modules - modify kernel without limit */
+/* Remove from the global cap_bset */
 #define CAP_SYS_MODULE       16
 
 /* Allow ioperm/iopl access */
diff --git a/include/linux/security.h b/include/linux/security.h
index 02fcc0e..522d387 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -49,6 +49,11 @@ struct ctl_table;
 struct audit_krule;
 
 /*
+ * Global bounding set
+ */
+extern kernel_cap_t global_cap_bset;
+
+/*
  * These functions are in security/capability.c and are used
  * as the default capabilities functions
  */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7bb5cb6..4e80767 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -153,6 +153,7 @@ enum
 	KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
+	KERN_CAP_BSET=77,	/* int: global capability bset */
 };
 
 
@@ -968,6 +969,8 @@ extern int proc_dostring(struct ctl_table *, int,
 			 void __user *, size_t *, loff_t *);
 extern int proc_dointvec(struct ctl_table *, int,
 			 void __user *, size_t *, loff_t *);
+extern int proc_dointvec_bset(struct ctl_table *, int, struct file *,
+			      void __user *, size_t *, loff_t *);
 extern int proc_dointvec_minmax(struct ctl_table *, int,
 				void __user *, size_t *, loff_t *);
 extern int proc_dointvec_jiffies(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa15..6843f85 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -166,6 +166,8 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 static int proc_taint(struct ctl_table *table, int write,
 			       void __user *buffer, size_t *lenp, loff_t *ppos);
+static int proc_cap_bset(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
@@ -428,6 +430,12 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "cap-bound",
+		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+		.mode		= 0600,
+		.proc_handler	= proc_cap_bset,
+	},
 #ifdef CONFIG_PROC_SYSCTL
 	{
 		.procname	= "tainted",
@@ -2365,6 +2373,54 @@ int proc_dointvec(struct ctl_table *table, int write,
 }
 
 /*
+ * CAP_SYS_MODULE needed to drop bits.
+ */
+static int proc_cap_bset(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	unsigned long bset[_KERNEL_CAPABILITY_U32S];
+	kernel_cap_t new_bset;
+	int err, i;
+
+	if (write && !capable(CAP_SYS_MODULE))
+		return -EPERM;
+
+	/*
+	 * convert from the global kernel_cap_t to the ulong array to print to
+	 * userspace if this is a read.
+	 */
+	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+		bset[i] = global_cap_bset.cap[i];
+
+	t = *table;
+	t.data = &bset;
+
+	/*
+	 * actually read or write an array of ulongs from userspace.  Remember
+	 * these are least significant 32 bits first
+	 */
+	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+
+	/*
+	 * convert from the sysctl array of ulongs to the kernel_cap_t
+	 * internal representation
+	 */
+	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+		new_bset.cap[i] = bset[i];
+
+	/*
+	 * Drop everything not in the new_bset (but don't add things)
+	 */
+	if (write)
+		global_cap_bset = cap_intersect(global_cap_bset, new_bset);
+
+	return 0;
+}
+
+/*
  * Taint values can only be increased
  * This means we can safely use a temporary.
  */
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c57..6486633 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -71,6 +71,8 @@ static const struct bin_table bin_kern_table[] = {
 	{ CTL_STR,	KERN_NODENAME,			"hostname" },
 	{ CTL_STR,	KERN_DOMAINNAME,		"domainname" },
 
+	{ CTL_INT,	KERN_CAP_BSET,			"cap-bound" },
+
 	{ CTL_INT,	KERN_PANIC,			"panic" },
 	{ CTL_INT,	KERN_REALROOTDEV,		"real-root-dev" },
 
diff --git a/security/commoncap.c b/security/commoncap.c
index 64c2ed9..e615224 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -11,6 +11,7 @@
 #include <linux/audit.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/init_task.h> /* CAP_INIT_BSET */
 #include <linux/kernel.h>
 #include <linux/security.h>
 #include <linux/file.h>
@@ -28,6 +29,8 @@
 #include <linux/prctl.h>
 #include <linux/securebits.h>
 
+kernel_cap_t global_cap_bset = CAP_INIT_BSET;    /* systemwide capability bound */
+
 /*
  * If a non-root user executes a setuid-root binary in
  * !secure(SECURE_NOROOT) mode, then we raise capabilities.
@@ -201,6 +204,9 @@ int cap_capset(struct cred *new,
 	       const kernel_cap_t *inheritable,
 	       const kernel_cap_t *permitted)
 {
+	kernel_cap_t bset = cap_intersect(old->cap_bset,
+					  global_cap_bset);
+
 	if (cap_inh_is_capped() &&
 	    !cap_issubset(*inheritable,
 			  cap_combine(old->cap_inheritable,
@@ -209,8 +215,7 @@ int cap_capset(struct cred *new,
 		return -EPERM;
 
 	if (!cap_issubset(*inheritable,
-			  cap_combine(old->cap_inheritable,
-				      old->cap_bset)))
+			  cap_combine(old->cap_inheritable, bset)))
 		/* no new pI capabilities outside bounding set */
 		return -EPERM;
 
@@ -305,6 +310,8 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 		new->cap_permitted.cap[i] =
 			(new->cap_bset.cap[i] & permitted) |
 			(new->cap_inheritable.cap[i] & inheritable);
+		/* the global set is global damn it */
+		new->cap_permitted.cap[i] &= global_cap_bset.cap[i];
 
 		if (permitted & ~new->cap_permitted.cap[i])
 			/* insufficient to execute correctly */
@@ -438,6 +445,9 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
 		return ret;
 
 	if (!issecure(SECURE_NOROOT)) {
+		kernel_cap_t bset = cap_intersect(old->cap_bset,
+						  global_cap_bset);
+
 		/*
 		 * If the legacy file capability is set, then don't set privs
 		 * for a setuid root binary run by a non-root user.  Do set it
@@ -456,8 +466,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
 		 */
 		if (new->euid == 0 || new->uid == 0) {
 			/* pP' = (cap_bset & ~0) | (pI & ~0) */
-			new->cap_permitted = cap_combine(old->cap_bset,
-							 old->cap_inheritable);
+			new->cap_permitted = cap_combine(bset, old->cap_inheritable);
 		}
 		if (new->euid == 0)
 			effective = true;



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-05 22:25 [PATCH] System Wide Capability Bounding Set Eric Paris
@ 2011-01-06 11:30 ` Tetsuo Handa
  2011-01-06 16:44   ` Theodore Tso
  2011-01-11 22:02 ` Serge E. Hallyn
  1 sibling, 1 reply; 28+ messages in thread
From: Tetsuo Handa @ 2011-01-06 11:30 UTC (permalink / raw)
  To: eparis, linux-kernel, linux-security-module; +Cc: serue, sgrubb

Eric Paris wrote:
> Not so long ago the global capability bounding set was removed from the
> kernel.  Instead we created a new per task capability bounding set which
> was inherited by children.

An LSM module can provide the ability to aggregate several tasks into a group
(called a "security context" or "domain") and grant permissions to groups.
We can selectively grant whatever capabilities we like to groups.
Why do we need to be bothered by the capability inheritance problem?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-06 11:30 ` Tetsuo Handa
@ 2011-01-06 16:44   ` Theodore Tso
  0 siblings, 0 replies; 28+ messages in thread
From: Theodore Tso @ 2011-01-06 16:44 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: eparis, linux-kernel, linux-security-module, serue, sgrubb


On Jan 6, 2011, at 6:30 AM, Tetsuo Handa wrote:

> An LSM module can provide ability to aggregate several tasks into a group
> (called "security context" or "domain") and grant permissions against groups.
> We can selectively grant whatever capabilities against groups.
> Why do we need to get bothered by capability inheritance problem?

Yes, but LSM modules still can't stack, last I checked.  So people would need to choose between this or SELinux, or build this capability into every single LSM module....

-- Ted


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-05 22:25 [PATCH] System Wide Capability Bounding Set Eric Paris
  2011-01-06 11:30 ` Tetsuo Handa
@ 2011-01-11 22:02 ` Serge E. Hallyn
  2011-01-11 22:12   ` Serge E. Hallyn
  2011-01-14 19:50   ` Eric Paris
  1 sibling, 2 replies; 28+ messages in thread
From: Serge E. Hallyn @ 2011-01-11 22:02 UTC (permalink / raw)
  To: Eric Paris
  Cc: linux-kernel, linux-security-module, serue, sgrubb, Andrew Morgan

Quoting Eric Paris (eparis@redhat.com):
> Not so long ago the global capability bounding set was removed from the
> kernel.  Instead we created a new per task capability bounding set which
> was inherited by children.  This feature is quite reasonable if you want
> to start some task and its descendants in a limited capability box but
> it is completely useless if you want to make system wide changes.  This
> is the reason we had to add the /proc/sys/kernel/modules_disabled
> tunable even though CAP_SYS_MODULE controls the operation.  There is
> absolutely no way to eliminate a capability from the system.  At first I
> thought maybe we could do something smart, like, drop the capability in
> question by init before anything else ran, thus it would be gone from
> the bounding set of every process.  But this is not even possible!  All
> one must do it cause the kernel to attempt to auto load a module and
> viola, you win!  The kernel will upcall to userspace
> (maybe /sbin/modprobe, maybe something root dropped there, or maybe root
> rewrote what's called with /proc/sys/kernel/modprobe) from a kernel
> thread which has a full capability bounding set.  Thus whatever gets
> called has everything.  And you can't drop privs.  Period.  We just
> can't do it.
> 
> This patch reintroduces the global bounding set.  It's global.  Period.
> Unlike the old days not even init can put things back.  It's a one way
> street.  Notice that it only applies at the exec boundary, so programs
> running before the bounding set is lowered are still able to use those
> caps, but they cannot be passed onto children.  This does allow us to
> drop caps very early by init and never have them come back.  Sure kernel
> threads may still have them, but they will not be able to pass them onto
> child tasks (like modprobe)
> 
> Signed-off-by: Eric Paris <eparis@redhat.com>
> ---
> I'd love to hear comments.....
> 
>  include/linux/capability.h |    1 
>  include/linux/security.h   |    5 ++++
>  include/linux/sysctl.h     |    3 ++
>  kernel/sysctl.c            |   56 +++++++++++++++++++++++++++++++++++++++++++++
>  kernel/sysctl_binary.c     |    2 +
>  security/commoncap.c       |   17 ++++++++++---
>  6 files changed, 80 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index 90012b9..2aebcb1 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -224,6 +224,7 @@ struct cpu_vfs_cap_data {
>  #define CAP_IPC_OWNER        15
>  
>  /* Insert and remove kernel modules - modify kernel without limit */
> +/* Remove from the global cap_bset */
>  #define CAP_SYS_MODULE       16
>  
>  /* Allow ioperm/iopl access */
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 02fcc0e..522d387 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -49,6 +49,11 @@ struct ctl_table;
>  struct audit_krule;
>  
>  /*
> + * Global bounding set
> + */
> +extern kernel_cap_t global_cap_bset;
> +
> +/*
>   * These functions are in security/capability.c and are used
>   * as the default capabilities functions
>   */
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 7bb5cb6..4e80767 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -153,6 +153,7 @@ enum
>  	KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */
>  	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
>  	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
> +	KERN_CAP_BSET=77,	/* int: global capability bset */
>  };
>  
>  
> @@ -968,6 +969,8 @@ extern int proc_dostring(struct ctl_table *, int,
>  			 void __user *, size_t *, loff_t *);
>  extern int proc_dointvec(struct ctl_table *, int,
>  			 void __user *, size_t *, loff_t *);
> +extern int proc_dointvec_bset(struct ctl_table *, int, struct file *,
> +			      void __user *, size_t *, loff_t *);
>  extern int proc_dointvec_minmax(struct ctl_table *, int,
>  				void __user *, size_t *, loff_t *);
>  extern int proc_dointvec_jiffies(struct ctl_table *, int,
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 5abfa15..6843f85 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -166,6 +166,8 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
>  		  void __user *buffer, size_t *lenp, loff_t *ppos);
>  static int proc_taint(struct ctl_table *table, int write,
>  			       void __user *buffer, size_t *lenp, loff_t *ppos);
> +static int proc_cap_bset(struct ctl_table *table, int write,
> +			 void __user *buffer, size_t *lenp, loff_t *ppos);
>  #endif
>  
>  #ifdef CONFIG_MAGIC_SYSRQ
> @@ -428,6 +430,12 @@ static struct ctl_table kern_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec,
>  	},
> +	{
> +		.procname	= "cap-bound",
> +		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
> +		.mode		= 0600,
> +		.proc_handler	= proc_cap_bset,
> +	},
>  #ifdef CONFIG_PROC_SYSCTL
>  	{
>  		.procname	= "tainted",
> @@ -2365,6 +2373,54 @@ int proc_dointvec(struct ctl_table *table, int write,
>  }
>  
>  /*
> + * CAP_SYS_MODULE needed to drop bits.
> + */
> +static int proc_cap_bset(struct ctl_table *table, int write,
> +			 void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	struct ctl_table t;
> +	unsigned long bset[_KERNEL_CAPABILITY_U32S];
> +	kernel_cap_t new_bset;
> +	int err, i;
> +
> +	if (write && !capable(CAP_SYS_MODULE))
> +		return -EPERM;
> +
> +	/*
> +	 * convert from the global kernel_cap_t to the ulong array to print to
> +	 * userspace if this is a read.
> +	 */
> +	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
> +		bset[i] = global_cap_bset.cap[i];
> +
> +	t = *table;
> +	t.data = &bset;
> +
> +	/*
> +	 * actually read or write and array of ulongs from userspace.  Remember
> +	 * these are least significant 32 bits first
> +	 */
> +	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
> +	if (err < 0)
> +		return err;
> +
> +	/*
> +	 * convert from the sysctl array of ulongs to the kernel_cap_t
> +	 * internal representation
> +	 */
> +	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
> +		new_bset.cap[i] = bset[i];
> +
> +	/*
> +	 * Drop everything not in the new_bset (but don't add things)
> +	 */
> +	if (write)
> +		global_cap_bset = cap_intersect(global_cap_bset, new_bset);
> +
> +	return 0;
> +}
> +
> +/*
>   * Taint values can only be increased
>   * This means we can safely use a temporary.
>   */
> diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
> index 1357c57..6486633 100644
> --- a/kernel/sysctl_binary.c
> +++ b/kernel/sysctl_binary.c
> @@ -71,6 +71,8 @@ static const struct bin_table bin_kern_table[] = {
>  	{ CTL_STR,	KERN_NODENAME,			"hostname" },
>  	{ CTL_STR,	KERN_DOMAINNAME,		"domainname" },
>  
> +	{ CTL_INT,	KERN_CAP_BSET,			"cap-bound" },
> +
>  	{ CTL_INT,	KERN_PANIC,			"panic" },
>  	{ CTL_INT,	KERN_REALROOTDEV,		"real-root-dev" },
>  
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 64c2ed9..e615224 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -11,6 +11,7 @@
>  #include <linux/audit.h>
>  #include <linux/module.h>
>  #include <linux/init.h>
> +#include <linux/init_task.h> /* CAP_INIT_BSET */
>  #include <linux/kernel.h>
>  #include <linux/security.h>
>  #include <linux/file.h>
> @@ -28,6 +29,8 @@
>  #include <linux/prctl.h>
>  #include <linux/securebits.h>
>  
> +kernel_cap_t global_cap_bset = CAP_INIT_BSET;    /* systemwide capability bound */
> +
>  /*
>   * If a non-root user executes a setuid-root binary in
>   * !secure(SECURE_NOROOT) mode, then we raise capabilities.
> @@ -201,6 +204,9 @@ int cap_capset(struct cred *new,
>  	       const kernel_cap_t *inheritable,
>  	       const kernel_cap_t *permitted)
>  {
> +	kernel_cap_t bset = cap_intersect(old->cap_bset,
> +					  global_cap_bset);
> +
>  	if (cap_inh_is_capped() &&
>  	    !cap_issubset(*inheritable,
>  			  cap_combine(old->cap_inheritable,
> @@ -209,8 +215,7 @@ int cap_capset(struct cred *new,
>  		return -EPERM;
>  
>  	if (!cap_issubset(*inheritable,
> -			  cap_combine(old->cap_inheritable,
> -				      old->cap_bset)))
> +			  cap_combine(old->cap_inheritable, bset)))
>  		/* no new pI capabilities outside bounding set */
>  		return -EPERM;
>  
> @@ -305,6 +310,8 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
>  		new->cap_permitted.cap[i] =
>  			(new->cap_bset.cap[i] & permitted) |
>  			(new->cap_inheritable.cap[i] & inheritable);
> +		/* the global set is global damn it */
> +		new->cap_permitted.cap[i] &= global_cap_bset.cap[i];

[ If I'm thinking right: ]

Global may be global, but you're changing the formula (here, for a
non-root task executing a file with filecaps) from

	pP' = (X & fP) | (pI & fI)

to

	A  = (X & fP) | (pI & fI)
	pP'= Z & A                    // Z == global bounding set

In other words, you are not simply enforcing "the intersection of
the global and per-process bounding sets".

Whereas,

>  		if (permitted & ~new->cap_permitted.cap[i])
>  			/* insufficient to execute correctly */
> @@ -438,6 +445,9 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
>  		return ret;
>  
>  	if (!issecure(SECURE_NOROOT)) {
> +		kernel_cap_t bset = cap_intersect(old->cap_bset,
> +						  global_cap_bset);
> +
>  		/*
>  		 * If the legacy file capability is set, then don't set privs
>  		 * for a setuid root binary run by a non-root user.  Do set it
> @@ -456,8 +466,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
>  		 */
>  		if (new->euid == 0 || new->uid == 0) {
>  			/* pP' = (cap_bset & ~0) | (pI & ~0) */
> -			new->cap_permitted = cap_combine(old->cap_bset,
> -							 old->cap_inheritable);
> +			new->cap_permitted = cap_combine(bset, old->cap_inheritable);

here (for a root task) you are using 

	pP' = (Z & X) | pI

So the inheritable capabilities get masked with the global bounding set for
non-root tasks, but not for root tasks.

>  		}
>  		if (new->euid == 0)
>  			effective = true;
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-11 22:02 ` Serge E. Hallyn
@ 2011-01-11 22:12   ` Serge E. Hallyn
  2011-01-14 19:50   ` Eric Paris
  1 sibling, 0 replies; 28+ messages in thread
From: Serge E. Hallyn @ 2011-01-11 22:12 UTC (permalink / raw)
  To: Eric Paris, linux-kernel
  Cc: linux-security-module, serue, sgrubb, Andrew Morgan

Sorry - .muttrc snafu - please reply to serge.hallyn@canonical.com,
not serge@canonical.com :(

-serge

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-11 22:02 ` Serge E. Hallyn
  2011-01-11 22:12   ` Serge E. Hallyn
@ 2011-01-14 19:50   ` Eric Paris
  2011-01-17  3:16     ` Andrew G. Morgan
  1 sibling, 1 reply; 28+ messages in thread
From: Eric Paris @ 2011-01-14 19:50 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: linux-kernel, linux-security-module, sgrubb, Andrew Morgan

On Tue, 2011-01-11 at 16:02 -0600, Serge E. Hallyn wrote:
> Quoting Eric Paris (eparis@redhat.com):

> > @@ -305,6 +310,8 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
> >  		new->cap_permitted.cap[i] =
> >  			(new->cap_bset.cap[i] & permitted) |
> >  			(new->cap_inheritable.cap[i] & inheritable);
> > +		/* the global set is global damn it */
> > +		new->cap_permitted.cap[i] &= global_cap_bset.cap[i];
> 
> [ If I'm thinking right: ]
> 
> Global may be global, but you're changing the formula (here, for a
> non-root task executing a file with filecaps) from
> 
> 	pP' = (X & fP) | (pI & fI)
> 
> to
> 
> 	A  = (X & FP) | (pI & fI)
> 	pP'= Z & A                    // Z == global bounding set
> 
> In other words, you are not simply enforcing "the intersection of
> the global and per-process bounding sets".
> 
> Whereas,
> 
> >  		if (permitted & ~new->cap_permitted.cap[i])
> >  			/* insufficient to execute correctly */
> > @@ -438,6 +445,9 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
> >  		return ret;
> >  
> >  	if (!issecure(SECURE_NOROOT)) {
> > +		kernel_cap_t bset = cap_intersect(old->cap_bset,
> > +						  global_cap_bset);
> > +
> >  		/*
> >  		 * If the legacy file capability is set, then don't set privs
> >  		 * for a setuid root binary run by a non-root user.  Do set it
> > @@ -456,8 +466,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
> >  		 */
> >  		if (new->euid == 0 || new->uid == 0) {
> >  			/* pP' = (cap_bset & ~0) | (pI & ~0) */
> > -			new->cap_permitted = cap_combine(old->cap_bset,
> > -							 old->cap_inheritable);
> > +			new->cap_permitted = cap_combine(bset, old->cap_inheritable);
> 
> here (for a root task) you are using 
> 
> 	pP' = (Z & X) | pI
> 
> So the inheritable tasks get masked with the global bounding set for
> non-root tasks, but not for root tasks.

I believe you are thinking correctly and I am wrong.  Someone else has
some other issues with the patch but would prefer to keep that
conversation offline.  I will certainly be back with changes and
explanation of changes (hopefully shortly)

Thanks Serge!

-Eric


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-14 19:50   ` Eric Paris
@ 2011-01-17  3:16     ` Andrew G. Morgan
  2011-01-21 21:25       ` Eric Paris
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew G. Morgan @ 2011-01-17  3:16 UTC (permalink / raw)
  To: Eric Paris; +Cc: Serge E. Hallyn, linux-kernel, linux-security-module, sgrubb

Being the someone else, I'll elaborate on my previously offline response:

   Nacked-by: Andrew G. Morgan <morgan@kernel.org>

further...

I'm not a supporter of system-wide security contexts changing under
running programs. Previous experience has taught me that tricking a
program into thinking it has one sort of privilege and then
withdrawing it without notice can be dangerous. Since privileged
programs include shells that invoke privileged commands to perform
system admin tasks, I consider a global bounding set to be trouble in
the making. I'm also concerned that an attacker tinkering with the
global bounding set can make remote administration of a machine
(rebooting say) impossible. We have 'kill' for 'sorry, you can't have
that privilege and preventing further damage', it's nice and
synchronous and has auditable semantics.

Saying no, however, is not very constructive and I can't argue that
there isn't a privilege escalation issue with the kernel loading
modules and running helpers. If I can't support a global bounding set,
what can I suggest instead? This is why I've been quiet... I've been
thinking.

My first observation may seem odd, but I have to ask why init has a
special privilege enabling mechanism vs. that used for normal
binaries. Naively, I would think that since it is run as root, and
root is (by default) all-capable, the root-fixup code in the generic
exec path would cause init to run with pE=pP=~0 anyway. There was a
time when init was run with one capability suppressed from the pE set,
but the meaning of this capability has changed dramatically since then
so I'm not sure that suppression is needed any longer.

If we were to delete that special code, what I think is missing from
the current kernel model is not a global bounding set, but a 'kernel
auto-exec' securebits value. One that can be set by an admin at
runtime to suppress the root-is-all-capable behavior of the auto-exec
process, and defer the privilege escalation to carefully audited file
capabilities on the relevant helper binaries.

There is probably more detail needed for this idea, but from the
perspective of a root-is-impotent kernel configuration this seems much
more consistent to me.

Hope that clarifies my nack for this global-bounding set kernel change.

Cheers

Andrew

On Fri, Jan 14, 2011 at 11:50 AM, Eric Paris <eparis@redhat.com> wrote:
> On Tue, 2011-01-11 at 16:02 -0600, Serge E. Hallyn wrote:
>> Quoting Eric Paris (eparis@redhat.com):
>
>> > @@ -305,6 +310,8 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
>> >             new->cap_permitted.cap[i] =
>> >                     (new->cap_bset.cap[i] & permitted) |
>> >                     (new->cap_inheritable.cap[i] & inheritable);
>> > +           /* the global set is global damn it */
>> > +           new->cap_permitted.cap[i] &= global_cap_bset.cap[i];
>>
>> [ If I'm thinking right: ]
>>
>> Global may be global, but you're changing the formula (here, for a
>> non-root task executing a file with filecaps) from
>>
>>       pP' = (X & fP) | (pI & fI)
>>
>> to
>>
>>       A  = (X & FP) | (pI & fI)
>>       pP'= Z & A                    // Z == global bounding set
>>
>> In other words, you are not simply enforcing "the intersection of
>> the global and per-process bounding sets".
>>
>> Whereas,
>>
>> >             if (permitted & ~new->cap_permitted.cap[i])
>> >                     /* insufficient to execute correctly */
>> > @@ -438,6 +445,9 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
>> >             return ret;
>> >
>> >     if (!issecure(SECURE_NOROOT)) {
>> > +           kernel_cap_t bset = cap_intersect(old->cap_bset,
>> > +                                             global_cap_bset);
>> > +
>> >             /*
>> >              * If the legacy file capability is set, then don't set privs
>> >              * for a setuid root binary run by a non-root user.  Do set it
>> > @@ -456,8 +466,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
>> >              */
>> >             if (new->euid == 0 || new->uid == 0) {
>> >                     /* pP' = (cap_bset & ~0) | (pI & ~0) */
>> > -                   new->cap_permitted = cap_combine(old->cap_bset,
>> > -                                                    old->cap_inheritable);
>> > +                   new->cap_permitted = cap_combine(bset, old->cap_inheritable);
>>
>> here (for a root task) you are using
>>
>>       pP' = (Z & X) | pI
>>
>> So the inheritable tasks get masked with the global bounding set for
>> non-root tasks, but not for root tasks.
>
> I believe you are thinking correctly and I am wrong.  Someone else has
> some other issues with the patch but would prefer to keep that
> conversation offline.  I will certainly be back with changes and
> explanation of changes (hopefully shortly)
>
> Thanks Serge!
>
> -Eric
>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-17  3:16     ` Andrew G. Morgan
@ 2011-01-21 21:25       ` Eric Paris
  2011-01-23  3:39         ` Andrew G. Morgan
  0 siblings, 1 reply; 28+ messages in thread
From: Eric Paris @ 2011-01-21 21:25 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Serge E. Hallyn, linux-kernel, linux-security-module, sgrubb

On Sun, 2011-01-16 at 19:16 -0800, Andrew G. Morgan wrote:

> I'm not a supporter of system-wide security contexts changing under
> running programs. Previous experience has taught me that tricking a
> program into thinking it has one sort of privilege and then
> withdrawing it without notice can be dangerous.

But a bounding set doesn't affect a running program's abilities, the
bset is only applied at exec().  So if it's fcaps based you have all
that wacky kill logic.

>  Since privileged
> programs include shells that invoke privileged commands to perform
> system admin tasks, I consider a global bounding set to be trouble in
> the making. 

No question, but then again, if you have CAP_SYS_MODULE there are a lot
easier ways to make trouble   :)

> If we were to delete that special code what I think is missing from
> the current kernel model, is not a global bounding set, but a 'kernel
> auto-exec' securebits value. One that can be set by an admin at
> runtime to suppress the root-is-all-capable behavior of the auto-exec
> process, and defer the privilege escalation to carefully audited file
> capabilities on the relevant helper binaries.

But how can that leave us with an impotent root?  Root would be easily
able to craft a file with any caps it wants in fI and fP on any of the
plethora of helper programs the kernel calls and escalate away its
impotence.

I'd really like to drop a cap from an entire system never to return.
Maybe I can get there by exposing the bset of the kthread which launches
the helpers (makes me feel very dirty).  But that is smaller than the
global bset....

-Eric


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-21 21:25       ` Eric Paris
@ 2011-01-23  3:39         ` Andrew G. Morgan
  2011-01-24 21:40           ` Serge Hallyn
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew G. Morgan @ 2011-01-23  3:39 UTC (permalink / raw)
  To: Eric Paris; +Cc: Serge E. Hallyn, linux-kernel, linux-security-module, sgrubb

On Fri, Jan 21, 2011 at 1:25 PM, Eric Paris <eparis@redhat.com> wrote:
> On Sun, 2011-01-16 at 19:16 -0800, Andrew G. Morgan wrote:
>
>> I'm not a supporter of system-wide security contexts changing under
>> running programs. Previous experience has taught me that tricking a
>> program into thinking it has one sort of privilege and then
>> withdrawing it without notice can be dangerous.
>
> But a bounding set doesn't affect a running program's abilities, the
> bset is only applied at exec().  So if it's fcaps based you have all
> that wacky kill logic.
>
>>  Since privileged
>> programs include shells that invoke privileged commands to perform
>> system admin tasks, I consider a global bounding set to be trouble in
>> the making.
>
> No question, but then again, if you have CAP_SYS_MODULE there are a lot
> easier ways to make trouble   :)

But I thought the issue at hand was not protecting a system from root,
but protecting a system from kernel auto-exec'd programs.

>> If we were to delete that special code what I think is missing from
>> the current kernel model, is not a global bounding set, but a 'kernel
>> auto-exec' securebits value. One that can be set by an admin at
>> runtime to suppress the root-is-all-capable behavior of the auto-exec
>> process, and defer the privilege escalation to carefully audited file
>> capabilities on the relevant helper binaries.
>
> But how can that leave us with an impotent root?  Root would be easily
> able to craft a file with any caps it wants in fI and fP on any of the
> plethora of helper programs the kernel calls and escalate away it's
> impotence.

Again, assuming that you are really trying to limit the power of
kernel auto-exec'd programs, then you can see how secure bits can make
root-power harder to obtain. (Examples using libcap utilities.)

[root@pip foo]# whoami
root
[root@pip foo]# ls -l foo.sh
-rwx------ 1 bin nobody 31 2011-01-22 19:07 foo.sh
[root@pip foo]# /sbin/capsh --secbits=0x0 -- ./foo.sh
Hello, Root
[root@pip foo]# /sbin/capsh --secbits=0x2f -- ./foo.sh
/bin/bash: ./foo.sh: Permission denied
[root@pip foo]#

That is, the 0x2f value of the secure bits turns off root's privilege.
This includes the privilege to add capabilities to files (run this
from the progs subdirectory of the libcap source, after a build):

[root@pip progs]# /sbin/capsh --secbits=0 -- \
-c "/sbin/setcap cap_setfcap=ep setcap"
[root@pip progs]# /sbin/getcap -v setcap
setcap = cap_setfcap+ep
[root@pip progs]# /sbin/setcap -r setcap
[root@pip progs]# /sbin/getcap -v setcap
setcap
[root@pip progs]# /sbin/capsh --secbits=0x2f -- \
-c "/sbin/setcap cap_setfcap=ep setcap"
unable to set CAP_SETFCAP effective capability: Operation not permitted
[root@pip progs]#

So, my point is that if the kernel threads were launched with a
user-space configurable set of secure bits, and the regular exec()
rules were used by these kernel-launched processes to obtain
privilege, you could block the kernel from getting any user-space
privileges via secure bits.

> I'd really like to drop a cap from an entire system never to return.
> Maybe I can get there by exposing the bset of the kthread which launches
> the helpers (makes me feel very dirty).  But that is smaller than the
> global bset....

Does any of the above help clarify how else to achieve your ends?

Cheers

Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-23  3:39         ` Andrew G. Morgan
@ 2011-01-24 21:40           ` Serge Hallyn
  2011-01-26 23:34             ` Eric Paris
  0 siblings, 1 reply; 28+ messages in thread
From: Serge Hallyn @ 2011-01-24 21:40 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Eric Paris, Serge E. Hallyn, linux-kernel, linux-security-module, sgrubb

Quoting Andrew G. Morgan (morgan@kernel.org):
> > But how can that leave us with an impotent root?  Root would be easily
> > able to craft a file with any caps it wants in fI and fP on any of the
> > plethora of helper programs the kernel calls and escalate away it's
> > impotence.
> 
> Again, assuming that you are really trying to limit the power of
> kernel auto-exec'd programs, then you can see how secure bits can make
> root-power harder to obtain. (Examples using libcap utilities.)
> 
> [root@pip foo]# whoami
> root
> [root@pip foo]# ls -l foo.sh
> -rwx------ 1 bin nobody 31 2011-01-22 19:07 foo.sh
> [root@pip foo]# /sbin/capsh --secbits=0x0 -- ./foo.sh
> Hello, Root
> [root@pip foo]# /sbin/capsh --secbits=0x2f -- ./foo.sh
> /bin/bash: ./foo.sh: Permission denied
> [root@pip foo]#
> 
> That is, the 0x2f value of the secure bits turns off root's privilege.
> This includes the privilege to add capabilities to files (run this
> from the progs subdirectory of the libcap source, after a build):
> 
> [root@pip progs]# /sbin/capsh --secbits=0 -- \
> -c "/sbin/setcap cap_setfcap=ep setcap"
> [root@pip progs]# /sbin/getcap -v setcap
> setcap = cap_setfcap+ep
> [root@pip progs]# /sbin/setcap -r setcap
> [root@pip progs]# /sbin/getcap -v setcap
> setcap
> [root@pip progs]# /sbin/capsh --secbits=0x2f -- \
> -c "/sbin/setcap cap_setfcap=ep setcap"
> unable to set CAP_SETFCAP effective capability: Operation not permitted
> [root@pip progs]#
> 
> So, my point is that if the kernel threads were launched with a
> user-space configurable set of secure bits, and the regular exec()
> rules were used by these kernel-launched processes to obtain
> privilege, you could block the kernel from getting any user-space
> privileges via secure bits.

That's not even necessary, is it?  In order to get capabilities
from fI into pP, you need those capabilities in pI to begin
with.  So as long as we make sure that removing a capability from
the root task's bounding set also removes it from its inheritable
set, no task it execs can re-gain those capabilities.

-serge

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-24 21:40           ` Serge Hallyn
@ 2011-01-26 23:34             ` Eric Paris
  2011-01-27 14:02               ` Serge E. Hallyn
  2011-01-27 14:26               ` Andrew G. Morgan
  0 siblings, 2 replies; 28+ messages in thread
From: Eric Paris @ 2011-01-26 23:34 UTC (permalink / raw)
  To: Serge Hallyn
  Cc: Andrew G. Morgan, Serge E. Hallyn, linux-kernel,
	linux-security-module, sgrubb

Let me reboot the conversation just a little.  My goal is to be able to
drop capabilities from a system such that they can never come back.  The
two capabilities I am most interested in are CAP_SYS_MODULE and
CAP_SYS_RAWIO.  I want to boot a machine in a manner I control, drop
those caps, and then give root to an entity that I do not trust.  At the
moment it is impossible to do this.  Absolutely impossible.  It is
impossible because the earliest I can drop capabilities is
in /sbin/init.  Let's assume I patch /sbin/init to drop those two caps
from the bset, pE, pI, and pP.  One might think they are gone for good,
but they aren't.

The way that kernel helper programs get exec'ed is that a task or maybe
an async event causes the kernel to queue some information on a list.
That information is removed from the list by a special kthread and a
userspace program is exec'ed.  The exec rules are going to apply to the
capability sets associated with the kthread.  Those are compiled into
the kernel.  I need a way to prevent capabilities from coming back.

I proposed a global bset (much like we used to have, but without the
ability for init to add stuff back) and was shot down.

Andrew originally proposed a prctl() which would cause the exec call to
not automatically add capabilities to pE and would rely on filecaps.  I
claim this is not reasonable since root is going to have control of the
fcaps.  All that would be needed for root to regain either cap is to
change the helper program to be an sshd server and add back these 2
dropped caps in fcaps.

At this point it seems to me like what I must do is add a way for a task
with enough priv to force caps out of the bset and pI of the kthread
which upcalls to run userspace programs.  Thus when the kthread runs a
program it cannot give those privs....

Does this seem reasonable?  What would such an interface look like?
(This is scarily like the old meaning of CAP_SETPCAP....)


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-26 23:34             ` Eric Paris
@ 2011-01-27 14:02               ` Serge E. Hallyn
  2011-01-27 14:42                 ` Steve Grubb
  2011-01-27 14:26               ` Andrew G. Morgan
  1 sibling, 1 reply; 28+ messages in thread
From: Serge E. Hallyn @ 2011-01-27 14:02 UTC (permalink / raw)
  To: Eric Paris
  Cc: Serge Hallyn, Andrew G. Morgan, Serge E. Hallyn, linux-kernel,
	linux-security-module, sgrubb

Quoting Eric Paris (eparis@redhat.com):
> At this point it seems to me like what I must do is add a way for a task
> with enough priv to force caps out of the bset and pI of the kthread
> which upcalls to run userspace programs.  Thus when the kthread runs a
> program it cannot give those privs....

Exactly, that's what I was envisioning.

How about adding an optional kthread-wrapper program which, when
specified on the boot cmdline, will be the program to wrap any
userspace programs which the kernel executes?  Then that program
can do the work it needs to remove capabilities from pI and X,
set selinux context, etc.  Just a thought, seems somewhat more
elegant than hard-coding the behavior in the kernel, but not sure
it's practical.

> Does this seem reasonable?  What would such an interface look like?
> (This is scarily like the old meaning of CAP_SETPCAP....)

I'm not sure it's unreasonable, but in one sense it seems like just
walking one more step toward the end of the plank:  next, you're
going to have to worry about a kernel-spawned thread, from which you've
denied cap_sys_module, writing to /boot/initrd* the malicious steps
it wanted to take, to be executed at next boot when it still has
cap_sys_module.  (Fine, enter TPM :)

What is the attack vector you're actually envisioning?  Does some
trojan come in and overwrite a program which it hopes the
kernel will execute?  Or is there just an existing vuln in such
a program?  Are there other ways we can address these?  Can we find
a way to classify the kernel-spawned userspace programs?  Perhaps
based on the selinux context assigned to the program, we can assign
some level of trust that no one could have modified the source?

-serge

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-26 23:34             ` Eric Paris
  2011-01-27 14:02               ` Serge E. Hallyn
@ 2011-01-27 14:26               ` Andrew G. Morgan
  1 sibling, 0 replies; 28+ messages in thread
From: Andrew G. Morgan @ 2011-01-27 14:26 UTC (permalink / raw)
  To: Eric Paris
  Cc: Serge Hallyn, Serge E. Hallyn, linux-kernel,
	linux-security-module, sgrubb

On Wed, Jan 26, 2011 at 3:34 PM, Eric Paris <eparis@redhat.com> wrote:
> Let me reboot the conversation just a little.  My goal is to be able to
> drop capabilities from a system such that they can never come back.  The
> two capabilities I am most interested in are CAP_SYS_MODULE and
> CAP_SYS_RAWIO.  I want to boot a machine in a manor I control, drop
> those caps, and then give root to an entity that I do not trust.  At the
> moment it is impossible to do this.  Absolutely impossible.  It is
> impossible because the earliest I can drop capabilities is
> in /sbin/init.  Lets assume I patch /sbin/init to drop those two caps
> from the bset, pE, pI, and pP.  One might think they are gone for good,
> but they aren't.
>
> The way that kernel helper programs get exec'ed is that a task or maybe
> an async event causes the kernel to queue some information on a list.
> That information is removed from the list by a special kthread and a
> userspace program is exec'ed.  The exec rules are going to apply to the
> capability sets associated with the kthread.  Those are compiled into
> the kernel.  I need a way to prevent capabilities from coming back.
>
> I proposed a global bset (much like we used to have, but without the
> ability for init to add stuff back) and was shot down.
>
> Andrew original proposed a prctl() which would cause the exec call to
> not automatically add capabilities to pE and would rely on filecaps.  I
> claim this is not reasonable since root is going to have control of the
> fcaps.  All that would be needed for root to regain either cap is to
> change the helper program to be an sshd server and add back these 2
> dropped caps in fcaps.

Which root are we talking about? A rogue all capable root that already
has full control of the system, or a root escalation that is caused by
some automated kernel initiated exec()?

Privilege is not inherited naively with the right secure bits enabled,
so I'm saying you can trust file-capable binaries even more with the
right prevailing secure bits. That is, the right capabilities on the
helper binaries.

Serge asks an excellent question: is the set of helper binaries
finite? You have already conceded that init is a helper binary you can
change to achieve your goals. Why not these others?

> At this point it seems to me like what I must do is add a way for a task
> with enough priv to force caps out of the bset and pI of the kthread
> which upcalls to run userspace programs.  Thus when the kthread runs a
> program it cannot give those privs....

I'm still claiming you can (or, should be made so, to) effectively do
this with a set of secure bits that govern the way this kthread's
exec() works. It then falls to the user-space binary to determine what
capabilities are needed and this whole issue becomes one of userspace
DAC/MAC policy. With the right secure bits, the exec()'d helper would
get pI=pP=pE=0 unless the binary had specific capabilities enabled on
it, and in such cases the binary itself could consult some userspace
system policy and force pB (aka X) to zero or at least 'minus' the
bits you want to suppress on your system.

> Does this seem reasonable?  What would such an interface look like?
> (This is scarily like the old meaning of CAP_SETPCAP....)

[No. Please don't bring that back.]

Cheers

Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-27 14:02               ` Serge E. Hallyn
@ 2011-01-27 14:42                 ` Steve Grubb
  2011-01-27 16:43                   ` Andrew G. Morgan
                                     ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Steve Grubb @ 2011-01-27 14:42 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Eric Paris, Andrew G. Morgan, Serge E. Hallyn, linux-kernel,
	linux-security-module

Hi Serge,

On Thursday, January 27, 2011 09:02:55 am Serge E. Hallyn wrote:
> What is the attack vector you're actually envisioning?  Does some
> trojan come in and overwrite a program which which it hopes the
> kernel will execute?  Or is there just an existing vuln in such
> a program?  Are there other ways we can address these?  Can we find
> a way to classify the kernel-spawned userspace programs?  Perhaps
> based on the selinux context assigned to the program, we can assign
> some level of trust that noone could have modified the source?


I think that what is causing the confusion is that we are considering a different 
threat model than the normal, historic view. The way it's normally viewed, if you have 
root, you can do anything you want to a machine. The threat model revolves around 
becoming root on a machine and defense rests on splitting root so a complete system 
compromise might not occur.

Today, people want to have multi-tenant hosting using virtual machines whereby they 
give away root control of the guest VM. If you were renting system space, you would 
expect root access. That would make a nice juicy hacking target because you don't know 
who else is sharing the physical machine with you and they might have something in 
their VM worth stealing.

So, the threat model becomes how do we prevent one guest from attacking another? We 
have sVirt which prevents resource based attacks from occurring. It's pretty effective 
for that. However, what if the bad guy wants to start attacking the hypervisor 
directly in an effort to start attacking the host OS? 

They need to be able to run arbitrary code in ring 0 of the VM. That means the hosting 
provider might want to eliminate some capabilities from the whole kernel so that they 
have some assurance that a root user cannot get arbitrary code running in ring 0 
without knowing a kernel level exploit. Also assume that the root user has no control 
over the kernel or modules or initrd which are kept on a read only partition enforced 
by the hypervisor. And the hosting provider will make kernel updates as kernel 
security releases are made.

This kind of turns around some of the threat modeling that people have always made. 
There are not a whole lot of changes that need to be made. I think there was one other 
patch that we needed to prevent arbitrary code injection. Eric's initial patch was 
overly generous in my opinion. It allowed further modification of the global bounding 
set after boot had finished and could probably be used for mischief as pointed out. 
Perhaps the setting should be immutable after any change to it - which is really how 
it's intended to be used. Or maybe even only a subset of the bounding set is modifiable.

Using a wrapper program is a NOGO because the admin renting the machine would be able 
to overwrite the wrapper and then they have arbitrary code running with full privs and 
we trust it will do the right thing. We need all modification to the running kernel out 
of reach from root in that VM.

-Steve

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-27 14:42                 ` Steve Grubb
@ 2011-01-27 16:43                   ` Andrew G. Morgan
       [not found]                   ` <AANLkTi=k5QeE_-iNuW3-M5K3BnBtRxk-QYO5624HKrpE@mail.gmail.com>
  2011-01-28 18:49                   ` Serge E. Hallyn
  2 siblings, 0 replies; 28+ messages in thread
From: Andrew G. Morgan @ 2011-01-27 16:43 UTC (permalink / raw)
  To: Steve Grubb
  Cc: Serge E. Hallyn, Eric Paris, Serge E. Hallyn, linux-kernel,
	linux-security-module

[Resend because of bounces.]

On Thu, Jan 27, 2011 at 6:42 AM, Steve Grubb <sgrubb@redhat.com> wrote:
> Hi Serge,
>
> On Thursday, January 27, 2011 09:02:55 am Serge E. Hallyn wrote:
>> What is the attack vector you're actually envisioning?  Does some
>> trojan come in and overwrite a program which which it hopes the
>> kernel will execute?  Or is there just an existing vuln in such
>> a program?  Are there other ways we can address these?  Can we find
>> a way to classify the kernel-spawned userspace programs?  Perhaps
>> based on the selinux context assigned to the program, we can assign
>> some level of trust that noone could have modified the source?
>
>
> I think that what is causing the confusion is that we are considering a different
> threat model than the normal, historic view. The way its normally viewed, if you have
> root, you can do anything you want to a machine. The threat model revolves around
> becoming root on a machine and defense rests on splitting root so a complete system
> compromise might not occur.
>
> Today, people want to have multi-tenant hosting using virtual machines whereby they
> give away root control of the guest VM. If you were renting system space, you would
> expect root access. That would make a nice juicy hacking target because you don't know
> who else is sharing the physical machine with you and they might have something in
> their VM worth stealing.
>
> So, the threat model becomes how do we prevent one guest from attacking another? We
> have sVirt which prevents resource based attacks from occurring. Its pretty effective
> for that. However, what if the bad guy wants to start attacking the hypervisor
> directly in effort to start attacking the host OS?

Which root filesystem (/) do kernel helpers run in in such a setup? I
would have expected that they occurred in the hypervisor where the
selection of helper binaries would be outside the control of the
guests. Is this not the case?

Thanks

Andrew

>
> They need to be able to run arbitrary code in ring 0 of the VM. That means the hosting
> provider might want to eliminate some capabilities from the whole kernel so that they
> have some assurance that a root user cannot get arbitrary code running in ring 0
> without knowing a kernel level exploit. Also assume that the root user has no control
> over the kernel or modules or initrd which are kept on a read only partition enforced
> by the hypervisor. And the hosting provider will make kernel updates as kernel
> security releases are made.
>
> This kind of turns around some of the threat modeling that people have always made.
> There are not a whole lot of changes that need to be made. I think there was one other
> patch that we needed to prevent arbitrary code injection. Eric's initial patch was
> overly generous in my opinion. It allowed further modification of the global bounding
> set after boot had finished and could probably be used for mischief as pointed out.
> Perhaps the setting should be immutable after any change to it - which is really how
> its intended to be used. Or maybe even only a subset of the bounding set is modifiable.
>
> Using a wrapper program is a NOGO because the admin renting the machine would be able
> to overwrite the wrapper and then they have arbitrary code running with full privs and
> we trust it will do the right thing. We need all modification to the running kernel out
> of reach from root in that VM.
>
> -Steve
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
       [not found]                   ` <AANLkTi=k5QeE_-iNuW3-M5K3BnBtRxk-QYO5624HKrpE@mail.gmail.com>
@ 2011-01-27 16:50                     ` Steve Grubb
  2011-01-28 18:19                       ` Eric Paris
  0 siblings, 1 reply; 28+ messages in thread
From: Steve Grubb @ 2011-01-27 16:50 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Eric Paris, linux-kernel, Serge E. Hallyn, Serge E. Hallyn,
	linux-security-module

On Thursday, January 27, 2011 11:35:13 am Andrew G. Morgan wrote:
> > Today, people want to have multi-tenant hosting using virtual
> > machines whereby they give away root control of the guest VM.
> > If you were renting system space, you would expect root access.
> > That would make a nice juicy hacking target because you don't know
> > who else is sharing the physical machine with you and they might
> > have something in their VM worth stealing.
> 
> Which root filesystem (/) do kernel helpers run in in such a virtual setup?

I would assume that root in the VM could umount and mount anything. Or bind mount over 
it. We really want any change to a global bounding set done before initrd finishes 
doing its thing. This way there is no chance for mischief by the time control is 
turned over to /sbin/init - which root controls.

-Steve

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-27 16:50                     ` Steve Grubb
@ 2011-01-28 18:19                       ` Eric Paris
  0 siblings, 0 replies; 28+ messages in thread
From: Eric Paris @ 2011-01-28 18:19 UTC (permalink / raw)
  To: Steve Grubb
  Cc: Andrew G. Morgan, linux-kernel, Serge E. Hallyn, Serge E. Hallyn,
	linux-security-module

On Thu, 2011-01-27 at 11:50 -0500, Steve Grubb wrote:
> On Thursday, January 27, 2011 11:35:13 am Andrew G. Morgan wrote:
> > > Today, people want to have multi-tenant hosting using virtual
> > > machines whereby they give away root control of the guest VM.
> > > If you were renting system space, you would expect root access.
> > > That would make a nice juicy hacking target because you don't know
> > > who else is sharing the physical machine with you and they might
> > > have something in their VM worth stealing.
> > 
> > Which root filesystem (/) do kernel helpers run in in such a virtual setup?
> 
> I would assume that root in the VM could umount and mount anything. Or bind mount over 
> it. We really want any change to a global bounding set done before initrd finishes 
> doing its thing. This way there is no chance for mischief by the time control is 
> turned over to /sbin/init - which root controls.

I feel like we are all starting to understand the problem.  It still
leaves me with the belief that the only 2 known ways to solve it are

1) global bounding set which bounds the pP = fI & pI rule, unlike the
per process bset

2) a mechanism to drop caps from the bset and pI of the kthread which
runs helper apps

Do others see another way?

-Eric


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-27 14:42                 ` Steve Grubb
  2011-01-27 16:43                   ` Andrew G. Morgan
       [not found]                   ` <AANLkTi=k5QeE_-iNuW3-M5K3BnBtRxk-QYO5624HKrpE@mail.gmail.com>
@ 2011-01-28 18:49                   ` Serge E. Hallyn
  2011-01-28 19:10                     ` Steve Grubb
  2 siblings, 1 reply; 28+ messages in thread
From: Serge E. Hallyn @ 2011-01-28 18:49 UTC (permalink / raw)
  To: Steve Grubb
  Cc: Serge E. Hallyn, Eric Paris, Andrew G. Morgan, Serge E. Hallyn,
	linux-kernel, linux-security-module

Quoting Steve Grubb (sgrubb@redhat.com):
> Hi Serge,
> 
> On Thursday, January 27, 2011 09:02:55 am Serge E. Hallyn wrote:
> > What is the attack vector you're actually envisioning?  Does some
> > trojan come in and overwrite a program which which it hopes the
> > kernel will execute?  Or is there just an existing vuln in such
> > a program?  Are there other ways we can address these?  Can we find
> > a way to classify the kernel-spawned userspace programs?  Perhaps
> > based on the selinux context assigned to the program, we can assign
> > some level of trust that noone could have modified the source?
> 
> 
> I think that what is causing the confusion is that we are considering a different 
> threat model than the normal, historic view. The way its normally viewed, if you have 
> root, you can do anything you want to a machine. The threat model revolves around 
> becoming root on a machine and defense rests on splitting root so a complete system 
> compromise might not occur.

I wasn't looking at it as root being all-powerful, but rather as kernel
being all-powerful.

> Today, people want to have multi-tenant hosting using virtual machines whereby they 
> give away root control of the guest VM. If you were renting system space, you would 
> expect root access. That would make a nice juicy hacking target because you don't know 
> who else is sharing the physical machine with you and they might have something in 
> their VM worth stealing.
> 
> So, the threat model becomes how do we prevent one guest from attacking another? We 
> have sVirt which prevents resource based attacks from occurring. Its pretty effective 
> for that. However, what if the bad guy wants to start attacking the hypervisor 
> directly in effort to start attacking the host OS? 

I always assume they are :)

> They need to be able to run arbitrary code in ring 0 of the VM. That means the hosting 
> provider might want to eliminate some capabilities from the whole kernel so that they 
> have some assurance that a root user cannot get arbitrary code running in ring 0 
> without knowing a kernel level exploit. Also assume that the root user has no control 
> over the kernel or modules or initrd which are kept on a read only partition enforced 
> by the hypervisor. And the hosting provider will make kernel updates as kernel 
> security releases are made.
> 
> This kind of turns around some of the threat modeling that people have always made. 
> There are not a whole lot of changes that need to be made. I think there was one other 
> patch that we needed to prevent arbitrary code injection. Eric's initial patch was 
> overly generous in my opinion. It allowed further modification of the global bounding 
> set after boot had finished and could probably be used for mischief as pointed out. 
> Perhaps the setting should be immutable after any change to it - which is really how 
> its intended to be used. Or maybe even only a subset of the bounding set is modifiable.
> 
> Using a wrapper program is a NOGO because the admin renting the machine would be able 
> to overwrite the wrapper and then they have arbitrary code running with full privs and 

Not sure I've got this.  Wrapper program in the VM he can over-write,
but then he can overwrite the kernel too.  But what we are worried about
is the host, so you must mean that.  But if the wrapper program is of
type noone_may_write_this_t, then wouldn't finding a way to replace that
be as hard as overwriting the host kernel?  Which, of course, still
remains as a viable attack vector for the guest admin, whether you have
this bounding set or not.

In other words, we have to accept that the TCB is always not just the
kernel, but some user-space too.  And yes, the wrapper program here
would be part of the TCB.

Again, what I like about the wrapper idea is that it plugs in at one
place in the kernel tree (kernel/kthread.c), and can do other setup
if the host admin likes.

> we trust it will do the right thing. We need all modification to the running kernel out 
> of reach from root in that VM.

-serge

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-28 18:49                   ` Serge E. Hallyn
@ 2011-01-28 19:10                     ` Steve Grubb
  2011-01-28 19:38                       ` Serge E. Hallyn
  0 siblings, 1 reply; 28+ messages in thread
From: Steve Grubb @ 2011-01-28 19:10 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Eric Paris, Andrew G. Morgan, Serge E. Hallyn, linux-kernel,
	linux-security-module

On Friday, January 28, 2011 01:49:01 pm Serge E. Hallyn wrote:
> > Using a wrapper program is a NOGO because the admin renting the machine
> > would be able  to overwrite the wrapper and then they have arbitrary
> > code running with full privs and
> 
> Not sure I've got this.  Wrapper program in the VM he can over-write,
> but then he can overwrite the kernel too.

No, because the kernel is only read in at boot. After that, /boot can disappear and it 
won't matter. It can be replaced with something and that won't matter because that's 
not the real boot partition.

> But what we are worried about is the host, so you must mean that.  But if the
> wrapper program is of type noone_may_write_this_t, then wouldn't finding a way to
> replace that be as hard as overwriting the host kernel? 

No, because we aren't taking away the ability to mount or unmount. Not to mention that 
root can replace his selinux policy so that next boot it doesn't define 
noone_may_write_this_t. He might even put selinux in his VM in permissive.

> Which, of course, still remains as a viable attack vector for the guest admin,
> whether you have this bounding set or not.

No, with the bounding set, any external call the kernel makes has the bounding set 
applied. This means we don't have to further restrict root in unnatural ways.

> In other words, we have to accept that the TCB is always not just the
> kernel, but some user-space too.  And yes, the wrapper program here
> would be part of the TCB.

If you give someone root access in the VM, they probably want to set things up their 
way. So, we really would like it if all the security mechanism were inside where they 
can't be easily tampered with.

-Steve

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-28 19:10                     ` Steve Grubb
@ 2011-01-28 19:38                       ` Serge E. Hallyn
  2011-01-28 22:24                         ` Eric Paris
  2011-02-01 18:17                         ` Eric Paris
  0 siblings, 2 replies; 28+ messages in thread
From: Serge E. Hallyn @ 2011-01-28 19:38 UTC (permalink / raw)
  To: Steve Grubb
  Cc: Serge E. Hallyn, Eric Paris, Andrew G. Morgan, Serge E. Hallyn,
	linux-kernel, linux-security-module

Quoting Steve Grubb (sgrubb@redhat.com):
> On Friday, January 28, 2011 01:49:01 pm Serge E. Hallyn wrote:
> > > Using a wrapper program is a NOGO because the admin renting the machine
> > > would be able  to overwrite the wrapper and then they have arbitrary
> > > code running with full privs and
> > 
> > Not sure I've got this.  Wrapper program in the VM he can over-write,
> > but then he can overwrite the kernel too.
> 
> No, because the kernel is only read in at boot. After that, /boot can disapear and it 

And you can set it up so userspace cannot remount it, I assume?

> won't matter. It can be replaced with something and that won't matter because that's 
> not the real boot partition.
> 
> > But what we are worried about is the host, so you must mean that.  But if the
> > wrapper program is of type noone_may_write_this_t, then wouldn't finding a way to
> > replace that be as hard as overwriting the host kernel? 
> 
> No, because we aren't taking away the ability to mount or unmount. Not to mention that 
> root can replace his selinux policy so that next boot it doesn't define 
> noone_may_write_this_t. He might even put selinux in his VM in permissive.
> 
> > Which, of course, still remains as a viable attack vector for the guest admin,
> > whether you have this bounding set or not.
> 
> No, with the bounding set, any external call the kernel makes has the bounding set 
> applied. This means we don't have to further restrict root in unnatural ways.
> 
> > In other words, we have to accept that the TCB is always not just the
> > kernel, but some user-space too.  And yes, the wrapper program here
> > would be part of the TCB.
> 
> If you give someone root access in the VM, they probably want to set things up their 
> way. So, we really would like it if all the security mechanism were inside where they 
> can't be easily tampered with.

That's cool  :)

Thanks for the elaboration, that's very interesting and helpful.

-serge

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-28 19:38                       ` Serge E. Hallyn
@ 2011-01-28 22:24                         ` Eric Paris
  2011-02-01 18:17                         ` Eric Paris
  1 sibling, 0 replies; 28+ messages in thread
From: Eric Paris @ 2011-01-28 22:24 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Steve Grubb, Andrew G. Morgan, Serge E. Hallyn, linux-kernel,
	linux-security-module

On Fri, 2011-01-28 at 13:38 -0600, Serge E. Hallyn wrote:
> Quoting Steve Grubb (sgrubb@redhat.com):
> > On Friday, January 28, 2011 01:49:01 pm Serge E. Hallyn wrote:
> > > > Using a wrapper program is a NOGO because the admin renting the machine
> > > > would be able  to overwrite the wrapper and then they have arbitrary
> > > > code running with full privs and
> > > 
> > > Not sure I've got this.  Wrapper program in the VM he can over-write,
> > > but then he can overwrite the kernel too.
> > 
> > No, because the kernel is only read in at boot. After that, /boot can disapear and it 
> 
> And you can set it up so userspace cannot remount it, I assume?

Undecided at this time.  There are 2 possibilities.  We will either
expose a small partition to the VM which the hypervisor enforces RO
access which contains the kernel, initrd, and bootloader (so basically
a /boot).  This is what I have been thinking.  Or we may just directly
launch a kernel and initrd from the hypervisor and those files need
never be exposed to the VM at all.  In any case, there will be no
possibility that root in the VM will be able to modify their kernel or
initrd.  The plan is to implement capability restrictions inside the
initrd.

> > > In other words, we have to accept that the TCB is always not just the
> > > kernel, but some user-space too.  And yes, the wrapper program here
> > > would be part of the TCB.

That's correct, the TCB is going to be the kernel+initrd.  We must
accomplish all lockdowns inside the initrd before control is passed to
the root admin.

The helper script idea doesn't appear to meet the goal since the admin
would have control over the complete filesystem namespace and would be
able to bypass it altogether (sgrubb mentioned bind mounting on top of it,
even if they couldn't overwrite it)


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-01-28 19:38                       ` Serge E. Hallyn
  2011-01-28 22:24                         ` Eric Paris
@ 2011-02-01 18:17                         ` Eric Paris
  2011-02-01 21:26                           ` Serge E. Hallyn
  1 sibling, 1 reply; 28+ messages in thread
From: Eric Paris @ 2011-02-01 18:17 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Steve Grubb, Andrew G. Morgan, Serge E. Hallyn, linux-kernel,
	linux-security-module

What are we thinking?  Any suggestions how to do what we need other than

global bounding such that  pP' = gbset & (fI | pI)

Or an interface in which I can force things out of the bset and pI of
other tasks?  Possibly the interface could be specific to the "khelper"
thread?

-Eric


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-02-01 18:17                         ` Eric Paris
@ 2011-02-01 21:26                           ` Serge E. Hallyn
  2011-02-02  4:02                             ` Andrew G. Morgan
  0 siblings, 1 reply; 28+ messages in thread
From: Serge E. Hallyn @ 2011-02-01 21:26 UTC (permalink / raw)
  To: Eric Paris
  Cc: Serge E. Hallyn, Steve Grubb, Andrew G. Morgan, Serge E. Hallyn,
	linux-kernel, linux-security-module

Quoting Eric Paris (eparis@redhat.com):
> What are we thinking?  Any suggestions how to do what we need other than
> 
> global bounding such that  pP' = gbset & (fI | pI)

That should be sufficient for what you want.

I would however like to hear whether Andrew has had any other ideas
given the broader picture.

> Or an interface in which I can force things out of the bset and pI of
> other tasks?  Possibly the interface could be specific to the "khelper"
> thread?

No no no no no :)

thanks,
-serge

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-02-01 21:26                           ` Serge E. Hallyn
@ 2011-02-02  4:02                             ` Andrew G. Morgan
  2011-02-08  2:55                               ` Eric Paris
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew G. Morgan @ 2011-02-02  4:02 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Eric Paris, Steve Grubb, Serge E. Hallyn, linux-kernel,
	linux-security-module

On Tue, Feb 1, 2011 at 1:26 PM, Serge E. Hallyn
<serge.hallyn@canonical.com> wrote:
> Quoting Eric Paris (eparis@redhat.com):
>> What are we thinking?  Any suggestions how to do what we need other than
>>
>> global bounding such that  pP' = gbset & (fI | pI)
>
> That should be sufficient for what you want.
>
> I would however like to hear whether Andrew has had any other ideas
> given the broader picture.
>

I think I now see what you are after.

You want some sort of transient TCB that can lock all of the doors you
care to lock and then run the whole system in a partially crippled
sandbox.

I have some concerns about how you know you have truly locked down the
system and question the viability of a VM that doesn't virtualize IO
too, but presumably you have some way to protect the storage of the
kernel binary and initrd that cannot be overcome, and protections from
DMA etc. being used by the guest to overwrite kernel memory.

In this case, I would like to suggest what you need is a user
configurable state for kernel threads to launch helper programs - a
kernel side equivalent to Serge's wrapper idea.  I continue to
dislike the global bounding set idea, but I would support a base
credential set for this kthread launcher. I'd include pI, bset, and
securebits and uid as something your initrd could initialize away from
their default values for kernel launched helper binaries. I'd prefer
it if you allowed the regular capability convolution rules to apply
and propagate this bounding set for all kernel launched binaries, and
also add the relevant code to init to enforce your desired bounding
set for init parented processes.

This way you will both meet your current needs and also maintain
support for a capability managed 'raw' kernel experience with no
asynchronous capability manipulation system-wide.

Cheers

Andrew


>> Or an interface in which I can force things out of the bset and pI of
>> other tasks?  Possibly the interface could be specific to the "khelper"
>> thread?
>
> No no no no no :)
>
> thanks,
> -serge
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-02-02  4:02                             ` Andrew G. Morgan
@ 2011-02-08  2:55                               ` Eric Paris
  2011-02-14 20:45                                 ` Eric Paris
  2011-02-18  0:29                                 ` Serge E. Hallyn
  0 siblings, 2 replies; 28+ messages in thread
From: Eric Paris @ 2011-02-08  2:55 UTC (permalink / raw)
  To: Andrew G. Morgan
  Cc: Serge E. Hallyn, Steve Grubb, Serge E. Hallyn, linux-kernel,
	linux-security-module

On Tue, 2011-02-01 at 20:02 -0800, Andrew G. Morgan wrote:

> In this case, I would like to suggest what you need is a user
> configurable state for kernel threads to launch helper programs - a
> kernel side equivalent to Sergey's wrapper idea.  I continue to
> dislike the global bounding set idea, but I would support a base
> credential set for this kthread launcher. I'd include pI, bset, and
> securebits and uid as something your initrd could initialize away from
> their default values for kernel launched helper binaries. I'd prefer
> it if you allowed the regular capability convolution rules to apply
> and propagate this bounding set for all kernel launched binaries, and
> also add the relevant code to init to enforce your desired bounding
> set for init parented processes.


> >> Or an interface in which I can force things out of the bset and pI of
> >> other tasks?  Possibly the interface could be specific to the "khelper"
> >> thread?
> >
> > No no no no no :)

Below is what I'm working on.  I've asked dhowells to review the creds
code, since commit_creds() does not take const.  Maybe that's just an
oversight.  Basically I've exposed two new sysctls.

/proc/sys/kernel/usermodehelper/bset
/proc/sys/kernel/usermodehelper/inheritable

You must have CAP_SYS_MODULE to change these (changes are &= ONLY).
When the kernel launches a usermodehelper it will do so with these as
the bset and pI.  I haven't attempted securebits and uid (since I didn't
really need them I don't think)  But will if anyone can think of a use
case.

Is this what you were thinking?

-Eric

commit 23dbb6813349509a463103a34b51b18182c2ca0f
Author: Eric Paris <eparis@redhat.com>
Date:   Mon Feb 7 21:39:58 2011 -0500

    limit kthreads usermode helper words stuff

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 6efd7a7..79bb98d 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -24,6 +24,7 @@
 #include <linux/errno.h>
 #include <linux/compiler.h>
 #include <linux/workqueue.h>
+#include <linux/sysctl.h>
 
 #define KMOD_PATH_LEN 256
 
@@ -109,6 +110,8 @@ call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait)
 				       NULL, NULL, NULL);
 }
 
+extern struct ctl_table usermodehelper_table[];
+
 extern void usermodehelper_init(void);
 
 extern int usermodehelper_disable(void);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9cd0591..d38be14 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
 #include <linux/kmod.h>
 #include <linux/slab.h>
 #include <linux/completion.h>
+#include <linux/cred.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/workqueue.h>
@@ -43,6 +44,12 @@ extern int max_threads;
 
 static struct workqueue_struct *khelper_wq;
 
+#define CAP_BSET	(void *)1
+#define CAP_PI		(void *)2
+
+kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
+kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
+
 #ifdef CONFIG_MODULES
 
 /*
@@ -132,6 +139,8 @@ EXPORT_SYMBOL(__request_module);
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
 	spin_lock_irq(&current->sighand->siglock);
@@ -153,10 +162,23 @@ static int ____call_usermodehelper(void *data)
 			goto fail;
 	}
 
+	retval = -ENOMEM;
+	new = prepare_kernel_cred(current);
+	if (!new)
+		goto fail;
+
+	new->cap_bset = usermodehelper_bset;
+	new->cap_inheritable = usermodehelper_inheritable; 
+
+	old = get_cred(current_cred());
+	commit_creds(new);
+
 	retval = kernel_execve(sub_info->path,
 			       (const char *const *)sub_info->argv,
 			       (const char *const *)sub_info->envp);
 
+	commit_creds(old);
+
 	/* Exec failed? */
 fail:
 	sub_info->retval = retval;
@@ -418,6 +440,79 @@ unlock:
 }
 EXPORT_SYMBOL(call_usermodehelper_exec);
 
+static int proc_cap_handler(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
+	kernel_cap_t new_cap;
+	int err, i;
+
+	if (write && !capable(CAP_SYS_MODULE))
+		return -EPERM;
+
+	/*
+	 * convert from the global kernel_cap_t to the ulong array to print to
+	 * userspace if this is a read.
+	 */
+	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)  {
+		if (table->data == CAP_BSET)
+			cap_array[i] = usermodehelper_bset.cap[i];
+		else if (table->data == CAP_PI)
+			cap_array[i] = usermodehelper_inheritable.cap[i];
+		else
+			BUG();
+	}
+
+	t = *table;
+	t.data = &cap_array;
+
+	/*
+	 * actually read or write an array of ulongs from userspace.  Remember
+	 * these are least significant 32 bits first
+	 */
+	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+
+	/*
+	 * convert from the sysctl array of ulongs to the kernel_cap_t
+	 * internal representation
+	 */
+	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+		new_cap.cap[i] = cap_array[i];
+
+	/*
+	 * Drop everything not in the new_cap (but don't add things)
+	 */
+	if (write) {
+		if (table->data == CAP_BSET)
+			usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
+		if (table->data == CAP_PI)
+			usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+	}
+
+	return 0;
+}
+
+struct ctl_table usermodehelper_table[] = {
+	{
+		.procname	= "bset",
+		.data		= CAP_BSET,
+		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+		.mode		= 0600,
+		.proc_handler	= proc_cap_handler,
+	},
+	{
+		.procname	= "inheritable",
+		.data		= CAP_PI,
+		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+		.mode		= 0600,
+		.proc_handler	= proc_cap_handler,
+	},
+	{ }
+};
+
 void __init usermodehelper_init(void)
 {
 	khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0f1bd83..099d2e2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -56,6 +56,7 @@
 #include <linux/kprobes.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
+#include <linux/kmod.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -617,6 +618,11 @@ static struct ctl_table kern_table[] = {
 		.child		= random_table,
 	},
 	{
+		.procname	= "usermodehelper",
+		.mode		= 0555,
+		.child		= usermodehelper_table,
+	},
+	{
 		.procname	= "overflowuid",
 		.data		= &overflowuid,
 		.maxlen		= sizeof(int),



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-02-08  2:55                               ` Eric Paris
@ 2011-02-14 20:45                                 ` Eric Paris
  2011-02-14 21:24                                   ` Serge E. Hallyn
  2011-02-18  0:29                                 ` Serge E. Hallyn
  1 sibling, 1 reply; 28+ messages in thread
From: Eric Paris @ 2011-02-14 20:45 UTC (permalink / raw)
  To: Eric Paris
  Cc: Andrew G. Morgan, Serge E. Hallyn, Steve Grubb, Serge E. Hallyn,
	linux-kernel, linux-security-module

On Mon, Feb 7, 2011 at 9:55 PM, Eric Paris <eparis@redhat.com> wrote:
>
> Below is what I'm working on.  I've asked dhowells to review the creds
> code, since commit_creds() does not take const.  Maybe that's just an
> oversight.  Basically I've exposed two new sysctls.
>
> /proc/sys/kernel/usermodehelper/bset
> /proc/sys/kernel/usermodehelper/inheritable
>
> You must have CAP_SYS_MODULE to change these (changes are &= ONLY).
> When the kernel launches a usermodehelper it will do so with these as
> the bset and pI.  I haven't attempted securebits and uid (since I didn't
> really need them I don't think)  But will if anyone can think of a use
> case.
>
> Is this what you were thinking?


Anything?  Problems with this patch/approach?

-Eric

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-02-14 20:45                                 ` Eric Paris
@ 2011-02-14 21:24                                   ` Serge E. Hallyn
  0 siblings, 0 replies; 28+ messages in thread
From: Serge E. Hallyn @ 2011-02-14 21:24 UTC (permalink / raw)
  To: Eric Paris
  Cc: Eric Paris, Andrew G. Morgan, Serge E. Hallyn, Steve Grubb,
	Serge E. Hallyn, linux-kernel, linux-security-module

Quoting Eric Paris (eparis@parisplace.org):
> On Mon, Feb 7, 2011 at 9:55 PM, Eric Paris <eparis@redhat.com> wrote:
> >
> > Below is what I'm working on.  I've asked dhowells to review the creds
> > code, since commit_creds() does not take const.  Maybe that's just an
> > oversight.  Basically I've exposed two new sysctls.
> >
> > /proc/sys/kernel/usermodehelper/bset
> > /proc/sys/kernel/usermodehelper/inheritable
> >
> > You must have CAP_SYS_MODULE to change these (changes are &= ONLY).
> > When the kernel launches a usermodehelper it will do so with these as
> > the bset and pI.  I haven't attempted securebits and uid (since I didn't
> > really need them I don't think)  But will if anyone can think of a use
> > case.
> >
> > Is this what you were thinking?
> 
> 
> Anything?  Problems with this patch/approach?

Sorry, I've just not had a chance to take a close enough look.  I'll
try to do so tonight.

-serge

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] System Wide Capability Bounding Set
  2011-02-08  2:55                               ` Eric Paris
  2011-02-14 20:45                                 ` Eric Paris
@ 2011-02-18  0:29                                 ` Serge E. Hallyn
  1 sibling, 0 replies; 28+ messages in thread
From: Serge E. Hallyn @ 2011-02-18  0:29 UTC (permalink / raw)
  To: Eric Paris
  Cc: Andrew G. Morgan, Serge E. Hallyn, Steve Grubb, Serge E. Hallyn,
	linux-kernel, linux-security-module

Quoting Eric Paris (eparis@redhat.com):
> On Tue, 2011-02-01 at 20:02 -0800, Andrew G. Morgan wrote:
> 
> > In this case, I would like to suggest what you need is a user
> > configurable state for kernel threads to launch helper programs - a
> > kernel side equivalent to Sergey's wrapper idea.  I continue to
> > dislike the global bounding set idea, but I would support a base
> > credential set for this kthread launcher. I'd include pI, bset, and
> > securebits and uid as something your initrd could initialize away from
> > their default values for kernel launched helper binaries. I'd prefer
> > it if you allowed the regular capability convolution rules to apply
> > and propagate this bounding set for all kernel launched binaries, and
> > also add the relevant code to init to enforce your desired bounding
> > set for init parented processes.
> 
> 
> > >> Or an interface in which I can force things out of the bset and pI of
> > >> other tasks?  Possibly the interface could be specific to the "khelper"
> > >> thread?
> > >
> > > No no no no no :)
> 
> Below is what I'm working on.  I've asked dhowells to review the creds
> code, since commit_creds() does not take const.  Maybe that's just an
> oversight.  Basically I've exposed two new sysctls.
> 
> /proc/sys/kernel/usermodehelper/bset
> /proc/sys/kernel/usermodehelper/inheritable
> 
> You must have CAP_SYS_MODULE to change these (changes are &= ONLY).
> When the kernel launches a usermodehelper it will do so with these as
> the bset and pI.  I haven't attempted securebits and uid (since I didn't
> really need them I don't think)  But will if anyone can think of a use
> case.
> 
> Is this what you were thinking?

Sorry about the wait.

No objection from me.

If someone ends up wanting to do more generic security context tweaking
we can worry about that later.

thanks,
-serge

> -Eric
> 
> commit 23dbb6813349509a463103a34b51b18182c2ca0f
> Author: Eric Paris <eparis@redhat.com>
> Date:   Mon Feb 7 21:39:58 2011 -0500
> 
>     limit kthreads usermode helper words stuff
> 
> diff --git a/include/linux/kmod.h b/include/linux/kmod.h
> index 6efd7a7..79bb98d 100644
> --- a/include/linux/kmod.h
> +++ b/include/linux/kmod.h
> @@ -24,6 +24,7 @@
>  #include <linux/errno.h>
>  #include <linux/compiler.h>
>  #include <linux/workqueue.h>
> +#include <linux/sysctl.h>
>  
>  #define KMOD_PATH_LEN 256
>  
> @@ -109,6 +110,8 @@ call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait)
>  				       NULL, NULL, NULL);
>  }
>  
> +extern struct ctl_table usermodehelper_table[];
> +
>  extern void usermodehelper_init(void);
>  
>  extern int usermodehelper_disable(void);
> diff --git a/kernel/kmod.c b/kernel/kmod.c
> index 9cd0591..d38be14 100644
> --- a/kernel/kmod.c
> +++ b/kernel/kmod.c
> @@ -25,6 +25,7 @@
>  #include <linux/kmod.h>
>  #include <linux/slab.h>
>  #include <linux/completion.h>
> +#include <linux/cred.h>
>  #include <linux/file.h>
>  #include <linux/fdtable.h>
>  #include <linux/workqueue.h>
> @@ -43,6 +44,12 @@ extern int max_threads;
>  
>  static struct workqueue_struct *khelper_wq;
>  
> +#define CAP_BSET	(void *)1
> +#define CAP_PI		(void *)2
> +
> +kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
> +kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
> +
>  #ifdef CONFIG_MODULES
>  
>  /*
> @@ -132,6 +139,8 @@ EXPORT_SYMBOL(__request_module);
>  static int ____call_usermodehelper(void *data)
>  {
>  	struct subprocess_info *sub_info = data;
> +	const struct cred *old;
> +	struct cred *new;
>  	int retval;
>  
>  	spin_lock_irq(&current->sighand->siglock);
> @@ -153,10 +162,23 @@ static int ____call_usermodehelper(void *data)
>  			goto fail;
>  	}
>  
> +	retval = -ENOMEM;
> +	new = prepare_kernel_cred(current);
> +	if (!new)
> +		goto fail;
> +
> +	new->cap_bset = usermodehelper_bset;
> +	new->cap_inheritable = usermodehelper_inheritable; 
> +
> +	old = get_cred(current_cred());
> +	commit_creds(new);
> +
>  	retval = kernel_execve(sub_info->path,
>  			       (const char *const *)sub_info->argv,
>  			       (const char *const *)sub_info->envp);
>  
> +	commit_creds(old);
> +
>  	/* Exec failed? */
>  fail:
>  	sub_info->retval = retval;
> @@ -418,6 +440,79 @@ unlock:
>  }
>  EXPORT_SYMBOL(call_usermodehelper_exec);
>  
> +static int proc_cap_handler(struct ctl_table *table, int write,
> +			 void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	struct ctl_table t;
> +	unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
> +	kernel_cap_t new_cap;
> +	int err, i;
> +
> +	if (write && !capable(CAP_SYS_MODULE))
> +		return -EPERM;
> +
> +	/*
> +	 * convert from the global kernel_cap_t to the ulong array to print to
> +	 * userspace if this is a read.
> +	 */
> +	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)  {
> +		if (table->data == CAP_BSET)
> +			cap_array[i] = usermodehelper_bset.cap[i];
> +		else if (table->data == CAP_PI)
> +			cap_array[i] = usermodehelper_inheritable.cap[i];
> +		else
> +			BUG();
> +	}
> +
> +	t = *table;
> +	t.data = &cap_array;
> +
> +	/*
> +	 * actually read or write and array of ulongs from userspace.  Remember
> +	 * these are least significant 32 bits first
> +	 */
> +	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
> +	if (err < 0)
> +		return err;
> +
> +	/*
> +	 * convert from the sysctl array of ulongs to the kernel_cap_t
> +	 * internal representation
> +	 */
> +	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
> +		new_cap.cap[i] = cap_array[i];
> +
> +	/*
> +	 * Drop everything not in the new_cap (but don't add things)
> +	 */
> +	if (write) {
> +		if (table->data == CAP_BSET)
> +			usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
> +		if (table->data == CAP_PI)
> +			usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
> +	}
> +
> +	return 0;
> +}
> +
> +struct ctl_table usermodehelper_table[] = {
> +	{
> +		.procname	= "bset",
> +		.data		= CAP_BSET,
> +		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
> +		.mode		= 0600,
> +		.proc_handler	= proc_cap_handler,
> +	},
> +	{
> +		.procname	= "inheritable",
> +		.data		= CAP_PI,
> +		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
> +		.mode		= 0600,
> +		.proc_handler	= proc_cap_handler,
> +	},
> +	{ }
> +};
> +
>  void __init usermodehelper_init(void)
>  {
>  	khelper_wq = create_singlethread_workqueue("khelper");
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 0f1bd83..099d2e2 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -56,6 +56,7 @@
>  #include <linux/kprobes.h>
>  #include <linux/pipe_fs_i.h>
>  #include <linux/oom.h>
> +#include <linux/kmod.h>
>  
>  #include <asm/uaccess.h>
>  #include <asm/processor.h>
> @@ -617,6 +618,11 @@ static struct ctl_table kern_table[] = {
>  		.child		= random_table,
>  	},
>  	{
> +		.procname	= "usermodehelper",
> +		.mode		= 0555,
> +		.child		= usermodehelper_table,
> +	},
> +	{
>  		.procname	= "overflowuid",
>  		.data		= &overflowuid,
>  		.maxlen		= sizeof(int),
> 
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2011-02-18  0:29 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-05 22:25 [PATCH] System Wide Capability Bounding Set Eric Paris
2011-01-06 11:30 ` Tetsuo Handa
2011-01-06 16:44   ` Theodore Tso
2011-01-11 22:02 ` Serge E. Hallyn
2011-01-11 22:12   ` Serge E. Hallyn
2011-01-14 19:50   ` Eric Paris
2011-01-17  3:16     ` Andrew G. Morgan
2011-01-21 21:25       ` Eric Paris
2011-01-23  3:39         ` Andrew G. Morgan
2011-01-24 21:40           ` Serge Hallyn
2011-01-26 23:34             ` Eric Paris
2011-01-27 14:02               ` Serge E. Hallyn
2011-01-27 14:42                 ` Steve Grubb
2011-01-27 16:43                   ` Andrew G. Morgan
     [not found]                   ` <AANLkTi=k5QeE_-iNuW3-M5K3BnBtRxk-QYO5624HKrpE@mail.gmail.com>
2011-01-27 16:50                     ` Steve Grubb
2011-01-28 18:19                       ` Eric Paris
2011-01-28 18:49                   ` Serge E. Hallyn
2011-01-28 19:10                     ` Steve Grubb
2011-01-28 19:38                       ` Serge E. Hallyn
2011-01-28 22:24                         ` Eric Paris
2011-02-01 18:17                         ` Eric Paris
2011-02-01 21:26                           ` Serge E. Hallyn
2011-02-02  4:02                             ` Andrew G. Morgan
2011-02-08  2:55                               ` Eric Paris
2011-02-14 20:45                                 ` Eric Paris
2011-02-14 21:24                                   ` Serge E. Hallyn
2011-02-18  0:29                                 ` Serge E. Hallyn
2011-01-27 14:26               ` Andrew G. Morgan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.