All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] user namespace: make signal.c respect user namespaces
@ 2011-09-19 21:45 Serge E. Hallyn
  2011-09-19 21:47 ` [PATCH] user namespace: usb: make usb urbs user namespace aware Serge E. Hallyn
                   ` (2 more replies)
  0 siblings, 3 replies; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-19 21:45 UTC (permalink / raw)
  To: lkml
  Cc: richard, Andrew Morton, Oleg Nesterov, Eric W. Biederman,
	Tejun Heo, serge, serge.hallyn

__send_signal: convert the uid being sent in SI_USER to the target task's
user namespace.

do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
 user namespace

ptrace_signal: map parent's uid into current's user namespace before
including in signal to current.

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tejun Heo <tj@kernel.org>
---
 kernel/signal.c |   19 +++++++++++++++----
 1 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 291c970..bb8ce03 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,6 +27,7 @@
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/nsproxy.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
@@ -1073,7 +1074,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			q->info.si_code = SI_USER;
 			q->info.si_pid = task_tgid_nr_ns(current,
 							task_active_pid_ns(t));
-			q->info.si_uid = current_uid();
+			q->info.si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
+					current_cred(), current_uid());
 			break;
 		case (unsigned long) SEND_SIG_PRIV:
 			q->info.si_signo = sig;
@@ -1618,7 +1620,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
-	info.si_uid = __task_cred(tsk)->uid;
+	info.si_uid = user_ns_map_uid(task_cred_xxx(tsk->parent, user_ns),
+				      __task_cred(tsk), __task_cred(tsk)->uid);
 	rcu_read_unlock();
 
 	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
@@ -1688,6 +1691,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	unsigned long flags;
 	struct task_struct *parent;
 	struct sighand_struct *sighand;
+	const struct cred *cred;
 
 	if (for_ptracer) {
 		parent = tsk->parent;
@@ -1703,7 +1707,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
-	info.si_uid = __task_cred(tsk)->uid;
+	cred = __task_cred(tsk);
+	info.si_uid = user_ns_map_uid(task_cred_xxx(parent, user_ns),
+				cred, cred->uid);
 	rcu_read_unlock();
 
 	info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -2118,11 +2124,16 @@ static int ptrace_signal(int signr, siginfo_t *info,
 	 * have updated *info via PTRACE_SETSIGINFO.
 	 */
 	if (signr != info->si_signo) {
+		const struct cred *pcred;
 		info->si_signo = signr;
 		info->si_errno = 0;
 		info->si_code = SI_USER;
 		info->si_pid = task_pid_vnr(current->parent);
-		info->si_uid = task_uid(current->parent);
+		rcu_read_lock();
+		pcred = __task_cred(current->parent);
+		info->si_uid = user_ns_map_uid(current_user_ns(),
+			pcred, pcred->uid);
+		rcu_read_unlock();
 	}
 
 	/* If the (new) signal is now blocked, requeue it.  */
-- 
1.7.5.4


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH] user namespace: usb: make usb urbs user namespace aware
  2011-09-19 21:45 [PATCH] user namespace: make signal.c respect user namespaces Serge E. Hallyn
@ 2011-09-19 21:47 ` Serge E. Hallyn
  2011-09-20 13:17   ` Oleg Nesterov
  2011-09-20 12:22 ` [PATCH] user namespace: make signal.c respect user namespaces Oleg Nesterov
  2011-09-20 17:48 ` [PATCH] user namespace: make signal.c respect user namespaces Oleg Nesterov
  2 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-19 21:47 UTC (permalink / raw)
  To: lkml
  Cc: richard, Andrew Morton, Oleg Nesterov, Eric W. Biederman,
	Tejun Heo, serge, serge.hallyn

Add to the dev_state and alloc_async structures the user namespace
corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
which can then implement a proper, user-namespace-aware uid check.

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Greg KH <greg@kroah.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tejun Heo <tj@kernel.org>
---
 drivers/usb/core/devio.c |   17 +++++++++++++----
 include/linux/sched.h    |    3 ++-
 kernel/signal.c          |   22 ++++++++++++++++------
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 37518df..b875a11 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -46,6 +46,7 @@
 #include <linux/cdev.h>
 #include <linux/notifier.h>
 #include <linux/security.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
 #include <linux/moduleparam.h>
@@ -68,6 +69,7 @@ struct dev_state {
 	wait_queue_head_t wait;     /* wake up if a request completed */
 	unsigned int discsignr;
 	struct pid *disc_pid;
+	struct user_namespace *user_ns;
 	uid_t disc_uid, disc_euid;
 	void __user *disccontext;
 	unsigned long ifclaimed;
@@ -79,6 +81,7 @@ struct async {
 	struct list_head asynclist;
 	struct dev_state *ps;
 	struct pid *pid;
+	struct user_namespace *user_ns;
 	uid_t uid, euid;
 	unsigned int signr;
 	unsigned int ifnum;
@@ -248,6 +251,7 @@ static struct async *alloc_async(unsigned int numisoframes)
 static void free_async(struct async *as)
 {
 	put_pid(as->pid);
+	put_user_ns(as->user_ns);
 	kfree(as->urb->transfer_buffer);
 	kfree(as->urb->setup_packet);
 	usb_free_urb(as->urb);
@@ -396,6 +400,7 @@ static void async_completed(struct urb *urb)
 	uid_t uid = 0;
 	uid_t euid = 0;
 	u32 secid = 0;
+	struct user_namespace *user_ns;
 	int signr;
 
 	spin_lock(&ps->lock);
@@ -408,6 +413,7 @@ static void async_completed(struct urb *urb)
 		sinfo.si_code = SI_ASYNCIO;
 		sinfo.si_addr = as->userurb;
 		pid = as->pid;
+		user_ns = as->user_ns;
 		uid = as->uid;
 		euid = as->euid;
 		secid = as->secid;
@@ -423,8 +429,8 @@ static void async_completed(struct urb *urb)
 	spin_unlock(&ps->lock);
 
 	if (signr)
-		kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, uid,
-				      euid, secid);
+		kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, user_ns,
+				     uid, euid, secid);
 
 	wake_up(&ps->wait);
 }
@@ -708,6 +714,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
 	ps->disc_pid = get_pid(task_pid(current));
 	ps->disc_uid = cred->uid;
 	ps->disc_euid = cred->euid;
+	ps->user_ns = get_user_ns(cred->user_ns);
 	ps->disccontext = NULL;
 	ps->ifclaimed = 0;
 	security_task_getsecid(current, &ps->secid);
@@ -749,6 +756,7 @@ static int usbdev_release(struct inode *inode, struct file *file)
 	usb_unlock_device(dev);
 	usb_put_dev(dev);
 	put_pid(ps->disc_pid);
+	put_user_ns(ps->user_ns);
 
 	as = async_getcompleted(ps);
 	while (as) {
@@ -1262,6 +1270,7 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
 	as->signr = uurb->signr;
 	as->ifnum = ifnum;
 	as->pid = get_pid(task_pid(current));
+	as->user_ns = get_user_ns(cred->user_ns);
 	as->uid = cred->uid;
 	as->euid = cred->euid;
 	security_task_getsecid(current, &as->secid);
@@ -1982,8 +1991,8 @@ static void usbdev_remove(struct usb_device *udev)
 			sinfo.si_code = SI_ASYNCIO;
 			sinfo.si_addr = ps->disccontext;
 			kill_pid_info_as_uid(ps->discsignr, &sinfo,
-					ps->disc_pid, ps->disc_uid,
-					ps->disc_euid, ps->secid);
+					ps->disc_pid, ps->user_ns,
+					ps->disc_uid, ps->disc_euid, ps->secid);
 		}
 	}
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ac2c05..2d47cc2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2166,7 +2166,8 @@ extern int force_sigsegv(int, struct task_struct *);
 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
-extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32);
+extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *,
+				struct user_namespace *, uid_t, uid_t, u32);
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
diff --git a/kernel/signal.c b/kernel/signal.c
index bb8ce03..4a00eb2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1346,13 +1346,26 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 	return error;
 }
 
+static int kill_as_uid_perm(struct task_struct *p,
+			    struct user_namespace *user_ns, uid_t uid,
+			    uid_t euid)
+{
+	const struct cred *pcred = __task_cred(p);
+	if (user_ns != pcred->user_ns)
+		return 0;
+	if (euid != pcred->suid && euid != pcred->uid &&
+	    uid  != pcred->suid && uid  != pcred->uid)
+		return 0;
+	return 1;
+}
+
 /* like kill_pid_info(), but doesn't use uid/euid of "current" */
 int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
-		      uid_t uid, uid_t euid, u32 secid)
+			 struct user_namespace *user_ns, uid_t uid, uid_t euid,
+			 u32 secid)
 {
 	int ret = -EINVAL;
 	struct task_struct *p;
-	const struct cred *pcred;
 	unsigned long flags;
 
 	if (!valid_signal(sig))
@@ -1364,10 +1377,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 		ret = -ESRCH;
 		goto out_unlock;
 	}
-	pcred = __task_cred(p);
-	if (si_fromuser(info) &&
-	    euid != pcred->suid && euid != pcred->uid &&
-	    uid  != pcred->suid && uid  != pcred->uid) {
+	if (si_fromuser(info) && !kill_as_uid_perm(p, user_ns, uid, euid)) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
-- 
1.7.5.4


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-19 21:45 [PATCH] user namespace: make signal.c respect user namespaces Serge E. Hallyn
  2011-09-19 21:47 ` [PATCH] user namespace: usb: make usb urbs user namespace aware Serge E. Hallyn
@ 2011-09-20 12:22 ` Oleg Nesterov
  2011-09-20 12:44   ` Serge E. Hallyn
                     ` (2 more replies)
  2011-09-20 17:48 ` [PATCH] user namespace: make signal.c respect user namespaces Oleg Nesterov
  2 siblings, 3 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 12:22 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/19, Serge E. Hallyn wrote:
>
> __send_signal: convert the uid being sent in SI_USER to the target task's
> user namespace.
>
> do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
>  user namespace
>
> ptrace_signal: map parent's uid into current's user namespace before
> including in signal to current.

And all of them follow the same pattern,

> @@ -1073,7 +1074,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
>  			q->info.si_code = SI_USER;
>  			q->info.si_pid = task_tgid_nr_ns(current,
>  							task_active_pid_ns(t));
> -			q->info.si_uid = current_uid();
> +			q->info.si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
> +					current_cred(), current_uid());

Up to you, but may be we can add a helper? Something like

	static inline uid_t good_name(struct task_struct *from, struct task_struct *to)
	{
		// the caller does rcu_read_lock() if needed
		const struct cred *from_cred = __task_cred(from);
		return user_ns_map_uid(task_cred_xxx(to, user_ns),
					from_cred, from_cred->uid);
	}

As for __send_signal() in particular, I guess we could do

		q->info.si_uid = from_ancestor_ns ? 0 : current_uid();

instead, but otoh perhaps it is better to use user_ns_map_uid() anyway
for consistency.

> @@ -2118,11 +2124,16 @@ static int ptrace_signal(int signr, siginfo_t *info,
>  	 * have updated *info via PTRACE_SETSIGINFO.
>  	 */
>  	if (signr != info->si_signo) {
> +		const struct cred *pcred;
>  		info->si_signo = signr;
>  		info->si_errno = 0;
>  		info->si_code = SI_USER;
>  		info->si_pid = task_pid_vnr(current->parent);
> -		info->si_uid = task_uid(current->parent);
> +		rcu_read_lock();

In this case please add rcu_read_lock() earlier, before task_pid_vnr().
It has the same (theoretical) reasons for rcu lock.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-20 12:22 ` [PATCH] user namespace: make signal.c respect user namespaces Oleg Nesterov
@ 2011-09-20 12:44   ` Serge E. Hallyn
  2011-09-20 13:41     ` Oleg Nesterov
  2011-09-20 15:39   ` [PATCH] user namespace: make signal.c respect user namespaces Serge Hallyn
  2011-09-21  5:00   ` [PATCH] user namespace: make signal.c respect user namespaces (v2) Serge E. Hallyn
  2 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-20 12:44 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Serge E. Hallyn, lkml, richard, Andrew Morton, Eric W. Biederman,
	Tejun Heo

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/19, Serge E. Hallyn wrote:
> >
> > __send_signal: convert the uid being sent in SI_USER to the target task's
> > user namespace.
> >
> > do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
> >  user namespace
> >
> > ptrace_signal: map parent's uid into current's user namespace before
> > including in signal to current.
> 
> And all of them follow the same pattern,
> 
> > @@ -1073,7 +1074,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
> >  			q->info.si_code = SI_USER;
> >  			q->info.si_pid = task_tgid_nr_ns(current,
> >  							task_active_pid_ns(t));
> > -			q->info.si_uid = current_uid();
> > +			q->info.si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
> > +					current_cred(), current_uid());
> 
> Up to you, but may be we can add a helper? Something like
> 
> 	static inline uid_t good_name(struct task_struct *from, struct task_struct *to)
> 	{
> 		// the caller does rcu_read_lock() if needed
> 		const struct cred *from_cred = __task_cred(from);
> 		return user_ns_map_uid(task_cred_xxx(to, user_ns),
> 					from_cred, from_cred->uid);
> 	}

That looks great, thanks.  I couldn't think it up myself, but now that
I see it in your email, I see this would be very valuable in helping
make this code more readable  :)

> As for __send_signal() in particular, I guess we could do
> 
> 		q->info.si_uid = from_ancestor_ns ? 0 : current_uid();
> 
> instead, but otoh perhaps it is better to use user_ns_map_uid() anyway
> for consistency.

Yeah I'm torn on this, but actually I think the above with a comment
explaining it might be better.

> > @@ -2118,11 +2124,16 @@ static int ptrace_signal(int signr, siginfo_t *info,
> >  	 * have updated *info via PTRACE_SETSIGINFO.
> >  	 */
> >  	if (signr != info->si_signo) {
> > +		const struct cred *pcred;
> >  		info->si_signo = signr;
> >  		info->si_errno = 0;
> >  		info->si_code = SI_USER;
> >  		info->si_pid = task_pid_vnr(current->parent);
> > -		info->si_uid = task_uid(current->parent);
> > +		rcu_read_lock();
> 
> In this case please add rcu_read_lock() earlier, before task_pid_vnr().
> It has the same (theoretical) reasons for rcu lock.

Ah.  I stared at that for some time trying to figure out if I was
wrong about my needing it since it didn't have that :)  Will do.

Thanks for the feedback, Oleg!

-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: usb: make usb urbs user namespace aware
  2011-09-19 21:47 ` [PATCH] user namespace: usb: make usb urbs user namespace aware Serge E. Hallyn
@ 2011-09-20 13:17   ` Oleg Nesterov
  2011-09-20 13:33     ` Serge E. Hallyn
  2011-09-21  5:01     ` [PATCH] user namespace: usb: make usb urbs user namespace aware (v2) Serge E. Hallyn
  0 siblings, 2 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 13:17 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/19, Serge E. Hallyn wrote:
>
> Add to the dev_state and alloc_async structures the user namespace
> corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
> which can then implement a proper, user-namespace-aware uid check.

IOW, we add the additional "user_namespace *" member/argument, and use
it along with uid/euid.

I am not really sure, but can't we simplify this?

> @@ -68,6 +69,7 @@ struct dev_state {
>  	wait_queue_head_t wait;     /* wake up if a request completed */
>  	unsigned int discsignr;
>  	struct pid *disc_pid;
> +	struct user_namespace *user_ns;
>  	uid_t disc_uid, disc_euid;

Can't we add "const struct cred *disc_cred" and kill disc_uid/disc_euid
instead?

Then we redefine kill_pid_info_as_uid() as kill_pid_info_as_cred(...cred...),
it can use cred->cred->uid/euid directly.

devio.c does get_cred/put_cred instead of get_user_ns/put_user_ns.

What do you think?

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: usb: make usb urbs user namespace aware
  2011-09-20 13:17   ` Oleg Nesterov
@ 2011-09-20 13:33     ` Serge E. Hallyn
  2011-09-21  5:01     ` [PATCH] user namespace: usb: make usb urbs user namespace aware (v2) Serge E. Hallyn
  1 sibling, 0 replies; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-20 13:33 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/19, Serge E. Hallyn wrote:
> >
> > Add to the dev_state and alloc_async structures the user namespace
> > corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
> > which can then implement a proper, user-namespace-aware uid check.
> 
> IOW, we add the additional "user_namespace *" member/argument, and use
> it along with uid/euid.
> 
> I am not really sure, but can't we simplify this?
> 
> > @@ -68,6 +69,7 @@ struct dev_state {
> >  	wait_queue_head_t wait;     /* wake up if a request completed */
> >  	unsigned int discsignr;
> >  	struct pid *disc_pid;
> > +	struct user_namespace *user_ns;
> >  	uid_t disc_uid, disc_euid;
> 
> Can't we add "const struct cred *disc_cred" and kill disc_uid/disc_euid
> instead?
> 
> Then we redefine kill_pid_info_as_uid() as kill_pid_info_as_cred(...cred...),
> it can use cred->cred->uid/euid directly.
> 
> devio.c does get_cred/put_cred instead of get_user_ns/put_user_ns.
> 
> What do you think?

Yeah, just as file->f_cred does.  Sounds good.

I'll re-send both these patches with your suggestions applied.  Thanks!

-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-20 12:44   ` Serge E. Hallyn
@ 2011-09-20 13:41     ` Oleg Nesterov
  2011-09-20 14:39       ` [PATCH 0/2] (Was: user namespace: make signal.c respect user namespaces) Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 13:41 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Serge E. Hallyn, lkml, richard, Andrew Morton, Eric W. Biederman,
	Tejun Heo

On 09/20, Serge E. Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> >
> > 	static inline uid_t good_name(struct task_struct *from, struct task_struct *to)
> > 	{
> > 		// the caller does rcu_read_lock() if needed
> > 		const struct cred *from_cred = __task_cred(from);
> > 		return user_ns_map_uid(task_cred_xxx(to, user_ns),
> > 					from_cred, from_cred->uid);
> > 	}
>
> That looks great, thanks.  I couldn't think it up myself, but now that
> I see it in your email, I see this would be very valuable in helping
> make this code more readable  :)

Damn ;) The problem is, "the caller does rcu_read_lock() if needed"
can't shut up __rcu_dereference_check(). current_cred() passes c == true
to do this.

This reminds me, __task_cred()->task_is_dead() should go away. Probably
we can replace it with (__t == current). Otherwise send_signal() needs
rcu_read_lock() to avoid the warning from lockdep, or the helper needs
the unconditional rcu_read_lock().

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH 0/2] (Was: user namespace: make signal.c respect user namespaces)
  2011-09-20 13:41     ` Oleg Nesterov
@ 2011-09-20 14:39       ` Oleg Nesterov
  2011-09-20 14:39         ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check Oleg Nesterov
                           ` (3 more replies)
  0 siblings, 4 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 14:39 UTC (permalink / raw)
  To: Serge E. Hallyn, Andrew Morton, David Howells, Paul E. McKenney
  Cc: Serge E. Hallyn, lkml, richard, Eric W. Biederman, Tejun Heo

On 09/20, Oleg Nesterov wrote:
>
> On 09/20, Serge E. Hallyn wrote:
> >
> > Quoting Oleg Nesterov (oleg@redhat.com):
> > >
> > > 	static inline uid_t good_name(struct task_struct *from, struct task_struct *to)
> > > 	{
> > > 		// the caller does rcu_read_lock() if needed
> > > 		const struct cred *from_cred = __task_cred(from);
> > > 		return user_ns_map_uid(task_cred_xxx(to, user_ns),
> > > 					from_cred, from_cred->uid);
> > > 	}
> >
> > That looks great, thanks.  I couldn't think it up myself, but now that
> > I see it in your email, I see this would be very valuable in helping
> > make this code more readable  :)
>
> Damn ;) The problem is, "the caller does rcu_read_lock() if needed"
> can't shut up __rcu_dereference_check(). current_cred() passes c == true
> to do this.
>
> This reminds me, __task_cred()->task_is_dead() should go away. Probably
> we can replace it with (__t == current). Otherwise send_signal() needs
> rcu_read_lock() to avoid the warning from lockdep, or the helper needs
> the unconditional rcu_read_lock().

IOW, I think we need 2 simple patches first.

The problem is, I have no idea how can I really test these changes ;)
David, Paul, may be you can review...

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check
  2011-09-20 14:39       ` [PATCH 0/2] (Was: user namespace: make signal.c respect user namespaces) Oleg Nesterov
@ 2011-09-20 14:39         ` Oleg Nesterov
  2011-09-20 15:14           ` drivers/staging/usbip/ abuses task_is_dead/exit_state Oleg Nesterov
  2011-09-20 15:28           ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check Paul E. McKenney
  2011-09-20 14:39         ` [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held() Oleg Nesterov
                           ` (2 subsequent siblings)
  3 siblings, 2 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 14:39 UTC (permalink / raw)
  To: Serge E. Hallyn, Andrew Morton, David Howells, Paul E. McKenney
  Cc: Serge E. Hallyn, lkml, richard, Eric W. Biederman, Tejun Heo

>From 8f92054e commit:

    Instead, add the following validation condition:

        task->exit_state >= 0

    to permit the access if the target task is dead and therefore unable to change
    its own credentials.

OK, but afaics currently this can only help wait_task_zombie() which
calls __task_cred() without rcu lock.

Remove this validation and change wait_task_zombie() to use task_uid()
instead. This means we do rcu_read_lock() only to shut up the lockdep,
but we already do the same in, say, wait_task_stopped().

Unfortunately, we can't kill task_is_dead() right now, it has already
found the users in drivers/staging/, and I bet the usage is wrong.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 include/linux/cred.h |    3 +--
 kernel/exit.c        |    2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

--- 3.1/include/linux/cred.h~1_kill_task_is_dead	2011-09-20 16:28:22.000000000 +0200
+++ 3.1/include/linux/cred.h	2011-09-20 16:28:47.000000000 +0200
@@ -284,8 +284,7 @@ static inline void put_cred(const struct
 #define __task_cred(task)						\
 	({								\
 		const struct task_struct *__t = (task);			\
-		rcu_dereference_check(__t->real_cred,			\
-				      task_is_dead(__t));		\
+		rcu_dereference_check(__t->real_cred, 0);		\
 	})
 
 /**
--- 3.1/kernel/exit.c~1_kill_task_is_dead	2011-09-20 16:28:22.000000000 +0200
+++ 3.1/kernel/exit.c	2011-09-20 16:28:47.000000000 +0200
@@ -1191,7 +1191,7 @@ static int wait_task_zombie(struct wait_
 	unsigned long state;
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
-	uid_t uid = __task_cred(p)->uid;
+	uid_t uid = task_uid(p);
 	struct siginfo __user *infop;
 
 	if (!likely(wo->wo_flags & WEXITED))


^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held()
  2011-09-20 14:39       ` [PATCH 0/2] (Was: user namespace: make signal.c respect user namespaces) Oleg Nesterov
  2011-09-20 14:39         ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check Oleg Nesterov
@ 2011-09-20 14:39         ` Oleg Nesterov
  2011-09-20 15:07           ` Serge Hallyn
  2011-09-20 16:19         ` David Howells
  2011-09-20 16:27         ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check David Howells
  3 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 14:39 UTC (permalink / raw)
  To: Serge E. Hallyn, Andrew Morton, David Howells, Paul E. McKenney
  Cc: Serge E. Hallyn, lkml, richard, Eric W. Biederman, Tejun Heo

Change __task_cred(task) to accept "task == current" without
rcu_read_lock_held(). This is what current_cred() currently does,
and with this change __task_cred() becomes more flexible/usable.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 include/linux/cred.h |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

--- 3.1/include/linux/cred.h~2_task_cred_ck_current	2011-09-20 16:28:47.000000000 +0200
+++ 3.1/include/linux/cred.h	2011-09-20 16:33:12.000000000 +0200
@@ -268,8 +268,7 @@ static inline void put_cred(const struct
  * Access the subjective credentials of the current task.  RCU-safe,
  * since nobody else can modify it.
  */
-#define current_cred() \
-	rcu_dereference_protected(current->cred, 1)
+#define current_cred()	__task_cred(current)
 
 /**
  * __task_cred - Access a task's objective credentials
@@ -284,7 +283,7 @@ static inline void put_cred(const struct
 #define __task_cred(task)						\
 	({								\
 		const struct task_struct *__t = (task);			\
-		rcu_dereference_check(__t->real_cred, 0);		\
+		rcu_dereference_check(__t->real_cred, (__t == current));\
 	})
 
 /**


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held()
  2011-09-20 14:39         ` [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held() Oleg Nesterov
@ 2011-09-20 15:07           ` Serge Hallyn
  2011-09-20 15:35             ` Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Serge Hallyn @ 2011-09-20 15:07 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Serge E. Hallyn, Andrew Morton, David Howells, Paul E. McKenney,
	lkml, richard, Eric W. Biederman, Tejun Heo

Quoting Oleg Nesterov (oleg@redhat.com):
> Change __task_cred(task) to accept "task == current" without
> rcu_read_lock_held(). This is what current_cred() currently does,
> and with this change __task_cred() becomes more flexible/usable.
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>

So to check whether I've got this straight, the original __task_cred()
was allowing rcu read lock to not be held for a non-running task, but
required rcu read lock if task was running?  With these two patches,
rcu read lock will not be needed if task == current?

If so, then that sounds good to me, and an unconditional rcu_read_lock()
at wait_task_zombie() seems better than at send_signal()...

> ---
> 
>  include/linux/cred.h |    5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> --- 3.1/include/linux/cred.h~2_task_cred_ck_current	2011-09-20 16:28:47.000000000 +0200
> +++ 3.1/include/linux/cred.h	2011-09-20 16:33:12.000000000 +0200
> @@ -268,8 +268,7 @@ static inline void put_cred(const struct
>   * Access the subjective credentials of the current task.  RCU-safe,
>   * since nobody else can modify it.
>   */
> -#define current_cred() \
> -	rcu_dereference_protected(current->cred, 1)
> +#define current_cred()	__task_cred(current)
>  
>  /**
>   * __task_cred - Access a task's objective credentials
> @@ -284,7 +283,7 @@ static inline void put_cred(const struct
>  #define __task_cred(task)						\
>  	({								\
>  		const struct task_struct *__t = (task);			\
> -		rcu_dereference_check(__t->real_cred, 0);		\
> +		rcu_dereference_check(__t->real_cred, (__t == current));\
>  	})
>  
>  /**
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* drivers/staging/usbip/ abuses task_is_dead/exit_state
  2011-09-20 14:39         ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check Oleg Nesterov
@ 2011-09-20 15:14           ` Oleg Nesterov
  2011-09-20 18:38             ` Greg KH
  2011-09-20 15:28           ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check Paul E. McKenney
  1 sibling, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 15:14 UTC (permalink / raw)
  To: Serge E. Hallyn, Andrew Morton, David Howells, Paul E. McKenney,
	Tobias Klauser, Greg Kroah-Hartman, Matt Mooney
  Cc: Serge E. Hallyn, lkml, richard, Eric W. Biederman, Tejun Heo

(add more cc's)

On 09/20, Oleg Nesterov wrote:
>
> Unfortunately, we can't kill task_is_dead() right now, it has already
> found the users in drivers/staging/, and I bet the usage is wrong.

It is used by drivers/staging/usbip/

For what? The code:

	if (vdev->ud.tcp_rx && !task_is_dead(vdev->ud.tcp_rx))
		kthread_stop(vdev->ud.tcp_rx);

And how task_is_dead() can help? This helper is really "special", it
shouldn't be used anyway. But why do we check ->exit_state? Without
tasklist the check is racy anyway, the task can exit right after the
check.

And. It is safe to use kthread_stop(t) even if t has already exited.

OK, this was added by 8547d4cc2b616e4f1dafebe2c673fc986422b506
"Staging: usbip: vhci-hcd: Do not kill already dead RX/TX kthread"

	When unbinding a device on the host which was still attached on the
	client, I got a NULL pointer dereference on the client.

Where?

	This turned out
	to be due to kthread_stop() being called on an already dead kthread.

This should work.

I'm afraid this can only fix the symptom. Probably, the problem is that
we do not have the reference and thus even task_is_dead(t) is not safe.

This kthread was created by kthread_run(). If it exits, nothing protects
this task_struct.

In any case, please do not use ->exit_state. It should not be used outside
of exit.c/etc paths, "exit_state != 0" means "exit_notify() was called".

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check
  2011-09-20 14:39         ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check Oleg Nesterov
  2011-09-20 15:14           ` drivers/staging/usbip/ abuses task_is_dead/exit_state Oleg Nesterov
@ 2011-09-20 15:28           ` Paul E. McKenney
  2011-09-20 15:40             ` Oleg Nesterov
  1 sibling, 1 reply; 63+ messages in thread
From: Paul E. McKenney @ 2011-09-20 15:28 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Serge E. Hallyn, Andrew Morton, David Howells, Serge E. Hallyn,
	lkml, richard, Eric W. Biederman, Tejun Heo

On Tue, Sep 20, 2011 at 04:39:42PM +0200, Oleg Nesterov wrote:
> >From 8f92054e commit:
> 
>     Instead, add the following validation condition:
> 
>         task->exit_state >= 0
> 
>     to permit the access if the target task is dead and therefore unable to change
>     its own credentials.
> 
> OK, but afaics currently this can only help wait_task_zombie() which
> calls __task_cred() without rcu lock.
> 
> Remove this validation and change wait_task_zombie() to use task_uid()
> instead. This means we do rcu_read_lock() only to shut up the lockdep,
> but we already do the same in, say, wait_task_stopped().
> 
> Unfortunately, we can't kill task_is_dead() right now, it has already
> found the users in drivers/staging/, and I bet the usage is wrong.

>From a first quick scan...

							Thanx, Paul

> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> 
>  include/linux/cred.h |    3 +--
>  kernel/exit.c        |    2 +-
>  2 files changed, 2 insertions(+), 3 deletions(-)
> 
> --- 3.1/include/linux/cred.h~1_kill_task_is_dead	2011-09-20 16:28:22.000000000 +0200
> +++ 3.1/include/linux/cred.h	2011-09-20 16:28:47.000000000 +0200
> @@ -284,8 +284,7 @@ static inline void put_cred(const struct
>  #define __task_cred(task)						\
>  	({								\
>  		const struct task_struct *__t = (task);			\
> -		rcu_dereference_check(__t->real_cred,			\
> -				      task_is_dead(__t));		\
> +		rcu_dereference_check(__t->real_cred, 0);		\

The "0" above will make lockdep-RCU complain unconditionally.  My guess
is that you want rcu_dereference_raw().

>  	})
> 
>  /**
> --- 3.1/kernel/exit.c~1_kill_task_is_dead	2011-09-20 16:28:22.000000000 +0200
> +++ 3.1/kernel/exit.c	2011-09-20 16:28:47.000000000 +0200
> @@ -1191,7 +1191,7 @@ static int wait_task_zombie(struct wait_
>  	unsigned long state;
>  	int retval, status, traced;
>  	pid_t pid = task_pid_vnr(p);
> -	uid_t uid = __task_cred(p)->uid;
> +	uid_t uid = task_uid(p);
>  	struct siginfo __user *infop;
> 
>  	if (!likely(wo->wo_flags & WEXITED))
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held()
  2011-09-20 15:07           ` Serge Hallyn
@ 2011-09-20 15:35             ` Oleg Nesterov
  0 siblings, 0 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 15:35 UTC (permalink / raw)
  To: Serge Hallyn
  Cc: Serge E. Hallyn, Andrew Morton, David Howells, Paul E. McKenney,
	lkml, richard, Eric W. Biederman, Tejun Heo

On 09/20, Serge Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> > Change __task_cred(task) to accept "task == current" without
> > rcu_read_lock_held(). This is what current_cred() currently does,
> > and with this change __task_cred() becomes more flexible/usable.
> >
> > Signed-off-by: Oleg Nesterov <oleg@redhat.com>
>
> So to check whether I've got this straight, the original __task_cred()
> was allowing rcu read lock to not be held for a non-running task, but
> required rcu read lock if task was running?  With these two patches,
> rcu read lock will not be needed if task == current?

Yes.

> If so, then that sounds good to me, and an unconditional rcu_read_lock()
> at wait_task_zombie() seems better than at send_signal()...

Agreed.

Although sometimes I think it would be better to use rcu_dereference_raw()
with a comment in some places.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-20 12:22 ` [PATCH] user namespace: make signal.c respect user namespaces Oleg Nesterov
  2011-09-20 12:44   ` Serge E. Hallyn
@ 2011-09-20 15:39   ` Serge Hallyn
  2011-09-20 16:24     ` Oleg Nesterov
  2011-09-21  5:00   ` [PATCH] user namespace: make signal.c respect user namespaces (v2) Serge E. Hallyn
  2 siblings, 1 reply; 63+ messages in thread
From: Serge Hallyn @ 2011-09-20 15:39 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/19, Serge E. Hallyn wrote:
> >
> > __send_signal: convert the uid being sent in SI_USER to the target task's
> > user namespace.
> >
> > do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
> >  user namespace
> >
> > ptrace_signal: map parent's uid into current's user namespace before
> > including in signal to current.
> 
> And all of them follow the same pattern,
> 
> > @@ -1073,7 +1074,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
> >  			q->info.si_code = SI_USER;
> >  			q->info.si_pid = task_tgid_nr_ns(current,
> >  							task_active_pid_ns(t));
> > -			q->info.si_uid = current_uid();
> > +			q->info.si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
> > +					current_cred(), current_uid());
> 
> Up to you, but may be we can add a helper? Something like
> 
> 	static inline uid_t good_name(struct task_struct *from, struct task_struct *to)
> 	{
> 		// the caller does rcu_read_lock() if needed
> 		const struct cred *from_cred = __task_cred(from);
> 		return user_ns_map_uid(task_cred_xxx(to, user_ns),
> 					from_cred, from_cred->uid);
> 	}
> 
> As for __send_signal() in particular, I guess we could do
> 
> 		q->info.si_uid = from_ancestor_ns ? 0 : current_uid();

But wait - can't __send_signal end up being triggered through fcntl
F_SETSIG+F_SETOWN to another but none-ancestor namespace?  So we
do actually need to call the full user_ns_map_uid() there.  However,
the uid sent in ptrace_signal() can (if i'm thinking right) be set
using

	info->si_uid = (current_user_ns() == parent_user_ns()) ?
			0 : current_uid();

Not much of a win performance-wise, but perhaps better by making it
clear that those are the only possibilities (and overflowuid is not
possible).

thanks,
-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check
  2011-09-20 15:28           ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check Paul E. McKenney
@ 2011-09-20 15:40             ` Oleg Nesterov
  2011-09-20 15:48               ` Paul E. McKenney
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 15:40 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Serge E. Hallyn, Andrew Morton, David Howells, Serge E. Hallyn,
	lkml, richard, Eric W. Biederman, Tejun Heo

On 09/20, Paul E. McKenney wrote:
>
> On Tue, Sep 20, 2011 at 04:39:42PM +0200, Oleg Nesterov wrote:
> >
> > --- 3.1/include/linux/cred.h~1_kill_task_is_dead	2011-09-20 16:28:22.000000000 +0200
> > +++ 3.1/include/linux/cred.h	2011-09-20 16:28:47.000000000 +0200
> > @@ -284,8 +284,7 @@ static inline void put_cred(const struct
> >  #define __task_cred(task)						\
> >  	({								\
> >  		const struct task_struct *__t = (task);			\
> > -		rcu_dereference_check(__t->real_cred,			\
> > -				      task_is_dead(__t));		\
> > +		rcu_dereference_check(__t->real_cred, 0);		\
>
> The "0" above will make lockdep-RCU complain unconditionally.  My guess
> is that you want rcu_dereference_raw().

Hmm. I hope you are wrong this time ;)

rcu_dereference_check() checks rcu_read_lock_held(). IOW, with this
change __task_cred() always requires rcu_read_lock(), and this is
what the patch wants.

The next one adds " || (task == current" to the rcu_read_lock_held()
check above.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check
  2011-09-20 15:40             ` Oleg Nesterov
@ 2011-09-20 15:48               ` Paul E. McKenney
  0 siblings, 0 replies; 63+ messages in thread
From: Paul E. McKenney @ 2011-09-20 15:48 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Serge E. Hallyn, Andrew Morton, David Howells, Serge E. Hallyn,
	lkml, richard, Eric W. Biederman, Tejun Heo

On Tue, Sep 20, 2011 at 05:40:15PM +0200, Oleg Nesterov wrote:
> On 09/20, Paul E. McKenney wrote:
> >
> > On Tue, Sep 20, 2011 at 04:39:42PM +0200, Oleg Nesterov wrote:
> > >
> > > --- 3.1/include/linux/cred.h~1_kill_task_is_dead	2011-09-20 16:28:22.000000000 +0200
> > > +++ 3.1/include/linux/cred.h	2011-09-20 16:28:47.000000000 +0200
> > > @@ -284,8 +284,7 @@ static inline void put_cred(const struct
> > >  #define __task_cred(task)						\
> > >  	({								\
> > >  		const struct task_struct *__t = (task);			\
> > > -		rcu_dereference_check(__t->real_cred,			\
> > > -				      task_is_dead(__t));		\
> > > +		rcu_dereference_check(__t->real_cred, 0);		\
> >
> > The "0" above will make lockdep-RCU complain unconditionally.  My guess
> > is that you want rcu_dereference_raw().
> 
> Hmm. I hope you are wrong this time ;)
> 
> rcu_dereference_check() checks rcu_read_lock_held(). IOW, with this
> change __task_cred() always requires rcu_read_lock(), and this is
> what the patch wants.

Oy...  rcu_dereference_check(), not rcu_dereference_protected().

You are correct.

But why not just rcu_dereference()?

> The next one adds " || (task == current" to the rcu_read_lock_held()
> check above.

OK, I guess I got my answer.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held()
  2011-09-20 14:39       ` [PATCH 0/2] (Was: user namespace: make signal.c respect user namespaces) Oleg Nesterov
  2011-09-20 14:39         ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check Oleg Nesterov
  2011-09-20 14:39         ` [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held() Oleg Nesterov
@ 2011-09-20 16:19         ` David Howells
  2011-09-20 16:38           ` Oleg Nesterov
  2011-09-20 16:50           ` David Howells
  2011-09-20 16:27         ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check David Howells
  3 siblings, 2 replies; 63+ messages in thread
From: David Howells @ 2011-09-20 16:19 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: dhowells, Serge E. Hallyn, Andrew Morton, Paul E. McKenney,
	Serge E. Hallyn, lkml, richard, Eric W. Biederman, Tejun Heo

Oleg Nesterov <oleg@redhat.com> wrote:

> Change __task_cred(task) to accept "task == current" without
> rcu_read_lock_held(). This is what current_cred() currently does,
> and with this change __task_cred() becomes more flexible/usable.
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>

NAK!

If you compare carefully:

> -	rcu_dereference_protected(current->cred, 1)

and:

> -		rcu_dereference_check(__t->real_cred, 0);		\

you'll notice they aren't quite the same in one very fundamental way.

David

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-20 15:39   ` [PATCH] user namespace: make signal.c respect user namespaces Serge Hallyn
@ 2011-09-20 16:24     ` Oleg Nesterov
  2011-09-20 16:45       ` Serge E. Hallyn
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 16:24 UTC (permalink / raw)
  To: Serge Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/20, Serge Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> >
> > As for __send_signal() in particular, I guess we could do
> >
> > 		q->info.si_uid = from_ancestor_ns ? 0 : current_uid();
>
> But wait - can't __send_signal end up being triggered through fcntl
> F_SETSIG+F_SETOWN to another but none-ancestor namespace?

I didn't really check, but afaics send_sigio_to_task/etc never use
SEND_SIG_NOINFO. And they shouldn't, there is no the sending process.

> do actually need to call the full user_ns_map_uid() there.  However,
> the uid sent in ptrace_signal() can (if i'm thinking right) be set
> using
>
> 	info->si_uid = (current_user_ns() == parent_user_ns()) ?
> 			0 : current_uid();
>
> Not much of a win performance-wise, but perhaps better by making it
> clear that those are the only possibilities (and overflowuid is not
> possible).

I think you are right right, except s/==/!=/. This is like send_signal()
which can only send the signal down or to the same namespace.

I guess ptrace_signal() can even do

	info->si_pid = task_pid_vnr(current->parent);

	if (info->si_pid)	// same namespace
		info->si_uid = current_uid();
	else			// can't see the tracer, we are from the sub-namespace
		info->si_uid = 0;

Whatever you like more.

And just in case let me repeat, I won't argue if you use
user_ns_map_uid/helper for consistency. Without CONFIG_USER_NS it is
even better performance-wise (even if we add a helper I guess).

So this all is up to you.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check
  2011-09-20 14:39       ` [PATCH 0/2] (Was: user namespace: make signal.c respect user namespaces) Oleg Nesterov
                           ` (2 preceding siblings ...)
  2011-09-20 16:19         ` David Howells
@ 2011-09-20 16:27         ` David Howells
  3 siblings, 0 replies; 63+ messages in thread
From: David Howells @ 2011-09-20 16:27 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: dhowells, Serge E. Hallyn, Andrew Morton, Paul E. McKenney,
	Serge E. Hallyn, lkml, richard, Eric W. Biederman, Tejun Heo

Oleg Nesterov <oleg@redhat.com> wrote:

> From 8f92054e commit:
> 
>     Instead, add the following validation condition:
> 
>         task->exit_state >= 0
> 
>     to permit the access if the target task is dead and therefore unable to change
>     its own credentials.
> 
> OK, but afaics currently this can only help wait_task_zombie() which
> calls __task_cred() without rcu lock.
> 
> Remove this validation and change wait_task_zombie() to use task_uid()
> instead. This means we do rcu_read_lock() only to shut up the lockdep,
> but we already do the same in, say, wait_task_stopped().
> 
> Unfortunately, we can't kill task_is_dead() right now, it has already
> found the users in drivers/staging/, and I bet the usage is wrong.
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>

Acked-by: David Howells <dhowells@redhat.com>

It simplifies stuff in exchange for the most minor of slowdowns, so fair
enough.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held()
  2011-09-20 16:19         ` David Howells
@ 2011-09-20 16:38           ` Oleg Nesterov
  2011-09-20 16:50           ` David Howells
  1 sibling, 0 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 16:38 UTC (permalink / raw)
  To: David Howells
  Cc: Serge E. Hallyn, Andrew Morton, Paul E. McKenney,
	Serge E. Hallyn, lkml, richard, Eric W. Biederman, Tejun Heo

On 09/20, David Howells wrote:
>
> Oleg Nesterov <oleg@redhat.com> wrote:
>
> > Change __task_cred(task) to accept "task == current" without
> > rcu_read_lock_held(). This is what current_cred() currently does,
> > and with this change __task_cred() becomes more flexible/usable.
> >
> > Signed-off-by: Oleg Nesterov <oleg@redhat.com>
>
> NAK!
>
> If you compare carefully:
>
> > -	rcu_dereference_protected(current->cred, 1)
>
> and:
>
> > -		rcu_dereference_check(__t->real_cred, 0);		\
>
> you'll notice they aren't quite the same in one very fundamental way.

Do you mean that this patch adds the unnecessary ACCESS_ONCE +
smp_read_barrier_depends() to current_cred() or I missed something
else?

And what do you think about (__t == current) in __task_cred ?

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-20 16:24     ` Oleg Nesterov
@ 2011-09-20 16:45       ` Serge E. Hallyn
  2011-09-20 18:17         ` Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-20 16:45 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/20, Serge Hallyn wrote:
> >
> > Quoting Oleg Nesterov (oleg@redhat.com):
> > >
> > > As for __send_signal() in particular, I guess we could do
> > >
> > > 		q->info.si_uid = from_ancestor_ns ? 0 : current_uid();
> >
> > But wait - can't __send_signal end up being triggered through fcntl
> > F_SETSIG+F_SETOWN to another but none-ancestor namespace?
> 
> I didn't really check, but afaics send_sigio_to_task/etc never use
> SEND_SIG_NOINFO. And they shouldn't, there is no the sending process.

(I'm sure you're right - but I'll have to take a closer look for my
own understanding)

> > do actually need to call the full user_ns_map_uid() there.  However,
> > the uid sent in ptrace_signal() can (if i'm thinking right) be set
> > using
> >
> > 	info->si_uid = (current_user_ns() == parent_user_ns()) ?
> > 			0 : current_uid();
> >
> > Not much of a win performance-wise, but perhaps better by making it
> > clear that those are the only possibilities (and overflowuid is not
> > possible).
> 
> I think you are right right, except s/==/!=/.

Yes :)

> This is like send_signal()
> which can only send the signal down or to the same namespace.
> 
> I guess ptrace_signal() can even do
> 
> 	info->si_pid = task_pid_vnr(current->parent);
> 
> 	if (info->si_pid)	// same namespace
> 		info->si_uid = current_uid();
> 	else			// can't see the tracer, we are from the sub-namespace
> 		info->si_uid = 0;
> 
> Whatever you like more.

Except you don't have to split pid and user namespaces at the same
time.

> And just in case let me repeat, I won't argue if you use
> user_ns_map_uid/helper for consistency. Without CONFIG_USER_NS it is
> even better performance-wise (even if we add a helper I guess).

Ok.  On the one hand, using your new mapping helper will make it
easier to make sure everything is right.  This special code may be
more educational to the new reader, but isn't as trivial to
review.

Maybe I'll go with the mapping.

thanks,
-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held()
  2011-09-20 16:19         ` David Howells
  2011-09-20 16:38           ` Oleg Nesterov
@ 2011-09-20 16:50           ` David Howells
  2011-09-20 17:13             ` Oleg Nesterov
  1 sibling, 1 reply; 63+ messages in thread
From: David Howells @ 2011-09-20 16:50 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: dhowells, Serge E. Hallyn, Andrew Morton, Paul E. McKenney,
	Serge E. Hallyn, lkml, richard, Eric W. Biederman, Tejun Heo

Oleg Nesterov <oleg@redhat.com> wrote:

> > > -	rcu_dereference_protected(current->cred, 1)
> >
> > and:
> >
> > > -		rcu_dereference_check(__t->real_cred, 0);		\
> >
> > you'll notice they aren't quite the same in one very fundamental way.
> 
> Do you mean that this patch adds the unnecessary ACCESS_ONCE +
> smp_read_barrier_depends() to current_cred() or I missed something
> else?

Something else.  The current_cred() uses ->cred:

 * current_cred - Access the current task's subjective credentials

and __task_cred() uses ->real_cred:

 * __task_cred - Access a task's objective credentials

Ordinarily both pointers will point to the same set of creds, but this change
will break anything that uses override_creds() + revert_creds() such as
faccessat() and fscache.

David

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held()
  2011-09-20 16:50           ` David Howells
@ 2011-09-20 17:13             ` Oleg Nesterov
  0 siblings, 0 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 17:13 UTC (permalink / raw)
  To: David Howells
  Cc: Serge E. Hallyn, Andrew Morton, Paul E. McKenney,
	Serge E. Hallyn, lkml, richard, Eric W. Biederman, Tejun Heo

On 09/20, David Howells wrote:
>
> Oleg Nesterov <oleg@redhat.com> wrote:
>
> > > > -	rcu_dereference_protected(current->cred, 1)
> > >
> > > and:
> > >
> > > > -		rcu_dereference_check(__t->real_cred, 0);		\
> > >
> > > you'll notice they aren't quite the same in one very fundamental way.
> >
> > Do you mean that this patch adds the unnecessary ACCESS_ONCE +
> > smp_read_barrier_depends() to current_cred() or I missed something
> > else?
>
> Something else.  The current_cred() uses ->cred:

Argh!!! I am soooo stupid.

Thanks a lot for the quick NACK!


So. This patch shouldn't touch current_cred().

But this also means, in theory it is not good to assume that
send_signal() can use __task_cred(current) instead of current_cred(),
although I doubt very much that someone can do override_creds() +
kill(SEND_SIG_NOINFO). So the change in __task_cred() is probably not
really good too.

Thanks.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-19 21:45 [PATCH] user namespace: make signal.c respect user namespaces Serge E. Hallyn
  2011-09-19 21:47 ` [PATCH] user namespace: usb: make usb urbs user namespace aware Serge E. Hallyn
  2011-09-20 12:22 ` [PATCH] user namespace: make signal.c respect user namespaces Oleg Nesterov
@ 2011-09-20 17:48 ` Oleg Nesterov
  2011-09-20 18:53   ` Serge E. Hallyn
  2 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 17:48 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/19, Serge E. Hallyn wrote:
>
> __send_signal: convert the uid being sent in SI_USER to the target task's
> user namespace.
>
> do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
>  user namespace
>
> ptrace_signal: map parent's uid into current's user namespace before
> including in signal to current.

Btw, what about the other users of ->si_uid? Say, kill() or tkill().
Looks like, we need a lot of complications to make this correct...

As for send_signal(), may be we can simply do


	--- x/kernel/signal.c
	+++ x/kernel/signal.c
	@@ -1084,10 +1084,13 @@ static int __send_signal(int sig, struct
				break;
			default:
				copy_siginfo(&q->info, info);
	-			if (from_ancestor_ns)
	-				q->info.si_pid = 0;
				break;
			}
	+
	+		if (unlikely(from_ancestor_ns)) {
	+				q->info.si_pid = 0;
	+				q->info.si_uid = 0;
	+		}
		} else if (!is_si_special(info)) {
			if (sig >= SIGRTMIN && info->si_code != SI_USER) {
				/*

?

Yes, this "breaks" sys_rt_sigqueueinfo() evem more if it is used
to send the signal to the sub-namespace.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-20 16:45       ` Serge E. Hallyn
@ 2011-09-20 18:17         ` Oleg Nesterov
  0 siblings, 0 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-20 18:17 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/20, Serge E. Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> >
> > I didn't really check, but afaics send_sigio_to_task/etc never use
> > SEND_SIG_NOINFO. And they shouldn't, there is no the sending process.
>
> (I'm sure you're right -

I am not that sure, please recheck ;)

> > I guess ptrace_signal() can even do
> >
> > 	info->si_pid = task_pid_vnr(current->parent);
> >
> > 	if (info->si_pid)	// same namespace
> > 		info->si_uid = current_uid();
> > 	else			// can't see the tracer, we are from the sub-namespace
> > 		info->si_uid = 0;
> >
> > Whatever you like more.
>
> Except you don't have to split pid and user namespaces at the same
> time.

Argh, you mean CLONE_NEWUSER? I fotgot about it. And probably
there is something else which I simply do not know?

I got lost. This means send_signal() can't rely on from_parent_ns.
I have no idea how can we fix sys_kill() without a lot of ugly changes
then.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: drivers/staging/usbip/ abuses task_is_dead/exit_state
  2011-09-20 15:14           ` drivers/staging/usbip/ abuses task_is_dead/exit_state Oleg Nesterov
@ 2011-09-20 18:38             ` Greg KH
  2012-03-06 17:39               ` ping: " Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Greg KH @ 2011-09-20 18:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Serge E. Hallyn, Andrew Morton, David Howells, Paul E. McKenney,
	Tobias Klauser, Matt Mooney, Serge E. Hallyn, lkml, richard,
	Eric W. Biederman, Tejun Heo

On Tue, Sep 20, 2011 at 05:14:10PM +0200, Oleg Nesterov wrote:
> (add more cc's)
> 
> On 09/20, Oleg Nesterov wrote:
> >
> > Unfortunately, we can't kill task_is_dead() right now, it has already
> > found the users in drivers/staging/, and I bet the usage is wrong.
> 
> It is used by drivers/staging/usbip/
> 
> For what? The code:
> 
> 	if (vdev->ud.tcp_rx && !task_is_dead(vdev->ud.tcp_rx))
> 		kthread_stop(vdev->ud.tcp_rx);
> 
> And how task_is_dead() can help? This helper is really "special", it
> shouldn't be used anyway. But why do we check ->exit_state? Without
> tasklist the check is racy anyway, the task can exit right after the
> check.
> 
> And. It is safe to use kthread_stop(t) even if t has already exited.
> 
> OK, this was added by 8547d4cc2b616e4f1dafebe2c673fc986422b506
> "Staging: usbip: vhci-hcd: Do not kill already dead RX/TX kthread"
> 
> 	When unbinding a device on the host which was still attached on the
> 	client, I got a NULL pointer dereference on the client.
> 
> Where?
> 
> 	This turned out
> 	to be due to kthread_stop() being called on an already dead kthread.
> 
> This should work.
> 
> I'm afraid this can only fix the symptom. Probably, the problem is that
> we do not have the reference and thus even task_is_dead(t) is not safe.
> 
> This kthread was created by kthread_run(). If it exits, nothing protects
> this task_struct.
> 
> In any case, please do not use ->exit_state. It should not be used outside
> of exit.c/etc paths, "exit_state != 0" means "exit_notify() was called".

Patches to fix this up in this driver are always gladly appreciated :)

greg k-h

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-20 17:48 ` [PATCH] user namespace: make signal.c respect user namespaces Oleg Nesterov
@ 2011-09-20 18:53   ` Serge E. Hallyn
  2011-09-21 17:53     ` Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-20 18:53 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/19, Serge E. Hallyn wrote:
> >
> > __send_signal: convert the uid being sent in SI_USER to the target task's
> > user namespace.
> >
> > do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
> >  user namespace
> >
> > ptrace_signal: map parent's uid into current's user namespace before
> > including in signal to current.
> 
> Btw, what about the other users of ->si_uid? Say, kill() or tkill().

Well, they end up calling __send_signal().

> Looks like, we need a lot of complications to make this correct...
> 
> As for send_signal(), may be we can simply do
> 
> 
> 	--- x/kernel/signal.c
> 	+++ x/kernel/signal.c
> 	@@ -1084,10 +1084,13 @@ static int __send_signal(int sig, struct
> 				break;
> 			default:
> 				copy_siginfo(&q->info, info);
> 	-			if (from_ancestor_ns)
> 	-				q->info.si_pid = 0;
> 				break;
> 			}
> 	+
> 	+		if (unlikely(from_ancestor_ns)) {
> 	+				q->info.si_pid = 0;
> 	+				q->info.si_uid = 0;
> 	+		}
> 		} else if (!is_si_special(info)) {
> 			if (sig >= SIGRTMIN && info->si_code != SI_USER) {
> 				/*
> 
> ?
> 
> Yes, this "breaks" sys_rt_sigqueueinfo() evem more if it is used
> to send the signal to the sub-namespace.

Eh, yeah, so we do have to use the user_ns_map_uid(), or rather your
new helper.

But you sound more alarmed than that.  So I might be missing something.
You think that user_ns_map_uid() at the locations I identified in
kernel/signal.c does not suffice?

-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] user namespace: make signal.c respect user namespaces (v2)
  2011-09-20 12:22 ` [PATCH] user namespace: make signal.c respect user namespaces Oleg Nesterov
  2011-09-20 12:44   ` Serge E. Hallyn
  2011-09-20 15:39   ` [PATCH] user namespace: make signal.c respect user namespaces Serge Hallyn
@ 2011-09-21  5:00   ` Serge E. Hallyn
  2 siblings, 0 replies; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-21  5:00 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

__send_signal: convert the uid being sent in SI_USER to the target task's
user namespace.

do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
 user namespace

ptrace_signal: map parent's uid into current's user namespace before
including in signal to current.

Changelog:
Sep 20: Inspired by Oleg's suggestion, define map_cred_ns() helper to
	simplify callers and help make clear what we are translating
        (which uid into which namespace).  Passing the target task would
	make callers even easier to read, but we pass in user_ns because
	current_user_ns() != task_cred_xxx(current, user_ns).
Sep 20: As recommended by Oleg, also put task_pid_vnr() under rcu_read_lock
	in ptrace_signal().

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tejun Heo <tj@kernel.org>
---
 kernel/signal.c |   24 ++++++++++++++++++++----
 1 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 291c970..466219b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,6 +27,7 @@
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/nsproxy.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
@@ -1019,6 +1020,15 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
 	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
 }
 
+/*
+ * map the uid in struct cred into user namespace *ns
+ */
+static inline uid_t map_cred_ns(const struct cred *cred,
+				struct user_namespace *ns)
+{
+	return user_ns_map_uid(ns, cred, cred->uid);
+}
+
 static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			int group, int from_ancestor_ns)
 {
@@ -1073,7 +1083,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			q->info.si_code = SI_USER;
 			q->info.si_pid = task_tgid_nr_ns(current,
 							task_active_pid_ns(t));
-			q->info.si_uid = current_uid();
+			q->info.si_uid = map_cred_ns(current_cred(),
+						     task_cred_xxx(t, user_ns));
 			break;
 		case (unsigned long) SEND_SIG_PRIV:
 			q->info.si_signo = sig;
@@ -1618,7 +1629,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
-	info.si_uid = __task_cred(tsk)->uid;
+	info.si_uid = map_cred_ns(__task_cred(tsk),
+				  task_cred_xxx(tsk->parent, user_ns));
 	rcu_read_unlock();
 
 	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
@@ -1703,7 +1715,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
-	info.si_uid = __task_cred(tsk)->uid;
+	info.si_uid = map_cred_ns(__task_cred(tsk),
+				  task_cred_xxx(parent, user_ns));
 	rcu_read_unlock();
 
 	info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -2121,8 +2134,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
 		info->si_signo = signr;
 		info->si_errno = 0;
 		info->si_code = SI_USER;
+		rcu_read_lock();
 		info->si_pid = task_pid_vnr(current->parent);
-		info->si_uid = task_uid(current->parent);
+		info->si_uid = map_cred_ns(__task_cred(current->parent),
+					   current_user_ns());
+		rcu_read_unlock();
 	}
 
 	/* If the (new) signal is now blocked, requeue it.  */
-- 
1.7.5.4


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH] user namespace: usb: make usb urbs user namespace aware (v2)
  2011-09-20 13:17   ` Oleg Nesterov
  2011-09-20 13:33     ` Serge E. Hallyn
@ 2011-09-21  5:01     ` Serge E. Hallyn
  2011-09-21 18:31       ` Oleg Nesterov
  1 sibling, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-21  5:01 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Add to the dev_state and alloc_async structures the user namespace
corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
which can then implement a proper, user-namespace-aware uid check.

Changelog:
Sep 20: Per Oleg's suggestion: Instead of caching and passing user namespace,
	uid, and euid each separately, pass a struct cred.

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Greg KH <greg@kroah.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tejun Heo <tj@kernel.org>
---
 drivers/usb/core/devio.c |   28 ++++++++++++----------------
 include/linux/sched.h    |    3 ++-
 kernel/signal.c          |   24 ++++++++++++++++--------
 3 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 37518df..e53a585 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -46,6 +46,7 @@
 #include <linux/cdev.h>
 #include <linux/notifier.h>
 #include <linux/security.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
 #include <linux/moduleparam.h>
@@ -68,7 +69,7 @@ struct dev_state {
 	wait_queue_head_t wait;     /* wake up if a request completed */
 	unsigned int discsignr;
 	struct pid *disc_pid;
-	uid_t disc_uid, disc_euid;
+	const struct cred *cred;
 	void __user *disccontext;
 	unsigned long ifclaimed;
 	u32 secid;
@@ -79,7 +80,7 @@ struct async {
 	struct list_head asynclist;
 	struct dev_state *ps;
 	struct pid *pid;
-	uid_t uid, euid;
+	const struct cred *cred;
 	unsigned int signr;
 	unsigned int ifnum;
 	void __user *userbuffer;
@@ -248,6 +249,7 @@ static struct async *alloc_async(unsigned int numisoframes)
 static void free_async(struct async *as)
 {
 	put_pid(as->pid);
+	put_cred(as->cred);
 	kfree(as->urb->transfer_buffer);
 	kfree(as->urb->setup_packet);
 	usb_free_urb(as->urb);
@@ -393,9 +395,8 @@ static void async_completed(struct urb *urb)
 	struct dev_state *ps = as->ps;
 	struct siginfo sinfo;
 	struct pid *pid = NULL;
-	uid_t uid = 0;
-	uid_t euid = 0;
 	u32 secid = 0;
+	const struct cred *cred = NULL;
 	int signr;
 
 	spin_lock(&ps->lock);
@@ -408,8 +409,7 @@ static void async_completed(struct urb *urb)
 		sinfo.si_code = SI_ASYNCIO;
 		sinfo.si_addr = as->userurb;
 		pid = as->pid;
-		uid = as->uid;
-		euid = as->euid;
+		cred = as->cred;
 		secid = as->secid;
 	}
 	snoop(&urb->dev->dev, "urb complete\n");
@@ -423,8 +423,7 @@ static void async_completed(struct urb *urb)
 	spin_unlock(&ps->lock);
 
 	if (signr)
-		kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, uid,
-				      euid, secid);
+		kill_pid_info_as_cred(sinfo.si_signo, &sinfo, pid, cred, secid);
 
 	wake_up(&ps->wait);
 }
@@ -706,8 +705,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
 	init_waitqueue_head(&ps->wait);
 	ps->discsignr = 0;
 	ps->disc_pid = get_pid(task_pid(current));
-	ps->disc_uid = cred->uid;
-	ps->disc_euid = cred->euid;
+	ps->cred = get_cred(cred);
 	ps->disccontext = NULL;
 	ps->ifclaimed = 0;
 	security_task_getsecid(current, &ps->secid);
@@ -749,6 +747,7 @@ static int usbdev_release(struct inode *inode, struct file *file)
 	usb_unlock_device(dev);
 	usb_put_dev(dev);
 	put_pid(ps->disc_pid);
+	put_cred(ps->cred);
 
 	as = async_getcompleted(ps);
 	while (as) {
@@ -1048,7 +1047,6 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
 	struct usb_host_endpoint *ep;
 	struct async *as;
 	struct usb_ctrlrequest *dr = NULL;
-	const struct cred *cred = current_cred();
 	unsigned int u, totlen, isofrmlen;
 	int ret, ifnum = -1;
 	int is_in;
@@ -1262,8 +1260,7 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
 	as->signr = uurb->signr;
 	as->ifnum = ifnum;
 	as->pid = get_pid(task_pid(current));
-	as->uid = cred->uid;
-	as->euid = cred->euid;
+	as->cred = get_current_cred();
 	security_task_getsecid(current, &as->secid);
 	if (!is_in && uurb->buffer_length > 0) {
 		if (copy_from_user(as->urb->transfer_buffer, uurb->buffer,
@@ -1981,9 +1978,8 @@ static void usbdev_remove(struct usb_device *udev)
 			sinfo.si_errno = EPIPE;
 			sinfo.si_code = SI_ASYNCIO;
 			sinfo.si_addr = ps->disccontext;
-			kill_pid_info_as_uid(ps->discsignr, &sinfo,
-					ps->disc_pid, ps->disc_uid,
-					ps->disc_euid, ps->secid);
+			kill_pid_info_as_cred(ps->discsignr, &sinfo,
+					ps->disc_pid, ps->cred, ps->secid);
 		}
 	}
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ac2c05..57ddb52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2166,7 +2166,8 @@ extern int force_sigsegv(int, struct task_struct *);
 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
-extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32);
+extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
+				const struct cred *, u32);
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
diff --git a/kernel/signal.c b/kernel/signal.c
index 466219b..9803707 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1355,13 +1355,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 	return error;
 }
 
+static int kill_as_cred_perm(const struct cred *cred,
+			     struct task_struct *target)
+{
+	const struct cred *pcred = __task_cred(target);
+	if (cred->user_ns != pcred->user_ns)
+		return 0;
+	if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
+	    cred->uid  != pcred->suid && cred->uid  != pcred->uid)
+		return 0;
+	return 1;
+}
+
 /* like kill_pid_info(), but doesn't use uid/euid of "current" */
-int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
-		      uid_t uid, uid_t euid, u32 secid)
+int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
+			 const struct cred *cred, u32 secid)
 {
 	int ret = -EINVAL;
 	struct task_struct *p;
-	const struct cred *pcred;
 	unsigned long flags;
 
 	if (!valid_signal(sig))
@@ -1373,10 +1384,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 		ret = -ESRCH;
 		goto out_unlock;
 	}
-	pcred = __task_cred(p);
-	if (si_fromuser(info) &&
-	    euid != pcred->suid && euid != pcred->uid &&
-	    uid  != pcred->suid && uid  != pcred->uid) {
+	if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
@@ -1395,7 +1403,7 @@ out_unlock:
 	rcu_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
+EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
 
 /*
  * kill_something_info() interprets pid in interesting ways just like kill(2).
-- 
1.7.5.4


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-20 18:53   ` Serge E. Hallyn
@ 2011-09-21 17:53     ` Oleg Nesterov
  2011-09-22 15:23       ` Serge Hallyn
  2011-09-23 16:31       ` Serge E. Hallyn
  0 siblings, 2 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-21 17:53 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/20, Serge E. Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> > On 09/19, Serge E. Hallyn wrote:
> > >
> > > __send_signal: convert the uid being sent in SI_USER to the target task's
> > > user namespace.
> > >
> > > do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
> > >  user namespace
> > >
> > > ptrace_signal: map parent's uid into current's user namespace before
> > > including in signal to current.
> >
> > Btw, what about the other users of ->si_uid? Say, kill() or tkill().
>
> Well, they end up calling __send_signal().

Sure. But your patch changes only the __send_signal(SEND_SIG_NOINFO).
Note that SEND_SIG_NOINFO is rarely used, and only by kernel.

> But you sound more alarmed than that.

I am ;)

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: usb: make usb urbs user namespace aware (v2)
  2011-09-21  5:01     ` [PATCH] user namespace: usb: make usb urbs user namespace aware (v2) Serge E. Hallyn
@ 2011-09-21 18:31       ` Oleg Nesterov
  2011-09-21 19:12         ` Serge E. Hallyn
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-21 18:31 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/21, Serge E. Hallyn wrote:
>
> Add to the dev_state and alloc_async structures the user namespace
> corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
> which can then implement a proper, user-namespace-aware uid check.

Looks correct.



But I have off-topic question. And in fact I am a bit confused,
please help.

First of all, I assume that CLONE_NEWUSER is the only way to change
->user_ns, right?

And, looking at copy_creds() I think that cred->user_ns is always
equal to cred->user->user_ns. However, grep shows a lot of
cred->user->user_ns examples. Why?

> +static int kill_as_cred_perm(const struct cred *cred,
> +			     struct task_struct *target)
> +{
> +	const struct cred *pcred = __task_cred(target);
> +	if (cred->user_ns != pcred->user_ns)
> +		return 0;

Should we really fail if cred->user_ns == pcred->user_ns->creator ?
(or creator of creator, etc).

IOW, shouldn't this match kill_ok_by_cred() path which (at least
cap_capable) checks the ->creator chain when ->user_ns differ?

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: usb: make usb urbs user namespace aware (v2)
  2011-09-21 18:31       ` Oleg Nesterov
@ 2011-09-21 19:12         ` Serge E. Hallyn
  2011-09-21 19:18           ` Greg KH
  0 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-21 19:12 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo,
	serge, Greg KH

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/21, Serge E. Hallyn wrote:
> >
> > Add to the dev_state and alloc_async structures the user namespace
> > corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
> > which can then implement a proper, user-namespace-aware uid check.
> 
> Looks correct.
> 
> 
> 
> But I have off-topic question. And in fact I am a bit confused,
> please help.
> 
> First of all, I assume that CLONE_NEWUSER is the only way to change
> ->user_ns, right?

Yes.

> And, looking at copy_creds() I think that cred->user_ns is always
> equal to cred->user->user_ns. However, grep shows a lot of
> cred->user->user_ns examples. Why?

Good question.  It's only because cred->user_ns is an optimization
recently introduced.  I think those can be safely switched over.

> > +static int kill_as_cred_perm(const struct cred *cred,
> > +			     struct task_struct *target)
> > +{
> > +	const struct cred *pcred = __task_cred(target);
> > +	if (cred->user_ns != pcred->user_ns)
> > +		return 0;
> 
> Should we really fail if cred->user_ns == pcred->user_ns->creator ?
> (or creator of creator, etc).
> 
> IOW, shouldn't this match kill_ok_by_cred() path which (at least
> cap_capable) checks the ->creator chain when ->user_ns differ?

I'm not sure.  We can relax that later if need be, but as this has to do
with usb urbs and userspace drivers, I don't think we'll want to.
Hopefully we can talk with Greg KH about it at some point.  But for now,
with this patch, all interactions between tasks in the initial user
namespace will continue as normal, and we're not allowing anything
untoward between user namespaces, so I think this is best.

Drat.  Greg, sorry about not Cc:ing you on the original patch.  Please
let me know if you'd like me to resend to you.

thanks,
-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: usb: make usb urbs user namespace aware (v2)
  2011-09-21 19:12         ` Serge E. Hallyn
@ 2011-09-21 19:18           ` Greg KH
  2011-09-23  1:27             ` [PATCH resend] " Serge E. Hallyn
  0 siblings, 1 reply; 63+ messages in thread
From: Greg KH @ 2011-09-21 19:18 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Oleg Nesterov, lkml, richard, Andrew Morton, Eric W. Biederman,
	Tejun Heo, serge

On Wed, Sep 21, 2011 at 02:12:38PM -0500, Serge E. Hallyn wrote:
> 
> Drat.  Greg, sorry about not Cc:ing you on the original patch.  Please
> let me know if you'd like me to resend to you.

Please do, and cc: linux-usb@vger.kernel.org as I have no idea what you
are referring to here at all.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-21 17:53     ` Oleg Nesterov
@ 2011-09-22 15:23       ` Serge Hallyn
  2011-09-23 16:31       ` Serge E. Hallyn
  1 sibling, 0 replies; 63+ messages in thread
From: Serge Hallyn @ 2011-09-22 15:23 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/20, Serge E. Hallyn wrote:
> >
> > Quoting Oleg Nesterov (oleg@redhat.com):
> > > On 09/19, Serge E. Hallyn wrote:
> > > >
> > > > __send_signal: convert the uid being sent in SI_USER to the target task's
> > > > user namespace.
> > > >
> > > > do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
> > > >  user namespace
> > > >
> > > > ptrace_signal: map parent's uid into current's user namespace before
> > > > including in signal to current.
> > >
> > > Btw, what about the other users of ->si_uid? Say, kill() or tkill().
> >
> > Well, they end up calling __send_signal().
> 
> Sure. But your patch changes only the __send_signal(SEND_SIG_NOINFO).
> Note that SEND_SIG_NOINFO is rarely used, and only by kernel.

Ah, I was misreading.  I was mistakenly thinking tkill for instance was going
to be handled by the SEND_SIG_NOINFO case.  I'll fix my patch, thanks!

-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH resend] user namespace: usb: make usb urbs user namespace aware (v2)
  2011-09-21 19:18           ` Greg KH
@ 2011-09-23  1:27             ` Serge E. Hallyn
  2011-09-23 15:48               ` Alan Stern
  0 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-23  1:27 UTC (permalink / raw)
  To: Greg KH
  Cc: Serge E. Hallyn, Oleg Nesterov, lkml, richard, Andrew Morton,
	Eric W. Biederman, Tejun Heo, linux-usb

(re-sending to Cc: Greg and linux-usb)

Add to the dev_state and alloc_async structures the user namespace
corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
which can then implement a proper, user-namespace-aware uid check.

Changelog:
Sep 20: Per Oleg's suggestion: Instead of caching and passing user namespace,
	uid, and euid each separately, pass a struct cred.

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Greg KH <greg@kroah.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tejun Heo <tj@kernel.org>
---
 drivers/usb/core/devio.c |   28 ++++++++++++----------------
 include/linux/sched.h    |    3 ++-
 kernel/signal.c          |   24 ++++++++++++++++--------
 3 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 37518df..e53a585 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -46,6 +46,7 @@
 #include <linux/cdev.h>
 #include <linux/notifier.h>
 #include <linux/security.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
 #include <linux/moduleparam.h>
@@ -68,7 +69,7 @@ struct dev_state {
 	wait_queue_head_t wait;     /* wake up if a request completed */
 	unsigned int discsignr;
 	struct pid *disc_pid;
-	uid_t disc_uid, disc_euid;
+	const struct cred *cred;
 	void __user *disccontext;
 	unsigned long ifclaimed;
 	u32 secid;
@@ -79,7 +80,7 @@ struct async {
 	struct list_head asynclist;
 	struct dev_state *ps;
 	struct pid *pid;
-	uid_t uid, euid;
+	const struct cred *cred;
 	unsigned int signr;
 	unsigned int ifnum;
 	void __user *userbuffer;
@@ -248,6 +249,7 @@ static struct async *alloc_async(unsigned int numisoframes)
 static void free_async(struct async *as)
 {
 	put_pid(as->pid);
+	put_cred(as->cred);
 	kfree(as->urb->transfer_buffer);
 	kfree(as->urb->setup_packet);
 	usb_free_urb(as->urb);
@@ -393,9 +395,8 @@ static void async_completed(struct urb *urb)
 	struct dev_state *ps = as->ps;
 	struct siginfo sinfo;
 	struct pid *pid = NULL;
-	uid_t uid = 0;
-	uid_t euid = 0;
 	u32 secid = 0;
+	const struct cred *cred = NULL;
 	int signr;
 
 	spin_lock(&ps->lock);
@@ -408,8 +409,7 @@ static void async_completed(struct urb *urb)
 		sinfo.si_code = SI_ASYNCIO;
 		sinfo.si_addr = as->userurb;
 		pid = as->pid;
-		uid = as->uid;
-		euid = as->euid;
+		cred = as->cred;
 		secid = as->secid;
 	}
 	snoop(&urb->dev->dev, "urb complete\n");
@@ -423,8 +423,7 @@ static void async_completed(struct urb *urb)
 	spin_unlock(&ps->lock);
 
 	if (signr)
-		kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, uid,
-				      euid, secid);
+		kill_pid_info_as_cred(sinfo.si_signo, &sinfo, pid, cred, secid);
 
 	wake_up(&ps->wait);
 }
@@ -706,8 +705,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
 	init_waitqueue_head(&ps->wait);
 	ps->discsignr = 0;
 	ps->disc_pid = get_pid(task_pid(current));
-	ps->disc_uid = cred->uid;
-	ps->disc_euid = cred->euid;
+	ps->cred = get_cred(cred);
 	ps->disccontext = NULL;
 	ps->ifclaimed = 0;
 	security_task_getsecid(current, &ps->secid);
@@ -749,6 +747,7 @@ static int usbdev_release(struct inode *inode, struct file *file)
 	usb_unlock_device(dev);
 	usb_put_dev(dev);
 	put_pid(ps->disc_pid);
+	put_cred(ps->cred);
 
 	as = async_getcompleted(ps);
 	while (as) {
@@ -1048,7 +1047,6 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
 	struct usb_host_endpoint *ep;
 	struct async *as;
 	struct usb_ctrlrequest *dr = NULL;
-	const struct cred *cred = current_cred();
 	unsigned int u, totlen, isofrmlen;
 	int ret, ifnum = -1;
 	int is_in;
@@ -1262,8 +1260,7 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
 	as->signr = uurb->signr;
 	as->ifnum = ifnum;
 	as->pid = get_pid(task_pid(current));
-	as->uid = cred->uid;
-	as->euid = cred->euid;
+	as->cred = get_current_cred();
 	security_task_getsecid(current, &as->secid);
 	if (!is_in && uurb->buffer_length > 0) {
 		if (copy_from_user(as->urb->transfer_buffer, uurb->buffer,
@@ -1981,9 +1978,8 @@ static void usbdev_remove(struct usb_device *udev)
 			sinfo.si_errno = EPIPE;
 			sinfo.si_code = SI_ASYNCIO;
 			sinfo.si_addr = ps->disccontext;
-			kill_pid_info_as_uid(ps->discsignr, &sinfo,
-					ps->disc_pid, ps->disc_uid,
-					ps->disc_euid, ps->secid);
+			kill_pid_info_as_cred(ps->discsignr, &sinfo,
+					ps->disc_pid, ps->cred, ps->secid);
 		}
 	}
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ac2c05..57ddb52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2166,7 +2166,8 @@ extern int force_sigsegv(int, struct task_struct *);
 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
-extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32);
+extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
+				const struct cred *, u32);
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
diff --git a/kernel/signal.c b/kernel/signal.c
index 466219b..9803707 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1355,13 +1355,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 	return error;
 }
 
+static int kill_as_cred_perm(const struct cred *cred,
+			     struct task_struct *target)
+{
+	const struct cred *pcred = __task_cred(target);
+	if (cred->user_ns != pcred->user_ns)
+		return 0;
+	if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
+	    cred->uid  != pcred->suid && cred->uid  != pcred->uid)
+		return 0;
+	return 1;
+}
+
 /* like kill_pid_info(), but doesn't use uid/euid of "current" */
-int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
-		      uid_t uid, uid_t euid, u32 secid)
+int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
+			 const struct cred *cred, u32 secid)
 {
 	int ret = -EINVAL;
 	struct task_struct *p;
-	const struct cred *pcred;
 	unsigned long flags;
 
 	if (!valid_signal(sig))
@@ -1373,10 +1384,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 		ret = -ESRCH;
 		goto out_unlock;
 	}
-	pcred = __task_cred(p);
-	if (si_fromuser(info) &&
-	    euid != pcred->suid && euid != pcred->uid &&
-	    uid  != pcred->suid && uid  != pcred->uid) {
+	if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
@@ -1395,7 +1403,7 @@ out_unlock:
 	rcu_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
+EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
 
 /*
  * kill_something_info() interprets pid in interesting ways just like kill(2).
-- 
1.7.5.4


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH resend] user namespace: usb: make usb urbs user namespace aware (v2)
  2011-09-23  1:27             ` [PATCH resend] " Serge E. Hallyn
@ 2011-09-23 15:48               ` Alan Stern
  2011-09-23 16:06                 ` Serge E. Hallyn
  0 siblings, 1 reply; 63+ messages in thread
From: Alan Stern @ 2011-09-23 15:48 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Greg KH, Serge E. Hallyn, Oleg Nesterov, lkml, richard,
	Andrew Morton, Eric W. Biederman, Tejun Heo, linux-usb

On Fri, 23 Sep 2011, Serge E. Hallyn wrote:

> (re-sending to Cc: Greg and linux-usb)
> 
> Add to the dev_state and alloc_async structures the user namespace
> corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
> which can then implement a proper, user-namespace-aware uid check.
> 
> Changelog:
> Sep 20: Per Oleg's suggestion: Instead of caching and passing user namespace,
> 	uid, and euid each separately, pass a struct cred.

This should be broken up into two separate patches: One to add
kill_pid_info_as_cred() and the other to modify the usbfs driver.

> --- a/drivers/usb/core/devio.c
> +++ b/drivers/usb/core/devio.c

> @@ -393,9 +395,8 @@ static void async_completed(struct urb *urb)
>  	struct dev_state *ps = as->ps;
>  	struct siginfo sinfo;
>  	struct pid *pid = NULL;
> -	uid_t uid = 0;
> -	uid_t euid = 0;
>  	u32 secid = 0;
> +	const struct cred *cred = NULL;
>  	int signr;
>  
>  	spin_lock(&ps->lock);
> @@ -408,8 +409,7 @@ static void async_completed(struct urb *urb)
>  		sinfo.si_code = SI_ASYNCIO;
>  		sinfo.si_addr = as->userurb;
>  		pid = as->pid;
> -		uid = as->uid;
> -		euid = as->euid;
> +		cred = as->cred;
>  		secid = as->secid;
>  	}
>  	snoop(&urb->dev->dev, "urb complete\n");
> @@ -423,8 +423,7 @@ static void async_completed(struct urb *urb)
>  	spin_unlock(&ps->lock);
>  
>  	if (signr)
> -		kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, uid,
> -				      euid, secid);
> +		kill_pid_info_as_cred(sinfo.si_signo, &sinfo, pid, cred, secid);

This continues a bug that already exists in the current code.  Once 
ps->lock is released, there is no guarantee that the async structure 
will still exist.  It may already have been freed, and the reference to 
as->cred may already have been dropped.  That's why the local copies 
have to be made above.  cred shouldn't be a simple copy of as->cred; it 
should also increment the reference count.

> @@ -706,8 +705,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
>  	init_waitqueue_head(&ps->wait);
>  	ps->discsignr = 0;
>  	ps->disc_pid = get_pid(task_pid(current));
> -	ps->disc_uid = cred->uid;
> -	ps->disc_euid = cred->euid;
> +	ps->cred = get_cred(cred);

You might as well get rid of the "cred" local variable.  It isn't used 
for anything except this assignment.

Alan Stern


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH resend] user namespace: usb: make usb urbs user namespace aware (v2)
  2011-09-23 15:48               ` Alan Stern
@ 2011-09-23 16:06                 ` Serge E. Hallyn
  2011-09-23 16:21                   ` Alan Stern
  0 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-23 16:06 UTC (permalink / raw)
  To: Alan Stern
  Cc: Serge E. Hallyn, Greg KH, Oleg Nesterov, lkml, richard,
	Andrew Morton, Eric W. Biederman, Tejun Heo, linux-usb

Quoting Alan Stern (stern@rowland.harvard.edu):
> On Fri, 23 Sep 2011, Serge E. Hallyn wrote:
> 
> > (re-sending to Cc: Greg and linux-usb)
> > 
> > Add to the dev_state and alloc_async structures the user namespace
> > corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
> > which can then implement a proper, user-namespace-aware uid check.
> > 
> > Changelog:
> > Sep 20: Per Oleg's suggestion: Instead of caching and passing user namespace,
> > 	uid, and euid each separately, pass a struct cred.
> 
> This should be broken up into two separate patches: One to add
> kill_pid_info_as_cred() and the other to modify the usbfs driver.

It seems like that would make the first patch harder to review (since
it won't just show the changes from kill_pid_info_as_uid to
kill_pid_info_as_cred), but I'll go ahead and split it up.  I assume
kill_pid_info_as_uid should be removed in a third patch?

> > --- a/drivers/usb/core/devio.c
> > +++ b/drivers/usb/core/devio.c
> 
> > @@ -393,9 +395,8 @@ static void async_compled ted(struct urb *urb)
> >  	struct dev_state *ps = as->ps;
> >  	struct siginfo sinfo;
> >  	struct pid *pid = NULL;
> > -	uid_t uid = 0;
> > -	uid_t euid = 0;
> >  	u32 secid = 0;
> > +	const struct cred *cred = NULL;
> >  	int signr;
> >  
> >  	spin_lock(&ps->lock);
> > @@ -408,8 +409,7 @@ static void async_completed(struct urb *urb)
> >  		sinfo.si_code = SI_ASYNCIO;
> >  		sinfo.si_addr = as->userurb;
> >  		pid = as->pid;
> > -		uid = as->uid;
> > -		euid = as->euid;
> > +		cred = as->cred;
> >  		secid = as->secid;
> >  	}
> >  	snoop(&urb->dev->dev, "urb complete\n");
> > @@ -423,8 +423,7 @@ static void async_completed(struct urb *urb)
> >  	spin_unlock(&ps->lock);
> >  
> >  	if (signr)
> > -		kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, uid,
> > -				      euid, secid);
> > +		kill_pid_info_as_cred(sinfo.si_signo, &sinfo, pid, cred, secid);
> 
> This continues a bug that already exists in the current code.  Once 
> ps->lock is released, there is no guarantee that the async structure 
> will still exist.  It may already have been freed, and the reference to 

Yikes.  That makes sense.  I'll fix that for the cred and the pid as well
then?

> as->cred may already have been dropped.  That's why the local copies 
> have to be made above.  cred shouldn't be a simple copy of as->cred; it 
> should also increment the reference count.
> 
> > @@ -706,8 +705,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
> >  	init_waitqueue_head(&ps->wait);
> >  	ps->discsignr = 0;
> >  	ps->disc_pid = get_pid(task_pid(current));
> > -	ps->disc_uid = cred->uid;
> > -	ps->disc_euid = cred->euid;
> > +	ps->cred = get_cred(cred);
> 
> You might as well get rid of the "cred" local variable.  It isn't used 
> for anything except this assignment.
> 
> Alan Stern

Thanks for looking, Alan.

-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH resend] user namespace: usb: make usb urbs user namespace aware (v2)
  2011-09-23 16:06                 ` Serge E. Hallyn
@ 2011-09-23 16:21                   ` Alan Stern
  2011-09-23 17:22                     ` Serge E. Hallyn
  0 siblings, 1 reply; 63+ messages in thread
From: Alan Stern @ 2011-09-23 16:21 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Serge E. Hallyn, Greg KH, Oleg Nesterov, lkml, richard,
	Andrew Morton, Eric W. Biederman, Tejun Heo, linux-usb

On Fri, 23 Sep 2011, Serge E. Hallyn wrote:

> Quoting Alan Stern (stern@rowland.harvard.edu):
> > On Fri, 23 Sep 2011, Serge E. Hallyn wrote:
> > 
> > > (re-sending to Cc: Greg and linux-usb)
> > > 
> > > Add to the dev_state and alloc_async structures the user namespace
> > > corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
> > > which can then implement a proper, user-namespace-aware uid check.
> > > 
> > > Changelog:
> > > Sep 20: Per Oleg's suggestion: Instead of caching and passing user namespace,
> > > 	uid, and euid each separately, pass a struct cred.
> > 
> > This should be broken up into two separate patches: One to add
> > kill_pid_info_as_cred() and the other to modify the usbfs driver.
> 
> It seems like that would make the first patch harder to review (since
> it won't just show the changes from kill_pid_info_as_uid to
> kill_pid_info_as_cred), but I'll go ahead and split it up.  I assume
> kill_pid_info_as_uid should be removed in a third patch?

No, what I meant was that all the changes to /kernel/signal.c should be 
in one patch and all the changes to drivers/usb/core/devio.c should be 
in a second patch.

You did check to make sure there are no other references to
kill_pid_info_as_uid() in the kernel?

> > >  	if (signr)
> > > -		kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, uid,
> > > -				      euid, secid);
> > > +		kill_pid_info_as_cred(sinfo.si_signo, &sinfo, pid, cred, secid);
> > 
> > This continues a bug that already exists in the current code.  Once 
> > ps->lock is released, there is no guarantee that the async structure 
> > will still exist.  It may already have been freed, and the reference to 
> 
> Yikes.  That makes sense.  I'll fix that for the cred and the pid as well
> then?

Right.

> > as->cred may already have been dropped.  That's why the local copies 
> > have to be made above.  cred shouldn't be a simple copy of as->cred; it 
> > should also increment the reference count.
> > 
> > > @@ -706,8 +705,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
> > >  	init_waitqueue_head(&ps->wait);
> > >  	ps->discsignr = 0;
> > >  	ps->disc_pid = get_pid(task_pid(current));
> > > -	ps->disc_uid = cred->uid;
> > > -	ps->disc_euid = cred->euid;
> > > +	ps->cred = get_cred(cred);
> > 
> > You might as well get rid of the "cred" local variable.  It isn't used 
> > for anything except this assignment.
> > 
> > Alan Stern
> 
> Thanks for looking, Alan.

You're welcome.

Alan Stern


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-21 17:53     ` Oleg Nesterov
  2011-09-22 15:23       ` Serge Hallyn
@ 2011-09-23 16:31       ` Serge E. Hallyn
  2011-09-23 17:36         ` Oleg Nesterov
  1 sibling, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-23 16:31 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/20, Serge E. Hallyn wrote:
> >
> > Quoting Oleg Nesterov (oleg@redhat.com):
> > > On 09/19, Serge E. Hallyn wrote:
> > > >
> > > > __send_signal: convert the uid being sent in SI_USER to the target task's
> > > > user namespace.
> > > >
> > > > do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
> > > >  user namespace
> > > >
> > > > ptrace_signal: map parent's uid into current's user namespace before
> > > > including in signal to current.
> > >
> > > Btw, what about the other users of ->si_uid? Say, kill() or tkill().
> >
> > Well, they end up calling __send_signal().
> 
> Sure. But your patch changes only the __send_signal(SEND_SIG_NOINFO).
> Note that SEND_SIG_NOINFO is rarely used, and only by kernel.
> 
> > But you sound more alarmed than that.
> 
> I am ;)

Ok, thanks for setting me straight, Oleg.  It looks like I can fix all the
cases at send_signal() by checking whether si_fromuser(info) and
current_user_ns() != task_cred_xxx(t, user_ns), and mapping the uid if
so.

-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH resend] user namespace: usb: make usb urbs user namespace aware (v2)
  2011-09-23 16:21                   ` Alan Stern
@ 2011-09-23 17:22                     ` Serge E. Hallyn
  2011-09-23 18:35                       ` Alan Stern
  0 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-23 17:22 UTC (permalink / raw)
  To: Alan Stern
  Cc: Serge E. Hallyn, Greg KH, Oleg Nesterov, lkml, richard,
	Andrew Morton, Eric W. Biederman, Tejun Heo, linux-usb

Quoting Alan Stern (stern@rowland.harvard.edu):
> On Fri, 23 Sep 2011, Serge E. Hallyn wrote:
> 
> > Quoting Alan Stern (stern@rowland.harvard.edu):
> > > On Fri, 23 Sep 2011, Serge E. Hallyn wrote:
> > > 
> > > > (re-sending to Cc: Greg and linux-usb)
> > > > 
> > > > Add to the dev_state and alloc_async structures the user namespace
> > > > corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
> > > > which can then implement a proper, user-namespace-aware uid check.
> > > > 
> > > > Changelog:
> > > > Sep 20: Per Oleg's suggestion: Instead of caching and passing user namespace,
> > > > 	uid, and euid each separately, pass a struct cred.
> > > 
> > > This should be broken up into two separate patches: One to add
> > > kill_pid_info_as_cred() and the other to modify the usbfs driver.
> > 
> > It seems like that would make the first patch harder to review (since
> > it won't just show the changes from kill_pid_info_as_uid to
> > kill_pid_info_as_cred), but I'll go ahead and split it up.  I assume
> > kill_pid_info_as_uid should be removed in a third patch?
> 
> No, what I meant was that all the changes to /kernel/signal.c should be 
> in one patch and all the changes to drivers/usb/core/devio.c should be 
> in a second patch.

But doing that in two patches would not be bisect-safe.

> You did check to make sure there are no other references to
> kill_pid_info_as_uid() in the kernel?

Yup.

> > > >  	if (signr)
> > > > -		kill_pid_info_as_uid(sinfo.si_signo, &sinfo, pid, uid,
> > > > -				      euid, secid);
> > > > +		kill_pid_info_as_cred(sinfo.si_signo, &sinfo, pid, cred, secid);
> > > 
> > > This continues a bug that already exists in the current code.  Once 
> > > ps->lock is released, there is no guarantee that the async structure 
> > > will still exist.  It may already have been freed, and the reference to 
> > 
> > Yikes.  That makes sense.  I'll fix that for the cred and the pid as well
> > then?
> 
> Right.

Ok.

thanks,
-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-23 16:31       ` Serge E. Hallyn
@ 2011-09-23 17:36         ` Oleg Nesterov
  2011-09-23 21:20           ` Serge E. Hallyn
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-23 17:36 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/23, Serge E. Hallyn wrote:
>
> It looks like I can fix all the
> cases

except ptrace_signal(). Although we can simply ignore this case, imho.

> at send_signal()

Yes, I was thinking about this too but didn't have the time to check
if this can really work.

> by checking whether si_fromuser(info)

I am not sure... sys_rt_queueinfo() is nasty. Plus we have to handle
the "fromkernel" case too. May be we can ignore this too.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH resend] user namespace: usb: make usb urbs user namespace aware (v2)
  2011-09-23 17:22                     ` Serge E. Hallyn
@ 2011-09-23 18:35                       ` Alan Stern
  0 siblings, 0 replies; 63+ messages in thread
From: Alan Stern @ 2011-09-23 18:35 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Serge E. Hallyn, Greg KH, Oleg Nesterov, lkml, richard,
	Andrew Morton, Eric W. Biederman, Tejun Heo, linux-usb

On Fri, 23 Sep 2011, Serge E. Hallyn wrote:

> > > > This should be broken up into two separate patches: One to add
> > > > kill_pid_info_as_cred() and the other to modify the usbfs driver.
> > > 
> > > It seems like that would make the first patch harder to review (since
> > > it won't just show the changes from kill_pid_info_as_uid to
> > > kill_pid_info_as_cred), but I'll go ahead and split it up.  I assume
> > > kill_pid_info_as_uid should be removed in a third patch?
> > 
> > No, what I meant was that all the changes to /kernel/signal.c should be 
> > in one patch and all the changes to drivers/usb/core/devio.c should be 
> > in a second patch.
> 
> But doing that in two patches would not be bisect-safe.

Argh, you're right.

It's not a good idea to combine changes to widely differing subsystems
in a single patch, but luckily this is small enough that it doesn't
matter too much.  If nobody else minds, I won't complain.

Alan Stern


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-23 17:36         ` Oleg Nesterov
@ 2011-09-23 21:20           ` Serge E. Hallyn
  2011-09-24 16:37             ` Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-23 21:20 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/23, Serge E. Hallyn wrote:
> >
> > It looks like I can fix all the
> > cases
> 
> except ptrace_signal(). Although we can simply ignore this case, imho.

ptrace_signal() calls send_signal() though.

But oh, dear.  It is sending a signal on behalf of the parent, so what I was
about to send, which does the uid mapping at send_signal() using current_cred()
as the id to send, is not correct.

Which unfortunately means that I'll have to have the caller of send_signal
(when doing SI_USER) either send the signal sender's cred, or do the
mapping.  Or special-case ptrace_signal().  This is not as pretty as I'd
hoped.

> > at send_signal()
> 
> Yes, I was thinking about this too but didn't have the time to check
> if this can really work.
> 
> > by checking whether si_fromuser(info)
> 
> I am not sure... sys_rt_queueinfo() is nasty. Plus we have to handle
> the "fromkernel" case too. May be we can ignore this too.

sys_rt_tgsigqueueinfo() still seems to go through send_signal().

thanks,
-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-23 21:20           ` Serge E. Hallyn
@ 2011-09-24 16:37             ` Oleg Nesterov
  2011-09-25 20:17               ` Serge E. Hallyn
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-24 16:37 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/23, Serge E. Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> > On 09/23, Serge E. Hallyn wrote:
> > >
> > > It looks like I can fix all the
> > > cases
> >
> > except ptrace_signal(). Although we can simply ignore this case, imho.
>
> ptrace_signal() calls send_signal() though.

Confused... I meant the "if (signr != info->si_signo)" case. This is
simple, and I only meant that this case is not that important.

> > > by checking whether si_fromuser(info)
> >
> > I am not sure... sys_rt_queueinfo() is nasty. Plus we have to handle
> > the "fromkernel" case too. May be we can ignore this too.
>
> sys_rt_tgsigqueueinfo() still seems to go through send_signal().

Yes. But how can you fix si_uid? We do not even know if it exists.
Please look at siginfo/_uid, there is a union. We can't know what
the caller of sys_rt_sigqueueinfo() puts in this location.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-24 16:37             ` Oleg Nesterov
@ 2011-09-25 20:17               ` Serge E. Hallyn
  2011-09-26 16:06                 ` Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-09-25 20:17 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/23, Serge E. Hallyn wrote:
> >
> > Quoting Oleg Nesterov (oleg@redhat.com):
> > > On 09/23, Serge E. Hallyn wrote:
> > > >
> > > > It looks like I can fix all the
> > > > cases
> > >
> > > except ptrace_signal(). Although we can simply ignore this case, imho.
> >
> > ptrace_signal() calls send_signal() though.
> 
> Confused... I meant the "if (signr != info->si_signo)" case. This is
> simple, and I only meant that this case is not that important.

Yes, that's the case I was talking about.  That then proceeds through
send_signal().

It appears to be the only (user) caller of send_signal() where the
info->si_pid and info->si_uid are not filled in from current.  But I
just realized that it's not a problem for the same reason that it isn't
for the analogous pid code - since the target task is current, the test
for whether current userns != target userns will always return false, so
the code simply won't trigger :)

+	if (si_fromuser(info) && current_user_ns() != task_cred_xxx(t, user_ns)) {
+		rcu_read_lock();
+		if (!map_cred_ns(current_cred(), task_cred_xxx(t, user_ns)))
+			userns_rel = USERNS_ANCESTOR;
+		else
+			userns_rel = USERNS_OTHER;
+		rcu_read_unlock();
+	}

The whole new patch (so far only compile-tested) is below.

> > > > by checking whether si_fromuser(info)
> > >
> > > I am not sure... sys_rt_queueinfo() is nasty. Plus we have to handle
> > > the "fromkernel" case too. May be we can ignore this too.
> >
> > sys_rt_tgsigqueueinfo() still seems to go through send_signal().
> 
> Yes. But how can you fix si_uid? We do not even know if it exists.
> Please look at siginfo/_uid, there is a union. We can't know what
> the caller of sys_rt_sigqueueinfo() puts in this location.

But it's a union alongside the pid.  I don't understand the callers
well enough to say whether there is a bug in the pid handling.  I'll
certainly try to take a wider look next week, but it seems to me
at least this patch should bring the uid namespacing code up to par
with the pid namespacing code for signals.

(patch follows - I'll resend if it passes more strenuous testing)

thanks,
-serge

 From d73866bec4032bed383ba977d11f97d0ae3e9596 Mon Sep 17 00:00:00 2001
 From: Serge Hallyn <serge.hallyn@canonical.com>
 Date: Mon, 19 Sep 2011 18:07:55 +0100
 Subject: [PATCH 1/1] user namespace: make signal.c respect user namespaces (v3)

__send_signal: convert the uid being sent in SI_USER to the target task's
user namespace.

do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
 user namespace

ptrace_signal: map parent's uid into current's user namespace before
including in signal to current.

Changelog:
Sep 20: Inspired by Oleg's suggestion, define map_cred_ns() helper to
	simplify callers and help make clear what we are translating
        (which uid into which namespace).  Passing the target task would
	make callers even easier to read, but we pass in user_ns because
	current_user_ns() != task_cred_xxx(current, user_ns).
Sep 20: As recommended by Oleg, also put task_pid_vnr() under rcu_read_lock
	in ptrace_signal().
Sep 23: In send_signal(), detect when (user) signal is coming from an
	ancestor or unrelated user namespace.  Pass that on to __send_signal,
	which sets si_uid to 0 or overflowuid if needed.

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tejun Heo <tj@kernel.org>
---
 kernel/signal.c |   59 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 291c970..d2ee505 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,6 +27,7 @@
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/nsproxy.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
@@ -37,6 +38,14 @@
 #include <asm/siginfo.h>
 #include "audit.h"	/* audit_signal_info() */
 
+extern int overflowuid;
+/* sender and target have same user namespace */
+#define USERNS_SAME 0
+/* sendor's userns is ancestor of target's */
+#define USERNS_ANCESTOR 1
+/* sendor's userns is child of target's or unrelated to it */
+#define USERNS_OTHER 2
+
 /*
  * SLAB caches for signal bits.
  */
@@ -1019,8 +1028,17 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
 	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
 }
 
+/*
+ * map the uid in struct cred into user namespace *ns
+ */
+static inline uid_t map_cred_ns(const struct cred *cred,
+				struct user_namespace *ns)
+{
+	return user_ns_map_uid(ns, cred, cred->uid);
+}
+
 static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
-			int group, int from_ancestor_ns)
+			int group, int from_ancestor_ns, int userns_rel)
 {
 	struct sigpending *pending;
 	struct sigqueue *q;
@@ -1073,7 +1091,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			q->info.si_code = SI_USER;
 			q->info.si_pid = task_tgid_nr_ns(current,
 							task_active_pid_ns(t));
-			q->info.si_uid = current_uid();
+			q->info.si_uid = map_cred_ns(current_cred(),
+						     task_cred_xxx(t, user_ns));
 			break;
 		case (unsigned long) SEND_SIG_PRIV:
 			q->info.si_signo = sig;
@@ -1086,6 +1105,16 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			copy_siginfo(&q->info, info);
 			if (from_ancestor_ns)
 				q->info.si_pid = 0;
+			switch (userns_rel) {
+			case USERNS_SAME:
+				break;
+			case USERNS_ANCESTOR:
+				q->info.si_uid = 0;
+				break;
+			default:
+				q->info.si_uid = overflowuid;
+				break;
+			}
 			break;
 		}
 	} else if (!is_si_special(info)) {
@@ -1117,13 +1146,24 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			int group)
 {
 	int from_ancestor_ns = 0;
+	int userns_rel = USERNS_SAME;
 
 #ifdef CONFIG_PID_NS
 	from_ancestor_ns = si_fromuser(info) &&
 			   !task_pid_nr_ns(current, task_active_pid_ns(t));
 #endif
+#ifdef CONFIG_USER_NS
+	if (si_fromuser(info) && current_user_ns() != task_cred_xxx(t, user_ns)) {
+		rcu_read_lock();
+		if (!map_cred_ns(current_cred(), task_cred_xxx(t, user_ns)))
+			userns_rel = USERNS_ANCESTOR;
+		else
+			userns_rel = USERNS_OTHER;
+		rcu_read_unlock();
+	}
+#endif
 
-	return __send_signal(sig, info, t, group, from_ancestor_ns);
+	return __send_signal(sig, info, t, group, from_ancestor_ns, userns_rel);
 }
 
 static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -1375,7 +1415,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 
 	if (sig) {
 		if (lock_task_sighand(p, &flags)) {
-			ret = __send_signal(sig, info, p, 1, 0);
+			ret = __send_signal(sig, info, p, 1, 0, USERNS_SAME);
 			unlock_task_sighand(p, &flags);
 		} else
 			ret = -ESRCH;
@@ -1618,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
-	info.si_uid = __task_cred(tsk)->uid;
+	info.si_uid = map_cred_ns(__task_cred(tsk),
+				  task_cred_xxx(tsk->parent, user_ns));
 	rcu_read_unlock();
 
 	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
@@ -1703,7 +1744,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
-	info.si_uid = __task_cred(tsk)->uid;
+	info.si_uid = map_cred_ns(__task_cred(tsk),
+				  task_cred_xxx(parent, user_ns));
 	rcu_read_unlock();
 
 	info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -2121,8 +2163,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
 		info->si_signo = signr;
 		info->si_errno = 0;
 		info->si_code = SI_USER;
+		rcu_read_lock();
 		info->si_pid = task_pid_vnr(current->parent);
-		info->si_uid = task_uid(current->parent);
+		info->si_uid = map_cred_ns(__task_cred(current->parent),
+					   current_user_ns());
+		rcu_read_unlock();
 	}
 
 	/* If the (new) signal is now blocked, requeue it.  */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-25 20:17               ` Serge E. Hallyn
@ 2011-09-26 16:06                 ` Oleg Nesterov
  2011-09-27 14:28                   ` Serge Hallyn
                                     ` (2 more replies)
  0 siblings, 3 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-26 16:06 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/25, Serge E. Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> > On 09/23, Serge E. Hallyn wrote:
> > >
> > > Quoting Oleg Nesterov (oleg@redhat.com):
> > > > On 09/23, Serge E. Hallyn wrote:
> > > > >
> > > > > It looks like I can fix all the
> > > > > cases
> > > >
> > > > except ptrace_signal(). Although we can simply ignore this case, imho.
> > >
> > > ptrace_signal() calls send_signal() though.
> >
> > Confused... I meant the "if (signr != info->si_signo)" case. This is
> > simple, and I only meant that this case is not that important.
>
> Yes, that's the case I was talking about.  That then proceeds through
> send_signal().

It doesn't? I am even more confuused. Anyway, your patch adds map_cred_ns()
into ptrace_signal().

> The whole new patch (so far only compile-tested) is below.

Perhaps I missed something, but it looks overcomplicated. I was thinking
about the (uncompiled/untested) simple patch below (it ignores ptrace_signal
for clarity).

And note that this way we do not need to modify do_notify_parent*()
or ipc/mqueue.c:__do_notify() (your patch doesn't cover the latter).
Unless I missed something of course.

And we do not need to handle the SEND_SIG_NOINFO case separately.


However, we still have the problems with sigqueueinfo, 

> > > > > by checking whether si_fromuser(info)
> > > >
> > > > I am not sure... sys_rt_queueinfo() is nasty. Plus we have to handle
> > > > the "fromkernel" case too. May be we can ignore this too.
> > >
> > > sys_rt_tgsigqueueinfo() still seems to go through send_signal().
> >
> > Yes. But how can you fix si_uid? We do not even know if it exists.
> > Please look at siginfo/_uid, there is a union. We can't know what
> > the caller of sys_rt_sigqueueinfo() puts in this location.
>
> But it's a union alongside the pid.

Again, I do not understand... Yes, we have the same problem with

	if (from_ancestor_ns)
		q->info.si_pid = 0;

This was discussed, we do not know what we can do. My point was, this
change is not sigqueueinfo-friendly too.

Oleg.

--- x/kernel/signal.c
+++ x/kernel/signal.c
@@ -1019,6 +1019,27 @@ static inline int legacy_queue(struct si
 	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
 }
 
+static inline fixup_uid(struct siginfo *info, struct task_struct *t)
+{
+#ifdef CONFIG_USER_NS
+	if (current_user_ns() == task_cred_xxx(t, user_ns)))
+#endif
+		return;
+
+	if (SI_FROMKERNEL(info))
+		switch (info->si_code & __SI_MASK) {
+			default:
+				return;
+
+			case __SI_CHLD:
+			case __SI_MESGQ:
+				break;
+		}
+
+	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
+					current_cred(), info->si_uid);
+}
+
 static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			int group, int from_ancestor_ns)
 {
@@ -1088,6 +1109,9 @@ static int __send_signal(int sig, struct
 				q->info.si_pid = 0;
 			break;
 		}
+
+		fixup_uid(info, t);
+
 	} else if (!is_si_special(info)) {
 		if (sig >= SIGRTMIN && info->si_code != SI_USER) {
 			/*


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-26 16:06                 ` Oleg Nesterov
@ 2011-09-27 14:28                   ` Serge Hallyn
  2011-09-27 14:38                     ` Oleg Nesterov
  2011-10-04 17:42                   ` Serge E. Hallyn
  2011-10-08 20:02                   ` Serge E. Hallyn
  2 siblings, 1 reply; 63+ messages in thread
From: Serge Hallyn @ 2011-09-27 14:28 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/25, Serge E. Hallyn wrote:
> >
> > Quoting Oleg Nesterov (oleg@redhat.com):
> > > On 09/23, Serge E. Hallyn wrote:
> > > >
> > > > Quoting Oleg Nesterov (oleg@redhat.com):
> > > > > On 09/23, Serge E. Hallyn wrote:
> > > > > >
> > > > > > It looks like I can fix all the
> > > > > > cases
> > > > >
> > > > > except ptrace_signal(). Although we can simply ignore this case, imho.
> > > >
> > > > ptrace_signal() calls send_signal() though.
> > >
> > > Confused... I meant the "if (signr != info->si_signo)" case. This is
> > > simple, and I only meant that this case is not that important.
> >
> > Yes, that's the case I was talking about.  That then proceeds through
> > send_signal().
> 
> It doesn't?

No, I was saying it *does*.

> I am even more confuused. Anyway, your patch adds map_cred_ns()
> into ptrace_signal().

Yes.  Which is fine, because target task is current, so the code I add
in send_signal() which tries to map the uid if necessary will check
that current == current, and not re-map the uid.

> > The whole new patch (so far only compile-tested) is below.
> 
> Perhaps I missed something, but it looks overcomplicated.

That may be :)

> I was thinking
> about the (uncompiled/untested) simple patch below (it ignores ptrace_signal
> for clarity).
> 
> And note that this way we do not need to modify do_notify_parent*()
> or ipc/mqueue.c:__do_notify() (your patch doesn't cover the latter).
> Unless I missed something of course.
> 
> And we do not need to handle the SEND_SIG_NOINFO case separately.
> 
> 
> However, we still have the problems with sigqueueinfo, 
> 
> > > > > > by checking whether si_fromuser(info)
> > > > >
> > > > > I am not sure... sys_rt_queueinfo() is nasty. Plus we have to handle
> > > > > the "fromkernel" case too. May be we can ignore this too.
> > > >
> > > > sys_rt_tgsigqueueinfo() still seems to go through send_signal().
> > >
> > > Yes. But how can you fix si_uid? We do not even know if it exists.
> > > Please look at siginfo/_uid, there is a union. We can't know what
> > > the caller of sys_rt_sigqueueinfo() puts in this location.
> >
> > But it's a union alongside the pid.
> 
> Again, I do not understand... Yes, we have the same problem with
> 
> 	if (from_ancestor_ns)
> 		q->info.si_pid = 0;
> 
> This was discussed, we do not know what we can do.

I see.

> My point was, this
> change is not sigqueueinfo-friendly too.

Yup.

> Oleg.
> 
> --- x/kernel/signal.c
> +++ x/kernel/signal.c
> @@ -1019,6 +1019,27 @@ static inline int legacy_queue(struct si
>  	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
>  }
>  
> +static inline fixup_uid(struct siginfo *info, struct task_struct *t)
> +{
> +#ifdef CONFIG_USER_NS
> +	if (current_user_ns() == task_cred_xxx(t, user_ns)))
> +#endif
> +		return;
> +
> +	if (SI_FROMKERNEL(info))
> +		switch (info->si_code & __SI_MASK) {
> +			default:
> +				return;
> +
> +			case __SI_CHLD:
> +			case __SI_MESGQ:
> +				break;
> +		}
> +
> +	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
> +					current_cred(), info->si_uid);
> +}
> +
>  static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
>  			int group, int from_ancestor_ns)
>  {
> @@ -1088,6 +1109,9 @@ static int __send_signal(int sig, struct
>  				q->info.si_pid = 0;
>  			break;
>  		}
> +
> +		fixup_uid(info, t);
> +
>  	} else if (!is_si_special(info)) {
>  		if (sig >= SIGRTMIN && info->si_code != SI_USER) {
>  			/*

It certainly is much simpler.  I'll take some time to walk through all
of send_signal again and make sure I understand what it does in all
the cases.

thanks,
-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-27 14:28                   ` Serge Hallyn
@ 2011-09-27 14:38                     ` Oleg Nesterov
  2011-09-27 15:27                       ` Serge Hallyn
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-27 14:38 UTC (permalink / raw)
  To: Serge Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/27, Serge Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> > On 09/25, Serge E. Hallyn wrote:
> > >
> > > Yes, that's the case I was talking about.  That then proceeds through
> > > send_signal().
> >
> > It doesn't?
>
> No, I was saying it *does*.

But it doesn't ;)

Serge, there is some misunderstanding. And I do not know who is confused,
me or your.

ptrace_signal() simply fills *info with some "random" data before
processing the signal. It doesn't pass this info to send_signal().

If the debuggure wants something meaningfull in *info, it should
fill it itself via PTRACE_SETSIGINFO. This handles the case when
the debugger doesn't really care and only changes signr.

> > +static inline fixup_uid(struct siginfo *info, struct task_struct *t)
> > +{
> > +#ifdef CONFIG_USER_NS
> > +	if (current_user_ns() == task_cred_xxx(t, user_ns)))
> > +#endif
> > +		return;
> > +
> > +	if (SI_FROMKERNEL(info))
> > +		switch (info->si_code & __SI_MASK) {
> > +			default:
> > +				return;
> > +
> > +			case __SI_CHLD:
> > +			case __SI_MESGQ:
> > +				break;
> > +		}
> > +
> > +	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
> > +					current_cred(), info->si_uid);
> > +}
> > +
> >  static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
> >  			int group, int from_ancestor_ns)
> >  {
> > @@ -1088,6 +1109,9 @@ static int __send_signal(int sig, struct
> >  				q->info.si_pid = 0;
> >  			break;
> >  		}
> > +
> > +		fixup_uid(info, t);
> > +
> >  	} else if (!is_si_special(info)) {
> >  		if (sig >= SIGRTMIN && info->si_code != SI_USER) {
> >  			/*
>
> It certainly is much simpler.  I'll take some time to walk through all
> of send_signal again and make sure I understand what it does in all
> the cases.

Thanks.



As for ptrace_signal(), it can use the same helper too. Once again,
debugger can use PTRACE_SETSIGINFO, if we really want to fixup si_uid
we should do this even if signr == info->si_signo. OTOH, we do not
know what debugger puts in this chunk of memory. This is like
sigqueueinfo().

Or we can simply leave this code as is. I do not think this si_uid
(in this case) is really important.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-27 14:38                     ` Oleg Nesterov
@ 2011-09-27 15:27                       ` Serge Hallyn
  2011-09-27 17:12                         ` Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Serge Hallyn @ 2011-09-27 15:27 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/27, Serge Hallyn wrote:
> >
> > Quoting Oleg Nesterov (oleg@redhat.com):
> > > On 09/25, Serge E. Hallyn wrote:
> > > >
> > > > Yes, that's the case I was talking about.  That then proceeds through
> > > > send_signal().
> > >
> > > It doesn't?
> >
> > No, I was saying it *does*.
> 
> But it doesn't ;)
> 
> Serge, there is some misunderstanding. And I do not know who is confused,
> me or your.
> 
> ptrace_signal() simply fills *info with some "random" data before
> processing the signal. It doesn't pass this info to send_signal().

Oh, well not always, but it does in the case where
	sigismember(&current->blocked, signr);
But I suppose that's not a common path :)

> If the debuggure wants something meaningfull in *info, it should
> fill it itself via PTRACE_SETSIGINFO. This handles the case when
> the debugger doesn't really care and only changes signr.

Ok.

> > > +static inline fixup_uid(struct siginfo *info, struct task_struct *t)
> > > +{
> > > +#ifdef CONFIG_USER_NS
> > > +	if (current_user_ns() == task_cred_xxx(t, user_ns)))
> > > +#endif
> > > +		return;
> > > +
> > > +	if (SI_FROMKERNEL(info))
> > > +		switch (info->si_code & __SI_MASK) {
> > > +			default:
> > > +				return;
> > > +
> > > +			case __SI_CHLD:
> > > +			case __SI_MESGQ:
> > > +				break;
> > > +		}
> > > +
> > > +	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
> > > +					current_cred(), info->si_uid);
> > > +}
> > > +
> > >  static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
> > >  			int group, int from_ancestor_ns)
> > >  {
> > > @@ -1088,6 +1109,9 @@ static int __send_signal(int sig, struct
> > >  				q->info.si_pid = 0;
> > >  			break;
> > >  		}
> > > +
> > > +		fixup_uid(info, t);
> > > +
> > >  	} else if (!is_si_special(info)) {
> > >  		if (sig >= SIGRTMIN && info->si_code != SI_USER) {
> > >  			/*
> >
> > It certainly is much simpler.  I'll take some time to walk through all
> > of send_signal again and make sure I understand what it does in all
> > the cases.
> 
> Thanks.
> 
> 
> 
> As for ptrace_signal(), it can use the same helper too. Once again,
> debugger can use PTRACE_SETSIGINFO, if we really want to fixup si_uid
> we should do this even if signr == info->si_signo. OTOH, we do not
> know what debugger puts in this chunk of memory. This is like
> sigqueueinfo().

True.

> Or we can simply leave this code as is. I do not think this si_uid
> (in this case) is really important.

Ok, that sounds good to me.

thanks,
-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-27 15:27                       ` Serge Hallyn
@ 2011-09-27 17:12                         ` Oleg Nesterov
  0 siblings, 0 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-09-27 17:12 UTC (permalink / raw)
  To: Serge Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

On 09/27, Serge Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> >
> > ptrace_signal() simply fills *info with some "random" data before
> > processing the signal. It doesn't pass this info to send_signal().
>
> Oh, well not always, but it does in the case where
> 	sigismember(&current->blocked, signr);

Ah, thanks, now I see what you mean.

> But I suppose that's not a common path :)

Yes. And, in this case the task "sends" a signal to itself. This means,
whatever we do with  __send_signal() it shouldn't change info->si_uid.
(At least with fixup_uid() I suggested).

I am glad we finally understand each other, most probably I didn't read
your previous emails carefully ;)

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-26 16:06                 ` Oleg Nesterov
  2011-09-27 14:28                   ` Serge Hallyn
@ 2011-10-04 17:42                   ` Serge E. Hallyn
  2011-10-09 19:00                     ` Oleg Nesterov
  2011-10-08 20:02                   ` Serge E. Hallyn
  2 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-10-04 17:42 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> --- x/kernel/signal.c
> +++ x/kernel/signal.c
> @@ -1019,6 +1019,27 @@ static inline int legacy_queue(struct si
>  	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
>  }
>  
> +static inline fixup_uid(struct siginfo *info, struct task_struct *t)
> +{
> +#ifdef CONFIG_USER_NS
> +	if (current_user_ns() == task_cred_xxx(t, user_ns)))
> +#endif
> +		return;
> +
> +	if (SI_FROMKERNEL(info))
> +		switch (info->si_code & __SI_MASK) {
> +			default:
> +				return;
> +
> +			case __SI_CHLD:

If I'm reading this right, this will catch do_notify_parent, which is
sending signals not from current.  Not sure of the best way to avoid
this.  For pids that case is being ignored by not using the __SI_MASK,
so CLD_EXITED etc will cause from_ancestor_ns to be false.  Could we
do that here too, and then translate the pids in do_notify_parent?

> +			case __SI_MESGQ:
> +				break;
> +		}
> +
> +	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
> +					current_cred(), info->si_uid);
> +}
> +
>  static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
>  			int group, int from_ancestor_ns)
>  {
> @@ -1088,6 +1109,9 @@ static int __send_signal(int sig, struct
>  				q->info.si_pid = 0;
>  			break;
>  		}
> +
> +		fixup_uid(info, t);
> +
>  	} else if (!is_si_special(info)) {
>  		if (sig >= SIGRTMIN && info->si_code != SI_USER) {
>  			/*
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-09-26 16:06                 ` Oleg Nesterov
  2011-09-27 14:28                   ` Serge Hallyn
  2011-10-04 17:42                   ` Serge E. Hallyn
@ 2011-10-08 20:02                   ` Serge E. Hallyn
  2011-10-09 19:03                     ` Oleg Nesterov
  2 siblings, 1 reply; 63+ messages in thread
From: Serge E. Hallyn @ 2011-10-08 20:02 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Serge E. Hallyn, lkml, richard, Andrew Morton, Eric W. Biederman,
	Tejun Heo

Quoting Oleg Nesterov (oleg@redhat.com):
> On 09/25, Serge E. Hallyn wrote:
> >
> > Quoting Oleg Nesterov (oleg@redhat.com):
> > > On 09/23, Serge E. Hallyn wrote:
> > > >
> > > > Quoting Oleg Nesterov (oleg@redhat.com):
> > > > > On 09/23, Serge E. Hallyn wrote:
> > > > > >
> > > > > > It looks like I can fix all the
> > > > > > cases
> > > > >
> > > > > except ptrace_signal(). Although we can simply ignore this case, imho.
> > > >
> > > > ptrace_signal() calls send_signal() though.
> > >
> > > Confused... I meant the "if (signr != info->si_signo)" case. This is
> > > simple, and I only meant that this case is not that important.
> >
> > Yes, that's the case I was talking about.  That then proceeds through
> > send_signal().
> 
> It doesn't? I am even more confuused. Anyway, your patch adds map_cred_ns()
> into ptrace_signal().
> 
> > The whole new patch (so far only compile-tested) is below.
> 
> Perhaps I missed something, but it looks overcomplicated. I was thinking
> about the (uncompiled/untested) simple patch below (it ignores ptrace_signal
> for clarity).
> 
> And note that this way we do not need to modify do_notify_parent*()
> or ipc/mqueue.c:__do_notify() (your patch doesn't cover the latter).
> Unless I missed something of course.
> 
> And we do not need to handle the SEND_SIG_NOINFO case separately.
> 
> 
> However, we still have the problems with sigqueueinfo, 
> 
> > > > > > by checking whether si_fromuser(info)
> > > > >
> > > > > I am not sure... sys_rt_queueinfo() is nasty. Plus we have to handle
> > > > > the "fromkernel" case too. May be we can ignore this too.
> > > >
> > > > sys_rt_tgsigqueueinfo() still seems to go through send_signal().
> > >
> > > Yes. But how can you fix si_uid? We do not even know if it exists.
> > > Please look at siginfo/_uid, there is a union. We can't know what
> > > the caller of sys_rt_sigqueueinfo() puts in this location.
> >
> > But it's a union alongside the pid.
> 
> Again, I do not understand... Yes, we have the same problem with
> 
> 	if (from_ancestor_ns)
> 		q->info.si_pid = 0;
> 
> This was discussed, we do not know what we can do. My point was, this
> change is not sigqueueinfo-friendly too.
> 
> Oleg.
> 
> --- x/kernel/signal.c
> +++ x/kernel/signal.c
> @@ -1019,6 +1019,27 @@ static inline int legacy_queue(struct si
>  	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
>  }
>  
> +static inline fixup_uid(struct siginfo *info, struct task_struct *t)
> +{
> +#ifdef CONFIG_USER_NS
> +	if (current_user_ns() == task_cred_xxx(t, user_ns)))
> +#endif
> +		return;
> +
> +	if (SI_FROMKERNEL(info))
> +		switch (info->si_code & __SI_MASK) {
> +			default:
> +				return;
> +
> +			case __SI_CHLD:

After looking a bit more, I really think the __SI_CHLD case just needs to
always be converted at the callers (i.e. do_notify_parent).

> +			case __SI_MESGQ:

This can be done like this, but if this is going to be the only case of
a SI_FROMKERNEL not being converted at the caller, and there is aiui only
one caller (__do_notivy in ipc/mmqueue.c), it seems better to just have
fixup_uid() always return for SI_FROMKERNEL(info).

Does that sound ok?

If so, I'll whip up the patch and send it out after some testing.  (If
not, I'll try to better understand :)


> +				break;
> +		}
> +
> +	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
> +					current_cred(), info->si_uid);
> +}
> +
>  static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
>  			int group, int from_ancestor_ns)
>  {
> @@ -1088,6 +1109,9 @@ static int __send_signal(int sig, struct
>  				q->info.si_pid = 0;
>  			break;
>  		}
> +
> +		fixup_uid(info, t);
> +
>  	} else if (!is_si_special(info)) {
>  		if (sig >= SIGRTMIN && info->si_code != SI_USER) {
>  			/*
> 

thanks,
-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-10-04 17:42                   ` Serge E. Hallyn
@ 2011-10-09 19:00                     ` Oleg Nesterov
  2011-10-11 13:08                       ` Serge E. Hallyn
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2011-10-09 19:00 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Sorry, I missed this email...

On 10/04, Serge E. Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> > --- x/kernel/signal.c
> > +++ x/kernel/signal.c
> > @@ -1019,6 +1019,27 @@ static inline int legacy_queue(struct si
> >  	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
> >  }
> >
> > +static inline fixup_uid(struct siginfo *info, struct task_struct *t)
> > +{
> > +#ifdef CONFIG_USER_NS
> > +	if (current_user_ns() == task_cred_xxx(t, user_ns)))
> > +#endif
> > +		return;
> > +
> > +	if (SI_FROMKERNEL(info))
> > +		switch (info->si_code & __SI_MASK) {
> > +			default:
> > +				return;
> > +
> > +			case __SI_CHLD:
>
> If I'm reading this right, this will catch do_notify_parent, which is
> sending signals not from current.

Argh, indeed. So we need to modify the callers.

OTOH, this is only possible if the debugger notifies tracee->real_parent,
or we reparent to /sbin/init.

Anyways, yes, I forgot about this. Thanks.

Oleg.

Not sure of the best way to avoid
> this.  For pids that case is being ignored by not using the __SI_MASK,
> so CLD_EXITED etc will cause from_ancestor_ns to be false.  Could we
> do that here too, and then translate the pids in do_notify_parent?
> 
> > +			case __SI_MESGQ:
> > +				break;
> > +		}
> > +
> > +	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
> > +					current_cred(), info->si_uid);
> > +}
> > +
> >  static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
> >  			int group, int from_ancestor_ns)
> >  {
> > @@ -1088,6 +1109,9 @@ static int __send_signal(int sig, struct
> >  				q->info.si_pid = 0;
> >  			break;
> >  		}
> > +
> > +		fixup_uid(info, t);
> > +
> >  	} else if (!is_si_special(info)) {
> >  		if (sig >= SIGRTMIN && info->si_code != SI_USER) {
> >  			/*
> > 


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-10-08 20:02                   ` Serge E. Hallyn
@ 2011-10-09 19:03                     ` Oleg Nesterov
  0 siblings, 0 replies; 63+ messages in thread
From: Oleg Nesterov @ 2011-10-09 19:03 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Serge E. Hallyn, lkml, richard, Andrew Morton, Eric W. Biederman,
	Tejun Heo

On 10/08, Serge E. Hallyn wrote:
>
> Quoting Oleg Nesterov (oleg@redhat.com):
> >
> > +static inline fixup_uid(struct siginfo *info, struct task_struct *t)
> > +{
> > +#ifdef CONFIG_USER_NS
> > +	if (current_user_ns() == task_cred_xxx(t, user_ns)))
> > +#endif
> > +		return;
> > +
> > +	if (SI_FROMKERNEL(info))
> > +		switch (info->si_code & __SI_MASK) {
> > +			default:
> > +				return;
> > +
> > +			case __SI_CHLD:
>
> After looking a bit more, I really think the __SI_CHLD case just needs to
> always be converted at the callers (i.e. do_notify_parent).

Yes, unfortunately.

> > +			case __SI_MESGQ:
>
> This can be done like this, but if this is going to be the only case of
> a SI_FROMKERNEL not being converted at the caller, and there is aiui only
> one caller (__do_notivy in ipc/mmqueue.c), it seems better to just have
> fixup_uid() always return for SI_FROMKERNEL(info).

Yes, agreed.

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] user namespace: make signal.c respect user namespaces
  2011-10-09 19:00                     ` Oleg Nesterov
@ 2011-10-11 13:08                       ` Serge E. Hallyn
  0 siblings, 0 replies; 63+ messages in thread
From: Serge E. Hallyn @ 2011-10-11 13:08 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, richard, Andrew Morton, Eric W. Biederman, Tejun Heo, serge

Quoting Oleg Nesterov (oleg@redhat.com):
> Sorry, I missed this email...
> 
> On 10/04, Serge E. Hallyn wrote:
> >
> > Quoting Oleg Nesterov (oleg@redhat.com):
> > > --- x/kernel/signal.c
> > > +++ x/kernel/signal.c
> > > @@ -1019,6 +1019,27 @@ static inline int legacy_queue(struct si
> > >  	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
> > >  }
> > >
> > > +static inline fixup_uid(struct siginfo *info, struct task_struct *t)
> > > +{
> > > +#ifdef CONFIG_USER_NS
> > > +	if (current_user_ns() == task_cred_xxx(t, user_ns)))
> > > +#endif
> > > +		return;
> > > +
> > > +	if (SI_FROMKERNEL(info))
> > > +		switch (info->si_code & __SI_MASK) {
> > > +			default:
> > > +				return;
> > > +
> > > +			case __SI_CHLD:
> >
> > If I'm reading this right, this will catch do_notify_parent, which is
> > sending signals not from current.
> 
> Argh, indeed. So we need to modify the callers.
> 
> OTOH, this is only possible if the debugger notifies tracee->real_parent,
> or we reparent to /sbin/init.

Right.  (I thought we decided that was the case anyway, but maybe I was
missing something)

> Anyways, yes, I forgot about this. Thanks.
> 
> Oleg.

I'll try to get the new patch out this week, else early next.  (Shouldn't
take long to write, but testing is another matter)

thanks,
-serge

^ permalink raw reply	[flat|nested] 63+ messages in thread

* ping: drivers/staging/usbip/ abuses task_is_dead/exit_state
  2011-09-20 18:38             ` Greg KH
@ 2012-03-06 17:39               ` Oleg Nesterov
  2012-03-06 19:30                 ` Tobias Klauser
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2012-03-06 17:39 UTC (permalink / raw)
  To: Tobias Klauser, Matt Mooney, Greg Kroah-Hartman; +Cc: linux-kernel

On 09/20, Greg KH wrote:
>
> On Tue, Sep 20, 2011 at 05:14:10PM +0200, Oleg Nesterov wrote:
> > (add more cc's)
> >
> > On 09/20, Oleg Nesterov wrote:
> > >
> > > Unfortunately, we can't kill task_is_dead() right now, it has already
> > > found the users in drivers/staging/, and I bet the usage is wrong.
> >
> > It is used by drivers/staging/usbip/
> >
> > For what? The code:
> >
> > 	if (vdev->ud.tcp_rx && !task_is_dead(vdev->ud.tcp_rx))
> > 		kthread_stop(vdev->ud.tcp_rx);
> >
> > And how task_is_dead() can help? This helper is really "special", it
> > shouldn't be used anyway. But why do we check ->exit_state? Without
> > tasklist the check is racy anyway, the task can exit right after the
> > check.
> >
> > And. It is safe to use kthread_stop(t) even if t has already exited.
> >
> > OK, this was added by 8547d4cc2b616e4f1dafebe2c673fc986422b506
> > "Staging: usbip: vhci-hcd: Do not kill already dead RX/TX kthread"
> >
> > 	When unbinding a device on the host which was still attached on the
> > 	client, I got a NULL pointer dereference on the client.
> >
> > Where?
> >
> > 	This turned out
> > 	to be due to kthread_stop() being called on an already dead kthread.
> >
> > This should work.
> >
> > I'm afraid this can only fix the symptom. Probably, the problem is that
> > we do not have the reference and thus even task_is_dead(t) is not safe.
> >
> > This kthread was created by kthread_run(). If it exits, nothing protects
> > this task_struct.
> >
> > In any case, please do not use ->exit_state. It should not be used outside
> > of exit.c/etc paths, "exit_state != 0" means "exit_notify() was called".
>
> Patches to fix this up in this driver are always gladly appreciated :)

OK, since nobody cares, probably I should make the patch even if I don't
understand this code at all and can't test the change.

But, Tobias, may be you can explain what this task_is_dead() check was
supposed to do?

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: ping: drivers/staging/usbip/ abuses task_is_dead/exit_state
  2012-03-06 17:39               ` ping: " Oleg Nesterov
@ 2012-03-06 19:30                 ` Tobias Klauser
  2012-03-08 18:57                   ` Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Tobias Klauser @ 2012-03-06 19:30 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Matt Mooney, Greg Kroah-Hartman, linux-kernel

On 2012-03-06 at 18:39:25 +0100, Oleg Nesterov <oleg@redhat.com> wrote:
> On 09/20, Greg KH wrote:
> >
> > On Tue, Sep 20, 2011 at 05:14:10PM +0200, Oleg Nesterov wrote:
> > > (add more cc's)
> > >
> > > On 09/20, Oleg Nesterov wrote:
> > > >
> > > > Unfortunately, we can't kill task_is_dead() right now, it has already
> > > > found the users in drivers/staging/, and I bet the usage is wrong.
> > >
> > > It is used by drivers/staging/usbip/
> > >
> > > For what? The code:
> > >
> > > 	if (vdev->ud.tcp_rx && !task_is_dead(vdev->ud.tcp_rx))
> > > 		kthread_stop(vdev->ud.tcp_rx);
> > >
> > > And how task_is_dead() can help? This helper is really "special", it
> > > shouldn't be used anyway. But why do we check ->exit_state? Without
> > > tasklist the check is racy anyway, the task can exit right after the
> > > check.
> > >
> > > And. It is safe to use kthread_stop(t) even if t has already exited.
> > >
> > > OK, this was added by 8547d4cc2b616e4f1dafebe2c673fc986422b506
> > > "Staging: usbip: vhci-hcd: Do not kill already dead RX/TX kthread"
> > >
> > > 	When unbinding a device on the host which was still attached on the
> > > 	client, I got a NULL pointer dereference on the client.
> > >
> > > Where?
> > >
> > > 	This turned out
> > > 	to be due to kthread_stop() being called on an already dead kthread.
> > >
> > > This should work.
> > >
> > > I'm afraid this can only fix the symptom. Probably, the problem is that
> > > we do not have the reference and thus even task_is_dead(t) is not safe.
> > >
> > > This kthread was created by kthread_run(). If it exits, nothing protects
> > > this task_struct.
> > >
> > > In any case, please do not use ->exit_state. It should not be used outside
> > > of exit.c/etc paths, "exit_state != 0" means "exit_notify() was called".
> >
> > Patches to fix this up in this driver are always gladly appreciated :)
> 
> OK, since nobody cares, probably I should make the patch even if I don't
> understand this code at all and can't test the change.
> 
> But, Tobias, may be you can explain what this task_is_dead() check was
> supposed to do?

As mentioned in the commit message, this was needed for me to work
around a NULL pointer dereference I got during unbinding (I only
experienced this behaviour on the nios2 platform though, couldn't
reproduce it on e.g. x86_64).

I wasn't really familiar with the codebase of usbip (and still am not)
but came up with the fix by more or less blindly copying what the
opposite side is checking for in stub_shutdown_connection(). This fixed
the behaviour for me and seemed legitimate as it was done equally there.

Tobias

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: ping: drivers/staging/usbip/ abuses task_is_dead/exit_state
  2012-03-06 19:30                 ` Tobias Klauser
@ 2012-03-08 18:57                   ` Oleg Nesterov
  2012-03-13 11:45                     ` Tobias Klauser
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2012-03-08 18:57 UTC (permalink / raw)
  To: Tobias Klauser; +Cc: Matt Mooney, Greg Kroah-Hartman, linux-kernel

On 03/06, Tobias Klauser wrote:
>
> On 2012-03-06 at 18:39:25 +0100, Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > OK, since nobody cares, probably I should make the patch even if I don't
> > understand this code at all and can't test the change.
> >
> > But, Tobias, may be you can explain what this task_is_dead() check was
> > supposed to do?
>
> As mentioned in the commit message, this was needed for me to work
> around a NULL pointer dereference I got during unbinding

Where? OK, I guess you do not remember the trace ;)

> (I only
> experienced this behaviour on the nios2 platform though, couldn't
> reproduce it on e.g. x86_64).

OK,

> I wasn't really familiar with the codebase of usbip (and still am not)
> but came up with the fix by more or less blindly copying what the
> opposite side is checking for in stub_shutdown_connection(). This fixed
> the behaviour for me and seemed legitimate as it was done equally there.

But this looks "obviously wrong", and afaics just hides the problem.
Not to mention this check is racy, it is simply unsafe to dereference
this task_struct if the kthread has already exited.

At first glance we need something like the patch below (and stub_dev.c
needs the same fix). It assumes that:

	- vhci_shutdown_connection() is the only caller of kthread_stop(),
	  iow nobody else does kthread_stop(tcp_*x)

	- we can't leak the task_struct, vhci_shutdown_connection() should
	  be called in any case at some point.

I'll try to grep more, but perhaps you can ack my understanding?

Oleg.

--- x/drivers/staging/usbip/vhci_sysfs.c
+++ x/drivers/staging/usbip/vhci_sysfs.c
@@ -155,6 +155,16 @@ static int valid_args(__u32 rhport, enum
 	return 0;
 }
 
+#define kthread_get_run(threadfn, data, namefmt, ...)			   \
+({									   \
+	struct task_struct *__k						   \
+		= kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
+	if (!IS_ERR(__k)) {						   \
+		get_task_struct(__k);
+		wake_up_process(__k);					   \
+	}								   \
+	__k;								   \
+})
 /*
  * To start a new USB/IP attachment, a userland program needs to setup a TCP
  * connection and then write its socket descriptor with remote device
@@ -222,8 +232,8 @@ static ssize_t store_attach(struct devic
 	spin_unlock(&the_controller->lock);
 	/* end the lock */
 
-	vdev->ud.tcp_rx = kthread_run(vhci_rx_loop, &vdev->ud, "vhci_rx");
-	vdev->ud.tcp_tx = kthread_run(vhci_tx_loop, &vdev->ud, "vhci_tx");
+	vdev->ud.tcp_rx = kthread_get_run(vhci_rx_loop, &vdev->ud, "vhci_rx");
+	vdev->ud.tcp_tx = kthread_get_run(vhci_tx_loop, &vdev->ud, "vhci_tx");
 
 	rh_port_connect(rhport, speed);
 
--- x/drivers/staging/usbip/vhci_hcd.c
+++ x/drivers/staging/usbip/vhci_hcd.c
@@ -860,10 +860,14 @@ static void vhci_shutdown_connection(str
 	}
 
 	/* kill threads related to this sdev, if v.c. exists */
-	if (vdev->ud.tcp_rx && !task_is_dead(vdev->ud.tcp_rx))
+	if (vdev->ud.tcp_rx) {
 		kthread_stop(vdev->ud.tcp_rx);
-	if (vdev->ud.tcp_tx && !task_is_dead(vdev->ud.tcp_tx))
+		put_task_struct(vdev->ud.tcp_rx);
+	}
+	if (vdev->ud.tcp_tx) {
 		kthread_stop(vdev->ud.tcp_tx);
+		put_task_struct(vdev->ud.tcp_tx);
+	}
 
 	pr_info("stop threads\n");
 


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: ping: drivers/staging/usbip/ abuses task_is_dead/exit_state
  2012-03-08 18:57                   ` Oleg Nesterov
@ 2012-03-13 11:45                     ` Tobias Klauser
  2012-03-13 18:07                       ` [PATCH] staging: usbip: fix the usage of kthread_stop() Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Tobias Klauser @ 2012-03-13 11:45 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Matt Mooney, Greg Kroah-Hartman, linux-kernel

On 2012-03-08 at 19:57:39 +0100, Oleg Nesterov <oleg@redhat.com> wrote:
> On 03/06, Tobias Klauser wrote:
> >
> > On 2012-03-06 at 18:39:25 +0100, Oleg Nesterov <oleg@redhat.com> wrote:
> > >
> > > OK, since nobody cares, probably I should make the patch even if I don't
> > > understand this code at all and can't test the change.
> > >
> > > But, Tobias, may be you can explain what this task_is_dead() check was
> > > supposed to do?
> >
> > As mentioned in the commit message, this was needed for me to work
> > around a NULL pointer dereference I got during unbinding
> 
> Where? OK, I guess you do not remember the trace ;)

Yup, sorry I can't precisely recall where it happened anymore.

> > (I only
> > experienced this behaviour on the nios2 platform though, couldn't
> > reproduce it on e.g. x86_64).
> 
> OK,
> 
> > I wasn't really familiar with the codebase of usbip (and still am not)
> > but came up with the fix by more or less blindly copying what the
> > opposite side is checking for in stub_shutdown_connection(). This fixed
> > the behaviour for me and seemed legitimate as it was done equally there.
> 
> But this looks "obviously wrong", and afaics just hides the problem.
> Not to mention this check is racy, it is simply unsafe to dereference
> this task_struct if the kthread has already exited.
> 
> At first glance we need something like the patch below (and stub_dev.c
> needs the same fix). It assumes that:
> 
> 	- vhci_shutdown_connection() is the only caller of kthread_stop(),
> 	  iow nobody else does kthread_stop(tcp_*x)
> 
> 	- we can't leak the task_struct, vhci_shutdown_connection() should
> 	  be called in any case at some point.
> 
> I'll try to grep more, but perhaps you can ack my understanding?

Looks OK to me at a first glance. I'll try to dig up my test system
where I was able to produce that NULL pointer dereference and test your
patch there and will then let you know my results.

Thanks
Tobias

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] staging: usbip: fix the usage of kthread_stop()
  2012-03-13 11:45                     ` Tobias Klauser
@ 2012-03-13 18:07                       ` Oleg Nesterov
  2012-04-01 23:17                         ` Oleg Nesterov
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2012-03-13 18:07 UTC (permalink / raw)
  To: Tobias Klauser; +Cc: Matt Mooney, Greg Kroah-Hartman, linux-kernel

On 03/13, Tobias Klauser wrote:
>
> Looks OK to me at a first glance. I'll try to dig up my test system
> where I was able to produce that NULL pointer dereference and test your
> patch there and will then let you know my results.

Oh, great. Please find the patch below.

Of course it wasn't tested at all ;)

-----------------------------------------------------------------------
>From 8bec01c69c2b523e93432d36f9e211b6ec4a6771 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 13 Mar 2012 18:55:20 +0100
Subject: [PATCH] staging: usbip: fix the usage of kthread_stop()

stub_shutdown_connection() and vhci_shutdown_connection() use
task_is_dead() before kthread_stop(). This buys nothing and wrong.
kthread_stop() is fine even if this thread is dead. However, if it
is dead nothing protects this task_struct, we shouldn't touch this
memory.

Change the code to do the necessary get_task_struct/put_task_struct.

This patch assumes that

	- xxx_shutdown_connection() is always called, so we can't
	  leak the task_struct.

	- kthread_stop_put() can't be called twice on the same task.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 drivers/staging/usbip/stub_dev.c     |   12 ++++++------
 drivers/staging/usbip/usbip_common.h |   17 +++++++++++++++++
 drivers/staging/usbip/vhci_hcd.c     |    8 ++++----
 drivers/staging/usbip/vhci_sysfs.c   |    4 ++--
 4 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/drivers/staging/usbip/stub_dev.c b/drivers/staging/usbip/stub_dev.c
index 03420e2..fc6ceca 100644
--- a/drivers/staging/usbip/stub_dev.c
+++ b/drivers/staging/usbip/stub_dev.c
@@ -113,8 +113,8 @@ static ssize_t store_sockfd(struct device *dev, struct device_attribute *attr,
 
 		spin_unlock(&sdev->ud.lock);
 
-		sdev->ud.tcp_rx = kthread_run(stub_rx_loop, &sdev->ud, "stub_rx");
-		sdev->ud.tcp_tx = kthread_run(stub_tx_loop, &sdev->ud, "stub_tx");
+		sdev->ud.tcp_rx = kthread_get_run(stub_rx_loop, &sdev->ud, "stub_rx");
+		sdev->ud.tcp_tx = kthread_get_run(stub_tx_loop, &sdev->ud, "stub_tx");
 
 		spin_lock(&sdev->ud.lock);
 		sdev->ud.status = SDEV_ST_USED;
@@ -187,10 +187,10 @@ static void stub_shutdown_connection(struct usbip_device *ud)
 	}
 
 	/* 1. stop threads */
-	if (ud->tcp_rx && !task_is_dead(ud->tcp_rx))
-		kthread_stop(ud->tcp_rx);
-	if (ud->tcp_tx && !task_is_dead(ud->tcp_tx))
-		kthread_stop(ud->tcp_tx);
+	if (ud->tcp_rx)
+		kthread_stop_put(ud->tcp_rx);
+	if (ud->tcp_tx)
+		kthread_stop_put(ud->tcp_tx);
 
 	/*
 	 * 2. close the socket
diff --git a/drivers/staging/usbip/usbip_common.h b/drivers/staging/usbip/usbip_common.h
index b8f8c48..0d52234 100644
--- a/drivers/staging/usbip/usbip_common.h
+++ b/drivers/staging/usbip/usbip_common.h
@@ -292,6 +292,23 @@ struct usbip_device {
 	} eh_ops;
 };
 
+#define kthread_get_run(threadfn, data, namefmt, ...)			   \
+({									   \
+	struct task_struct *__k						   \
+		= kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
+	if (!IS_ERR(__k)) {						   \
+		get_task_struct(__k);					   \
+		wake_up_process(__k);					   \
+	}								   \
+	__k;								   \
+})
+
+#define kthread_stop_put(k)		\
+	do {				\
+		kthread_stop(k);	\
+		put_task_struct(k);	\
+	} while (0)
+
 /* usbip_common.c */
 void usbip_dump_urb(struct urb *purb);
 void usbip_dump_header(struct usbip_header *pdu);
diff --git a/drivers/staging/usbip/vhci_hcd.c b/drivers/staging/usbip/vhci_hcd.c
index 2ee97e2..fb50fdc 100644
--- a/drivers/staging/usbip/vhci_hcd.c
+++ b/drivers/staging/usbip/vhci_hcd.c
@@ -860,10 +860,10 @@ static void vhci_shutdown_connection(struct usbip_device *ud)
 	}
 
 	/* kill threads related to this sdev, if v.c. exists */
-	if (vdev->ud.tcp_rx && !task_is_dead(vdev->ud.tcp_rx))
-		kthread_stop(vdev->ud.tcp_rx);
-	if (vdev->ud.tcp_tx && !task_is_dead(vdev->ud.tcp_tx))
-		kthread_stop(vdev->ud.tcp_tx);
+	if (vdev->ud.tcp_rx)
+		kthread_stop_put(vdev->ud.tcp_rx);
+	if (vdev->ud.tcp_tx)
+		kthread_stop_put(vdev->ud.tcp_tx);
 
 	pr_info("stop threads\n");
 
diff --git a/drivers/staging/usbip/vhci_sysfs.c b/drivers/staging/usbip/vhci_sysfs.c
index 0cd039b..7ce9c2f 100644
--- a/drivers/staging/usbip/vhci_sysfs.c
+++ b/drivers/staging/usbip/vhci_sysfs.c
@@ -222,8 +222,8 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr,
 	spin_unlock(&the_controller->lock);
 	/* end the lock */
 
-	vdev->ud.tcp_rx = kthread_run(vhci_rx_loop, &vdev->ud, "vhci_rx");
-	vdev->ud.tcp_tx = kthread_run(vhci_tx_loop, &vdev->ud, "vhci_tx");
+	vdev->ud.tcp_rx = kthread_get_run(vhci_rx_loop, &vdev->ud, "vhci_rx");
+	vdev->ud.tcp_tx = kthread_get_run(vhci_tx_loop, &vdev->ud, "vhci_tx");
 
 	rh_port_connect(rhport, speed);
 
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] staging: usbip: fix the usage of kthread_stop()
  2012-03-13 18:07                       ` [PATCH] staging: usbip: fix the usage of kthread_stop() Oleg Nesterov
@ 2012-04-01 23:17                         ` Oleg Nesterov
  2012-04-02  8:11                           ` Tobias Klauser
  0 siblings, 1 reply; 63+ messages in thread
From: Oleg Nesterov @ 2012-04-01 23:17 UTC (permalink / raw)
  To: Tobias Klauser; +Cc: Matt Mooney, Greg Kroah-Hartman, linux-kernel

On 03/13, Oleg Nesterov wrote:
>
> On 03/13, Tobias Klauser wrote:
> >
> > Looks OK to me at a first glance. I'll try to dig up my test system
> > where I was able to produce that NULL pointer dereference and test your
> > patch there and will then let you know my results.
>
> Oh, great. Please find the patch below.
>
> Of course it wasn't tested at all ;)

Cough... Sorry, but another ping ;)

Oleg.


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] staging: usbip: fix the usage of kthread_stop()
  2012-04-01 23:17                         ` Oleg Nesterov
@ 2012-04-02  8:11                           ` Tobias Klauser
  0 siblings, 0 replies; 63+ messages in thread
From: Tobias Klauser @ 2012-04-02  8:11 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Matt Mooney, Greg Kroah-Hartman, linux-kernel

On 2012-04-02 at 01:17:41 +0200, Oleg Nesterov <oleg@redhat.com> wrote:
> On 03/13, Oleg Nesterov wrote:
> >
> > On 03/13, Tobias Klauser wrote:
> > >
> > > Looks OK to me at a first glance. I'll try to dig up my test system
> > > where I was able to produce that NULL pointer dereference and test your
> > > patch there and will then let you know my results.
> >
> > Oh, great. Please find the patch below.
> >
> > Of course it wasn't tested at all ;)
> 
> Cough... Sorry, but another ping ;)

I currently don't have the test system at hand (I'm not on this project
anymore), so it might take a while until I get to it, sorry.

Tobias

^ permalink raw reply	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2012-04-02  8:11 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-09-19 21:45 [PATCH] user namespace: make signal.c respect user namespaces Serge E. Hallyn
2011-09-19 21:47 ` [PATCH] user namespace: usb: make usb urbs user namespace aware Serge E. Hallyn
2011-09-20 13:17   ` Oleg Nesterov
2011-09-20 13:33     ` Serge E. Hallyn
2011-09-21  5:01     ` [PATCH] user namespace: usb: make usb urbs user namespace aware (v2) Serge E. Hallyn
2011-09-21 18:31       ` Oleg Nesterov
2011-09-21 19:12         ` Serge E. Hallyn
2011-09-21 19:18           ` Greg KH
2011-09-23  1:27             ` [PATCH resend] " Serge E. Hallyn
2011-09-23 15:48               ` Alan Stern
2011-09-23 16:06                 ` Serge E. Hallyn
2011-09-23 16:21                   ` Alan Stern
2011-09-23 17:22                     ` Serge E. Hallyn
2011-09-23 18:35                       ` Alan Stern
2011-09-20 12:22 ` [PATCH] user namespace: make signal.c respect user namespaces Oleg Nesterov
2011-09-20 12:44   ` Serge E. Hallyn
2011-09-20 13:41     ` Oleg Nesterov
2011-09-20 14:39       ` [PATCH 0/2] (Was: user namespace: make signal.c respect user namespaces) Oleg Nesterov
2011-09-20 14:39         ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check Oleg Nesterov
2011-09-20 15:14           ` drivers/staging/usbip/ abuses task_is_dead/exit_state Oleg Nesterov
2011-09-20 18:38             ` Greg KH
2012-03-06 17:39               ` ping: " Oleg Nesterov
2012-03-06 19:30                 ` Tobias Klauser
2012-03-08 18:57                   ` Oleg Nesterov
2012-03-13 11:45                     ` Tobias Klauser
2012-03-13 18:07                       ` [PATCH] staging: usbip: fix the usage of kthread_stop() Oleg Nesterov
2012-04-01 23:17                         ` Oleg Nesterov
2012-04-02  8:11                           ` Tobias Klauser
2011-09-20 15:28           ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check Paul E. McKenney
2011-09-20 15:40             ` Oleg Nesterov
2011-09-20 15:48               ` Paul E. McKenney
2011-09-20 14:39         ` [PATCH 2/2] creds: __task_cred(current) doesn't need rcu_read_lock_held() Oleg Nesterov
2011-09-20 15:07           ` Serge Hallyn
2011-09-20 15:35             ` Oleg Nesterov
2011-09-20 16:19         ` David Howells
2011-09-20 16:38           ` Oleg Nesterov
2011-09-20 16:50           ` David Howells
2011-09-20 17:13             ` Oleg Nesterov
2011-09-20 16:27         ` [PATCH 1/2] creds: kill __task_cred()->task_is_dead() check David Howells
2011-09-20 15:39   ` [PATCH] user namespace: make signal.c respect user namespaces Serge Hallyn
2011-09-20 16:24     ` Oleg Nesterov
2011-09-20 16:45       ` Serge E. Hallyn
2011-09-20 18:17         ` Oleg Nesterov
2011-09-21  5:00   ` [PATCH] user namespace: make signal.c respect user namespaces (v2) Serge E. Hallyn
2011-09-20 17:48 ` [PATCH] user namespace: make signal.c respect user namespaces Oleg Nesterov
2011-09-20 18:53   ` Serge E. Hallyn
2011-09-21 17:53     ` Oleg Nesterov
2011-09-22 15:23       ` Serge Hallyn
2011-09-23 16:31       ` Serge E. Hallyn
2011-09-23 17:36         ` Oleg Nesterov
2011-09-23 21:20           ` Serge E. Hallyn
2011-09-24 16:37             ` Oleg Nesterov
2011-09-25 20:17               ` Serge E. Hallyn
2011-09-26 16:06                 ` Oleg Nesterov
2011-09-27 14:28                   ` Serge Hallyn
2011-09-27 14:38                     ` Oleg Nesterov
2011-09-27 15:27                       ` Serge Hallyn
2011-09-27 17:12                         ` Oleg Nesterov
2011-10-04 17:42                   ` Serge E. Hallyn
2011-10-09 19:00                     ` Oleg Nesterov
2011-10-11 13:08                       ` Serge E. Hallyn
2011-10-08 20:02                   ` Serge E. Hallyn
2011-10-09 19:03                     ` Oleg Nesterov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.