io-uring.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC] signalfd: add support for SFD_TASK
@ 2019-11-27  5:11 Jens Axboe
  2019-11-27 19:23 ` Jann Horn
  0 siblings, 1 reply; 10+ messages in thread
From: Jens Axboe @ 2019-11-27  5:11 UTC (permalink / raw)
  To: io-uring; +Cc: Jann Horn, linux-kernel, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 7356 bytes --]

I posted this a few weeks back, took another look at it and refined it a
bit. I'd like some input on the viability of this approach.

A new signalfd setup flag is added, SFD_TASK. This is only valid if used
with SFD_CLOEXEC. If set, the task setting up the signalfd descriptor is
remembered in the signalfd context, and will be the one we use for
checking signals in the poll/read handlers in signalfd.

This is needed to make signalfd useful with io_uring and aio, of which
the former in particular has my interest.

I _think_ this is sane. To prevent the case of a task clearing O_CLOEXEC
on the signalfd descriptor, forking, and then exiting, we grab a
reference to the task when we assign it. If that original task exits, we
catch it in signalfd_flush() and ensure waiters are woken up. The
waiters also hold a task reference, so we don't have to wait for them to
go away.

Need to double check we can't race between original task exiting and new
task grabbing a reference. I don't think this is solid in the version
below. Probably need to add a refcount for ctx->task (the pointer, not
the task) for that.

Comments? Attaching two test programs using io_uring, one using poll and
the other read. Remove SFD_TASK from either of them, and they will fail
ala:

./signalfd-read
Timed out waiting for cqe

and with SFD_TASK set, both will exit silent with a value of 0. You need
liburing installed, then compile them with:

gcc -Wall -O2 -o signalfd-read signalfd-read.c -luring

---

diff --git a/fs/signalfd.c b/fs/signalfd.c
index 44b6845b071c..4bbdab9438c1 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -50,28 +50,62 @@ void signalfd_cleanup(struct sighand_struct *sighand)
  
  struct signalfd_ctx {
  	sigset_t sigmask;
+	struct task_struct *task;
  };
  
+static int signalfd_flush(struct file *file, void *data)
+{
+	struct signalfd_ctx *ctx = file->private_data;
+	struct task_struct *tsk = ctx->task;
+
+	if (tsk == current) {
+		ctx->task = NULL;
+		wake_up(&tsk->sighand->signalfd_wqh);
+		put_task_struct(tsk);
+	}
+
+	return 0;
+}
+
  static int signalfd_release(struct inode *inode, struct file *file)
  {
-	kfree(file->private_data);
+	struct signalfd_ctx *ctx = file->private_data;
+
+	if (ctx->task)
+		put_task_struct(ctx->task);
+	kfree(ctx);
  	return 0;
  }
  
+static void signalfd_put_task(struct task_struct *tsk)
+{
+	put_task_struct(tsk);
+}
+
+static struct task_struct *signalfd_get_task(struct signalfd_ctx *ctx)
+{
+	struct task_struct *tsk = ctx->task ?: current;
+
+	get_task_struct(tsk);
+	return tsk;
+}
+
  static __poll_t signalfd_poll(struct file *file, poll_table *wait)
  {
  	struct signalfd_ctx *ctx = file->private_data;
+	struct task_struct *tsk = signalfd_get_task(ctx);
  	__poll_t events = 0;
  
-	poll_wait(file, &current->sighand->signalfd_wqh, wait);
+	poll_wait(file, &tsk->sighand->signalfd_wqh, wait);
  
-	spin_lock_irq(&current->sighand->siglock);
-	if (next_signal(&current->pending, &ctx->sigmask) ||
-	    next_signal(&current->signal->shared_pending,
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (next_signal(&tsk->pending, &ctx->sigmask) ||
+	    next_signal(&tsk->signal->shared_pending,
  			&ctx->sigmask))
  		events |= EPOLLIN;
-	spin_unlock_irq(&current->sighand->siglock);
+	spin_unlock_irq(&tsk->sighand->siglock);
  
+	signalfd_put_task(tsk);
  	return events;
  }
  
@@ -167,10 +201,11 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info
  				int nonblock)
  {
  	ssize_t ret;
+	struct task_struct *tsk = signalfd_get_task(ctx);
  	DECLARE_WAITQUEUE(wait, current);
  
-	spin_lock_irq(&current->sighand->siglock);
-	ret = dequeue_signal(current, &ctx->sigmask, info);
+	spin_lock_irq(&tsk->sighand->siglock);
+	ret = dequeue_signal(tsk, &ctx->sigmask, info);
  	switch (ret) {
  	case 0:
  		if (!nonblock)
@@ -178,29 +213,35 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info
  		ret = -EAGAIN;
  		/* fall through */
  	default:
-		spin_unlock_irq(&current->sighand->siglock);
+		spin_unlock_irq(&tsk->sighand->siglock);
+		signalfd_put_task(tsk);
  		return ret;
  	}
  
-	add_wait_queue(&current->sighand->signalfd_wqh, &wait);
+	add_wait_queue(&tsk->sighand->signalfd_wqh, &wait);
  	for (;;) {
  		set_current_state(TASK_INTERRUPTIBLE);
-		ret = dequeue_signal(current, &ctx->sigmask, info);
+		ret = dequeue_signal(tsk, &ctx->sigmask, info);
  		if (ret != 0)
  			break;
  		if (signal_pending(current)) {
  			ret = -ERESTARTSYS;
  			break;
  		}
-		spin_unlock_irq(&current->sighand->siglock);
+		spin_unlock_irq(&tsk->sighand->siglock);
  		schedule();
-		spin_lock_irq(&current->sighand->siglock);
+		spin_lock_irq(&tsk->sighand->siglock);
+		if (tsk != current && !ctx->task) {
+			ret = -ESRCH;
+			break;
+		}
  	}
-	spin_unlock_irq(&current->sighand->siglock);
+	spin_unlock_irq(&tsk->sighand->siglock);
  
-	remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
+	remove_wait_queue(&tsk->sighand->signalfd_wqh, &wait);
  	__set_current_state(TASK_RUNNING);
  
+	signalfd_put_task(tsk);
  	return ret;
  }
  
@@ -254,6 +295,7 @@ static const struct file_operations signalfd_fops = {
  #ifdef CONFIG_PROC_FS
  	.show_fdinfo	= signalfd_show_fdinfo,
  #endif
+	.flush		= signalfd_flush,
  	.release	= signalfd_release,
  	.poll		= signalfd_poll,
  	.read		= signalfd_read,
@@ -267,19 +309,26 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
  	/* Check the SFD_* constants for consistency.  */
  	BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
  	BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
+	BUILD_BUG_ON(SFD_TASK & (SFD_CLOEXEC | SFD_NONBLOCK));
  
-	if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
+	if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK | SFD_TASK))
+		return -EINVAL;
+	if ((flags & (SFD_CLOEXEC | SFD_TASK)) == SFD_TASK)
  		return -EINVAL;
  
  	sigdelsetmask(mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
  	signotset(mask);
  
  	if (ufd == -1) {
-		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+		ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
  		if (!ctx)
  			return -ENOMEM;
  
  		ctx->sigmask = *mask;
+		if (flags & SFD_TASK) {
+			ctx->task = current;
+			get_task_struct(ctx->task);
+		}
  
  		/*
  		 * When we call this, the initialization must be complete, since
@@ -290,6 +339,7 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
  		if (ufd < 0)
  			kfree(ctx);
  	} else {
+		struct task_struct *tsk;
  		struct fd f = fdget(ufd);
  		if (!f.file)
  			return -EBADF;
@@ -298,11 +348,13 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
  			fdput(f);
  			return -EINVAL;
  		}
-		spin_lock_irq(&current->sighand->siglock);
+		tsk = signalfd_get_task(ctx);
+		spin_lock_irq(&tsk->sighand->siglock);
  		ctx->sigmask = *mask;
-		spin_unlock_irq(&current->sighand->siglock);
+		spin_unlock_irq(&tsk->sighand->siglock);
  
-		wake_up(&current->sighand->signalfd_wqh);
+		wake_up(&tsk->sighand->signalfd_wqh);
+		signalfd_put_task(tsk);
  		fdput(f);
  	}
  
diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h
index 83429a05b698..064c5dc3eb99 100644
--- a/include/uapi/linux/signalfd.h
+++ b/include/uapi/linux/signalfd.h
@@ -16,6 +16,7 @@
  /* Flags for signalfd4.  */
  #define SFD_CLOEXEC O_CLOEXEC
  #define SFD_NONBLOCK O_NONBLOCK
+#define SFD_TASK 00000001
  
  struct signalfd_siginfo {
  	__u32 ssi_signo;

-- 
Jens Axboe


[-- Attachment #2: signalfd-poll.c --]
[-- Type: text/x-csrc, Size: 1411 bytes --]

#include <unistd.h>
#include <sys/signalfd.h>
#include <sys/poll.h>
#include <sys/time.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

#include <liburing.h>

#define SFD_TASK	00000001

int main(int argc, char *argv[])
{
	struct __kernel_timespec ts;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct io_uring ring;
	struct itimerval itv;
	sigset_t mask;
	int sfd, ret;

	sigemptyset(&mask);
	sigaddset(&mask, SIGALRM);
	sigprocmask(SIG_BLOCK, &mask, NULL);

	sfd = signalfd(-1, &mask, SFD_CLOEXEC | SFD_TASK);
	if (sfd < 0) {
		if (errno == EINVAL) {
			printf("Not supported\n");
			return 0;
		}
		perror("signalfd");
		return 1;
	}

	memset(&itv, 0, sizeof(itv));
	itv.it_value.tv_sec = 0;
	itv.it_value.tv_usec = 100000;
	setitimer(ITIMER_REAL, &itv, NULL);

	io_uring_queue_init(32, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_add(sqe, sfd, POLLIN);
	io_uring_submit(&ring);

	ts.tv_sec = 1;
	ts.tv_nsec = 0;
	ret = io_uring_wait_cqe_timeout(&ring, &cqe, &ts);
	if (ret < 0) {
		fprintf(stderr, "Timed out waiting for cqe\n");
		ret = 1;
	} else {
		if (cqe->res < 0) {
			fprintf(stderr, "cqe failed with %d\n", cqe->res);
			ret = 1;
		} else if (!(cqe->res & POLLIN)) {
			fprintf(stderr, "POLLIN not set in result mask?\n");
			ret = 1;
		} else {
			ret = 0;
		}
	}
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	close(sfd);
	return ret;
}

[-- Attachment #3: signalfd-read.c --]
[-- Type: text/x-csrc, Size: 1516 bytes --]

#include <unistd.h>
#include <sys/signalfd.h>
#include <sys/poll.h>
#include <sys/time.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

#include <liburing.h>

#define SFD_TASK	00000001

int main(int argc, char *argv[])
{
	struct __kernel_timespec ts;
	struct signalfd_siginfo si;
	struct iovec iov = {
		.iov_base = &si,
		.iov_len = sizeof(si),
	};
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct io_uring ring;
	struct itimerval itv;
	sigset_t mask;
	int sfd, ret;

	sigemptyset(&mask);
	sigaddset(&mask, SIGALRM);
	sigprocmask(SIG_BLOCK, &mask, NULL);

	sfd = signalfd(-1, &mask, SFD_CLOEXEC | SFD_TASK);
	if (sfd < 0) {
		if (errno == EINVAL) {
			printf("Not supported\n");
			return 0;
		}
		perror("signalfd");
		return 1;
	}

	memset(&itv, 0, sizeof(itv));
	itv.it_value.tv_sec = 0;
	itv.it_value.tv_usec = 100000;
	setitimer(ITIMER_REAL, &itv, NULL);

	io_uring_queue_init(32, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_readv(sqe, sfd, &iov, 1, 0);
	io_uring_submit(&ring);

	ts.tv_sec = 1;
	ts.tv_nsec = 0;
	ret = io_uring_wait_cqe_timeout(&ring, &cqe, &ts);
	if (ret < 0) {
		fprintf(stderr, "Timed out waiting for cqe\n");
		ret = 1;
	} else {
		ret = 0;
		if (cqe->res < 0) {
			fprintf(stderr, "cqe failed with %d\n", cqe->res);
			ret = 1;
		} else if (cqe->res != sizeof(si)) {
			fprintf(stderr, "Read %d, wanted %d\n", cqe->res, (int)sizeof(si));
			ret = 1;
		}
	}
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	close(sfd);
	return ret;
}

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] signalfd: add support for SFD_TASK
  2019-11-27  5:11 [PATCH RFC] signalfd: add support for SFD_TASK Jens Axboe
@ 2019-11-27 19:23 ` Jann Horn
  2019-11-27 20:48   ` Jens Axboe
  0 siblings, 1 reply; 10+ messages in thread
From: Jann Horn @ 2019-11-27 19:23 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, linux-kernel, linux-fsdevel

On Wed, Nov 27, 2019 at 6:11 AM Jens Axboe <axboe@kernel.dk> wrote:
> I posted this a few weeks back, took another look at it and refined it a
> bit. I'd like some input on the viability of this approach.
>
> A new signalfd setup flag is added, SFD_TASK. This is only valid if used
> with SFD_CLOEXEC. If set, the task setting up the signalfd descriptor is
> remembered in the signalfd context, and will be the one we use for
> checking signals in the poll/read handlers in signalfd.
>
> This is needed to make signalfd useful with io_uring and aio, of which
> the former in particular has my interest.
>
> I _think_ this is sane. To prevent the case of a task clearing O_CLOEXEC
> on the signalfd descriptor, forking, and then exiting, we grab a
> reference to the task when we assign it. If that original task exits, we
> catch it in signalfd_flush() and ensure waiters are woken up.

Mh... that's not really reliable, because you only get ->flush() from
the last exiting thread (or more precisely, the last exiting task that
shares the files_struct).

What is your goal here? To have a reference to a task without keeping
the entire task_struct around in memory if someone leaks the signalfd
to another process - basically like a weak pointer? If so, you could
store a refcounted reference to "struct pid" instead of a refcounted
reference to the task_struct, and then do the lookup of the
task_struct on ->poll and ->read (similar to what procfs does).

In other words:

> diff --git a/fs/signalfd.c b/fs/signalfd.c
> index 44b6845b071c..4bbdab9438c1 100644
> --- a/fs/signalfd.c
> +++ b/fs/signalfd.c
> @@ -50,28 +50,62 @@ void signalfd_cleanup(struct sighand_struct *sighand)
>
>   struct signalfd_ctx {
>         sigset_t sigmask;
> +       struct task_struct *task;

Turn this into "struct pid *task_pid".

> +static int signalfd_flush(struct file *file, void *data)
> +{
> +       struct signalfd_ctx *ctx = file->private_data;
> +       struct task_struct *tsk = ctx->task;
> +
> +       if (tsk == current) {
> +               ctx->task = NULL;
> +               wake_up(&tsk->sighand->signalfd_wqh);
> +               put_task_struct(tsk);
> +       }
> +
> +       return 0;
> +}

Get rid of this.

> +static struct task_struct *signalfd_get_task(struct signalfd_ctx *ctx)
> +{
> +       struct task_struct *tsk = ctx->task ?: current;
> +
> +       get_task_struct(tsk);
> +       return tsk;
> +}

Replace this with something like:

  if (ctx->task_pid)
    return get_pid_task(ctx->task_pid, PIDTYPE_PID); /* will return
NULL if the task is gone */
  else
    return get_task_struct(current);

and add NULL checks to the places that call this.

> @@ -167,10 +201,11 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info
>                                 int nonblock)
>   {
>         ssize_t ret;
> +       struct task_struct *tsk = signalfd_get_task(ctx);

(Here we could even optimize away the refcounting using RCU if we
wanted to, since unlike in the ->poll handler, we don't need to be
able to block.)

>         if (ufd == -1) {
> -               ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
> +               ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
>                 if (!ctx)
>                         return -ENOMEM;
>
>                 ctx->sigmask = *mask;
> +               if (flags & SFD_TASK) {
> +                       ctx->task = current;
> +                       get_task_struct(ctx->task);
> +               }

and here do "ctx->task_pid = get_task_pid(current, PIDTYPE_PID)"

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] signalfd: add support for SFD_TASK
  2019-11-27 19:23 ` Jann Horn
@ 2019-11-27 20:48   ` Jens Axboe
  2019-11-27 23:27     ` Jann Horn
  0 siblings, 1 reply; 10+ messages in thread
From: Jens Axboe @ 2019-11-27 20:48 UTC (permalink / raw)
  To: Jann Horn; +Cc: io-uring, linux-kernel, linux-fsdevel

On 11/27/19 12:23 PM, Jann Horn wrote:
> On Wed, Nov 27, 2019 at 6:11 AM Jens Axboe <axboe@kernel.dk> wrote:
>> I posted this a few weeks back, took another look at it and refined it a
>> bit. I'd like some input on the viability of this approach.
>>
>> A new signalfd setup flag is added, SFD_TASK. This is only valid if used
>> with SFD_CLOEXEC. If set, the task setting up the signalfd descriptor is
>> remembered in the signalfd context, and will be the one we use for
>> checking signals in the poll/read handlers in signalfd.
>>
>> This is needed to make signalfd useful with io_uring and aio, of which
>> the former in particular has my interest.
>>
>> I _think_ this is sane. To prevent the case of a task clearing O_CLOEXEC
>> on the signalfd descriptor, forking, and then exiting, we grab a
>> reference to the task when we assign it. If that original task exits, we
>> catch it in signalfd_flush() and ensure waiters are woken up.
> 
> Mh... that's not really reliable, because you only get ->flush() from
> the last exiting thread (or more precisely, the last exiting task that
> shares the files_struct).
> 
> What is your goal here? To have a reference to a task without keeping
> the entire task_struct around in memory if someone leaks the signalfd
> to another process - basically like a weak pointer? If so, you could
> store a refcounted reference to "struct pid" instead of a refcounted
> reference to the task_struct, and then do the lookup of the
> task_struct on ->poll and ->read (similar to what procfs does).

Yeah, I think that works out much better (and cleaner). How about this,
then? Follows your advice and turns it into a struct pid instead. I
don't particularly like the -ESRCH in dequeue and setup, what do you
think? For poll, POLLERR seems like a prudent choice.

Tested with the test cases I sent out yesterday, works for me.

diff --git a/fs/signalfd.c b/fs/signalfd.c
index 44b6845b071c..ccb1173b20aa 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -50,6 +50,7 @@ void signalfd_cleanup(struct sighand_struct *sighand)
  
  struct signalfd_ctx {
  	sigset_t sigmask;
+	struct pid *task_pid;
  };
  
  static int signalfd_release(struct inode *inode, struct file *file)
@@ -58,20 +59,41 @@ static int signalfd_release(struct inode *inode, struct file *file)
  	return 0;
  }
  
+static void signalfd_put_task(struct signalfd_ctx *ctx, struct task_struct *tsk)
+{
+	if (ctx->task_pid)
+		put_task_struct(tsk);
+}
+
+static struct task_struct *signalfd_get_task(struct signalfd_ctx *ctx)
+{
+	if (ctx->task_pid)
+		return get_pid_task(ctx->task_pid, PIDTYPE_PID);
+
+	return current;
+}
+
  static __poll_t signalfd_poll(struct file *file, poll_table *wait)
  {
  	struct signalfd_ctx *ctx = file->private_data;
+	struct task_struct *tsk;
  	__poll_t events = 0;
  
-	poll_wait(file, &current->sighand->signalfd_wqh, wait);
+	tsk = signalfd_get_task(ctx);
+	if (tsk) {
+		poll_wait(file, &tsk->sighand->signalfd_wqh, wait);
  
-	spin_lock_irq(&current->sighand->siglock);
-	if (next_signal(&current->pending, &ctx->sigmask) ||
-	    next_signal(&current->signal->shared_pending,
-			&ctx->sigmask))
-		events |= EPOLLIN;
-	spin_unlock_irq(&current->sighand->siglock);
+		spin_lock_irq(&tsk->sighand->siglock);
+		if (next_signal(&tsk->pending, &ctx->sigmask) ||
+		    next_signal(&tsk->signal->shared_pending,
+				&ctx->sigmask))
+			events |= EPOLLIN;
+		spin_unlock_irq(&tsk->sighand->siglock);
  
+		signalfd_put_task(ctx, tsk);
+	} else {
+		events |= EPOLLERR;
+	}
  	return events;
  }
  
@@ -167,10 +189,15 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info
  				int nonblock)
  {
  	ssize_t ret;
+	struct task_struct *tsk;
  	DECLARE_WAITQUEUE(wait, current);
  
-	spin_lock_irq(&current->sighand->siglock);
-	ret = dequeue_signal(current, &ctx->sigmask, info);
+	tsk = signalfd_get_task(ctx);
+	if (!tsk)
+		return -ESRCH;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	ret = dequeue_signal(tsk, &ctx->sigmask, info);
  	switch (ret) {
  	case 0:
  		if (!nonblock)
@@ -178,29 +205,31 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info
  		ret = -EAGAIN;
  		/* fall through */
  	default:
-		spin_unlock_irq(&current->sighand->siglock);
+		spin_unlock_irq(&tsk->sighand->siglock);
+		signalfd_put_task(ctx, tsk);
  		return ret;
  	}
  
-	add_wait_queue(&current->sighand->signalfd_wqh, &wait);
+	add_wait_queue(&tsk->sighand->signalfd_wqh, &wait);
  	for (;;) {
  		set_current_state(TASK_INTERRUPTIBLE);
-		ret = dequeue_signal(current, &ctx->sigmask, info);
+		ret = dequeue_signal(tsk, &ctx->sigmask, info);
  		if (ret != 0)
  			break;
  		if (signal_pending(current)) {
  			ret = -ERESTARTSYS;
  			break;
  		}
-		spin_unlock_irq(&current->sighand->siglock);
+		spin_unlock_irq(&tsk->sighand->siglock);
  		schedule();
-		spin_lock_irq(&current->sighand->siglock);
+		spin_lock_irq(&tsk->sighand->siglock);
  	}
-	spin_unlock_irq(&current->sighand->siglock);
+	spin_unlock_irq(&tsk->sighand->siglock);
  
-	remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
+	remove_wait_queue(&tsk->sighand->signalfd_wqh, &wait);
  	__set_current_state(TASK_RUNNING);
  
+	signalfd_put_task(ctx, tsk);
  	return ret;
  }
  
@@ -267,19 +296,24 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
  	/* Check the SFD_* constants for consistency.  */
  	BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
  	BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
+	BUILD_BUG_ON(SFD_TASK & (SFD_CLOEXEC | SFD_NONBLOCK));
  
-	if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
+	if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK | SFD_TASK))
+		return -EINVAL;
+	if ((flags & (SFD_CLOEXEC | SFD_TASK)) == SFD_TASK)
  		return -EINVAL;
  
  	sigdelsetmask(mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
  	signotset(mask);
  
  	if (ufd == -1) {
-		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+		ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
  		if (!ctx)
  			return -ENOMEM;
  
  		ctx->sigmask = *mask;
+		if (flags & SFD_TASK)
+			ctx->task_pid = get_task_pid(current, PIDTYPE_PID);
  
  		/*
  		 * When we call this, the initialization must be complete, since
@@ -290,6 +324,7 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
  		if (ufd < 0)
  			kfree(ctx);
  	} else {
+		struct task_struct *tsk;
  		struct fd f = fdget(ufd);
  		if (!f.file)
  			return -EBADF;
@@ -298,11 +333,17 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
  			fdput(f);
  			return -EINVAL;
  		}
-		spin_lock_irq(&current->sighand->siglock);
+		tsk = signalfd_get_task(ctx);
+		if (!tsk) {
+			fdput(f);
+			return -ESRCH;
+		}
+		spin_lock_irq(&tsk->sighand->siglock);
  		ctx->sigmask = *mask;
-		spin_unlock_irq(&current->sighand->siglock);
+		spin_unlock_irq(&tsk->sighand->siglock);
  
-		wake_up(&current->sighand->signalfd_wqh);
+		wake_up(&tsk->sighand->signalfd_wqh);
+		signalfd_put_task(ctx, tsk);
  		fdput(f);
  	}
  
diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h
index 83429a05b698..064c5dc3eb99 100644
--- a/include/uapi/linux/signalfd.h
+++ b/include/uapi/linux/signalfd.h
@@ -16,6 +16,7 @@
  /* Flags for signalfd4.  */
  #define SFD_CLOEXEC O_CLOEXEC
  #define SFD_NONBLOCK O_NONBLOCK
+#define SFD_TASK 00000001
  
  struct signalfd_siginfo {
  	__u32 ssi_signo;

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] signalfd: add support for SFD_TASK
  2019-11-27 20:48   ` Jens Axboe
@ 2019-11-27 23:27     ` Jann Horn
  2019-11-28  0:41       ` Jens Axboe
  2019-11-28  9:02       ` Rasmus Villemoes
  0 siblings, 2 replies; 10+ messages in thread
From: Jann Horn @ 2019-11-27 23:27 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, linux-kernel, linux-fsdevel

On Wed, Nov 27, 2019 at 9:48 PM Jens Axboe <axboe@kernel.dk> wrote:
> On 11/27/19 12:23 PM, Jann Horn wrote:
> > On Wed, Nov 27, 2019 at 6:11 AM Jens Axboe <axboe@kernel.dk> wrote:
> >> I posted this a few weeks back, took another look at it and refined it a
> >> bit. I'd like some input on the viability of this approach.
> >>
> >> A new signalfd setup flag is added, SFD_TASK. This is only valid if used
> >> with SFD_CLOEXEC. If set, the task setting up the signalfd descriptor is
> >> remembered in the signalfd context, and will be the one we use for
> >> checking signals in the poll/read handlers in signalfd.
> >>
> >> This is needed to make signalfd useful with io_uring and aio, of which
> >> the former in particular has my interest.
> >>
> >> I _think_ this is sane. To prevent the case of a task clearing O_CLOEXEC
> >> on the signalfd descriptor, forking, and then exiting, we grab a
> >> reference to the task when we assign it. If that original task exits, we
> >> catch it in signalfd_flush() and ensure waiters are woken up.
> >
> > Mh... that's not really reliable, because you only get ->flush() from
> > the last exiting thread (or more precisely, the last exiting task that
> > shares the files_struct).
> >
> > What is your goal here? To have a reference to a task without keeping
> > the entire task_struct around in memory if someone leaks the signalfd
> > to another process - basically like a weak pointer? If so, you could
> > store a refcounted reference to "struct pid" instead of a refcounted
> > reference to the task_struct, and then do the lookup of the
> > task_struct on ->poll and ->read (similar to what procfs does).
>
> Yeah, I think that works out much better (and cleaner). How about this,
> then? Follows your advice and turns it into a struct pid instead. I
> don't particularly like the -ESRCH in dequeue and setup, what do you
> think? For poll, POLLERR seems like a prudent choice.

-ESRCH may be kinda weird, but I also can't think of anything
better... and it does describe the problem pretty accurately: The task
whose signal state you're trying to inspect is gone. I went through
the list of errnos, and everything else sounded more weird...


One more thing, though: We'll have to figure out some way to
invalidate the fd when the target goes through execve(), in particular
if it's a setuid execution. Otherwise we'll be able to just steal
signals that were intended for the other task, that's probably not
good.

So we should:
 a) prevent using ->wait() on an old signalfd once the task has gone
through execve()
 b) kick off all existing waiters
 c) most importantly, prevent ->read() on an old signalfd once the
task has gone through execve()

We probably want to avoid using the cred_guard_mutex here, since it is
quite broad and has some deadlocking issues; it might make sense to
put the update of ->self_exec_id in fs/exec.c under something like the
siglock, and then for a) and c) we can check whether the
->self_exec_id changed while holding the siglock, and for b) we can
add a call to signalfd_cleanup() after the ->self_exec_id change.

> +static void signalfd_put_task(struct signalfd_ctx *ctx, struct task_struct *tsk)
> +{
> +       if (ctx->task_pid)
> +               put_task_struct(tsk);
> +}
> +
> +static struct task_struct *signalfd_get_task(struct signalfd_ctx *ctx)
> +{
> +       if (ctx->task_pid)
> +               return get_pid_task(ctx->task_pid, PIDTYPE_PID);
> +
> +       return current;
> +}

This works, and I guess it's a question of coding style... but I'd
kinda prefer to do the refcount operation in both cases, so that the
semantics of the returned reference are simply "holds a reference"
instead of "either holds a reference or borrows from current depending
on ctx->task_pid". But if you feel strongly about it, feel free to
keep it as-is.

[...]
> -       add_wait_queue(&current->sighand->signalfd_wqh, &wait);
> +       add_wait_queue(&tsk->sighand->signalfd_wqh, &wait);
>         for (;;) {
>                 set_current_state(TASK_INTERRUPTIBLE);
> -               ret = dequeue_signal(current, &ctx->sigmask, info);
> +               ret = dequeue_signal(tsk, &ctx->sigmask, info);
>                 if (ret != 0)
>                         break;
>                 if (signal_pending(current)) {
>                         ret = -ERESTARTSYS;
>                         break;
>                 }
> -               spin_unlock_irq(&current->sighand->siglock);
> +               spin_unlock_irq(&tsk->sighand->siglock);
>                 schedule();

Should we be dropping the reference to the task before schedule() and
re-acquiring it afterwards so that if we're blocked on a signalfd read
and then the corresponding task dies, the refcount can drop to zero
and we can get woken up? Probably doesn't matter, but seems a bit
cleaner to me.

> -               spin_lock_irq(&current->sighand->siglock);
> +               spin_lock_irq(&tsk->sighand->siglock);
>         }
> -       spin_unlock_irq(&current->sighand->siglock);
> +       spin_unlock_irq(&tsk->sighand->siglock);
>
> -       remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
> +       remove_wait_queue(&tsk->sighand->signalfd_wqh, &wait);
>         __set_current_state(TASK_RUNNING);
>
> +       signalfd_put_task(ctx, tsk);
>         return ret;
>   }
>
> @@ -267,19 +296,24 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
>         /* Check the SFD_* constants for consistency.  */
>         BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
>         BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
> +       BUILD_BUG_ON(SFD_TASK & (SFD_CLOEXEC | SFD_NONBLOCK));
>
> -       if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
> +       if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK | SFD_TASK))
> +               return -EINVAL;
> +       if ((flags & (SFD_CLOEXEC | SFD_TASK)) == SFD_TASK)
>                 return -EINVAL;

(non-actionable comment: It seems kinda weird that you can specify
these parameters with no effect for the `uffd != -1` case... but since
the existing parameters already work that way, I guess it's
consistent.)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] signalfd: add support for SFD_TASK
  2019-11-27 23:27     ` Jann Horn
@ 2019-11-28  0:41       ` Jens Axboe
  2019-11-28  9:02       ` Rasmus Villemoes
  1 sibling, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2019-11-28  0:41 UTC (permalink / raw)
  To: Jann Horn; +Cc: io-uring, linux-kernel, linux-fsdevel

On 11/27/19 4:27 PM, Jann Horn wrote:
> On Wed, Nov 27, 2019 at 9:48 PM Jens Axboe <axboe@kernel.dk> wrote:
>> On 11/27/19 12:23 PM, Jann Horn wrote:
>>> On Wed, Nov 27, 2019 at 6:11 AM Jens Axboe <axboe@kernel.dk> wrote:
>>>> I posted this a few weeks back, took another look at it and refined it a
>>>> bit. I'd like some input on the viability of this approach.
>>>>
>>>> A new signalfd setup flag is added, SFD_TASK. This is only valid if used
>>>> with SFD_CLOEXEC. If set, the task setting up the signalfd descriptor is
>>>> remembered in the signalfd context, and will be the one we use for
>>>> checking signals in the poll/read handlers in signalfd.
>>>>
>>>> This is needed to make signalfd useful with io_uring and aio, of which
>>>> the former in particular has my interest.
>>>>
>>>> I _think_ this is sane. To prevent the case of a task clearing O_CLOEXEC
>>>> on the signalfd descriptor, forking, and then exiting, we grab a
>>>> reference to the task when we assign it. If that original task exits, we
>>>> catch it in signalfd_flush() and ensure waiters are woken up.
>>>
>>> Mh... that's not really reliable, because you only get ->flush() from
>>> the last exiting thread (or more precisely, the last exiting task that
>>> shares the files_struct).
>>>
>>> What is your goal here? To have a reference to a task without keeping
>>> the entire task_struct around in memory if someone leaks the signalfd
>>> to another process - basically like a weak pointer? If so, you could
>>> store a refcounted reference to "struct pid" instead of a refcounted
>>> reference to the task_struct, and then do the lookup of the
>>> task_struct on ->poll and ->read (similar to what procfs does).
>>
>> Yeah, I think that works out much better (and cleaner). How about this,
>> then? Follows your advice and turns it into a struct pid instead. I
>> don't particularly like the -ESRCH in dequeue and setup, what do you
>> think? For poll, POLLERR seems like a prudent choice.
> 
> -ESRCH may be kinda weird, but I also can't think of anything
> better... and it does describe the problem pretty accurately: The task
> whose signal state you're trying to inspect is gone. I went through
> the list of errnos, and everything else sounded more weird...

Right, that's why I ultimately ended up with -ESRCH. But I'll take that
as concensus :-)

> One more thing, though: We'll have to figure out some way to
> invalidate the fd when the target goes through execve(), in particular
> if it's a setuid execution. Otherwise we'll be able to just steal
> signals that were intended for the other task, that's probably not
> good.
> 
> So we should:
>   a) prevent using ->wait() on an old signalfd once the task has gone
> through execve()
>   b) kick off all existing waiters
>   c) most importantly, prevent ->read() on an old signalfd once the
> task has gone through execve()
> 
> We probably want to avoid using the cred_guard_mutex here, since it is
> quite broad and has some deadlocking issues; it might make sense to
> put the update of ->self_exec_id in fs/exec.c under something like the
> siglock, and then for a) and c) we can check whether the
> ->self_exec_id changed while holding the siglock, and for b) we can
> add a call to signalfd_cleanup() after the ->self_exec_id change.

OK, that seems like one for after the break. Was hoping there'd be a
more trivial way to accomplish that, I'll give it some thought.

>> +static void signalfd_put_task(struct signalfd_ctx *ctx, struct task_struct *tsk)
>> +{
>> +       if (ctx->task_pid)
>> +               put_task_struct(tsk);
>> +}
>> +
>> +static struct task_struct *signalfd_get_task(struct signalfd_ctx *ctx)
>> +{
>> +       if (ctx->task_pid)
>> +               return get_pid_task(ctx->task_pid, PIDTYPE_PID);
>> +
>> +       return current;
>> +}
> 
> This works, and I guess it's a question of coding style... but I'd
> kinda prefer to do the refcount operation in both cases, so that the
> semantics of the returned reference are simply "holds a reference"
> instead of "either holds a reference or borrows from current depending
> on ctx->task_pid". But if you feel strongly about it, feel free to
> keep it as-is.

I don't feel super strongly about it, but I wanted to avoid adding an
unnecessary get/put of the current task for the existing use cases of
signalfd. So I'll probably just keep it as-is.

>> -       add_wait_queue(&current->sighand->signalfd_wqh, &wait);
>> +       add_wait_queue(&tsk->sighand->signalfd_wqh, &wait);
>>          for (;;) {
>>                  set_current_state(TASK_INTERRUPTIBLE);
>> -               ret = dequeue_signal(current, &ctx->sigmask, info);
>> +               ret = dequeue_signal(tsk, &ctx->sigmask, info);
>>                  if (ret != 0)
>>                          break;
>>                  if (signal_pending(current)) {
>>                          ret = -ERESTARTSYS;
>>                          break;
>>                  }
>> -               spin_unlock_irq(&current->sighand->siglock);
>> +               spin_unlock_irq(&tsk->sighand->siglock);
>>                  schedule();
> 
> Should we be dropping the reference to the task before schedule() and
> re-acquiring it afterwards so that if we're blocked on a signalfd read
> and then the corresponding task dies, the refcount can drop to zero
> and we can get woken up? Probably doesn't matter, but seems a bit
> cleaner to me.

That would be simple enough to do, as we know that tsk is either still
the same, or we need to abort. Hence no need to fiddle waitqueues at
that point. I'll make that change.

>> -               spin_lock_irq(&current->sighand->siglock);
>> +               spin_lock_irq(&tsk->sighand->siglock);
>>          }
>> -       spin_unlock_irq(&current->sighand->siglock);
>> +       spin_unlock_irq(&tsk->sighand->siglock);
>>
>> -       remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
>> +       remove_wait_queue(&tsk->sighand->signalfd_wqh, &wait);
>>          __set_current_state(TASK_RUNNING);
>>
>> +       signalfd_put_task(ctx, tsk);
>>          return ret;
>>    }
>>
>> @@ -267,19 +296,24 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
>>          /* Check the SFD_* constants for consistency.  */
>>          BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
>>          BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
>> +       BUILD_BUG_ON(SFD_TASK & (SFD_CLOEXEC | SFD_NONBLOCK));
>>
>> -       if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
>> +       if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK | SFD_TASK))
>> +               return -EINVAL;
>> +       if ((flags & (SFD_CLOEXEC | SFD_TASK)) == SFD_TASK)
>>                  return -EINVAL;
> 
> (non-actionable comment: It seems kinda weird that you can specify
> these parameters with no effect for the `uffd != -1` case... but since
> the existing parameters already work that way, I guess it's
> consistent.)

Yeah, just following what it already does, though I do agree it is weird
with the two separate cases and it only impacting one of them. Didn't
want to make it behave differently.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] signalfd: add support for SFD_TASK
  2019-11-27 23:27     ` Jann Horn
  2019-11-28  0:41       ` Jens Axboe
@ 2019-11-28  9:02       ` Rasmus Villemoes
  2019-11-28 10:07         ` Jann Horn
  1 sibling, 1 reply; 10+ messages in thread
From: Rasmus Villemoes @ 2019-11-28  9:02 UTC (permalink / raw)
  To: Jann Horn, Jens Axboe; +Cc: io-uring, linux-kernel, linux-fsdevel

On 28/11/2019 00.27, Jann Horn wrote:

> One more thing, though: We'll have to figure out some way to
> invalidate the fd when the target goes through execve(), in particular
> if it's a setuid execution. Otherwise we'll be able to just steal
> signals that were intended for the other task, that's probably not
> good.
> 
> So we should:
>  a) prevent using ->wait() on an old signalfd once the task has gone
> through execve()
>  b) kick off all existing waiters
>  c) most importantly, prevent ->read() on an old signalfd once the
> task has gone through execve()
> 
> We probably want to avoid using the cred_guard_mutex here, since it is
> quite broad and has some deadlocking issues; it might make sense to
> put the update of ->self_exec_id in fs/exec.c under something like the
> siglock,

What prevents one from exec'ing a trivial helper 2^32-1 times before
exec'ing into the victim binary?

Rasmus

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] signalfd: add support for SFD_TASK
  2019-11-28  9:02       ` Rasmus Villemoes
@ 2019-11-28 10:07         ` Jann Horn
  2019-11-28 19:18           ` Jann Horn
  0 siblings, 1 reply; 10+ messages in thread
From: Jann Horn @ 2019-11-28 10:07 UTC (permalink / raw)
  To: Rasmus Villemoes; +Cc: Jens Axboe, io-uring, linux-kernel, linux-fsdevel

On Thu, Nov 28, 2019 at 10:02 AM Rasmus Villemoes
<linux@rasmusvillemoes.dk> wrote:
> On 28/11/2019 00.27, Jann Horn wrote:
>
> > One more thing, though: We'll have to figure out some way to
> > invalidate the fd when the target goes through execve(), in particular
> > if it's a setuid execution. Otherwise we'll be able to just steal
> > signals that were intended for the other task, that's probably not
> > good.
> >
> > So we should:
> >  a) prevent using ->wait() on an old signalfd once the task has gone
> > through execve()
> >  b) kick off all existing waiters
> >  c) most importantly, prevent ->read() on an old signalfd once the
> > task has gone through execve()
> >
> > We probably want to avoid using the cred_guard_mutex here, since it is
> > quite broad and has some deadlocking issues; it might make sense to
> > put the update of ->self_exec_id in fs/exec.c under something like the
> > siglock,
>
> What prevents one from exec'ing a trivial helper 2^32-1 times before
> exec'ing into the victim binary?

Uh, yeah... that thing should probably become 64 bits wide, too.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] signalfd: add support for SFD_TASK
  2019-11-28 10:07         ` Jann Horn
@ 2019-11-28 19:18           ` Jann Horn
  2019-11-28 22:46             ` Jann Horn
  2019-11-29 22:30             ` Jann Horn
  0 siblings, 2 replies; 10+ messages in thread
From: Jann Horn @ 2019-11-28 19:18 UTC (permalink / raw)
  To: Rasmus Villemoes; +Cc: Jens Axboe, io-uring, linux-kernel, linux-fsdevel

On Thu, Nov 28, 2019 at 11:07 AM Jann Horn <jannh@google.com> wrote:
> On Thu, Nov 28, 2019 at 10:02 AM Rasmus Villemoes
> <linux@rasmusvillemoes.dk> wrote:
> > On 28/11/2019 00.27, Jann Horn wrote:
> >
> > > One more thing, though: We'll have to figure out some way to
> > > invalidate the fd when the target goes through execve(), in particular
> > > if it's a setuid execution. Otherwise we'll be able to just steal
> > > signals that were intended for the other task, that's probably not
> > > good.
> > >
> > > So we should:
> > >  a) prevent using ->wait() on an old signalfd once the task has gone
> > > through execve()
> > >  b) kick off all existing waiters
> > >  c) most importantly, prevent ->read() on an old signalfd once the
> > > task has gone through execve()
> > >
> > > We probably want to avoid using the cred_guard_mutex here, since it is
> > > quite broad and has some deadlocking issues; it might make sense to
> > > put the update of ->self_exec_id in fs/exec.c under something like the
> > > siglock,
> >
> > What prevents one from exec'ing a trivial helper 2^32-1 times before
> > exec'ing into the victim binary?
>
> Uh, yeah... that thing should probably become 64 bits wide, too.

Actually, that'd still be wrong even with the existing kernel code for
two reasons:

 - if you reparent to a subreaper, the existing exec_id comparison breaks
 - the new check here is going to break if a non-leader thread goes
through execve(), because of the weird magic where the thread going
through execve steals the thread id (PID) of the leader

I'm gone for the day, but will try to dust off the years-old patch for
this that I have lying around somewhere tomorrow. I should probably
send it through akpm's tree with cc stable, given that this is already
kinda broken in existing releases...

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] signalfd: add support for SFD_TASK
  2019-11-28 19:18           ` Jann Horn
@ 2019-11-28 22:46             ` Jann Horn
  2019-11-29 22:30             ` Jann Horn
  1 sibling, 0 replies; 10+ messages in thread
From: Jann Horn @ 2019-11-28 22:46 UTC (permalink / raw)
  To: Rasmus Villemoes; +Cc: Jens Axboe, io-uring, linux-kernel, linux-fsdevel

On Thu, Nov 28, 2019 at 8:18 PM Jann Horn <jannh@google.com> wrote:
> On Thu, Nov 28, 2019 at 11:07 AM Jann Horn <jannh@google.com> wrote:
> > On Thu, Nov 28, 2019 at 10:02 AM Rasmus Villemoes
> > <linux@rasmusvillemoes.dk> wrote:
> > > On 28/11/2019 00.27, Jann Horn wrote:
> > >
> > > > One more thing, though: We'll have to figure out some way to
> > > > invalidate the fd when the target goes through execve(), in particular
> > > > if it's a setuid execution. Otherwise we'll be able to just steal
> > > > signals that were intended for the other task, that's probably not
> > > > good.
> > > >
> > > > So we should:
> > > >  a) prevent using ->wait() on an old signalfd once the task has gone
> > > > through execve()
> > > >  b) kick off all existing waiters
> > > >  c) most importantly, prevent ->read() on an old signalfd once the
> > > > task has gone through execve()
> > > >
> > > > We probably want to avoid using the cred_guard_mutex here, since it is
> > > > quite broad and has some deadlocking issues; it might make sense to
> > > > put the update of ->self_exec_id in fs/exec.c under something like the
> > > > siglock,
> > >
> > > What prevents one from exec'ing a trivial helper 2^32-1 times before
> > > exec'ing into the victim binary?
> >
> > Uh, yeah... that thing should probably become 64 bits wide, too.
>
> Actually, that'd still be wrong even with the existing kernel code for
> two reasons:
>
>  - if you reparent to a subreaper, the existing exec_id comparison breaks

... actually, I was wrong about this, this case is fine because the
->exit_signal is reset in reparent_leader().

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] signalfd: add support for SFD_TASK
  2019-11-28 19:18           ` Jann Horn
  2019-11-28 22:46             ` Jann Horn
@ 2019-11-29 22:30             ` Jann Horn
  1 sibling, 0 replies; 10+ messages in thread
From: Jann Horn @ 2019-11-29 22:30 UTC (permalink / raw)
  To: Rasmus Villemoes; +Cc: Jens Axboe, io-uring, linux-kernel, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 2273 bytes --]

On Thu, Nov 28, 2019 at 8:18 PM Jann Horn <jannh@google.com> wrote:
> On Thu, Nov 28, 2019 at 11:07 AM Jann Horn <jannh@google.com> wrote:
> > On Thu, Nov 28, 2019 at 10:02 AM Rasmus Villemoes
> > <linux@rasmusvillemoes.dk> wrote:
> > > On 28/11/2019 00.27, Jann Horn wrote:
> > >
> > > > One more thing, though: We'll have to figure out some way to
> > > > invalidate the fd when the target goes through execve(), in particular
> > > > if it's a setuid execution. Otherwise we'll be able to just steal
> > > > signals that were intended for the other task, that's probably not
> > > > good.
> > > >
> > > > So we should:
> > > >  a) prevent using ->wait() on an old signalfd once the task has gone
> > > > through execve()
> > > >  b) kick off all existing waiters
> > > >  c) most importantly, prevent ->read() on an old signalfd once the
> > > > task has gone through execve()
> > > >
> > > > We probably want to avoid using the cred_guard_mutex here, since it is
> > > > quite broad and has some deadlocking issues; it might make sense to
> > > > put the update of ->self_exec_id in fs/exec.c under something like the
> > > > siglock,
> > >
> > > What prevents one from exec'ing a trivial helper 2^32-1 times before
> > > exec'ing into the victim binary?
> >
> > Uh, yeah... that thing should probably become 64 bits wide, too.
>
> Actually, that'd still be wrong even with the existing kernel code for
> two reasons:
>
>  - if you reparent to a subreaper, the existing exec_id comparison breaks
>  - the new check here is going to break if a non-leader thread goes
> through execve(), because of the weird magic where the thread going
> through execve steals the thread id (PID) of the leader
>
> I'm gone for the day, but will try to dust off the years-old patch for
> this that I have lying around somewhere tomorrow. I should probably
> send it through akpm's tree with cc stable, given that this is already
> kinda broken in existing releases...

I'm taking that back, given that I was wrong when writing this mail.
But I've attached the old patch, in case you want to reuse it. That
cpu-plus-64-bits scheme was Andy Lutomirski's idea.

If you use that, you'd have to take the cred_guard_mutex for ->poll
and ->read, but I guess that's probably fine for signalfd.

[-- Attachment #2: 0001-exec-generate-unique-per-execution-per-process-ident.patch --]
[-- Type: text/x-patch, Size: 3024 bytes --]

From a6fcfcc15dacaeb4cc120df447a719da3b9e0c9d Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Fri, 29 Nov 2019 23:10:33 +0100
Subject: [PATCH] exec: generate unique per-execution, per-process identifiers

This adds a member privunit ("privilege unit") to task_struct.
privunit is only shared by threads and changes on execve().
It can be used to check whether two tasks are temporally and spatially
equal for privilege checking purposes.

The implementation of locally unique IDs is in sched.h and exec.c for now
because those are the only users so far - if anything else wants to use
them in the future, they can be moved elsewhere.

Signed-off-by: Jann Horn <jannh@google.com>
---
 fs/exec.c             | 17 +++++++++++++++++
 include/linux/sched.h | 14 ++++++++++++++
 kernel/fork.c         |  1 +
 3 files changed, 32 insertions(+)

diff --git a/fs/exec.c b/fs/exec.c
index c27231234764..4cea8acb95e5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1331,6 +1331,22 @@ void would_dump(struct linux_binprm *bprm, struct file *file)
 }
 EXPORT_SYMBOL(would_dump);
 
+/* value 0 is reserved for init */
+static DEFINE_PER_CPU(u64, luid_counters) = 1;
+
+/*
+ * Allocates a new LUID and writes the allocated LUID to @out.
+ * This function must not be called from IRQ context.
+ */
+void alloc_luid(struct luid *out)
+{
+	preempt_disable();
+	out->count = raw_cpu_read(luid_counters);
+	raw_cpu_add(luid_counters, 1);
+	out->cpu = smp_processor_id();
+	preempt_enable();
+}
+
 void setup_new_exec(struct linux_binprm * bprm)
 {
 	/*
@@ -1384,6 +1400,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 	current->self_exec_id++;
+	alloc_luid(&current->privunit);
 	flush_signal_handlers(current, 0);
 }
 EXPORT_SYMBOL(setup_new_exec);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 07e68d9f5dc4..6a3c16b3b43d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -626,6 +626,19 @@ struct wake_q_node {
 	struct wake_q_node *next;
 };
 
+/* locally unique ID */
+struct luid {
+	u64 count;
+	unsigned int cpu;
+};
+
+void alloc_luid(struct luid *out);
+
+static inline bool luid_eq(const struct luid *a, const struct luid *b)
+{
+	return a->count == b->count && a->cpu == b->cpu;
+}
+
 struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/*
@@ -941,6 +954,7 @@ struct task_struct {
 	/* Thread group tracking: */
 	u32				parent_exec_id;
 	u32				self_exec_id;
+	struct luid			privunit;
 
 	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
 	spinlock_t			alloc_lock;
diff --git a/kernel/fork.c b/kernel/fork.c
index 00b64f41c2b4..75784ff9c9f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2152,6 +2152,7 @@ static __latent_entropy struct task_struct *copy_process(
 			p->exit_signal = args->exit_signal;
 		p->group_leader = p;
 		p->tgid = p->pid;
+		alloc_luid(&p->privunit);
 	}
 
 	p->nr_dirtied = 0;
-- 
2.24.0.393.g34dc348eaf-goog


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2019-11-29 22:30 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-27  5:11 [PATCH RFC] signalfd: add support for SFD_TASK Jens Axboe
2019-11-27 19:23 ` Jann Horn
2019-11-27 20:48   ` Jens Axboe
2019-11-27 23:27     ` Jann Horn
2019-11-28  0:41       ` Jens Axboe
2019-11-28  9:02       ` Rasmus Villemoes
2019-11-28 10:07         ` Jann Horn
2019-11-28 19:18           ` Jann Horn
2019-11-28 22:46             ` Jann Horn
2019-11-29 22:30             ` Jann Horn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).