All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] seccomp: Add group_leader pid to seccomp_notif
@ 2020-05-15 23:40 Sargun Dhillon
  2020-05-17  7:17 ` Kees Cook
  2020-05-18 23:08 ` Eric W. Biederman
  0 siblings, 2 replies; 22+ messages in thread
From: Sargun Dhillon @ 2020-05-15 23:40 UTC (permalink / raw)
  To: linux-kernel, containers, linux-api
  Cc: Sargun Dhillon, christian.brauner, tycho, keescook, cyphar

This includes the thread group leader ID in the seccomp_notif. This is
immediately useful for opening up a pidfd for the group leader, as
pidfds only work on group leaders.

Previously, it was considered to include an actual pidfd in the
seccomp_notif structure[1], but it was suggested to avoid proliferating
mechanisms to create pidfds[2].

[1]: https://lkml.org/lkml/2020/1/24/133
[2]: https://lkml.org/lkml/2020/5/15/481

Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Sargun Dhillon <sargun@sargun.me>
---
 include/uapi/linux/seccomp.h                  |  2 +
 kernel/seccomp.c                              |  1 +
 tools/testing/selftests/seccomp/seccomp_bpf.c | 50 +++++++++++++++++++
 3 files changed, 53 insertions(+)

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index c1735455bc53..f0c272ef0f1e 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -75,6 +75,8 @@ struct seccomp_notif {
 	__u32 pid;
 	__u32 flags;
 	struct seccomp_data data;
+	__u32 tgid;
+	__u8 pad0[4];
 };
 
 /*
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 55a6184f5990..538bcbbcf4da 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1061,6 +1061,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
 
 	unotif.id = knotif->id;
 	unotif.pid = task_pid_vnr(knotif->task);
+	unotif.tgid = task_tgid_vnr(knotif->task);
 	unotif.data = *(knotif->data);
 
 	knotif->state = SECCOMP_NOTIFY_SENT;
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index c0aa46ce14f6..5658c6e95461 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -187,6 +187,8 @@ struct seccomp_notif {
 	__u32 pid;
 	__u32 flags;
 	struct seccomp_data data;
+	__u32 tgid;
+	__u8 pad0[4];
 };
 
 struct seccomp_notif_resp {
@@ -3226,6 +3228,8 @@ TEST(user_notification_basic)
 		req.pid = 0;
 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
 	}
+	EXPECT_EQ(pid, req.pid);
+	EXPECT_EQ(pid, req.tgid);
 
 	pollfd.fd = listener;
 	pollfd.events = POLLIN | POLLOUT;
@@ -3453,6 +3457,7 @@ TEST(user_notification_child_pid_ns)
 
 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
 	EXPECT_EQ(req.pid, pid);
+	EXPECT_EQ(req.tgid, pid);
 
 	resp.id = req.id;
 	resp.error = 0;
@@ -3686,6 +3691,51 @@ TEST(user_notification_continue)
 	}
 }
 
+void *getppid_thread(void *arg)
+{
+	int *tid = arg;
+
+	*tid = syscall(__NR_gettid);
+	if (*tid <= 0)
+		return (void *)(long)errno;
+	return NULL;
+}
+
+TEST(user_notification_groupleader)
+{
+	struct seccomp_notif_resp resp = {};
+	struct seccomp_notif req = {};
+	int ret, listener, tid, pid;
+	pthread_t thread;
+	void *status;
+
+	pid = getpid();
+	ASSERT_GT(pid, 0);
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	listener = user_trap_syscall(__NR_gettid,
+				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	ASSERT_EQ(0, pthread_create(&thread, NULL, getppid_thread, &tid));
+
+	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+	resp.id = req.id;
+	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	ASSERT_EQ(0, pthread_join(thread, &status));
+	ASSERT_EQ(0, status);
+
+	EXPECT_EQ(tid, req.pid);
+	EXPECT_EQ(pid, req.tgid);
+}
+
+
 /*
  * TODO:
  * - add microbenchmarks
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-15 23:40 [PATCH] seccomp: Add group_leader pid to seccomp_notif Sargun Dhillon
@ 2020-05-17  7:17 ` Kees Cook
  2020-05-17 10:47   ` Christian Brauner
  2020-05-17 11:17   ` Sargun Dhillon
  2020-05-18 23:08 ` Eric W. Biederman
  1 sibling, 2 replies; 22+ messages in thread
From: Kees Cook @ 2020-05-17  7:17 UTC (permalink / raw)
  To: Sargun Dhillon
  Cc: linux-kernel, containers, linux-api, christian.brauner, tycho, cyphar

On Fri, May 15, 2020 at 04:40:05PM -0700, Sargun Dhillon wrote:
> This includes the thread group leader ID in the seccomp_notif. This is
> immediately useful for opening up a pidfd for the group leader, as
> pidfds only work on group leaders.
> 
> Previously, it was considered to include an actual pidfd in the
> seccomp_notif structure[1], but it was suggested to avoid proliferating
> mechanisms to create pidfds[2].
> 
> [1]: https://lkml.org/lkml/2020/1/24/133
> [2]: https://lkml.org/lkml/2020/5/15/481

nit: please use lore.kernel.org/lkml/ URLs

> Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> ---
>  include/uapi/linux/seccomp.h                  |  2 +
>  kernel/seccomp.c                              |  1 +
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 50 +++++++++++++++++++
>  3 files changed, 53 insertions(+)
> 
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index c1735455bc53..f0c272ef0f1e 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -75,6 +75,8 @@ struct seccomp_notif {
>  	__u32 pid;
>  	__u32 flags;
>  	struct seccomp_data data;
> +	__u32 tgid;
> +	__u8 pad0[4];
>  };

I think we need to leave off padding and instead use __packed. If we
don't then userspace can't tell when "pad0" changes its "meaning" (i.e.
the size of seccomp_notif becomes 88 bytes with above -- either via
explicit padding like you've got or via implicit by the compiler. If
some other u32 gets added in the future, user space will still see "88"
as the size.

So I *think* the right change here is:

-};
+	__u32 tgid;
+} __packed;

Though tgid may need to go above seccomp_data... for when it grows.
Agh...

_However_, unfortunately, I appear to have no thought this through very
well, and there is actually no sanity-checking in the kernel for dealing
with an old userspace when sizes change. :( For example, if a userspace
doesn't check sizes and calls an ioctl, etc, the kernel will clobber the
user buffer if it's too small.

Even the SECCOMP_GET_NOTIF_SIZES command lacks a buffer size argument.
:(

So:

- should we just declare such userspace as "wrong"? I don't think
  that'll work, especially since what if we ever change the size of
  seccomp_data...  that predated the ..._SIZES command.

- should we add a SECCOMP_SET_SIZES command to tell the kernel what
  we're expecting? There's no "state" associated across seccomp(2)
  calls, but maybe that doesn't matter because only user_notif writes
  back to userspace. For the ioctl, the state could be part of the
  private file data? Sending seccomp_data back to userspace only
  happens here, and any changes in seccomp_data size will just be seen
  as allowing a filter to query further into it.

- should GET_SIZES report "useful" size? (i.e. exclude padding?)

> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c

Yay test updates! :)

> +TEST(user_notification_groupleader)

In my first pass of review I was going to say "can you please also check
the sizes used by the ioctl?" But that triggered the above size checking
mess in my mind.

Let me look at this more closely on Monday, and I'll proposed something.
:P

Thanks!

-Kees

-- 
Kees Cook

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17  7:17 ` Kees Cook
@ 2020-05-17 10:47   ` Christian Brauner
  2020-05-17 11:21     ` Aleksa Sarai
  2020-05-17 11:17   ` Sargun Dhillon
  1 sibling, 1 reply; 22+ messages in thread
From: Christian Brauner @ 2020-05-17 10:47 UTC (permalink / raw)
  To: Kees Cook
  Cc: Sargun Dhillon, linux-kernel, containers, linux-api, tycho, cyphar

On Sun, May 17, 2020 at 12:17:14AM -0700, Kees Cook wrote:
> On Fri, May 15, 2020 at 04:40:05PM -0700, Sargun Dhillon wrote:
> > This includes the thread group leader ID in the seccomp_notif. This is
> > immediately useful for opening up a pidfd for the group leader, as
> > pidfds only work on group leaders.
> > 
> > Previously, it was considered to include an actual pidfd in the
> > seccomp_notif structure[1], but it was suggested to avoid proliferating
> > mechanisms to create pidfds[2].
> > 
> > [1]: https://lkml.org/lkml/2020/1/24/133
> > [2]: https://lkml.org/lkml/2020/5/15/481
> 
> nit: please use lore.kernel.org/lkml/ URLs
> 
> > Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
> > Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> > ---
> >  include/uapi/linux/seccomp.h                  |  2 +
> >  kernel/seccomp.c                              |  1 +
> >  tools/testing/selftests/seccomp/seccomp_bpf.c | 50 +++++++++++++++++++
> >  3 files changed, 53 insertions(+)
> > 
> > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> > index c1735455bc53..f0c272ef0f1e 100644
> > --- a/include/uapi/linux/seccomp.h
> > +++ b/include/uapi/linux/seccomp.h
> > @@ -75,6 +75,8 @@ struct seccomp_notif {
> >  	__u32 pid;
> >  	__u32 flags;
> >  	struct seccomp_data data;
> > +	__u32 tgid;
> > +	__u8 pad0[4];
> >  };
> 
> I think we need to leave off padding and instead use __packed. If we
> don't then userspace can't tell when "pad0" changes its "meaning" (i.e.
> the size of seccomp_notif becomes 88 bytes with above -- either via
> explicit padding like you've got or via implicit by the compiler. If
> some other u32 gets added in the future, user space will still see "88"
> as the size.
> 
> So I *think* the right change here is:
> 
> -};
> +	__u32 tgid;
> +} __packed;
> 
> Though tgid may need to go above seccomp_data... for when it grows.
> Agh...
> 
> _However_, unfortunately, I appear to have no thought this through very
> well, and there is actually no sanity-checking in the kernel for dealing
> with an old userspace when sizes change. :( For example, if a userspace
> doesn't check sizes and calls an ioctl, etc, the kernel will clobber the
> user buffer if it's too small.

I'd just argue that that's just userspace messing up.

> 
> Even the SECCOMP_GET_NOTIF_SIZES command lacks a buffer size argument.
> :(
> 
> So:
> 
> - should we just declare such userspace as "wrong"? I don't think
>   that'll work, especially since what if we ever change the size of
>   seccomp_data...  that predated the ..._SIZES command.

Yeah, that's nasty since the struct member in seccomp_notif would now
clobber each other.

> 
> - should we add a SECCOMP_SET_SIZES command to tell the kernel what
>   we're expecting? There's no "state" associated across seccomp(2)
>   calls, but maybe that doesn't matter because only user_notif writes
>   back to userspace. For the ioctl, the state could be part of the
>   private file data? Sending seccomp_data back to userspace only
>   happens here, and any changes in seccomp_data size will just be seen
>   as allowing a filter to query further into it.

Sounds ok-ish in my opinion.

> 
> - should GET_SIZES report "useful" size? (i.e. exclude padding?)

Or... And that's more invasive but ultimately cleaner we v2 the whole
thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
embedd the size argument in the structs. Userspace sets the size
argument, we use get_user() to get the size first and then
copy_struct_from_user() to handle it cleanly based on that. A similar
model as with sched (has other unrelated quirks because they messed up
something too):

static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
	u32 size;
	int ret;

	/* Zero the full structure, so that a short copy will be nice: */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	/* ABI compatibility quirk: */
	if (!size)
		size = SCHED_ATTR_SIZE_VER0;
	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
		goto err_size;

	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
	if (ret) {
		if (ret == -E2BIG)
			goto err_size;
		return ret;
	}

We're probably the biggest user of this right now and I'd be ok with
that change. If it's a v2 than whatever. :)

Christian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17  7:17 ` Kees Cook
  2020-05-17 10:47   ` Christian Brauner
@ 2020-05-17 11:17   ` Sargun Dhillon
  1 sibling, 0 replies; 22+ messages in thread
From: Sargun Dhillon @ 2020-05-17 11:17 UTC (permalink / raw)
  To: Kees Cook
  Cc: LKML, Linux Containers, Linux API, Christian Brauner,
	Tycho Andersen, Aleksa Sarai

On Sun, May 17, 2020 at 12:17 AM Kees Cook <keescook@chromium.org> wrote:
>
> On Fri, May 15, 2020 at 04:40:05PM -0700, Sargun Dhillon wrote:
> > This includes the thread group leader ID in the seccomp_notif. This is
> > immediately useful for opening up a pidfd for the group leader, as
> > pidfds only work on group leaders.
> >
> > Previously, it was considered to include an actual pidfd in the
> > seccomp_notif structure[1], but it was suggested to avoid proliferating
> > mechanisms to create pidfds[2].
> >
> > [1]: https://lkml.org/lkml/2020/1/24/133
> > [2]: https://lkml.org/lkml/2020/5/15/481
>
> nit: please use lore.kernel.org/lkml/ URLs
>
> > Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
> > Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> > ---
> >  include/uapi/linux/seccomp.h                  |  2 +
> >  kernel/seccomp.c                              |  1 +
> >  tools/testing/selftests/seccomp/seccomp_bpf.c | 50 +++++++++++++++++++
> >  3 files changed, 53 insertions(+)
> >
> > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> > index c1735455bc53..f0c272ef0f1e 100644
> > --- a/include/uapi/linux/seccomp.h
> > +++ b/include/uapi/linux/seccomp.h
> > @@ -75,6 +75,8 @@ struct seccomp_notif {
> >       __u32 pid;
> >       __u32 flags;
> >       struct seccomp_data data;
> > +     __u32 tgid;
> > +     __u8 pad0[4];
> >  };
>
> I think we need to leave off padding and instead use __packed. If we
> don't then userspace can't tell when "pad0" changes its "meaning" (i.e.
> the size of seccomp_notif becomes 88 bytes with above -- either via
> explicit padding like you've got or via implicit by the compiler. If
> some other u32 gets added in the future, user space will still see "88"
> as the size.
>
I've had previous feedback about using "packed". See:
https://lore.kernel.org/lkml/87o8w9bcaf.fsf@mid.deneb.enyo.de/
https://lore.kernel.org/lkml/a328b91d-fd8f-4f27-b3c2-91a9c45f18c0@rasmusvillemoes.dk/
> So I *think* the right change here is:
>
> -};
> +       __u32 tgid;
> +} __packed;
>
> Though tgid may need to go above seccomp_data... for when it grows.
> Agh...
(How) can seccomp_data grow safely, even with this extensibility mechanism?
>
> _However_, unfortunately, I appear to have no thought this through very
> well, and there is actually no sanity-checking in the kernel for dealing
> with an old userspace when sizes change. :( For example, if a userspace
> doesn't check sizes and calls an ioctl, etc, the kernel will clobber the
> user buffer if it's too small.
>
> Even the SECCOMP_GET_NOTIF_SIZES command lacks a buffer size argument.
> :(
>
> So:
>
> - should we just declare such userspace as "wrong"? I don't think
>   that'll work, especially since what if we ever change the size of
>   seccomp_data...  that predated the ..._SIZES command.
>
> - should we add a SECCOMP_SET_SIZES command to tell the kernel what
>   we're expecting? There's no "state" associated across seccomp(2)
>   calls, but maybe that doesn't matter because only user_notif writes
>   back to userspace. For the ioctl, the state could be part of the
>   private file data? Sending seccomp_data back to userspace only
>   happens here, and any changes in seccomp_data size will just be seen
>   as allowing a filter to query further into it.
Will we ever grow seccomp_data?

I suggest we throw away the _SIZES api, and just introduce RECV2, which sends
back a known, fixed format, and deprecate these dynamically sized uapi
shenanigans.

(Queue RECV3, etc..)

Maybe we do something like perf_event_open, where there's a read_format,
and that's used by the user to determine how big of a response / fields they
want to get?

>
> - should GET_SIZES report "useful" size? (i.e. exclude padding?)
>
> > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
>
> Yay test updates! :)
>
> > +TEST(user_notification_groupleader)
>
> In my first pass of review I was going to say "can you please also check
> the sizes used by the ioctl?" But that triggered the above size checking
> mess in my mind.
>
> Let me look at this more closely on Monday, and I'll proposed something.
> :P
To summarize my set of ideas:
1. We take the ptrace-style API, where we have a request to get the tgid of
a given request ID (or any new / extensible field)
2. We add a perf_event_open style API, where you have to tell it what fields
to include in the response
3. We introduce RECV2 [through N]
4. We never extend seccomp_data, and just continue to append things to the API
5. We rev the API _once_ and unroll seccomp_data, and make it so that
new members have to be *asked for*, rather than are implicitly
included.
>
> Thanks!
>
> -Kees
>
> --
> Kees Cook

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 10:47   ` Christian Brauner
@ 2020-05-17 11:21     ` Aleksa Sarai
  2020-05-17 14:23       ` Tycho Andersen
  0 siblings, 1 reply; 22+ messages in thread
From: Aleksa Sarai @ 2020-05-17 11:21 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Kees Cook, linux-api, containers, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1650 bytes --]

On 2020-05-17, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> Or... And that's more invasive but ultimately cleaner we v2 the whole
> thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
> embedd the size argument in the structs. Userspace sets the size
> argument, we use get_user() to get the size first and then
> copy_struct_from_user() to handle it cleanly based on that. A similar
> model as with sched (has other unrelated quirks because they messed up
> something too):
> 
> static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
> {
> 	u32 size;
> 	int ret;
> 
> 	/* Zero the full structure, so that a short copy will be nice: */
> 	memset(attr, 0, sizeof(*attr));
> 
> 	ret = get_user(size, &uattr->size);
> 	if (ret)
> 		return ret;
> 
> 	/* ABI compatibility quirk: */
> 	if (!size)
> 		size = SCHED_ATTR_SIZE_VER0;
> 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
> 		goto err_size;
> 
> 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
> 	if (ret) {
> 		if (ret == -E2BIG)
> 			goto err_size;
> 		return ret;
> 	}
> 
> We're probably the biggest user of this right now and I'd be ok with
> that change. If it's a v2 than whatever. :)

I'm :+1: on a new version and switch to copy_struct_from_user(). I was a
little surprised when I found out that user_notif doesn't do it this
way a while ago (and although in theory it is userspace's fault, ideally
we could have an API that doesn't have built-in footguns).

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 11:21     ` Aleksa Sarai
@ 2020-05-17 14:23       ` Tycho Andersen
  2020-05-17 14:33         ` Christian Brauner
  0 siblings, 1 reply; 22+ messages in thread
From: Tycho Andersen @ 2020-05-17 14:23 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: Christian Brauner, Kees Cook, linux-api, containers, linux-kernel

On Sun, May 17, 2020 at 09:21:56PM +1000, Aleksa Sarai wrote:
> On 2020-05-17, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> > Or... And that's more invasive but ultimately cleaner we v2 the whole
> > thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
> > embedd the size argument in the structs. Userspace sets the size
> > argument, we use get_user() to get the size first and then
> > copy_struct_from_user() to handle it cleanly based on that. A similar
> > model as with sched (has other unrelated quirks because they messed up
> > something too):
> > 
> > static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
> > {
> > 	u32 size;
> > 	int ret;
> > 
> > 	/* Zero the full structure, so that a short copy will be nice: */
> > 	memset(attr, 0, sizeof(*attr));
> > 
> > 	ret = get_user(size, &uattr->size);
> > 	if (ret)
> > 		return ret;
> > 
> > 	/* ABI compatibility quirk: */
> > 	if (!size)
> > 		size = SCHED_ATTR_SIZE_VER0;
> > 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
> > 		goto err_size;
> > 
> > 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
> > 	if (ret) {
> > 		if (ret == -E2BIG)
> > 			goto err_size;
> > 		return ret;
> > 	}
> > 
> > We're probably the biggest user of this right now and I'd be ok with
> > that change. If it's a v2 than whatever. :)
> 
> I'm :+1: on a new version and switch to copy_struct_from_user(). I was a
> little surprised when I found out that user_notif doesn't do it this
> way a while ago (and although in theory it is userspace's fault, ideally
> we could have an API that doesn't have built-in footguns).

But I thought the whole point was that we couldn't do that, because
there's two things that can vary in length (struct seccomp_notif and
struct seccomp_data)?

https://lore.kernel.org/lkml/CAGXu5j+ZPxu6egE1fEr+N9+zLx3N+SJ_vbS_zzj9_hrdWrrrWQ@mail.gmail.com/

Tycho

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 14:23       ` Tycho Andersen
@ 2020-05-17 14:33         ` Christian Brauner
  2020-05-17 14:35           ` Christian Brauner
  2020-05-17 14:46           ` Tycho Andersen
  0 siblings, 2 replies; 22+ messages in thread
From: Christian Brauner @ 2020-05-17 14:33 UTC (permalink / raw)
  To: Tycho Andersen
  Cc: Aleksa Sarai, Kees Cook, linux-api, containers, linux-kernel

On Sun, May 17, 2020 at 08:23:16AM -0600, Tycho Andersen wrote:
> On Sun, May 17, 2020 at 09:21:56PM +1000, Aleksa Sarai wrote:
> > On 2020-05-17, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> > > Or... And that's more invasive but ultimately cleaner we v2 the whole
> > > thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
> > > embedd the size argument in the structs. Userspace sets the size
> > > argument, we use get_user() to get the size first and then
> > > copy_struct_from_user() to handle it cleanly based on that. A similar
> > > model as with sched (has other unrelated quirks because they messed up
> > > something too):
> > > 
> > > static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
> > > {
> > > 	u32 size;
> > > 	int ret;
> > > 
> > > 	/* Zero the full structure, so that a short copy will be nice: */
> > > 	memset(attr, 0, sizeof(*attr));
> > > 
> > > 	ret = get_user(size, &uattr->size);
> > > 	if (ret)
> > > 		return ret;
> > > 
> > > 	/* ABI compatibility quirk: */
> > > 	if (!size)
> > > 		size = SCHED_ATTR_SIZE_VER0;
> > > 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
> > > 		goto err_size;
> > > 
> > > 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
> > > 	if (ret) {
> > > 		if (ret == -E2BIG)
> > > 			goto err_size;
> > > 		return ret;
> > > 	}
> > > 
> > > We're probably the biggest user of this right now and I'd be ok with
> > > that change. If it's a v2 than whatever. :)
> > 
> > I'm :+1: on a new version and switch to copy_struct_from_user(). I was a
> > little surprised when I found out that user_notif doesn't do it this
> > way a while ago (and although in theory it is userspace's fault, ideally
> > we could have an API that doesn't have built-in footguns).
> 
> But I thought the whole point was that we couldn't do that, because
> there's two things that can vary in length (struct seccomp_notif and
> struct seccomp_data)?

I may have missed that discussion you linked.
But why wouldn't:

struct seccomp_notif2 {
	__u32 notif_size;
	__u64 id;
	__u32 pid;
	__u32 flags;
	struct seccomp_data data;
	__u32 data_size;
};

struct seccomp_notif_resp2 {
	__u32 notif_resp_size;
	__u64 id;
	__s64 val;
	__s32 error;
	__u32 flags;
};

work?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 14:33         ` Christian Brauner
@ 2020-05-17 14:35           ` Christian Brauner
  2020-05-17 14:46           ` Tycho Andersen
  1 sibling, 0 replies; 22+ messages in thread
From: Christian Brauner @ 2020-05-17 14:35 UTC (permalink / raw)
  To: Tycho Andersen
  Cc: Aleksa Sarai, Kees Cook, linux-api, containers, linux-kernel

On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> On Sun, May 17, 2020 at 08:23:16AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 09:21:56PM +1000, Aleksa Sarai wrote:
> > > On 2020-05-17, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> > > > Or... And that's more invasive but ultimately cleaner we v2 the whole
> > > > thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
> > > > embedd the size argument in the structs. Userspace sets the size
> > > > argument, we use get_user() to get the size first and then
> > > > copy_struct_from_user() to handle it cleanly based on that. A similar
> > > > model as with sched (has other unrelated quirks because they messed up
> > > > something too):
> > > > 
> > > > static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
> > > > {
> > > > 	u32 size;
> > > > 	int ret;
> > > > 
> > > > 	/* Zero the full structure, so that a short copy will be nice: */
> > > > 	memset(attr, 0, sizeof(*attr));
> > > > 
> > > > 	ret = get_user(size, &uattr->size);
> > > > 	if (ret)
> > > > 		return ret;
> > > > 
> > > > 	/* ABI compatibility quirk: */
> > > > 	if (!size)
> > > > 		size = SCHED_ATTR_SIZE_VER0;
> > > > 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
> > > > 		goto err_size;
> > > > 
> > > > 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
> > > > 	if (ret) {
> > > > 		if (ret == -E2BIG)
> > > > 			goto err_size;
> > > > 		return ret;
> > > > 	}
> > > > 
> > > > We're probably the biggest user of this right now and I'd be ok with
> > > > that change. If it's a v2 than whatever. :)
> > > 
> > > I'm :+1: on a new version and switch to copy_struct_from_user(). I was a
> > > little surprised when I found out that user_notif doesn't do it this
> > > way a while ago (and although in theory it is userspace's fault, ideally
> > > we could have an API that doesn't have built-in footguns).
> > 
> > But I thought the whole point was that we couldn't do that, because
> > there's two things that can vary in length (struct seccomp_notif and
> > struct seccomp_data)?
> 
> I may have missed that discussion you linked.
> But why wouldn't:
> 
> struct seccomp_notif2 {
> 	__u32 notif_size;
> 	__u64 id;
> 	__u32 pid;
> 	__u32 flags;
> 	struct seccomp_data data;
> 	__u32 data_size;
> };
> 
> struct seccomp_notif_resp2 {
> 	__u32 notif_resp_size;
> 	__u64 id;
> 	__s64 val;
> 	__s32 error;
> 	__u32 flags;
> };

(Ignore the missing 32 bits here.)

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 14:33         ` Christian Brauner
  2020-05-17 14:35           ` Christian Brauner
@ 2020-05-17 14:46           ` Tycho Andersen
  2020-05-17 15:02             ` Tycho Andersen
  1 sibling, 1 reply; 22+ messages in thread
From: Tycho Andersen @ 2020-05-17 14:46 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Aleksa Sarai, Kees Cook, linux-api, containers, linux-kernel

On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> On Sun, May 17, 2020 at 08:23:16AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 09:21:56PM +1000, Aleksa Sarai wrote:
> > > On 2020-05-17, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> > > > Or... And that's more invasive but ultimately cleaner we v2 the whole
> > > > thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
> > > > embedd the size argument in the structs. Userspace sets the size
> > > > argument, we use get_user() to get the size first and then
> > > > copy_struct_from_user() to handle it cleanly based on that. A similar
> > > > model as with sched (has other unrelated quirks because they messed up
> > > > something too):
> > > > 
> > > > static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
> > > > {
> > > > 	u32 size;
> > > > 	int ret;
> > > > 
> > > > 	/* Zero the full structure, so that a short copy will be nice: */
> > > > 	memset(attr, 0, sizeof(*attr));
> > > > 
> > > > 	ret = get_user(size, &uattr->size);
> > > > 	if (ret)
> > > > 		return ret;
> > > > 
> > > > 	/* ABI compatibility quirk: */
> > > > 	if (!size)
> > > > 		size = SCHED_ATTR_SIZE_VER0;
> > > > 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
> > > > 		goto err_size;
> > > > 
> > > > 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
> > > > 	if (ret) {
> > > > 		if (ret == -E2BIG)
> > > > 			goto err_size;
> > > > 		return ret;
> > > > 	}
> > > > 
> > > > We're probably the biggest user of this right now and I'd be ok with
> > > > that change. If it's a v2 than whatever. :)
> > > 
> > > I'm :+1: on a new version and switch to copy_struct_from_user(). I was a
> > > little surprised when I found out that user_notif doesn't do it this
> > > way a while ago (and although in theory it is userspace's fault, ideally
> > > we could have an API that doesn't have built-in footguns).
> > 
> > But I thought the whole point was that we couldn't do that, because
> > there's two things that can vary in length (struct seccomp_notif and
> > struct seccomp_data)?
> 
> I may have missed that discussion you linked.
> But why wouldn't:
> 
> struct seccomp_notif2 {
> 	__u32 notif_size;
> 	__u64 id;
> 	__u32 pid;
> 	__u32 flags;
> 	struct seccomp_data data;
> 	__u32 data_size;
> };

I guess you need to put data_size before data, otherwise old userspace
with a smaller struct seccomp_data will look in the wrong place.

But yes, that'll work if you put two sizes in, which is probably
reasonable since we're talking about two structs.

Tycho

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 14:46           ` Tycho Andersen
@ 2020-05-17 15:02             ` Tycho Andersen
  2020-05-17 21:30               ` Kees Cook
  2020-05-18 12:53               ` Christian Brauner
  0 siblings, 2 replies; 22+ messages in thread
From: Tycho Andersen @ 2020-05-17 15:02 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Aleksa Sarai, Kees Cook, linux-api, containers, linux-kernel

On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > struct seccomp_notif2 {
> > 	__u32 notif_size;
> > 	__u64 id;
> > 	__u32 pid;
> > 	__u32 flags;
> > 	struct seccomp_data data;
> > 	__u32 data_size;
> > };
> 
> I guess you need to put data_size before data, otherwise old userspace
> with a smaller struct seccomp_data will look in the wrong place.
> 
> But yes, that'll work if you put two sizes in, which is probably
> reasonable since we're talking about two structs.

Well, no, it doesn't either. Suppose we add a new field first to
struct seccomp_notif2:

struct seccomp_notif2 {
    __u32 notif_size;
    __u64 id;
    __u32 pid;
    __u32 flags;
    struct seccomp_data data;
    __u32 data_size;
    __u32 new_field;
};

And next we add a new field to struct secccomp_data. When a userspace
compiled with just the new seccomp_notif2 field does:

seccomp_notif2.new_field = ...;

the compiler will put it in the wrong place for the kernel with the
new seccomp_data field too.

Sort of feels like we should do:

struct seccomp_notif2 {
    struct seccomp_notif *notif;
    struct seccomp_data *data;
};

?

Tycho

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 15:02             ` Tycho Andersen
@ 2020-05-17 21:30               ` Kees Cook
  2020-05-18  8:32                 ` Sargun Dhillon
                                   ` (2 more replies)
  2020-05-18 12:53               ` Christian Brauner
  1 sibling, 3 replies; 22+ messages in thread
From: Kees Cook @ 2020-05-17 21:30 UTC (permalink / raw)
  To: Tycho Andersen
  Cc: Christian Brauner, Aleksa Sarai, linux-api, containers, linux-kernel

On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > > struct seccomp_notif2 {
> > > 	__u32 notif_size;
> > > 	__u64 id;
> > > 	__u32 pid;
> > > 	__u32 flags;
> > > 	struct seccomp_data data;
> > > 	__u32 data_size;
> > > };
> > 
> > I guess you need to put data_size before data, otherwise old userspace
> > with a smaller struct seccomp_data will look in the wrong place.
> > 
> > But yes, that'll work if you put two sizes in, which is probably
> > reasonable since we're talking about two structs.
> 
> Well, no, it doesn't either. Suppose we add a new field first to
> struct seccomp_notif2:
> 
> struct seccomp_notif2 {
>     __u32 notif_size;
>     __u64 id;
>     __u32 pid;
>     __u32 flags;
>     struct seccomp_data data;
>     __u32 data_size;
>     __u32 new_field;
> };
> 
> And next we add a new field to struct secccomp_data. When a userspace
> compiled with just the new seccomp_notif2 field does:
> 
> seccomp_notif2.new_field = ...;
> 
> the compiler will put it in the wrong place for the kernel with the
> new seccomp_data field too.
> 
> Sort of feels like we should do:
> 
> struct seccomp_notif2 {
>     struct seccomp_notif *notif;
>     struct seccomp_data *data;
> };

I'm going read this thread more carefully tomorrow, but I just wanted to
mention that I'd *like* to extend seccomp_data for doing deep argument
inspection of the new syscalls. I think it's the least bad of many
designs, and I'll write that up in more detail. (I would *really* like
to avoid extending seccomp's BPF language, and instead allow probing
into the struct copied from userspace, etc.)

Anyway, it's very related to this, so, yeah, probably we need a v2 of the
notif API, but I'll try to get all the ideas here collected in one place.

-- 
Kees Cook

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 21:30               ` Kees Cook
@ 2020-05-18  8:32                 ` Sargun Dhillon
  2020-05-18 12:45                   ` Christian Brauner
  2020-05-18 12:05                 ` Christian Brauner
  2020-05-18 21:10                 ` Kees Cook
  2 siblings, 1 reply; 22+ messages in thread
From: Sargun Dhillon @ 2020-05-18  8:32 UTC (permalink / raw)
  To: Kees Cook
  Cc: Tycho Andersen, linux-api, Christian Brauner, containers, linux-kernel

On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> 
> I'm going read this thread more carefully tomorrow, but I just wanted to
> mention that I'd *like* to extend seccomp_data for doing deep argument
> inspection of the new syscalls. I think it's the least bad of many
> designs, and I'll write that up in more detail. (I would *really* like
> to avoid extending seccomp's BPF language, and instead allow probing
> into the struct copied from userspace, etc.)
> 
> Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> notif API, but I'll try to get all the ideas here collected in one place.
I scratched together a proposal of what I think would make a not-terrible
V2 API. I'm sure there's bugs in this code, but I think it's workable --
or at least a place to start. The biggest thing I think we should consider
is unrolling seccomp_data if we don't intend to add new BPF-accessible
fields.

If also uses read(2), so we get to take advantage of read(2)'s ability
to pass a size along with the read, as opposed to doing ioctl tricks.
It also makes programming from against it slightly simpler. I can imagine
that the send API could be similar, in that it could support write, and
thus making it 100% usable from Go (and the like) without requiring
a separate OS-thread be spun up to interact with the listener.

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index c1735455bc53..db6e5335ec6a 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -113,6 +113,41 @@ struct seccomp_notif_resp {
 	__u32 flags;
 };
 
+#define SECCOMP_DATA_SIZE_VER0	64
+
+#define SECCOMP_NOTIF_CONFIG_SIZE_VER0	16
+
+/* Optional seccomp return fields */
+/* __u32 for triggering pid */
+#define SECCOMP_NOTIF_FIELD_PID		(1UL << 1)
+/* __u32 for triggering  thread group leader id */
+#define SECCOMP_NOTIF_FIELD_TGID	(1UL << 1)
+
+struct seccomp_notif_config {
+	__u32 size;
+	/*
+	 * The supported SECCOMP_DATA_SIZE to be returned. If the size passed in
+	 * is unsupported (seccomp_data is too small or if it is larger than the
+	 * currently supported size), EINVAL or E2BIG will be, respectively,
+	 * returned, and this will be set to the latest supported size.
+	 */
+	__u32 seccomp_data_size;
+	/*
+	 * This is an OR'd together list of SECCOMP_NOTIF_FIELD_*. If an
+	 * unsupported field is requested, ENOTSUPP will be returned.
+	 */
+	__u64 optional_fields;
+};
+
+/* read(2) Notification format example:
+ * struct seccomp_notif_read {
+ *	__u64 id;
+ *	__u32 pid; if SECCOMP_NOTIF_FIELD_PID
+ *	__u32 tgid; if SECCOMP_NOTIF_FIELD_TGID
+ *	struct seccomp_data data;
+ * };
+ */
+
 #define SECCOMP_IOC_MAGIC		'!'
 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
@@ -124,4 +159,10 @@ struct seccomp_notif_resp {
 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
 						struct seccomp_notif_resp)
 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
+/*
+ * Configure the data structure to be returned by read(2).
+ * Returns the size of the data structure which will be returned.
+ */
+#define SECCOMP_IOCTL_NOTIF_CONFIG	SECCOMP_IOR(3,	\
+						struct seccomp_notif_config)
 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 55a6184f5990..4670cb03c390 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -95,12 +95,19 @@ struct seccomp_knotif {
  * @next_id: The id of the next request.
  * @notifications: A list of struct seccomp_knotif elements.
  * @wqh: A wait queue for poll.
+ * @read_size: The size of the expected read. If it is set to 0, it means that
+ *             reading notifications via read(2) is not yet configured. Until
+ *             then, EINVAL will be returned via read(2).
+ * @data_size: The supported size of seccomp_data.
+ * @optional_fields: The optional fields to return on read
  */
 struct notification {
 	struct semaphore request;
 	u64 next_id;
 	struct list_head notifications;
 	wait_queue_head_t wqh;
+	u32 read_size;
+	u64 optional_fields;
 };
 
 /**
@@ -1021,79 +1028,160 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static long seccomp_notify_recv(struct seccomp_filter *filter,
-				void __user *buf)
+static void seccomp_reset_notif(struct seccomp_filter *filter,
+				u64 id)
 {
-	struct seccomp_knotif *knotif = NULL, *cur;
-	struct seccomp_notif unotif;
-	ssize_t ret;
-
-	/* Verify that we're not given garbage to keep struct extensible. */
-	ret = check_zeroed_user(buf, sizeof(unotif));
-	if (ret < 0)
-		return ret;
-	if (!ret)
-		return -EINVAL;
+	struct seccomp_knotif *cur;
 
-	memset(&unotif, 0, sizeof(unotif));
+	/*
+	 * Something went wrong. To make sure that we keep this
+	 * notification alive, let's reset it back to INIT. It
+	 * may have died when we released the lock, so we need to make
+	 * sure it's still around.
+	 */
+	mutex_lock(&filter->notify_lock);
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->id == id) {
+			cur->state = SECCOMP_NOTIFY_INIT;
+			up(&filter->notif->request);
+			break;
+		}
+	}
+	mutex_unlock(&filter->notify_lock);
+}
+/*
+ * Returns the next knotif available. If the return is a non-error, it will
+ * be with notify_lock held. It is up to the caller to unlock it.
+ */
+static struct seccomp_knotif *seccomp_get_notif(struct seccomp_filter *filter)
+{
+	struct seccomp_knotif *cur;
+	int ret;
 
 	ret = down_interruptible(&filter->notif->request);
 	if (ret < 0)
-		return ret;
+		return ERR_PTR(ret);
 
 	mutex_lock(&filter->notify_lock);
 	list_for_each_entry(cur, &filter->notif->notifications, list) {
 		if (cur->state == SECCOMP_NOTIFY_INIT) {
-			knotif = cur;
-			break;
+			cur->state = SECCOMP_NOTIFY_SENT;
+			wake_up_poll(&filter->notif->wqh,
+				     EPOLLOUT | EPOLLWRNORM);
+			return cur;
 		}
 	}
-
+	mutex_unlock(&filter->notify_lock);
 	/*
 	 * If we didn't find a notification, it could be that the task was
 	 * interrupted by a fatal signal between the time we were woken and
 	 * when we were able to acquire the rw lock.
 	 */
-	if (!knotif) {
-		ret = -ENOENT;
-		goto out;
-	}
+	return ERR_PTR(-ENOENT);
+}
+
+static long seccomp_notify_recv(struct seccomp_filter *filter,
+				void __user *buf)
+{
+	struct seccomp_knotif *knotif;
+	struct seccomp_notif unotif;
+	ssize_t ret;
+
+	/* Verify that we're not given garbage to keep struct extensible. */
+	ret = check_zeroed_user(buf, sizeof(unotif));
+	if (ret < 0)
+		return ret;
+	if (!ret)
+		return -EINVAL;
+
+	memset(&unotif, 0, sizeof(unotif));
+
+	knotif = seccomp_get_notif(filter);
+	if (IS_ERR(knotif))
+		return PTR_ERR(knotif);
 
 	unotif.id = knotif->id;
 	unotif.pid = task_pid_vnr(knotif->task);
 	unotif.data = *(knotif->data);
 
-	knotif->state = SECCOMP_NOTIFY_SENT;
-	wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM);
-	ret = 0;
-out:
 	mutex_unlock(&filter->notify_lock);
 
-	if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
-		ret = -EFAULT;
+	if (!copy_to_user(buf, &unotif, sizeof(unotif)))
+		return 0;
 
-		/*
-		 * Userspace screwed up. To make sure that we keep this
-		 * notification alive, let's reset it back to INIT. It
-		 * may have died when we released the lock, so we need to make
-		 * sure it's still around.
-		 */
-		knotif = NULL;
-		mutex_lock(&filter->notify_lock);
-		list_for_each_entry(cur, &filter->notif->notifications, list) {
-			if (cur->id == unotif.id) {
-				knotif = cur;
-				break;
-			}
-		}
+	seccomp_reset_notif(filter, unotif.id);
+	return -EFAULT;
+}
 
-		if (knotif) {
-			knotif->state = SECCOMP_NOTIFY_INIT;
-			up(&filter->notif->request);
-		}
-		mutex_unlock(&filter->notify_lock);
+/* Append the src value to the dst, and return the new cursor */
+#define append(dst, src)	({		\
+	typeof(src) _src = src;			\
+	memcpy(dst, &_src, sizeof(_src));	\
+	dst + sizeof(_src);			\
+})
+
+/*
+ * Append the value pointer to by the pointer, src, to the dst
+ * and return the new cursor
+ */
+#define append_ptr(dst, src)	({		\
+	typeof(src) _src = src;			\
+	memcpy(dst, _src, sizeof(*_src));	\
+	dst + sizeof(*_src);			\
+})
+
+static ssize_t seccomp_notify_read(struct file *file, char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct seccomp_filter *filter = file->private_data;
+	u32 optional_fields, read_size;
+	struct seccomp_knotif *knotif;
+	void *kbuf, *cursor;
+	int ret;
+	u64 id;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret)
+		return ret;
+	read_size = filter->notif->read_size;
+	optional_fields = filter->notif->optional_fields;
+	mutex_unlock(&filter->notify_lock);
+
+	if (read_size == 0)
+		return -EINVAL;
+	if (count < read_size)
+		return -ENOSPC;
+
+	knotif = seccomp_get_notif(filter);
+	if (IS_ERR(knotif))
+		return PTR_ERR(knotif);
+
+	id = knotif->id;
+	kbuf = kzalloc(read_size, GFP_KERNEL);
+	if (!kbuf) {
+		ret = -ENOMEM;
+		goto out;
 	}
 
+	cursor = kbuf;
+	cursor = append(cursor, knotif->id);
+	if (filter->notif->optional_fields & SECCOMP_NOTIF_FIELD_PID)
+		cursor = append(cursor, task_pid_vnr(knotif->task));
+	if (filter->notif->optional_fields & SECCOMP_NOTIF_FIELD_TGID)
+		cursor = append(cursor, task_tgid_vnr(knotif->task));
+	cursor = append_ptr(cursor, knotif->data);
+	WARN_ON_ONCE(cursor != kbuf + read_size);
+
+	ret = copy_to_user(buf, kbuf, read_size);
+	if (!ret)
+		ret = read_size;
+
+	kfree(kbuf);
+out:
+	mutex_unlock(&filter->notify_lock);
+	if (ret < 0)
+		seccomp_reset_notif(filter, id);
+
 	return ret;
 }
 
@@ -1175,6 +1263,70 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
 	return ret;
 }
 
+static long seccomp_notif_config(struct seccomp_filter *filter,
+				 struct seccomp_notif_config __user *uconfig)
+{
+	struct seccomp_notif_config config;
+	/* ret_size is a minimum of 8, given id */
+	int ret, notification_size = 8;
+	u32 size;
+
+
+	ret = get_user(size, &uconfig->size);
+	if (ret)
+		return ret;
+	ret = copy_struct_from_user(&config, sizeof(config), uconfig, size);
+	if (ret)
+		return ret;
+
+	/*
+	 * This needs to be bumped every time we change seccomp_data's size
+	 */
+	BUILD_BUG_ON(sizeof(struct seccomp_data) != SECCOMP_DATA_SIZE_VER0);
+	if (config.seccomp_data_size < SECCOMP_DATA_SIZE_VER0) {
+		ret = -EINVAL;
+		goto invalid_seccomp_data_size;
+	}
+	if (config.seccomp_data_size > SECCOMP_DATA_SIZE_VER0) {
+		ret = -E2BIG;
+		goto invalid_seccomp_data_size;
+	}
+	notification_size += config.seccomp_data_size;
+
+	if (config.optional_fields & ~(SECCOMP_NOTIF_FIELD_PID|SECCOMP_NOTIF_FIELD_TGID))
+		return -ENOTSUPP;
+
+	if (config.optional_fields & SECCOMP_NOTIF_FIELD_PID)
+		notification_size += 4;
+	if (config.optional_fields & SECCOMP_NOTIF_FIELD_TGID)
+		notification_size += 4;
+
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+
+	if (filter->notif->read_size) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = notification_size;
+	filter->notif->read_size = notification_size;
+	filter->notif->optional_fields = config.optional_fields;
+
+out:
+	mutex_unlock(&filter->notify_lock);
+
+	return ret;
+
+invalid_seccomp_data_size:
+	if (put_user(SECCOMP_DATA_SIZE_VER0, &uconfig->seccomp_data_size))
+		return -EFAULT;
+
+	return ret;
+}
+
 static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 				 unsigned long arg)
 {
@@ -1188,6 +1340,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 		return seccomp_notify_send(filter, buf);
 	case SECCOMP_IOCTL_NOTIF_ID_VALID:
 		return seccomp_notify_id_valid(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_CONFIG:
+		return seccomp_notif_config(filter, buf);
 	default:
 		return -EINVAL;
 	}
@@ -1224,6 +1378,7 @@ static const struct file_operations seccomp_notify_ops = {
 	.release = seccomp_notify_release,
 	.unlocked_ioctl = seccomp_notify_ioctl,
 	.compat_ioctl = seccomp_notify_ioctl,
+	.read = seccomp_notify_read,
 };
 
 static struct file *init_listener(struct seccomp_filter *filter)


> 
> -- 
> Kees Cook
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/containers

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 21:30               ` Kees Cook
  2020-05-18  8:32                 ` Sargun Dhillon
@ 2020-05-18 12:05                 ` Christian Brauner
  2020-05-18 21:10                 ` Kees Cook
  2 siblings, 0 replies; 22+ messages in thread
From: Christian Brauner @ 2020-05-18 12:05 UTC (permalink / raw)
  To: Kees Cook
  Cc: Tycho Andersen, Aleksa Sarai, linux-api, containers, linux-kernel

On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> > > On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > > > struct seccomp_notif2 {
> > > > 	__u32 notif_size;
> > > > 	__u64 id;
> > > > 	__u32 pid;
> > > > 	__u32 flags;
> > > > 	struct seccomp_data data;
> > > > 	__u32 data_size;
> > > > };
> > > 
> > > I guess you need to put data_size before data, otherwise old userspace
> > > with a smaller struct seccomp_data will look in the wrong place.
> > > 
> > > But yes, that'll work if you put two sizes in, which is probably
> > > reasonable since we're talking about two structs.
> > 
> > Well, no, it doesn't either. Suppose we add a new field first to
> > struct seccomp_notif2:
> > 
> > struct seccomp_notif2 {
> >     __u32 notif_size;
> >     __u64 id;
> >     __u32 pid;
> >     __u32 flags;
> >     struct seccomp_data data;
> >     __u32 data_size;
> >     __u32 new_field;
> > };
> > 
> > And next we add a new field to struct secccomp_data. When a userspace
> > compiled with just the new seccomp_notif2 field does:
> > 
> > seccomp_notif2.new_field = ...;
> > 
> > the compiler will put it in the wrong place for the kernel with the
> > new seccomp_data field too.
> > 
> > Sort of feels like we should do:
> > 
> > struct seccomp_notif2 {
> >     struct seccomp_notif *notif;
> >     struct seccomp_data *data;
> > };
> 
> I'm going read this thread more carefully tomorrow, but I just wanted to
> mention that I'd *like* to extend seccomp_data for doing deep argument
> inspection of the new syscalls. I think it's the least bad of many
> designs, and I'll write that up in more detail. (I would *really* like
> to avoid extending seccomp's BPF language, and instead allow probing
> into the struct copied from userspace, etc.)

It's great to hear that you're on this. I haven't had time to work on
this since our kernel summit session. :/
And so far, I really like what I hear. I had the same general thought
that not extending seccomp's bpf is what we want. And to stress this
again before the mails come flooding in: we really need this in seccomp
itself not in any current or future LSM. :)

> 
> Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> notif API, but I'll try to get all the ideas here collected in one place.

Cool, I was kinda worried that people would think that's a crazy idea
but I really think we're better off with a redesign. And I think that's
totally ok and cleaner than hacking around it.

Christian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-18  8:32                 ` Sargun Dhillon
@ 2020-05-18 12:45                   ` Christian Brauner
  2020-05-18 13:23                     ` Tycho Andersen
  0 siblings, 1 reply; 22+ messages in thread
From: Christian Brauner @ 2020-05-18 12:45 UTC (permalink / raw)
  To: Sargun Dhillon
  Cc: Kees Cook, Tycho Andersen, linux-api, containers, linux-kernel

On Mon, May 18, 2020 at 08:32:25AM +0000, Sargun Dhillon wrote:
> On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> > On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > 
> > I'm going read this thread more carefully tomorrow, but I just wanted to
> > mention that I'd *like* to extend seccomp_data for doing deep argument
> > inspection of the new syscalls. I think it's the least bad of many
> > designs, and I'll write that up in more detail. (I would *really* like
> > to avoid extending seccomp's BPF language, and instead allow probing
> > into the struct copied from userspace, etc.)
> > 
> > Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> > notif API, but I'll try to get all the ideas here collected in one place.
> I scratched together a proposal of what I think would make a not-terrible
> V2 API. I'm sure there's bugs in this code, but I think it's workable --
> or at least a place to start. The biggest thing I think we should consider
> is unrolling seccomp_data if we don't intend to add new BPF-accessible
> fields.
> 
> If also uses read(2), so we get to take advantage of read(2)'s ability
> to pass a size along with the read, as opposed to doing ioctl tricks.
> It also makes programming from against it slightly simpler. I can imagine
> that the send API could be similar, in that it could support write, and
> thus making it 100% usable from Go (and the like) without requiring
> a separate OS-thread be spun up to interact with the listener.

I don't have strong feelings about using read() and write() here but I
think that Jann had reservations and that's why we didn't do it in the
first version. But his reservations were specifically tied to fd passing
which we never implemented:
http://lkml.iu.edu/hypermail/linux/kernel/1806.2/05995.html

But still, worth considering.

Christian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 15:02             ` Tycho Andersen
  2020-05-17 21:30               ` Kees Cook
@ 2020-05-18 12:53               ` Christian Brauner
  2020-05-18 13:20                 ` Tycho Andersen
  2020-05-18 21:37                 ` Sargun Dhillon
  1 sibling, 2 replies; 22+ messages in thread
From: Christian Brauner @ 2020-05-18 12:53 UTC (permalink / raw)
  To: Tycho Andersen
  Cc: Aleksa Sarai, Kees Cook, linux-api, containers, linux-kernel

On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > > struct seccomp_notif2 {
> > > 	__u32 notif_size;
> > > 	__u64 id;
> > > 	__u32 pid;
> > > 	__u32 flags;
> > > 	struct seccomp_data data;
> > > 	__u32 data_size;
> > > };
> > 
> > I guess you need to put data_size before data, otherwise old userspace
> > with a smaller struct seccomp_data will look in the wrong place.
> > 
> > But yes, that'll work if you put two sizes in, which is probably
> > reasonable since we're talking about two structs.
> 
> Well, no, it doesn't either. Suppose we add a new field first to
> struct seccomp_notif2:
> 
> struct seccomp_notif2 {
>     __u32 notif_size;
>     __u64 id;
>     __u32 pid;
>     __u32 flags;
>     struct seccomp_data data;
>     __u32 data_size;
>     __u32 new_field;
> };
> 
> And next we add a new field to struct secccomp_data. When a userspace
> compiled with just the new seccomp_notif2 field does:
> 
> seccomp_notif2.new_field = ...;
> 
> the compiler will put it in the wrong place for the kernel with the
> new seccomp_data field too.
> 
> Sort of feels like we should do:
> 
> struct seccomp_notif2 {
>     struct seccomp_notif *notif;
>     struct seccomp_data *data;
> };
> 
> ?

Oh yes of course, sorry that was my stupid typo. I meant:

struct seccomp_notif2 {
    __u32 notif_size;
    __u64 id;
    __u32 pid;
    __u32 flags;
    struct seccomp_data *data;
    __u32 data_size;
    __u32 new_field;
}

at which point things should just work imho. This is similar to how the
set_tid array works. The kernel doesn't need to allocate any more too.
The kernel can just always use the currently know seccomp_data size.
If the kernel supports _less_ than what the caller expects, it can
report the supported size in data_size to userspace returning EINVAL. If
it supports more then it can just copy the known fields, I guess.

This way we don't need to add yet another ioctl...

Christian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-18 12:53               ` Christian Brauner
@ 2020-05-18 13:20                 ` Tycho Andersen
  2020-05-18 21:37                 ` Sargun Dhillon
  1 sibling, 0 replies; 22+ messages in thread
From: Tycho Andersen @ 2020-05-18 13:20 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Aleksa Sarai, Kees Cook, linux-api, containers, linux-kernel

On Mon, May 18, 2020 at 02:53:25PM +0200, Christian Brauner wrote:
> On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> > > On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > > > struct seccomp_notif2 {
> > > > 	__u32 notif_size;
> > > > 	__u64 id;
> > > > 	__u32 pid;
> > > > 	__u32 flags;
> > > > 	struct seccomp_data data;
> > > > 	__u32 data_size;
> > > > };
> > > 
> > > I guess you need to put data_size before data, otherwise old userspace
> > > with a smaller struct seccomp_data will look in the wrong place.
> > > 
> > > But yes, that'll work if you put two sizes in, which is probably
> > > reasonable since we're talking about two structs.
> > 
> > Well, no, it doesn't either. Suppose we add a new field first to
> > struct seccomp_notif2:
> > 
> > struct seccomp_notif2 {
> >     __u32 notif_size;
> >     __u64 id;
> >     __u32 pid;
> >     __u32 flags;
> >     struct seccomp_data data;
> >     __u32 data_size;
> >     __u32 new_field;
> > };
> > 
> > And next we add a new field to struct secccomp_data. When a userspace
> > compiled with just the new seccomp_notif2 field does:
> > 
> > seccomp_notif2.new_field = ...;
> > 
> > the compiler will put it in the wrong place for the kernel with the
> > new seccomp_data field too.
> > 
> > Sort of feels like we should do:
> > 
> > struct seccomp_notif2 {
> >     struct seccomp_notif *notif;
> >     struct seccomp_data *data;
> > };
> > 
> > ?
> 
> Oh yes of course, sorry that was my stupid typo. I meant:
> 
> struct seccomp_notif2 {
>     __u32 notif_size;
>     __u64 id;
>     __u32 pid;
>     __u32 flags;
>     struct seccomp_data *data;
>     __u32 data_size;
>     __u32 new_field;
> }
> 
> at which point things should just work imho.

Are you saying that data_size is an input? Because I don't think they
Just Work otherwise.

Tycho

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-18 12:45                   ` Christian Brauner
@ 2020-05-18 13:23                     ` Tycho Andersen
  2020-05-18 14:00                       ` Christian Brauner
  0 siblings, 1 reply; 22+ messages in thread
From: Tycho Andersen @ 2020-05-18 13:23 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Sargun Dhillon, Kees Cook, linux-api, containers, linux-kernel

On Mon, May 18, 2020 at 02:45:00PM +0200, Christian Brauner wrote:
> On Mon, May 18, 2020 at 08:32:25AM +0000, Sargun Dhillon wrote:
> > On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> > > On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > > 
> > > I'm going read this thread more carefully tomorrow, but I just wanted to
> > > mention that I'd *like* to extend seccomp_data for doing deep argument
> > > inspection of the new syscalls. I think it's the least bad of many
> > > designs, and I'll write that up in more detail. (I would *really* like
> > > to avoid extending seccomp's BPF language, and instead allow probing
> > > into the struct copied from userspace, etc.)
> > > 
> > > Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> > > notif API, but I'll try to get all the ideas here collected in one place.
> > I scratched together a proposal of what I think would make a not-terrible
> > V2 API. I'm sure there's bugs in this code, but I think it's workable --
> > or at least a place to start. The biggest thing I think we should consider
> > is unrolling seccomp_data if we don't intend to add new BPF-accessible
> > fields.
> > 
> > If also uses read(2), so we get to take advantage of read(2)'s ability
> > to pass a size along with the read, as opposed to doing ioctl tricks.
> > It also makes programming from against it slightly simpler. I can imagine
> > that the send API could be similar, in that it could support write, and
> > thus making it 100% usable from Go (and the like) without requiring
> > a separate OS-thread be spun up to interact with the listener.
> 
> I don't have strong feelings about using read() and write() here but I
> think that Jann had reservations and that's why we didn't do it in the
> first version. But his reservations were specifically tied to fd passing
> which we never implemented:
> http://lkml.iu.edu/hypermail/linux/kernel/1806.2/05995.html
> 
> But still, worth considering.

There was a thread about this same time for some other API (I can't
find it now, but I can dig if you want) that suggests that "read() is
for data" and we shouldn't use it for control in APIs.

Tycho

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-18 13:23                     ` Tycho Andersen
@ 2020-05-18 14:00                       ` Christian Brauner
  0 siblings, 0 replies; 22+ messages in thread
From: Christian Brauner @ 2020-05-18 14:00 UTC (permalink / raw)
  To: Tycho Andersen; +Cc: linux-api, containers, Kees Cook, linux-kernel

On Mon, May 18, 2020 at 07:23:55AM -0600, Tycho Andersen wrote:
> On Mon, May 18, 2020 at 02:45:00PM +0200, Christian Brauner wrote:
> > On Mon, May 18, 2020 at 08:32:25AM +0000, Sargun Dhillon wrote:
> > > On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> > > > On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > > > 
> > > > I'm going read this thread more carefully tomorrow, but I just wanted to
> > > > mention that I'd *like* to extend seccomp_data for doing deep argument
> > > > inspection of the new syscalls. I think it's the least bad of many
> > > > designs, and I'll write that up in more detail. (I would *really* like
> > > > to avoid extending seccomp's BPF language, and instead allow probing
> > > > into the struct copied from userspace, etc.)
> > > > 
> > > > Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> > > > notif API, but I'll try to get all the ideas here collected in one place.
> > > I scratched together a proposal of what I think would make a not-terrible
> > > V2 API. I'm sure there's bugs in this code, but I think it's workable --
> > > or at least a place to start. The biggest thing I think we should consider
> > > is unrolling seccomp_data if we don't intend to add new BPF-accessible
> > > fields.
> > > 
> > > If also uses read(2), so we get to take advantage of read(2)'s ability
> > > to pass a size along with the read, as opposed to doing ioctl tricks.
> > > It also makes programming from against it slightly simpler. I can imagine
> > > that the send API could be similar, in that it could support write, and
> > > thus making it 100% usable from Go (and the like) without requiring
> > > a separate OS-thread be spun up to interact with the listener.
> > 
> > I don't have strong feelings about using read() and write() here but I
> > think that Jann had reservations and that's why we didn't do it in the
> > first version. But his reservations were specifically tied to fd passing
> > which we never implemented:
> > http://lkml.iu.edu/hypermail/linux/kernel/1806.2/05995.html
> > 
> > But still, worth considering.
> 
> There was a thread about this same time for some other API (I can't
> find it now, but I can dig if you want) that suggests that "read() is
> for data" and we shouldn't use it for control in APIs.

Oh that sounds useful. Though I think you can wait with digging it out
until someone insists on using read(). :)

Christian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-17 21:30               ` Kees Cook
  2020-05-18  8:32                 ` Sargun Dhillon
  2020-05-18 12:05                 ` Christian Brauner
@ 2020-05-18 21:10                 ` Kees Cook
  2 siblings, 0 replies; 22+ messages in thread
From: Kees Cook @ 2020-05-18 21:10 UTC (permalink / raw)
  To: Tycho Andersen
  Cc: Christian Brauner, Aleksa Sarai, linux-api, containers, linux-kernel

On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> notif API, but I'll try to get all the ideas here collected in one place.

For future thread archæologists, I put my thoughts here:
https://lore.kernel.org/lkml/202005181120.971232B7B@keescook/

-- 
Kees Cook

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-18 12:53               ` Christian Brauner
  2020-05-18 13:20                 ` Tycho Andersen
@ 2020-05-18 21:37                 ` Sargun Dhillon
  1 sibling, 0 replies; 22+ messages in thread
From: Sargun Dhillon @ 2020-05-18 21:37 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Tycho Andersen, Aleksa Sarai, Kees Cook, linux-api, containers,
	linux-kernel

On Mon, May 18, 2020 at 02:53:25PM +0200, Christian Brauner wrote:
> On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> > > On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > > > struct seccomp_notif2 {
> > > > 	__u32 notif_size;
> > > > 	__u64 id;
> > > > 	__u32 pid;
> > > > 	__u32 flags;
> > > > 	struct seccomp_data data;
> > > > 	__u32 data_size;
> > > > };
> > > 
> > > I guess you need to put data_size before data, otherwise old userspace
> > > with a smaller struct seccomp_data will look in the wrong place.
> > > 
> > > But yes, that'll work if you put two sizes in, which is probably
> > > reasonable since we're talking about two structs.
> > 
> > Well, no, it doesn't either. Suppose we add a new field first to
> > struct seccomp_notif2:
> > 
> > struct seccomp_notif2 {
> >     __u32 notif_size;
> >     __u64 id;
> >     __u32 pid;
> >     __u32 flags;
> >     struct seccomp_data data;
> >     __u32 data_size;
> >     __u32 new_field;
> > };
> > 
> > And next we add a new field to struct secccomp_data. When a userspace
> > compiled with just the new seccomp_notif2 field does:
> > 
> > seccomp_notif2.new_field = ...;
> > 
> > the compiler will put it in the wrong place for the kernel with the
> > new seccomp_data field too.
> > 
> > Sort of feels like we should do:
> > 
> > struct seccomp_notif2 {
> >     struct seccomp_notif *notif;
> >     struct seccomp_data *data;
> > };
> > 
> > ?
> 
> Oh yes of course, sorry that was my stupid typo. I meant:
> 
> struct seccomp_notif2 {
>     __u32 notif_size;
>     __u64 id;
>     __u32 pid;
>     __u32 flags;
>     struct seccomp_data *data;
>     __u32 data_size;
>     __u32 new_field;
> }
One big difference in the approach I described is that the user gets to ask
for specific fields, and can configure the listener upfront, versus having to
do the dance of fetching the sizes, and dynamically allocating memory.

This way userspace can just do on-stack static allocations. We can get
rid of the kbuf bits in my PR, if we incrementally fill up the userspace buffer
(copy_to_user shouldn't be *that* costly).

In addition, we're not copying a bunch of unnecessary data, or calculating
values that the user may not be interested in. This is particularly valuable
if we ever want to do things like passing optional fields (think PIDs) back.

Code-wise, it looks something like:
struct read_output_format {
	__u64 id;
	__u32 tgid;
	struct seccomp_data data;
} __packed;

TEST(user_notification_read)
{
	long ret;
	int status, pid, listener, read_size;
	struct seccomp_notif_config config = {};
	struct seccomp_notif_resp resp = {};
	struct read_output_format buf;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	EXPECT_EQ(read(listener, &buf, sizeof(buf)), -1) {
		EXPECT_EQ(errno, -EINVAL);
	}

	config.size = sizeof(config);
	config.seccomp_data_size = sizeof(struct seccomp_data);
	config.optional_fields = ~(0);
	/* Make sure invalid fields are not accepted */
	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_CONFIG, &config), -1);

	config.optional_fields = SECCOMP_NOTIF_FIELD_TGID;
	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_CONFIG, &config), sizeof(buf));

	pid = fork();
	ASSERT_GE(pid, 0);
	if (pid == 0)
		exit(syscall(__NR_dup, 42, 1, 1, 1) != USER_NOTIF_MAGIC);


	/* Passing a smaller value in should fail */
	EXPECT_EQ(read(listener, &buf, read_size - 1), -1) {
		EXPECT_EQ(errno, -E2BIG);
	}
	/* Passing a larger value in should succeed */
	ASSERT_EQ(read(listener, &buf, 200), sizeof(buf));
	EXPECT_EQ(buf.tgid, pid);
	EXPECT_EQ(buf.data.args[0], 42);
	EXPECT_EQ(buf.data.nr, __NR_dup);

	resp.id = buf.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}

> 
> at which point things should just work imho. This is similar to how the
> set_tid array works. The kernel doesn't need to allocate any more too.
> The kernel can just always use the currently know seccomp_data size.
> If the kernel supports _less_ than what the caller expects, it can
> report the supported size in data_size to userspace returning EINVAL. If
> it supports more then it can just copy the known fields, I guess.
> 
> This way we don't need to add yet another ioctl...
> 
> Christian

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-15 23:40 [PATCH] seccomp: Add group_leader pid to seccomp_notif Sargun Dhillon
  2020-05-17  7:17 ` Kees Cook
@ 2020-05-18 23:08 ` Eric W. Biederman
  2020-05-22 17:54   ` Sargun Dhillon
  1 sibling, 1 reply; 22+ messages in thread
From: Eric W. Biederman @ 2020-05-18 23:08 UTC (permalink / raw)
  To: Sargun Dhillon
  Cc: linux-kernel, containers, linux-api, christian.brauner, tycho,
	keescook, cyphar

Sargun Dhillon <sargun@sargun.me> writes:

> This includes the thread group leader ID in the seccomp_notif. This is
> immediately useful for opening up a pidfd for the group leader, as
> pidfds only work on group leaders.

The code looks fine (except for the name of the test), but can you
please talk and think about this as something other than the
group leader?

The initial thread in a thread group can die, and the tgid is still
valid for the entire group.  Because the initial thread of a
process/thread group can die (but rarely does) that tends to result in
kernel code that fails when thread_group_leader dies.

To remove that class of bugs I am slowy working to remove the
thread_group_leader from the kernel entirely.

Looking at the names of the fields in the structure it looks like
there is another class of bugs to be removed by renaming PIDTYPE_PID
to PIDTYPE_TID in the kernel as well.  Just skimming the example code
it looks very simple to get confused.

Is there any chance some can modify struct seccomp_notify to do
{
	...
        union {
		__u32 pid;
                __u32 tid;
	};
        ...
}

Just to reduce the chance of confusion between the userspace pid and the
in kernel pid names?

Eric

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] seccomp: Add group_leader pid to seccomp_notif
  2020-05-18 23:08 ` Eric W. Biederman
@ 2020-05-22 17:54   ` Sargun Dhillon
  0 siblings, 0 replies; 22+ messages in thread
From: Sargun Dhillon @ 2020-05-22 17:54 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: LKML, Linux Containers, Linux API, Christian Brauner,
	Tycho Andersen, Kees Cook, Aleksa Sarai

On Mon, May 18, 2020 at 4:11 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Sargun Dhillon <sargun@sargun.me> writes:
>
> > This includes the thread group leader ID in the seccomp_notif. This is
> > immediately useful for opening up a pidfd for the group leader, as
> > pidfds only work on group leaders.
>
> The code looks fine (except for the name of the test), but can you
> please talk and think about this as something other than the
> group leader?
>
> The initial thread in a thread group can die, and the tgid is still
> valid for the entire group.  Because the initial thread of a
> process/thread group can die (but rarely does) that tends to result in
> kernel code that fails when thread_group_leader dies.
>
> To remove that class of bugs I am slowy working to remove the
> thread_group_leader from the kernel entirely.
>
> Looking at the names of the fields in the structure it looks like
> there is another class of bugs to be removed by renaming PIDTYPE_PID
> to PIDTYPE_TID in the kernel as well.  Just skimming the example code
> it looks very simple to get confused.
>
> Is there any chance some can modify struct seccomp_notify to do
> {
>         ...
>         union {
>                 __u32 pid;
>                 __u32 tid;
>         };
>         ...
> }
>
> Just to reduce the chance of confusion between the userspace pid and the
> in kernel pid names?
>
> Eric
Our use cases would be unaffected by this. I think this would be a wonderful
way to move forward, but I don't know if it could break userspace.

I believe Christian's team is the biggest user of this feature in OSS right now,
so he might know.

In addition, I'm not sure where you would want the thread's ID versus the
process's ID, unless you wanted to do something like SIGSTOP, and freeze
the thread to prevent it from making more progress, or being interrupted
while you go do notifier work.

Christian & Kees,
Thoughts?

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2020-05-22 17:55 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-15 23:40 [PATCH] seccomp: Add group_leader pid to seccomp_notif Sargun Dhillon
2020-05-17  7:17 ` Kees Cook
2020-05-17 10:47   ` Christian Brauner
2020-05-17 11:21     ` Aleksa Sarai
2020-05-17 14:23       ` Tycho Andersen
2020-05-17 14:33         ` Christian Brauner
2020-05-17 14:35           ` Christian Brauner
2020-05-17 14:46           ` Tycho Andersen
2020-05-17 15:02             ` Tycho Andersen
2020-05-17 21:30               ` Kees Cook
2020-05-18  8:32                 ` Sargun Dhillon
2020-05-18 12:45                   ` Christian Brauner
2020-05-18 13:23                     ` Tycho Andersen
2020-05-18 14:00                       ` Christian Brauner
2020-05-18 12:05                 ` Christian Brauner
2020-05-18 21:10                 ` Kees Cook
2020-05-18 12:53               ` Christian Brauner
2020-05-18 13:20                 ` Tycho Andersen
2020-05-18 21:37                 ` Sargun Dhillon
2020-05-17 11:17   ` Sargun Dhillon
2020-05-18 23:08 ` Eric W. Biederman
2020-05-22 17:54   ` Sargun Dhillon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.