linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone
@ 2020-11-02 20:37 Jann Horn
  2020-11-02 20:37 ` [PATCH 2/3] samples: seccomp: simplify user-trap sample Jann Horn
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Jann Horn @ 2020-11-02 20:37 UTC (permalink / raw)
  To: Kees Cook
  Cc: Andy Lutomirski, Will Drewry, linux-kernel, Tycho Andersen,
	Christian Brauner, Michael Kerrisk (man-pages),
	Linux Containers, Sargun Dhillon, Giuseppe Scrivano, Paul Moore

At the moment, the seccomp notifier API is hard to use without combining
it with APIs like poll() or epoll(); if all target processes have gone
away, the polling APIs will raise an error indication on the file
descriptor, but SECCOMP_IOCTL_NOTIF_RECV will keep blocking indefinitely.

This usability problem was discovered when Michael Kerrisk wrote a
manpage for this API.

To fix it, get rid of the semaphore logic and let SECCOMP_IOCTL_NOTIF_RECV
behave as follows:

If O_NONBLOCK is set, SECCOMP_IOCTL_NOTIF_RECV always returns
immediately, no matter whether a notification is available or not.

If O_NONBLOCK is unset, SECCOMP_IOCTL_NOTIF_RECV blocks until either a
notification is delivered to userspace or all users of the filter have
gone away.

To avoid subtle breakage from eventloop-style code that doesn't set
O_NONBLOCK, set O_NONBLOCK by default - userspace can clear it if it
wants blocking behavior, and if blocking-style code forgets to do so,
that will be much more obvious than the breakage we'd get the other way
around.
This also means that UAPI breakage from this change should be limited to
blocking users of the API, of which, to my knowledge, there are none so far
(outside of in-tree sample and selftest code, which this patch adjusts - in
particular the code in samples/ has to change a bunch).

This should be backported because future userspace code might otherwise not
work properly on old kernels.

Cc: stable@vger.kernel.org
Reported-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Jann Horn <jannh@google.com>
---
 kernel/seccomp.c                              | 62 +++++++++++++------
 samples/seccomp/user-trap.c                   | 16 +++++
 tools/testing/selftests/seccomp/seccomp_bpf.c | 21 +++++++
 3 files changed, 79 insertions(+), 20 deletions(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 8ad7a293255a..b3730740515f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -43,6 +43,7 @@
 #include <linux/uaccess.h>
 #include <linux/anon_inodes.h>
 #include <linux/lockdep.h>
+#include <linux/freezer.h>
 
 /*
  * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
@@ -138,7 +139,6 @@ struct seccomp_kaddfd {
  * @notifications: A list of struct seccomp_knotif elements.
  */
 struct notification {
-	struct semaphore request;
 	u64 next_id;
 	struct list_head notifications;
 };
@@ -863,7 +863,6 @@ static int seccomp_do_user_notification(int this_syscall,
 	list_add(&n.list, &match->notif->notifications);
 	INIT_LIST_HEAD(&n.addfd);
 
-	up(&match->notif->request);
 	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
 	mutex_unlock(&match->notify_lock);
 
@@ -1179,9 +1178,10 @@ find_notification(struct seccomp_filter *filter, u64 id)
 
 
 static long seccomp_notify_recv(struct seccomp_filter *filter,
-				void __user *buf)
+				void __user *buf, bool blocking)
 {
 	struct seccomp_knotif *knotif = NULL, *cur;
+	DEFINE_WAIT(wait);
 	struct seccomp_notif unotif;
 	ssize_t ret;
 
@@ -1194,11 +1194,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
 
 	memset(&unotif, 0, sizeof(unotif));
 
-	ret = down_interruptible(&filter->notif->request);
-	if (ret < 0)
-		return ret;
-
 	mutex_lock(&filter->notify_lock);
+
+retry:
 	list_for_each_entry(cur, &filter->notif->notifications, list) {
 		if (cur->state == SECCOMP_NOTIFY_INIT) {
 			knotif = cur;
@@ -1206,14 +1204,40 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
 		}
 	}
 
-	/*
-	 * If we didn't find a notification, it could be that the task was
-	 * interrupted by a fatal signal between the time we were woken and
-	 * when we were able to acquire the rw lock.
-	 */
 	if (!knotif) {
-		ret = -ENOENT;
-		goto out;
+		if (!blocking) {
+			if (refcount_read(&filter->users) == 0)
+				ret = -ENOTCONN;
+			else
+				ret = -ENOENT;
+			goto out;
+		}
+
+		/* This has to happen before checking &filter->users. */
+		prepare_to_wait(&filter->wqh, &wait, TASK_INTERRUPTIBLE);
+
+		/*
+		 * If all users of the filter are gone, throw an error
+		 * instead of pointlessly continuing to block.
+		 */
+		if (refcount_read(&filter->users) == 0) {
+			finish_wait(&filter->wqh, &wait);
+			ret = -ENOTCONN;
+			goto out;
+		}
+
+		/* No notifications - wait for one... */
+		mutex_unlock(&filter->notify_lock);
+		freezable_schedule();
+		finish_wait(&filter->wqh, &wait);
+
+		/* ... and retry */
+		mutex_lock(&filter->notify_lock);
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			goto out;
+		}
+		goto retry;
 	}
 
 	unotif.id = knotif->id;
@@ -1237,10 +1261,8 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
 		 */
 		mutex_lock(&filter->notify_lock);
 		knotif = find_notification(filter, unotif.id);
-		if (knotif) {
+		if (knotif)
 			knotif->state = SECCOMP_NOTIFY_INIT;
-			up(&filter->notif->request);
-		}
 		mutex_unlock(&filter->notify_lock);
 	}
 
@@ -1416,11 +1438,12 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 {
 	struct seccomp_filter *filter = file->private_data;
 	void __user *buf = (void __user *)arg;
+	bool blocking = !(file->f_flags & O_NONBLOCK);
 
 	/* Fixed-size ioctls */
 	switch (cmd) {
 	case SECCOMP_IOCTL_NOTIF_RECV:
-		return seccomp_notify_recv(filter, buf);
+		return seccomp_notify_recv(filter, buf, blocking);
 	case SECCOMP_IOCTL_NOTIF_SEND:
 		return seccomp_notify_send(filter, buf);
 	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
@@ -1483,12 +1506,11 @@ static struct file *init_listener(struct seccomp_filter *filter)
 	if (!filter->notif)
 		goto out;
 
-	sema_init(&filter->notif->request, 0);
 	filter->notif->next_id = get_random_u64();
 	INIT_LIST_HEAD(&filter->notif->notifications);
 
 	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
-				 filter, O_RDWR);
+				 filter, O_RDWR|O_NONBLOCK);
 	if (IS_ERR(ret))
 		goto out_notif;
 
diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
index 20291ec6489f..b9e666f15998 100644
--- a/samples/seccomp/user-trap.c
+++ b/samples/seccomp/user-trap.c
@@ -198,6 +198,17 @@ static int handle_req(struct seccomp_notif *req,
 	return ret;
 }
 
+static void set_blocking(int fd)
+{
+	int file_status_flags = fcntl(fd, F_GETFL);
+
+	file_status_flags &= ~O_NONBLOCK;
+	if (fcntl(fd, F_SETFL, file_status_flags)) {
+		perror("F_SETFL");
+		exit(1);
+	}
+}
+
 int main(void)
 {
 	int sk_pair[2], ret = 1, status, listener;
@@ -274,6 +285,8 @@ int main(void)
 	if (listener < 0)
 		goto out_kill;
 
+	set_blocking(listener);
+
 	/*
 	 * Fork a task to handle the requests. This isn't strictly necessary,
 	 * but it makes the particular writing of this sample easier, since we
@@ -307,6 +320,9 @@ int main(void)
 		while (1) {
 			memset(req, 0, sizes.seccomp_notif);
 			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
+				if (errno == ENOTCONN)
+					exit(0);
+
 				perror("ioctl recv");
 				goto out_resp;
 			}
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 4a180439ee9e..5318f9cb1aec 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -295,6 +295,13 @@ static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
 #endif
 }
 
+#define set_blocking(fd) ({					\
+	int file_status_flags;					\
+	file_status_flags = fcntl(fd, F_GETFL);			\
+	file_status_flags &= ~O_NONBLOCK;			\
+	ASSERT_EQ(fcntl(fd, F_SETFL, file_status_flags), 0);	\
+})
+
 /* Have TH_LOG report actual location filecmp() is used. */
 #define filecmp(pid1, pid2, fd1, fd2)	({		\
 	int _ret;					\
@@ -3422,6 +3429,8 @@ TEST(user_notification_kill_in_middle)
 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
 	ASSERT_GE(listener, 0);
 
+	set_blocking(listener);
+
 	/*
 	 * Check that nothing bad happens when we kill the task in the middle
 	 * of a syscall.
@@ -3476,6 +3485,8 @@ TEST(user_notification_signal)
 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
 	ASSERT_GE(listener, 0);
 
+	set_blocking(listener);
+
 	pid = fork();
 	ASSERT_GE(pid, 0);
 
@@ -3583,6 +3594,8 @@ TEST(user_notification_child_pid_ns)
 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
 	ASSERT_GE(listener, 0);
 
+	set_blocking(listener);
+
 	pid = fork();
 	ASSERT_GE(pid, 0);
 
@@ -3623,6 +3636,8 @@ TEST(user_notification_sibling_pid_ns)
 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
 	ASSERT_GE(listener, 0);
 
+	set_blocking(listener);
+
 	pid = fork();
 	ASSERT_GE(pid, 0);
 
@@ -3691,6 +3706,8 @@ TEST(user_notification_fault_recv)
 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
 	ASSERT_GE(listener, 0);
 
+	set_blocking(listener);
+
 	pid = fork();
 	ASSERT_GE(pid, 0);
 
@@ -3972,6 +3989,8 @@ TEST(user_notification_addfd)
 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
 	ASSERT_GE(listener, 0);
 
+	set_blocking(listener);
+
 	pid = fork();
 	ASSERT_GE(pid, 0);
 
@@ -4099,6 +4118,8 @@ TEST(user_notification_addfd_rlimit)
 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
 	ASSERT_GE(listener, 0);
 
+	set_blocking(listener);
+
 	pid = fork();
 	ASSERT_GE(pid, 0);
 

base-commit: 4525c8781ec0701ce824e8bd379ae1b129e26568
-- 
2.29.1.341.ge80a0c044ae-goog


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/3] samples: seccomp: simplify user-trap sample
  2020-11-02 20:37 [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone Jann Horn
@ 2020-11-02 20:37 ` Jann Horn
  2020-11-02 20:37 ` [PATCH 3/3] selftests/seccomp: Test NOTIF_RECV empty/dead errors Jann Horn
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Jann Horn @ 2020-11-02 20:37 UTC (permalink / raw)
  To: Kees Cook
  Cc: Andy Lutomirski, Will Drewry, linux-kernel, Tycho Andersen,
	Christian Brauner, Michael Kerrisk (man-pages),
	Linux Containers, Sargun Dhillon, Giuseppe Scrivano, Paul Moore

Now that the blocking unotify API returns when all users of the filter are
gone, get rid of the helper task that kills the blocked task.

Note that since seccomp only tracks whether tasks are completely gone, not
whether they have exited, we need to handle SIGCHLD while blocked on
SECCOMP_IOCTL_NOTIF_RECV. Alternatively we could also set SIGCHLD to
SIG_IGN and let the kernel autoreap exiting children.

Signed-off-by: Jann Horn <jannh@google.com>
---
 samples/seccomp/user-trap.c | 163 +++++++++++++++++++-----------------
 1 file changed, 87 insertions(+), 76 deletions(-)

diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
index b9e666f15998..92d0f9ba58b6 100644
--- a/samples/seccomp/user-trap.c
+++ b/samples/seccomp/user-trap.c
@@ -15,6 +15,7 @@
 #include <sys/syscall.h>
 #include <sys/user.h>
 #include <sys/ioctl.h>
+#include <sys/prctl.h>
 #include <sys/ptrace.h>
 #include <sys/mount.h>
 #include <linux/limits.h>
@@ -198,6 +199,21 @@ static int handle_req(struct seccomp_notif *req,
 	return ret;
 }
 
+static pid_t worker;
+static int worker_status;
+
+static void sigchld_handler(int sig, siginfo_t *info, void *ctx)
+{
+	if (info->si_pid != worker) {
+		fprintf(stderr, "SIGCHLD from unknown process\n");
+		return;
+	}
+	if (waitpid(worker, &worker_status, 0) != worker) {
+		perror("waitpid");
+		exit(1);
+	}
+}
+
 static void set_blocking(int fd)
 {
 	int file_status_flags = fcntl(fd, F_GETFL);
@@ -211,14 +227,26 @@ static void set_blocking(int fd)
 
 int main(void)
 {
-	int sk_pair[2], ret = 1, status, listener;
-	pid_t worker = 0 , tracer = 0;
+	int sk_pair[2], ret = 1, listener;
+	struct seccomp_notif *req;
+	struct seccomp_notif_resp *resp;
+	struct seccomp_notif_sizes sizes;
+	pid_t parent = getpid();
 
 	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
 		perror("socketpair");
 		return 1;
 	}
 
+	struct sigaction sigchld_act = {
+		.sa_sigaction = sigchld_handler,
+		.sa_flags = SA_SIGINFO
+	};
+	if (sigaction(SIGCHLD, &sigchld_act, NULL)) {
+		perror("sigaction");
+		return 1;
+	}
+
 	worker = fork();
 	if (worker < 0) {
 		perror("fork");
@@ -226,6 +254,14 @@ int main(void)
 	}
 
 	if (worker == 0) {
+		/* quit if the parent dies */
+		if (prctl(PR_SET_PDEATHSIG, SIGKILL)) {
+			perror("PR_SET_PDEATHSIG");
+			exit(1);
+		}
+		if (getppid() != parent)
+			exit(1);
+
 		listener = user_trap_syscall(__NR_mount,
 					     SECCOMP_FILTER_FLAG_NEW_LISTENER);
 		if (listener < 0) {
@@ -283,87 +319,69 @@ int main(void)
 	 */
 	listener = recv_fd(sk_pair[0]);
 	if (listener < 0)
-		goto out_kill;
+		goto close_pair;
 
 	set_blocking(listener);
 
-	/*
-	 * Fork a task to handle the requests. This isn't strictly necessary,
-	 * but it makes the particular writing of this sample easier, since we
-	 * can just wait ofr the tracee to exit and kill the tracer.
-	 */
-	tracer = fork();
-	if (tracer < 0) {
-		perror("fork");
-		goto out_kill;
+	if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
+		perror("seccomp(GET_NOTIF_SIZES)");
+		goto out_close;
 	}
 
-	if (tracer == 0) {
-		struct seccomp_notif *req;
-		struct seccomp_notif_resp *resp;
-		struct seccomp_notif_sizes sizes;
+	req = malloc(sizes.seccomp_notif);
+	if (!req)
+		goto out_close;
+
+	resp = malloc(sizes.seccomp_notif_resp);
+	if (!resp)
+		goto out_req;
+	memset(resp, 0, sizes.seccomp_notif_resp);
+
+	while (1) {
+		memset(req, 0, sizes.seccomp_notif);
+		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
+			if (errno == ENOTCONN) {
+				/* The child process went away. */
+				break;
+			} else if (errno == EINTR) {
+				continue;
+			}
 
-		if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
-			perror("seccomp(GET_NOTIF_SIZES)");
-			goto out_close;
+			perror("ioctl recv");
+			goto out_resp;
 		}
 
-		req = malloc(sizes.seccomp_notif);
-		if (!req)
-			goto out_close;
-
-		resp = malloc(sizes.seccomp_notif_resp);
-		if (!resp)
-			goto out_req;
-		memset(resp, 0, sizes.seccomp_notif_resp);
+		if (handle_req(req, resp, listener) < 0)
+			goto out_resp;
 
-		while (1) {
-			memset(req, 0, sizes.seccomp_notif);
-			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
-				if (errno == ENOTCONN)
-					exit(0);
-
-				perror("ioctl recv");
-				goto out_resp;
-			}
-
-			if (handle_req(req, resp, listener) < 0)
-				goto out_resp;
-
-			/*
-			 * ENOENT here means that the task may have gotten a
-			 * signal and restarted the syscall. It's up to the
-			 * handler to decide what to do in this case, but for
-			 * the sample code, we just ignore it. Probably
-			 * something better should happen, like undoing the
-			 * mount, or keeping track of the args to make sure we
-			 * don't do it again.
-			 */
-			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
-			    errno != ENOENT) {
-				perror("ioctl send");
-				goto out_resp;
-			}
+		/*
+		 * ENOENT here means that the task may have gotten a
+		 * signal and restarted the syscall. It's up to the
+		 * handler to decide what to do in this case, but for
+		 * the sample code, we just ignore it. Probably
+		 * something better should happen, like undoing the
+		 * mount, or keeping track of the args to make sure we
+		 * don't do it again.
+		 */
+		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
+		    errno != ENOENT) {
+			perror("ioctl send");
+			goto out_resp;
 		}
+	}
+	ret = 0;
+
 out_resp:
-		free(resp);
+	free(resp);
 out_req:
-		free(req);
+	free(req);
 out_close:
-		close(listener);
-		exit(1);
-	}
-
 	close(listener);
 
-	if (waitpid(worker, &status, 0) != worker) {
-		perror("waitpid");
-		goto out_kill;
-	}
-
 	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
 		perror("umount2");
-		goto out_kill;
+		ret = 1;
+		goto close_pair;
 	}
 
 	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
@@ -371,19 +389,12 @@ int main(void)
 		exit(1);
 	}
 
-	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
+	if (!WIFEXITED(worker_status) || WEXITSTATUS(worker_status)) {
 		fprintf(stderr, "worker exited nonzero\n");
-		goto out_kill;
+		ret = 1;
+		goto close_pair;
 	}
 
-	ret = 0;
-
-out_kill:
-	if (tracer > 0)
-		kill(tracer, SIGKILL);
-	if (worker > 0)
-		kill(worker, SIGKILL);
-
 close_pair:
 	close(sk_pair[0]);
 	close(sk_pair[1]);
-- 
2.29.1.341.ge80a0c044ae-goog


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/3] selftests/seccomp: Test NOTIF_RECV empty/dead errors
  2020-11-02 20:37 [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone Jann Horn
  2020-11-02 20:37 ` [PATCH 2/3] samples: seccomp: simplify user-trap sample Jann Horn
@ 2020-11-02 20:37 ` Jann Horn
  2020-11-03  9:44 ` [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone Christian Brauner
  2020-11-04 11:18 ` Sargun Dhillon
  3 siblings, 0 replies; 5+ messages in thread
From: Jann Horn @ 2020-11-02 20:37 UTC (permalink / raw)
  To: Kees Cook
  Cc: Andy Lutomirski, Will Drewry, linux-kernel, Tycho Andersen,
	Christian Brauner, Michael Kerrisk (man-pages),
	Linux Containers, Sargun Dhillon, Giuseppe Scrivano, Paul Moore

Test that SECCOMP_IOCTL_NOTIF_RECV on a seccomp fd with zero users returns
-ENOTCONN, both in blocking and in non-blocking mode.
Also test that SECCOMP_IOCTL_NOTIF_RECV on a seccomp fd with no active
notifications returns -ENOENT in non-blocking mode.

Signed-off-by: Jann Horn <jannh@google.com>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 127 ++++++++++++++++++
 1 file changed, 127 insertions(+)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 5318f9cb1aec..8214c431ad4b 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -28,6 +28,7 @@
 #include <linux/prctl.h>
 #include <linux/ptrace.h>
 #include <linux/seccomp.h>
+#include <linux/futex.h>
 #include <pthread.h>
 #include <semaphore.h>
 #include <signal.h>
@@ -315,6 +316,35 @@ static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
 	}						\
 	_ret; })
 
+static void store_and_wake(int *ptr, int value)
+{
+	__atomic_store(ptr, &value, __ATOMIC_RELEASE);
+	if (syscall(__NR_futex, ptr, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) == -1) {
+		perror("FUTEX_WAKE failed unexpectedly");
+		exit(EXIT_FAILURE);
+	}
+}
+
+static int wait_and_load(int *ptr, int placeholder)
+{
+	int futex_ret;
+	int value;
+
+retry:
+	futex_ret = syscall(__NR_futex, ptr, FUTEX_WAIT, placeholder, NULL,
+			    NULL, 0);
+	if (futex_ret == -1 && errno != EAGAIN) {
+		if (errno == EINTR)
+			goto retry;
+		perror("FUTEX_WAIT failed unexpectedly");
+		exit(EXIT_FAILURE);
+	}
+	value = __atomic_load_n(ptr, __ATOMIC_ACQUIRE);
+	if (value == placeholder)
+		goto retry;
+	return value;
+}
+
 TEST(kcmp)
 {
 	int ret;
@@ -4160,6 +4190,103 @@ TEST(user_notification_addfd_rlimit)
 	close(memfd);
 }
 
+TEST(user_notification_recv_dead)
+{
+	pid_t pid;
+	int status;
+	struct __clone_args args = {
+		.flags = CLONE_FILES,
+		.exit_signal = SIGCHLD,
+	};
+	struct seccomp_notif notif = {};
+	struct shared_data {
+		int notif_fd;
+	} *shared_data;
+
+	shared_data = mmap(NULL, sizeof(struct shared_data),
+			   PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED,
+			   -1, 0);
+	ASSERT_NE(NULL, shared_data);
+	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	pid = sys_clone3(&args, sizeof(args));
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		shared_data->notif_fd = user_notif_syscall(
+				__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+		if (shared_data->notif_fd < 0)
+			_exit(EXIT_FAILURE);
+
+		_exit(EXIT_SUCCESS);
+	}
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+
+	/* non-blocking recv */
+	EXPECT_EQ(-1, ioctl(shared_data->notif_fd, SECCOMP_IOCTL_NOTIF_RECV, &notif));
+	EXPECT_EQ(ENOTCONN, errno);
+
+	/* blocking recv */
+	set_blocking(shared_data->notif_fd);
+	EXPECT_EQ(-1, ioctl(shared_data->notif_fd, SECCOMP_IOCTL_NOTIF_RECV, &notif));
+	EXPECT_EQ(ENOTCONN, errno);
+}
+
+TEST(user_notification_recv_nonblock)
+{
+	pid_t pid;
+	struct __clone_args args = {
+		.flags = CLONE_FILES,
+		.exit_signal = SIGCHLD,
+	};
+	struct seccomp_notif notif = {};
+	struct shared_data {
+		int notif_fd;
+	} *shared_data;
+
+	shared_data = mmap(NULL, sizeof(struct shared_data),
+			   PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED,
+			   -1, 0);
+	ASSERT_NE(NULL, shared_data);
+	shared_data->notif_fd = -1;
+
+	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	pid = sys_clone3(&args, sizeof(args));
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+
+		fd = user_notif_syscall(__NR_mknodat,
+					SECCOMP_FILTER_FLAG_NEW_LISTENER);
+		if (fd < 0) {
+			store_and_wake(&shared_data->notif_fd, -2);
+			_exit(EXIT_FAILURE);
+		}
+		store_and_wake(&shared_data->notif_fd, fd);
+
+		while (1)
+			pause();
+	}
+
+	wait_and_load(&shared_data->notif_fd, -1);
+
+	/* non-blocking recv */
+	EXPECT_EQ(-1, ioctl(shared_data->notif_fd, SECCOMP_IOCTL_NOTIF_RECV,
+			    &notif));
+	EXPECT_EQ(ENOENT, errno);
+
+	kill(pid, SIGKILL);
+}
+
 /*
  * TODO:
  * - expand NNP testing
-- 
2.29.1.341.ge80a0c044ae-goog


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone
  2020-11-02 20:37 [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone Jann Horn
  2020-11-02 20:37 ` [PATCH 2/3] samples: seccomp: simplify user-trap sample Jann Horn
  2020-11-02 20:37 ` [PATCH 3/3] selftests/seccomp: Test NOTIF_RECV empty/dead errors Jann Horn
@ 2020-11-03  9:44 ` Christian Brauner
  2020-11-04 11:18 ` Sargun Dhillon
  3 siblings, 0 replies; 5+ messages in thread
From: Christian Brauner @ 2020-11-03  9:44 UTC (permalink / raw)
  To: Jann Horn
  Cc: Kees Cook, Giuseppe Scrivano, Will Drewry, Paul Moore,
	Linux Containers, linux-kernel, Andy Lutomirski,
	Christian Brauner, Michael Kerrisk (man-pages)

On Mon, Nov 02, 2020 at 09:37:04PM +0100, Jann Horn via Containers wrote:
> At the moment, the seccomp notifier API is hard to use without combining
> it with APIs like poll() or epoll(); if all target processes have gone
> away, the polling APIs will raise an error indication on the file
> descriptor, but SECCOMP_IOCTL_NOTIF_RECV will keep blocking indefinitely.
> 
> This usability problem was discovered when Michael Kerrisk wrote a
> manpage for this API.
> 
> To fix it, get rid of the semaphore logic and let SECCOMP_IOCTL_NOTIF_RECV
> behave as follows:
> 
> If O_NONBLOCK is set, SECCOMP_IOCTL_NOTIF_RECV always returns
> immediately, no matter whether a notification is available or not.
> 
> If O_NONBLOCK is unset, SECCOMP_IOCTL_NOTIF_RECV blocks until either a
> notification is delivered to userspace or all users of the filter have
> gone away.
> 
> To avoid subtle breakage from eventloop-style code that doesn't set
> O_NONBLOCK, set O_NONBLOCK by default - userspace can clear it if it
> wants blocking behavior, and if blocking-style code forgets to do so,
> that will be much more obvious than the breakage we'd get the other way
> around.
> This also means that UAPI breakage from this change should be limited to
> blocking users of the API, of which, to my knowledge, there are none so far
> (outside of in-tree sample and selftest code, which this patch adjusts - in
> particular the code in samples/ has to change a bunch).
> 
> This should be backported because future userspace code might otherwise not
> work properly on old kernels.
> 
> Reported-by: Michael Kerrisk <mtk.manpages@gmail.com>
> Signed-off-by: Jann Horn <jannh@google.com>
> Cc: stable@vger.kernel.org
> ---

I think that all makes sense.

>  kernel/seccomp.c                              | 62 +++++++++++++------
>  samples/seccomp/user-trap.c                   | 16 +++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 21 +++++++
>  3 files changed, 79 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 8ad7a293255a..b3730740515f 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -43,6 +43,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/anon_inodes.h>
>  #include <linux/lockdep.h>
> +#include <linux/freezer.h>
>  
>  /*
>   * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
> @@ -138,7 +139,6 @@ struct seccomp_kaddfd {
>   * @notifications: A list of struct seccomp_knotif elements.
>   */
>  struct notification {
> -	struct semaphore request;
>  	u64 next_id;
>  	struct list_head notifications;
>  };
> @@ -863,7 +863,6 @@ static int seccomp_do_user_notification(int this_syscall,
>  	list_add(&n.list, &match->notif->notifications);
>  	INIT_LIST_HEAD(&n.addfd);
>  
> -	up(&match->notif->request);
>  	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
>  	mutex_unlock(&match->notify_lock);
>  
> @@ -1179,9 +1178,10 @@ find_notification(struct seccomp_filter *filter, u64 id)
>  
>  
>  static long seccomp_notify_recv(struct seccomp_filter *filter,
> -				void __user *buf)
> +				void __user *buf, bool blocking)
>  {
>  	struct seccomp_knotif *knotif = NULL, *cur;
> +	DEFINE_WAIT(wait);
>  	struct seccomp_notif unotif;
>  	ssize_t ret;
>  
> @@ -1194,11 +1194,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  
>  	memset(&unotif, 0, sizeof(unotif));
>  
> -	ret = down_interruptible(&filter->notif->request);
> -	if (ret < 0)
> -		return ret;
> -
>  	mutex_lock(&filter->notify_lock);
> +
> +retry:
>  	list_for_each_entry(cur, &filter->notif->notifications, list) {
>  		if (cur->state == SECCOMP_NOTIFY_INIT) {
>  			knotif = cur;
> @@ -1206,14 +1204,40 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  		}
>  	}
>  
> -	/*
> -	 * If we didn't find a notification, it could be that the task was
> -	 * interrupted by a fatal signal between the time we were woken and
> -	 * when we were able to acquire the rw lock.
> -	 */
>  	if (!knotif) {
> -		ret = -ENOENT;
> -		goto out;
> +		if (!blocking) {
> +			if (refcount_read(&filter->users) == 0)
> +				ret = -ENOTCONN;

I'm trying to come up with something better here than ENOTCONN. Maybe
ENODATA or maybe even EOWNERDEAD? The latter has been introduced for
robust futexes but it seems quite fitting.

> +			else
> +				ret = -ENOENT;

Wouldn't it be nicer to return EAGAIN if O_NONBLOCK is set? It implies
no event and makes it easier to use with event loops that expect EAGAIN.

> +			goto out;
> +		}
> +
> +		/* This has to happen before checking &filter->users. */
> +		prepare_to_wait(&filter->wqh, &wait, TASK_INTERRUPTIBLE);
> +
> +		/*
> +		 * If all users of the filter are gone, throw an error
> +		 * instead of pointlessly continuing to block.
> +		 */
> +		if (refcount_read(&filter->users) == 0) {
> +			finish_wait(&filter->wqh, &wait);
> +			ret = -ENOTCONN;
> +			goto out;
> +		}
> +
> +		/* No notifications - wait for one... */
> +		mutex_unlock(&filter->notify_lock);
> +		freezable_schedule();
> +		finish_wait(&filter->wqh, &wait);
> +
> +		/* ... and retry */
> +		mutex_lock(&filter->notify_lock);
> +		if (signal_pending(current)) {
> +			ret = -EINTR;
> +			goto out;
> +		}
> +		goto retry;
>  	}
>  
>  	unotif.id = knotif->id;
> @@ -1237,10 +1261,8 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  		 */
>  		mutex_lock(&filter->notify_lock);
>  		knotif = find_notification(filter, unotif.id);
> -		if (knotif) {
> +		if (knotif)
>  			knotif->state = SECCOMP_NOTIFY_INIT;
> -			up(&filter->notif->request);
> -		}
>  		mutex_unlock(&filter->notify_lock);
>  	}
>  
> @@ -1416,11 +1438,12 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
>  {
>  	struct seccomp_filter *filter = file->private_data;
>  	void __user *buf = (void __user *)arg;
> +	bool blocking = !(file->f_flags & O_NONBLOCK);
>  
>  	/* Fixed-size ioctls */
>  	switch (cmd) {
>  	case SECCOMP_IOCTL_NOTIF_RECV:
> -		return seccomp_notify_recv(filter, buf);
> +		return seccomp_notify_recv(filter, buf, blocking);
>  	case SECCOMP_IOCTL_NOTIF_SEND:
>  		return seccomp_notify_send(filter, buf);
>  	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
> @@ -1483,12 +1506,11 @@ static struct file *init_listener(struct seccomp_filter *filter)
>  	if (!filter->notif)
>  		goto out;
>  
> -	sema_init(&filter->notif->request, 0);
>  	filter->notif->next_id = get_random_u64();
>  	INIT_LIST_HEAD(&filter->notif->notifications);
>  
>  	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
> -				 filter, O_RDWR);
> +				 filter, O_RDWR|O_NONBLOCK);
>  	if (IS_ERR(ret))
>  		goto out_notif;
>  
> diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
> index 20291ec6489f..b9e666f15998 100644
> --- a/samples/seccomp/user-trap.c
> +++ b/samples/seccomp/user-trap.c
> @@ -198,6 +198,17 @@ static int handle_req(struct seccomp_notif *req,
>  	return ret;
>  }
>  
> +static void set_blocking(int fd)
> +{
> +	int file_status_flags = fcntl(fd, F_GETFL);
> +
> +	file_status_flags &= ~O_NONBLOCK;
> +	if (fcntl(fd, F_SETFL, file_status_flags)) {
> +		perror("F_SETFL");
> +		exit(1);
> +	}
> +}
> +
>  int main(void)
>  {
>  	int sk_pair[2], ret = 1, status, listener;
> @@ -274,6 +285,8 @@ int main(void)
>  	if (listener < 0)
>  		goto out_kill;
>  
> +	set_blocking(listener);
> +
>  	/*
>  	 * Fork a task to handle the requests. This isn't strictly necessary,
>  	 * but it makes the particular writing of this sample easier, since we
> @@ -307,6 +320,9 @@ int main(void)
>  		while (1) {
>  			memset(req, 0, sizes.seccomp_notif);
>  			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
> +				if (errno == ENOTCONN)
> +					exit(0);
> +
>  				perror("ioctl recv");
>  				goto out_resp;
>  			}
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 4a180439ee9e..5318f9cb1aec 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -295,6 +295,13 @@ static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
>  #endif
>  }
>  
> +#define set_blocking(fd) ({					\
> +	int file_status_flags;					\
> +	file_status_flags = fcntl(fd, F_GETFL);			\
> +	file_status_flags &= ~O_NONBLOCK;			\
> +	ASSERT_EQ(fcntl(fd, F_SETFL, file_status_flags), 0);	\
> +})
> +
>  /* Have TH_LOG report actual location filecmp() is used. */
>  #define filecmp(pid1, pid2, fd1, fd2)	({		\
>  	int _ret;					\
> @@ -3422,6 +3429,8 @@ TEST(user_notification_kill_in_middle)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	/*
>  	 * Check that nothing bad happens when we kill the task in the middle
>  	 * of a syscall.
> @@ -3476,6 +3485,8 @@ TEST(user_notification_signal)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3583,6 +3594,8 @@ TEST(user_notification_child_pid_ns)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3623,6 +3636,8 @@ TEST(user_notification_sibling_pid_ns)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3691,6 +3706,8 @@ TEST(user_notification_fault_recv)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3972,6 +3989,8 @@ TEST(user_notification_addfd)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -4099,6 +4118,8 @@ TEST(user_notification_addfd_rlimit)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> 
> base-commit: 4525c8781ec0701ce824e8bd379ae1b129e26568
> -- 
> 2.29.1.341.ge80a0c044ae-goog
> 
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone
  2020-11-02 20:37 [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone Jann Horn
                   ` (2 preceding siblings ...)
  2020-11-03  9:44 ` [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone Christian Brauner
@ 2020-11-04 11:18 ` Sargun Dhillon
  3 siblings, 0 replies; 5+ messages in thread
From: Sargun Dhillon @ 2020-11-04 11:18 UTC (permalink / raw)
  To: Jann Horn
  Cc: Kees Cook, Andy Lutomirski, Will Drewry, linux-kernel,
	Tycho Andersen, Christian Brauner, Michael Kerrisk (man-pages),
	Linux Containers, Giuseppe Scrivano, Paul Moore

On Mon, Nov 02, 2020 at 09:37:04PM +0100, Jann Horn wrote:
> At the moment, the seccomp notifier API is hard to use without combining
> it with APIs like poll() or epoll(); if all target processes have gone
> away, the polling APIs will raise an error indication on the file
> descriptor, but SECCOMP_IOCTL_NOTIF_RECV will keep blocking indefinitely.
> 
> This usability problem was discovered when Michael Kerrisk wrote a
> manpage for this API.
> 
> To fix it, get rid of the semaphore logic and let SECCOMP_IOCTL_NOTIF_RECV
> behave as follows:
> 
> If O_NONBLOCK is set, SECCOMP_IOCTL_NOTIF_RECV always returns
> immediately, no matter whether a notification is available or not.
> 
> If O_NONBLOCK is unset, SECCOMP_IOCTL_NOTIF_RECV blocks until either a
> notification is delivered to userspace or all users of the filter have
> gone away.
> 
> To avoid subtle breakage from eventloop-style code that doesn't set
> O_NONBLOCK, set O_NONBLOCK by default - userspace can clear it if it
> wants blocking behavior, and if blocking-style code forgets to do so,
> that will be much more obvious than the breakage we'd get the other way
> around.
> This also means that UAPI breakage from this change should be limited to
> blocking users of the API, of which, to my knowledge, there are none so far
> (outside of in-tree sample and selftest code, which this patch adjusts - in
> particular the code in samples/ has to change a bunch).
> 
> This should be backported because future userspace code might otherwise not
> work properly on old kernels.
> 
> Cc: stable@vger.kernel.org
> Reported-by: Michael Kerrisk <mtk.manpages@gmail.com>
> Signed-off-by: Jann Horn <jannh@google.com>
> ---
>  kernel/seccomp.c                              | 62 +++++++++++++------
>  samples/seccomp/user-trap.c                   | 16 +++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 21 +++++++
>  3 files changed, 79 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 8ad7a293255a..b3730740515f 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -43,6 +43,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/anon_inodes.h>
>  #include <linux/lockdep.h>
> +#include <linux/freezer.h>
>  
>  /*
>   * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
> @@ -138,7 +139,6 @@ struct seccomp_kaddfd {
>   * @notifications: A list of struct seccomp_knotif elements.
>   */
>  struct notification {
> -	struct semaphore request;
>  	u64 next_id;
>  	struct list_head notifications;
>  };
> @@ -863,7 +863,6 @@ static int seccomp_do_user_notification(int this_syscall,
>  	list_add(&n.list, &match->notif->notifications);
>  	INIT_LIST_HEAD(&n.addfd);
>  
> -	up(&match->notif->request);
>  	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
>  	mutex_unlock(&match->notify_lock);
>  
> @@ -1179,9 +1178,10 @@ find_notification(struct seccomp_filter *filter, u64 id)
>  
>  
>  static long seccomp_notify_recv(struct seccomp_filter *filter,
> -				void __user *buf)
> +				void __user *buf, bool blocking)
>  {
>  	struct seccomp_knotif *knotif = NULL, *cur;
> +	DEFINE_WAIT(wait);
>  	struct seccomp_notif unotif;
>  	ssize_t ret;
>  
> @@ -1194,11 +1194,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  
>  	memset(&unotif, 0, sizeof(unotif));
>  
> -	ret = down_interruptible(&filter->notif->request);
> -	if (ret < 0)
> -		return ret;
> -
>  	mutex_lock(&filter->notify_lock);
> +
> +retry:
>  	list_for_each_entry(cur, &filter->notif->notifications, list) {
>  		if (cur->state == SECCOMP_NOTIFY_INIT) {
>  			knotif = cur;
> @@ -1206,14 +1204,40 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  		}
>  	}
>  
> -	/*
> -	 * If we didn't find a notification, it could be that the task was
> -	 * interrupted by a fatal signal between the time we were woken and
> -	 * when we were able to acquire the rw lock.
> -	 */
>  	if (!knotif) {
> -		ret = -ENOENT;
> -		goto out;
> +		if (!blocking) {
> +			if (refcount_read(&filter->users) == 0)
> +				ret = -ENOTCONN;
> +			else
> +				ret = -ENOENT;
> +			goto out;
> +		}
> +
> +		/* This has to happen before checking &filter->users. */
> +		prepare_to_wait(&filter->wqh, &wait, TASK_INTERRUPTIBLE);

Isn't there a subtle race condition here, that if we are in blocking mode, and 
we get a notification here, we wont be woken up? since we enter the wait queue
after checking to see if there are pending notifications, and in that period
the wake_up_poll could have been called?

I very well might be missing something interesting about the semantics of
freezable_schedule.

shouldn't it read something like:

retry:
	mutex_lock(...);
	ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE);
	if (ret)
		goto out;
	list_for_each_entry(cur, &filter->notif->notifications, list) {
		if (cur->state == SECCOMP_NOTIFY_INIT) {
			knotif = cur;
			break;
		}
	}

	/* Maybe this should be before we iterate over the list of outstanding notifcations? */
	if (refcount_read(&filter->users) == 0) {
		ret = -ENOTCONN;
		goto out;
	}

	if (!knotif) {
		if (!blocking) {
			ret = -ENOENT;
			goto out;
		}
		mutex_unlock(...);
		freezable_schedule();
		goto retry;
	}


out:
	finish_wait(&filter->wqh, &wait);
	mutex_unlock(...);
> +
> +		/*
> +		 * If all users of the filter are gone, throw an error
> +		 * instead of pointlessly continuing to block.
> +		 */
> +		if (refcount_read(&filter->users) == 0) {
> +			finish_wait(&filter->wqh, &wait);
> +			ret = -ENOTCONN;
> +			goto out;
> +		}
> +
> +		/* No notifications - wait for one... */
> +		mutex_unlock(&filter->notify_lock);
> +		freezable_schedule();
> +		finish_wait(&filter->wqh, &wait);
> +
> +		/* ... and retry */
> +		mutex_lock(&filter->notify_lock);
> +		if (signal_pending(current)) {
> +			ret = -EINTR;
> +			goto out;
> +		}
> +		goto retry;
>  	}
>  
>  	unotif.id = knotif->id;
> @@ -1237,10 +1261,8 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  		 */
>  		mutex_lock(&filter->notify_lock);
>  		knotif = find_notification(filter, unotif.id);
> -		if (knotif) {
> +		if (knotif)
>  			knotif->state = SECCOMP_NOTIFY_INIT;
> -			up(&filter->notif->request);
> -		}
>  		mutex_unlock(&filter->notify_lock);
>  	}
>  
> @@ -1416,11 +1438,12 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
>  {
>  	struct seccomp_filter *filter = file->private_data;
>  	void __user *buf = (void __user *)arg;
> +	bool blocking = !(file->f_flags & O_NONBLOCK);
>  
>  	/* Fixed-size ioctls */
>  	switch (cmd) {
>  	case SECCOMP_IOCTL_NOTIF_RECV:
> -		return seccomp_notify_recv(filter, buf);
> +		return seccomp_notify_recv(filter, buf, blocking);
>  	case SECCOMP_IOCTL_NOTIF_SEND:
>  		return seccomp_notify_send(filter, buf);
>  	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
> @@ -1483,12 +1506,11 @@ static struct file *init_listener(struct seccomp_filter *filter)
>  	if (!filter->notif)
>  		goto out;
>  
> -	sema_init(&filter->notif->request, 0);
>  	filter->notif->next_id = get_random_u64();
>  	INIT_LIST_HEAD(&filter->notif->notifications);
>  
>  	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
> -				 filter, O_RDWR);
> +				 filter, O_RDWR|O_NONBLOCK);
>  	if (IS_ERR(ret))
>  		goto out_notif;
>  
> diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
> index 20291ec6489f..b9e666f15998 100644
> --- a/samples/seccomp/user-trap.c
> +++ b/samples/seccomp/user-trap.c
> @@ -198,6 +198,17 @@ static int handle_req(struct seccomp_notif *req,
>  	return ret;
>  }
>  
> +static void set_blocking(int fd)
> +{
> +	int file_status_flags = fcntl(fd, F_GETFL);
> +
> +	file_status_flags &= ~O_NONBLOCK;
> +	if (fcntl(fd, F_SETFL, file_status_flags)) {
> +		perror("F_SETFL");
> +		exit(1);
> +	}
> +}
> +
>  int main(void)
>  {
>  	int sk_pair[2], ret = 1, status, listener;
> @@ -274,6 +285,8 @@ int main(void)
>  	if (listener < 0)
>  		goto out_kill;
>  
> +	set_blocking(listener);
> +
>  	/*
>  	 * Fork a task to handle the requests. This isn't strictly necessary,
>  	 * but it makes the particular writing of this sample easier, since we
> @@ -307,6 +320,9 @@ int main(void)
>  		while (1) {
>  			memset(req, 0, sizes.seccomp_notif);
>  			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
> +				if (errno == ENOTCONN)
> +					exit(0);
> +
>  				perror("ioctl recv");
>  				goto out_resp;
>  			}
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 4a180439ee9e..5318f9cb1aec 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -295,6 +295,13 @@ static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
>  #endif
>  }
>  
> +#define set_blocking(fd) ({					\
> +	int file_status_flags;					\
> +	file_status_flags = fcntl(fd, F_GETFL);			\
> +	file_status_flags &= ~O_NONBLOCK;			\
> +	ASSERT_EQ(fcntl(fd, F_SETFL, file_status_flags), 0);	\
> +})
> +
>  /* Have TH_LOG report actual location filecmp() is used. */
>  #define filecmp(pid1, pid2, fd1, fd2)	({		\
>  	int _ret;					\
> @@ -3422,6 +3429,8 @@ TEST(user_notification_kill_in_middle)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	/*
>  	 * Check that nothing bad happens when we kill the task in the middle
>  	 * of a syscall.
> @@ -3476,6 +3485,8 @@ TEST(user_notification_signal)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3583,6 +3594,8 @@ TEST(user_notification_child_pid_ns)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3623,6 +3636,8 @@ TEST(user_notification_sibling_pid_ns)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3691,6 +3706,8 @@ TEST(user_notification_fault_recv)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3972,6 +3989,8 @@ TEST(user_notification_addfd)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -4099,6 +4118,8 @@ TEST(user_notification_addfd_rlimit)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> 
> base-commit: 4525c8781ec0701ce824e8bd379ae1b129e26568
> -- 
> 2.29.1.341.ge80a0c044ae-goog
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2020-11-04 11:21 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-02 20:37 [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone Jann Horn
2020-11-02 20:37 ` [PATCH 2/3] samples: seccomp: simplify user-trap sample Jann Horn
2020-11-02 20:37 ` [PATCH 3/3] selftests/seccomp: Test NOTIF_RECV empty/dead errors Jann Horn
2020-11-03  9:44 ` [PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone Christian Brauner
2020-11-04 11:18 ` Sargun Dhillon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).