All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] seccomp: add the synchronous mode for seccomp_unotify
@ 2022-08-30  1:43 Andrei Vagin
  2022-08-30  1:43 ` [PATCH 1/4] seccomp: don't use semaphore and wait_queue together Andrei Vagin
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Andrei Vagin @ 2022-08-30  1:43 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Andy Lutomirski, Christian Brauner,
	Dietmar Eggemann, Kees Cook, Ingo Molnar, Juri Lelli,
	Peter Oskolkov, Peter Zijlstra, Tycho Andersen, Will Drewry,
	Vincent Guittot

seccomp_unotify allows more privileged processes to perform actions on
behalf of less privileged processes.

In many cases, the workflow is fully synchronous. It means a target
process triggers a system call and passes control to a supervisor
process that handles the system call and returns control to the target
process. In this context, "synchronous" means that only one process is
running and another one is waiting.

The new WF_CURRENT_CPU flag advises the scheduler to move the wakee to
the current CPU. For such synchronous workflows, it makes context
switches a few times faster.

Right now, each interaction takes 12µs. With this patch, it takes about
3µs.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Peter Oskolkov <posk@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tycho Andersen <tycho@tycho.pizza>
Cc: Will Drewry <wad@chromium.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>

Andrei Vagin (3):
  seccomp: don't use semaphore and wait_queue together
  sched: add a few helpers to wake up tasks on the current cpu
  seccomp: add the synchronous mode for seccomp_unotify

Peter Oskolkov (1):
  sched/umcg: add WF_CURRENT_CPU and externise ttwu

 include/linux/completion.h   |  1 +
 include/linux/swait.h        |  1 +
 include/linux/wait.h         |  3 ++
 include/uapi/linux/seccomp.h |  4 ++
 kernel/sched/completion.c    | 12 ++++++
 kernel/sched/core.c          |  5 +--
 kernel/sched/fair.c          |  4 ++
 kernel/sched/sched.h         | 13 ++++---
 kernel/sched/swait.c         | 11 ++++++
 kernel/sched/wait.c          |  5 +++
 kernel/seccomp.c             | 73 +++++++++++++++++++++++++++++-------
 11 files changed, 110 insertions(+), 22 deletions(-)

-- 
2.37.2


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 1/4] seccomp: don't use semaphore and wait_queue together
  2022-08-30  1:43 [PATCH 0/4] seccomp: add the synchronous mode for seccomp_unotify Andrei Vagin
@ 2022-08-30  1:43 ` Andrei Vagin
  2022-08-30  9:57   ` Christian Brauner
  2022-08-30  1:43 ` [PATCH 2/4] sched: add WF_CURRENT_CPU and externise ttwu Andrei Vagin
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Andrei Vagin @ 2022-08-30  1:43 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Andy Lutomirski, Christian Brauner,
	Dietmar Eggemann, Kees Cook, Ingo Molnar, Juri Lelli,
	Peter Oskolkov, Peter Zijlstra, Tycho Andersen, Will Drewry,
	Vincent Guittot

There is no reason to use two different primitives that do similar things.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 kernel/seccomp.c | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e9852d1b4a5e..667fd2d89464 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -145,7 +145,7 @@ struct seccomp_kaddfd {
  * @notifications: A list of struct seccomp_knotif elements.
  */
 struct notification {
-	struct semaphore request;
+	atomic_t requests;
 	u64 next_id;
 	struct list_head notifications;
 };
@@ -1116,7 +1116,7 @@ static int seccomp_do_user_notification(int this_syscall,
 	list_add_tail(&n.list, &match->notif->notifications);
 	INIT_LIST_HEAD(&n.addfd);
 
-	up(&match->notif->request);
+	atomic_add(1, &match->notif->requests);
 	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
 
 	/*
@@ -1388,8 +1388,10 @@ static long seccomp_set_mode_strict(void)
 #ifdef CONFIG_SECCOMP_FILTER
 static void seccomp_notify_free(struct seccomp_filter *filter)
 {
-	kfree(filter->notif);
-	filter->notif = NULL;
+	struct notification *notif = filter->notif;
+
+	WRITE_ONCE(filter->notif, NULL);
+	kfree_rcu(notif);
 }
 
 static void seccomp_notify_detach(struct seccomp_filter *filter)
@@ -1450,6 +1452,16 @@ find_notification(struct seccomp_filter *filter, u64 id)
 	return NULL;
 }
 
+static bool notify_wakeup(struct seccomp_filter *filter)
+{
+	bool ret;
+
+	rcu_read_lock();
+	ret = atomic_add_unless(&filter->notif->requests, -1, 0);
+	rcu_read_unlock();
+
+	return ret;
+}
 
 static long seccomp_notify_recv(struct seccomp_filter *filter,
 				void __user *buf)
@@ -1467,7 +1479,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
 
 	memset(&unotif, 0, sizeof(unotif));
 
-	ret = down_interruptible(&filter->notif->request);
+	ret = wait_event_interruptible(filter->wqh, notify_wakeup(filter));
 	if (ret < 0)
 		return ret;
 
@@ -1515,7 +1527,8 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
 			if (should_sleep_killable(filter, knotif))
 				complete(&knotif->ready);
 			knotif->state = SECCOMP_NOTIFY_INIT;
-			up(&filter->notif->request);
+			atomic_add(1, &filter->notif->requests);
+			wake_up_poll(&filter->wqh, EPOLLIN | EPOLLRDNORM);
 		}
 		mutex_unlock(&filter->notify_lock);
 	}
@@ -1771,15 +1784,15 @@ static const struct file_operations seccomp_notify_ops = {
 static struct file *init_listener(struct seccomp_filter *filter)
 {
 	struct file *ret;
+	struct notification *notif;
 
 	ret = ERR_PTR(-ENOMEM);
-	filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
-	if (!filter->notif)
+	notif = kzalloc(sizeof(*notif), GFP_KERNEL);
+	if (!notif)
 		goto out;
 
-	sema_init(&filter->notif->request, 0);
-	filter->notif->next_id = get_random_u64();
-	INIT_LIST_HEAD(&filter->notif->notifications);
+	notif->next_id = get_random_u64();
+	INIT_LIST_HEAD(&notif->notifications);
 
 	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
 				 filter, O_RDWR);
@@ -1788,10 +1801,11 @@ static struct file *init_listener(struct seccomp_filter *filter)
 
 	/* The file has a reference to it now */
 	__get_seccomp_filter(filter);
+	WRITE_ONCE(filter->notif, notif);
 
 out_notif:
 	if (IS_ERR(ret))
-		seccomp_notify_free(filter);
+		kfree(notif);
 out:
 	return ret;
 }
-- 
2.37.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/4] sched: add WF_CURRENT_CPU and externise ttwu
  2022-08-30  1:43 [PATCH 0/4] seccomp: add the synchronous mode for seccomp_unotify Andrei Vagin
  2022-08-30  1:43 ` [PATCH 1/4] seccomp: don't use semaphore and wait_queue together Andrei Vagin
@ 2022-08-30  1:43 ` Andrei Vagin
  2022-08-30  1:43 ` [PATCH 3/4] sched: add a few helpers to wake up tasks on the current cpu Andrei Vagin
  2022-08-30  1:43 ` [PATCH 4/4] seccomp: add the synchronous mode for seccomp_unotify Andrei Vagin
  3 siblings, 0 replies; 10+ messages in thread
From: Andrei Vagin @ 2022-08-30  1:43 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Andy Lutomirski, Christian Brauner,
	Dietmar Eggemann, Kees Cook, Ingo Molnar, Juri Lelli,
	Peter Oskolkov, Peter Zijlstra, Tycho Andersen, Will Drewry,
	Vincent Guittot

From: Peter Oskolkov <posk@google.com>

Add WF_CURRENT_CPU wake flag that advises the scheduler to
move the wakee to the current CPU. This is useful for fast on-CPU
context switching use cases such as UMCG.

In addition, make ttwu external rather than static so that
the flag could be passed to it from outside of sched/core.c.

Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 kernel/sched/core.c  |  3 +--
 kernel/sched/fair.c  |  4 ++++
 kernel/sched/sched.h | 13 ++++++++-----
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee28253c9ac0..008be12c31e6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4045,8 +4045,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
  * Return: %true if @p->state changes (an actual wakeup was done),
  *	   %false otherwise.
  */
-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
 	unsigned long flags;
 	int cpu, success = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 914096c5b1ae..7b043870b634 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7028,6 +7028,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 	if (wake_flags & WF_TTWU) {
 		record_wakee(p);
 
+		if ((wake_flags & WF_CURRENT_CPU) &&
+		    cpumask_test_cpu(cpu, p->cpus_ptr))
+			return cpu;
+
 		if (sched_energy_enabled()) {
 			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
 			if (new_cpu >= 0)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e26688d387ae..6e93e8808bfd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2080,12 +2080,13 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 }
 
 /* Wake flags. The first three directly map to some SD flag value */
-#define WF_EXEC     0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
-#define WF_FORK     0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
-#define WF_TTWU     0x08 /* Wakeup;            maps to SD_BALANCE_WAKE */
+#define WF_EXEC         0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK         0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU         0x08 /* Wakeup;            maps to SD_BALANCE_WAKE */
 
-#define WF_SYNC     0x10 /* Waker goes to sleep after wakeup */
-#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_SYNC         0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED     0x20 /* Internal use, task got migrated */
+#define WF_CURRENT_CPU  0x40 /* Prefer to move the wakee to the current CPU. */
 
 #ifdef CONFIG_SMP
 static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -3151,6 +3152,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
 extern void swake_up_all_locked(struct swait_queue_head *q);
 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
 
+extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags);
+
 #ifdef CONFIG_PREEMPT_DYNAMIC
 extern int preempt_dynamic_mode;
 extern int sched_dynamic_mode(const char *str);
-- 
2.37.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 3/4] sched: add a few helpers to wake up tasks on the current cpu
  2022-08-30  1:43 [PATCH 0/4] seccomp: add the synchronous mode for seccomp_unotify Andrei Vagin
  2022-08-30  1:43 ` [PATCH 1/4] seccomp: don't use semaphore and wait_queue together Andrei Vagin
  2022-08-30  1:43 ` [PATCH 2/4] sched: add WF_CURRENT_CPU and externise ttwu Andrei Vagin
@ 2022-08-30  1:43 ` Andrei Vagin
  2022-08-30  1:43 ` [PATCH 4/4] seccomp: add the synchronous mode for seccomp_unotify Andrei Vagin
  3 siblings, 0 replies; 10+ messages in thread
From: Andrei Vagin @ 2022-08-30  1:43 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Andy Lutomirski, Christian Brauner,
	Dietmar Eggemann, Kees Cook, Ingo Molnar, Juri Lelli,
	Peter Oskolkov, Peter Zijlstra, Tycho Andersen, Will Drewry,
	Vincent Guittot

Add complete_on_current_cpu, wake_up_poll_on_current_cpu helpers to wake
up processes on the current CPU.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 include/linux/completion.h |  1 +
 include/linux/swait.h      |  1 +
 include/linux/wait.h       |  3 +++
 kernel/sched/completion.c  | 12 ++++++++++++
 kernel/sched/core.c        |  2 +-
 kernel/sched/swait.c       | 11 +++++++++++
 kernel/sched/wait.c        |  5 +++++
 7 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/include/linux/completion.h b/include/linux/completion.h
index 51d9ab079629..1699e697a225 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -115,6 +115,7 @@ extern bool try_wait_for_completion(struct completion *x);
 extern bool completion_done(struct completion *x);
 
 extern void complete(struct completion *);
+extern void complete_on_current_cpu(struct completion *x);
 extern void complete_all(struct completion *);
 
 #endif
diff --git a/include/linux/swait.h b/include/linux/swait.h
index 6a8c22b8c2a5..1f27b254adf5 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -147,6 +147,7 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
 extern void swake_up_one(struct swait_queue_head *q);
 extern void swake_up_all(struct swait_queue_head *q);
 extern void swake_up_locked(struct swait_queue_head *q);
+extern void swake_up_locked_on_current_cpu(struct swait_queue_head *q);
 
 extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state);
 extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 58cfbf81447c..dcd01dd4de3e 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -210,6 +210,7 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
 }
 
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
 		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
@@ -237,6 +238,8 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head);
 #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
 #define wake_up_poll(x, m)							\
 	__wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
+#define wake_up_poll_on_current_cpu(x, m)					\
+	__wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m))
 #define wake_up_locked_poll(x, m)						\
 	__wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))
 #define wake_up_interruptible_poll(x, m)					\
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 35f15c26ed54..1ae9b34822ef 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -38,6 +38,18 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
+void complete_on_current_cpu(struct completion *x)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&x->wait.lock, flags);
+
+	if (x->done != UINT_MAX)
+		x->done++;
+	swake_up_locked_on_current_cpu(&x->wait);
+	raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+
 /**
  * complete_all: - signals all threads waiting on this completion
  * @x:  holds the state of this particular completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 008be12c31e6..1e164d8fde1a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6813,7 +6813,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
 	return try_to_wake_up(curr->private, mode, wake_flags);
 }
 EXPORT_SYMBOL(default_wake_function);
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 76b9b796e695..9ebe23868942 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -31,6 +31,17 @@ void swake_up_locked(struct swait_queue_head *q)
 }
 EXPORT_SYMBOL(swake_up_locked);
 
+void swake_up_locked_on_current_cpu(struct swait_queue_head *q)
+{
+	struct swait_queue *curr;
+
+	if (list_empty(&q->task_list))
+		return;
+
+	curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+	try_to_wake_up(curr->task, TASK_NORMAL, WF_CURRENT_CPU);
+	list_del_init(&curr->task_list);
+}
 /*
  * Wake up all waiters. This is an interface which is solely exposed for
  * completions and not for general usage.
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9860bb9a847c..9a78bca79419 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -157,6 +157,11 @@ void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
 }
 EXPORT_SYMBOL(__wake_up);
 
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key)
+{
+	__wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key);
+}
+
 /*
  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
  */
-- 
2.37.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 4/4] seccomp: add the synchronous mode for seccomp_unotify
  2022-08-30  1:43 [PATCH 0/4] seccomp: add the synchronous mode for seccomp_unotify Andrei Vagin
                   ` (2 preceding siblings ...)
  2022-08-30  1:43 ` [PATCH 3/4] sched: add a few helpers to wake up tasks on the current cpu Andrei Vagin
@ 2022-08-30  1:43 ` Andrei Vagin
  2022-08-30 10:43   ` Christian Brauner
  3 siblings, 1 reply; 10+ messages in thread
From: Andrei Vagin @ 2022-08-30  1:43 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Andy Lutomirski, Christian Brauner,
	Dietmar Eggemann, Kees Cook, Ingo Molnar, Juri Lelli,
	Peter Oskolkov, Peter Zijlstra, Tycho Andersen, Will Drewry,
	Vincent Guittot

seccomp_unotify allows more privileged processes to perform actions on
behalf of less privileged processes.

In many cases, the workflow is fully synchronous. It means a target
process triggers a system call and passes control to a supervisor
process that handles the system call and returns control to the target
process. In this context, "synchronous" means that only one process is
running and another one is waiting.

There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
move the wakee to the current CPU. For such synchronous workflows, it
makes context switches a few times faster.

Right now, each interaction takes 12µs. With this patch, it takes about
3µs.

This change introduces the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
is used to enable the sync mode.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 include/uapi/linux/seccomp.h |  4 ++++
 kernel/seccomp.c             | 35 +++++++++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 0fdc6ef02b94..dbfc9b37fcae 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -115,6 +115,8 @@ struct seccomp_notif_resp {
 	__u32 flags;
 };
 
+#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
+
 /* valid flags for seccomp_notif_addfd */
 #define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
 #define SECCOMP_ADDFD_FLAG_SEND		(1UL << 1) /* Addfd and return it, atomically */
@@ -150,4 +152,6 @@ struct seccomp_notif_addfd {
 #define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3, \
 						struct seccomp_notif_addfd)
 
+#define SECCOMP_IOCTL_NOTIF_SET_FLAGS	SECCOMP_IOW(4, __u64)
+
 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 667fd2d89464..c24900eb8ced 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,11 +143,14 @@ struct seccomp_kaddfd {
  *           filter->notify_lock.
  * @next_id: The id of the next request.
  * @notifications: A list of struct seccomp_knotif elements.
+ * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
  */
+
 struct notification {
 	atomic_t requests;
 	u64 next_id;
 	struct list_head notifications;
+	int flags;
 };
 
 #ifdef SECCOMP_ARCH_NATIVE
@@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall,
 	INIT_LIST_HEAD(&n.addfd);
 
 	atomic_add(1, &match->notif->requests);
-	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
+	if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+		wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
+	else
+		wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
 
 	/*
 	 * This is where we wait for a reply from userspace.
@@ -1574,7 +1580,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
 	knotif->error = resp.error;
 	knotif->val = resp.val;
 	knotif->flags = resp.flags;
-	complete(&knotif->ready);
+	if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+		complete_on_current_cpu(&knotif->ready);
+	else
+		complete(&knotif->ready);
 out:
 	mutex_unlock(&filter->notify_lock);
 	return ret;
@@ -1604,6 +1613,26 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
 	return ret;
 }
 
+static long seccomp_notify_set_flags(struct seccomp_filter *filter,
+				    void __user *buf)
+{
+	u64 flags;
+	long ret;
+
+	if (copy_from_user(&flags, buf, sizeof(flags)))
+		return -EFAULT;
+
+	if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+	filter->notif->flags = flags;
+	mutex_unlock(&filter->notify_lock);
+	return ret;
+}
+
 static long seccomp_notify_addfd(struct seccomp_filter *filter,
 				 struct seccomp_notif_addfd __user *uaddfd,
 				 unsigned int size)
@@ -1733,6 +1762,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
 	case SECCOMP_IOCTL_NOTIF_ID_VALID:
 		return seccomp_notify_id_valid(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
+		return seccomp_notify_set_flags(filter, buf);
 	}
 
 	/* Extensible Argument ioctls */
-- 
2.37.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/4] seccomp: don't use semaphore and wait_queue together
  2022-08-30  1:43 ` [PATCH 1/4] seccomp: don't use semaphore and wait_queue together Andrei Vagin
@ 2022-08-30  9:57   ` Christian Brauner
  2022-08-30 16:07     ` Andrei Vagin
  0 siblings, 1 reply; 10+ messages in thread
From: Christian Brauner @ 2022-08-30  9:57 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: linux-kernel, Andy Lutomirski, Dietmar Eggemann, Kees Cook,
	Ingo Molnar, Juri Lelli, Peter Oskolkov, Peter Zijlstra,
	Tycho Andersen, Will Drewry, Vincent Guittot

On Mon, Aug 29, 2022 at 06:43:53PM -0700, Andrei Vagin wrote:
> Here is no reason to use two different primitives that do similar things.
> 
> Signed-off-by: Andrei Vagin <avagin@gmail.com>
> ---

I dug into the old threads a bit and there was no specific reason given
why we used the sempahore afaict. It was there from the beginning and
the wait queue got added to support polling. Maybe I'm missing why we
need both but it's not obvious to me right now as well. So I tend to
agree that we should get rid of it.

>  kernel/seccomp.c | 38 ++++++++++++++++++++++++++------------
>  1 file changed, 26 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index e9852d1b4a5e..667fd2d89464 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -145,7 +145,7 @@ struct seccomp_kaddfd {
>   * @notifications: A list of struct seccomp_knotif elements.
>   */
>  struct notification {
> -	struct semaphore request;
> +	atomic_t requests;
>  	u64 next_id;
>  	struct list_head notifications;
>  };
> @@ -1116,7 +1116,7 @@ static int seccomp_do_user_notification(int this_syscall,
>  	list_add_tail(&n.list, &match->notif->notifications);
>  	INIT_LIST_HEAD(&n.addfd);
>  
> -	up(&match->notif->request);
> +	atomic_add(1, &match->notif->requests);

atomic_inc()?

>  	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
>  
>  	/*
> @@ -1388,8 +1388,10 @@ static long seccomp_set_mode_strict(void)
>  #ifdef CONFIG_SECCOMP_FILTER
>  static void seccomp_notify_free(struct seccomp_filter *filter)
>  {
> -	kfree(filter->notif);
> -	filter->notif = NULL;
> +	struct notification *notif = filter->notif;
> +
> +	WRITE_ONCE(filter->notif, NULL);
> +	kfree_rcu(notif);
>  }
>  
>  static void seccomp_notify_detach(struct seccomp_filter *filter)
> @@ -1450,6 +1452,16 @@ find_notification(struct seccomp_filter *filter, u64 id)
>  	return NULL;
>  }
>  
> +static bool notify_wakeup(struct seccomp_filter *filter)
> +{
> +	bool ret;
> +
> +	rcu_read_lock();
> +	ret = atomic_add_unless(&filter->notif->requests, -1, 0);

Can you please spell out why the change to wait_event_interruptible()
below that calls notify_wakeup() makes it necessary to rcu protect
->notif?

Given that you're using rcu_read_lock() here and the
WRITE_ONCE(fitler->notify, NULL) + kfree_rcu() sequence in
seccomp_notify_free() it seems to imply that filter->notif could be NULL
here and so would need a non-NULL check on ->notif before incrementing?

> +	rcu_read_unlock();
> +
> +	return ret;
> +}
>  
>  static long seccomp_notify_recv(struct seccomp_filter *filter,
>  				void __user *buf)
> @@ -1467,7 +1479,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  
>  	memset(&unotif, 0, sizeof(unotif));
>  
> -	ret = down_interruptible(&filter->notif->request);
> +	ret = wait_event_interruptible(filter->wqh, notify_wakeup(filter));
>  	if (ret < 0)
>  		return ret;
>  
> @@ -1515,7 +1527,8 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  			if (should_sleep_killable(filter, knotif))
>  				complete(&knotif->ready);
>  			knotif->state = SECCOMP_NOTIFY_INIT;
> -			up(&filter->notif->request);
> +			atomic_add(1, &filter->notif->requests);

atomic_inc()?

> +			wake_up_poll(&filter->wqh, EPOLLIN | EPOLLRDNORM);
>  		}
>  		mutex_unlock(&filter->notify_lock);
>  	}
> @@ -1771,15 +1784,15 @@ static const struct file_operations seccomp_notify_ops = {
>  static struct file *init_listener(struct seccomp_filter *filter)
>  {
>  	struct file *ret;
> +	struct notification *notif;
>  
>  	ret = ERR_PTR(-ENOMEM);
> -	filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
> -	if (!filter->notif)
> +	notif = kzalloc(sizeof(*notif), GFP_KERNEL);
> +	if (!notif)
>  		goto out;
>  
> -	sema_init(&filter->notif->request, 0);
> -	filter->notif->next_id = get_random_u64();
> -	INIT_LIST_HEAD(&filter->notif->notifications);
> +	notif->next_id = get_random_u64();
> +	INIT_LIST_HEAD(&notif->notifications);
>  
>  	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
>  				 filter, O_RDWR);
> @@ -1788,10 +1801,11 @@ static struct file *init_listener(struct seccomp_filter *filter)
>  
>  	/* The file has a reference to it now */
>  	__get_seccomp_filter(filter);
> +	WRITE_ONCE(filter->notif, notif);
>  
>  out_notif:
>  	if (IS_ERR(ret))
> -		seccomp_notify_free(filter);
> +		kfree(notif);
>  out:
>  	return ret;
>  }
> -- 
> 2.37.2
> 
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/4] seccomp: add the synchronous mode for seccomp_unotify
  2022-08-30  1:43 ` [PATCH 4/4] seccomp: add the synchronous mode for seccomp_unotify Andrei Vagin
@ 2022-08-30 10:43   ` Christian Brauner
  2022-08-30 21:23     ` Andrei Vagin
  0 siblings, 1 reply; 10+ messages in thread
From: Christian Brauner @ 2022-08-30 10:43 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: linux-kernel, Andy Lutomirski, Dietmar Eggemann, Kees Cook,
	Ingo Molnar, Juri Lelli, Peter Oskolkov, Peter Zijlstra,
	Tycho Andersen, Will Drewry, Vincent Guittot

On Mon, Aug 29, 2022 at 06:43:56PM -0700, Andrei Vagin wrote:
> seccomp_unotify allows more privileged processes does actions on behalf
> of less privileged processes.
> 
> In many cases, the workflow is fully synchronous. It means a target
> process triggers a system call and passes controls to a supervisor
> process that handles the system call and returns controls to the target
> process. In this context, "synchronous" means that only one process is
> running and another one is waiting.
> 
> There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
> move the wakee to the current CPU. For such synchronous workflows, it
> makes context switches a few times faster.
> 
> Right now, each interaction takes 12µs. With this patch, it takes about
> 3µs.

Seems like a nice idea though I leave it to the sched people to judge
whether this is sane or not. So the supervisor which gets woken will be
moved to the current cpu in this synchronous scenario.

I have no strong opinions on this patch. There are two things I wonder
about. First, how meaningful is that speed up given that the supervisor
will most often do a lot of heavy-handed things anyway.

Second, this flag is a very specific thing and I wonder how much
userspace will really use this and what's more use this correctly.

Just to note that LXD - one of the biggest user of this feature - isn't
synchronous iiuc for example. Each container gets a separate seccomp
supervisor thread (well, go routine but whatever) which exposes a socket
that the container manager connects to and sends the seccomp
notifications it received from its payload according to an api we
established. And each notification is handled in a separate thread
(again, go routine but whatever).

> 
> This change introduce the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
> it used to enable the sync mode.
> 
> Signed-off-by: Andrei Vagin <avagin@gmail.com>
> ---
>  include/uapi/linux/seccomp.h |  4 ++++
>  kernel/seccomp.c             | 35 +++++++++++++++++++++++++++++++++--
>  2 files changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index 0fdc6ef02b94..dbfc9b37fcae 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -115,6 +115,8 @@ struct seccomp_notif_resp {
>  	__u32 flags;
>  };
>  
> +#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
> +
>  /* valid flags for seccomp_notif_addfd */
>  #define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
>  #define SECCOMP_ADDFD_FLAG_SEND		(1UL << 1) /* Addfd and return it, atomically */
> @@ -150,4 +152,6 @@ struct seccomp_notif_addfd {
>  #define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3, \
>  						struct seccomp_notif_addfd)
>  
> +#define SECCOMP_IOCTL_NOTIF_SET_FLAGS	SECCOMP_IOW(4, __u64)
> +
>  #endif /* _UAPI_LINUX_SECCOMP_H */
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 667fd2d89464..c24900eb8ced 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -143,11 +143,14 @@ struct seccomp_kaddfd {
>   *           filter->notify_lock.
>   * @next_id: The id of the next request.
>   * @notifications: A list of struct seccomp_knotif elements.
> + * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
>   */
> +
>  struct notification {
>  	atomic_t requests;
>  	u64 next_id;
>  	struct list_head notifications;
> +	int flags;
>  };
>  
>  #ifdef SECCOMP_ARCH_NATIVE
> @@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall,
>  	INIT_LIST_HEAD(&n.addfd);
>  
>  	atomic_add(1, &match->notif->requests);
> -	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> +	if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> +		wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
> +	else
> +		wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);

(We're accumulating a lot of conditional wake primitives in the notifier.)

>  
>  	/*
>  	 * This is where we wait for a reply from userspace.
> @@ -1574,7 +1580,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
>  	knotif->error = resp.error;
>  	knotif->val = resp.val;
>  	knotif->flags = resp.flags;
> -	complete(&knotif->ready);
> +	if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> +		complete_on_current_cpu(&knotif->ready);
> +	else
> +		complete(&knotif->ready);
>  out:
>  	mutex_unlock(&filter->notify_lock);
>  	return ret;
> @@ -1604,6 +1613,26 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
>  	return ret;
>  }
>  
> +static long seccomp_notify_set_flags(struct seccomp_filter *filter,
> +				    void __user *buf)
> +{
> +	u64 flags;
> +	long ret;
> +
> +	if (copy_from_user(&flags, buf, sizeof(flags)))
> +		return -EFAULT;
> +
> +	if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> +		return -EINVAL;
> +
> +	ret = mutex_lock_interruptible(&filter->notify_lock);
> +	if (ret < 0)
> +		return ret;
> +	filter->notif->flags = flags;

Might be better to just keep the uapi type and the in-kernel type in sync.

> +	mutex_unlock(&filter->notify_lock);
> +	return ret;
> +}
> +
>  static long seccomp_notify_addfd(struct seccomp_filter *filter,
>  				 struct seccomp_notif_addfd __user *uaddfd,
>  				 unsigned int size)
> @@ -1733,6 +1762,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
>  	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
>  	case SECCOMP_IOCTL_NOTIF_ID_VALID:
>  		return seccomp_notify_id_valid(filter, buf);
> +	case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
> +		return seccomp_notify_set_flags(filter, buf);
>  	}
>  
>  	/* Extensible Argument ioctls */
> -- 
> 2.37.2
> 
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/4] seccomp: don't use semaphore and wait_queue together
  2022-08-30  9:57   ` Christian Brauner
@ 2022-08-30 16:07     ` Andrei Vagin
  0 siblings, 0 replies; 10+ messages in thread
From: Andrei Vagin @ 2022-08-30 16:07 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-kernel, Andy Lutomirski, Dietmar Eggemann, Kees Cook,
	Ingo Molnar, Juri Lelli, Peter Oskolkov, Peter Zijlstra,
	Tycho Andersen, Will Drewry, Vincent Guittot

On Tue, Aug 30, 2022 at 2:57 AM Christian Brauner <brauner@kernel.org> wrote:

> >
> > +static bool notify_wakeup(struct seccomp_filter *filter)
> > +{
> > +     bool ret;
> > +
> > +     rcu_read_lock();
> > +     ret = atomic_add_unless(&filter->notif->requests, -1, 0);
>
> Can you please spell out why the change to wait_event_interruptible()
> below that calls notify_wakeup() makes it necessary to rcu protect
> ->notif?

This is my mistake. The rcu usage is left over from when I tried to implement
notify_wakeup without introducing notif->requests. The idea was to enumerate
all elements of notif->notifications. Now, it doesn't matter. In this context,
filter->notif can be dereferenced without any additional locks. Thanks for
catching this.

>
> Given that you're using rcu_read_lock() here and the
> WRITE_ONCE(fitler->notify, NULL) + kfree_rcu() sequence in
> seccomp_notify_free() it seems to imply that filter->notif could be NULL
> here and so would need a non-NULL check on ->notif before incrementing?
>
> > +     rcu_read_unlock();
> > +
> > +     return ret;
> > +}
>
>  static long seccomp_notify_recv(struct seccomp_filter *filter,
>                               void __user *buf)
> @@ -1467,7 +1479,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>
>       memset(&unotif, 0, sizeof(unotif));
>
> -     ret = down_interruptible(&filter->notif->request);
> +     ret = wait_event_interruptible(filter->wqh, notify_wakeup(filter));
>       if (ret < 0)
>               return ret;
>

Thanks,
Andrei

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/4] seccomp: add the synchronous mode for seccomp_unotify
  2022-08-30 10:43   ` Christian Brauner
@ 2022-08-30 21:23     ` Andrei Vagin
  2022-09-01 13:58       ` Christian Brauner
  0 siblings, 1 reply; 10+ messages in thread
From: Andrei Vagin @ 2022-08-30 21:23 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-kernel, Andy Lutomirski, Dietmar Eggemann, Kees Cook,
	Ingo Molnar, Juri Lelli, Peter Oskolkov, Peter Zijlstra,
	Tycho Andersen, Will Drewry, Vincent Guittot

On Tue, Aug 30, 2022 at 3:43 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Mon, Aug 29, 2022 at 06:43:56PM -0700, Andrei Vagin wrote:
> > seccomp_unotify allows more privileged processes to perform actions on
> > behalf of less privileged processes.
> >
> > In many cases, the workflow is fully synchronous. It means a target
> > process triggers a system call and passes controls to a supervisor
> > process that handles the system call and returns controls to the target
> > process. In this context, "synchronous" means that only one process is
> > running and another one is waiting.
> >
> > There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
> > move the wakee to the current CPU. For such synchronous workflows, it
> > makes context switches a few times faster.
> >
> > Right now, each interaction takes 12µs. With this patch, it takes about
> > 3µs.
>
> Seems like a nice idea though I leave it to the sched people to judge
> whether this is sane or not. So the supervisor which gets woken will be
> moved to the current cpu in this synchronous scenario.
>
> I have no strong opinions on this patch. There are two things I wonder
> about. First, how meaningful is that speed up given that the supervisor
> will most often do a lot of heavy-handed things anyway.

I would not use the "most often" phrase in this case;). It is true for LXC-like
use cases when we need to handle rare syscalls. In this case, the performance
of this interface doesn't play a big role. But my use case is very different. I
have a prototype of the gVisor platform, where seccomp is used to trap
guest system calls. In this case, the difference between 12µs and 3µs is
tremendous.

The idea of WF_CURRENT_CPU is not mine. I spied it from the umcg series.
I took the second patch from that series without any changes.

>
> Second, this flag is a very specific thing and I wonder how much
> userspace will really use this and what's more use this correctly.
>
> Just to note that LXD - one of the biggest user of this feature - isn't
> synchronous iiuc for example. Each container gets a separate seccomp
> supervisor thread (well, go routine but whatever) which exposes a socket
> that the container manager connects to and sends the seccomp
> notifications it received from its payload according to an api we
> established. And each notification is handled in a separate thread
> (again, go routine but whatever).

It could be synchronous if seccomp events had been handled in [lxc monitor]. But
right now, [lxc monitor] is just a proxy. In this case, you are right, lxc will
not get any benefits by setting this flag. But we can look at this from another
side. If we add these changes, we will have another big user of the interface. I
think the number of gVisor containers that are started each day is comparable
with the number of LXC/LXD containers.

>
> >
> > This change introduces the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
> > is used to enable the sync mode.
> >
> > Signed-off-by: Andrei Vagin <avagin@gmail.com>
> > ---
> >  include/uapi/linux/seccomp.h |  4 ++++
> >  kernel/seccomp.c             | 35 +++++++++++++++++++++++++++++++++--
> >  2 files changed, 37 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> > index 0fdc6ef02b94..dbfc9b37fcae 100644
> > --- a/include/uapi/linux/seccomp.h
> > +++ b/include/uapi/linux/seccomp.h
> > @@ -115,6 +115,8 @@ struct seccomp_notif_resp {
> >       __u32 flags;
> >  };
> >

<snip>

> >
> >  #ifdef SECCOMP_ARCH_NATIVE
> > @@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall,
> >       INIT_LIST_HEAD(&n.addfd);
> >
> >       atomic_add(1, &match->notif->requests);
> > -     wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> > +     if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> > +             wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
> > +     else
> > +             wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
>
> (We're accumulating a lot of conditional wake primitives in the notifier.)
>

I am not sure that I understand what you mean here.

Thanks,
Andrei.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/4] seccomp: add the synchronous mode for seccomp_unotify
  2022-08-30 21:23     ` Andrei Vagin
@ 2022-09-01 13:58       ` Christian Brauner
  0 siblings, 0 replies; 10+ messages in thread
From: Christian Brauner @ 2022-09-01 13:58 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: linux-kernel, Andy Lutomirski, Dietmar Eggemann, Kees Cook,
	Ingo Molnar, Juri Lelli, Peter Oskolkov, Peter Zijlstra,
	Tycho Andersen, Will Drewry, Vincent Guittot

On Tue, Aug 30, 2022 at 02:23:24PM -0700, Andrei Vagin wrote:
> On Tue, Aug 30, 2022 at 3:43 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Mon, Aug 29, 2022 at 06:43:56PM -0700, Andrei Vagin wrote:
> > > seccomp_unotify allows more privileged processes to perform actions on
> > > behalf of less privileged processes.
> > >
> > > In many cases, the workflow is fully synchronous. It means a target
> > > process triggers a system call and passes controls to a supervisor
> > > process that handles the system call and returns controls to the target
> > > process. In this context, "synchronous" means that only one process is
> > > running and another one is waiting.
> > >
> > > There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
> > > move the wakee to the current CPU. For such synchronous workflows, it
> > > makes context switches a few times faster.
> > >
> > > Right now, each interaction takes 12µs. With this patch, it takes about
> > > 3µs.
> >
> > Seems like a nice idea though I leave it to the sched people to judge
> > whether this is sane or not. So the supervisor which gets woken will be
> > moved to the current cpu in this synchronous scenario.
> >
> > I have no strong opinions on this patch. There are two things I wonder
> > about. First, how meaningful is that speed up given that the supervisor
> > will most often do a lot of heavy-handed things anyway.
> 
> I would not use the "most often" phrase in this case;). It is true for LXC-like
> use cases when we need to handle rare syscalls. In this case, the performance
> of this interface doesn't play a big role. But my use case is very different. I
> have a prototype of the gVisor platform, where seccomp is used to trap
> guest system calls. In this case, the difference between 12µs and 3µs is
> tremendous.

Oh yeah, makes sense. I don't know enough about gVisor but I know we can
trust your word! :)

> 
> The idea of WF_CURRENT_CPU is not mine. I spied it from the umcg series.
> I took the second patch from that series without any changes.
> 
> >
> > Second, this flag is a very specific thing and I wonder how much
> > userspace will really use this and what's more use this correctly.
> >
> > Just to note that LXD - one of the biggest user of this feature - isn't
> > synchronous iiuc for example. Each container gets a separate seccomp
> > supervisor thread (well, go routine but whatever) which exposes a socket
> > that the container manager connects to and sends the seccomp
> > notifications it received from its payload according to an api we
> > established. And each notification is handled in a separate thread
> > (again, go routine but whatever).
> 
> It could be synchronous if seccomp events had been handled in [lxc monitor]. But
> right now, [lxc monitor] is just a proxy. In this case, you are right, lxc will

Yep.

> not get any benefits by setting this flag. But we can look at this from another
> side. If we add these changes, we will have another big user of the interface. I
> think the number of gVisor containers that are started each day is comparable
> with the number of LXC/LXD containers.

Sure, if there's users that would benefit from this then no reason to
not consider it. It's just a lot of low-level knobs we give userspace
here but I guess for the notifier it makes sense.

> 
> >
> > >
> > > This change introduces the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
> > > is used to enable the sync mode.
> > >
> > > Signed-off-by: Andrei Vagin <avagin@gmail.com>
> > > ---
> > >  include/uapi/linux/seccomp.h |  4 ++++
> > >  kernel/seccomp.c             | 35 +++++++++++++++++++++++++++++++++--
> > >  2 files changed, 37 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> > > index 0fdc6ef02b94..dbfc9b37fcae 100644
> > > --- a/include/uapi/linux/seccomp.h
> > > +++ b/include/uapi/linux/seccomp.h
> > > @@ -115,6 +115,8 @@ struct seccomp_notif_resp {
> > >       __u32 flags;
> > >  };
> > >
> 
> <snip>
> 
> > >
> > >  #ifdef SECCOMP_ARCH_NATIVE
> > > @@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall,
> > >       INIT_LIST_HEAD(&n.addfd);
> > >
> > >       atomic_add(1, &match->notif->requests);
> > > -     wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> > > +     if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> > > +             wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
> > > +     else
> > > +             wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> >
> > (We're accumulating a lot of conditional wake primitives in the notifier.)
> >
> 
> I am not sure that I understand what you mean here.

I just meant that we have 

if (wait_killable)
	err = wait_for_completion_killable(&n.ready);
else
	err = wait_for_completion_interruptible(&n.ready);

and now also

if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
        wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
else
        wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);

which is a bit unpleasant but nothing that would mean we can't do this.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-09-01 13:58 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-30  1:43 [PATCH 0/4] seccomp: add the synchronous mode for seccomp_unotify Andrei Vagin
2022-08-30  1:43 ` [PATCH 1/4] seccomp: don't use semaphore and wait_queue together Andrei Vagin
2022-08-30  9:57   ` Christian Brauner
2022-08-30 16:07     ` Andrei Vagin
2022-08-30  1:43 ` [PATCH 2/4] sched: add WF_CURRENT_CPU and externise ttwu Andrei Vagin
2022-08-30  1:43 ` [PATCH 3/4] sched: add a few helpers to wake up tasks on the current cpu Andrei Vagin
2022-08-30  1:43 ` [PATCH 4/4] seccomp: add the synchronous mode for seccomp_unotify Andrei Vagin
2022-08-30 10:43   ` Christian Brauner
2022-08-30 21:23     ` Andrei Vagin
2022-09-01 13:58       ` Christian Brauner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.