From: Peter Oskolkov <posk@google.com>
To: Peter Zijlstra <peterz@infradead.org>,
Ingo Molnar <mingo@redhat.com>,
Thomas Gleixner <tglx@linutronix.de>,
linux-kernel@vger.kernel.org, linux-api@vger.kernel.org
Cc: Paul Turner <pjt@google.com>, Ben Segall <bsegall@google.com>,
Peter Oskolkov <posk@google.com>, Peter Oskolkov <posk@posk.io>,
Joel Fernandes <joel@joelfernandes.org>,
Andrew Morton <akpm@linux-foundation.org>,
Andrei Vagin <avagin@google.com>,
Jim Newsome <jnewsome@torproject.org>
Subject: [RFC PATCH v0.1 4/9] sched/umcg: implement core UMCG API
Date: Thu, 20 May 2021 11:36:09 -0700 [thread overview]
Message-ID: <20210520183614.1227046-5-posk@google.com> (raw)
In-Reply-To: <20210520183614.1227046-1-posk@google.com>
Implement version 1 of core UMCG API (wait/wake/swap).
As has been outlined in
https://lore.kernel.org/lkml/20200722234538.166697-1-posk@posk.io/,
efficient and synchronous on-CPU context switching is key
to enabling two broad use cases: in-process M:N userspace scheduling
and fast X-process RPCs for security wrappers.
High-level design considerations/approaches used:
- wait & wake can race with each other;
- offload as much work as possible to libumcg in tools/lib/umcg,
specifically:
- most state changes, e.g. RUNNABLE <=> RUNNING, are done in
the userspace (libumcg);
- retries are offloaded to the userspace.
This implementation misses timeout handling in sys_umcg_wait
and sys_umcg_swap, which will be added in version 2.
Signed-off-by: Peter Oskolkov <posk@google.com>
---
include/linux/sched.h | 7 +-
kernel/sched/core.c | 3 +
kernel/sched/umcg.c | 237 ++++++++++++++++++++++++++++++++++++++++--
kernel/sched/umcg.h | 42 ++++++++
4 files changed, 282 insertions(+), 7 deletions(-)
create mode 100644 kernel/sched/umcg.h
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c7e7d50e2fdc..fc4b8775f514 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -66,6 +66,7 @@ struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct umcg_task_data;
/*
* Task state bitmask. NOTE! These bits are also
@@ -778,6 +779,10 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
+#ifdef CONFIG_UMCG
+ struct umcg_task_data __rcu *umcg_task_data;
+#endif
+
/* Per-thread vma caching: */
struct vmacache vmacache;
@@ -1022,7 +1027,7 @@ struct task_struct {
u64 parent_exec_id;
u64 self_exec_id;
- /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
+ /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy, umcg: */
spinlock_t alloc_lock;
/* Protection of the PI data structures: */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88506bc2617f..462104f13c28 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3964,6 +3964,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
#endif
+#ifdef CONFIG_UMCG
+ rcu_assign_pointer(p->umcg_task_data, NULL);
+#endif
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
diff --git a/kernel/sched/umcg.c b/kernel/sched/umcg.c
index b8195cfdb76a..2d718433c773 100644
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -4,11 +4,23 @@
* User Managed Concurrency Groups (UMCG).
*/
+#include <linux/freezer.h>
#include <linux/syscalls.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/umcg.h>
+#include "sched.h"
+#include "umcg.h"
+
+/*
+ * Check whether @requested is a UMCG API version implemented here.
+ * Version 1 is the only one recognised.
+ *
+ * Returns 0 if @requested is supported, 1 otherwise.
+ */
+static int __api_version(u32 requested)
+{
+	return requested == 1 ? 0 : 1;
+}
+
/**
* sys_umcg_api_version - query UMCG API versions that are supported.
* @api_version: Requested API version.
@@ -26,7 +38,52 @@
*/
SYSCALL_DEFINE2(umcg_api_version, u32, api_version, u32, flags)
{
- return -ENOSYS;
+	/* Any non-zero @flags value is rejected. */
+	return flags ? -EINVAL : __api_version(api_version);
+}
+
+/*
+ * Fetch the task state from userspace into *@state. The state is read
+ * as the u32 located at the very start of *@ut.
+ *
+ * Returns the result of get_user() (zero on success).
+ */
+static int get_state(struct umcg_task __user *ut, u32 *state)
+{
+	u32 __user *state_ptr = (u32 __user *)ut;
+
+	return get_user(*state, state_ptr);
+}
+
+/*
+ * Store @state to userspace. The state is written as the u32 located
+ * at the very start of *@ut.
+ *
+ * Returns the result of put_user() (zero on success).
+ */
+static int put_state(struct umcg_task __user *ut, u32 state)
+{
+	u32 __user *state_ptr = (u32 __user *)ut;
+
+	return put_user(state, state_ptr);
+}
+
+/*
+ * Register current as a UMCG core task.
+ *
+ * @api_version: API version recorded in the new umcg_task_data.
+ * @umcg_task:   userspace struct of the task; its state must be
+ *               UMCG_TASK_NONE on entry and is set to UMCG_TASK_RUNNING
+ *               before the task data is published.
+ *
+ * Returns 0 on success, -EFAULT if the userspace state cannot be read or
+ * written, -EINVAL if the state is not UMCG_TASK_NONE, and -ENOMEM if the
+ * per-task data cannot be allocated.
+ */
+static int register_core_task(u32 api_version, struct umcg_task __user *umcg_task)
+{
+	struct umcg_task_data *utd;
+	u32 state;
+
+	if (get_state(umcg_task, &state))
+		return -EFAULT;
+
+	if (state != UMCG_TASK_NONE)
+		return -EINVAL;
+
+	utd = kzalloc(sizeof(*utd), GFP_KERNEL);
+	if (!utd)
+		return -ENOMEM;	/* allocation failure is not a bad argument */
+
+	utd->self = current;
+	utd->umcg_task = umcg_task;
+	utd->task_type = UMCG_TT_CORE;
+	utd->api_version = api_version;
+
+	/* Update the userspace state first so that a fault needs no unpublish. */
+	if (put_state(umcg_task, UMCG_TASK_RUNNING)) {
+		kfree(utd);
+		return -EFAULT;
+	}
+
+	task_lock(current);
+	rcu_assign_pointer(current->umcg_task_data, utd);
+	task_unlock(current);
+
+	return 0;
}
/**
@@ -54,7 +111,20 @@ SYSCALL_DEFINE2(umcg_api_version, u32, api_version, u32, flags)
SYSCALL_DEFINE4(umcg_register_task, u32, api_version, u32, flags, u32, group_id,
struct umcg_task __user *, umcg_task)
{
- return -ENOSYS;
+	if (__api_version(api_version))
+		return -EOPNOTSUPP;
+
+	/* The caller must pass a umcg_task and must not be registered already. */
+	if (!umcg_task || rcu_access_pointer(current->umcg_task_data))
+		return -EINVAL;
+
+	/*
+	 * UMCG_REGISTER_CORE_TASK is the only accepted value of @flags, and
+	 * it requires @group_id to be UMCG_NOID.
+	 */
+	if (flags != UMCG_REGISTER_CORE_TASK || group_id != UMCG_NOID)
+		return -EINVAL;
+
+	return register_core_task(api_version, umcg_task);
}
/**
@@ -67,7 +137,75 @@ SYSCALL_DEFINE4(umcg_register_task, u32, api_version, u32, flags, u32, group_id,
*/
SYSCALL_DEFINE1(umcg_unregister_task, u32, flags)
{
- return -ENOSYS;
+	struct umcg_task_data *old;
+
+	rcu_read_lock();
+	old = rcu_dereference(current->umcg_task_data);
+
+	/* Nothing to do for an unregistered task; non-zero @flags is rejected. */
+	if (!old || flags) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	/* Unpublish the task data under the task lock. */
+	task_lock(current);
+	rcu_assign_pointer(current->umcg_task_data, NULL);
+	task_unlock(current);
+
+	rcu_read_unlock();
+
+	/* Wait for RCU readers that may still hold the old pointer. */
+	synchronize_rcu();
+	kfree(old);
+
+	return 0;
+}
+
+/*
+ * Wake @next, asking for it to run on the current CPU (WF_CURRENT_CPU),
+ * and put current to sleep until it is woken or a signal arrives.
+ *
+ * Returns 0 after a wakeup, -EINVAL if current is not a registered UMCG
+ * task, -EAGAIN if @next could not be woken (current does not sleep in
+ * that case), and -EINTR if a signal is pending after the sleep.
+ */
+static int do_context_switch(struct task_struct *next)
+{
+	struct umcg_task_data *utd = rcu_access_pointer(current->umcg_task_data);
+
+	/* Same guard as do_wait(): only registered tasks may wait. */
+	if (!utd)
+		return -EINVAL;
+
+	/*
+	 * It is important to set_current_state(TASK_INTERRUPTIBLE) before
+	 * waking @next, as @next may immediately try to wake current back
+	 * (e.g. current is a server, @next is a worker that immediately
+	 * blocks or waits), and this next wakeup must not be lost.
+	 */
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	WRITE_ONCE(utd->in_wait, true);
+
+	if (!try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU)) {
+		/*
+		 * Current is not going to sleep: undo the wait setup.
+		 * Otherwise the task would return to userspace still in
+		 * TASK_INTERRUPTIBLE and with in_wait set, so it could block
+		 * on a later schedule() with nobody to wake it, and wakers
+		 * would wrongly see it as waiting.
+		 */
+		WRITE_ONCE(utd->in_wait, false);
+		__set_current_state(TASK_RUNNING);
+		return -EAGAIN;
+	}
+
+	freezable_schedule();
+
+	WRITE_ONCE(utd->in_wait, false);
+
+	if (signal_pending(current))
+		return -EINTR;
+
+	return 0;
+}
+
+/*
+ * Put current to sleep (interruptibly) until it is woken up.
+ *
+ * Returns 0 after the sleep, -EINVAL if current has no umcg_task_data,
+ * and -EINTR if a signal is pending after the sleep.
+ */
+static int do_wait(void)
+{
+ struct umcg_task_data *utd = rcu_access_pointer(current->umcg_task_data);
+
+ if (!utd)
+ return -EINVAL;
+
+ /*
+ * Advertise the wait before sleeping: sys_umcg_wake() and
+ * sys_umcg_swap() read in_wait and return -EAGAIN while it is false.
+ */
+ WRITE_ONCE(utd->in_wait, true);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ freezable_schedule();
+
+ /* Back from the sleep: no longer waiting. */
+ WRITE_ONCE(utd->in_wait, false);
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ return 0;
}
/**
@@ -90,7 +228,23 @@ SYSCALL_DEFINE1(umcg_unregister_task, u32, flags)
SYSCALL_DEFINE2(umcg_wait, u32, flags,
const struct __kernel_timespec __user *, timeout)
{
- return -ENOSYS;
+	bool registered;
+
+	if (flags)
+		return -EINVAL;
+	if (timeout)
+		return -EOPNOTSUPP;
+
+	/* Only tasks that have umcg_task_data may wait. */
+	rcu_read_lock();
+	registered = rcu_dereference(current->umcg_task_data) != NULL;
+	rcu_read_unlock();
+
+	return registered ? do_wait() : -EINVAL;
}
/**
@@ -110,7 +264,39 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags,
*/
SYSCALL_DEFINE2(umcg_wake, u32, flags, u32, next_tid)
{
- return -ENOSYS;
+	struct umcg_task_data *next_utd;
+	struct task_struct *next;
+	int ret = -EINVAL;
+
+	if (!next_tid)
+		return -EINVAL;
+	if (flags)
+		return -EINVAL;
+
+	/* Takes a reference on the task; dropped at "out" on every path. */
+	next = find_get_task_by_vpid(next_tid);
+	if (!next)
+		return -ESRCH;
+
+	rcu_read_lock();
+	next_utd = rcu_dereference(next->umcg_task_data);
+	if (!next_utd)
+		goto out;	/* target is not a UMCG task: -EINVAL */
+
+	/* The target has not entered its wait yet; the caller may retry. */
+	if (!READ_ONCE(next_utd->in_wait)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	/* wake_up_process() returns non-zero only if it woke the task. */
+	ret = wake_up_process(next) ? 0 : -EAGAIN;
+
+out:
+	rcu_read_unlock();
+	put_task_struct(next);
+	return ret;
+}
/**
@@ -139,5 +325,44 @@ SYSCALL_DEFINE2(umcg_wake, u32, flags, u32, next_tid)
SYSCALL_DEFINE4(umcg_swap, u32, wake_flags, u32, next_tid, u32, wait_flags,
const struct __kernel_timespec __user *, timeout)
{
- return -ENOSYS;
+	struct umcg_task_data *curr_utd;
+	struct umcg_task_data *next_utd;
+	struct task_struct *next;
+	int ret = -EINVAL;
+
+	rcu_read_lock();
+	curr_utd = rcu_dereference(current->umcg_task_data);
+
+	/* No flags are accepted, and the caller must be a UMCG task. */
+	if (!next_tid || wake_flags || wait_flags || !curr_utd)
+		goto out_unlock;
+
+	if (timeout) {
+		ret = -EOPNOTSUPP;
+		goto out_unlock;
+	}
+
+	/* Takes a reference on @next; every path below must drop it. */
+	next = find_get_task_by_vpid(next_tid);
+	if (!next) {
+		ret = -ESRCH;
+		goto out_unlock;
+	}
+
+	next_utd = rcu_dereference(next->umcg_task_data);
+	if (!next_utd) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+
+	/* The target has not entered its wait yet; the caller may retry. */
+	if (!READ_ONCE(next_utd->in_wait)) {
+		ret = -EAGAIN;
+		goto out_put;
+	}
+
+	rcu_read_unlock();
+
+	/* Keep the reference across the switch so @next stays valid. */
+	ret = do_context_switch(next);
+	put_task_struct(next);
+	return ret;
+
+out_put:
+	put_task_struct(next);
+out_unlock:
+	rcu_read_unlock();
+	return ret;
+}
diff --git a/kernel/sched/umcg.h b/kernel/sched/umcg.h
new file mode 100644
index 000000000000..6791d570f622
--- /dev/null
+++ b/kernel/sched/umcg.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_SCHED_UMCG_H
+#define _KERNEL_SCHED_UMCG_H
+
+#ifdef CONFIG_UMCG
+
+#include <linux/sched.h>
+#include <linux/umcg.h>
+
+/* Kind of UMCG task; stored in umcg_task_data.task_type. */
+enum umcg_task_type {
+ UMCG_TT_CORE = 1,
+ UMCG_TT_SERVER = 2,
+ UMCG_TT_WORKER = 3
+};
+
+/*
+ * Kernel-side state of a registered UMCG task. It is allocated when the
+ * task registers, reached through task_struct.umcg_task_data (an __rcu
+ * pointer), and freed when the task unregisters.
+ */
+struct umcg_task_data {
+ /* umcg_task != NULL. Never changes. */
+ struct umcg_task __user *umcg_task;
+
+ /* The task that owns this umcg_task_data. Never changes. */
+ struct task_struct *self;
+
+ /* Core task, server, or worker. Never changes. */
+ enum umcg_task_type task_type;
+
+ /*
+ * The API version used to register this task. If this is a
+ * worker or a server, must equal group->api_version.
+ *
+ * Never changes.
+ */
+ u32 api_version;
+
+ /*
+ * Used by wait/wake routines to handle races. Written only by current.
+ * True while the owner is in the sleep of a wait or swap; wake and
+ * swap read it with READ_ONCE() and return -EAGAIN while it is false.
+ */
+ bool in_wait;
+};
+
+#endif /* CONFIG_UMCG */
+#endif /* _KERNEL_SCHED_UMCG_H */
--
2.31.1.818.g46aad6cb9e-goog
next prev parent reply other threads:[~2021-05-20 18:36 UTC|newest]
Thread overview: 35+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-05-20 18:36 [RFC PATCH v0.1 0/9] UMCG early preview/RFC patchset Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 1/9] sched/umcg: add UMCG syscall stubs and CONFIG_UMCG Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 2/9] sched/umcg: add uapi/linux/umcg.h and sched/umcg.c Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 3/9] sched: add WF_CURRENT_CPU and externise ttwu Peter Oskolkov
2021-05-20 18:36 ` Peter Oskolkov [this message]
2021-05-21 19:06 ` [RFC PATCH v0.1 4/9] sched/umcg: implement core UMCG API Andrei Vagin
2021-05-21 21:31 ` Jann Horn
2021-05-21 22:03 ` Peter Oskolkov
2021-05-21 19:32 ` Andy Lutomirski
2021-05-21 22:01 ` Peter Oskolkov
2021-05-21 21:33 ` Jann Horn
2021-06-09 13:01 ` Peter Zijlstra
2021-05-20 18:36 ` [RFC PATCH v0.1 5/9] lib/umcg: implement UMCG core API for userspace Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 6/9] selftests/umcg: add UMCG core API selftest Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 7/9] sched/umcg: add UMCG server/worker API (early RFC) Peter Oskolkov
2021-05-21 20:17 ` Andrei Vagin
2021-05-20 18:36 ` [RFC PATCH v0.1 8/9] lib/umcg: " Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 9/9] selftests/umcg: add UMCG server/worker API selftest Peter Oskolkov
2021-05-20 21:17 ` [RFC PATCH v0.1 0/9] UMCG early preview/RFC patchset Jonathan Corbet
2021-05-20 21:38 ` Peter Oskolkov
2021-05-21 0:15 ` Randy Dunlap
2021-05-21 8:04 ` Peter Zijlstra
2021-05-21 15:08 ` Jonathan Corbet
2021-05-21 16:03 ` Peter Oskolkov
2021-05-21 19:17 ` Jonathan Corbet
2021-05-27 0:06 ` Peter Oskolkov
2021-05-27 15:41 ` Jonathan Corbet
[not found] ` <CAEWA0a72SvpcuN4ov=98T3uWtExPCr7BQePOgjkqD1ofWKEASw@mail.gmail.com>
2021-05-21 19:13 ` Peter Oskolkov
2021-05-21 23:08 ` Jann Horn
2021-06-09 12:54 ` Peter Zijlstra
2021-06-09 20:18 ` Peter Oskolkov
2021-06-10 18:02 ` Peter Zijlstra
2021-06-10 20:06 ` Peter Oskolkov
2021-07-07 17:45 ` Thierry Delisle
2021-07-08 21:44 ` Peter Oskolkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210520183614.1227046-5-posk@google.com \
--to=posk@google.com \
--cc=akpm@linux-foundation.org \
--cc=avagin@google.com \
--cc=bsegall@google.com \
--cc=jnewsome@torproject.org \
--cc=joel@joelfernandes.org \
--cc=linux-api@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=pjt@google.com \
--cc=posk@posk.io \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).