* [RFC PATCH 1/4 v0.3] sched: add WF_CURRENT_CPU and externise ttwu
2021-07-16 18:47 [RFC PATCH 0/4 v0.3] sched/UMCG Peter Oskolkov
@ 2021-07-16 18:47 ` Peter Oskolkov
2021-07-16 18:47 ` [RFC PATCH 2/4 v0.3] sched/umcg: RFC: add userspace atomic helpers Peter Oskolkov
` (2 subsequent siblings)
3 siblings, 0 replies; 14+ messages in thread
From: Peter Oskolkov @ 2021-07-16 18:47 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner, linux-kernel, linux-api
Cc: Paul Turner, Ben Segall, Peter Oskolkov, Peter Oskolkov,
Joel Fernandes, Andrei Vagin, Jim Newsome, Jann Horn,
Thierry Delisle
Add WF_CURRENT_CPU wake flag that advises the scheduler to
move the wakee to the current CPU. This is useful for fast on-CPU
context switching use cases such as UMCG.
In addition, make ttwu external rather than static so that
the flag could be passed to it from outside of sched/core.c.
Signed-off-by: Peter Oskolkov <posk@google.com>
---
kernel/sched/core.c | 3 +--
kernel/sched/fair.c | 4 ++++
kernel/sched/sched.h | 15 +++++++++------
3 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0c22cd026440..293f5801bf81 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3680,8 +3680,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11d22943753f..16a9c93e6e82 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6836,6 +6836,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
if (wake_flags & WF_TTWU) {
record_wakee(p);
+ if ((wake_flags & WF_CURRENT_CPU) &&
+ cpumask_test_cpu(cpu, p->cpus_ptr))
+ return cpu;
+
if (sched_energy_enabled()) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a1c6aeb9165..80de6836f8ae 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2031,13 +2031,14 @@ static inline int task_on_rq_migrating(struct task_struct *p)
}
/* Wake flags. The first three directly map to some SD flag value */
-#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
-#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
-#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
+#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
-#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
-#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
-#define WF_ON_CPU 0x40 /* Wakee is on_cpu */
+#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_ON_CPU 0x40 /* Wakee is on_cpu */
+#define WF_CURRENT_CPU 0x80 /* Prefer to move the wakee to the current CPU. */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -3037,6 +3038,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
extern void swake_up_all_locked(struct swait_queue_head *q);
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags);
+
#ifdef CONFIG_PREEMPT_DYNAMIC
extern int preempt_dynamic_mode;
extern int sched_dynamic_mode(const char *str);
--
2.25.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [RFC PATCH 2/4 v0.3] sched/umcg: RFC: add userspace atomic helpers
2021-07-16 18:47 [RFC PATCH 0/4 v0.3] sched/UMCG Peter Oskolkov
2021-07-16 18:47 ` [RFC PATCH 1/4 v0.3] sched: add WF_CURRENT_CPU and externise ttwu Peter Oskolkov
@ 2021-07-16 18:47 ` Peter Oskolkov
2021-07-16 18:47 ` [RFC PATCH 3/4 v0.3] sched/umcg: RFC: add userspace sll helpers Peter Oskolkov
2021-07-16 18:47 ` [RFC PATCH 4/4 v0.3] sched/umcg: RFC: implement UMCG syscalls Peter Oskolkov
3 siblings, 0 replies; 14+ messages in thread
From: Peter Oskolkov @ 2021-07-16 18:47 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner, linux-kernel, linux-api
Cc: Paul Turner, Ben Segall, Peter Oskolkov, Peter Oskolkov,
Joel Fernandes, Andrei Vagin, Jim Newsome, Jann Horn,
Thierry Delisle
Add helper functions to work atomically with userspace 32/64 bit values -
there are some .*futex.* named helpers, but they are not exactly
what is needed for UMCG; I haven't found what else I could use, so I
rolled these.
At the moment only X86_64 is supported.
Note: the helpers should probably go into arch/ somewhere; I have
them in kernel/sched/umcg.h temporarily for convenience. Please
let me know where I should put them.
Note: the current code follows suggestions here:
https://lore.kernel.org/lkml/YOgCdMWE9OXvqczk@hirez.programming.kicks-ass.net/
with the exception that I couldn't combine __try_cmpxchg_user_32/64 functions
into a macro, as my asm foo is not too strong. I'll continue trying to make
the macro work, but for the moment I've decided to post this RFC so that
other areas of the patchset could be reviewed.
Changelog:
v0.2->v0.3:
- renamed and refactored the helpers a bit, as described above;
- moved linked list/stack operations into a separate patch.
Signed-off-by: Peter Oskolkov <posk@google.com>
---
kernel/sched/umcg.h | 169 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 169 insertions(+)
create mode 100644 kernel/sched/umcg.h
diff --git a/kernel/sched/umcg.h b/kernel/sched/umcg.h
new file mode 100644
index 000000000000..288017f5368c
--- /dev/null
+++ b/kernel/sched/umcg.h
@@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _KERNEL_SCHED_UMCG_H
+#define _KERNEL_SCHED_UMCG_H
+
+#ifdef CONFIG_X86_64
+
+#include <linux/uaccess.h>
+
+#include <asm/asm.h>
+#include <linux/atomic.h>
+
+/* TODO: move atomic operations below into arch/ headers */
+static inline int __try_cmpxchg_user_32(u32 *uval, u32 __user *uaddr,
+ u32 oldval, u32 newval)
+{
+ int ret = 0;
+
+ asm volatile("\n"
+ "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
+ "2:\n"
+ "\t.section .fixup, \"ax\"\n"
+ "3:\tmov %3, %0\n"
+ "\tjmp 2b\n"
+ "\t.previous\n"
+ _ASM_EXTABLE_UA(1b, 3b)
+ : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
+ : "i" (-EFAULT), "r" (newval), "1" (oldval)
+ : "memory"
+ );
+ *uval = oldval;
+ return ret;
+}
+
+static inline int __try_cmpxchg_user_64(u64 *uval, u64 __user *uaddr,
+ u64 oldval, u64 newval)
+{
+ int ret = 0;
+
+ asm volatile("\n"
+ "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n"
+ "2:\n"
+ "\t.section .fixup, \"ax\"\n"
+ "3:\tmov %3, %0\n"
+ "\tjmp 2b\n"
+ "\t.previous\n"
+ _ASM_EXTABLE_UA(1b, 3b)
+ : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
+ : "i" (-EFAULT), "r" (newval), "1" (oldval)
+ : "memory"
+ );
+ *uval = oldval;
+ return ret;
+}
+
+static inline int fix_pagefault(unsigned long uaddr, bool write_fault)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ mmap_read_lock(mm);
+ ret = fixup_user_fault(mm, uaddr, write_fault ? FAULT_FLAG_WRITE : 0,
+ NULL);
+ mmap_read_unlock(mm);
+
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * cmpxchg_user_32 - compare_exchange 32-bit values
+ *
+ * Return:
+ * 0 - OK
+ * -EFAULT: memory access error
+ * -EAGAIN: @old did not match; consult *@old for the current value
+ */
+static inline int cmpxchg_user_32(u32 __user *uaddr, u32 *old, u32 new)
+{
+ int ret = -EFAULT;
+ u32 __old = *old;
+
+ if (unlikely(!access_ok(uaddr, sizeof(*uaddr))))
+ return -EFAULT;
+
+ pagefault_disable();
+
+ while (true) {
+ __uaccess_begin_nospec();
+ ret = __try_cmpxchg_user_32(old, uaddr, __old, new);
+ user_access_end();
+
+ if (!ret) {
+ ret = *old == __old ? 0 : -EAGAIN;
+ break;
+ }
+
+ if (fix_pagefault((unsigned long)uaddr, true) < 0)
+ break;
+ }
+
+ pagefault_enable();
+ return ret;
+}
+
+/**
+ * cmpxchg_user_64 - compare_exchange 64-bit values
+ *
+ * Return:
+ * 0 - OK
+ * -EFAULT: memory access error
+ * -EAGAIN: @old did not match; consult *@old for the current value
+ */
+static inline int cmpxchg_user_64(u64 __user *uaddr, u64 *old, u64 new)
+{
+ int ret = -EFAULT;
+ u64 __old = *old;
+
+ if (unlikely(!access_ok(uaddr, sizeof(*uaddr))))
+ return -EFAULT;
+
+ pagefault_disable();
+
+ while (true) {
+ __uaccess_begin_nospec();
+ ret = __try_cmpxchg_user_64(old, uaddr, __old, new);
+ user_access_end();
+
+ if (!ret) {
+ ret = *old == __old ? 0 : -EAGAIN;
+ break;
+ }
+
+ if (fix_pagefault((unsigned long)uaddr, true) < 0)
+ break;
+ }
+
+ pagefault_enable();
+ return ret;
+}
+
+/**
+ * get_user_nosleep - get user value with inline fixup without sleeping.
+ *
+ * get_user() might sleep and therefore cannot be used in preempt-disabled
+ * regions.
+ */
+#define get_user_nosleep(out, uaddr) \
+({ \
+ int ret = -EFAULT; \
+ \
+ if (access_ok((uaddr), sizeof(*(uaddr)))) { \
+ pagefault_disable(); \
+ \
+ while (true) { \
+ if (!__get_user((out), (uaddr))) { \
+ ret = 0; \
+ break; \
+ } \
+ \
+ if (fix_pagefault((unsigned long)(uaddr), true) < 0) \
+ break; \
+ } \
+ \
+ pagefault_enable(); \
+ } \
+ ret; \
+})
+
+#endif /* CONFIG_X86_64 */
+#endif /* _KERNEL_SCHED_UMCG_H */
--
2.25.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [RFC PATCH 3/4 v0.3] sched/umcg: RFC: add userspace sll helpers
2021-07-16 18:47 [RFC PATCH 0/4 v0.3] sched/UMCG Peter Oskolkov
2021-07-16 18:47 ` [RFC PATCH 1/4 v0.3] sched: add WF_CURRENT_CPU and externise ttwu Peter Oskolkov
2021-07-16 18:47 ` [RFC PATCH 2/4 v0.3] sched/umcg: RFC: add userspace atomic helpers Peter Oskolkov
@ 2021-07-16 18:47 ` Peter Oskolkov
2021-07-17 0:58 ` Thierry Delisle
2021-07-16 18:47 ` [RFC PATCH 4/4 v0.3] sched/umcg: RFC: implement UMCG syscalls Peter Oskolkov
3 siblings, 1 reply; 14+ messages in thread
From: Peter Oskolkov @ 2021-07-16 18:47 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner, linux-kernel, linux-api
Cc: Paul Turner, Ben Segall, Peter Oskolkov, Peter Oskolkov,
Joel Fernandes, Andrei Vagin, Jim Newsome, Jann Horn,
Thierry Delisle
The helpers below can work with userspace single-linked lists concurrently
without indefinitely spinning in the kernel. Specifically:
push (add to the head of the list):
step = 0
while (++step < N)
old = *head
*next = old
cmp_xchng(head, &old, next)
pop (remove the first element from the list):
mark the node as deleted by flipping its lowest bit without
actually removing the node from the list:
curr = *head
step = 0
while (curr && ++step < N)
next = *curr
if (next & 1)
curr = next & ~1
continue
if (cmp_xchng(curr, next, next | 1))
return curr
else
curr = next & ~1
It is the userspace's responsibility to actually remove the
nodes marked as deleted from the list.
v0.2->v0.3 changes:
- removed spinning from the kernel (other than capped cmpxchg).
Signed-off-by: Peter Oskolkov <posk@google.com>
---
kernel/sched/umcg.h | 116 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 116 insertions(+)
diff --git a/kernel/sched/umcg.h b/kernel/sched/umcg.h
index 288017f5368c..435531d751f2 100644
--- a/kernel/sched/umcg.h
+++ b/kernel/sched/umcg.h
@@ -165,5 +165,121 @@ static inline int cmpxchg_user_64(u64 __user *uaddr, u64 *old, u64 new)
ret; \
})
+/**
+ * umcg_sll_push - push a node onto a single-linked-list (stack) in userspace
+ * @head - a pointer to the head of the stack;
+ * @node - a pointer to the node to push;
+ * @max_tries - the maximum number of attempts to push the node onto the list.
+ *
+ * Push a node onto a single-linked list (stack). The function tries
+ * @max_tries times and returns -EAGAIN if too many concurrent updates, so that
+ * the caller may yield the CPU.
+ *
+ * Return:
+ * 0 - success;
+ * -EFAULT - error;
+ * -EAGAIN - try again.
+ */
+static inline int umcg_sll_push(u64 __user *head, u64 __user *node, int max_tries)
+{
+ int step;
+
+ for (step = 0; step < max_tries; step++) {
+ int ret;
+ u64 first;
+
+ smp_mb(); /* Make the read below clean. */
+ if (get_user(first, head))
+ return -EFAULT;
+
+ if (put_user(first, node))
+ return -EFAULT;
+ smp_mb(); /* Make the write above visible. */
+
+ ret = cmpxchg_user_64(head, &first, (u64)node);
+ if (!ret)
+ return 0;
+
+ if (ret == -EAGAIN) {
+ cpu_relax();
+ continue;
+ }
+
+ if (WARN_ONCE(ret != -EFAULT, "unexpected umcg_cmpxchg result"))
+ return -EFAULT;
+
+ return -EFAULT;
+ }
+
+ return -EAGAIN;
+}
+
+/**
+ * umcg_sll_pop - pop a node from a single-linked-list (stack) in the userspace
+ * @head - a pointer to the head of the stack;
+ * @value - a pointer to where store the popped value;
+ * @max_tries - the maximum number of nodes to try to pop.
+ *
+ * Pop a node from a single-linked list (stack). Internally, popped nodes
+ * are marked as such, and not actually removed from the list, to avoid
+ * the ABA problem. It is the responsibility of the userspace to do GC.
+ *
+ * As an additional protection, the function checks only the first
+ * @max_tries nodes.
+ *
+ * Note: on success, @value should be cast to (u64 __user *) if not zero.
+ *
+ * Return:
+ * 0 - success;
+ * -EFAULT - error;
+ * -EAGAIN - try again.
+ */
+static inline int umcg_sll_pop(u64 __user *head, u64 *value, int max_tries)
+{
+ int step;
+ u64 curr;
+
+ smp_mb(); /* Make the read below clean. */
+ if (get_user(curr, head))
+ return -EFAULT;
+
+ for (step = 0; step < max_tries; step++) {
+ int ret;
+ u64 next;
+
+ if (!curr) {
+ *value = 0UL;
+ return 0;
+ }
+
+ if (curr & 1UL)
+ return -EFAULT; /* Should not happen. */
+
+ smp_mb(); /* Make the read below clean. */
+ if (get_user(next, (u64 __user *)curr))
+ return -EFAULT;
+
+ if (next & 1UL) { /* curr is deleted */
+ curr = next & ~1UL;
+ continue;
+ }
+
+ ret = cmpxchg_user_64((u64 __user *)curr, &next, next | 1UL);
+ if (!ret) {
+ *value = curr;
+ return 0;
+ }
+ if (ret == -EAGAIN) {
+ curr = next & ~1UL;
+ cpu_relax();
+ continue;
+ }
+
+ if (ret)
+ return -EFAULT;
+ }
+
+ return -EAGAIN;
+}
#endif /* CONFIG_X86_64 */
#endif /* _KERNEL_SCHED_UMCG_H */
--
2.25.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [RFC PATCH 4/4 v0.3] sched/umcg: RFC: implement UMCG syscalls
2021-07-16 18:47 [RFC PATCH 0/4 v0.3] sched/UMCG Peter Oskolkov
` (2 preceding siblings ...)
2021-07-16 18:47 ` [RFC PATCH 3/4 v0.3] sched/umcg: RFC: add userspace sll helpers Peter Oskolkov
@ 2021-07-16 18:47 ` Peter Oskolkov
2021-07-19 16:07 ` Thierry Delisle
3 siblings, 1 reply; 14+ messages in thread
From: Peter Oskolkov @ 2021-07-16 18:47 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner, linux-kernel, linux-api
Cc: Paul Turner, Ben Segall, Peter Oskolkov, Peter Oskolkov,
Joel Fernandes, Andrei Vagin, Jim Newsome, Jann Horn,
Thierry Delisle
Define struct umcg_task and sys_umcg_ctl/sys_umcg_wait syscalls.
This is another attempt at implementing UMCG, based on
discussion in https://lore.kernel.org/patchwork/cover/1433967/
Most of the "why" is covered here (some details are obsolete):
https://lore.kernel.org/patchwork/cover/1433967/#1632328
I'll update this commit message with more "why" when the general
approach is ACKed at a high level.
In this patch I used the approach suggested by peterz@ (should I add
a Suggested-by: tag?) in the discussion linked above;
specifically, only a single
struct umcg_task __user *umcg_task
pointer is added to struct task_struct.
Comments in include/uapi/linux/umcg.h and kernel/sched/umcg.c
provide many details on how UMCG syscalls are to be used.
What is NOT implemented yet:
- timeouts;
- preemption.
All the basics (wait/wake/swap, block/wake detection) seem to be
working.
v0.2->v0.3 changes:
- new protocol for working with idle workers and servers is used, to avoid
spinning in the kernel;
- waking a UMCG task now does not require spinning.
Signed-off-by: Peter Oskolkov <posk@google.com>
---
arch/x86/entry/syscalls/syscall_64.tbl | 2 +
include/linux/sched.h | 6 +
include/linux/syscalls.h | 4 +
include/uapi/asm-generic/unistd.h | 8 +-
include/uapi/linux/umcg.h | 259 +++++++++++++
init/Kconfig | 10 +
kernel/exit.c | 7 +
kernel/sched/Makefile | 1 +
kernel/sched/core.c | 17 +-
kernel/sched/umcg.c | 485 +++++++++++++++++++++++++
kernel/sched/umcg.h | 13 +
kernel/sys_ni.c | 4 +
12 files changed, 813 insertions(+), 3 deletions(-)
create mode 100644 include/uapi/linux/umcg.h
create mode 100644 kernel/sched/umcg.c
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ce18119ea0d0..0c6c7fd72b0b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -368,6 +368,8 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+447 common umcg_ctl sys_umcg_ctl
+448 common umcg_wait sys_umcg_wait
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 50db9496c99d..185ad1cdde77 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -66,6 +66,7 @@ struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct umcg_task;
/*
* Task state bitmask. NOTE! These bits are also
@@ -1223,6 +1224,10 @@ struct task_struct {
unsigned long rseq_event_mask;
#endif
+#ifdef CONFIG_UMCG
+ struct umcg_task __user *umcg_task;
+#endif
+
struct tlbflush_unmap_batch tlb_ubc;
union {
@@ -1599,6 +1604,7 @@ extern struct pid *cad_pid;
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
+#define PF_UMCG_WORKER 0x01000000 /* UMCG worker */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 050511e8f1f8..f3e1ef8d842f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -71,6 +71,7 @@ struct open_how;
struct mount_attr;
struct landlock_ruleset_attr;
enum landlock_rule_type;
+struct umcg_task;
#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -1050,6 +1051,9 @@ asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr _
asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type,
const void __user *rule_attr, __u32 flags);
asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
+asmlinkage long sys_umcg_ctl(u32 flags, struct umcg_task __user *self);
+asmlinkage long sys_umcg_wait(u32 flags, u64 abs_timeout);
+
/*
* Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 6de5a7fc066b..1a4c9ac0e296 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -873,8 +873,14 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
#define __NR_landlock_restrict_self 446
__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
+#define __NR_umcg_ctl 447
+__SYSCALL(__NR_umcg_ctl, sys_umcg_ctl)
+#define __NR_umcg_wait 448
+__SYSCALL(__NR_umcg_wait, sys_umcg_wait)
+
+
#undef __NR_syscalls
-#define __NR_syscalls 447
+#define __NR_syscalls 449
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/umcg.h b/include/uapi/linux/umcg.h
new file mode 100644
index 000000000000..402974b475bf
--- /dev/null
+++ b/include/uapi/linux/umcg.h
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_UMCG_H
+#define _UAPI_LINUX_UMCG_H
+
+#include <linux/limits.h>
+#include <linux/types.h>
+
+/*
+ * UMCG: User Managed Concurrency Groups.
+ *
+ * Syscalls (see kernel/sched/umcg.c):
+ * sys_umcg_ctl() - register/unregister UMCG tasks;
+ * sys_umcg_wait() - wait/wake/context-switch.
+ *
+ * struct umcg_task (below): controls the state of UMCG tasks.
+ */
+
+/*
+ * UMCG task states, the first 8 bits. The states represent the user space
+ * point of view.
+ *
+ * UMCG tasks can be either RUNNING, i.e. doing useful work, or IDLE,
+ * i.e. have no work assigned to them and blocked in sys_umcg_wait().
+ *
+ * In addition, when a UMCG worker blocks in the kernel (e.g. on I/O),
+ * it is marked as BLOCKED; when a BLOCKED worker completes its blocking
+ * operation, it is marked as IDLE and added to idle_workers list (see
+ * struct umcg_task below), until a server "runs" it.
+ *
+ * UMCG servers continue to be considered as RUNNING even if they are blocked
+ * in the kernel in any way other than in sys_umcg_wait().
+ *
+ * State transitions:
+ *
+ * RUNNING => IDLE: the current RUNNING task becomes IDLE by calling
+ * sys_umcg_wait();
+ * IDLE => RUNNING: - another worker or server task called
+ * sys_umcg_wait() with self->next_tid pointing to the
+ * task transitioning from IDLE to RUNNING (mostly
+ * applies to workers and basic tasks);
+ * - the userspace marked an IDLE task as RUNNING and
+ * sent a signal to it (thus interrupting sys_umcg_wait);
+ * - servers: the kernel wakes an IDLE server from
+ * idle_servers list when a BLOCKED worker becomes IDLE
+ * (see below);
+ * - servers: the kernel wakes an IDLE server that
+ * is "attached" to a RUNNING worker when the worker
+ * becomes BLOCKED;
+ * RUNNING => BLOCKED: when a RUNNING UMCG worker blocks in the kernel,
+ * the kernel marks it as BLOCKED (and wakes its server);
+ * BLOCKED => IDLE: when a BLOCKED UMCG worker finishes its blocking
+ * operation, the kernel marks it as IDLE, adds it to
+ * the list of idle workers (see struct umcg_task) and
+ * wakes an idle server from the list of idle servers, if
+ * available.
+ *
+ * Note 1: only the transitions listed above are possible; these state
+ * transitions never happen:
+ * - IDLE => BLOCKED (never happen)
+ * - BLOCKED => RUNNING (never happen)
+ *
+ * Note 2: only UMCG workers (UMCG tasks registered with UMCG_CTL_WORKER
+ * flag set) are subject to block/wake detection logic;
+ *
+ * Note 3: if a worker has UMCG_TF_LOCKED state flag set, it behaves as
+ * a server, i.e. the block/wake detection is disabled (this is a UMCG
+ * equivalent of task_lock() or preempt_disable()). UMCG_TF_LOCKED
+ * flag is cleared by the kernel when the worker goes to sleep in
+ * umcg_wait().
+ *
+ * Note 4: changing the value of umcg_task.state field is the responsibility
+ * of the party initiating the state change: when a state transition
+ * is initiated by the userspace via a call to sys_umcg_wait(), it
+ * is the userspace's responsibility to change the value of the
+ * umcg_task.state field; when a state transition is initiated
+ * by the kernel during worker block/wake handling, it is the kernel
+ * who marks the worker as BLOCKED or IDLE, and the server as RUNNING.
+ */
+#define UMCG_TASK_NONE 0
+#define UMCG_TASK_RUNNING 1
+#define UMCG_TASK_IDLE 2
+#define UMCG_TASK_BLOCKED 3
+
+#define UMCG_TF_STATE_MASK 0xff
+
+/* UMCG task state flags, bits 8-15 */
+
+/*
+ * UMCG_TF_LOCKED: locked by the userspace; workers with UMCG_TF_LOCKED set
+ * do not become BLOCKED and do not wake their attached server.
+ */
+#define UMCG_TF_LOCKED (1 << 8)
+
+/**
+ * struct umcg_task - controls the state of UMCG tasks.
+ *
+ * UMCG tasks can be:
+ *
+ * - UMCG workers: must have a UMCG server assigned when running (unless
+ * UMCG_TF_LOCKED flag is set); the server is woken when
+ * the worker blocks; has PF_UMCG_WORKER task flag set
+ * in task_struct.
+ *
+ * Both @idle_servers_ptr and @idle_workers_ptr are !NULL
+ * when running or calling sys_umcg_wait().
+ *
+ * A worker's state can be:
+ * - RUNNING: is schedulable by the kernel, has a server
+ * assigned in @server_tid;
+ * - IDLE: not schedulable by the kernel; can be
+ * context-switched into via sys_umcg_wait;
+ * - BLOCKED: blocked in the kernel (e.g. on I/O).
+ *
+ * - UMCG servers: A server's state can be:
+ * - RUNNING: behaves like a "normal" task: is schedulable
+ * by the kernel, can block on I/O, etc.
+ * - IDLE: not schedulable by the kernel.
+ *
+ * See sys_umcg_ctl() documentation in kernel/sched/umcg.c for a detailed
+ * explanation of how UMCG task types are determined.
+ *
+ * See sys_umcg_wait() documentation in kernel/sched/umcg.c for a detailed
+ * explanation of server/worker interactions.
+ *
+ * Once a UMCG task is registered, it may not change its type.
+ *
+ * The struct is aligned at 64 bytes to ensure that it fits into
+ * a single cache line.
+ */
+struct umcg_task {
+ /**
+ * @state: the current state of the UMCG task described by this struct.
+ *
+ * Readable/writable by both the kernel and the userspace.
+ *
+ * UMCG task state:
+ * bits 0 - 7: task state;
+ * bits 8 - 15: state flags;
+ * bits 16 - 23: reserved; must be zeroes;
+ * bits 24 - 31: for userspace use.
+ */
+ uint32_t state; /* r/w */
+
+ /**
+ * @api_version: the version of UMCG API the userspace would like
+ * to use. Must be set before calling umcg_ctl
+ * and must not be changed afterwards.
+ */
+ uint32_t api_version; /* r */
+
+ /**
+ * @server_tid: the TID of the server UMCG task that should be
+ * woken when this WORKER becomes BLOCKED. Can be zero.
+ *
+ * If this is a UMCG server, @server_tid should
+ * contain the TID of @self - it will be used to find
+ * the task_struct to wake when pulled from
+ * @idle_servers.
+ *
+ * Read-only for the kernel, read/write for the userspace.
+ */
+ uint32_t server_tid; /* r */
+
+ /**
+ * @next_tid: the TID of the UMCG task that should be context-switched
+ * into in sys_umcg_wait(). Can be zero.
+ *
+ * Read-only for the kernel, read/write for the userspace.
+ */
+ uint32_t next_tid; /* r */
+
+ /**
+ * @idle_servers_ptr: a single-linked list pointing to the list
+ * of idle servers. Can be NULL.
+ *
+ * Readable/writable by both the kernel and the userspace: the
+ * userspace adds items to the list, the kernel removes them.
+ *
+ * This is a single-linked list (stack): head->next->next->next->NULL.
+ * "next" nodes are idle_servers_ptr fields in struct umcg_task.
+ *
+ * Example:
+ *
+ * a running worker idle server 1 idle server 2
+ *
+ * struct umcg_task: struct umcg_task: struct umcg_task:
+ * state state state
+ * api_version api_version api_version
+ * ... ... ...
+ * idle_servers_ptr --> head --> idle_servers_ptr --> idle_servers_ptr --> NULL
+ * ... ... ...
+ *
+ *
+ * Due to the way struct umcg_task is aligned, idle_servers_ptr
+ * is aligned at 8 byte boundary, and so has its first byte as zero
+ * when it holds a valid pointer.
+ *
+ * When pulling idle servers from the list, the kernel marks nodes as
+ * "deleted" by ORing the node value (the pointer) with 1UL atomically.
+ * If a node is "deleted" (i.e. its value AND 1UL is not zero),
+ * the kernel proceeds to the next node.
+ *
+ * The kernel checks at most [nr_cpu_ids * 2] first nodes in the list.
+ *
+ * It is NOT considered an error if the kernel cannot find an idle
+ * server.
+ *
+ * The userspace is responsible for cleanup/gc (i.e. for actually
+ * removing nodes marked as "deleted" from the list).
+ */
+ uint64_t idle_servers_ptr; /* r/w */
+
+ /**
+ * @idle_workers_ptr: a single-linked list pointing to the list
+ * of idle workers. Can be NULL.
+ *
+ * Readable/writable by both the kernel and the userspace: the
+ * kernel adds items to the list, the userspace removes them.
+ *
+ * The list (stack) is structured the same way as idle_servers_ptr
+ * above. The kernel pushes new nodes to the beginning of the list
+ * by assigning the current head value to the node's idle_workers_ptr
+ * and trying to atomically change the head to point to the new node.
+ *
+ * The kernel tries at most [nr_cpu_ids + 1] times to push a node
+ * onto the stack, after which the idle worker will sleep a short
+ * while before attempting to do so again. After several failed
+ * attempts the kernel will SIGKILL the worker.
+ */
+ uint64_t idle_workers_ptr; /* r/w */
+} __attribute__((packed, aligned(8 * sizeof(__u64))));
+
+/**
+ * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl
+ * @UMCG_CTL_REGISTER: register the current task as a UMCG task
+ * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task
+ * @UMCG_CTL_WORKER: register the current task as a UMCG worker
+ *
+ * See sys_umcg_ctl documentation for more details.
+ */
+enum umcg_ctl_flag {
+ UMCG_CTL_REGISTER = 0x00001,
+ UMCG_CTL_UNREGISTER = 0x00002,
+ UMCG_CTL_WORKER = 0x10000,
+};
+
+/**
+ * enum umcg_wait_flag - flags to pass to sys_umcg_wait
+ * @UMCG_WAIT_WAKE_ONLY: wake @self->next_tid, don't put @self to sleep;
+ * @UMCG_WAIT_WF_CURRENT_CPU: wake @self->next_tid on the current CPU
+ * (use WF_CURRENT_CPU); @UMCG_WAIT_WAKE_ONLY
+ * must be set.
+ */
+enum umcg_wait_flag {
+ UMCG_WAIT_WAKE_ONLY = 1,
+ UMCG_WAIT_WF_CURRENT_CPU = 2,
+};
+
+#endif /* _UAPI_LINUX_UMCG_H */
diff --git a/init/Kconfig b/init/Kconfig
index a61c92066c2e..c15a50a61ba6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1662,6 +1662,16 @@ config MEMBARRIER
If unsure, say Y.
+config UMCG
+ bool "Enable User Managed Concurrency Groups API"
+ depends on X86_64
+ default n
+ help
+ Enable User Managed Concurrency Groups API, which form the basis
+ for an in-process M:N userspace scheduling framework.
+ At the moment this is an experimental/RFC feature that is not
+ guaranteed to be backward-compatible.
+
config KALLSYMS
bool "Load all symbols for debugging/ksymoops" if EXPERT
default y
diff --git a/kernel/exit.c b/kernel/exit.c
index fd1c04193e18..dc8398558d87 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -744,6 +744,13 @@ void __noreturn do_exit(long code)
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
+#ifdef CONFIG_UMCG
+ if (unlikely(tsk->flags & PF_UMCG_WORKER))
+ tsk->flags &= ~PF_UMCG_WORKER;
+
+ tsk->umcg_task = NULL;
+#endif
+
/*
* If do_exit is called because this processes oopsed, it's possible
* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 978fcfca5871..e4e481eee1b7 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
obj-$(CONFIG_PSI) += psi.o
obj-$(CONFIG_SCHED_CORE) += core_sched.o
+obj-$(CONFIG_UMCG) += umcg.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 293f5801bf81..f7ddeed72e30 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
#include "pelt.h"
#include "smp.h"
+#include "umcg.h"
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -3961,6 +3962,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
#endif
+#ifdef CONFIG_UMCG
+ p->umcg_task = NULL;
+ p->flags &= ~PF_UMCG_WORKER;
+#endif
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -5975,10 +5980,14 @@ static inline void sched_submit_work(struct task_struct *tsk)
* in the possible wakeup of a kworker and because wq_worker_sleeping()
* requires it.
*/
- if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) {
preempt_disable();
if (task_flags & PF_WQ_WORKER)
wq_worker_sleeping(tsk);
+#ifdef CONFIG_UMCG
+ else if (task_flags & PF_UMCG_WORKER)
+ umcg_wq_worker_sleeping(tsk);
+#endif
else
io_wq_worker_sleeping(tsk);
preempt_enable_no_resched();
@@ -5997,9 +6006,13 @@ static inline void sched_submit_work(struct task_struct *tsk)
static void sched_update_worker(struct task_struct *tsk)
{
- if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) {
if (tsk->flags & PF_WQ_WORKER)
wq_worker_running(tsk);
+#ifdef CONFIG_UMCG
+ else if (tsk->flags & PF_UMCG_WORKER)
+ umcg_wq_worker_running(tsk);
+#endif
else
io_wq_worker_running(tsk);
}
diff --git a/kernel/sched/umcg.c b/kernel/sched/umcg.c
new file mode 100644
index 000000000000..f87c32974882
--- /dev/null
+++ b/kernel/sched/umcg.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * User Managed Concurrency Groups (UMCG).
+ *
+ * See include/uapi/linux/umcg.h for more documentation.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/umcg.h>
+#include <linux/freezer.h>
+
+#include "sched.h"
+#include "umcg.h"
+
+/* Return 0 if @api_version is supported; the highest supported version otherwise. */
+static int umcg_validate_version(u32 api_version)
+{
+	return api_version == 1 ? 0 : 1;
+}
+
+/**
+ * sys_umcg_ctl: (un)register the current task as a UMCG task.
+ * @flags:       ORed values from enum umcg_ctl_flag; see below;
+ * @self:        a pointer to struct umcg_task that describes this
+ *               task and governs the behavior of sys_umcg_wait if
+ *               registering; must be NULL if unregistering.
+ *
+ * @flags & UMCG_CTL_REGISTER: register a UMCG task:
+ *         UMCG workers:
+ *              - self->state must be UMCG_TASK_IDLE
+ *              - @flags & UMCG_CTL_WORKER
+ *         UMCG servers:
+ *              - self->state must be UMCG_TASK_RUNNING
+ *              - !(@flags & UMCG_CTL_WORKER)
+ *
+ *         All tasks:
+ *              - self->api_version must match one of the supported API
+ *                versions
+ *              - self->server_tid must be zero
+ *              - self->next_tid must be zero
+ *
+ *         If the conditions above are met, sys_umcg_ctl() immediately returns
+ *         if the registered task is a server; a worker will be added to
+ *         idle_workers_ptr, and the worker put to sleep; an idle server
+ *         from idle_servers_ptr will be woken, if any.
+ *
+ * @flags == UMCG_CTL_UNREGISTER: unregister a UMCG task. If the current task
+ *           is a UMCG worker, the userspace is responsible for waking its
+ *           server (before or after calling sys_umcg_ctl).
+ *
+ * Return:
+ * 0                - success
+ * > 0              - the highest supported API version if @self->api_version
+ *                    is not supported (when registering)
+ * -EFAULT          - failed to read @self
+ * -EINVAL          - some other error occurred
+ */
+SYSCALL_DEFINE2(umcg_ctl, u32, flags, struct umcg_task __user *, self)
+{
+	struct umcg_task ut;
+	int ret;
+
+	if (flags == UMCG_CTL_UNREGISTER) {
+		if (self || !current->umcg_task)
+			return -EINVAL;
+
+		/*
+		 * Clear unconditionally: clearing an already-clear flag is a
+		 * no-op, so the previous "test then clear" was redundant.
+		 */
+		current->flags &= ~PF_UMCG_WORKER;
+		current->umcg_task = NULL;
+		return 0;
+	}
+
+	/* Register the current task as a UMCG task. */
+	if (!(flags & UMCG_CTL_REGISTER))
+		return -EINVAL;
+
+	/* Only UMCG_CTL_WORKER may accompany UMCG_CTL_REGISTER. */
+	flags &= ~UMCG_CTL_REGISTER;
+	if (flags && flags != UMCG_CTL_WORKER)
+		return -EINVAL;
+
+	/* Double registration is not allowed. */
+	if (current->umcg_task)
+		return -EINVAL;
+
+	if (copy_from_user(&ut, self, sizeof(ut)))
+		return -EFAULT;
+
+	ret = umcg_validate_version(ut.api_version);
+	if (ret)
+		return ret;
+
+	if (ut.server_tid || ut.next_tid)
+		return -EINVAL;
+
+	if (flags == UMCG_CTL_WORKER) {
+		if (ut.state != UMCG_TASK_IDLE)
+			return -EINVAL;
+
+		current->umcg_task = self;
+		current->flags |= PF_UMCG_WORKER;
+
+		umcg_wq_worker_running(current); /* Will sleep. */
+		return 0;
+	}
+
+	/* This is a server task. */
+	if (ut.state != UMCG_TASK_RUNNING)
+		return -EINVAL;
+
+	current->umcg_task = self;
+	return 0;
+}
+
+/* Sleep until interrupted or self.state becomes RUNNING or timeout expires. */
+static int umcg_idle_loop(u64 abs_timeout)
+{
+ struct umcg_task __user *self = current->umcg_task;
+
+ /* Timeouts are not implemented yet in this RFC version. */
+ if (abs_timeout)
+ return -EOPNOTSUPP;
+
+ /* Unlock the worker, if locked. */
+ if (current->flags & PF_UMCG_WORKER) {
+ u32 umcg_state;
+
+ smp_mb(); /* Protect the read below. */
+ if (get_user(umcg_state, &self->state))
+ return -EFAULT;
+
+ /*
+ * NOTE(review): a cmpxchg failure caused by a concurrent
+ * userspace state change (not by a fault) is also reported
+ * as -EFAULT here — confirm cmpxchg_user_32() semantics and
+ * whether a retry loop is needed.
+ */
+ if ((umcg_state & UMCG_TF_LOCKED) && cmpxchg_user_32(
+ &self->state, &umcg_state,
+ umcg_state & ~UMCG_TF_LOCKED))
+ return -EFAULT;
+ }
+
+ while (true) {
+ u32 umcg_state;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /*
+ * NOTE(review): get_user() below may fault and sleep while the
+ * task is TASK_INTERRUPTIBLE, resetting the state set above —
+ * verify the wait/wake ordering still holds in that case.
+ */
+ smp_mb(); /* Order with set_current_state() above. */
+ if (get_user(umcg_state, &self->state)) {
+ set_current_state(TASK_RUNNING);
+ return -EFAULT;
+ }
+
+ /* Already marked RUNNING by a waker: no need to sleep. */
+ if ((umcg_state & UMCG_TF_STATE_MASK) ==
+ UMCG_TASK_RUNNING) {
+ set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ freezable_schedule();
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ /* Re-check after waking; keep sleeping if still not RUNNING. */
+ if (get_user(umcg_state, &self->state))
+ return -EFAULT;
+
+ if ((umcg_state & UMCG_TF_STATE_MASK) == UMCG_TASK_RUNNING)
+ return 0;
+ }
+}
+
+/*
+ * Try to wake up. May be called with preempt_disable set.
+ *
+ * Note: umcg_ttwu succeeds even if ttwu fails: see wait/wake state
+ * ordering logic documented in sys_umcg_wait() below.
+ */
+static int umcg_ttwu(u32 next_tid, int wake_flags)
+{
+	struct task_struct *next;
+	int ret = -ESRCH;
+
+	rcu_read_lock();
+	next = find_task_by_vpid(next_tid);
+	if (next && READ_ONCE(next->umcg_task)) {
+		/* The ttwu result is deliberately ignored; see above. */
+		try_to_wake_up(next, TASK_NORMAL, wake_flags);
+		ret = 0;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * At the moment, umcg_do_context_switch simply wakes up @next with
+ * WF_CURRENT_CPU and puts the current task to sleep.
+ *
+ * In the future an optimization will be added to adjust runtime accounting
+ * so that from the kernel scheduling perspective the two tasks are
+ * essentially treated as one.
+ */
+static int umcg_do_context_switch(u32 next_tid, u64 abs_timeout)
+{
+	struct task_struct *next;
+
+	if (abs_timeout)
+		return -EOPNOTSUPP;
+
+	rcu_read_lock();
+	next = find_task_by_vpid(next_tid);
+	/*
+	 * Reject non-UMCG wakees, matching umcg_ttwu() and the -ESRCH
+	 * contract documented at sys_umcg_wait(): "the task to wake not
+	 * found or not a UMCG task".
+	 */
+	if (!next || !READ_ONCE(next->umcg_task)) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	/* TODO: instead of wake + sleep, do a context switch. */
+	try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU); /* Result ignored. */
+	rcu_read_unlock();
+
+	return umcg_idle_loop(abs_timeout);
+}
+
+/**
+ * sys_umcg_wait: sleep the current task and/or wake another task.
+ * @flags:        zero or a value from enum umcg_wait_flag.
+ * @abs_timeout:  when to wake the task; zero for no timeout. NOT SUPPORTED yet.
+ *
+ * @self->state must be UMCG_TASK_IDLE (where @self is current->umcg_task)
+ * if !(@flags & UMCG_WAIT_WAKE_ONLY).
+ *
+ * If @self->next_tid is not zero, it must point to an IDLE UMCG task blocked
+ * in sys_umcg_wait(). The userspace must have changed its state from IDLE to
+ * RUNNING before calling sys_umcg_wait() in the current task. This "next"
+ * task will be woken (context-switched-to on the fast path) when the current
+ * task is put to sleep.
+ *
+ * If this is a worker (PF_UMCG_WORKER is set), and @self->next_tid is zero,
+ * the server assigned to this worker (@self->server_tid) will be
+ * woken/switched-to; same rules apply (the server must be waiting in
+ * sys_umcg_wait(); its state must be RUNNING now).
+ *
+ * If @self->next_tid points to a UMCG worker, it must have its server_tid
+ * set, with the server blocked in sys_umcg_wait().
+ *
+ *
+ * Note: wait/wake ordering: to avoid missing wakeups, the following
+ * state changes order is required:
+ *
+ * wait:        the userspace marks the current task's UMCG state as IDLE
+ *              and calls sys_umcg_wait().
+ * wake:        the userspace marks the wakee's UMCG state as RUNNING and
+ *              calls sys_umcg_wait() with the wakee's TID in self->next_tid;
+ *
+ * To wake a umcg task, the kernel issues a single ttwu() call, ignoring
+ * the result. On the wait path the kernel carefully orders task
+ * state changes with umcg state checks to ensure the wakeup above
+ * is not lost. See umcg_idle_loop for details.
+ *
+ * Return:
+ * 0                - OK;
+ * -ETIMEDOUT       - the timeout expired;
+ * -EFAULT          - failed accessing struct umcg_task __user of the current
+ *                    task;
+ * -ESRCH           - the task to wake not found or not a UMCG task;
+ * -EINVAL          - another error happened (e.g. bad @flags, or the current
+ *                    task is not a UMCG task, etc.)
+ */
+SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, abs_timeout)
+{
+	struct umcg_task __user *self = current->umcg_task;
+	u32 next_tid;
+
+	/* Only registered UMCG tasks may wait. */
+	if (!self)
+		return -EINVAL;
+
+	if (get_user(next_tid, &self->next_tid))
+		return -EFAULT;
+
+	if (flags & UMCG_WAIT_WAKE_ONLY) {
+		/* Wake-only needs a wakee and cannot carry a timeout. */
+		if (!next_tid || abs_timeout)
+			return -EINVAL;
+
+		flags &= ~UMCG_WAIT_WAKE_ONLY;
+		if (flags & ~UMCG_WAIT_WF_CURRENT_CPU)
+			return -EINVAL;
+
+		return umcg_ttwu(next_tid, flags & UMCG_WAIT_WF_CURRENT_CPU ?
+				WF_CURRENT_CPU : 0);
+	}
+
+	/* A worker with no explicit wakee switches to its server. */
+	if (!next_tid && (current->flags & PF_UMCG_WORKER)) {
+		if (get_user(next_tid, &self->server_tid))
+			return -EFAULT;
+	}
+
+	if (next_tid)
+		return umcg_do_context_switch(next_tid, abs_timeout);
+
+	return umcg_idle_loop(abs_timeout);
+}
+
+#define umcg_die() \
+{ \
+ pr_warn("UMCG: umcg_die: %s:%d\n", __FILE__, __LINE__); \
+ force_sig(SIGSEGV); \
+}
+
+/* Try to wake the server; may be called within preempt_disable section. */
+static bool try_to_wake_server(struct umcg_task __user *ut_server, u32 server_tid)
+{
+ u32 state = UMCG_TASK_IDLE;
+
+ /* At least one of @ut_server / @server_tid must identify the server. */
+ if (WARN_ON(!(ut_server || server_tid)))
+ return false;
+
+ /* Resolve the server's struct umcg_task from its tid, if not given. */
+ if (!ut_server) {
+ struct task_struct *tsk;
+
+ rcu_read_lock();
+ tsk = find_task_by_vpid(server_tid);
+ if (tsk)
+ ut_server = READ_ONCE(tsk->umcg_task);
+ rcu_read_unlock();
+
+ if (!ut_server)
+ return false;
+ }
+
+ /*
+ * NOTE(review): this assumes a server records its own tid in its
+ * umcg_task.server_tid field — confirm against the uapi contract.
+ */
+ if (!server_tid && get_user(server_tid, &ut_server->server_tid))
+ return false;
+
+ /* IDLE => RUNNING; on failure @state holds the value actually seen. */
+ if (cmpxchg_user_32(&ut_server->state, &state, UMCG_TASK_RUNNING)) {
+ if (state != UMCG_TASK_RUNNING)
+ umcg_die(); /* The userspace broke the contract. */
+ return false;
+ }
+
+ /* TODO: make a smarter context switch when available. */
+ return umcg_ttwu(server_tid, WF_CURRENT_CPU) == 0;
+}
+
+/*
+ * Change the worker's state RUNNING => BLOCKED or BLOCKED => IDLE, depending
+ * on context (@sleeping).
+ *
+ * May be called with preempt_disable.
+ *
+ * Returns true to continue; false to abort.
+ */
+static bool wq_worker_change_state(struct umcg_task __user *ut_worker,
+ bool sleeping)
+{
+ u32 prev_state, next_state;
+ int ret;
+
+ smp_mb(); /* Guard the read below. */
+ if (get_user_nosleep(prev_state, &ut_worker->state)) {
+ umcg_die();
+ return false;
+ }
+
+ /* The userspace holds the worker locked: do not interfere. */
+ if (prev_state & UMCG_TF_LOCKED)
+ return false;
+
+ if (sleeping) {
+ /* Only a RUNNING worker may transition to BLOCKED. */
+ if ((prev_state & UMCG_TF_STATE_MASK) !=
+ UMCG_TASK_RUNNING)
+ return false;
+ } else {
+ if ((prev_state & UMCG_TF_STATE_MASK) ==
+ UMCG_TASK_RUNNING) {
+ /*
+ * Workers with servers attached should
+ * pass through; workers without servers
+ * should wait.
+ */
+ u32 server_tid;
+
+ if (get_user_nosleep(server_tid,
+ &ut_worker->server_tid)) {
+ umcg_die(); /* The userspace broke the contract. */
+ return false;
+ }
+
+ if (server_tid)
+ return false;
+ }
+ }
+
+ /* Preserve the flag bits; replace only the state part. */
+ next_state = prev_state & ~UMCG_TF_STATE_MASK;
+ next_state |= sleeping ? UMCG_TASK_BLOCKED : UMCG_TASK_IDLE;
+
+ ret = cmpxchg_user_32(&ut_worker->state, &prev_state, next_state);
+
+ if (!ret)
+ return true;
+
+ umcg_die(); /* The userspace broke the contract. */
+ return false;
+}
+
+/* Called from sched_submit_work() with preempt_disable. */
+void umcg_wq_worker_sleeping(struct task_struct *tsk)
+{
+ struct umcg_task __user *ut_worker = tsk->umcg_task;
+ u32 server_tid;
+
+ if (WARN_ONCE((tsk != current) || !ut_worker, "Invalid umcg worker"))
+ return;
+
+ /* Step one: mark the worker as BLOCKED. */
+ if (!wq_worker_change_state(ut_worker, true))
+ return;
+
+ /* Step two: wake the server, if any. */
+ /* _nosleep accessor: preemption is disabled here, faults must not sleep. */
+ if (get_user_nosleep(server_tid, &ut_worker->server_tid)) {
+ umcg_die(); /* EFAULT */
+ return;
+ }
+
+ /* A worker without a server attached: nobody to notify. */
+ if (!server_tid)
+ return;
+
+ /* TODO: make a smarter context switch when available. */
+ try_to_wake_server(NULL, server_tid);
+}
+
+/* Called from sched_update_worker(). May sleep. */
+void umcg_wq_worker_running(struct task_struct *tsk)
+{
+ struct umcg_task __user *ut_worker = tsk->umcg_task;
+ u64 head, popped_server;
+
+ if (WARN_ONCE((tsk != current) || !ut_worker, "Invalid umcg worker"))
+ return;
+
+ /*
+ * Remove the workqueue flag to avoid triggering
+ * umcg_wq_worker_sleeping.
+ */
+ current->flags &= ~PF_UMCG_WORKER;
+
+ /* Step 1: mark the worker as IDLE and add to the idle workers list. */
+ if (!wq_worker_change_state(ut_worker, false))
+ goto out;
+
+ if (get_user(head, &ut_worker->idle_workers_ptr))
+ goto die; /* EFAULT */
+
+ if (!head)
+ goto die; /* The userspace broke the contract. */
+
+ /* Push this worker onto the userspace singly-linked idle-workers list. */
+ if (umcg_sll_push((u64 __user *)head,
+ (u64 __user *)&ut_worker->idle_workers_ptr,
+ nr_cpu_ids * 2 /* max number of times to push */))
+ goto die; /* TODO: sleep+retry several times before dying. */
+
+ smp_mb(); /* Make sure steps 1 and 2 are ordered. */
+
+ /* Step 2: wake an idle server, if any. */
+ if (get_user(head, &ut_worker->idle_servers_ptr))
+ goto die;
+
+ /* The number of servers should not exceed the number of CPUs much. */
+ if (head && umcg_sll_pop((u64 __user *)head, &popped_server,
+ nr_cpu_ids * 2 /* max number of nodes to check*/))
+ goto die;
+
+ if (popped_server) {
+ /*
+ * NOTE(review): popped_server appears to be the user-space
+ * address of the server's idle_servers_ptr field, recovered to
+ * the enclosing struct via container_of() — confirm the list
+ * stores field addresses, not struct addresses.
+ */
+ struct umcg_task __user *ut_server = container_of(
+ (u64 *)popped_server,
+ struct umcg_task, idle_servers_ptr);
+
+ try_to_wake_server(ut_server, 0);
+ }
+
+ /* Step 3: sleep until woken by a server */
+ umcg_idle_loop(0);
+
+out:
+ /* Restore the flag so the next block/wake cycle is intercepted. */
+ current->flags |= PF_UMCG_WORKER;
+ return;
+
+die:
+ umcg_die();
+}
diff --git a/kernel/sched/umcg.h b/kernel/sched/umcg.h
index 435531d751f2..619ba02be1d4 100644
--- a/kernel/sched/umcg.h
+++ b/kernel/sched/umcg.h
@@ -9,6 +9,19 @@
#include <asm/asm.h>
#include <linux/atomic.h>
+#ifdef CONFIG_UMCG
+
+struct task_struct;
+
+/*
+ * umcg_wq_worker_[sleeping|running] are called in core.c by
+ * sched_submit_work() and sched_update_worker().
+ */
+void umcg_wq_worker_sleeping(struct task_struct *tsk);
+void umcg_wq_worker_running(struct task_struct *tsk);
+
+#endif /* CONFIG_UMCG */
+
/* TODO: move atomic operations below into arch/ headers */
static inline int __try_cmpxchg_user_32(u32 *uval, u32 __user *uaddr,
u32 oldval, u32 newval)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0ea8128468c3..cd1be6356e42 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -272,6 +272,10 @@ COND_SYSCALL(landlock_create_ruleset);
COND_SYSCALL(landlock_add_rule);
COND_SYSCALL(landlock_restrict_self);
+/* kernel/sched/umcg.c */
+COND_SYSCALL(umcg_ctl);
+COND_SYSCALL(umcg_wait);
+
/* arch/example/kernel/sys_example.c */
/* mm/fadvise.c */
--
2.25.1
^ permalink raw reply related [flat|nested] 14+ messages in thread