* [PATCH v3 0/6] Freezer rewrite
@ 2021-10-09 10:07 Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
` (5 more replies)
0 siblings, 6 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, Will Deacon
Cc: linux-kernel, peterz, tj, linux-pm
Hi all,
This is the by now familiar rewrite of the freezer, but with yet another
attempt at getting the special blocked states (STOPPED/TRACED) right.
Currently it is already possible to tell if a task should be STOPPED from
looking at p->jobctl, the same idea is extended to TRACED and p->ptrace.
But treating p->jobctl/p->ptrace as canonical state, the blocked state can be
(hopefully correctly this time) reconstructed from it at all times.
Specifically, if a TRACED task was woken by the tracer, while the tracee was
FROZEN, the p->ptrace state will have been updated, and the thaw operation will
result in the wakeup.
Oleg, if you can please try and shoot holes in it again? :-)
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags
2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
@ 2021-10-09 10:07 ` Peter Zijlstra
2021-10-14 8:58 ` Will Deacon
2021-10-09 10:07 ` [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction Peter Zijlstra
` (4 subsequent siblings)
5 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, Will Deacon
Cc: linux-kernel, peterz, tj, linux-pm
Rafael explained that the reason for having both PF_NOFREEZE and
PF_FREEZER_SKIP is that {,un}lock_system_sleep() is callable from
kthread context that has previously called set_freezable().
In preparation of merging the flags, have {,un}lock_system_slee() save
and restore current->flags.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
drivers/scsi/scsi_transport_spi.c | 7 ++++---
include/linux/suspend.h | 8 ++++----
kernel/power/hibernate.c | 35 ++++++++++++++++++++++-------------
kernel/power/main.c | 16 ++++++++++------
kernel/power/suspend.c | 12 ++++++++----
kernel/power/user.c | 24 ++++++++++++++----------
6 files changed, 62 insertions(+), 40 deletions(-)
--- a/drivers/scsi/scsi_transport_spi.c
+++ b/drivers/scsi/scsi_transport_spi.c
@@ -998,8 +998,9 @@ void
spi_dv_device(struct scsi_device *sdev)
{
struct scsi_target *starget = sdev->sdev_target;
- u8 *buffer;
const int len = SPI_MAX_ECHO_BUFFER_SIZE*2;
+ unsigned int sleep_flags;
+ u8 *buffer;
/*
* Because this function and the power management code both call
@@ -1007,7 +1008,7 @@ spi_dv_device(struct scsi_device *sdev)
* while suspend or resume is in progress. Hence the
* lock/unlock_system_sleep() calls.
*/
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
if (scsi_autopm_get_device(sdev))
goto unlock_system_sleep;
@@ -1058,7 +1059,7 @@ spi_dv_device(struct scsi_device *sdev)
scsi_autopm_put_device(sdev);
unlock_system_sleep:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL(spi_dv_device);
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -518,8 +518,8 @@ extern bool pm_save_wakeup_count(unsigne
extern void pm_wakep_autosleep_enabled(bool set);
extern void pm_print_active_wakeup_sources(void);
-extern void lock_system_sleep(void);
-extern void unlock_system_sleep(void);
+extern unsigned int lock_system_sleep(void);
+extern void unlock_system_sleep(unsigned int);
#else /* !CONFIG_PM_SLEEP */
@@ -542,8 +542,8 @@ static inline void pm_system_wakeup(void
static inline void pm_wakeup_clear(bool reset) {}
static inline void pm_system_irq_wakeup(unsigned int irq_number) {}
-static inline void lock_system_sleep(void) {}
-static inline void unlock_system_sleep(void) {}
+static inline unsigned int lock_system_sleep(void) { return 0; }
+static inline void unlock_system_sleep(unsigned int flags) {}
#endif /* !CONFIG_PM_SLEEP */
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -90,20 +90,24 @@ bool hibernation_available(void)
*/
void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
+ unsigned int sleep_flags;
+
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
&& ops->prepare && ops->finish && ops->enter && ops->pre_restore
&& ops->restore_cleanup && ops->leave)) {
WARN_ON(1);
return;
}
- lock_system_sleep();
+
+ sleep_flags = lock_system_sleep();
+
hibernation_ops = ops;
if (ops)
hibernation_mode = HIBERNATION_PLATFORM;
else if (hibernation_mode == HIBERNATION_PLATFORM)
hibernation_mode = HIBERNATION_SHUTDOWN;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(hibernation_set_ops);
@@ -707,6 +711,7 @@ static int load_image_and_restore(void)
int hibernate(void)
{
bool snapshot_test = false;
+ unsigned int sleep_flags;
int error;
if (!hibernation_available()) {
@@ -714,7 +719,7 @@ int hibernate(void)
return -EPERM;
}
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -788,7 +793,7 @@ int hibernate(void)
pm_restore_console();
hibernate_release();
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
pr_info("hibernation exit\n");
return error;
@@ -803,9 +808,10 @@ int hibernate(void)
*/
int hibernate_quiet_exec(int (*func)(void *data), void *data)
{
+ unsigned int sleep_flags;
int error;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -885,7 +891,7 @@ int hibernate_quiet_exec(int (*func)(voi
hibernate_release();
unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error;
}
@@ -1094,11 +1100,12 @@ static ssize_t disk_show(struct kobject
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ int mode = HIBERNATION_INVALID;
+ unsigned int sleep_flags;
int error = 0;
- int i;
int len;
char *p;
- int mode = HIBERNATION_INVALID;
+ int i;
if (!hibernation_available())
return -EPERM;
@@ -1106,7 +1113,7 @@ static ssize_t disk_store(struct kobject
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (len == strlen(hibernation_modes[i])
&& !strncmp(buf, hibernation_modes[i], len)) {
@@ -1136,7 +1143,7 @@ static ssize_t disk_store(struct kobject
if (!error)
pm_pr_dbg("Hibernation mode set to '%s'\n",
hibernation_modes[mode]);
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
@@ -1152,9 +1159,10 @@ static ssize_t resume_show(struct kobjec
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
- dev_t res;
+ unsigned int sleep_flags;
int len = n;
char *name;
+ dev_t res;
if (len && buf[len-1] == '\n')
len--;
@@ -1167,9 +1175,10 @@ static ssize_t resume_store(struct kobje
if (!res)
return -EINVAL;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
swsusp_resume_device = res;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
+
pm_pr_dbg("Configured hibernation resume from disk to %u\n",
swsusp_resume_device);
noresume = 0;
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,14 +21,16 @@
#ifdef CONFIG_PM_SLEEP
-void lock_system_sleep(void)
+unsigned int lock_system_sleep(void)
{
+ unsigned int flags = current->flags;
current->flags |= PF_FREEZER_SKIP;
mutex_lock(&system_transition_mutex);
+ return flags;
}
EXPORT_SYMBOL_GPL(lock_system_sleep);
-void unlock_system_sleep(void)
+void unlock_system_sleep(unsigned int flags)
{
/*
* Don't use freezer_count() because we don't want the call to
@@ -46,7 +48,8 @@ void unlock_system_sleep(void)
* Which means, if we use try_to_freeze() here, it would make them
* enter the refrigerator, thus causing hibernation to lockup.
*/
- current->flags &= ~PF_FREEZER_SKIP;
+ if (!(flags & PF_FREEZER_SKIP))
+ current->flags &= ~PF_FREEZER_SKIP;
mutex_unlock(&system_transition_mutex);
}
EXPORT_SYMBOL_GPL(unlock_system_sleep);
@@ -260,16 +263,17 @@ static ssize_t pm_test_show(struct kobje
static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ unsigned int sleep_flags;
const char * const *s;
+ int error = -EINVAL;
int level;
char *p;
int len;
- int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
level = TEST_FIRST;
for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -279,7 +283,7 @@ static ssize_t pm_test_store(struct kobj
break;
}
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -75,9 +75,11 @@ EXPORT_SYMBOL_GPL(pm_suspend_default_s2i
void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
- lock_system_sleep();
+ unsigned int sleep_flags;
+
+ sleep_flags = lock_system_sleep();
s2idle_ops = ops;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
static void s2idle_begin(void)
@@ -202,7 +204,9 @@ __setup("mem_sleep_default=", mem_sleep_
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
- lock_system_sleep();
+ unsigned int sleep_flags;
+
+ sleep_flags = lock_system_sleep();
suspend_ops = ops;
@@ -218,7 +222,7 @@ void suspend_set_ops(const struct platfo
mem_sleep_current = PM_SUSPEND_MEM;
}
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(suspend_set_ops);
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -46,12 +46,13 @@ int is_hibernate_resume_dev(dev_t dev)
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
+ unsigned int sleep_flags;
int error;
if (!hibernation_available())
return -EPERM;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
if (!hibernate_acquire()) {
error = -EBUSY;
@@ -97,7 +98,7 @@ static int snapshot_open(struct inode *i
data->dev = 0;
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error;
}
@@ -105,8 +106,9 @@ static int snapshot_open(struct inode *i
static int snapshot_release(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
+ unsigned int sleep_flags;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
swsusp_free();
data = filp->private_data;
@@ -123,7 +125,7 @@ static int snapshot_release(struct inode
PM_POST_HIBERNATION : PM_POST_RESTORE);
hibernate_release();
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return 0;
}
@@ -131,11 +133,12 @@ static int snapshot_release(struct inode
static ssize_t snapshot_read(struct file *filp, char __user *buf,
size_t count, loff_t *offp)
{
+ loff_t pg_offp = *offp & ~PAGE_MASK;
struct snapshot_data *data;
+ unsigned int sleep_flags;
ssize_t res;
- loff_t pg_offp = *offp & ~PAGE_MASK;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
data = filp->private_data;
if (!data->ready) {
@@ -156,7 +159,7 @@ static ssize_t snapshot_read(struct file
*offp += res;
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return res;
}
@@ -164,11 +167,12 @@ static ssize_t snapshot_read(struct file
static ssize_t snapshot_write(struct file *filp, const char __user *buf,
size_t count, loff_t *offp)
{
+ loff_t pg_offp = *offp & ~PAGE_MASK;
struct snapshot_data *data;
+ unsigned int sleep_flags;
ssize_t res;
- loff_t pg_offp = *offp & ~PAGE_MASK;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
data = filp->private_data;
@@ -190,7 +194,7 @@ static ssize_t snapshot_write(struct fil
if (res > 0)
*offp += res;
unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return res;
}
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction
2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
@ 2021-10-09 10:07 ` Peter Zijlstra
2021-10-14 9:13 ` Will Deacon
2021-10-09 10:07 ` [PATCH v3 3/6] ptrace: Order and comment PT_flags Peter Zijlstra
` (3 subsequent siblings)
5 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, Will Deacon
Cc: linux-kernel, peterz, tj, linux-pm
handle_initrd() marks itself as PF_FREEZER_SKIP in order to ensure
that the UMH, which is going to freeze the system, doesn't
indefinitely wait for it's caller.
Rework things by adding UMH_FREEZABLE to indicate the completion is
freezable.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/umh.h | 9 +++++----
init/do_mounts_initrd.c | 10 +---------
kernel/umh.c | 8 ++++++++
3 files changed, 14 insertions(+), 13 deletions(-)
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -11,10 +11,11 @@
struct cred;
struct file;
-#define UMH_NO_WAIT 0 /* don't wait at all */
-#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */
-#define UMH_WAIT_PROC 2 /* wait for the process to complete */
-#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */
+#define UMH_NO_WAIT 0x00 /* don't wait at all */
+#define UMH_WAIT_EXEC 0x01 /* wait for the exec, but not the process */
+#define UMH_WAIT_PROC 0x02 /* wait for the process to complete */
+#define UMH_KILLABLE 0x04 /* wait for EXEC/PROC killable */
+#define UMH_FREEZABLE 0x08 /* wait for EXEC/PROC freezable */
struct subprocess_info {
struct work_struct work;
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -79,19 +79,11 @@ static void __init handle_initrd(void)
init_mkdir("/old", 0700);
init_chdir("/old");
- /*
- * In case that a resume from disk is carried out by linuxrc or one of
- * its children, we need to tell the freezer not to wait for us.
- */
- current->flags |= PF_FREEZER_SKIP;
-
info = call_usermodehelper_setup("/linuxrc", argv, envp_init,
GFP_KERNEL, init_linuxrc, NULL, NULL);
if (!info)
return;
- call_usermodehelper_exec(info, UMH_WAIT_PROC);
-
- current->flags &= ~PF_FREEZER_SKIP;
+ call_usermodehelper_exec(info, UMH_WAIT_PROC|UMH_FREEZABLE);
/* move initrd to rootfs' /old */
init_mount("..", ".", NULL, MS_MOVE, NULL);
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -28,6 +28,7 @@
#include <linux/async.h>
#include <linux/uaccess.h>
#include <linux/initrd.h>
+#include <linux/freezer.h>
#include <trace/events/module.h>
@@ -436,6 +437,9 @@ int call_usermodehelper_exec(struct subp
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
+ if (wait & UMH_FREEZABLE)
+ freezer_do_not_count();
+
if (wait & UMH_KILLABLE) {
retval = wait_for_completion_killable(&done);
if (!retval)
@@ -448,6 +452,10 @@ int call_usermodehelper_exec(struct subp
}
wait_for_completion(&done);
+
+ if (wait & UMH_FREEZABLE)
+ freezer_count();
+
wait_done:
retval = sub_info->retval;
out:
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3 3/6] ptrace: Order and comment PT_flags
2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction Peter Zijlstra
@ 2021-10-09 10:07 ` Peter Zijlstra
2021-10-14 9:31 ` Will Deacon
2021-10-09 10:07 ` [PATCH v3 4/6] ptrace: Track __TASK_TRACED state in p->ptrace Peter Zijlstra
` (2 subsequent siblings)
5 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, Will Deacon
Cc: linux-kernel, peterz, tj, linux-pm
Add a comment to the PT_flags to indicate their actual value, this
makes it easier to see what bits are used and where there might be a
possible hole to use.
Notable PT_SEIZED was placed wrong, also PT_EVENT_FLAG() space seems
ill defined, as written is seems to be meant to cover the entire
PTRACE_O_ range offset by 3 bits, which would then be 3+[0..21],
however PT_SEIZED is in the middle of that.
XXX if at all possible, PT_SEIZED should be moved outside of this
range.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/ptrace.h | 32 +++++++++++++++++---------------
include/uapi/linux/ptrace.h | 3 +++
2 files changed, 20 insertions(+), 15 deletions(-)
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -28,30 +28,32 @@ extern int ptrace_access_vm(struct task_
* flags. When the a task is stopped the ptracer owns task->ptrace.
*/
-#define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */
-#define PT_PTRACED 0x00000001
-#define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */
+#define PT_PTRACED 0x00000001 // 0x00000001
+#define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ // 0x00000002
#define PT_OPT_FLAG_SHIFT 3
/* PT_TRACE_* event enable flags */
#define PT_EVENT_FLAG(event) (1 << (PT_OPT_FLAG_SHIFT + (event)))
-#define PT_TRACESYSGOOD PT_EVENT_FLAG(0)
-#define PT_TRACE_FORK PT_EVENT_FLAG(PTRACE_EVENT_FORK)
-#define PT_TRACE_VFORK PT_EVENT_FLAG(PTRACE_EVENT_VFORK)
-#define PT_TRACE_CLONE PT_EVENT_FLAG(PTRACE_EVENT_CLONE)
-#define PT_TRACE_EXEC PT_EVENT_FLAG(PTRACE_EVENT_EXEC)
-#define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)
-#define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT)
-#define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)
-#define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)
-#define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT)
+#define PT_TRACESYSGOOD PT_EVENT_FLAG(0) // 0x00000008
+#define PT_TRACE_FORK PT_EVENT_FLAG(PTRACE_EVENT_FORK) // 0x00000010
+#define PT_TRACE_VFORK PT_EVENT_FLAG(PTRACE_EVENT_VFORK) // 0x00000020
+#define PT_TRACE_CLONE PT_EVENT_FLAG(PTRACE_EVENT_CLONE) // 0x00000040
+#define PT_TRACE_EXEC PT_EVENT_FLAG(PTRACE_EVENT_EXEC) // 0x00000080
+#define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE) // 0x00000100
+#define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT) // 0x00000200
+#define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) // 0x00000400
+
+#define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */// 0x00010000
+
+#define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) // 0x00800000
+#define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) // 0x01000000
/* single stepping state bits (used on ARM and PA-RISC) */
#define PT_SINGLESTEP_BIT 31
-#define PT_SINGLESTEP (1<<PT_SINGLESTEP_BIT)
+#define PT_SINGLESTEP (1<<PT_SINGLESTEP_BIT) // 0x80000000
#define PT_BLOCKSTEP_BIT 30
-#define PT_BLOCKSTEP (1<<PT_BLOCKSTEP_BIT)
+#define PT_BLOCKSTEP (1<<PT_BLOCKSTEP_BIT) // 0x40000000
extern long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data);
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -134,6 +134,7 @@ struct ptrace_rseq_configuration {
#define PTRACE_EVENT_STOP 128
/* Options set using PTRACE_SETOPTIONS or using PTRACE_SEIZE @data param */
+/* Consider overlap with task->ptrace PT_flags */
#define PTRACE_O_TRACESYSGOOD 1
#define PTRACE_O_TRACEFORK (1 << PTRACE_EVENT_FORK)
#define PTRACE_O_TRACEVFORK (1 << PTRACE_EVENT_VFORK)
@@ -143,6 +144,8 @@ struct ptrace_rseq_configuration {
#define PTRACE_O_TRACEEXIT (1 << PTRACE_EVENT_EXIT)
#define PTRACE_O_TRACESECCOMP (1 << PTRACE_EVENT_SECCOMP)
+/* PTRACE_O_SEIZED (1 << 13) */
+
/* eventless options */
#define PTRACE_O_EXITKILL (1 << 20)
#define PTRACE_O_SUSPEND_SECCOMP (1 << 21)
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3 4/6] ptrace: Track __TASK_TRACED state in p->ptrace
2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
` (2 preceding siblings ...)
2021-10-09 10:07 ` [PATCH v3 3/6] ptrace: Order and comment PT_flags Peter Zijlstra
@ 2021-10-09 10:07 ` Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 5/6] sched,ptrace: Avoid relying on __TASK_TRACED | __TASK_STOPPED Peter Zijlstra
2021-10-09 10:08 ` [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic Peter Zijlstra
5 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, Will Deacon
Cc: linux-kernel, peterz, tj, linux-pm
Just like we can recover __TASK_STOPPED from p->jobctl, add some bits
to p->ptrace such that we can recover __TASK_TRACED.
All these t->ptrace modifications are done under sighand lock.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/ptrace.h | 4 ++++
include/linux/sched/signal.h | 6 ++----
kernel/ptrace.c | 27 ++++++++++++++++++---------
kernel/signal.c | 1 +
4 files changed, 25 insertions(+), 13 deletions(-)
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -45,6 +45,10 @@ extern int ptrace_access_vm(struct task_
#define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) // 0x00000400
#define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */// 0x00010000
+#define PT_STOPPED 0x00020000 // 0x00020000
+#define PT_STOPPED_FATAL 0x00040000 // 0x00040000
+
+#define PT_STOPPED_MASK (PT_STOPPED|PT_STOPPED_FATAL)
#define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) // 0x00800000
#define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) // 0x01000000
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -418,10 +418,8 @@ static inline void signal_wake_up(struct
{
signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
}
-static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
-{
- signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
-}
+
+extern void ptrace_signal_wake_up(struct task_struct *t, bool resume);
void task_join_group_stop(struct task_struct *task);
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -64,6 +64,14 @@ int ptrace_access_vm(struct task_struct
return ret;
}
+void ptrace_signal_wake_up(struct task_struct *t, bool resume)
+{
+ lockdep_assert_task_sighand_held(t);
+
+ if (resume)
+ t->ptrace &= ~PT_STOPPED_MASK;
+ signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
+}
void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
const struct cred *ptracer_cred)
@@ -197,6 +205,8 @@ static bool ptrace_freeze_traced(struct
spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
!__fatal_signal_pending(task)) {
+ task->ptrace &= ~PT_STOPPED_MASK;
+ task->ptrace |= PT_STOPPED;
WRITE_ONCE(task->__state, __TASK_TRACED);
ret = true;
}
@@ -218,10 +228,13 @@ static void ptrace_unfreeze_traced(struc
*/
spin_lock_irq(&task->sighand->siglock);
if (READ_ONCE(task->__state) == __TASK_TRACED) {
- if (__fatal_signal_pending(task))
+ if (__fatal_signal_pending(task)) {
+ task->ptrace &= ~PT_STOPPED_MASK;
wake_up_state(task, __TASK_TRACED);
- else
+ } else {
+ task->ptrace |= PT_STOPPED_MASK;
WRITE_ONCE(task->__state, TASK_TRACED);
+ }
}
spin_unlock_irq(&task->sighand->siglock);
}
@@ -835,8 +848,6 @@ static long ptrace_get_rseq_configuratio
static int ptrace_resume(struct task_struct *child, long request,
unsigned long data)
{
- bool need_siglock;
-
if (!valid_signal(data))
return -EIO;
@@ -877,13 +888,11 @@ static int ptrace_resume(struct task_str
* status and clears the code too; this can't race with the tracee, it
* takes siglock after resume.
*/
- need_siglock = data && !thread_group_empty(current);
- if (need_siglock)
- spin_lock_irq(&child->sighand->siglock);
+ spin_lock_irq(&child->sighand->siglock);
child->exit_code = data;
+ child->ptrace &= ~PT_STOPPED_MASK;
wake_up_state(child, __TASK_TRACED);
- if (need_siglock)
- spin_unlock_irq(&child->sighand->siglock);
+ spin_unlock_irq(&child->sighand->siglock);
return 0;
}
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2228,6 +2228,7 @@ static void ptrace_stop(int exit_code, i
return;
}
+ current->ptrace |= PT_STOPPED_MASK;
set_special_state(TASK_TRACED);
/*
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3 5/6] sched,ptrace: Avoid relying on __TASK_TRACED | __TASK_STOPPED
2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
` (3 preceding siblings ...)
2021-10-09 10:07 ` [PATCH v3 4/6] ptrace: Track __TASK_TRACED state in p->ptrace Peter Zijlstra
@ 2021-10-09 10:07 ` Peter Zijlstra
2021-10-09 10:08 ` [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic Peter Zijlstra
5 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, Will Deacon
Cc: linux-kernel, peterz, tj, linux-pm
Make ->ptrace/->jobctl the canonical state, this allows us to play
games with __state (such as freezing).
The wait_task_inactive() usage will be fixed up later, once we have
additional TASK_state.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 27 ++++++++++++++++++++++-----
kernel/ptrace.c | 16 +++++++++-------
2 files changed, 31 insertions(+), 12 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -118,11 +118,9 @@ struct task_group;
#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)
-#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-
-#define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
-
-#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_traced(task) ((task)->ptrace & PT_STOPPED)
+#define task_is_stopped(task) ((task)->jobctl & JOBCTL_STOP_PENDING)
+#define task_is_stopped_or_traced(task) (task_is_stopped(task) || task_is_traced(task))
/*
* Special states are those that do not use the normal wait-loop pattern. See
@@ -228,6 +226,25 @@ struct task_group;
} while (0)
/*
+ * task_cond_set_special_state() is a cmpxchg like operation on task->state.
+ *
+ * This operation isn't safe in general and should only be used to transform
+ * one (special) blocked state into another, such as:
+ * TASK_STOPPED <-> TASK_FROZEN.
+ */
+#define task_cond_set_special_state(task, cond_state) \
+ ({ \
+ struct task_struct *__p = (task); \
+ unsigned long __flags; /* may shadow */ \
+ unsigned int __state; \
+ raw_spin_lock_irqsave(&__p->pi_lock, __flags); \
+ if ((__state = (cond_state))) \
+ WRITE_ONCE(__p->__state, __state); \
+ raw_spin_unlock_irqrestore(&__p->pi_lock, __flags); \
+ !!__state; \
+ })
+
+/*
* PREEMPT_RT specific variants for "sleeping" spin/rwlocks
*
* RT's spin/rwlock substitutions are state preserving. The state of the
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -207,7 +207,8 @@ static bool ptrace_freeze_traced(struct
!__fatal_signal_pending(task)) {
task->ptrace &= ~PT_STOPPED_MASK;
task->ptrace |= PT_STOPPED;
- WRITE_ONCE(task->__state, __TASK_TRACED);
+ /* *TASK_TRACED -> __TASK_TRACED */
+ task_cond_set_special_state(task, !!(task->__state & __TASK_TRACED) * __TASK_TRACED);
ret = true;
}
spin_unlock_irq(&task->sighand->siglock);
@@ -217,7 +218,7 @@ static bool ptrace_freeze_traced(struct
static void ptrace_unfreeze_traced(struct task_struct *task)
{
- if (READ_ONCE(task->__state) != __TASK_TRACED)
+ if (!task_is_traced(task))
return;
WARN_ON(!task->ptrace || task->parent != current);
@@ -227,13 +228,14 @@ static void ptrace_unfreeze_traced(struc
* Recheck state under the lock to close this race.
*/
spin_lock_irq(&task->sighand->siglock);
- if (READ_ONCE(task->__state) == __TASK_TRACED) {
+ if (task_is_traced(task)) {
if (__fatal_signal_pending(task)) {
task->ptrace &= ~PT_STOPPED_MASK;
wake_up_state(task, __TASK_TRACED);
} else {
task->ptrace |= PT_STOPPED_MASK;
- WRITE_ONCE(task->__state, TASK_TRACED);
+ /* *TASK_TRACED -> TASK_TRACED */
+ task_cond_set_special_state(task, !!(task->__state & __TASK_TRACED) * TASK_TRACED);
}
}
spin_unlock_irq(&task->sighand->siglock);
@@ -269,7 +271,7 @@ static int ptrace_check_attach(struct ta
*/
read_lock(&tasklist_lock);
if (child->ptrace && child->parent == current) {
- WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
+// WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
@@ -280,13 +282,13 @@ static int ptrace_check_attach(struct ta
read_unlock(&tasklist_lock);
if (!ret && !ignore_state) {
- if (!wait_task_inactive(child, __TASK_TRACED)) {
+ if (!wait_task_inactive(child, __TASK_TRACED)) { // XXX mooo!!!
/*
* This can only happen if may_ptrace_stop() fails and
* ptrace_stop() changes ->state back to TASK_RUNNING,
* so we should not worry about leaking __TASK_TRACED.
*/
- WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
+// WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
ret = -ESRCH;
}
}
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic
2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
` (4 preceding siblings ...)
2021-10-09 10:07 ` [PATCH v3 5/6] sched,ptrace: Avoid relying on __TASK_TRACED | __TASK_STOPPED Peter Zijlstra
@ 2021-10-09 10:08 ` Peter Zijlstra
2021-10-18 13:36 ` Peter Zijlstra
5 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:08 UTC (permalink / raw)
To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, Will Deacon
Cc: linux-kernel, peterz, tj, linux-pm
This here rewrites the core freezer to behave better wrt thawing. By
replacing PF_FROZEN with TASK_FROZEN, a special block state, it is
ensured frozen tasks stay frozen until thawed and don't randomly wake
up early, as is currently possible.
As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up
two PF_flags (yay).
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
drivers/android/binder.c | 4
drivers/media/pci/pt3/pt3.c | 4
fs/cifs/inode.c | 4
fs/cifs/transport.c | 5
fs/coredump.c | 5
fs/nfs/file.c | 3
fs/nfs/inode.c | 12 --
fs/nfs/nfs3proc.c | 3
fs/nfs/nfs4proc.c | 14 +-
fs/nfs/nfs4state.c | 3
fs/nfs/pnfs.c | 4
fs/xfs/xfs_trans_ail.c | 8 -
include/linux/completion.h | 1
include/linux/freezer.h | 244 +----------------------------------------
include/linux/sched.h | 41 +++---
include/linux/sunrpc/sched.h | 7 -
include/linux/wait.h | 40 +++++-
kernel/cgroup/legacy_freezer.c | 23 +--
kernel/exit.c | 4
kernel/fork.c | 5
kernel/freezer.c | 132 +++++++++++++++-------
kernel/futex.c | 4
kernel/hung_task.c | 4
kernel/power/main.c | 6 -
kernel/power/process.c | 10 -
kernel/ptrace.c | 2
kernel/sched/completion.c | 9 +
kernel/sched/core.c | 19 ++-
kernel/signal.c | 14 +-
kernel/time/hrtimer.c | 4
kernel/umh.c | 20 +--
mm/khugepaged.c | 4
net/sunrpc/sched.c | 12 --
net/unix/af_unix.c | 8 -
34 files changed, 274 insertions(+), 408 deletions(-)
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -3722,10 +3722,9 @@ static int binder_wait_for_work(struct b
struct binder_proc *proc = thread->proc;
int ret = 0;
- freezer_do_not_count();
binder_inner_proc_lock(proc);
for (;;) {
- prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE);
if (binder_has_work_ilocked(thread, do_proc_work))
break;
if (do_proc_work)
@@ -3742,7 +3741,6 @@ static int binder_wait_for_work(struct b
}
finish_wait(&thread->wait, &wait);
binder_inner_proc_unlock(proc);
- freezer_count();
return ret;
}
--- a/drivers/media/pci/pt3/pt3.c
+++ b/drivers/media/pci/pt3/pt3.c
@@ -445,8 +445,8 @@ static int pt3_fetch_thread(void *data)
pt3_proc_dma(adap);
delay = ktime_set(0, PT3_FETCH_DELAY * NSEC_PER_MSEC);
- set_current_state(TASK_UNINTERRUPTIBLE);
- freezable_schedule_hrtimeout_range(&delay,
+ set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
+ schedule_hrtimeout_range(&delay,
PT3_FETCH_DELAY_DELTA * NSEC_PER_MSEC,
HRTIMER_MODE_REL);
}
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2285,7 +2285,7 @@ cifs_invalidate_mapping(struct inode *in
static int
cifs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- freezable_schedule_unsafe();
+ schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
return 0;
@@ -2303,7 +2303,7 @@ cifs_revalidate_mapping(struct inode *in
return 0;
rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (rc)
return rc;
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -760,8 +760,9 @@ wait_for_response(struct TCP_Server_Info
{
int error;
- error = wait_event_freezekillable_unsafe(server->response_q,
- midQ->mid_state != MID_REQUEST_SUBMITTED);
+ error = wait_event_state(server->response_q,
+ midQ->mid_state != MID_REQUEST_SUBMITTED,
+ (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE));
if (error < 0)
return -ERESTARTSYS;
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -465,9 +465,8 @@ static int coredump_wait(int exit_code,
if (core_waiters > 0) {
struct core_thread *ptr;
- freezer_do_not_count();
- wait_for_completion(&core_state->startup);
- freezer_count();
+ wait_for_completion_state(&core_state->startup,
+ TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
/*
* Wait for all the threads to become inactive, so that
* all the thread context (extended register state, like
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -558,7 +558,8 @@ static vm_fault_t nfs_vm_page_mkwrite(st
nfs_fscache_wait_on_page_write(NFS_I(inode), page);
wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
lock_page(page);
mapping = page_file_mapping(page);
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -72,18 +72,13 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fat
return nfs_fileid_to_ino_t(fattr->fileid);
}
-static int nfs_wait_killable(int mode)
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- freezable_schedule_unsafe();
+ schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
return 0;
}
-
-int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
-{
- return nfs_wait_killable(mode);
-}
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
/**
@@ -1327,7 +1322,8 @@ int nfs_clear_invalid_mapping(struct add
*/
for (;;) {
ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (ret)
goto out;
spin_lock(&inode->i_lock);
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -36,7 +36,8 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt,
res = rpc_call_sync(clnt, msg, flags);
if (res != -EJUKEBOX)
break;
- freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
+ __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
} while (!fatal_signal_pending(current));
return res;
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -411,8 +411,8 @@ static int nfs4_delay_killable(long *tim
{
might_sleep();
- freezable_schedule_timeout_killable_unsafe(
- nfs4_update_delay(timeout));
+ __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(nfs4_update_delay(timeout));
if (!__fatal_signal_pending(current))
return 0;
return -EINTR;
@@ -422,7 +422,8 @@ static int nfs4_delay_interruptible(long
{
might_sleep();
- freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout));
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(nfs4_update_delay(timeout));
if (!signal_pending(current))
return 0;
return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS;
@@ -7320,7 +7321,8 @@ nfs4_retry_setlk_simple(struct nfs4_stat
status = nfs4_proc_setlk(state, cmd, request);
if ((status != -EAGAIN) || IS_SETLK(cmd))
break;
- freezable_schedule_timeout_interruptible(timeout);
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ schedule_timeout(timeout);
timeout *= 2;
timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
status = -ERESTARTSYS;
@@ -7388,10 +7390,8 @@ nfs4_retry_setlk(struct nfs4_state *stat
break;
status = -ERESTARTSYS;
- freezer_do_not_count();
- wait_woken(&waiter.wait, TASK_INTERRUPTIBLE,
+ wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE,
NFS4_LOCK_MAXTIMEOUT);
- freezer_count();
} while (!signalled());
remove_wait_queue(q, &waiter.wait);
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1307,7 +1307,8 @@ int nfs4_wait_clnt_recover(struct nfs_cl
refcount_inc(&clp->cl_count);
res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (res)
goto out;
if (clp->cl_cons_state < 0)
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1896,7 +1896,7 @@ static int pnfs_prepare_to_retry_layoutg
pnfs_layoutcommit_inode(lo->plh_inode, false);
return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
nfs_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
}
static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
@@ -3176,7 +3176,7 @@ pnfs_layoutcommit_inode(struct inode *in
status = wait_on_bit_lock_action(&nfsi->flags,
NFS_INO_LAYOUTCOMMITTING,
nfs_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (status)
goto out;
}
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -590,9 +590,9 @@ xfsaild(
while (1) {
if (tout && tout <= 20)
- set_current_state(TASK_KILLABLE);
+ set_current_state(TASK_KILLABLE|TASK_FREEZABLE);
else
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
/*
* Check kthread_should_stop() after we set the task state to
@@ -641,14 +641,14 @@ xfsaild(
ailp->ail_target == ailp->ail_target_prev &&
list_empty(&ailp->ail_buf_list)) {
spin_unlock(&ailp->ail_lock);
- freezable_schedule();
+ schedule();
tout = 0;
continue;
}
spin_unlock(&ailp->ail_lock);
if (tout)
- freezable_schedule_timeout(msecs_to_jiffies(tout));
+ schedule_timeout(msecs_to_jiffies(tout));
__set_current_state(TASK_RUNNING);
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -103,6 +103,7 @@ extern void wait_for_completion(struct c
extern void wait_for_completion_io(struct completion *);
extern int wait_for_completion_interruptible(struct completion *x);
extern int wait_for_completion_killable(struct completion *x);
+extern int wait_for_completion_state(struct completion *x, unsigned int state);
extern unsigned long wait_for_completion_timeout(struct completion *x,
unsigned long timeout);
extern unsigned long wait_for_completion_io_timeout(struct completion *x,
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -8,9 +8,11 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/atomic.h>
+#include <linux/jump_label.h>
#ifdef CONFIG_FREEZER
-extern atomic_t system_freezing_cnt; /* nr of freezing conds in effect */
+DECLARE_STATIC_KEY_FALSE(freezer_active);
+
extern bool pm_freezing; /* PM freezing in effect */
extern bool pm_nosig_freezing; /* PM nosig freezing in effect */
@@ -22,10 +24,7 @@ extern unsigned int freeze_timeout_msecs
/*
* Check if a process has been frozen
*/
-static inline bool frozen(struct task_struct *p)
-{
- return p->flags & PF_FROZEN;
-}
+extern bool frozen(struct task_struct *p);
extern bool freezing_slow_path(struct task_struct *p);
@@ -34,9 +33,10 @@ extern bool freezing_slow_path(struct ta
*/
static inline bool freezing(struct task_struct *p)
{
- if (likely(!atomic_read(&system_freezing_cnt)))
- return false;
- return freezing_slow_path(p);
+ if (static_branch_unlikely(&freezer_active))
+ return freezing_slow_path(p);
+
+ return false;
}
/* Takes and releases task alloc lock using task_lock() */
@@ -48,23 +48,14 @@ extern int freeze_kernel_threads(void);
extern void thaw_processes(void);
extern void thaw_kernel_threads(void);
-/*
- * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
- * If try_to_freeze causes a lockdep warning it means the caller may deadlock
- */
-static inline bool try_to_freeze_unsafe(void)
+static inline bool try_to_freeze(void)
{
might_sleep();
if (likely(!freezing(current)))
return false;
- return __refrigerator(false);
-}
-
-static inline bool try_to_freeze(void)
-{
if (!(current->flags & PF_NOFREEZE))
debug_check_no_locks_held();
- return try_to_freeze_unsafe();
+ return __refrigerator(false);
}
extern bool freeze_task(struct task_struct *p);
@@ -79,195 +70,6 @@ static inline bool cgroup_freezing(struc
}
#endif /* !CONFIG_CGROUP_FREEZER */
-/*
- * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
- * calls wait_for_completion(&vfork) and reset right after it returns from this
- * function. Next, the parent should call try_to_freeze() to freeze itself
- * appropriately in case the child has exited before the freezing of tasks is
- * complete. However, we don't want kernel threads to be frozen in unexpected
- * places, so we allow them to block freeze_processes() instead or to set
- * PF_NOFREEZE if needed. Fortunately, in the ____call_usermodehelper() case the
- * parent won't really block freeze_processes(), since ____call_usermodehelper()
- * (the child) does a little before exec/exit and it can't be frozen before
- * waking up the parent.
- */
-
-
-/**
- * freezer_do_not_count - tell freezer to ignore %current
- *
- * Tell freezers to ignore the current task when determining whether the
- * target frozen state is reached. IOW, the current task will be
- * considered frozen enough by freezers.
- *
- * The caller shouldn't do anything which isn't allowed for a frozen task
- * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair
- * wrap a scheduling operation and nothing much else.
- */
-static inline void freezer_do_not_count(void)
-{
- current->flags |= PF_FREEZER_SKIP;
-}
-
-/**
- * freezer_count - tell freezer to stop ignoring %current
- *
- * Undo freezer_do_not_count(). It tells freezers that %current should be
- * considered again and tries to freeze if freezing condition is already in
- * effect.
- */
-static inline void freezer_count(void)
-{
- current->flags &= ~PF_FREEZER_SKIP;
- /*
- * If freezing is in progress, the following paired with smp_mb()
- * in freezer_should_skip() ensures that either we see %true
- * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP.
- */
- smp_mb();
- try_to_freeze();
-}
-
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline void freezer_count_unsafe(void)
-{
- current->flags &= ~PF_FREEZER_SKIP;
- smp_mb();
- try_to_freeze_unsafe();
-}
-
-/**
- * freezer_should_skip - whether to skip a task when determining frozen
- * state is reached
- * @p: task in quesion
- *
- * This function is used by freezers after establishing %true freezing() to
- * test whether a task should be skipped when determining the target frozen
- * state is reached. IOW, if this function returns %true, @p is considered
- * frozen enough.
- */
-static inline bool freezer_should_skip(struct task_struct *p)
-{
- /*
- * The following smp_mb() paired with the one in freezer_count()
- * ensures that either freezer_count() sees %true freezing() or we
- * see cleared %PF_FREEZER_SKIP and return %false. This makes it
- * impossible for a task to slip frozen state testing after
- * clearing %PF_FREEZER_SKIP.
- */
- smp_mb();
- return p->flags & PF_FREEZER_SKIP;
-}
-
-/*
- * These functions are intended to be used whenever you want allow a sleeping
- * task to be frozen. Note that neither return any clear indication of
- * whether a freeze event happened while in this function.
- */
-
-/* Like schedule(), but should not block the freezer. */
-static inline void freezable_schedule(void)
-{
- freezer_do_not_count();
- schedule();
- freezer_count();
-}
-
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline void freezable_schedule_unsafe(void)
-{
- freezer_do_not_count();
- schedule();
- freezer_count_unsafe();
-}
-
-/*
- * Like schedule_timeout(), but should not block the freezer. Do not
- * call this with locks held.
- */
-static inline long freezable_schedule_timeout(long timeout)
-{
- long __retval;
- freezer_do_not_count();
- __retval = schedule_timeout(timeout);
- freezer_count();
- return __retval;
-}
-
-/*
- * Like schedule_timeout_interruptible(), but should not block the freezer. Do not
- * call this with locks held.
- */
-static inline long freezable_schedule_timeout_interruptible(long timeout)
-{
- long __retval;
- freezer_do_not_count();
- __retval = schedule_timeout_interruptible(timeout);
- freezer_count();
- return __retval;
-}
-
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline long freezable_schedule_timeout_interruptible_unsafe(long timeout)
-{
- long __retval;
-
- freezer_do_not_count();
- __retval = schedule_timeout_interruptible(timeout);
- freezer_count_unsafe();
- return __retval;
-}
-
-/* Like schedule_timeout_killable(), but should not block the freezer. */
-static inline long freezable_schedule_timeout_killable(long timeout)
-{
- long __retval;
- freezer_do_not_count();
- __retval = schedule_timeout_killable(timeout);
- freezer_count();
- return __retval;
-}
-
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
-{
- long __retval;
- freezer_do_not_count();
- __retval = schedule_timeout_killable(timeout);
- freezer_count_unsafe();
- return __retval;
-}
-
-/*
- * Like schedule_hrtimeout_range(), but should not block the freezer. Do not
- * call this with locks held.
- */
-static inline int freezable_schedule_hrtimeout_range(ktime_t *expires,
- u64 delta, const enum hrtimer_mode mode)
-{
- int __retval;
- freezer_do_not_count();
- __retval = schedule_hrtimeout_range(expires, delta, mode);
- freezer_count();
- return __retval;
-}
-
-/*
- * Freezer-friendly wrappers around wait_event_interruptible(),
- * wait_event_killable() and wait_event_interruptible_timeout(), originally
- * defined in <linux/wait.h>
- */
-
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-#define wait_event_freezekillable_unsafe(wq, condition) \
-({ \
- int __retval; \
- freezer_do_not_count(); \
- __retval = wait_event_killable(wq, (condition)); \
- freezer_count_unsafe(); \
- __retval; \
-})
-
#else /* !CONFIG_FREEZER */
static inline bool frozen(struct task_struct *p) { return false; }
static inline bool freezing(struct task_struct *p) { return false; }
@@ -281,35 +83,9 @@ static inline void thaw_kernel_threads(v
static inline bool try_to_freeze(void) { return false; }
-static inline void freezer_do_not_count(void) {}
static inline void freezer_count(void) {}
-static inline int freezer_should_skip(struct task_struct *p) { return 0; }
static inline void set_freezable(void) {}
-#define freezable_schedule() schedule()
-
-#define freezable_schedule_unsafe() schedule()
-
-#define freezable_schedule_timeout(timeout) schedule_timeout(timeout)
-
-#define freezable_schedule_timeout_interruptible(timeout) \
- schedule_timeout_interruptible(timeout)
-
-#define freezable_schedule_timeout_interruptible_unsafe(timeout) \
- schedule_timeout_interruptible(timeout)
-
-#define freezable_schedule_timeout_killable(timeout) \
- schedule_timeout_killable(timeout)
-
-#define freezable_schedule_timeout_killable_unsafe(timeout) \
- schedule_timeout_killable(timeout)
-
-#define freezable_schedule_hrtimeout_range(expires, delta, mode) \
- schedule_hrtimeout_range(expires, delta, mode)
-
-#define wait_event_freezekillable_unsafe(wq, condition) \
- wait_event_killable(wq, condition)
-
#endif /* !CONFIG_FREEZER */
#endif /* FREEZER_H_INCLUDED */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -80,25 +80,32 @@ struct task_group;
*/
/* Used in tsk->state: */
-#define TASK_RUNNING 0x0000
-#define TASK_INTERRUPTIBLE 0x0001
-#define TASK_UNINTERRUPTIBLE 0x0002
-#define __TASK_STOPPED 0x0004
-#define __TASK_TRACED 0x0008
+#define TASK_RUNNING 0x000000
+#define TASK_INTERRUPTIBLE 0x000001
+#define TASK_UNINTERRUPTIBLE 0x000002
+#define __TASK_STOPPED 0x000004
+#define __TASK_TRACED 0x000008
/* Used in tsk->exit_state: */
-#define EXIT_DEAD 0x0010
-#define EXIT_ZOMBIE 0x0020
+#define EXIT_DEAD 0x000010
+#define EXIT_ZOMBIE 0x000020
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->state again: */
-#define TASK_PARKED 0x0040
-#define TASK_DEAD 0x0080
-#define TASK_WAKEKILL 0x0100
-#define TASK_WAKING 0x0200
-#define TASK_NOLOAD 0x0400
-#define TASK_NEW 0x0800
-/* RT specific auxilliary flag to mark RT lock waiters */
-#define TASK_RTLOCK_WAIT 0x1000
-#define TASK_STATE_MAX 0x2000
+#define TASK_PARKED 0x000040
+#define TASK_DEAD 0x000080
+#define TASK_WAKEKILL 0x000100
+#define TASK_WAKING 0x000200
+#define TASK_NOLOAD 0x000400
+#define TASK_NEW 0x000800
+#define TASK_FREEZABLE 0x001000
+#define __TASK_FREEZABLE_UNSAFE (0x002000 * IS_ENABLED(CONFIG_LOCKDEP))
+#define TASK_FROZEN 0x004000
+#define TASK_RTLOCK_WAIT 0x008000
+#define TASK_STATE_MAX 0x010000
+
+/*
+ * DO NOT ADD ANY NEW USERS !
+ */
+#define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE)
/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -1695,7 +1702,6 @@ extern struct pid *cad_pid;
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
-#define PF_FROZEN 0x00010000 /* Frozen for system suspend */
#define PF_KSWAPD 0x00020000 /* I am kswapd */
#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
@@ -1707,7 +1713,6 @@ extern struct pid *cad_pid;
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
-#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
/*
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -265,7 +265,7 @@ int rpc_malloc(struct rpc_task *);
void rpc_free(struct rpc_task *);
int rpciod_up(void);
void rpciod_down(void);
-int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *);
+int rpc_wait_for_completion_task(struct rpc_task *task);
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
struct net;
void rpc_show_tasks(struct net *);
@@ -276,11 +276,6 @@ extern struct workqueue_struct *rpciod_w
extern struct workqueue_struct *xprtiod_workqueue;
void rpc_prepare_task(struct rpc_task *task);
-static inline int rpc_wait_for_completion_task(struct rpc_task *task)
-{
- return __rpc_wait_for_completion_task(task, NULL);
-}
-
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS)
static inline const char * rpc_qname(const struct rpc_wait_queue *q)
{
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -335,8 +335,8 @@ do { \
} while (0)
#define __wait_event_freezable(wq_head, condition) \
- ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \
- freezable_schedule())
+ ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \
+ 0, 0, schedule())
/**
* wait_event_freezable - sleep (or freeze) until a condition gets true
@@ -394,8 +394,8 @@ do { \
#define __wait_event_freezable_timeout(wq_head, condition, timeout) \
___wait_event(wq_head, ___wait_cond_timeout(condition), \
- TASK_INTERRUPTIBLE, 0, timeout, \
- __ret = freezable_schedule_timeout(__ret))
+ (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \
+ __ret = schedule_timeout(__ret))
/*
* like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
@@ -615,8 +615,8 @@ do { \
#define __wait_event_freezable_exclusive(wq, condition) \
- ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
- freezable_schedule())
+ ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\
+ schedule())
#define wait_event_freezable_exclusive(wq, condition) \
({ \
@@ -905,6 +905,34 @@ extern int do_wait_intr_irq(wait_queue_h
__ret; \
})
+#define __wait_event_state(wq, condition, state) \
+ ___wait_event(wq, condition, state, 0, 0, schedule())
+
+/**
+ * wait_event_state - sleep until a condition gets true
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @state: state to sleep in
+ *
+ * The process is put to sleep (@state) until the @condition evaluates to true
+ * or a signal is received. The @condition is checked each time the waitqueue
+ * @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_state(wq_head, condition, state) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_state(wq_head, condition, state); \
+ __ret; \
+})
+
#define __wait_event_killable_timeout(wq_head, condition, timeout) \
___wait_event(wq_head, ___wait_cond_timeout(condition), \
TASK_KILLABLE, 0, timeout, \
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -113,7 +113,7 @@ static int freezer_css_online(struct cgr
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
}
mutex_unlock(&freezer_mutex);
@@ -134,7 +134,7 @@ static void freezer_css_offline(struct c
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
freezer->state = 0;
@@ -179,6 +179,7 @@ static void freezer_attach(struct cgroup
__thaw_task(task);
} else {
freeze_task(task);
+
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
@@ -271,16 +272,8 @@ static void update_if_frozen(struct cgro
css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
+ if (freezing(task) && !frozen(task))
+ goto out_iter_end;
}
freezer->state |= CGROUP_FROZEN;
@@ -357,7 +350,7 @@ static void freezer_apply_state(struct f
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -366,9 +359,9 @@ static void freezer_apply_state(struct f
freezer->state &= ~state;
if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
freezer->state &= ~CGROUP_FROZEN;
+ if (was_freezing)
+ static_branch_dec(&freezer_active);
unfreeze_cgroup(freezer);
}
}
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -469,10 +469,10 @@ static void exit_mm(void)
complete(&core_state->startup);
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
if (!self.task) /* see coredump_finish() */
break;
- freezable_schedule();
+ schedule();
}
__set_current_state(TASK_RUNNING);
mmap_read_lock(mm);
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1348,13 +1348,12 @@ static void complete_vfork_done(struct t
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
+ unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
int killed;
- freezer_do_not_count();
cgroup_enter_frozen();
- killed = wait_for_completion_killable(vfork);
+ killed = wait_for_completion_state(vfork, state);
cgroup_leave_frozen(false);
- freezer_count();
if (killed) {
task_lock(child);
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -13,10 +13,11 @@
#include <linux/kthread.h>
/* total number of freezing conditions in effect */
-atomic_t system_freezing_cnt = ATOMIC_INIT(0);
-EXPORT_SYMBOL(system_freezing_cnt);
+DEFINE_STATIC_KEY_FALSE(freezer_active);
+EXPORT_SYMBOL(freezer_active);
-/* indicate whether PM freezing is in effect, protected by
+/*
+ * indicate whether PM freezing is in effect, protected by
* system_transition_mutex
*/
bool pm_freezing;
@@ -29,7 +30,7 @@ static DEFINE_SPINLOCK(freezer_lock);
* freezing_slow_path - slow path for testing whether a task needs to be frozen
* @p: task to be tested
*
- * This function is called by freezing() if system_freezing_cnt isn't zero
+ * This function is called by freezing() if freezer_active isn't zero
* and tests whether @p needs to enter and stay in frozen state. Can be
* called under any context. The freezers are responsible for ensuring the
* target tasks see the updated state.
@@ -52,41 +53,40 @@ bool freezing_slow_path(struct task_stru
}
EXPORT_SYMBOL(freezing_slow_path);
+bool frozen(struct task_struct *p)
+{
+ return READ_ONCE(p->__state) & TASK_FROZEN;
+}
+
/* Refrigerator is place where frozen processes are stored :-). */
bool __refrigerator(bool check_kthr_stop)
{
- /* Hmm, should we be allowed to suspend when there are realtime
- processes around? */
+ unsigned int state = get_current_state();
bool was_frozen = false;
- unsigned int save = get_current_state();
pr_debug("%s entered refrigerator\n", current->comm);
+ WARN_ON_ONCE(state && !(state & TASK_NORMAL));
+
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ bool freeze;
+
+ set_current_state(TASK_FROZEN);
spin_lock_irq(&freezer_lock);
- current->flags |= PF_FROZEN;
- if (!freezing(current) ||
- (check_kthr_stop && kthread_should_stop()))
- current->flags &= ~PF_FROZEN;
+ freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
spin_unlock_irq(&freezer_lock);
- if (!(current->flags & PF_FROZEN))
+ if (!freeze)
break;
+
was_frozen = true;
schedule();
}
+ __set_current_state(TASK_RUNNING);
pr_debug("%s left refrigerator\n", current->comm);
- /*
- * Restore saved task state before returning. The mb'd version
- * needs to be used; otherwise, it might silently break
- * synchronization which depends on ordered task state change.
- */
- set_current_state(save);
-
return was_frozen;
}
EXPORT_SYMBOL(__refrigerator);
@@ -101,6 +101,37 @@ static void fake_signal_wake_up(struct t
}
}
+static inline unsigned int __can_freeze(struct task_struct *p)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
+ return 0;
+
+ /*
+ * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
+ * can suffer spurious wakeups.
+ */
+ if (state & TASK_FREEZABLE)
+ WARN_ON_ONCE(!(state & TASK_NORMAL));
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It's dangerous to freeze with locks held; there be dragons there.
+ */
+ if (!(state & __TASK_FREEZABLE_UNSAFE))
+ WARN_ON_ONCE(debug_locks && p->lockdep_depth);
+#endif
+
+ return TASK_FROZEN;
+}
+
+/* See task_cond_set_special_state(); serializes against ttwu() */
+static bool __freeze_task(struct task_struct *p)
+{
+ return task_cond_set_special_state(p, __can_freeze(p));
+}
+
/**
* freeze_task - send a freeze request to given task
* @p: task to send the request to
@@ -116,20 +147,8 @@ bool freeze_task(struct task_struct *p)
{
unsigned long flags;
- /*
- * This check can race with freezer_do_not_count, but worst case that
- * will result in an extra wakeup being sent to the task. It does not
- * race with freezer_count(), the barriers in freezer_count() and
- * freezer_should_skip() ensure that either freezer_count() sees
- * freezing == true in try_to_freeze() and freezes, or
- * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
- * normally.
- */
- if (freezer_should_skip(p))
- return false;
-
spin_lock_irqsave(&freezer_lock, flags);
- if (!freezing(p) || frozen(p)) {
+ if (!freezing(p) || frozen(p) || __freeze_task(p)) {
spin_unlock_irqrestore(&freezer_lock, flags);
return false;
}
@@ -137,19 +156,58 @@ bool freeze_task(struct task_struct *p)
if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
else
- wake_up_state(p, TASK_INTERRUPTIBLE);
+ wake_up_state(p, TASK_NORMAL);
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
}
+/*
+ * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
+ * state in p->jobctl and p->ptrace respectively. If either of them got a
+ * wakeup that was missed because TASK_FROZEN, then their canonical state
+ * reflects that and the below will refuse to restore the special state and
+ * instead issue the wakeup.
+ */
+static inline unsigned int __thaw_special(struct task_struct *p)
+{
+ unsigned int state = 0;
+
+ if (p->ptrace & PT_STOPPED) {
+ state = __TASK_TRACED;
+
+ if (p->ptrace & PT_STOPPED_FATAL) {
+ state |= TASK_WAKEKILL;
+ if (__fatal_signal_pending(p))
+ state = 0;
+ }
+
+ } else if ((p->jobctl & JOBCTL_STOP_PENDING) &&
+ !__fatal_signal_pending(p)) {
+
+ state = TASK_STOPPED;
+ }
+
+ return state;
+}
+
void __thaw_task(struct task_struct *p)
{
- unsigned long flags;
+ unsigned long flags, flags2;
spin_lock_irqsave(&freezer_lock, flags);
- if (frozen(p))
- wake_up_process(p);
+ if (WARN_ON_ONCE(freezing(p)))
+ goto unlock;
+
+ if (lock_task_sighand(p, &flags2)) {
+ bool ret = task_cond_set_special_state(p, __thaw_special(p));
+ unlock_task_sighand(p, &flags2);
+ if (ret)
+ goto unlock;
+ }
+
+ wake_up_state(p, TASK_FROZEN);
+unlock:
spin_unlock_irqrestore(&freezer_lock, flags);
}
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2832,7 +2832,7 @@ static void futex_wait_queue_me(struct f
* queue_me() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
queue_me(q, hb);
/* Arm the timer */
@@ -2850,7 +2850,7 @@ static void futex_wait_queue_me(struct f
* is no timeout, or if it has yet to expire.
*/
if (!timeout || timeout->task)
- freezable_schedule();
+ schedule();
}
__set_current_state(TASK_RUNNING);
}
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -93,8 +93,8 @@ static void check_hung_task(struct task_
* Ensure the task is not frozen.
* Also, skip vfork and any other user process that freezer should skip.
*/
- if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
- return;
+ if (unlikely(READ_ONCE(t->__state) & (TASK_FREEZABLE | TASK_FROZEN)))
+ return;
/*
* When a freshly created task is scheduled once, changes its state to
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -24,7 +24,7 @@
unsigned int lock_system_sleep(void)
{
unsigned int flags = current->flags;
- current->flags |= PF_FREEZER_SKIP;
+ current->flags |= PF_NOFREEZE;
mutex_lock(&system_transition_mutex);
return flags;
}
@@ -48,8 +48,8 @@ void unlock_system_sleep(unsigned int fl
* Which means, if we use try_to_freeze() here, it would make them
* enter the refrigerator, thus causing hibernation to lockup.
*/
- if (!(flags & PF_FREEZER_SKIP))
- current->flags &= ~PF_FREEZER_SKIP;
+ if (!(flags & PF_NOFREEZE))
+ current->flags &= ~PF_NOFREEZE;
mutex_unlock(&system_transition_mutex);
}
EXPORT_SYMBOL_GPL(unlock_system_sleep);
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -53,8 +53,7 @@ static int try_to_freeze_tasks(bool user
if (p == current || !freeze_task(p))
continue;
- if (!freezer_should_skip(p))
- todo++;
+ todo++;
}
read_unlock(&tasklist_lock);
@@ -99,8 +98,7 @@ static int try_to_freeze_tasks(bool user
if (!wakeup || pm_debug_messages_on) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
- if (p != current && !freezer_should_skip(p)
- && freezing(p) && !frozen(p))
+ if (p != current && freezing(p) && !frozen(p))
sched_show_task(p);
}
read_unlock(&tasklist_lock);
@@ -132,7 +130,7 @@ int freeze_processes(void)
current->flags |= PF_SUSPEND_TASK;
if (!pm_freezing)
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
pm_wakeup_clear(true);
pr_info("Freezing user space processes ... ");
@@ -193,7 +191,7 @@ void thaw_processes(void)
trace_suspend_resume(TPS("thaw_processes"), 0, true);
if (pm_freezing)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
pm_freezing = false;
pm_nosig_freezing = false;
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -282,7 +282,7 @@ static int ptrace_check_attach(struct ta
read_unlock(&tasklist_lock);
if (!ret && !ignore_state) {
- if (!wait_task_inactive(child, __TASK_TRACED)) { // XXX mooo!!!
+ if (!wait_task_inactive(child, __TASK_TRACED | TASK_FREEZABLE)) {
/*
* This can only happen if may_ptrace_stop() fails and
* ptrace_stop() changes ->state back to TASK_RUNNING,
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -247,6 +247,15 @@ int __sched wait_for_completion_killable
}
EXPORT_SYMBOL(wait_for_completion_killable);
+int __sched wait_for_completion_state(struct completion *x, unsigned int state)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_state);
+
/**
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
* @x: holds the state of this particular completion
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3162,6 +3162,19 @@ int migrate_swap(struct task_struct *cur
}
#endif /* CONFIG_NUMA_BALANCING */
+static inline __wti_match(struct task_struct *p, unsigned int match_state)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ if ((match_state & TASK_FREEZABLE) && state == TASK_FROZEN)
+ return true;
+
+ if (state == (match_state & ~TASK_FREEZABLE))
+ return true;
+
+ return false;
+}
+
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -3206,7 +3219,7 @@ unsigned long wait_task_inactive(struct
* is actually now running somewhere else!
*/
while (task_running(rq, p)) {
- if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+ if (match_state && __wti_match(p, match_state))
return 0;
cpu_relax();
}
@@ -3221,7 +3234,7 @@ unsigned long wait_task_inactive(struct
running = task_running(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
- if (!match_state || READ_ONCE(p->__state) == match_state)
+ if (!match_state || __wti_match(p, match_state))
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
@@ -6154,7 +6167,7 @@ static void __sched notrace __schedule(u
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
- !(prev->flags & PF_FROZEN);
+ !(prev_state & TASK_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2299,7 +2299,7 @@ static void ptrace_stop(int exit_code, i
read_unlock(&tasklist_lock);
cgroup_enter_frozen();
preempt_enable_no_resched();
- freezable_schedule();
+ schedule();
cgroup_leave_frozen(true);
} else {
/*
@@ -2479,7 +2479,7 @@ static bool do_signal_stop(int signr)
/* Now we don't run again until woken by SIGCONT or SIGKILL */
cgroup_enter_frozen();
- freezable_schedule();
+ schedule();
return true;
} else {
/*
@@ -2555,11 +2555,11 @@ static void do_freezer_trap(void)
* immediately (if there is a non-fatal signal pending), and
* put the task into sleep.
*/
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
clear_thread_flag(TIF_SIGPENDING);
spin_unlock_irq(¤t->sighand->siglock);
cgroup_enter_frozen();
- freezable_schedule();
+ schedule();
}
static int ptrace_signal(int signr, kernel_siginfo_t *info)
@@ -3608,9 +3608,9 @@ static int do_sigtimedwait(const sigset_
recalc_sigpending();
spin_unlock_irq(&tsk->sighand->siglock);
- __set_current_state(TASK_INTERRUPTIBLE);
- ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
- HRTIMER_MODE_REL);
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns,
+ HRTIMER_MODE_REL);
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
sigemptyset(&tsk->real_blocked);
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2037,11 +2037,11 @@ static int __sched do_nanosleep(struct h
struct restart_block *restart;
do {
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
hrtimer_sleeper_start_expires(t, mode);
if (likely(t->task))
- freezable_schedule();
+ schedule();
hrtimer_cancel(&t->timer);
mode = HRTIMER_MODE_ABS;
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -404,6 +404,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup)
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
+ unsigned int state = TASK_UNINTERRUPTIBLE;
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
@@ -437,25 +438,22 @@ int call_usermodehelper_exec(struct subp
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
+ if (wait & UMH_KILLABLE)
+ state |= TASK_KILLABLE;
+
if (wait & UMH_FREEZABLE)
- freezer_do_not_count();
+ state |= TASK_FREEZABLE;
- if (wait & UMH_KILLABLE) {
- retval = wait_for_completion_killable(&done);
- if (!retval)
- goto wait_done;
+ retval = wait_for_completion_state(&done, state);
+ if (!retval)
+ goto wait_done;
+ if (wait & UMH_KILLABLE) {
/* umh_complete() will see NULL and free sub_info */
if (xchg(&sub_info->complete, NULL))
goto unlock;
- /* fallthrough, umh_complete() was already called */
}
- wait_for_completion(&done);
-
- if (wait & UMH_FREEZABLE)
- freezer_count();
-
wait_done:
retval = sub_info->retval;
out:
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -793,8 +793,8 @@ static void khugepaged_alloc_sleep(void)
DEFINE_WAIT(wait);
add_wait_queue(&khugepaged_wait, &wait);
- freezable_schedule_timeout_interruptible(
- msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
remove_wait_queue(&khugepaged_wait, &wait);
}
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -268,7 +268,7 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue
static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- freezable_schedule_unsafe();
+ schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
return 0;
@@ -324,14 +324,12 @@ static int rpc_complete_task(struct rpc_
* to enforce taking of the wq->lock and hence avoid races with
* rpc_complete_task().
*/
-int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action)
+int rpc_wait_for_completion_task(struct rpc_task *task)
{
- if (action == NULL)
- action = rpc_wait_bit_killable;
return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
- action, TASK_KILLABLE);
+ rpc_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
}
-EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
+EXPORT_SYMBOL_GPL(rpc_wait_for_completion_task);
/*
* Make an RPC task runnable.
@@ -938,7 +936,7 @@ static void __rpc_execute(struct rpc_tas
trace_rpc_task_sync_sleep(task, task->tk_action);
status = out_of_line_wait_on_bit(&task->tk_runstate,
RPC_TASK_QUEUED, rpc_wait_bit_killable,
- TASK_KILLABLE);
+ TASK_KILLABLE|TASK_FREEZABLE);
if (status < 0) {
/*
* When a sync task receives a signal, it exits with
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2442,13 +2442,14 @@ static long unix_stream_data_wait(struct
struct sk_buff *last, unsigned int last_len,
bool freezable)
{
+ unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
struct sk_buff *tail;
DEFINE_WAIT(wait);
unix_state_lock(sk);
for (;;) {
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, state);
tail = skb_peek_tail(&sk->sk_receive_queue);
if (tail != last ||
@@ -2461,10 +2462,7 @@ static long unix_stream_data_wait(struct
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
unix_state_unlock(sk);
- if (freezable)
- timeo = freezable_schedule_timeout(timeo);
- else
- timeo = schedule_timeout(timeo);
+ timeo = schedule_timeout(timeo);
unix_state_lock(sk);
if (sock_flag(sk, SOCK_DEAD))
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags
2021-10-09 10:07 ` [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
@ 2021-10-14 8:58 ` Will Deacon
0 siblings, 0 replies; 12+ messages in thread
From: Will Deacon @ 2021-10-14 8:58 UTC (permalink / raw)
To: Peter Zijlstra
Cc: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, linux-kernel, tj, linux-pm
On Sat, Oct 09, 2021 at 12:07:55PM +0200, Peter Zijlstra wrote:
> Rafael explained that the reason for having both PF_NOFREEZE and
> PF_FREEZER_SKIP is that {,un}lock_system_sleep() is callable from
> kthread context that has previously called set_freezable().
>
> In preparation of merging the flags, have {,un}lock_system_slee() save
> and restore current->flags.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> drivers/scsi/scsi_transport_spi.c | 7 ++++---
> include/linux/suspend.h | 8 ++++----
> kernel/power/hibernate.c | 35 ++++++++++++++++++++++-------------
> kernel/power/main.c | 16 ++++++++++------
> kernel/power/suspend.c | 12 ++++++++----
> kernel/power/user.c | 24 ++++++++++++++----------
> 6 files changed, 62 insertions(+), 40 deletions(-)
There's lots of Christmas tree inversion sprinkled in here, but the change
looks good so:
Acked-by: Will Deacon <will@kernel.org>
Will
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction
2021-10-09 10:07 ` [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction Peter Zijlstra
@ 2021-10-14 9:13 ` Will Deacon
0 siblings, 0 replies; 12+ messages in thread
From: Will Deacon @ 2021-10-14 9:13 UTC (permalink / raw)
To: Peter Zijlstra
Cc: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, linux-kernel, tj, linux-pm
On Sat, Oct 09, 2021 at 12:07:56PM +0200, Peter Zijlstra wrote:
> handle_initrd() marks itself as PF_FREEZER_SKIP in order to ensure
> that the UMH, which is going to freeze the system, doesn't
> indefinitely wait for it's caller.
>
> Rework things by adding UMH_FREEZABLE to indicate the completion is
> freezable.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> include/linux/umh.h | 9 +++++----
> init/do_mounts_initrd.c | 10 +---------
> kernel/umh.c | 8 ++++++++
> 3 files changed, 14 insertions(+), 13 deletions(-)
This looks much better to me:
Acked-by: Will Deacon <will@kernel.org>
Just a question on the old code:
> --- a/include/linux/umh.h
> +++ b/include/linux/umh.h
> @@ -11,10 +11,11 @@
> struct cred;
> struct file;
>
> -#define UMH_NO_WAIT 0 /* don't wait at all */
> -#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */
> -#define UMH_WAIT_PROC 2 /* wait for the process to complete */
> -#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */
> +#define UMH_NO_WAIT 0x00 /* don't wait at all */
> +#define UMH_WAIT_EXEC 0x01 /* wait for the exec, but not the process */
> +#define UMH_WAIT_PROC 0x02 /* wait for the process to complete */
> +#define UMH_KILLABLE 0x04 /* wait for EXEC/PROC killable */
> +#define UMH_FREEZABLE 0x08 /* wait for EXEC/PROC freezable */
>
> struct subprocess_info {
> struct work_struct work;
> --- a/init/do_mounts_initrd.c
> +++ b/init/do_mounts_initrd.c
> @@ -79,19 +79,11 @@ static void __init handle_initrd(void)
> init_mkdir("/old", 0700);
> init_chdir("/old");
>
> - /*
> - * In case that a resume from disk is carried out by linuxrc or one of
> - * its children, we need to tell the freezer not to wait for us.
> - */
> - current->flags |= PF_FREEZER_SKIP;
> -
> info = call_usermodehelper_setup("/linuxrc", argv, envp_init,
> GFP_KERNEL, init_linuxrc, NULL, NULL);
> if (!info)
> return;
> - call_usermodehelper_exec(info, UMH_WAIT_PROC);
> -
> - current->flags &= ~PF_FREEZER_SKIP;
How was this supposed to work if it raced with the freezer checking the
flag?
Will
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 3/6] ptrace: Order and comment PT_flags
2021-10-09 10:07 ` [PATCH v3 3/6] ptrace: Order and comment PT_flags Peter Zijlstra
@ 2021-10-14 9:31 ` Will Deacon
2021-10-14 14:27 ` Peter Zijlstra
0 siblings, 1 reply; 12+ messages in thread
From: Will Deacon @ 2021-10-14 9:31 UTC (permalink / raw)
To: Peter Zijlstra
Cc: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, linux-kernel, tj, linux-pm
On Sat, Oct 09, 2021 at 12:07:57PM +0200, Peter Zijlstra wrote:
> Add a comment to the PT_flags to indicate their actual value, this
> makes it easier to see what bits are used and where there might be a
> possible hole to use.
>
> Notable PT_SEIZED was placed wrong, also PT_EVENT_FLAG() space seems
> ill defined, as written is seems to be meant to cover the entire
> PTRACE_O_ range offset by 3 bits, which would then be 3+[0..21],
> however PT_SEIZED is in the middle of that.
Why do you think PT_EVENT_FLAG() should cover all the PTRACE_O_* options?
Just going by the name and current callers, I'd only expect it to cover
the PTRACE_EVENT_* flags, no?
But in any case, having the comments is helpful, so:
Acked-by: Will Deacon <will@kernel.org>
Will
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 3/6] ptrace: Order and comment PT_flags
2021-10-14 9:31 ` Will Deacon
@ 2021-10-14 14:27 ` Peter Zijlstra
0 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-14 14:27 UTC (permalink / raw)
To: Will Deacon
Cc: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, linux-kernel, tj, linux-pm
On Thu, Oct 14, 2021 at 10:31:22AM +0100, Will Deacon wrote:
> On Sat, Oct 09, 2021 at 12:07:57PM +0200, Peter Zijlstra wrote:
> > Add a comment to the PT_flags to indicate their actual value, this
> > makes it easier to see what bits are used and where there might be a
> > possible hole to use.
> >
> > Notable PT_SEIZED was placed wrong, also PT_EVENT_FLAG() space seems
> > ill defined, as written is seems to be meant to cover the entire
> > PTRACE_O_ range offset by 3 bits, which would then be 3+[0..21],
> > however PT_SEIZED is in the middle of that.
>
> Why do you think PT_EVENT_FLAG() should cover all the PTRACE_O_* options?
> Just going by the name and current callers, I'd only expect it to cover
> the PTRACE_EVENT_* flags, no?
Because PT_EXITKILL and PT_SUSPEND_SECCOMP are also exposed in that same
mapping.
Ideally we'd change PT_OPT_FLAG_SHIFT to 8 or something and have the
high 24 bits for OPT and then use the low 8 bits for SEIZED and the new
flags.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic
2021-10-09 10:08 ` [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic Peter Zijlstra
@ 2021-10-18 13:36 ` Peter Zijlstra
0 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-18 13:36 UTC (permalink / raw)
To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
mgorman, Will Deacon
Cc: linux-kernel, tj, linux-pm
On Sat, Oct 09, 2021 at 12:08:00PM +0200, Peter Zijlstra wrote:
> +static inline unsigned int __can_freeze(struct task_struct *p)
> +{
> + unsigned int state = READ_ONCE(p->__state);
> +
> + if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
> + return 0;
> +
> + /*
> + * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
> + * can suffer spurious wakeups.
> + */
> + if (state & TASK_FREEZABLE)
> + WARN_ON_ONCE(!(state & TASK_NORMAL));
> +
> +#ifdef CONFIG_LOCKDEP
> + /*
> + * It's dangerous to freeze with locks held; there be dragons there.
> + */
> + if (!(state & __TASK_FREEZABLE_UNSAFE))
> + WARN_ON_ONCE(debug_locks && p->lockdep_depth);
> +#endif
> +
> + return TASK_FROZEN;
> +}
> +
> +/* See task_cond_set_special_state(); serializes against ttwu() */
> +static bool __freeze_task(struct task_struct *p)
> +{
> + return task_cond_set_special_state(p, __can_freeze(p));
> +}
Will found an issue with this, notably task_cond_set_special() only
takes ->pi_lock and as such doesn't serialize against __schedule(),
which then yields the following fun scenario:
__schedule() __freeze_task()
prev_state = READ_ONCE(prev->__state); // INTERRUPTIBLE
task_cond_set_special_state()
...
WRITE_ONCE(prev->__state, TASK_FROZEN);
if (signal_pending_state(prev_state, prev)) // SIGPENDING
WRITE_ONCE(prev->__state, TASK_RUNNING)
And *whoopsie*, freezer things we're frozen, but we're back in the game.
AFAICT the below, which uses the brand-spanking-new task_call_func()
which currently sits in tip/sched/core to also serialize against
rq->lock should avoid this scenario.
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -233,25 +233,6 @@ struct task_group;
} while (0)
/*
- * task_cond_set_special_state() is a cmpxchg like operation on task->state.
- *
- * This operation isn't safe in general and should only be used to transform
- * one (special) blocked state into another, such as:
- * TASK_STOPPED <-> TASK_FROZEN.
- */
-#define task_cond_set_special_state(task, cond_state) \
- ({ \
- struct task_struct *__p = (task); \
- unsigned long __flags; /* may shadow */ \
- unsigned int __state; \
- raw_spin_lock_irqsave(&__p->pi_lock, __flags); \
- if ((__state = (cond_state))) \
- WRITE_ONCE(__p->__state, __state); \
- raw_spin_unlock_irqrestore(&__p->pi_lock, __flags); \
- !!__state; \
- })
-
-/*
* PREEMPT_RT specific variants for "sleeping" spin/rwlocks
*
* RT's spin/rwlock substitutions are state preserving. The state of the
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -101,7 +101,7 @@ static void fake_signal_wake_up(struct t
}
}
-static inline unsigned int __can_freeze(struct task_struct *p)
+static int __set_task_frozen(struct task_struct *p, void *arg)
{
unsigned int state = READ_ONCE(p->__state);
@@ -123,13 +123,14 @@ static inline unsigned int __can_freeze(
WARN_ON_ONCE(debug_locks && p->lockdep_depth);
#endif
+ WRITE_ONCE(p->__state, TASK_FROZEN);
return TASK_FROZEN;
}
-/* See task_cond_set_special_state(); serializes against ttwu() */
static bool __freeze_task(struct task_struct *p)
{
- return task_cond_set_special_state(p, __can_freeze(p));
+ /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */
+ return task_call_func(p, __set_task_frozen, NULL);
}
/**
@@ -169,7 +170,7 @@ bool freeze_task(struct task_struct *p)
* reflects that and the below will refuse to restore the special state and
* instead issue the wakeup.
*/
-static inline unsigned int __thaw_special(struct task_struct *p)
+static int __set_task_special(struct task_struct *p, void *arg)
{
unsigned int state = 0;
@@ -188,6 +189,9 @@ static inline unsigned int __thaw_specia
state = TASK_STOPPED;
}
+ if (state)
+ WRITE_ONCE(p->__state, state);
+
return state;
}
@@ -200,7 +204,8 @@ void __thaw_task(struct task_struct *p)
goto unlock;
if (lock_task_sighand(p, &flags2)) {
- bool ret = task_cond_set_special_state(p, __thaw_special(p));
+ /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */
+ bool ret = task_call_func(p, __set_task_special, NULL);
unlock_task_sighand(p, &flags2);
if (ret)
goto unlock;
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -193,6 +193,17 @@ static bool looks_like_a_spurious_pid(st
return true;
}
+static int __set_task_traced(struct task_struct *task, void *arg)
+{
+ unsigned int *state = arg;
+
+ if (!(task->__state & __TASK_TRACED))
+ return 0;
+
+ WRITE_ONCE(task->__state, *state);
+ return *state;
+}
+
/* Ensure that nothing can wake it up, even SIGKILL */
static bool ptrace_freeze_traced(struct task_struct *task)
{
@@ -205,10 +216,12 @@ static bool ptrace_freeze_traced(struct
spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
!__fatal_signal_pending(task)) {
+ unsigned int state = __TASK_TRACED;
+
task->ptrace &= ~PT_STOPPED_MASK;
task->ptrace |= PT_STOPPED;
/* *TASK_TRACED -> __TASK_TRACED */
- task_cond_set_special_state(task, !!(task->__state & __TASK_TRACED) * __TASK_TRACED);
+ task_call_func(task, __set_task_traced, &state);
ret = true;
}
spin_unlock_irq(&task->sighand->siglock);
@@ -233,9 +246,11 @@ static void ptrace_unfreeze_traced(struc
task->ptrace &= ~PT_STOPPED_MASK;
wake_up_state(task, __TASK_TRACED);
} else {
+ unsigned int state = TASK_TRACED;
+
task->ptrace |= PT_STOPPED_MASK;
/* *TASK_TRACED -> TASK_TRACED */
- task_cond_set_special_state(task, !!(task->__state & __TASK_TRACED) * TASK_TRACED);
+ task_call_func(task, __set_task_traced, &state);
}
}
spin_unlock_irq(&task->sighand->siglock);
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3181,7 +3181,7 @@ int migrate_swap(struct task_struct *cur
}
#endif /* CONFIG_NUMA_BALANCING */
-static inline __wti_match(struct task_struct *p, unsigned int match_state)
+static inline bool __wti_match(struct task_struct *p, unsigned int match_state)
{
unsigned int state = READ_ONCE(p->__state);
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2021-10-18 14:10 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
2021-10-14 8:58 ` Will Deacon
2021-10-09 10:07 ` [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction Peter Zijlstra
2021-10-14 9:13 ` Will Deacon
2021-10-09 10:07 ` [PATCH v3 3/6] ptrace: Order and comment PT_flags Peter Zijlstra
2021-10-14 9:31 ` Will Deacon
2021-10-14 14:27 ` Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 4/6] ptrace: Track __TASK_TRACED state in p->ptrace Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 5/6] sched,ptrace: Avoid relying on __TASK_TRACED | __TASK_STOPPED Peter Zijlstra
2021-10-09 10:08 ` [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic Peter Zijlstra
2021-10-18 13:36 ` Peter Zijlstra
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.