* [PATCH v3 0/6] Freezer rewrite
@ 2021-10-09 10:07 Peter Zijlstra
  2021-10-09 10:07 ` [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
                   ` (5 more replies)
  0 siblings, 6 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
  To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, Will Deacon
  Cc: linux-kernel, peterz, tj, linux-pm

Hi all,

This is the by now familiar rewrite of the freezer, but with yet another
attempt at getting the special blocked states (STOPPED/TRACED) right.

Currently it is already possible to tell whether a task should be STOPPED by
looking at p->jobctl; the same idea is extended to TRACED and p->ptrace.

By treating p->jobctl/p->ptrace as the canonical state, the blocked state can
be (hopefully correctly this time) reconstructed from them at all times.
Specifically, if a TRACED task was woken by the tracer while the tracee was
FROZEN, the p->ptrace state will have been updated and the thaw operation
will result in the wakeup.
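
To illustrate the idea -- a rough sketch only, the actual helpers in the
series differ in detail and blocked_state() is a made-up name here:

	static unsigned int blocked_state(struct task_struct *p)
	{
		/* PT_STOPPED is set/cleared under ->sighand->siglock (patch 4) */
		if (p->ptrace & PT_STOPPED)
			return __TASK_TRACED;
		/* jobctl already tells us whether a task should be STOPPED */
		if (p->jobctl & JOBCTL_STOP_PENDING)
			return __TASK_STOPPED;
		return 0;
	}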

Oleg, can you please try to shoot holes in it again? :-)




* [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags
  2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
@ 2021-10-09 10:07 ` Peter Zijlstra
  2021-10-14  8:58   ` Will Deacon
  2021-10-09 10:07 ` [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction Peter Zijlstra
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
  To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, Will Deacon
  Cc: linux-kernel, peterz, tj, linux-pm

Rafael explained that the reason for having both PF_NOFREEZE and
PF_FREEZER_SKIP is that {,un}lock_system_sleep() is callable from
kthread context that has previously called set_freezable().

In preparation for merging the flags, have {,un}lock_system_sleep() save
and restore current->flags.
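
With that, the calling convention for users becomes (matching the
conversions below):

	unsigned int sleep_flags;

	sleep_flags = lock_system_sleep();
	/* ... whatever must not race with suspend/resume ... */
	unlock_system_sleep(sleep_flags);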

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/scsi/scsi_transport_spi.c |    7 ++++---
 include/linux/suspend.h           |    8 ++++----
 kernel/power/hibernate.c          |   35 ++++++++++++++++++++++-------------
 kernel/power/main.c               |   16 ++++++++++------
 kernel/power/suspend.c            |   12 ++++++++----
 kernel/power/user.c               |   24 ++++++++++++++----------
 6 files changed, 62 insertions(+), 40 deletions(-)

--- a/drivers/scsi/scsi_transport_spi.c
+++ b/drivers/scsi/scsi_transport_spi.c
@@ -998,8 +998,9 @@ void
 spi_dv_device(struct scsi_device *sdev)
 {
 	struct scsi_target *starget = sdev->sdev_target;
-	u8 *buffer;
 	const int len = SPI_MAX_ECHO_BUFFER_SIZE*2;
+	unsigned int sleep_flags;
+	u8 *buffer;
 
 	/*
 	 * Because this function and the power management code both call
@@ -1007,7 +1008,7 @@ spi_dv_device(struct scsi_device *sdev)
 	 * while suspend or resume is in progress. Hence the
 	 * lock/unlock_system_sleep() calls.
 	 */
-	lock_system_sleep();
+	sleep_flags = lock_system_sleep();
 
 	if (scsi_autopm_get_device(sdev))
 		goto unlock_system_sleep;
@@ -1058,7 +1059,7 @@ spi_dv_device(struct scsi_device *sdev)
 	scsi_autopm_put_device(sdev);
 
 unlock_system_sleep:
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 }
 EXPORT_SYMBOL(spi_dv_device);
 
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -518,8 +518,8 @@ extern bool pm_save_wakeup_count(unsigne
 extern void pm_wakep_autosleep_enabled(bool set);
 extern void pm_print_active_wakeup_sources(void);
 
-extern void lock_system_sleep(void);
-extern void unlock_system_sleep(void);
+extern unsigned int lock_system_sleep(void);
+extern void unlock_system_sleep(unsigned int);
 
 #else /* !CONFIG_PM_SLEEP */
 
@@ -542,8 +542,8 @@ static inline void pm_system_wakeup(void
 static inline void pm_wakeup_clear(bool reset) {}
 static inline void pm_system_irq_wakeup(unsigned int irq_number) {}
 
-static inline void lock_system_sleep(void) {}
-static inline void unlock_system_sleep(void) {}
+static inline unsigned int lock_system_sleep(void) { return 0; }
+static inline void unlock_system_sleep(unsigned int flags) {}
 
 #endif /* !CONFIG_PM_SLEEP */
 
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -90,20 +90,24 @@ bool hibernation_available(void)
  */
 void hibernation_set_ops(const struct platform_hibernation_ops *ops)
 {
+	unsigned int sleep_flags;
+
 	if (ops && !(ops->begin && ops->end &&  ops->pre_snapshot
 	    && ops->prepare && ops->finish && ops->enter && ops->pre_restore
 	    && ops->restore_cleanup && ops->leave)) {
 		WARN_ON(1);
 		return;
 	}
-	lock_system_sleep();
+
+	sleep_flags = lock_system_sleep();
+
 	hibernation_ops = ops;
 	if (ops)
 		hibernation_mode = HIBERNATION_PLATFORM;
 	else if (hibernation_mode == HIBERNATION_PLATFORM)
 		hibernation_mode = HIBERNATION_SHUTDOWN;
 
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 }
 EXPORT_SYMBOL_GPL(hibernation_set_ops);
 
@@ -707,6 +711,7 @@ static int load_image_and_restore(void)
 int hibernate(void)
 {
 	bool snapshot_test = false;
+	unsigned int sleep_flags;
 	int error;
 
 	if (!hibernation_available()) {
@@ -714,7 +719,7 @@ int hibernate(void)
 		return -EPERM;
 	}
 
-	lock_system_sleep();
+	sleep_flags = lock_system_sleep();
 	/* The snapshot device should not be opened while we're running */
 	if (!hibernate_acquire()) {
 		error = -EBUSY;
@@ -788,7 +793,7 @@ int hibernate(void)
 	pm_restore_console();
 	hibernate_release();
  Unlock:
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 	pr_info("hibernation exit\n");
 
 	return error;
@@ -803,9 +808,10 @@ int hibernate(void)
  */
 int hibernate_quiet_exec(int (*func)(void *data), void *data)
 {
+	unsigned int sleep_flags;
 	int error;
 
-	lock_system_sleep();
+	sleep_flags = lock_system_sleep();
 
 	if (!hibernate_acquire()) {
 		error = -EBUSY;
@@ -885,7 +891,7 @@ int hibernate_quiet_exec(int (*func)(voi
 	hibernate_release();
 
 unlock:
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 
 	return error;
 }
@@ -1094,11 +1100,12 @@ static ssize_t disk_show(struct kobject
 static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
 			  const char *buf, size_t n)
 {
+	int mode = HIBERNATION_INVALID;
+	unsigned int sleep_flags;
 	int error = 0;
-	int i;
 	int len;
 	char *p;
-	int mode = HIBERNATION_INVALID;
+	int i;
 
 	if (!hibernation_available())
 		return -EPERM;
@@ -1106,7 +1113,7 @@ static ssize_t disk_store(struct kobject
 	p = memchr(buf, '\n', n);
 	len = p ? p - buf : n;
 
-	lock_system_sleep();
+	sleep_flags = lock_system_sleep();
 	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
 		if (len == strlen(hibernation_modes[i])
 		    && !strncmp(buf, hibernation_modes[i], len)) {
@@ -1136,7 +1143,7 @@ static ssize_t disk_store(struct kobject
 	if (!error)
 		pm_pr_dbg("Hibernation mode set to '%s'\n",
 			       hibernation_modes[mode]);
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 	return error ? error : n;
 }
 
@@ -1152,9 +1159,10 @@ static ssize_t resume_show(struct kobjec
 static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
 			    const char *buf, size_t n)
 {
-	dev_t res;
+	unsigned int sleep_flags;
 	int len = n;
 	char *name;
+	dev_t res;
 
 	if (len && buf[len-1] == '\n')
 		len--;
@@ -1167,9 +1175,10 @@ static ssize_t resume_store(struct kobje
 	if (!res)
 		return -EINVAL;
 
-	lock_system_sleep();
+	sleep_flags = lock_system_sleep();
 	swsusp_resume_device = res;
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
+
 	pm_pr_dbg("Configured hibernation resume from disk to %u\n",
 		  swsusp_resume_device);
 	noresume = 0;
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,14 +21,16 @@
 
 #ifdef CONFIG_PM_SLEEP
 
-void lock_system_sleep(void)
+unsigned int lock_system_sleep(void)
 {
+	unsigned int flags = current->flags;
 	current->flags |= PF_FREEZER_SKIP;
 	mutex_lock(&system_transition_mutex);
+	return flags;
 }
 EXPORT_SYMBOL_GPL(lock_system_sleep);
 
-void unlock_system_sleep(void)
+void unlock_system_sleep(unsigned int flags)
 {
 	/*
 	 * Don't use freezer_count() because we don't want the call to
@@ -46,7 +48,8 @@ void unlock_system_sleep(void)
 	 * Which means, if we use try_to_freeze() here, it would make them
 	 * enter the refrigerator, thus causing hibernation to lockup.
 	 */
-	current->flags &= ~PF_FREEZER_SKIP;
+	if (!(flags & PF_FREEZER_SKIP))
+		current->flags &= ~PF_FREEZER_SKIP;
 	mutex_unlock(&system_transition_mutex);
 }
 EXPORT_SYMBOL_GPL(unlock_system_sleep);
@@ -260,16 +263,17 @@ static ssize_t pm_test_show(struct kobje
 static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
 				const char *buf, size_t n)
 {
+	unsigned int sleep_flags;
 	const char * const *s;
+	int error = -EINVAL;
 	int level;
 	char *p;
 	int len;
-	int error = -EINVAL;
 
 	p = memchr(buf, '\n', n);
 	len = p ? p - buf : n;
 
-	lock_system_sleep();
+	sleep_flags = lock_system_sleep();
 
 	level = TEST_FIRST;
 	for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -279,7 +283,7 @@ static ssize_t pm_test_store(struct kobj
 			break;
 		}
 
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 
 	return error ? error : n;
 }
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -75,9 +75,11 @@ EXPORT_SYMBOL_GPL(pm_suspend_default_s2i
 
 void s2idle_set_ops(const struct platform_s2idle_ops *ops)
 {
-	lock_system_sleep();
+	unsigned int sleep_flags;
+
+	sleep_flags = lock_system_sleep();
 	s2idle_ops = ops;
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 }
 
 static void s2idle_begin(void)
@@ -202,7 +204,9 @@ __setup("mem_sleep_default=", mem_sleep_
  */
 void suspend_set_ops(const struct platform_suspend_ops *ops)
 {
-	lock_system_sleep();
+	unsigned int sleep_flags;
+
+	sleep_flags = lock_system_sleep();
 
 	suspend_ops = ops;
 
@@ -218,7 +222,7 @@ void suspend_set_ops(const struct platfo
 			mem_sleep_current = PM_SUSPEND_MEM;
 	}
 
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 }
 EXPORT_SYMBOL_GPL(suspend_set_ops);
 
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -46,12 +46,13 @@ int is_hibernate_resume_dev(dev_t dev)
 static int snapshot_open(struct inode *inode, struct file *filp)
 {
 	struct snapshot_data *data;
+	unsigned int sleep_flags;
 	int error;
 
 	if (!hibernation_available())
 		return -EPERM;
 
-	lock_system_sleep();
+	sleep_flags = lock_system_sleep();
 
 	if (!hibernate_acquire()) {
 		error = -EBUSY;
@@ -97,7 +98,7 @@ static int snapshot_open(struct inode *i
 	data->dev = 0;
 
  Unlock:
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 
 	return error;
 }
@@ -105,8 +106,9 @@ static int snapshot_open(struct inode *i
 static int snapshot_release(struct inode *inode, struct file *filp)
 {
 	struct snapshot_data *data;
+	unsigned int sleep_flags;
 
-	lock_system_sleep();
+	sleep_flags = lock_system_sleep();
 
 	swsusp_free();
 	data = filp->private_data;
@@ -123,7 +125,7 @@ static int snapshot_release(struct inode
 			PM_POST_HIBERNATION : PM_POST_RESTORE);
 	hibernate_release();
 
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 
 	return 0;
 }
@@ -131,11 +133,12 @@ static int snapshot_release(struct inode
 static ssize_t snapshot_read(struct file *filp, char __user *buf,
                              size_t count, loff_t *offp)
 {
+	loff_t pg_offp = *offp & ~PAGE_MASK;
 	struct snapshot_data *data;
+	unsigned int sleep_flags;
 	ssize_t res;
-	loff_t pg_offp = *offp & ~PAGE_MASK;
 
-	lock_system_sleep();
+	sleep_flags = lock_system_sleep();
 
 	data = filp->private_data;
 	if (!data->ready) {
@@ -156,7 +159,7 @@ static ssize_t snapshot_read(struct file
 		*offp += res;
 
  Unlock:
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 
 	return res;
 }
@@ -164,11 +167,12 @@ static ssize_t snapshot_read(struct file
 static ssize_t snapshot_write(struct file *filp, const char __user *buf,
                               size_t count, loff_t *offp)
 {
+	loff_t pg_offp = *offp & ~PAGE_MASK;
 	struct snapshot_data *data;
+	unsigned int sleep_flags;
 	ssize_t res;
-	loff_t pg_offp = *offp & ~PAGE_MASK;
 
-	lock_system_sleep();
+	sleep_flags = lock_system_sleep();
 
 	data = filp->private_data;
 
@@ -190,7 +194,7 @@ static ssize_t snapshot_write(struct fil
 	if (res > 0)
 		*offp += res;
 unlock:
-	unlock_system_sleep();
+	unlock_system_sleep(sleep_flags);
 
 	return res;
 }




* [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction
  2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
  2021-10-09 10:07 ` [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
@ 2021-10-09 10:07 ` Peter Zijlstra
  2021-10-14  9:13   ` Will Deacon
  2021-10-09 10:07 ` [PATCH v3 3/6] ptrace: Order and comment PT_flags Peter Zijlstra
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
  To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, Will Deacon
  Cc: linux-kernel, peterz, tj, linux-pm

handle_initrd() marks itself with PF_FREEZER_SKIP in order to ensure
that the UMH, which is going to freeze the system, doesn't wait
indefinitely for its caller.

Rework things by adding UMH_FREEZABLE to indicate that the completion
wait is freezable.
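
A freezable wait for the helper then looks like the do_mounts_initrd.c
conversion below:

	info = call_usermodehelper_setup("/linuxrc", argv, envp_init,
					 GFP_KERNEL, init_linuxrc, NULL, NULL);
	if (!info)
		return;
	call_usermodehelper_exec(info, UMH_WAIT_PROC|UMH_FREEZABLE);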

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/umh.h     |    9 +++++----
 init/do_mounts_initrd.c |   10 +---------
 kernel/umh.c            |    8 ++++++++
 3 files changed, 14 insertions(+), 13 deletions(-)

--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -11,10 +11,11 @@
 struct cred;
 struct file;
 
-#define UMH_NO_WAIT	0	/* don't wait at all */
-#define UMH_WAIT_EXEC	1	/* wait for the exec, but not the process */
-#define UMH_WAIT_PROC	2	/* wait for the process to complete */
-#define UMH_KILLABLE	4	/* wait for EXEC/PROC killable */
+#define UMH_NO_WAIT	0x00	/* don't wait at all */
+#define UMH_WAIT_EXEC	0x01	/* wait for the exec, but not the process */
+#define UMH_WAIT_PROC	0x02	/* wait for the process to complete */
+#define UMH_KILLABLE	0x04	/* wait for EXEC/PROC killable */
+#define UMH_FREEZABLE	0x08	/* wait for EXEC/PROC freezable */
 
 struct subprocess_info {
 	struct work_struct work;
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -79,19 +79,11 @@ static void __init handle_initrd(void)
 	init_mkdir("/old", 0700);
 	init_chdir("/old");
 
-	/*
-	 * In case that a resume from disk is carried out by linuxrc or one of
-	 * its children, we need to tell the freezer not to wait for us.
-	 */
-	current->flags |= PF_FREEZER_SKIP;
-
 	info = call_usermodehelper_setup("/linuxrc", argv, envp_init,
 					 GFP_KERNEL, init_linuxrc, NULL, NULL);
 	if (!info)
 		return;
-	call_usermodehelper_exec(info, UMH_WAIT_PROC);
-
-	current->flags &= ~PF_FREEZER_SKIP;
+	call_usermodehelper_exec(info, UMH_WAIT_PROC|UMH_FREEZABLE);
 
 	/* move initrd to rootfs' /old */
 	init_mount("..", ".", NULL, MS_MOVE, NULL);
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -28,6 +28,7 @@
 #include <linux/async.h>
 #include <linux/uaccess.h>
 #include <linux/initrd.h>
+#include <linux/freezer.h>
 
 #include <trace/events/module.h>
 
@@ -436,6 +437,9 @@ int call_usermodehelper_exec(struct subp
 	if (wait == UMH_NO_WAIT)	/* task has freed sub_info */
 		goto unlock;
 
+	if (wait & UMH_FREEZABLE)
+		freezer_do_not_count();
+
 	if (wait & UMH_KILLABLE) {
 		retval = wait_for_completion_killable(&done);
 		if (!retval)
@@ -448,6 +452,10 @@ int call_usermodehelper_exec(struct subp
 	}
 
 	wait_for_completion(&done);
+
+	if (wait & UMH_FREEZABLE)
+		freezer_count();
+
 wait_done:
 	retval = sub_info->retval;
 out:




* [PATCH v3 3/6] ptrace: Order and comment PT_flags
  2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
  2021-10-09 10:07 ` [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
  2021-10-09 10:07 ` [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction Peter Zijlstra
@ 2021-10-09 10:07 ` Peter Zijlstra
  2021-10-14  9:31   ` Will Deacon
  2021-10-09 10:07 ` [PATCH v3 4/6] ptrace: Track __TASK_TRACED state in p->ptrace Peter Zijlstra
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
  To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, Will Deacon
  Cc: linux-kernel, peterz, tj, linux-pm

Add a comment to the PT_flags to indicate their actual values; this
makes it easier to see which bits are used and where there might be a
possible hole to use.

Notably, PT_SEIZED was placed wrong. Also, the PT_EVENT_FLAG() space
seems ill-defined; as written it seems to be meant to cover the entire
PTRACE_O_ range offset by 3 bits, which would then be 3+[0..21], yet
PT_SEIZED sits in the middle of that.

XXX if at all possible, PT_SEIZED should be moved outside of this
range.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/ptrace.h      |   32 +++++++++++++++++---------------
 include/uapi/linux/ptrace.h |    3 +++
 2 files changed, 20 insertions(+), 15 deletions(-)

--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -28,30 +28,32 @@ extern int ptrace_access_vm(struct task_
  * flags.  When the a task is stopped the ptracer owns task->ptrace.
  */
 
-#define PT_SEIZED	0x00010000	/* SEIZE used, enable new behavior */
-#define PT_PTRACED	0x00000001
-#define PT_DTRACE	0x00000002	/* delayed trace (used on m68k, i386) */
+#define PT_PTRACED	0x00000001						// 0x00000001
+#define PT_DTRACE	0x00000002 /* delayed trace (used on m68k, i386) */	// 0x00000002
 
 #define PT_OPT_FLAG_SHIFT	3
 /* PT_TRACE_* event enable flags */
 #define PT_EVENT_FLAG(event)	(1 << (PT_OPT_FLAG_SHIFT + (event)))
-#define PT_TRACESYSGOOD		PT_EVENT_FLAG(0)
-#define PT_TRACE_FORK		PT_EVENT_FLAG(PTRACE_EVENT_FORK)
-#define PT_TRACE_VFORK		PT_EVENT_FLAG(PTRACE_EVENT_VFORK)
-#define PT_TRACE_CLONE		PT_EVENT_FLAG(PTRACE_EVENT_CLONE)
-#define PT_TRACE_EXEC		PT_EVENT_FLAG(PTRACE_EVENT_EXEC)
-#define PT_TRACE_VFORK_DONE	PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)
-#define PT_TRACE_EXIT		PT_EVENT_FLAG(PTRACE_EVENT_EXIT)
-#define PT_TRACE_SECCOMP	PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)
 
-#define PT_EXITKILL		(PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)
-#define PT_SUSPEND_SECCOMP	(PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT)
+#define PT_TRACESYSGOOD		PT_EVENT_FLAG(0)			        // 0x00000008
+#define PT_TRACE_FORK		PT_EVENT_FLAG(PTRACE_EVENT_FORK)	        // 0x00000010
+#define PT_TRACE_VFORK		PT_EVENT_FLAG(PTRACE_EVENT_VFORK)	        // 0x00000020
+#define PT_TRACE_CLONE		PT_EVENT_FLAG(PTRACE_EVENT_CLONE)	        // 0x00000040
+#define PT_TRACE_EXEC		PT_EVENT_FLAG(PTRACE_EVENT_EXEC)	        // 0x00000080
+#define PT_TRACE_VFORK_DONE	PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)	        // 0x00000100
+#define PT_TRACE_EXIT		PT_EVENT_FLAG(PTRACE_EVENT_EXIT)	        // 0x00000200
+#define PT_TRACE_SECCOMP	PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)	        // 0x00000400
+
+#define PT_SEIZED		0x00010000 /* SEIZE used, enable new behavior */// 0x00010000
+
+#define PT_EXITKILL		(PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)	// 0x00800000
+#define PT_SUSPEND_SECCOMP	(PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) // 0x01000000
 
 /* single stepping state bits (used on ARM and PA-RISC) */
 #define PT_SINGLESTEP_BIT	31
-#define PT_SINGLESTEP		(1<<PT_SINGLESTEP_BIT)
+#define PT_SINGLESTEP		(1<<PT_SINGLESTEP_BIT)				// 0x80000000
 #define PT_BLOCKSTEP_BIT	30
-#define PT_BLOCKSTEP		(1<<PT_BLOCKSTEP_BIT)
+#define PT_BLOCKSTEP		(1<<PT_BLOCKSTEP_BIT)				// 0x40000000
 
 extern long arch_ptrace(struct task_struct *child, long request,
 			unsigned long addr, unsigned long data);
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -134,6 +134,7 @@ struct ptrace_rseq_configuration {
 #define PTRACE_EVENT_STOP	128
 
 /* Options set using PTRACE_SETOPTIONS or using PTRACE_SEIZE @data param */
+/* Consider overlap with task->ptrace PT_flags */
 #define PTRACE_O_TRACESYSGOOD	1
 #define PTRACE_O_TRACEFORK	(1 << PTRACE_EVENT_FORK)
 #define PTRACE_O_TRACEVFORK	(1 << PTRACE_EVENT_VFORK)
@@ -143,6 +144,8 @@ struct ptrace_rseq_configuration {
 #define PTRACE_O_TRACEEXIT	(1 << PTRACE_EVENT_EXIT)
 #define PTRACE_O_TRACESECCOMP	(1 << PTRACE_EVENT_SECCOMP)
 
+/* PTRACE_O_SEIZED			(1 << 13) */
+
 /* eventless options */
 #define PTRACE_O_EXITKILL		(1 << 20)
 #define PTRACE_O_SUSPEND_SECCOMP	(1 << 21)




* [PATCH v3 4/6] ptrace: Track __TASK_TRACED state in p->ptrace
  2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
                   ` (2 preceding siblings ...)
  2021-10-09 10:07 ` [PATCH v3 3/6] ptrace: Order and comment PT_flags Peter Zijlstra
@ 2021-10-09 10:07 ` Peter Zijlstra
  2021-10-09 10:07 ` [PATCH v3 5/6] sched,ptrace: Avoid relying on __TASK_TRACED | __TASK_STOPPED Peter Zijlstra
  2021-10-09 10:08 ` [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic Peter Zijlstra
  5 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
  To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, Will Deacon
  Cc: linux-kernel, peterz, tj, linux-pm

Just like we can recover __TASK_STOPPED from p->jobctl, add some bits
to p->ptrace such that we can recover __TASK_TRACED.

All these t->ptrace modifications are done under sighand lock.
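
The resulting invariant (a sketch of the hunks below) is that the
PT_STOPPED* bits and __state are updated together under that lock:

	/* entering the trace stop, ptrace_stop() */
	current->ptrace |= PT_STOPPED_MASK;
	set_special_state(TASK_TRACED);

	/* any wakeup path, e.g. ptrace_resume() / ptrace_signal_wake_up() */
	child->ptrace &= ~PT_STOPPED_MASK;
	wake_up_state(child, __TASK_TRACED);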

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/ptrace.h       |    4 ++++
 include/linux/sched/signal.h |    6 ++----
 kernel/ptrace.c              |   27 ++++++++++++++++++---------
 kernel/signal.c              |    1 +
 4 files changed, 25 insertions(+), 13 deletions(-)

--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -45,6 +45,10 @@ extern int ptrace_access_vm(struct task_
 #define PT_TRACE_SECCOMP	PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)	        // 0x00000400
 
 #define PT_SEIZED		0x00010000 /* SEIZE used, enable new behavior */// 0x00010000
+#define PT_STOPPED		0x00020000					// 0x00020000
+#define PT_STOPPED_FATAL	0x00040000					// 0x00040000
+
+#define PT_STOPPED_MASK		(PT_STOPPED|PT_STOPPED_FATAL)
 
 #define PT_EXITKILL		(PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)	// 0x00800000
 #define PT_SUSPEND_SECCOMP	(PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) // 0x01000000
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -418,10 +418,8 @@ static inline void signal_wake_up(struct
 {
 	signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
 }
-static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
-{
-	signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
-}
+
+extern void ptrace_signal_wake_up(struct task_struct *t, bool resume);
 
 void task_join_group_stop(struct task_struct *task);
 
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -64,6 +64,14 @@ int ptrace_access_vm(struct task_struct
 	return ret;
 }
 
+void ptrace_signal_wake_up(struct task_struct *t, bool resume)
+{
+	lockdep_assert_task_sighand_held(t);
+
+	if (resume)
+		t->ptrace &= ~PT_STOPPED_MASK;
+	signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
+}
 
 void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
 		   const struct cred *ptracer_cred)
@@ -197,6 +205,8 @@ static bool ptrace_freeze_traced(struct
 	spin_lock_irq(&task->sighand->siglock);
 	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
 	    !__fatal_signal_pending(task)) {
+		task->ptrace &= ~PT_STOPPED_MASK;
+		task->ptrace |= PT_STOPPED;
 		WRITE_ONCE(task->__state, __TASK_TRACED);
 		ret = true;
 	}
@@ -218,10 +228,13 @@ static void ptrace_unfreeze_traced(struc
 	 */
 	spin_lock_irq(&task->sighand->siglock);
 	if (READ_ONCE(task->__state) == __TASK_TRACED) {
-		if (__fatal_signal_pending(task))
+		if (__fatal_signal_pending(task)) {
+			task->ptrace &= ~PT_STOPPED_MASK;
 			wake_up_state(task, __TASK_TRACED);
-		else
+		} else {
+			task->ptrace |= PT_STOPPED_MASK;
 			WRITE_ONCE(task->__state, TASK_TRACED);
+		}
 	}
 	spin_unlock_irq(&task->sighand->siglock);
 }
@@ -835,8 +848,6 @@ static long ptrace_get_rseq_configuratio
 static int ptrace_resume(struct task_struct *child, long request,
 			 unsigned long data)
 {
-	bool need_siglock;
-
 	if (!valid_signal(data))
 		return -EIO;
 
@@ -877,13 +888,11 @@ static int ptrace_resume(struct task_str
 	 * status and clears the code too; this can't race with the tracee, it
 	 * takes siglock after resume.
 	 */
-	need_siglock = data && !thread_group_empty(current);
-	if (need_siglock)
-		spin_lock_irq(&child->sighand->siglock);
+	spin_lock_irq(&child->sighand->siglock);
 	child->exit_code = data;
+	child->ptrace &= ~PT_STOPPED_MASK;
 	wake_up_state(child, __TASK_TRACED);
-	if (need_siglock)
-		spin_unlock_irq(&child->sighand->siglock);
+	spin_unlock_irq(&child->sighand->siglock);
 
 	return 0;
 }
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2228,6 +2228,7 @@ static void ptrace_stop(int exit_code, i
 			return;
 	}
 
+	current->ptrace |= PT_STOPPED_MASK;
 	set_special_state(TASK_TRACED);
 
 	/*




* [PATCH v3 5/6] sched,ptrace: Avoid relying on __TASK_TRACED | __TASK_STOPPED
  2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
                   ` (3 preceding siblings ...)
  2021-10-09 10:07 ` [PATCH v3 4/6] ptrace: Track __TASK_TRACED state in p->ptrace Peter Zijlstra
@ 2021-10-09 10:07 ` Peter Zijlstra
  2021-10-09 10:08 ` [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic Peter Zijlstra
  5 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:07 UTC (permalink / raw)
  To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, Will Deacon
  Cc: linux-kernel, peterz, tj, linux-pm

Make ->ptrace/->jobctl the canonical state; this allows us to play
games with __state (such as freezing).

The wait_task_inactive() usage will be fixed up later, once we have
additional TASK_state.
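
The new task_cond_set_special_state() helper then lets one special
blocked state be transformed into another without losing it to a
concurrent wakeup, e.g. (sketch only; TASK_FROZEN only shows up in the
next patch):

	/* __state: *TASK_STOPPED -> TASK_FROZEN, ->jobctl stays canonical */
	task_cond_set_special_state(p,
		!!(p->__state & __TASK_STOPPED) * TASK_FROZEN);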

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h |   27 ++++++++++++++++++++++-----
 kernel/ptrace.c       |   16 +++++++++-------
 2 files changed, 31 insertions(+), 12 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -118,11 +118,9 @@ struct task_group;
 
 #define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
 
-#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-
-#define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
-
-#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_traced(task)		((task)->ptrace & PT_STOPPED)
+#define task_is_stopped(task)		((task)->jobctl & JOBCTL_STOP_PENDING)
+#define task_is_stopped_or_traced(task)	(task_is_stopped(task) || task_is_traced(task))
 
 /*
  * Special states are those that do not use the normal wait-loop pattern. See
@@ -228,6 +226,25 @@ struct task_group;
 	} while (0)
 
 /*
+ * task_cond_set_special_state() is a cmpxchg like operation on task->state.
+ *
+ * This operation isn't safe in general and should only be used to transform
+ * one (special) blocked state into another, such as:
+ *   TASK_STOPPED <-> TASK_FROZEN.
+ */
+#define task_cond_set_special_state(task, cond_state)			\
+	({								\
+		struct task_struct *__p = (task);			\
+		unsigned long __flags; /* may shadow */			\
+		unsigned int __state;					\
+		raw_spin_lock_irqsave(&__p->pi_lock, __flags);		\
+		if ((__state = (cond_state)))				\
+			WRITE_ONCE(__p->__state, __state);		\
+		raw_spin_unlock_irqrestore(&__p->pi_lock, __flags);	\
+		!!__state;						\
+	})
+
+/*
  * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
  *
  * RT's spin/rwlock substitutions are state preserving. The state of the
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -207,7 +207,8 @@ static bool ptrace_freeze_traced(struct
 	    !__fatal_signal_pending(task)) {
 		task->ptrace &= ~PT_STOPPED_MASK;
 		task->ptrace |= PT_STOPPED;
-		WRITE_ONCE(task->__state, __TASK_TRACED);
+		/* *TASK_TRACED -> __TASK_TRACED */
+		task_cond_set_special_state(task, !!(task->__state & __TASK_TRACED) * __TASK_TRACED);
 		ret = true;
 	}
 	spin_unlock_irq(&task->sighand->siglock);
@@ -217,7 +218,7 @@ static bool ptrace_freeze_traced(struct
 
 static void ptrace_unfreeze_traced(struct task_struct *task)
 {
-	if (READ_ONCE(task->__state) != __TASK_TRACED)
+	if (!task_is_traced(task))
 		return;
 
 	WARN_ON(!task->ptrace || task->parent != current);
@@ -227,13 +228,14 @@ static void ptrace_unfreeze_traced(struc
 	 * Recheck state under the lock to close this race.
 	 */
 	spin_lock_irq(&task->sighand->siglock);
-	if (READ_ONCE(task->__state) == __TASK_TRACED) {
+	if (task_is_traced(task)) {
 		if (__fatal_signal_pending(task)) {
 			task->ptrace &= ~PT_STOPPED_MASK;
 			wake_up_state(task, __TASK_TRACED);
 		} else {
 			task->ptrace |= PT_STOPPED_MASK;
-			WRITE_ONCE(task->__state, TASK_TRACED);
+			/* *TASK_TRACED -> TASK_TRACED */
+			task_cond_set_special_state(task, !!(task->__state & __TASK_TRACED) * TASK_TRACED);
 		}
 	}
 	spin_unlock_irq(&task->sighand->siglock);
@@ -269,7 +271,7 @@ static int ptrace_check_attach(struct ta
 	 */
 	read_lock(&tasklist_lock);
 	if (child->ptrace && child->parent == current) {
-		WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
+//		WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
 		/*
 		 * child->sighand can't be NULL, release_task()
 		 * does ptrace_unlink() before __exit_signal().
@@ -280,13 +282,13 @@ static int ptrace_check_attach(struct ta
 	read_unlock(&tasklist_lock);
 
 	if (!ret && !ignore_state) {
-		if (!wait_task_inactive(child, __TASK_TRACED)) {
+		if (!wait_task_inactive(child, __TASK_TRACED)) { // XXX mooo!!!
 			/*
 			 * This can only happen if may_ptrace_stop() fails and
 			 * ptrace_stop() changes ->state back to TASK_RUNNING,
 			 * so we should not worry about leaking __TASK_TRACED.
 			 */
-			WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
+//			WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
 			ret = -ESRCH;
 		}
 	}




* [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic
  2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
                   ` (4 preceding siblings ...)
  2021-10-09 10:07 ` [PATCH v3 5/6] sched,ptrace: Avoid relying on __TASK_TRACED | __TASK_STOPPED Peter Zijlstra
@ 2021-10-09 10:08 ` Peter Zijlstra
  2021-10-18 13:36   ` Peter Zijlstra
  5 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-09 10:08 UTC (permalink / raw)
  To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, Will Deacon
  Cc: linux-kernel, peterz, tj, linux-pm

Rewrite the core freezer to behave better wrt thawing. By replacing
PF_FROZEN with TASK_FROZEN, a special blocked state, it is ensured that
frozen tasks stay frozen until thawed and don't randomly wake up early,
as is currently possible.

As such, this does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up
two PF_flags (yay).
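
Callers of the freezable_schedule*() / freezer_do_not_count() helpers
are converted to pass the new state bits directly, following this
pattern (as in the coredump.c hunk):

	/* before */
	freezer_do_not_count();
	wait_for_completion(&done);
	freezer_count();

	/* after */
	wait_for_completion_state(&done, TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);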

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/android/binder.c       |    4 
 drivers/media/pci/pt3/pt3.c    |    4 
 fs/cifs/inode.c                |    4 
 fs/cifs/transport.c            |    5 
 fs/coredump.c                  |    5 
 fs/nfs/file.c                  |    3 
 fs/nfs/inode.c                 |   12 --
 fs/nfs/nfs3proc.c              |    3 
 fs/nfs/nfs4proc.c              |   14 +-
 fs/nfs/nfs4state.c             |    3 
 fs/nfs/pnfs.c                  |    4 
 fs/xfs/xfs_trans_ail.c         |    8 -
 include/linux/completion.h     |    1 
 include/linux/freezer.h        |  244 +----------------------------------------
 include/linux/sched.h          |   41 +++---
 include/linux/sunrpc/sched.h   |    7 -
 include/linux/wait.h           |   40 +++++-
 kernel/cgroup/legacy_freezer.c |   23 +--
 kernel/exit.c                  |    4 
 kernel/fork.c                  |    5 
 kernel/freezer.c               |  132 +++++++++++++++-------
 kernel/futex.c                 |    4 
 kernel/hung_task.c             |    4 
 kernel/power/main.c            |    6 -
 kernel/power/process.c         |   10 -
 kernel/ptrace.c                |    2 
 kernel/sched/completion.c      |    9 +
 kernel/sched/core.c            |   19 ++-
 kernel/signal.c                |   14 +-
 kernel/time/hrtimer.c          |    4 
 kernel/umh.c                   |   20 +--
 mm/khugepaged.c                |    4 
 net/sunrpc/sched.c             |   12 --
 net/unix/af_unix.c             |    8 -
 34 files changed, 274 insertions(+), 408 deletions(-)

--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -3722,10 +3722,9 @@ static int binder_wait_for_work(struct b
 	struct binder_proc *proc = thread->proc;
 	int ret = 0;
 
-	freezer_do_not_count();
 	binder_inner_proc_lock(proc);
 	for (;;) {
-		prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE);
 		if (binder_has_work_ilocked(thread, do_proc_work))
 			break;
 		if (do_proc_work)
@@ -3742,7 +3741,6 @@ static int binder_wait_for_work(struct b
 	}
 	finish_wait(&thread->wait, &wait);
 	binder_inner_proc_unlock(proc);
-	freezer_count();
 
 	return ret;
 }
--- a/drivers/media/pci/pt3/pt3.c
+++ b/drivers/media/pci/pt3/pt3.c
@@ -445,8 +445,8 @@ static int pt3_fetch_thread(void *data)
 		pt3_proc_dma(adap);
 
 		delay = ktime_set(0, PT3_FETCH_DELAY * NSEC_PER_MSEC);
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		freezable_schedule_hrtimeout_range(&delay,
+		set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
+		schedule_hrtimeout_range(&delay,
 					PT3_FETCH_DELAY_DELTA * NSEC_PER_MSEC,
 					HRTIMER_MODE_REL);
 	}
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2285,7 +2285,7 @@ cifs_invalidate_mapping(struct inode *in
 static int
 cifs_wait_bit_killable(struct wait_bit_key *key, int mode)
 {
-	freezable_schedule_unsafe();
+	schedule();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
 	return 0;
@@ -2303,7 +2303,7 @@ cifs_revalidate_mapping(struct inode *in
 		return 0;
 
 	rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
-				     TASK_KILLABLE);
+				     TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
 	if (rc)
 		return rc;
 
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -760,8 +760,9 @@ wait_for_response(struct TCP_Server_Info
 {
 	int error;
 
-	error = wait_event_freezekillable_unsafe(server->response_q,
-				    midQ->mid_state != MID_REQUEST_SUBMITTED);
+	error = wait_event_state(server->response_q,
+				 midQ->mid_state != MID_REQUEST_SUBMITTED,
+				 (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE));
 	if (error < 0)
 		return -ERESTARTSYS;
 
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -465,9 +465,8 @@ static int coredump_wait(int exit_code,
 	if (core_waiters > 0) {
 		struct core_thread *ptr;
 
-		freezer_do_not_count();
-		wait_for_completion(&core_state->startup);
-		freezer_count();
+		wait_for_completion_state(&core_state->startup,
+					  TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
 		/*
 		 * Wait for all the threads to become inactive, so that
 		 * all the thread context (extended register state, like
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -558,7 +558,8 @@ static vm_fault_t nfs_vm_page_mkwrite(st
 	nfs_fscache_wait_on_page_write(NFS_I(inode), page);
 
 	wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
-			nfs_wait_bit_killable, TASK_KILLABLE);
+			   nfs_wait_bit_killable,
+			   TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
 
 	lock_page(page);
 	mapping = page_file_mapping(page);
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -72,18 +72,13 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fat
 	return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
-static int nfs_wait_killable(int mode)
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
 {
-	freezable_schedule_unsafe();
+	schedule();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
 	return 0;
 }
-
-int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
-{
-	return nfs_wait_killable(mode);
-}
 EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
 
 /**
@@ -1327,7 +1322,8 @@ int nfs_clear_invalid_mapping(struct add
 	 */
 	for (;;) {
 		ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
-					 nfs_wait_bit_killable, TASK_KILLABLE);
+					 nfs_wait_bit_killable,
+					 TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
 		if (ret)
 			goto out;
 		spin_lock(&inode->i_lock);
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -36,7 +36,8 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt,
 		res = rpc_call_sync(clnt, msg, flags);
 		if (res != -EJUKEBOX)
 			break;
-		freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
+		__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+		schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
 	} while (!fatal_signal_pending(current));
 	return res;
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -411,8 +411,8 @@ static int nfs4_delay_killable(long *tim
 {
 	might_sleep();
 
-	freezable_schedule_timeout_killable_unsafe(
-		nfs4_update_delay(timeout));
+	__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+	schedule_timeout(nfs4_update_delay(timeout));
 	if (!__fatal_signal_pending(current))
 		return 0;
 	return -EINTR;
@@ -422,7 +422,8 @@ static int nfs4_delay_interruptible(long
 {
 	might_sleep();
 
-	freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout));
+	__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE);
+	schedule_timeout(nfs4_update_delay(timeout));
 	if (!signal_pending(current))
 		return 0;
 	return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS;
@@ -7320,7 +7321,8 @@ nfs4_retry_setlk_simple(struct nfs4_stat
 		status = nfs4_proc_setlk(state, cmd, request);
 		if ((status != -EAGAIN) || IS_SETLK(cmd))
 			break;
-		freezable_schedule_timeout_interruptible(timeout);
+		__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+		schedule_timeout(timeout);
 		timeout *= 2;
 		timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
 		status = -ERESTARTSYS;
@@ -7388,10 +7390,8 @@ nfs4_retry_setlk(struct nfs4_state *stat
 			break;
 
 		status = -ERESTARTSYS;
-		freezer_do_not_count();
-		wait_woken(&waiter.wait, TASK_INTERRUPTIBLE,
+		wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE,
 			   NFS4_LOCK_MAXTIMEOUT);
-		freezer_count();
 	} while (!signalled());
 
 	remove_wait_queue(q, &waiter.wait);
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1307,7 +1307,8 @@ int nfs4_wait_clnt_recover(struct nfs_cl
 
 	refcount_inc(&clp->cl_count);
 	res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
-				 nfs_wait_bit_killable, TASK_KILLABLE);
+				 nfs_wait_bit_killable,
+				 TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
 	if (res)
 		goto out;
 	if (clp->cl_cons_state < 0)
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1896,7 +1896,7 @@ static int pnfs_prepare_to_retry_layoutg
 	pnfs_layoutcommit_inode(lo->plh_inode, false);
 	return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
 				   nfs_wait_bit_killable,
-				   TASK_KILLABLE);
+				   TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
 }
 
 static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
@@ -3176,7 +3176,7 @@ pnfs_layoutcommit_inode(struct inode *in
 		status = wait_on_bit_lock_action(&nfsi->flags,
 				NFS_INO_LAYOUTCOMMITTING,
 				nfs_wait_bit_killable,
-				TASK_KILLABLE);
+				TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
 		if (status)
 			goto out;
 	}
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -590,9 +590,9 @@ xfsaild(
 
 	while (1) {
 		if (tout && tout <= 20)
-			set_current_state(TASK_KILLABLE);
+			set_current_state(TASK_KILLABLE|TASK_FREEZABLE);
 		else
-			set_current_state(TASK_INTERRUPTIBLE);
+			set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
 
 		/*
 		 * Check kthread_should_stop() after we set the task state to
@@ -641,14 +641,14 @@ xfsaild(
 		    ailp->ail_target == ailp->ail_target_prev &&
 		    list_empty(&ailp->ail_buf_list)) {
 			spin_unlock(&ailp->ail_lock);
-			freezable_schedule();
+			schedule();
 			tout = 0;
 			continue;
 		}
 		spin_unlock(&ailp->ail_lock);
 
 		if (tout)
-			freezable_schedule_timeout(msecs_to_jiffies(tout));
+			schedule_timeout(msecs_to_jiffies(tout));
 
 		__set_current_state(TASK_RUNNING);
 
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -103,6 +103,7 @@ extern void wait_for_completion(struct c
 extern void wait_for_completion_io(struct completion *);
 extern int wait_for_completion_interruptible(struct completion *x);
 extern int wait_for_completion_killable(struct completion *x);
+extern int wait_for_completion_state(struct completion *x, unsigned int state);
 extern unsigned long wait_for_completion_timeout(struct completion *x,
 						   unsigned long timeout);
 extern unsigned long wait_for_completion_io_timeout(struct completion *x,
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -8,9 +8,11 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/atomic.h>
+#include <linux/jump_label.h>
 
 #ifdef CONFIG_FREEZER
-extern atomic_t system_freezing_cnt;	/* nr of freezing conds in effect */
+DECLARE_STATIC_KEY_FALSE(freezer_active);
+
 extern bool pm_freezing;		/* PM freezing in effect */
 extern bool pm_nosig_freezing;		/* PM nosig freezing in effect */
 
@@ -22,10 +24,7 @@ extern unsigned int freeze_timeout_msecs
 /*
  * Check if a process has been frozen
  */
-static inline bool frozen(struct task_struct *p)
-{
-	return p->flags & PF_FROZEN;
-}
+extern bool frozen(struct task_struct *p);
 
 extern bool freezing_slow_path(struct task_struct *p);
 
@@ -34,9 +33,10 @@ extern bool freezing_slow_path(struct ta
  */
 static inline bool freezing(struct task_struct *p)
 {
-	if (likely(!atomic_read(&system_freezing_cnt)))
-		return false;
-	return freezing_slow_path(p);
+	if (static_branch_unlikely(&freezer_active))
+		return freezing_slow_path(p);
+
+	return false;
 }
 
 /* Takes and releases task alloc lock using task_lock() */
@@ -48,23 +48,14 @@ extern int freeze_kernel_threads(void);
 extern void thaw_processes(void);
 extern void thaw_kernel_threads(void);
 
-/*
- * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
- * If try_to_freeze causes a lockdep warning it means the caller may deadlock
- */
-static inline bool try_to_freeze_unsafe(void)
+static inline bool try_to_freeze(void)
 {
 	might_sleep();
 	if (likely(!freezing(current)))
 		return false;
-	return __refrigerator(false);
-}
-
-static inline bool try_to_freeze(void)
-{
 	if (!(current->flags & PF_NOFREEZE))
 		debug_check_no_locks_held();
-	return try_to_freeze_unsafe();
+	return __refrigerator(false);
 }
 
 extern bool freeze_task(struct task_struct *p);
@@ -79,195 +70,6 @@ static inline bool cgroup_freezing(struc
 }
 #endif /* !CONFIG_CGROUP_FREEZER */
 
-/*
- * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
- * calls wait_for_completion(&vfork) and reset right after it returns from this
- * function.  Next, the parent should call try_to_freeze() to freeze itself
- * appropriately in case the child has exited before the freezing of tasks is
- * complete.  However, we don't want kernel threads to be frozen in unexpected
- * places, so we allow them to block freeze_processes() instead or to set
- * PF_NOFREEZE if needed. Fortunately, in the ____call_usermodehelper() case the
- * parent won't really block freeze_processes(), since ____call_usermodehelper()
- * (the child) does a little before exec/exit and it can't be frozen before
- * waking up the parent.
- */
-
-
-/**
- * freezer_do_not_count - tell freezer to ignore %current
- *
- * Tell freezers to ignore the current task when determining whether the
- * target frozen state is reached.  IOW, the current task will be
- * considered frozen enough by freezers.
- *
- * The caller shouldn't do anything which isn't allowed for a frozen task
- * until freezer_cont() is called.  Usually, freezer[_do_not]_count() pair
- * wrap a scheduling operation and nothing much else.
- */
-static inline void freezer_do_not_count(void)
-{
-	current->flags |= PF_FREEZER_SKIP;
-}
-
-/**
- * freezer_count - tell freezer to stop ignoring %current
- *
- * Undo freezer_do_not_count().  It tells freezers that %current should be
- * considered again and tries to freeze if freezing condition is already in
- * effect.
- */
-static inline void freezer_count(void)
-{
-	current->flags &= ~PF_FREEZER_SKIP;
-	/*
-	 * If freezing is in progress, the following paired with smp_mb()
-	 * in freezer_should_skip() ensures that either we see %true
-	 * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP.
-	 */
-	smp_mb();
-	try_to_freeze();
-}
-
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline void freezer_count_unsafe(void)
-{
-	current->flags &= ~PF_FREEZER_SKIP;
-	smp_mb();
-	try_to_freeze_unsafe();
-}
-
-/**
- * freezer_should_skip - whether to skip a task when determining frozen
- *			 state is reached
- * @p: task in quesion
- *
- * This function is used by freezers after establishing %true freezing() to
- * test whether a task should be skipped when determining the target frozen
- * state is reached.  IOW, if this function returns %true, @p is considered
- * frozen enough.
- */
-static inline bool freezer_should_skip(struct task_struct *p)
-{
-	/*
-	 * The following smp_mb() paired with the one in freezer_count()
-	 * ensures that either freezer_count() sees %true freezing() or we
-	 * see cleared %PF_FREEZER_SKIP and return %false.  This makes it
-	 * impossible for a task to slip frozen state testing after
-	 * clearing %PF_FREEZER_SKIP.
-	 */
-	smp_mb();
-	return p->flags & PF_FREEZER_SKIP;
-}
-
-/*
- * These functions are intended to be used whenever you want allow a sleeping
- * task to be frozen. Note that neither return any clear indication of
- * whether a freeze event happened while in this function.
- */
-
-/* Like schedule(), but should not block the freezer. */
-static inline void freezable_schedule(void)
-{
-	freezer_do_not_count();
-	schedule();
-	freezer_count();
-}
-
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline void freezable_schedule_unsafe(void)
-{
-	freezer_do_not_count();
-	schedule();
-	freezer_count_unsafe();
-}
-
-/*
- * Like schedule_timeout(), but should not block the freezer.  Do not
- * call this with locks held.
- */
-static inline long freezable_schedule_timeout(long timeout)
-{
-	long __retval;
-	freezer_do_not_count();
-	__retval = schedule_timeout(timeout);
-	freezer_count();
-	return __retval;
-}
-
-/*
- * Like schedule_timeout_interruptible(), but should not block the freezer.  Do not
- * call this with locks held.
- */
-static inline long freezable_schedule_timeout_interruptible(long timeout)
-{
-	long __retval;
-	freezer_do_not_count();
-	__retval = schedule_timeout_interruptible(timeout);
-	freezer_count();
-	return __retval;
-}
-
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline long freezable_schedule_timeout_interruptible_unsafe(long timeout)
-{
-	long __retval;
-
-	freezer_do_not_count();
-	__retval = schedule_timeout_interruptible(timeout);
-	freezer_count_unsafe();
-	return __retval;
-}
-
-/* Like schedule_timeout_killable(), but should not block the freezer. */
-static inline long freezable_schedule_timeout_killable(long timeout)
-{
-	long __retval;
-	freezer_do_not_count();
-	__retval = schedule_timeout_killable(timeout);
-	freezer_count();
-	return __retval;
-}
-
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
-{
-	long __retval;
-	freezer_do_not_count();
-	__retval = schedule_timeout_killable(timeout);
-	freezer_count_unsafe();
-	return __retval;
-}
-
-/*
- * Like schedule_hrtimeout_range(), but should not block the freezer.  Do not
- * call this with locks held.
- */
-static inline int freezable_schedule_hrtimeout_range(ktime_t *expires,
-		u64 delta, const enum hrtimer_mode mode)
-{
-	int __retval;
-	freezer_do_not_count();
-	__retval = schedule_hrtimeout_range(expires, delta, mode);
-	freezer_count();
-	return __retval;
-}
-
-/*
- * Freezer-friendly wrappers around wait_event_interruptible(),
- * wait_event_killable() and wait_event_interruptible_timeout(), originally
- * defined in <linux/wait.h>
- */
-
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-#define wait_event_freezekillable_unsafe(wq, condition)			\
-({									\
-	int __retval;							\
-	freezer_do_not_count();						\
-	__retval = wait_event_killable(wq, (condition));		\
-	freezer_count_unsafe();						\
-	__retval;							\
-})
-
 #else /* !CONFIG_FREEZER */
 static inline bool frozen(struct task_struct *p) { return false; }
 static inline bool freezing(struct task_struct *p) { return false; }
@@ -281,35 +83,9 @@ static inline void thaw_kernel_threads(v
 
 static inline bool try_to_freeze(void) { return false; }
 
-static inline void freezer_do_not_count(void) {}
 static inline void freezer_count(void) {}
-static inline int freezer_should_skip(struct task_struct *p) { return 0; }
 static inline void set_freezable(void) {}
 
-#define freezable_schedule()  schedule()
-
-#define freezable_schedule_unsafe()  schedule()
-
-#define freezable_schedule_timeout(timeout)  schedule_timeout(timeout)
-
-#define freezable_schedule_timeout_interruptible(timeout)		\
-	schedule_timeout_interruptible(timeout)
-
-#define freezable_schedule_timeout_interruptible_unsafe(timeout)	\
-	schedule_timeout_interruptible(timeout)
-
-#define freezable_schedule_timeout_killable(timeout)			\
-	schedule_timeout_killable(timeout)
-
-#define freezable_schedule_timeout_killable_unsafe(timeout)		\
-	schedule_timeout_killable(timeout)
-
-#define freezable_schedule_hrtimeout_range(expires, delta, mode)	\
-	schedule_hrtimeout_range(expires, delta, mode)
-
-#define wait_event_freezekillable_unsafe(wq, condition)			\
-		wait_event_killable(wq, condition)
-
 #endif /* !CONFIG_FREEZER */
 
 #endif	/* FREEZER_H_INCLUDED */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -80,25 +80,32 @@ struct task_group;
  */
 
 /* Used in tsk->state: */
-#define TASK_RUNNING			0x0000
-#define TASK_INTERRUPTIBLE		0x0001
-#define TASK_UNINTERRUPTIBLE		0x0002
-#define __TASK_STOPPED			0x0004
-#define __TASK_TRACED			0x0008
+#define TASK_RUNNING			0x000000
+#define TASK_INTERRUPTIBLE		0x000001
+#define TASK_UNINTERRUPTIBLE		0x000002
+#define __TASK_STOPPED			0x000004
+#define __TASK_TRACED			0x000008
 /* Used in tsk->exit_state: */
-#define EXIT_DEAD			0x0010
-#define EXIT_ZOMBIE			0x0020
+#define EXIT_DEAD			0x000010
+#define EXIT_ZOMBIE			0x000020
 #define EXIT_TRACE			(EXIT_ZOMBIE | EXIT_DEAD)
 /* Used in tsk->state again: */
-#define TASK_PARKED			0x0040
-#define TASK_DEAD			0x0080
-#define TASK_WAKEKILL			0x0100
-#define TASK_WAKING			0x0200
-#define TASK_NOLOAD			0x0400
-#define TASK_NEW			0x0800
-/* RT specific auxilliary flag to mark RT lock waiters */
-#define TASK_RTLOCK_WAIT		0x1000
-#define TASK_STATE_MAX			0x2000
+#define TASK_PARKED			0x000040
+#define TASK_DEAD			0x000080
+#define TASK_WAKEKILL			0x000100
+#define TASK_WAKING			0x000200
+#define TASK_NOLOAD			0x000400
+#define TASK_NEW			0x000800
+#define TASK_FREEZABLE			0x001000
+#define __TASK_FREEZABLE_UNSAFE	       (0x002000 * IS_ENABLED(CONFIG_LOCKDEP))
+#define TASK_FROZEN			0x004000
+#define TASK_RTLOCK_WAIT		0x008000
+#define TASK_STATE_MAX			0x010000
+
+/*
+ * DO NOT ADD ANY NEW USERS !
+ */
+#define TASK_FREEZABLE_UNSAFE		(TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE)
 
 /* Convenience macros for the sake of set_current_state: */
 #define TASK_KILLABLE			(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -1695,7 +1702,6 @@ extern struct pid *cad_pid;
 #define PF_USED_MATH		0x00002000	/* If unset the fpu must be initialized before use */
 #define PF_USED_ASYNC		0x00004000	/* Used async_schedule*(), used by module init */
 #define PF_NOFREEZE		0x00008000	/* This thread should not be frozen */
-#define PF_FROZEN		0x00010000	/* Frozen for system suspend */
 #define PF_KSWAPD		0x00020000	/* I am kswapd */
 #define PF_MEMALLOC_NOFS	0x00040000	/* All allocation requests will inherit GFP_NOFS */
 #define PF_MEMALLOC_NOIO	0x00080000	/* All allocation requests will inherit GFP_NOIO */
@@ -1707,7 +1713,6 @@ extern struct pid *cad_pid;
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_PIN		0x10000000	/* Allocation context constrained to zones which allow long term pinning. */
-#define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK		0x80000000      /* This thread called freeze_processes() and should not be frozen */
 
 /*
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -265,7 +265,7 @@ int		rpc_malloc(struct rpc_task *);
 void		rpc_free(struct rpc_task *);
 int		rpciod_up(void);
 void		rpciod_down(void);
-int		__rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *);
+int		rpc_wait_for_completion_task(struct rpc_task *task);
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 struct net;
 void		rpc_show_tasks(struct net *);
@@ -276,11 +276,6 @@ extern struct workqueue_struct *rpciod_w
 extern struct workqueue_struct *xprtiod_workqueue;
 void		rpc_prepare_task(struct rpc_task *task);
 
-static inline int rpc_wait_for_completion_task(struct rpc_task *task)
-{
-	return __rpc_wait_for_completion_task(task, NULL);
-}
-
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS)
 static inline const char * rpc_qname(const struct rpc_wait_queue *q)
 {
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -335,8 +335,8 @@ do {										\
 } while (0)
 
 #define __wait_event_freezable(wq_head, condition)				\
-	___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,		\
-			    freezable_schedule())
+	___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE),	\
+			0, 0, schedule())
 
 /**
  * wait_event_freezable - sleep (or freeze) until a condition gets true
@@ -394,8 +394,8 @@ do {										\
 
 #define __wait_event_freezable_timeout(wq_head, condition, timeout)		\
 	___wait_event(wq_head, ___wait_cond_timeout(condition),			\
-		      TASK_INTERRUPTIBLE, 0, timeout,				\
-		      __ret = freezable_schedule_timeout(__ret))
+		      (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout,		\
+		      __ret = schedule_timeout(__ret))
 
 /*
  * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
@@ -615,8 +615,8 @@ do {										\
 
 
 #define __wait_event_freezable_exclusive(wq, condition)				\
-	___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,			\
-			freezable_schedule())
+	___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\
+			schedule())
 
 #define wait_event_freezable_exclusive(wq, condition)				\
 ({										\
@@ -905,6 +905,34 @@ extern int do_wait_intr_irq(wait_queue_h
 	__ret;									\
 })
 
+#define __wait_event_state(wq, condition, state)				\
+	___wait_event(wq, condition, state, 0, 0, schedule())
+
+/**
+ * wait_event_state - sleep until a condition gets true
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @state: state to sleep in
+ *
+ * The process is put to sleep (@state) until the @condition evaluates to true
+ * or a signal is received.  The @condition is checked each time the waitqueue
+ * @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_state(wq_head, condition, state)				\
+({										\
+	int __ret = 0;								\
+	might_sleep();								\
+	if (!(condition))							\
+		__ret = __wait_event_state(wq_head, condition, state);		\
+	__ret;									\
+})
+
 #define __wait_event_killable_timeout(wq_head, condition, timeout)		\
 	___wait_event(wq_head, ___wait_cond_timeout(condition),			\
 		      TASK_KILLABLE, 0, timeout,				\
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -113,7 +113,7 @@ static int freezer_css_online(struct cgr
 
 	if (parent && (parent->state & CGROUP_FREEZING)) {
 		freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
-		atomic_inc(&system_freezing_cnt);
+		static_branch_inc(&freezer_active);
 	}
 
 	mutex_unlock(&freezer_mutex);
@@ -134,7 +134,7 @@ static void freezer_css_offline(struct c
 	mutex_lock(&freezer_mutex);
 
 	if (freezer->state & CGROUP_FREEZING)
-		atomic_dec(&system_freezing_cnt);
+		static_branch_dec(&freezer_active);
 
 	freezer->state = 0;
 
@@ -179,6 +179,7 @@ static void freezer_attach(struct cgroup
 			__thaw_task(task);
 		} else {
 			freeze_task(task);
+
 			/* clear FROZEN and propagate upwards */
 			while (freezer && (freezer->state & CGROUP_FROZEN)) {
 				freezer->state &= ~CGROUP_FROZEN;
@@ -271,16 +272,8 @@ static void update_if_frozen(struct cgro
 	css_task_iter_start(css, 0, &it);
 
 	while ((task = css_task_iter_next(&it))) {
-		if (freezing(task)) {
-			/*
-			 * freezer_should_skip() indicates that the task
-			 * should be skipped when determining freezing
-			 * completion.  Consider it frozen in addition to
-			 * the usual frozen condition.
-			 */
-			if (!frozen(task) && !freezer_should_skip(task))
-				goto out_iter_end;
-		}
+		if (freezing(task) && !frozen(task))
+			goto out_iter_end;
 	}
 
 	freezer->state |= CGROUP_FROZEN;
@@ -357,7 +350,7 @@ static void freezer_apply_state(struct f
 
 	if (freeze) {
 		if (!(freezer->state & CGROUP_FREEZING))
-			atomic_inc(&system_freezing_cnt);
+			static_branch_inc(&freezer_active);
 		freezer->state |= state;
 		freeze_cgroup(freezer);
 	} else {
@@ -366,9 +359,9 @@ static void freezer_apply_state(struct f
 		freezer->state &= ~state;
 
 		if (!(freezer->state & CGROUP_FREEZING)) {
-			if (was_freezing)
-				atomic_dec(&system_freezing_cnt);
 			freezer->state &= ~CGROUP_FROZEN;
+			if (was_freezing)
+				static_branch_dec(&freezer_active);
 			unfreeze_cgroup(freezer);
 		}
 	}
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -469,10 +469,10 @@ static void exit_mm(void)
 			complete(&core_state->startup);
 
 		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
+			set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
 			if (!self.task) /* see coredump_finish() */
 				break;
-			freezable_schedule();
+			schedule();
 		}
 		__set_current_state(TASK_RUNNING);
 		mmap_read_lock(mm);
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1348,13 +1348,12 @@ static void complete_vfork_done(struct t
 static int wait_for_vfork_done(struct task_struct *child,
 				struct completion *vfork)
 {
+	unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
 	int killed;
 
-	freezer_do_not_count();
 	cgroup_enter_frozen();
-	killed = wait_for_completion_killable(vfork);
+	killed = wait_for_completion_state(vfork, state);
 	cgroup_leave_frozen(false);
-	freezer_count();
 
 	if (killed) {
 		task_lock(child);
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -13,10 +13,11 @@
 #include <linux/kthread.h>
 
 /* total number of freezing conditions in effect */
-atomic_t system_freezing_cnt = ATOMIC_INIT(0);
-EXPORT_SYMBOL(system_freezing_cnt);
+DEFINE_STATIC_KEY_FALSE(freezer_active);
+EXPORT_SYMBOL(freezer_active);
 
-/* indicate whether PM freezing is in effect, protected by
+/*
+ * indicate whether PM freezing is in effect, protected by
  * system_transition_mutex
  */
 bool pm_freezing;
@@ -29,7 +30,7 @@ static DEFINE_SPINLOCK(freezer_lock);
  * freezing_slow_path - slow path for testing whether a task needs to be frozen
  * @p: task to be tested
  *
- * This function is called by freezing() if system_freezing_cnt isn't zero
+ * This function is called by freezing() if freezer_active isn't zero
  * and tests whether @p needs to enter and stay in frozen state.  Can be
  * called under any context.  The freezers are responsible for ensuring the
  * target tasks see the updated state.
@@ -52,41 +53,40 @@ bool freezing_slow_path(struct task_stru
 }
 EXPORT_SYMBOL(freezing_slow_path);
 
+bool frozen(struct task_struct *p)
+{
+	return READ_ONCE(p->__state) & TASK_FROZEN;
+}
+
 /* Refrigerator is place where frozen processes are stored :-). */
 bool __refrigerator(bool check_kthr_stop)
 {
-	/* Hmm, should we be allowed to suspend when there are realtime
-	   processes around? */
+	unsigned int state = get_current_state();
 	bool was_frozen = false;
-	unsigned int save = get_current_state();
 
 	pr_debug("%s entered refrigerator\n", current->comm);
 
+	WARN_ON_ONCE(state && !(state & TASK_NORMAL));
+
 	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		bool freeze;
+
+		set_current_state(TASK_FROZEN);
 
 		spin_lock_irq(&freezer_lock);
-		current->flags |= PF_FROZEN;
-		if (!freezing(current) ||
-		    (check_kthr_stop && kthread_should_stop()))
-			current->flags &= ~PF_FROZEN;
+		freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
 		spin_unlock_irq(&freezer_lock);
 
-		if (!(current->flags & PF_FROZEN))
+		if (!freeze)
 			break;
+
 		was_frozen = true;
 		schedule();
 	}
+	__set_current_state(TASK_RUNNING);
 
 	pr_debug("%s left refrigerator\n", current->comm);
 
-	/*
-	 * Restore saved task state before returning.  The mb'd version
-	 * needs to be used; otherwise, it might silently break
-	 * synchronization which depends on ordered task state change.
-	 */
-	set_current_state(save);
-
 	return was_frozen;
 }
 EXPORT_SYMBOL(__refrigerator);
@@ -101,6 +101,37 @@ static void fake_signal_wake_up(struct t
 	}
 }
 
+static inline unsigned int __can_freeze(struct task_struct *p)
+{
+	unsigned int state = READ_ONCE(p->__state);
+
+	if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
+		return 0;
+
+	/*
+	 * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
+	 * can suffer spurious wakeups.
+	 */
+	if (state & TASK_FREEZABLE)
+		WARN_ON_ONCE(!(state & TASK_NORMAL));
+
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * It's dangerous to freeze with locks held; there be dragons there.
+	 */
+	if (!(state & __TASK_FREEZABLE_UNSAFE))
+		WARN_ON_ONCE(debug_locks && p->lockdep_depth);
+#endif
+
+	return TASK_FROZEN;
+}
+
+/* See task_cond_set_special_state(); serializes against ttwu() */
+static bool __freeze_task(struct task_struct *p)
+{
+	return task_cond_set_special_state(p, __can_freeze(p));
+}
+
 /**
  * freeze_task - send a freeze request to given task
  * @p: task to send the request to
@@ -116,20 +147,8 @@ bool freeze_task(struct task_struct *p)
 {
 	unsigned long flags;
 
-	/*
-	 * This check can race with freezer_do_not_count, but worst case that
-	 * will result in an extra wakeup being sent to the task.  It does not
-	 * race with freezer_count(), the barriers in freezer_count() and
-	 * freezer_should_skip() ensure that either freezer_count() sees
-	 * freezing == true in try_to_freeze() and freezes, or
-	 * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
-	 * normally.
-	 */
-	if (freezer_should_skip(p))
-		return false;
-
 	spin_lock_irqsave(&freezer_lock, flags);
-	if (!freezing(p) || frozen(p)) {
+	if (!freezing(p) || frozen(p) || __freeze_task(p)) {
 		spin_unlock_irqrestore(&freezer_lock, flags);
 		return false;
 	}
@@ -137,19 +156,58 @@ bool freeze_task(struct task_struct *p)
 	if (!(p->flags & PF_KTHREAD))
 		fake_signal_wake_up(p);
 	else
-		wake_up_state(p, TASK_INTERRUPTIBLE);
+		wake_up_state(p, TASK_NORMAL);
 
 	spin_unlock_irqrestore(&freezer_lock, flags);
 	return true;
 }
 
+/*
+ * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
+ * state in p->jobctl and p->ptrace respectively. If either of them got a
+ * wakeup that was missed because TASK_FROZEN, then their canonical state
+ * reflects that and the below will refuse to restore the special state and
+ * instead issue the wakeup.
+ */
+static inline unsigned int __thaw_special(struct task_struct *p)
+{
+	unsigned int state = 0;
+
+	if (p->ptrace & PT_STOPPED) {
+		state = __TASK_TRACED;
+
+		if (p->ptrace & PT_STOPPED_FATAL) {
+			state |= TASK_WAKEKILL;
+			if (__fatal_signal_pending(p))
+				state = 0;
+		}
+
+	} else if ((p->jobctl & JOBCTL_STOP_PENDING) &&
+		   !__fatal_signal_pending(p)) {
+
+		state = TASK_STOPPED;
+	}
+
+	return state;
+}
+
 void __thaw_task(struct task_struct *p)
 {
-	unsigned long flags;
+	unsigned long flags, flags2;
 
 	spin_lock_irqsave(&freezer_lock, flags);
-	if (frozen(p))
-		wake_up_process(p);
+	if (WARN_ON_ONCE(freezing(p)))
+		goto unlock;
+
+	if (lock_task_sighand(p, &flags2)) {
+		bool ret = task_cond_set_special_state(p, __thaw_special(p));
+		unlock_task_sighand(p, &flags2);
+		if (ret)
+			goto unlock;
+	}
+
+	wake_up_state(p, TASK_FROZEN);
+unlock:
 	spin_unlock_irqrestore(&freezer_lock, flags);
 }
 
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2832,7 +2832,7 @@ static void futex_wait_queue_me(struct f
 	 * queue_me() calls spin_unlock() upon completion, both serializing
 	 * access to the hash list and forcing another memory barrier.
 	 */
-	set_current_state(TASK_INTERRUPTIBLE);
+	set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
 	queue_me(q, hb);
 
 	/* Arm the timer */
@@ -2850,7 +2850,7 @@ static void futex_wait_queue_me(struct f
 		 * is no timeout, or if it has yet to expire.
 		 */
 		if (!timeout || timeout->task)
-			freezable_schedule();
+			schedule();
 	}
 	__set_current_state(TASK_RUNNING);
 }
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -93,8 +93,8 @@ static void check_hung_task(struct task_
 	 * Ensure the task is not frozen.
 	 * Also, skip vfork and any other user process that freezer should skip.
 	 */
-	if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
-	    return;
+	if (unlikely(READ_ONCE(t->__state) & (TASK_FREEZABLE | TASK_FROZEN)))
+		return;
 
 	/*
 	 * When a freshly created task is scheduled once, changes its state to
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -24,7 +24,7 @@
 unsigned int lock_system_sleep(void)
 {
 	unsigned int flags = current->flags;
-	current->flags |= PF_FREEZER_SKIP;
+	current->flags |= PF_NOFREEZE;
 	mutex_lock(&system_transition_mutex);
 	return flags;
 }
@@ -48,8 +48,8 @@ void unlock_system_sleep(unsigned int fl
 	 * Which means, if we use try_to_freeze() here, it would make them
 	 * enter the refrigerator, thus causing hibernation to lockup.
 	 */
-	if (!(flags & PF_FREEZER_SKIP))
-		current->flags &= ~PF_FREEZER_SKIP;
+	if (!(flags & PF_NOFREEZE))
+		current->flags &= ~PF_NOFREEZE;
 	mutex_unlock(&system_transition_mutex);
 }
 EXPORT_SYMBOL_GPL(unlock_system_sleep);
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -53,8 +53,7 @@ static int try_to_freeze_tasks(bool user
 			if (p == current || !freeze_task(p))
 				continue;
 
-			if (!freezer_should_skip(p))
-				todo++;
+			todo++;
 		}
 		read_unlock(&tasklist_lock);
 
@@ -99,8 +98,7 @@ static int try_to_freeze_tasks(bool user
 		if (!wakeup || pm_debug_messages_on) {
 			read_lock(&tasklist_lock);
 			for_each_process_thread(g, p) {
-				if (p != current && !freezer_should_skip(p)
-				    && freezing(p) && !frozen(p))
+				if (p != current && freezing(p) && !frozen(p))
 					sched_show_task(p);
 			}
 			read_unlock(&tasklist_lock);
@@ -132,7 +130,7 @@ int freeze_processes(void)
 	current->flags |= PF_SUSPEND_TASK;
 
 	if (!pm_freezing)
-		atomic_inc(&system_freezing_cnt);
+		static_branch_inc(&freezer_active);
 
 	pm_wakeup_clear(true);
 	pr_info("Freezing user space processes ... ");
@@ -193,7 +191,7 @@ void thaw_processes(void)
 
 	trace_suspend_resume(TPS("thaw_processes"), 0, true);
 	if (pm_freezing)
-		atomic_dec(&system_freezing_cnt);
+		static_branch_dec(&freezer_active);
 	pm_freezing = false;
 	pm_nosig_freezing = false;
 
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -282,7 +282,7 @@ static int ptrace_check_attach(struct ta
 	read_unlock(&tasklist_lock);
 
 	if (!ret && !ignore_state) {
-		if (!wait_task_inactive(child, __TASK_TRACED)) { // XXX mooo!!!
+		if (!wait_task_inactive(child, __TASK_TRACED | TASK_FREEZABLE)) {
 			/*
 			 * This can only happen if may_ptrace_stop() fails and
 			 * ptrace_stop() changes ->state back to TASK_RUNNING,
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -247,6 +247,15 @@ int __sched wait_for_completion_killable
 }
 EXPORT_SYMBOL(wait_for_completion_killable);
 
+int __sched wait_for_completion_state(struct completion *x, unsigned int state)
+{
+	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state);
+	if (t == -ERESTARTSYS)
+		return t;
+	return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_state);
+
 /**
  * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
  * @x:  holds the state of this particular completion
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3162,6 +3162,19 @@ int migrate_swap(struct task_struct *cur
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline __wti_match(struct task_struct *p, unsigned int match_state)
+{
+	unsigned int state = READ_ONCE(p->__state);
+
+	if ((match_state & TASK_FREEZABLE) && state == TASK_FROZEN)
+		return true;
+
+	if (state == (match_state & ~TASK_FREEZABLE))
+		return true;
+
+	return false;
+}
+
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -3206,7 +3219,7 @@ unsigned long wait_task_inactive(struct
 		 * is actually now running somewhere else!
 		 */
 		while (task_running(rq, p)) {
-			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+			if (match_state && __wti_match(p, match_state))
 				return 0;
 			cpu_relax();
 		}
@@ -3221,7 +3234,7 @@ unsigned long wait_task_inactive(struct
 		running = task_running(rq, p);
 		queued = task_on_rq_queued(p);
 		ncsw = 0;
-		if (!match_state || READ_ONCE(p->__state) == match_state)
+		if (!match_state || __wti_match(p, match_state))
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &rf);
 
@@ -6154,7 +6167,7 @@ static void __sched notrace __schedule(u
 			prev->sched_contributes_to_load =
 				(prev_state & TASK_UNINTERRUPTIBLE) &&
 				!(prev_state & TASK_NOLOAD) &&
-				!(prev->flags & PF_FROZEN);
+				!(prev_state & TASK_FROZEN);
 
 			if (prev->sched_contributes_to_load)
 				rq->nr_uninterruptible++;
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2299,7 +2299,7 @@ static void ptrace_stop(int exit_code, i
 		read_unlock(&tasklist_lock);
 		cgroup_enter_frozen();
 		preempt_enable_no_resched();
-		freezable_schedule();
+		schedule();
 		cgroup_leave_frozen(true);
 	} else {
 		/*
@@ -2479,7 +2479,7 @@ static bool do_signal_stop(int signr)
 
 		/* Now we don't run again until woken by SIGCONT or SIGKILL */
 		cgroup_enter_frozen();
-		freezable_schedule();
+		schedule();
 		return true;
 	} else {
 		/*
@@ -2555,11 +2555,11 @@ static void do_freezer_trap(void)
 	 * immediately (if there is a non-fatal signal pending), and
 	 * put the task into sleep.
 	 */
-	__set_current_state(TASK_INTERRUPTIBLE);
+	__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
 	clear_thread_flag(TIF_SIGPENDING);
 	spin_unlock_irq(&current->sighand->siglock);
 	cgroup_enter_frozen();
-	freezable_schedule();
+	schedule();
 }
 
 static int ptrace_signal(int signr, kernel_siginfo_t *info)
@@ -3608,9 +3608,9 @@ static int do_sigtimedwait(const sigset_
 		recalc_sigpending();
 		spin_unlock_irq(&tsk->sighand->siglock);
 
-		__set_current_state(TASK_INTERRUPTIBLE);
-		ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
-							 HRTIMER_MODE_REL);
+		__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+		ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns,
+					       HRTIMER_MODE_REL);
 		spin_lock_irq(&tsk->sighand->siglock);
 		__set_task_blocked(tsk, &tsk->real_blocked);
 		sigemptyset(&tsk->real_blocked);
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2037,11 +2037,11 @@ static int __sched do_nanosleep(struct h
 	struct restart_block *restart;
 
 	do {
-		set_current_state(TASK_INTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
 		hrtimer_sleeper_start_expires(t, mode);
 
 		if (likely(t->task))
-			freezable_schedule();
+			schedule();
 
 		hrtimer_cancel(&t->timer);
 		mode = HRTIMER_MODE_ABS;
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -404,6 +404,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup)
  */
 int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 {
+	unsigned int state = TASK_UNINTERRUPTIBLE;
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval = 0;
 
@@ -437,25 +438,22 @@ int call_usermodehelper_exec(struct subp
 	if (wait == UMH_NO_WAIT)	/* task has freed sub_info */
 		goto unlock;
 
+	if (wait & UMH_KILLABLE)
+		state |= TASK_KILLABLE;
+
 	if (wait & UMH_FREEZABLE)
-		freezer_do_not_count();
+		state |= TASK_FREEZABLE;
 
-	if (wait & UMH_KILLABLE) {
-		retval = wait_for_completion_killable(&done);
-		if (!retval)
-			goto wait_done;
+	retval = wait_for_completion_state(&done, state);
+	if (!retval)
+		goto wait_done;
 
+	if (wait & UMH_KILLABLE) {
 		/* umh_complete() will see NULL and free sub_info */
 		if (xchg(&sub_info->complete, NULL))
 			goto unlock;
-		/* fallthrough, umh_complete() was already called */
 	}
 
-	wait_for_completion(&done);
-
-	if (wait & UMH_FREEZABLE)
-		freezer_count();
-
 wait_done:
 	retval = sub_info->retval;
 out:
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -793,8 +793,8 @@ static void khugepaged_alloc_sleep(void)
 	DEFINE_WAIT(wait);
 
 	add_wait_queue(&khugepaged_wait, &wait);
-	freezable_schedule_timeout_interruptible(
-		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+	__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+	schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
 	remove_wait_queue(&khugepaged_wait, &wait);
 }
 
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -268,7 +268,7 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue
 
 static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
 {
-	freezable_schedule_unsafe();
+	schedule();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
 	return 0;
@@ -324,14 +324,12 @@ static int rpc_complete_task(struct rpc_
  * to enforce taking of the wq->lock and hence avoid races with
  * rpc_complete_task().
  */
-int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action)
+int rpc_wait_for_completion_task(struct rpc_task *task)
 {
-	if (action == NULL)
-		action = rpc_wait_bit_killable;
 	return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
-			action, TASK_KILLABLE);
+			rpc_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
 }
-EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
+EXPORT_SYMBOL_GPL(rpc_wait_for_completion_task);
 
 /*
  * Make an RPC task runnable.
@@ -938,7 +936,7 @@ static void __rpc_execute(struct rpc_tas
 		trace_rpc_task_sync_sleep(task, task->tk_action);
 		status = out_of_line_wait_on_bit(&task->tk_runstate,
 				RPC_TASK_QUEUED, rpc_wait_bit_killable,
-				TASK_KILLABLE);
+				TASK_KILLABLE|TASK_FREEZABLE);
 		if (status < 0) {
 			/*
 			 * When a sync task receives a signal, it exits with
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2442,13 +2442,14 @@ static long unix_stream_data_wait(struct
 				  struct sk_buff *last, unsigned int last_len,
 				  bool freezable)
 {
+	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
 	struct sk_buff *tail;
 	DEFINE_WAIT(wait);
 
 	unix_state_lock(sk);
 
 	for (;;) {
-		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, state);
 
 		tail = skb_peek_tail(&sk->sk_receive_queue);
 		if (tail != last ||
@@ -2461,10 +2462,7 @@ static long unix_stream_data_wait(struct
 
 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 		unix_state_unlock(sk);
-		if (freezable)
-			timeo = freezable_schedule_timeout(timeo);
-		else
-			timeo = schedule_timeout(timeo);
+		timeo = schedule_timeout(timeo);
 		unix_state_lock(sk);
 
 		if (sock_flag(sk, SOCK_DEAD))



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags
  2021-10-09 10:07 ` [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
@ 2021-10-14  8:58   ` Will Deacon
  0 siblings, 0 replies; 12+ messages in thread
From: Will Deacon @ 2021-10-14  8:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, linux-kernel, tj, linux-pm

On Sat, Oct 09, 2021 at 12:07:55PM +0200, Peter Zijlstra wrote:
> Rafael explained that the reason for having both PF_NOFREEZE and
> PF_FREEZER_SKIP is that {,un}lock_system_sleep() is callable from
> kthread context that has previously called set_freezable().
> 
> In preparation of merging the flags, have {,un}lock_system_sleep() save
> and restore current->flags.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  drivers/scsi/scsi_transport_spi.c |    7 ++++---
>  include/linux/suspend.h           |    8 ++++----
>  kernel/power/hibernate.c          |   35 ++++++++++++++++++++++-------------
>  kernel/power/main.c               |   16 ++++++++++------
>  kernel/power/suspend.c            |   12 ++++++++----
>  kernel/power/user.c               |   24 ++++++++++++++----------
>  6 files changed, 62 insertions(+), 40 deletions(-)

There's lots of Christmas tree inversion sprinkled in here, but the change
looks good so:

Acked-by: Will Deacon <will@kernel.org>

Will

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction
  2021-10-09 10:07 ` [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction Peter Zijlstra
@ 2021-10-14  9:13   ` Will Deacon
  0 siblings, 0 replies; 12+ messages in thread
From: Will Deacon @ 2021-10-14  9:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, linux-kernel, tj, linux-pm

On Sat, Oct 09, 2021 at 12:07:56PM +0200, Peter Zijlstra wrote:
> handle_initrd() marks itself as PF_FREEZER_SKIP in order to ensure
> that the UMH, which is going to freeze the system, doesn't
> indefinitely wait for its caller.
> 
> Rework things by adding UMH_FREEZABLE to indicate the completion is
> freezable.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/umh.h     |    9 +++++----
>  init/do_mounts_initrd.c |   10 +---------
>  kernel/umh.c            |    8 ++++++++
>  3 files changed, 14 insertions(+), 13 deletions(-)

This looks much better to me:

Acked-by: Will Deacon <will@kernel.org>

Just a question on the old code:

> --- a/include/linux/umh.h
> +++ b/include/linux/umh.h
> @@ -11,10 +11,11 @@
>  struct cred;
>  struct file;
>  
> -#define UMH_NO_WAIT	0	/* don't wait at all */
> -#define UMH_WAIT_EXEC	1	/* wait for the exec, but not the process */
> -#define UMH_WAIT_PROC	2	/* wait for the process to complete */
> -#define UMH_KILLABLE	4	/* wait for EXEC/PROC killable */
> +#define UMH_NO_WAIT	0x00	/* don't wait at all */
> +#define UMH_WAIT_EXEC	0x01	/* wait for the exec, but not the process */
> +#define UMH_WAIT_PROC	0x02	/* wait for the process to complete */
> +#define UMH_KILLABLE	0x04	/* wait for EXEC/PROC killable */
> +#define UMH_FREEZABLE	0x08	/* wait for EXEC/PROC freezable */
>  
>  struct subprocess_info {
>  	struct work_struct work;
> --- a/init/do_mounts_initrd.c
> +++ b/init/do_mounts_initrd.c
> @@ -79,19 +79,11 @@ static void __init handle_initrd(void)
>  	init_mkdir("/old", 0700);
>  	init_chdir("/old");
>  
> -	/*
> -	 * In case that a resume from disk is carried out by linuxrc or one of
> -	 * its children, we need to tell the freezer not to wait for us.
> -	 */
> -	current->flags |= PF_FREEZER_SKIP;
> -
>  	info = call_usermodehelper_setup("/linuxrc", argv, envp_init,
>  					 GFP_KERNEL, init_linuxrc, NULL, NULL);
>  	if (!info)
>  		return;
> -	call_usermodehelper_exec(info, UMH_WAIT_PROC);
> -
> -	current->flags &= ~PF_FREEZER_SKIP;

How was this supposed to work if it raced with the freezer checking the
flag?

Will
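
(For reference: the quoted hunk only shows the old PF_FREEZER_SKIP lines
being removed from handle_initrd(); the replacement call site is not part
of the quoted context. Presumably the reworked path ends up as a single
freezable wait along the lines of the sketch below -- an assumption about
the unquoted remainder of the patch, not a verbatim excerpt:

	/* assumed shape of the new call site, not quoted from the patch */
	call_usermodehelper_exec(info, UMH_WAIT_PROC | UMH_FREEZABLE);

with call_usermodehelper_exec() doing the freezer bookkeeping internally
whenever UMH_FREEZABLE is set.)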

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 3/6] ptrace: Order and comment PT_flags
  2021-10-09 10:07 ` [PATCH v3 3/6] ptrace: Order and comment PT_flags Peter Zijlstra
@ 2021-10-14  9:31   ` Will Deacon
  2021-10-14 14:27     ` Peter Zijlstra
  0 siblings, 1 reply; 12+ messages in thread
From: Will Deacon @ 2021-10-14  9:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, linux-kernel, tj, linux-pm

On Sat, Oct 09, 2021 at 12:07:57PM +0200, Peter Zijlstra wrote:
> Add a comment to the PT_flags to indicate their actual value; this
> makes it easier to see what bits are used and where there might be a
> possible hole to use.
> 
> Notably PT_SEIZED was placed wrong; also the PT_EVENT_FLAG() space
> seems ill defined: as written it seems to be meant to cover the entire
> PTRACE_O_ range offset by 3 bits, which would then be 3+[0..21],
> however PT_SEIZED is in the middle of that.

Why do you think PT_EVENT_FLAG() should cover all the PTRACE_O_* options?
Just going by the name and current callers, I'd only expect it to cover
the PTRACE_EVENT_* flags, no?

But in any case, having the comments is helpful, so:

Acked-by: Will Deacon <will@kernel.org>

Will

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 3/6] ptrace: Order and comment PT_flags
  2021-10-14  9:31   ` Will Deacon
@ 2021-10-14 14:27     ` Peter Zijlstra
  0 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-14 14:27 UTC (permalink / raw)
  To: Will Deacon
  Cc: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, linux-kernel, tj, linux-pm

On Thu, Oct 14, 2021 at 10:31:22AM +0100, Will Deacon wrote:
> On Sat, Oct 09, 2021 at 12:07:57PM +0200, Peter Zijlstra wrote:
> > Add a comment to the PT_flags to indicate their actual value; this
> > makes it easier to see what bits are used and where there might be a
> > possible hole to use.
> > 
> > Notably PT_SEIZED was placed wrong; also the PT_EVENT_FLAG() space
> > seems ill defined: as written it seems to be meant to cover the entire
> > PTRACE_O_ range offset by 3 bits, which would then be 3+[0..21],
> > however PT_SEIZED is in the middle of that.
> 
> Why do you think PT_EVENT_FLAG() should cover all the PTRACE_O_* options?
> Just going by the name and current callers, I'd only expect it to cover
> the PTRACE_EVENT_* flags, no?

Because PT_EXITKILL and PT_SUSPEND_SECCOMP are also exposed in that same
mapping.

Ideally we'd change PT_OPT_FLAG_SHIFT to 8 or something and have the
high 24 bits for OPT and then use the low 8 bits for SEIZED and the new
flags.
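
Purely as an illustration of that idea (not part of this series; the
exact bit values below are made up), such a layout could look roughly
like:

	/* hypothetical layout: low 8 bits reserved for internal flags */
	#define PT_PTRACED		0x00000001
	#define PT_SEIZED		0x00000002	/* SEIZE used, enable new behavior */
	#define PT_STOPPED		0x00000004	/* new flags from this series */
	#define PT_STOPPED_FATAL	0x00000008
	/* bits 4..7 still free */

	/* high bits mirror the PTRACE_O_ option space 1:1 */
	#define PT_OPT_FLAG_SHIFT	8
	#define PT_OPT_FLAG(opt)	((opt) << PT_OPT_FLAG_SHIFT)
	#define PT_EVENT_FLAG(event)	(1 << (PT_OPT_FLAG_SHIFT + (event)))

	#define PT_EXITKILL		PT_OPT_FLAG(PTRACE_O_EXITKILL)
	#define PT_SUSPEND_SECCOMP	PT_OPT_FLAG(PTRACE_O_SUSPEND_SECCOMP)

That keeps PT_SEIZED (and the new PT_STOPPED bits) out of the option
range entirely, while PT_EXITKILL and PT_SUSPEND_SECCOMP stay a plain
shift of their PTRACE_O_ counterparts.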

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic
  2021-10-09 10:08 ` [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic Peter Zijlstra
@ 2021-10-18 13:36   ` Peter Zijlstra
  0 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2021-10-18 13:36 UTC (permalink / raw)
  To: rjw, oleg, mingo, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, Will Deacon
  Cc: linux-kernel, tj, linux-pm

On Sat, Oct 09, 2021 at 12:08:00PM +0200, Peter Zijlstra wrote:

> +static inline unsigned int __can_freeze(struct task_struct *p)
> +{
> +	unsigned int state = READ_ONCE(p->__state);
> +
> +	if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
> +		return 0;
> +
> +	/*
> +	 * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
> +	 * can suffer spurious wakeups.
> +	 */
> +	if (state & TASK_FREEZABLE)
> +		WARN_ON_ONCE(!(state & TASK_NORMAL));
> +
> +#ifdef CONFIG_LOCKDEP
> +	/*
> +	 * It's dangerous to freeze with locks held; there be dragons there.
> +	 */
> +	if (!(state & __TASK_FREEZABLE_UNSAFE))
> +		WARN_ON_ONCE(debug_locks && p->lockdep_depth);
> +#endif
> +
> +	return TASK_FROZEN;
> +}
> +
> +/* See task_cond_set_special_state(); serializes against ttwu() */
> +static bool __freeze_task(struct task_struct *p)
> +{
> +	return task_cond_set_special_state(p, __can_freeze(p));
> +}

Will found an issue with this, notably task_cond_set_special_state() only
takes ->pi_lock and as such doesn't serialize against __schedule(),
which then yields the following fun scenario:


	__schedule()					__freeze_task()


	prev_state = READ_ONCE(prev->__state); // INTERRUPTIBLE

							task_cond_set_special_state()
							  ...
							  WRITE_ONCE(prev->__state, TASK_FROZEN);

	if (signal_pending_state(prev_state, prev)) // SIGPENDING
	  WRITE_ONCE(prev->__state, TASK_RUNNING)




And *whoopsie*, the freezer thinks we're frozen, but we're back in the game.


AFAICT the below, which uses the brand-spanking-new task_call_func()
(which currently sits in tip/sched/core) to also serialize against
rq->lock, should avoid this scenario.
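
For readers without tip/sched/core at hand, task_call_func() is,
roughly, the following -- a simplified sketch from memory, not the
exact code:

	static int task_call_func_sketch(struct task_struct *p,
					 int (*func)(struct task_struct *, void *),
					 void *arg)
	{
		struct rq_flags rf;
		struct rq *rq = NULL;
		int ret;

		raw_spin_lock_irqsave(&p->pi_lock, rf.flags);

		/* if the task is queued or running, also take its rq->lock */
		if (p->on_rq)
			rq = __task_rq_lock(p, &rf);

		/*
		 * With ->pi_lock (and rq->lock where applicable) held,
		 * neither ttwu() nor __schedule() can concurrently poke at
		 * p->__state, so func() sees and sets a stable state.
		 */
		ret = func(p, arg);

		if (rq)
			rq_unlock(rq, &rf);

		raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
		return ret;
	}

That is what closes the window above: the freezer callback can no
longer overlap the prev_state load / TASK_RUNNING store in __schedule().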


--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -233,25 +233,6 @@ struct task_group;
 	} while (0)
 
 /*
- * task_cond_set_special_state() is a cmpxchg like operation on task->state.
- *
- * This operation isn't safe in general and should only be used to transform
- * one (special) blocked state into another, such as:
- *   TASK_STOPPED <-> TASK_FROZEN.
- */
-#define task_cond_set_special_state(task, cond_state)			\
-	({								\
-		struct task_struct *__p = (task);			\
-		unsigned long __flags; /* may shadow */			\
-		unsigned int __state;					\
-		raw_spin_lock_irqsave(&__p->pi_lock, __flags);		\
-		if ((__state = (cond_state)))				\
-			WRITE_ONCE(__p->__state, __state);		\
-		raw_spin_unlock_irqrestore(&__p->pi_lock, __flags);	\
-		!!__state;						\
-	})
-
-/*
  * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
  *
  * RT's spin/rwlock substitutions are state preserving. The state of the
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -101,7 +101,7 @@ static void fake_signal_wake_up(struct t
 	}
 }
 
-static inline unsigned int __can_freeze(struct task_struct *p)
+static int __set_task_frozen(struct task_struct *p, void *arg)
 {
 	unsigned int state = READ_ONCE(p->__state);
 
@@ -123,13 +123,14 @@ static inline unsigned int __can_freeze(
 		WARN_ON_ONCE(debug_locks && p->lockdep_depth);
 #endif
 
+	WRITE_ONCE(p->__state, TASK_FROZEN);
 	return TASK_FROZEN;
 }
 
-/* See task_cond_set_special_state(); serializes against ttwu() */
 static bool __freeze_task(struct task_struct *p)
 {
-	return task_cond_set_special_state(p, __can_freeze(p));
+	/* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */
+	return task_call_func(p, __set_task_frozen, NULL);
 }
 
 /**
@@ -169,7 +170,7 @@ bool freeze_task(struct task_struct *p)
  * reflects that and the below will refuse to restore the special state and
  * instead issue the wakeup.
  */
-static inline unsigned int __thaw_special(struct task_struct *p)
+static int __set_task_special(struct task_struct *p, void *arg)
 {
 	unsigned int state = 0;
 
@@ -188,6 +189,9 @@ static inline unsigned int __thaw_specia
 		state = TASK_STOPPED;
 	}
 
+	if (state)
+		WRITE_ONCE(p->__state, state);
+
 	return state;
 }
 
@@ -200,7 +204,8 @@ void __thaw_task(struct task_struct *p)
 		goto unlock;
 
 	if (lock_task_sighand(p, &flags2)) {
-		bool ret = task_cond_set_special_state(p, __thaw_special(p));
+		/* TASK_FROZEN -> TASK_{STOPPED,TRACED} */
+		bool ret = task_call_func(p, __set_task_special, NULL);
 		unlock_task_sighand(p, &flags2);
 		if (ret)
 			goto unlock;
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -193,6 +193,17 @@ static bool looks_like_a_spurious_pid(st
 	return true;
 }
 
+static int __set_task_traced(struct task_struct *task, void *arg)
+{
+	unsigned int *state = arg;
+
+	if (!(task->__state & __TASK_TRACED))
+		return 0;
+
+	WRITE_ONCE(task->__state, *state);
+	return *state;
+}
+
 /* Ensure that nothing can wake it up, even SIGKILL */
 static bool ptrace_freeze_traced(struct task_struct *task)
 {
@@ -205,10 +216,12 @@ static bool ptrace_freeze_traced(struct
 	spin_lock_irq(&task->sighand->siglock);
 	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
 	    !__fatal_signal_pending(task)) {
+		unsigned int state = __TASK_TRACED;
+
 		task->ptrace &= ~PT_STOPPED_MASK;
 		task->ptrace |= PT_STOPPED;
 		/* *TASK_TRACED -> __TASK_TRACED */
-		task_cond_set_special_state(task, !!(task->__state & __TASK_TRACED) * __TASK_TRACED);
+		task_call_func(task, __set_task_traced, &state);
 		ret = true;
 	}
 	spin_unlock_irq(&task->sighand->siglock);
@@ -233,9 +246,11 @@ static void ptrace_unfreeze_traced(struc
 			task->ptrace &= ~PT_STOPPED_MASK;
 			wake_up_state(task, __TASK_TRACED);
 		} else {
+			unsigned int state = TASK_TRACED;
+
 			task->ptrace |= PT_STOPPED_MASK;
 			/* *TASK_TRACED -> TASK_TRACED */
-			task_cond_set_special_state(task, !!(task->__state & __TASK_TRACED) * TASK_TRACED);
+			task_call_func(task, __set_task_traced, &state);
 		}
 	}
 	spin_unlock_irq(&task->sighand->siglock);
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3181,7 +3181,7 @@ int migrate_swap(struct task_struct *cur
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline __wti_match(struct task_struct *p, unsigned int match_state)
+static inline bool __wti_match(struct task_struct *p, unsigned int match_state)
 {
 	unsigned int state = READ_ONCE(p->__state);
 

^ permalink raw reply	[flat|nested] 12+ messages in thread

Thread overview: 12+ messages
2021-10-09 10:07 [PATCH v3 0/6] Freezer rewrite Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 1/6] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
2021-10-14  8:58   ` Will Deacon
2021-10-09 10:07 ` [PATCH v3 2/6] freezer,umh: Clean up freezer/initrd interaction Peter Zijlstra
2021-10-14  9:13   ` Will Deacon
2021-10-09 10:07 ` [PATCH v3 3/6] ptrace: Order and comment PT_flags Peter Zijlstra
2021-10-14  9:31   ` Will Deacon
2021-10-14 14:27     ` Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 4/6] ptrace: Track __TASK_TRACED state in p->ptrace Peter Zijlstra
2021-10-09 10:07 ` [PATCH v3 5/6] sched,ptrace: Avoid relying on __TASK_TRACED | __TASK_STOPPED Peter Zijlstra
2021-10-09 10:08 ` [PATCH v3 6/6] freezer,sched: Rewrite core freezer logic Peter Zijlstra
2021-10-18 13:36   ` Peter Zijlstra
