* [PATCH] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
@ 2022-03-02 21:04 Sebastian Andrzej Siewior
  2022-03-14  9:27 ` Sebastian Andrzej Siewior
  2022-03-14 18:54 ` Oleg Nesterov
  0 siblings, 2 replies; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-03-02 21:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Oleg Nesterov,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Vincent Guittot

As explained by Alexander Fyodorov <halcy@yandex.ru>:

|read_lock(&tasklist_lock) in ptrace_stop() is converted to sleeping
|lock on a PREEMPT_RT kernel, and it can remove __TASK_TRACED from
|task->state (by moving  it to task->saved_state). If parent does
|wait() on child followed by a sys_ptrace call, the following race can
|happen:
|
|- child sets __TASK_TRACED in ptrace_stop()
|- parent does wait() which eventually calls wait_task_stopped() and returns
|  child's pid
|- child blocks on read_lock(&tasklist_lock) in ptrace_stop() and moves
|  __TASK_TRACED flag to saved_state
|- parent calls sys_ptrace, which calls ptrace_check_attach() and
|  wait_task_inactive()

The patch is based on his initial patch where an additional check is
added in case the __TASK_TRACED moved to ->saved_state. The pi_lock is
taken in case the caller is interrupted between looking into ->state and
->saved_state.

[ Fix for ptrace_unfreeze_traced() by Oleg Nesterov ]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---

I redid the state matching part compared to what I had in my queue so it
hopefully looks better.

 include/linux/sched.h | 127 ++++++++++++++++++++++++++++++++++++++++--
 kernel/ptrace.c       |  25 +++++----
 kernel/sched/core.c   |   5 +-
 3 files changed, 140 insertions(+), 17 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75ba8aa60248b..73109ce7ce789 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -118,12 +118,8 @@ struct task_group;
 
 #define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
 
-#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-
 #define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
 
-#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
-
 /*
  * Special states are those that do not use the normal wait-loop pattern. See
  * the comment with set_special_state().
@@ -2006,6 +2002,129 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
 	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
 }
 
+#ifdef CONFIG_PREEMPT_RT
+
+static inline bool task_state_match_and(struct task_struct *tsk, long state)
+{
+	unsigned long flags;
+	bool match = false;
+
+	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
+	if (READ_ONCE(tsk->__state) & state)
+		match = true;
+	else if (tsk->saved_state & state)
+		match = true;
+	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
+	return match;
+}
+
+static inline bool __task_state_match_eq(struct task_struct *tsk, long state)
+{
+	bool match = false;
+
+	if (READ_ONCE(tsk->__state) == state)
+		match = true;
+	else if (tsk->saved_state == state)
+		match = true;
+	return match;
+}
+
+static inline bool task_state_match_eq(struct task_struct *tsk, long state)
+{
+	unsigned long flags;
+	bool match;
+
+	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
+	match = __task_state_match_eq(tsk, state);
+	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
+	return match;
+}
+
+static inline bool task_state_match_and_set(struct task_struct *tsk, long state,
+					    long new_state)
+{
+	unsigned long flags;
+	bool match = false;
+
+	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
+	if (READ_ONCE(tsk->__state) & state) {
+		WRITE_ONCE(tsk->__state, new_state);
+		match = true;
+	} else if (tsk->saved_state & state) {
+		tsk->__state = new_state;
+		match = true;
+	}
+	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
+	return match;
+}
+
+static inline bool task_state_match_eq_set(struct task_struct *tsk, long state,
+					   long new_state)
+{
+	unsigned long flags;
+	bool match = false;
+
+	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
+	if (READ_ONCE(tsk->__state) == state) {
+		WRITE_ONCE(tsk->__state, new_state);
+		match = true;
+	} else if (tsk->saved_state == state) {
+		tsk->saved_state = new_state;
+		match = true;
+	}
+	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
+	return match;
+}
+
+#else
+
+static inline bool task_state_match_and(struct task_struct *tsk, long state)
+{
+	return READ_ONCE(tsk->__state) & state;
+}
+
+static inline bool __task_state_match_eq(struct task_struct *tsk, long state)
+{
+	return READ_ONCE(tsk->__state) == state;
+}
+
+static inline bool task_state_match_eq(struct task_struct *tsk, long state)
+{
+	return __task_state_match_eq(tsk, state);
+}
+
+static inline bool task_state_match_and_set(struct task_struct *tsk, long state,
+					    long new_state)
+{
+	if (READ_ONCE(tsk->__state) & state) {
+		WRITE_ONCE(tsk->__state, new_state);
+		return true;
+	}
+	return false;
+}
+
+static inline bool task_state_match_eq_set(struct task_struct *tsk, long state,
+					   long new_state)
+{
+	if (READ_ONCE(tsk->__state) == state) {
+		WRITE_ONCE(tsk->__state, new_state);
+		return true;
+	}
+	return false;
+}
+
+#endif
+
+static inline bool task_is_traced(struct task_struct *tsk)
+{
+	return task_state_match_and(tsk, __TASK_TRACED);
+}
+
+static inline bool task_is_stopped_or_traced(struct task_struct *tsk)
+{
+	return task_state_match_and(tsk, __TASK_STOPPED | __TASK_TRACED);
+}
+
 /*
  * cond_resched() and cond_resched_lock(): latency reduction via
  * explicit rescheduling in places that are safe. The return
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eea265082e975..5ce0948c0c0a7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -195,10 +195,10 @@ static bool ptrace_freeze_traced(struct task_struct *task)
 		return ret;
 
 	spin_lock_irq(&task->sighand->siglock);
-	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
-	    !__fatal_signal_pending(task)) {
-		WRITE_ONCE(task->__state, __TASK_TRACED);
-		ret = true;
+	if (!looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) {
+
+		ret = task_state_match_and_set(task, __TASK_TRACED,
+					       __TASK_TRACED);
 	}
 	spin_unlock_irq(&task->sighand->siglock);
 
@@ -207,7 +207,10 @@ static bool ptrace_freeze_traced(struct task_struct *task)
 
 static void ptrace_unfreeze_traced(struct task_struct *task)
 {
-	if (READ_ONCE(task->__state) != __TASK_TRACED)
+	bool frozen;
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
+	    READ_ONCE(task->__state) != __TASK_TRACED)
 		return;
 
 	WARN_ON(!task->ptrace || task->parent != current);
@@ -217,12 +220,12 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
 	 * Recheck state under the lock to close this race.
 	 */
 	spin_lock_irq(&task->sighand->siglock);
-	if (READ_ONCE(task->__state) == __TASK_TRACED) {
-		if (__fatal_signal_pending(task))
-			wake_up_state(task, __TASK_TRACED);
-		else
-			WRITE_ONCE(task->__state, TASK_TRACED);
-	}
+
+	frozen = task_state_match_eq_set(task, __TASK_TRACED, TASK_TRACED);
+
+	if (frozen && __fatal_signal_pending(task))
+		wake_up_state(task, __TASK_TRACED);
+
 	spin_unlock_irq(&task->sighand->siglock);
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9745613d531ce..a44414946de3d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3239,7 +3239,8 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
 		 * is actually now running somewhere else!
 		 */
 		while (task_running(rq, p)) {
-			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+			if (match_state &&
+			    unlikely(!task_state_match_eq(p, match_state)))
 				return 0;
 			cpu_relax();
 		}
@@ -3254,7 +3255,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
 		running = task_running(rq, p);
 		queued = task_on_rq_queued(p);
 		ncsw = 0;
-		if (!match_state || READ_ONCE(p->__state) == match_state)
+		if (!match_state || __task_state_match_eq(p, match_state))
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &rf);
 
-- 
2.35.1



* Re: [PATCH] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-03-02 21:04 [PATCH] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT Sebastian Andrzej Siewior
@ 2022-03-14  9:27 ` Sebastian Andrzej Siewior
  2022-03-14 18:54 ` Oleg Nesterov
  1 sibling, 0 replies; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-03-14  9:27 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Oleg Nesterov,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Vincent Guittot

On 2022-03-02 22:04:25 [+0100], To linux-kernel@vger.kernel.org wrote:
> As explained by Alexander Fyodorov <halcy@yandex.ru>:
> 
> |read_lock(&tasklist_lock) in ptrace_stop() is converted to sleeping
> |lock on a PREEMPT_RT kernel, and it can remove __TASK_TRACED from
> |task->state (by moving  it to task->saved_state). If parent does
> |wait() on child followed by a sys_ptrace call, the following race can
> |happen:
> |
> |- child sets __TASK_TRACED in ptrace_stop()
> |- parent does wait() which eventually calls wait_task_stopped() and returns
> |  child's pid
> |- child blocks on read_lock(&tasklist_lock) in ptrace_stop() and moves
> |  __TASK_TRACED flag to saved_state
> |- parent calls sys_ptrace, which calls ptrace_check_attach() and
> |  wait_task_inactive()
> 
> The patch is based on his initial patch where an additional check is
> added in case the __TASK_TRACED moved to ->saved_state. The pi_lock is
> taken in case the caller is interrupted between looking into ->state and
> ->saved_state.
> 
> [ Fix for ptrace_unfreeze_traced() by Oleg Nesterov ]
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

ping.

> ---
> 
> I redid the state matching part compared to what I had in my queue so it
> hopefully looks better.
> 
>  include/linux/sched.h | 127 ++++++++++++++++++++++++++++++++++++++++--
>  kernel/ptrace.c       |  25 +++++----
>  kernel/sched/core.c   |   5 +-
>  3 files changed, 140 insertions(+), 17 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 75ba8aa60248b..73109ce7ce789 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -118,12 +118,8 @@ struct task_group;
>  
>  #define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
>  
> -#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
> -
>  #define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
>  
> -#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
> -
>  /*
>   * Special states are those that do not use the normal wait-loop pattern. See
>   * the comment with set_special_state().
> @@ -2006,6 +2002,129 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
>  	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
>  }
>  
> +#ifdef CONFIG_PREEMPT_RT
> +
> +static inline bool task_state_match_and(struct task_struct *tsk, long state)
> +{
> +	unsigned long flags;
> +	bool match = false;
> +
> +	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
> +	if (READ_ONCE(tsk->__state) & state)
> +		match = true;
> +	else if (tsk->saved_state & state)
> +		match = true;
> +	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
> +	return match;
> +}
> +
> +static inline bool __task_state_match_eq(struct task_struct *tsk, long state)
> +{
> +	bool match = false;
> +
> +	if (READ_ONCE(tsk->__state) == state)
> +		match = true;
> +	else if (tsk->saved_state == state)
> +		match = true;
> +	return match;
> +}
> +
> +static inline bool task_state_match_eq(struct task_struct *tsk, long state)
> +{
> +	unsigned long flags;
> +	bool match;
> +
> +	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
> +	match = __task_state_match_eq(tsk, state);
> +	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
> +	return match;
> +}
> +
> +static inline bool task_state_match_and_set(struct task_struct *tsk, long state,
> +					    long new_state)
> +{
> +	unsigned long flags;
> +	bool match = false;
> +
> +	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
> +	if (READ_ONCE(tsk->__state) & state) {
> +		WRITE_ONCE(tsk->__state, new_state);
> +		match = true;
> +	} else if (tsk->saved_state & state) {
> +		tsk->__state = new_state;
> +		match = true;
> +	}
> +	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
> +	return match;
> +}
> +
> +static inline bool task_state_match_eq_set(struct task_struct *tsk, long state,
> +					   long new_state)
> +{
> +	unsigned long flags;
> +	bool match = false;
> +
> +	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
> +	if (READ_ONCE(tsk->__state) == state) {
> +		WRITE_ONCE(tsk->__state, new_state);
> +		match = true;
> +	} else if (tsk->saved_state == state) {
> +		tsk->saved_state = new_state;
> +		match = true;
> +	}
> +	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
> +	return match;
> +}
> +
> +#else
> +
> +static inline bool task_state_match_and(struct task_struct *tsk, long state)
> +{
> +	return READ_ONCE(tsk->__state) & state;
> +}
> +
> +static inline bool __task_state_match_eq(struct task_struct *tsk, long state)
> +{
> +	return READ_ONCE(tsk->__state) == state;
> +}
> +
> +static inline bool task_state_match_eq(struct task_struct *tsk, long state)
> +{
> +	return __task_state_match_eq(tsk, state);
> +}
> +
> +static inline bool task_state_match_and_set(struct task_struct *tsk, long state,
> +					    long new_state)
> +{
> +	if (READ_ONCE(tsk->__state) & state) {
> +		WRITE_ONCE(tsk->__state, new_state);
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static inline bool task_state_match_eq_set(struct task_struct *tsk, long state,
> +					   long new_state)
> +{
> +	if (READ_ONCE(tsk->__state) == state) {
> +		WRITE_ONCE(tsk->__state, new_state);
> +		return true;
> +	}
> +	return false;
> +}
> +
> +#endif
> +
> +static inline bool task_is_traced(struct task_struct *tsk)
> +{
> +	return task_state_match_and(tsk, __TASK_TRACED);
> +}
> +
> +static inline bool task_is_stopped_or_traced(struct task_struct *tsk)
> +{
> +	return task_state_match_and(tsk, __TASK_STOPPED | __TASK_TRACED);
> +}
> +
>  /*
>   * cond_resched() and cond_resched_lock(): latency reduction via
>   * explicit rescheduling in places that are safe. The return
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index eea265082e975..5ce0948c0c0a7 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -195,10 +195,10 @@ static bool ptrace_freeze_traced(struct task_struct *task)
>  		return ret;
>  
>  	spin_lock_irq(&task->sighand->siglock);
> -	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
> -	    !__fatal_signal_pending(task)) {
> -		WRITE_ONCE(task->__state, __TASK_TRACED);
> -		ret = true;
> +	if (!looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) {
> +
> +		ret = task_state_match_and_set(task, __TASK_TRACED,
> +					       __TASK_TRACED);
>  	}
>  	spin_unlock_irq(&task->sighand->siglock);
>  
> @@ -207,7 +207,10 @@ static bool ptrace_freeze_traced(struct task_struct *task)
>  
>  static void ptrace_unfreeze_traced(struct task_struct *task)
>  {
> -	if (READ_ONCE(task->__state) != __TASK_TRACED)
> +	bool frozen;
> +
> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
> +	    READ_ONCE(task->__state) != __TASK_TRACED)
>  		return;
>  
>  	WARN_ON(!task->ptrace || task->parent != current);
> @@ -217,12 +220,12 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
>  	 * Recheck state under the lock to close this race.
>  	 */
>  	spin_lock_irq(&task->sighand->siglock);
> -	if (READ_ONCE(task->__state) == __TASK_TRACED) {
> -		if (__fatal_signal_pending(task))
> -			wake_up_state(task, __TASK_TRACED);
> -		else
> -			WRITE_ONCE(task->__state, TASK_TRACED);
> -	}
> +
> +	frozen = task_state_match_eq_set(task, __TASK_TRACED, TASK_TRACED);
> +
> +	if (frozen && __fatal_signal_pending(task))
> +		wake_up_state(task, __TASK_TRACED);
> +
>  	spin_unlock_irq(&task->sighand->siglock);
>  }
>  
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9745613d531ce..a44414946de3d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3239,7 +3239,8 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
>  		 * is actually now running somewhere else!
>  		 */
>  		while (task_running(rq, p)) {
> -			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
> +			if (match_state &&
> +			    unlikely(!task_state_match_eq(p, match_state)))
>  				return 0;
>  			cpu_relax();
>  		}
> @@ -3254,7 +3255,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
>  		running = task_running(rq, p);
>  		queued = task_on_rq_queued(p);
>  		ncsw = 0;
> -		if (!match_state || READ_ONCE(p->__state) == match_state)
> +		if (!match_state || __task_state_match_eq(p, match_state))
>  			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
>  		task_rq_unlock(rq, p, &rf);
>  
> -- 
> 2.35.1
> 


* Re: [PATCH] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-03-02 21:04 [PATCH] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT Sebastian Andrzej Siewior
  2022-03-14  9:27 ` Sebastian Andrzej Siewior
@ 2022-03-14 18:54 ` Oleg Nesterov
  2022-03-15  8:31   ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 22+ messages in thread
From: Oleg Nesterov @ 2022-03-14 18:54 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-kernel, Ben Segall, Daniel Bristot de Oliveira,
	Dietmar Eggemann, Ingo Molnar, Juri Lelli, Mel Gorman,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Vincent Guittot

I never really understood ->saved_state logic. Will read this patch
tomorrow, but at first glance this patch doesn't solve all problems.

On 03/02, Sebastian Andrzej Siewior wrote:
>
> +static inline bool __task_state_match_eq(struct task_struct *tsk, long state)
> +{
> +	bool match = false;
> +
> +	if (READ_ONCE(tsk->__state) == state)
> +		match = true;
> +	else if (tsk->saved_state == state)
> +		match = true;
> +	return match;
> +}

...

> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3239,7 +3239,8 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
>  		 * is actually now running somewhere else!
>  		 */
>  		while (task_running(rq, p)) {
> -			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
> +			if (match_state &&
> +			    unlikely(!task_state_match_eq(p, match_state)))
>  				return 0;

So wait_task_inactive() can return 0 but the task can run after that, right?
This is not what we want...

Oleg.



* Re: [PATCH] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-03-14 18:54 ` Oleg Nesterov
@ 2022-03-15  8:31   ` Sebastian Andrzej Siewior
  2022-03-15 14:29     ` Oleg Nesterov
  0 siblings, 1 reply; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-03-15  8:31 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-kernel, Ben Segall, Daniel Bristot de Oliveira,
	Dietmar Eggemann, Ingo Molnar, Juri Lelli, Mel Gorman,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Vincent Guittot

On 2022-03-14 19:54:30 [+0100], Oleg Nesterov wrote:
> I never really understood ->saved_state logic. Will read this patch
> tomorrow, but at first glance this patch doesn't solve all problems.

Let me explain the ->saved_state logic:
On !RT, this

   set_current_state(TASK_UNINTERRUPTIBLE); 	// 1
   spin_lock(&lock);				// 2
   spin_unlock(&lock);				// 3
   schedule();					// 4

will assign ->state, spin on &lock and then invoke schedule() while
->state is still TASK_UNINTERRUPTIBLE.
On RT however, the spinlock_t becomes a sleeping lock and the task won't
spin on &lock but rather sleep while waiting for it. While sleeping on the
lock, ->state needs to be preserved, otherwise it gets lost on the wake-up
once the lock has been acquired.

That means on RT the following happens:
- 1 assigns ->state as with !RT
- 2 acquires &lock. If the lock is contended then
    with current->pi_lock acquired
    (current_save_and_set_rtlock_wait_state):
       ->saved_state = ->state (TASK_UNINTERRUPTIBLE)
       ->state = TASK_RTLOCK_WAIT
   and the task sleeps until &lock is available.
   Once the lock is acquired, the task will be woken up and its state is
   updated with ->pi_lock acquired (current_restore_rtlock_saved_state):
      ->state = ->saved_state (TASK_UNINTERRUPTIBLE)
      ->saved_state = TASK_RUNNING

- 3 unlocks &lock, ->state still TASK_UNINTERRUPTIBLE
- 4 invokes schedule with TASK_UNINTERRUPTIBLE.

The sleeping locks on RT are spinlock_t and rwlock_t.

Side note: On !RT, if the task at step 2 spins on the lock then it may
receive a wake up, at which point TASK_UNINTERRUPTIBLE becomes TASK_RUNNING,
and it would then invoke schedule() with TASK_RUNNING (assuming the
condition becomes available sooner).
On RT, this also works and the task at step 2 may sleep or be in
transition to/from sleep. Therefore the wake up (under ->pi_lock)
looks at ->state and if it is TASK_RTLOCK_WAIT then it updates
saved_state instead (ttwu_state_match()).
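
To make that concrete, the two helpers boil down to roughly this (a
simplified sketch; the real macros in include/linux/sched.h also carry
lockdep/debug annotations and expect interrupts to be disabled):

   /* current_save_and_set_rtlock_wait_state(), simplified sketch */
   raw_spin_lock(&current->pi_lock);
   current->saved_state = current->__state;	/* e.g. TASK_UNINTERRUPTIBLE */
   WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);
   raw_spin_unlock(&current->pi_lock);

   /* current_restore_rtlock_saved_state(), simplified sketch */
   raw_spin_lock(&current->pi_lock);
   WRITE_ONCE(current->__state, current->saved_state);
   current->saved_state = TASK_RUNNING;
   raw_spin_unlock(&current->pi_lock);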

> On 03/02, Sebastian Andrzej Siewior wrote:
> >
> > +static inline bool __task_state_match_eq(struct task_struct *tsk, long state)
> > +{
> > +	bool match = false;
> > +
> > +	if (READ_ONCE(tsk->__state) == state)
> > +		match = true;
> > +	else if (tsk->saved_state == state)
> > +		match = true;
> > +	return match;
> > +}
> 
> ...
> 
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3239,7 +3239,8 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
> >  		 * is actually now running somewhere else!
> >  		 */
> >  		while (task_running(rq, p)) {
> > -			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
> > +			if (match_state &&
> > +			    unlikely(!task_state_match_eq(p, match_state)))
> >  				return 0;
> 
> So wait_task_inactive() can return 0 but the task can run after that, right?
> This is not what we want...

Without checking both states you may never observe the requested state
because the state is set to TASK_RTLOCK_WAIT while the task is waiting for
a lock. Other than that, the task may run briefly because it is trying to
acquire a lock or has just acquired it, and this shouldn't be different
from a task spinning on a lock.
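
To illustrate, while the tracee is blocked on the sleeping
read_lock(&tasklist_lock) in ptrace_stop(), the picture is roughly (a
sketch, not actual code):

   p->__state     == TASK_RTLOCK_WAIT;   /* substituted by the rtlock code */
   p->saved_state == __TASK_TRACED;      /* what the debugger looks for */

   /* hence a check of ->__state alone, as wait_task_inactive() does, */
   if (READ_ONCE(p->__state) != match_state)	/* match_state == __TASK_TRACED */
           return 0;
   /* keeps returning 0 on PREEMPT_RT even though the tracee is stopped. */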

> Oleg.

Sebastian


* Re: [PATCH] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-03-15  8:31   ` Sebastian Andrzej Siewior
@ 2022-03-15 14:29     ` Oleg Nesterov
  2022-03-16  8:23       ` Sebastian Andrzej Siewior
  2022-03-31 14:25       ` [PATCH v2] " Sebastian Andrzej Siewior
  0 siblings, 2 replies; 22+ messages in thread
From: Oleg Nesterov @ 2022-03-15 14:29 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-kernel, Ben Segall, Daniel Bristot de Oliveira,
	Dietmar Eggemann, Ingo Molnar, Juri Lelli, Mel Gorman,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Vincent Guittot

On 03/15, Sebastian Andrzej Siewior wrote:
>
> On 2022-03-14 19:54:30 [+0100], Oleg Nesterov wrote:
> > I never really understood ->saved_state logic. Will read this patch
> > tomorrow, but at first glance this patch doesn't solve all problems.
>
> Let me explain the ->saved_state logic:

Ah, thanks, but this is clear.

> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -3239,7 +3239,8 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
> > >  		 * is actually now running somewhere else!
> > >  		 */
> > >  		while (task_running(rq, p)) {
> > > -			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
> > > +			if (match_state &&
> > > +			    unlikely(!task_state_match_eq(p, match_state)))
> > >  				return 0;
> >
> > So wait_task_inactive() can return 0 but the task can run after that, right?
> > This is not what we want...
>
> Without checking both states you may never observe the requested state
> because the state is set to TASK_RTLOCK_WAIT while the task is waiting for
> a lock. Other than that, the task may run briefly because it is trying to
> acquire a lock or has just acquired it, and this shouldn't be different
> from a task spinning on a lock.

I don't understand. wait_task_inactive() is used to ensure that this task
doesn't and can't run again, until the debugger resumes the tracee.

Now. Unless I missed something, the tracee can leave CPU with saved_state
= TRACED (so task_state_match_eq() returns T) and wait_task_inactive() will
return. Then later the tracee will park in schedule again, yes.

But, for example, what if debugger clears TIF_BLOCKSTEP in between, while
the tracee is running? Can't this race with __switch_to_xtra() ?

Oleg.



* Re: [PATCH] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-03-15 14:29     ` Oleg Nesterov
@ 2022-03-16  8:23       ` Sebastian Andrzej Siewior
  2022-03-31 14:25       ` [PATCH v2] " Sebastian Andrzej Siewior
  1 sibling, 0 replies; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-03-16  8:23 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-kernel, Ben Segall, Daniel Bristot de Oliveira,
	Dietmar Eggemann, Ingo Molnar, Juri Lelli, Mel Gorman,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Vincent Guittot

On 2022-03-15 15:29:46 [+0100], Oleg Nesterov wrote:
> > > > --- a/kernel/sched/core.c
> > > > +++ b/kernel/sched/core.c
> > > > @@ -3239,7 +3239,8 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
> > > >  		 * is actually now running somewhere else!
> > > >  		 */
> > > >  		while (task_running(rq, p)) {
> > > > -			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
> > > > +			if (match_state &&
> > > > +			    unlikely(!task_state_match_eq(p, match_state)))
> > > >  				return 0;
> > >
> > > So wait_task_inactive() can return 0 but the task can run after that, right?
> > > This is not what we want...
> >
> > Without checking both states you may never observe the requested state
> > because the state is set to TASK_RTLOCK_WAIT while the task is waiting for
> > a lock. Other than that, the task may run briefly because it is trying to
> > acquire a lock or has just acquired it, and this shouldn't be different
> > from a task spinning on a lock.
> 
> I don't understand. wait_task_inactive() is used to ensure that this task
> doesn't and can't run again, until the debugger resumes the tracee.
> 
> Now. Unless I missed something, the tracee can leave CPU with saved_state
> = TRACED (so task_state_match_eq() returns T) and wait_task_inactive() will
> return. Then later the tracee will park in schedule again, yes.
> 
> But, for example, what if debugger clears TIF_BLOCKSTEP in between, while
> the tracee is running? Can't this race with __switch_to_xtra() ?

If you describe it like that, then it appears better to only look at
->state. Otherwise, yes, you would see the expected state in
->saved_state while the task might still be on the CPU. Even if it is not
actually running / on the runqueue right now, that could change shortly
after once the lock becomes available.

> Oleg.

Sebastian


* [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-03-15 14:29     ` Oleg Nesterov
  2022-03-16  8:23       ` Sebastian Andrzej Siewior
@ 2022-03-31 14:25       ` Sebastian Andrzej Siewior
  2022-04-04 16:13         ` Oleg Nesterov
  2022-04-05 10:10         ` Peter Zijlstra
  1 sibling, 2 replies; 22+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-03-31 14:25 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-kernel, Ben Segall, Daniel Bristot de Oliveira,
	Dietmar Eggemann, Ingo Molnar, Juri Lelli, Mel Gorman,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Vincent Guittot

As explained by Alexander Fyodorov <halcy@yandex.ru>:

|read_lock(&tasklist_lock) in ptrace_stop() is converted to sleeping
|lock on a PREEMPT_RT kernel, and it can remove __TASK_TRACED from
|task->__state (by moving  it to task->saved_state). If parent does
|wait() on child followed by a sys_ptrace call, the following race can
|happen:
|
|- child sets __TASK_TRACED in ptrace_stop()
|- parent does wait() which eventually calls wait_task_stopped() and returns
|  child's pid
|- child blocks on read_lock(&tasklist_lock) in ptrace_stop() and moves
|  __TASK_TRACED flag to saved_state
|- parent calls sys_ptrace, which calls ptrace_check_attach() and
|  wait_task_inactive()

The patch is based on his initial patch where an additional check is
added in case the __TASK_TRACED moved to ->saved_state. The pi_lock is
acquired to have stable view on ->__state and ->saved_state.

wait_task_inactive() needs to check both task states while waiting for the
expected task state. Should the expected task state be in ->saved_state then
the task is blocked on a sleeping lock. In this case wait_task_inactive() needs
to wait until the lock situation has been resolved (the expected state is in
->__state). This ensures that the task is idle, does not wake up as part of
lock resolving and does not race, for instance, with __switch_to_xtra() while
the debugger clears TIF_BLOCKSTEP (noted by Oleg Nesterov).

[ Fix for ptrace_unfreeze_traced() by Oleg Nesterov ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
v1…v2:
  - Use also ->saved_state in task_state_match_and_set().
  - Wait in wait_task_inactive() until the desired task state is in
    ->__state so that the task won't wake up as part of lock
    resolving. Pointed out by Oleg Nesterov.

 include/linux/sched.h |  128 ++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/ptrace.c       |   25 +++++----
 kernel/sched/core.c   |   11 +++-
 3 files changed, 146 insertions(+), 18 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -118,12 +118,8 @@ struct task_group;
 
 #define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
 
-#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-
 #define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
 
-#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
-
 /*
  * Special states are those that do not use the normal wait-loop pattern. See
  * the comment with set_special_state().
@@ -2009,6 +2005,130 @@ static inline int test_tsk_need_resched(
 	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
 }
 
+#ifdef CONFIG_PREEMPT_RT
+
+static inline bool task_state_match_and(struct task_struct *tsk, long state)
+{
+	unsigned long flags;
+	bool match = false;
+
+	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
+	if (READ_ONCE(tsk->__state) & state)
+		match = true;
+	else if (tsk->saved_state & state)
+		match = true;
+	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
+	return match;
+}
+
+static inline int __task_state_match_eq(struct task_struct *tsk, long state)
+{
+	int match = 0;
+
+	if (READ_ONCE(tsk->__state) == state)
+		match = 1;
+	else if (tsk->saved_state == state)
+		match = -1;
+
+	return match;
+}
+
+static inline int task_state_match_eq(struct task_struct *tsk, long state)
+{
+	unsigned long flags;
+	int match;
+
+	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
+	match = __task_state_match_eq(tsk, state);
+	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
+	return match;
+}
+
+static inline bool task_state_match_and_set(struct task_struct *tsk, long state,
+					    long new_state)
+{
+	unsigned long flags;
+	bool match = false;
+
+	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
+	if (READ_ONCE(tsk->__state) & state) {
+		WRITE_ONCE(tsk->__state, new_state);
+		match = true;
+	} else if (tsk->saved_state & state) {
+		tsk->saved_state = new_state;
+		match = true;
+	}
+	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
+	return match;
+}
+
+static inline bool task_state_match_eq_set(struct task_struct *tsk, long state,
+					   long new_state)
+{
+	unsigned long flags;
+	bool match = false;
+
+	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
+	if (READ_ONCE(tsk->__state) == state) {
+		WRITE_ONCE(tsk->__state, new_state);
+		match = true;
+	} else if (tsk->saved_state == state) {
+		tsk->saved_state = new_state;
+		match = true;
+	}
+	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
+	return match;
+}
+
+#else
+
+static inline bool task_state_match_and(struct task_struct *tsk, long state)
+{
+	return READ_ONCE(tsk->__state) & state;
+}
+
+static inline int __task_state_match_eq(struct task_struct *tsk, long state)
+{
+	return READ_ONCE(tsk->__state) == state;
+}
+
+static inline int task_state_match_eq(struct task_struct *tsk, long state)
+{
+	return __task_state_match_eq(tsk, state);
+}
+
+static inline bool task_state_match_and_set(struct task_struct *tsk, long state,
+					    long new_state)
+{
+	if (READ_ONCE(tsk->__state) & state) {
+		WRITE_ONCE(tsk->__state, new_state);
+		return true;
+	}
+	return false;
+}
+
+static inline bool task_state_match_eq_set(struct task_struct *tsk, long state,
+					   long new_state)
+{
+	if (READ_ONCE(tsk->__state) == state) {
+		WRITE_ONCE(tsk->__state, new_state);
+		return true;
+	}
+	return false;
+}
+
+#endif
+
+static inline bool task_is_traced(struct task_struct *tsk)
+{
+	return task_state_match_and(tsk, __TASK_TRACED);
+}
+
+static inline bool task_is_stopped_or_traced(struct task_struct *tsk)
+{
+	return task_state_match_and(tsk, __TASK_STOPPED | __TASK_TRACED);
+}
+
 /*
  * cond_resched() and cond_resched_lock(): latency reduction via
  * explicit rescheduling in places that are safe. The return
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -195,10 +195,10 @@ static bool ptrace_freeze_traced(struct
 		return ret;
 
 	spin_lock_irq(&task->sighand->siglock);
-	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
-	    !__fatal_signal_pending(task)) {
-		WRITE_ONCE(task->__state, __TASK_TRACED);
-		ret = true;
+	if (!looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) {
+
+		ret = task_state_match_and_set(task, __TASK_TRACED,
+					       __TASK_TRACED);
 	}
 	spin_unlock_irq(&task->sighand->siglock);
 
@@ -207,7 +207,10 @@ static bool ptrace_freeze_traced(struct
 
 static void ptrace_unfreeze_traced(struct task_struct *task)
 {
-	if (READ_ONCE(task->__state) != __TASK_TRACED)
+	bool frozen;
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
+	    READ_ONCE(task->__state) != __TASK_TRACED)
 		return;
 
 	WARN_ON(!task->ptrace || task->parent != current);
@@ -217,12 +220,12 @@ static void ptrace_unfreeze_traced(struc
 	 * Recheck state under the lock to close this race.
 	 */
 	spin_lock_irq(&task->sighand->siglock);
-	if (READ_ONCE(task->__state) == __TASK_TRACED) {
-		if (__fatal_signal_pending(task))
-			wake_up_state(task, __TASK_TRACED);
-		else
-			WRITE_ONCE(task->__state, TASK_TRACED);
-	}
+
+	frozen = task_state_match_eq_set(task, __TASK_TRACED, TASK_TRACED);
+
+	if (frozen && __fatal_signal_pending(task))
+		wake_up_state(task, __TASK_TRACED);
+
 	spin_unlock_irq(&task->sighand->siglock);
 }
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3219,6 +3219,8 @@ unsigned long wait_task_inactive(struct
 	struct rq *rq;
 
 	for (;;) {
+		int match_type = 0;
+
 		/*
 		 * We do the initial early heuristics without holding
 		 * any task-queue locks at all. We'll only try to get
@@ -3239,7 +3241,8 @@ unsigned long wait_task_inactive(struct
 		 * is actually now running somewhere else!
 		 */
 		while (task_running(rq, p)) {
-			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+			if (match_state &&
+			    unlikely(!task_state_match_eq(p, match_state)))
 				return 0;
 			cpu_relax();
 		}
@@ -3254,7 +3257,9 @@ unsigned long wait_task_inactive(struct
 		running = task_running(rq, p);
 		queued = task_on_rq_queued(p);
 		ncsw = 0;
-		if (!match_state || READ_ONCE(p->__state) == match_state)
+		if (match_state)
+			match_type = __task_state_match_eq(p, match_state);
+		if (!match_state || match_type)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &rf);
 
@@ -3284,7 +3289,7 @@ unsigned long wait_task_inactive(struct
 		 * running right now), it's preempted, and we should
 		 * yield - it could be a while.
 		 */
-		if (unlikely(queued)) {
+		if (unlikely(queued || match_type < 0)) {
 			ktime_t to = NSEC_PER_SEC / HZ;
 
 			set_current_state(TASK_UNINTERRUPTIBLE);


* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-03-31 14:25       ` [PATCH v2] " Sebastian Andrzej Siewior
@ 2022-04-04 16:13         ` Oleg Nesterov
  2022-04-05  8:34           ` Oleg Nesterov
  2022-04-05 10:10         ` Peter Zijlstra
  1 sibling, 1 reply; 22+ messages in thread
From: Oleg Nesterov @ 2022-04-04 16:13 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-kernel, Ben Segall, Daniel Bristot de Oliveira,
	Dietmar Eggemann, Ingo Molnar, Juri Lelli, Mel Gorman,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Vincent Guittot

Cough. Somehow I can hardly understand v2. For example, if we fix
wait_task_inactive() correctly, then why should ptrace_freeze_traced()
take saved_state into account? And how can unfreeze_traced
hit saved_state == __TASK_TRACED ?

I will read this patch tomorrow again, I have a fever today, quite
possibly I missed something...

But in any case, I think you need to update the changelog. Yes sure,
ptrace/wait_task_inactive doesn't play well with CONFIG_PREEMPT_RT,
but exactly which problem does this patch try to solve? And how?

And btw... What does the "Fix for ptrace_unfreeze_traced() by Oleg
Nesterov" below mean? This all looks as if there is something else
I am not aware of.

On 03/31, Sebastian Andrzej Siewior wrote:
>
> As explained by Alexander Fyodorov <halcy@yandex.ru>:
>
> |read_lock(&tasklist_lock) in ptrace_stop() is converted to sleeping
> |lock on a PREEMPT_RT kernel, and it can remove __TASK_TRACED from
> |task->__state (by moving  it to task->saved_state). If parent does
> |wait() on child followed by a sys_ptrace call, the following race can
> |happen:
> |
> |- child sets __TASK_TRACED in ptrace_stop()
> |- parent does wait() which eventually calls wait_task_stopped() and returns
> |  child's pid
> |- child blocks on read_lock(&tasklist_lock) in ptrace_stop() and moves
> |  __TASK_TRACED flag to saved_state
> |- parent calls sys_ptrace, which calls ptrace_check_attach() and
> |  wait_task_inactive()
>
> The patch is based on his initial patch where an additional check is
> added in case the __TASK_TRACED moved to ->saved_state. The pi_lock is
> acquired to have stable view on ->__state and ->saved_state.
>
> wait_task_inactive() needs to check both task states while waiting for the
> expected task state. Should the expected task state be in ->saved_state then
> the task is blocked on a sleeping lock. In this case wait_task_inactive() needs
> to wait until the lock situation has been resolved (the expected state is in
> ->__state). This ensures that the task is idle, does not wake up as part of
> lock resolving and does not race, for instance, with __switch_to_xtra() while
> the debugger clears TIF_BLOCKSTEP (noted by Oleg Nesterov).
>
> [ Fix for ptrace_unfreeze_traced() by Oleg Nesterov ]
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
> v1…v2:
>   - Use also ->saved_state in task_state_match_and_set().
>   - Wait in wait_task_inactive() until the desired task state is in
>     ->__state so that the task won't wake up as part of lock
>     resolving. Pointed out by Oleg Nesterov.
>
>  include/linux/sched.h |  128 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/ptrace.c       |   25 +++++----
>  kernel/sched/core.c   |   11 +++-
>  3 files changed, 146 insertions(+), 18 deletions(-)
>
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -118,12 +118,8 @@ struct task_group;
>
>  #define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
>
> -#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
> -
>  #define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
>
> -#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
> -
>  /*
>   * Special states are those that do not use the normal wait-loop pattern. See
>   * the comment with set_special_state().
> @@ -2009,6 +2005,130 @@ static inline int test_tsk_need_resched(
>  	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
>  }
>
> +#ifdef CONFIG_PREEMPT_RT
> +
> +static inline bool task_state_match_and(struct task_struct *tsk, long state)
> +{
> +	unsigned long flags;
> +	bool match = false;
> +
> +	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
> +	if (READ_ONCE(tsk->__state) & state)
> +		match = true;
> +	else if (tsk->saved_state & state)
> +		match = true;
> +	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
> +	return match;
> +}
> +
> +static inline int __task_state_match_eq(struct task_struct *tsk, long state)
> +{
> +	int match = 0;
> +
> +	if (READ_ONCE(tsk->__state) == state)
> +		match = 1;
> +	else if (tsk->saved_state == state)
> +		match = -1;
> +
> +	return match;
> +}
> +
> +static inline int task_state_match_eq(struct task_struct *tsk, long state)
> +{
> +	unsigned long flags;
> +	int match;
> +
> +	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
> +	match = __task_state_match_eq(tsk, state);
> +	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
> +	return match;
> +}
> +
> +static inline bool task_state_match_and_set(struct task_struct *tsk, long state,
> +					    long new_state)
> +{
> +	unsigned long flags;
> +	bool match = false;
> +
> +	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
> +	if (READ_ONCE(tsk->__state) & state) {
> +		WRITE_ONCE(tsk->__state, new_state);
> +		match = true;
> +	} else if (tsk->saved_state & state) {
> +		tsk->saved_state = new_state;
> +		match = true;
> +	}
> +	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
> +	return match;
> +}
> +
> +static inline bool task_state_match_eq_set(struct task_struct *tsk, long state,
> +					   long new_state)
> +{
> +	unsigned long flags;
> +	bool match = false;
> +
> +	raw_spin_lock_irqsave(&tsk->pi_lock, flags);
> +	if (READ_ONCE(tsk->__state) == state) {
> +		WRITE_ONCE(tsk->__state, new_state);
> +		match = true;
> +	} else if (tsk->saved_state == state) {
> +		tsk->saved_state = new_state;
> +		match = true;
> +	}
> +	raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
> +	return match;
> +}
> +
> +#else
> +
> +static inline bool task_state_match_and(struct task_struct *tsk, long state)
> +{
> +	return READ_ONCE(tsk->__state) & state;
> +}
> +
> +static inline int __task_state_match_eq(struct task_struct *tsk, long state)
> +{
> +	return READ_ONCE(tsk->__state) == state;
> +}
> +
> +static inline int task_state_match_eq(struct task_struct *tsk, long state)
> +{
> +	return __task_state_match_eq(tsk, state);
> +}
> +
> +static inline bool task_state_match_and_set(struct task_struct *tsk, long state,
> +					    long new_state)
> +{
> +	if (READ_ONCE(tsk->__state) & state) {
> +		WRITE_ONCE(tsk->__state, new_state);
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static inline bool task_state_match_eq_set(struct task_struct *tsk, long state,
> +					   long new_state)
> +{
> +	if (READ_ONCE(tsk->__state) == state) {
> +		WRITE_ONCE(tsk->__state, new_state);
> +		return true;
> +	}
> +	return false;
> +}
> +
> +#endif
> +
> +static inline bool task_is_traced(struct task_struct *tsk)
> +{
> +	return task_state_match_and(tsk, __TASK_TRACED);
> +}
> +
> +static inline bool task_is_stopped_or_traced(struct task_struct *tsk)
> +{
> +	return task_state_match_and(tsk, __TASK_STOPPED | __TASK_TRACED);
> +}
> +
>  /*
>   * cond_resched() and cond_resched_lock(): latency reduction via
>   * explicit rescheduling in places that are safe. The return
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -195,10 +195,10 @@ static bool ptrace_freeze_traced(struct
>  		return ret;
>
>  	spin_lock_irq(&task->sighand->siglock);
> -	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
> -	    !__fatal_signal_pending(task)) {
> -		WRITE_ONCE(task->__state, __TASK_TRACED);
> -		ret = true;
> +	if (!looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) {
> +
> +		ret = task_state_match_and_set(task, __TASK_TRACED,
> +					       __TASK_TRACED);
>  	}
>  	spin_unlock_irq(&task->sighand->siglock);
>
> @@ -207,7 +207,10 @@ static bool ptrace_freeze_traced(struct
>
>  static void ptrace_unfreeze_traced(struct task_struct *task)
>  {
> -	if (READ_ONCE(task->__state) != __TASK_TRACED)
> +	bool frozen;
> +
> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
> +	    READ_ONCE(task->__state) != __TASK_TRACED)
>  		return;
>
>  	WARN_ON(!task->ptrace || task->parent != current);
> @@ -217,12 +220,12 @@ static void ptrace_unfreeze_traced(struc
>  	 * Recheck state under the lock to close this race.
>  	 */
>  	spin_lock_irq(&task->sighand->siglock);
> -	if (READ_ONCE(task->__state) == __TASK_TRACED) {
> -		if (__fatal_signal_pending(task))
> -			wake_up_state(task, __TASK_TRACED);
> -		else
> -			WRITE_ONCE(task->__state, TASK_TRACED);
> -	}
> +
> +	frozen = task_state_match_eq_set(task, __TASK_TRACED, TASK_TRACED);
> +
> +	if (frozen && __fatal_signal_pending(task))
> +		wake_up_state(task, __TASK_TRACED);
> +
>  	spin_unlock_irq(&task->sighand->siglock);
>  }
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3219,6 +3219,8 @@ unsigned long wait_task_inactive(struct
>  	struct rq *rq;
>
>  	for (;;) {
> +		int match_type = 0;
> +
>  		/*
>  		 * We do the initial early heuristics without holding
>  		 * any task-queue locks at all. We'll only try to get
> @@ -3239,7 +3241,8 @@ unsigned long wait_task_inactive(struct
>  		 * is actually now running somewhere else!
>  		 */
>  		while (task_running(rq, p)) {
> -			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
> +			if (match_state &&
> +			    unlikely(!task_state_match_eq(p, match_state)))
>  				return 0;
>  			cpu_relax();
>  		}
> @@ -3254,7 +3257,9 @@ unsigned long wait_task_inactive(struct
>  		running = task_running(rq, p);
>  		queued = task_on_rq_queued(p);
>  		ncsw = 0;
> -		if (!match_state || READ_ONCE(p->__state) == match_state)
> +		if (match_state)
> +			match_type = __task_state_match_eq(p, match_state);
> +		if (!match_state || match_type)
>  			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
>  		task_rq_unlock(rq, p, &rf);
>
> @@ -3284,7 +3289,7 @@ unsigned long wait_task_inactive(struct
>  		 * running right now), it's preempted, and we should
>  		 * yield - it could be a while.
>  		 */
> -		if (unlikely(queued)) {
> +		if (unlikely(queued || match_type < 0)) {
>  			ktime_t to = NSEC_PER_SEC / HZ;
>
>  			set_current_state(TASK_UNINTERRUPTIBLE);



* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-04 16:13         ` Oleg Nesterov
@ 2022-04-05  8:34           ` Oleg Nesterov
  0 siblings, 0 replies; 22+ messages in thread
From: Oleg Nesterov @ 2022-04-05  8:34 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-kernel, Ben Segall, Daniel Bristot de Oliveira,
	Dietmar Eggemann, Ingo Molnar, Juri Lelli, Mel Gorman,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Vincent Guittot

On 04/04, Oleg Nesterov wrote:
>
> Cough. Somehow I can hardly understand v2. For example, if we fix
> wait_task_inactive() correctly, then why should ptrace_freeze_traced()
> take saved_state into account? And how can unfreeze_traced
> hit saved_state == __TASK_TRACED ?

OK, somehow I forgot that ptrace_freeze_traced() is called before
wait_task_inactive(), so it does need to check/change saved_state.

But still, ptrace_unfreeze_traced() can't see task->saved_state ==
__TASK_TRACED, right ?


Oleg.



* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-03-31 14:25       ` [PATCH v2] " Sebastian Andrzej Siewior
  2022-04-04 16:13         ` Oleg Nesterov
@ 2022-04-05 10:10         ` Peter Zijlstra
  2022-04-05 10:29           ` Oleg Nesterov
  1 sibling, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2022-04-05 10:10 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Oleg Nesterov, linux-kernel, Ben Segall,
	Daniel Bristot de Oliveira, Dietmar Eggemann, Ingo Molnar,
	Juri Lelli, Mel Gorman, Steven Rostedt, Thomas Gleixner,
	Vincent Guittot

On Thu, Mar 31, 2022 at 04:25:42PM +0200, Sebastian Andrzej Siewior wrote:
> As explained by Alexander Fyodorov <halcy@yandex.ru>:
> 
> |read_lock(&tasklist_lock) in ptrace_stop() is converted to sleeping
> |lock on a PREEMPT_RT kernel, and it can remove __TASK_TRACED from
> |task->__state (by moving  it to task->saved_state). If parent does
> |wait() on child followed by a sys_ptrace call, the following race can
> |happen:
> |
> |- child sets __TASK_TRACED in ptrace_stop()
> |- parent does wait() which eventually calls wait_task_stopped() and returns
> |  child's pid
> |- child blocks on read_lock(&tasklist_lock) in ptrace_stop() and moves
> |  __TASK_TRACED flag to saved_state
> |- parent calls sys_ptrace, which calls ptrace_check_attach() and
> |  wait_task_inactive()
> 
> The patch is based on his initial patch where an additional check is
> added in case the __TASK_TRACED moved to ->saved_state. The pi_lock is
> acquired to have stable view on ->__state and ->saved_state.
> 
> wait_task_inactive() needs to check both task states while waiting for the
> expected task state. Should the expected task state be in ->saved_state then
> the task is blocked on a sleeping lock. In this case wait_task_inactive() needs
> to wait until the lock situation has been resolved (the expected state is in
> ->__state). This ensures that the task is idle, does not wake up as part of
> lock resolving and does not race, for instance, with __switch_to_xtra() while
> the debugger clears TIF_BLOCKSTEP (noted by Oleg Nesterov).
> 
> [ Fix for ptrace_unfreeze_traced() by Oleg Nesterov ]
> 
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
> v1…v2:
>   - Use also ->saved_state in task_state_match_and_set().
>   - Wait in wait_task_inactive() until the desired task state is in
>     ->__state so that the task won't wake up as part of lock
>     resolving. Pointed out by Oleg Nesterov.
> 
>  include/linux/sched.h |  128 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/ptrace.c       |   25 +++++----
>  kernel/sched/core.c   |   11 +++-
>  3 files changed, 146 insertions(+), 18 deletions(-)
> 
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -118,12 +118,8 @@ struct task_group;
>  
>  #define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
>  
> -#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
> -
>  #define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
>  
> -#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
> -
>  /*
>   * Special states are those that do not use the normal wait-loop pattern. See
>   * the comment with set_special_state().

Urgh, so reworking all this is somewhere on my todo list as well.
Except I mean to move it away from using p->__state entirely. We should
not be keeping canonical state in there.

As is, I think we can write task_is_stopped() like:

#define task_is_stopped(task)	((task)->jobctl & JOBCTL_STOP_PENDING)

Because jobctl is in fact the canonical state. I'm still not sure if we
can do the same with task_is_traced(), ideally that would be expressed
in terms of (task)->ptrace. But ptrace_stop() hurts my brain. All that
stuff is entirely too involved.

Anyway, let me see if I can page some of that back..




* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-05 10:10         ` Peter Zijlstra
@ 2022-04-05 10:29           ` Oleg Nesterov
  2022-04-05 11:28             ` Peter Zijlstra
  0 siblings, 1 reply; 22+ messages in thread
From: Oleg Nesterov @ 2022-04-05 10:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Sebastian Andrzej Siewior, linux-kernel, Ben Segall,
	Daniel Bristot de Oliveira, Dietmar Eggemann, Ingo Molnar,
	Juri Lelli, Mel Gorman, Steven Rostedt, Thomas Gleixner,
	Vincent Guittot

On 04/05, Peter Zijlstra wrote:
>
> As is, I think we can write task_is_stopped() like:
>
> #define task_is_stopped(task)	((task)->jobctl & JOBCTL_STOP_PENDING)
>
> Because jobctl is in fact the canonical state.

Not really. JOBCTL_STOP_PENDING means that the task should stop.

And this flag is cleared right before set_special_state(TASK_STOPPED)
in do_signal_stop(), see task_participate_group_stop().
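
For reference, the ordering in do_signal_stop() is roughly (a simplified
sketch from memory, not the exact code):

	if (task_participate_group_stop(current))	/* clears JOBCTL_STOP_PENDING */
		notify = CLD_STOPPED;

	set_special_state(TASK_STOPPED);
	spin_unlock_irq(&current->sighand->siglock);
	...
	freezable_schedule();

So by the time the task actually sits in TASK_STOPPED, JOBCTL_STOP_PENDING
is already gone.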

Oleg.



* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-05 10:29           ` Oleg Nesterov
@ 2022-04-05 11:28             ` Peter Zijlstra
  2022-04-07 12:13               ` Peter Zijlstra
  0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2022-04-05 11:28 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Sebastian Andrzej Siewior, linux-kernel, Ben Segall,
	Daniel Bristot de Oliveira, Dietmar Eggemann, Ingo Molnar,
	Juri Lelli, Mel Gorman, Steven Rostedt, Thomas Gleixner,
	Vincent Guittot

On Tue, Apr 05, 2022 at 12:29:03PM +0200, Oleg Nesterov wrote:
> On 04/05, Peter Zijlstra wrote:
> >
> > As is, I think we can write task_is_stopped() like:
> >
> > #define task_is_stopped(task)	((task)->jobctl & JOBCTL_STOP_PENDING)
> >
> > Because jobctl is in fact the canonical state.
> 
> Not really. JOBCTL_STOP_PENDING means that the task should stop.
> 
> And this flag is cleared right before set_special_state(TASK_STOPPED)
> in do_signal_stop(), see task_participate_group_stop().

Argh! More work then :-( Still, I really want to not have p->__state be
actual unique state.



* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-05 11:28             ` Peter Zijlstra
@ 2022-04-07 12:13               ` Peter Zijlstra
  2022-04-07 17:51                 ` Eric W. Biederman
  2022-04-07 22:50                 ` Eric W. Biederman
  0 siblings, 2 replies; 22+ messages in thread
From: Peter Zijlstra @ 2022-04-07 12:13 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Sebastian Andrzej Siewior, linux-kernel, Ben Segall,
	Daniel Bristot de Oliveira, Dietmar Eggemann, Ingo Molnar,
	Juri Lelli, Mel Gorman, Steven Rostedt, Thomas Gleixner,
	Vincent Guittot, ebiederm

On Tue, Apr 05, 2022 at 01:28:16PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 05, 2022 at 12:29:03PM +0200, Oleg Nesterov wrote:
> > On 04/05, Peter Zijlstra wrote:
> > >
> > > As is, I think we can write task_is_stopped() like:
> > >
> > > #define task_is_stopped(task)	((task)->jobctl & JOBCTL_STOP_PENDING)
> > >
> > > Because jobctl is in fact the canonical state.
> > 
> > Not really. JOBCTL_STOP_PENDING means that the task should stop.
> > 
> > And this flag is cleared right before set_special_state(TASK_STOPPED)
> > in do_signal_stop(), see task_participate_group_stop().
> 
> Argh! More work then :-( Still, I really want to not have p->__state be
> actual unique state.

How insane is this?

In addition to solving the whole saved_state issue, it also allows my
freezer rewrite to reconstruct __state. If you don't find the below
fundamentally broken I can respin that freezer rewrite on top of it.

---
 include/linux/sched.h        |  8 +++-----
 include/linux/sched/jobctl.h |  8 ++++++++
 include/linux/sched/signal.h | 15 ++++++++++++++-
 kernel/ptrace.c              | 18 ++++++++++++++----
 kernel/signal.c              |  9 ++++++---
 5 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 67f06f72c50e..6698b97b8e3b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -118,11 +118,9 @@ struct task_group;
 
 #define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
 
-#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-
-#define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
-
-#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_traced(task)		((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
+#define task_is_stopped(task)		((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0)
+#define task_is_stopped_or_traced(task)	((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)
 
 /*
  * Special states are those that do not use the normal wait-loop pattern. See
diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
index fa067de9f1a9..ec8b312f7506 100644
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -20,6 +20,10 @@ struct task_struct;
 #define JOBCTL_LISTENING_BIT	22	/* ptracer is listening for events */
 #define JOBCTL_TRAP_FREEZE_BIT	23	/* trap for cgroup freezer */
 
+#define JOBCTL_STOPPED_BIT	24
+#define JOBCTL_TRACED_BIT	25
+#define JOBCTL_TRACED_FROZEN_BIT 26
+
 #define JOBCTL_STOP_DEQUEUED	(1UL << JOBCTL_STOP_DEQUEUED_BIT)
 #define JOBCTL_STOP_PENDING	(1UL << JOBCTL_STOP_PENDING_BIT)
 #define JOBCTL_STOP_CONSUME	(1UL << JOBCTL_STOP_CONSUME_BIT)
@@ -29,6 +33,10 @@ struct task_struct;
 #define JOBCTL_LISTENING	(1UL << JOBCTL_LISTENING_BIT)
 #define JOBCTL_TRAP_FREEZE	(1UL << JOBCTL_TRAP_FREEZE_BIT)
 
+#define JOBCTL_STOPPED		(1UL << JOBCTL_STOPPED_BIT)
+#define JOBCTL_TRACED		(1UL << JOBCTL_TRACED_BIT)
+#define JOBCTL_TRACED_FROZEN	(1UL << JOBCTL_TRACED_FROZEN_BIT)
+
 #define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
 #define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
 
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3c8b34876744..3e4a14ed402e 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -294,8 +294,10 @@ static inline int kernel_dequeue_signal(void)
 static inline void kernel_signal_stop(void)
 {
 	spin_lock_irq(&current->sighand->siglock);
-	if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+	if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
+		current->jobctl |= JOBCTL_STOPPED;
 		set_special_state(TASK_STOPPED);
+	}
 	spin_unlock_irq(&current->sighand->siglock);
 
 	schedule();
@@ -437,10 +439,21 @@ extern void signal_wake_up_state(struct task_struct *t, unsigned int state);
 
 static inline void signal_wake_up(struct task_struct *t, bool resume)
 {
+	lockdep_assert_held(t->sighand->siglock);
+
+	if (resume && !(t->jobctl & JOBCTL_TRACED_FROZEN))
+		t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
+
 	signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
 }
+
 static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
 {
+	lockdep_assert_held(t->sighand->siglock);
+
+	if (resume)
+		t->jobctl &= ~JOBCTL_TRACED;
+
 	signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
 }
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ccc4b465775b..626f96d275c7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task)
 	return true;
 }
 
-/* Ensure that nothing can wake it up, even SIGKILL */
+/*
+ * Ensure that nothing can wake it up, even SIGKILL
+ *
+ * A task is switched to this state while a ptrace operation is in progress;
+ * such that the ptrace operation is uninterruptible.
+ */
 static bool ptrace_freeze_traced(struct task_struct *task)
 {
 	bool ret = false;
@@ -197,6 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
 	spin_lock_irq(&task->sighand->siglock);
 	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
 	    !__fatal_signal_pending(task)) {
+		task->jobctl |= JOBCTL_TRACED_FROZEN;
 		WRITE_ONCE(task->__state, __TASK_TRACED);
 		ret = true;
 	}
@@ -218,9 +224,11 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
 	 */
 	spin_lock_irq(&task->sighand->siglock);
 	if (READ_ONCE(task->__state) == __TASK_TRACED) {
-		if (__fatal_signal_pending(task))
+		task->jobctl &= ~JOBCTL_TRACED_FROZEN;
+		if (__fatal_signal_pending(task)) {
+			task->jobctl &= ~JOBCTL_TRACED;
 			wake_up_state(task, __TASK_TRACED);
-		else
+		} else
 			WRITE_ONCE(task->__state, TASK_TRACED);
 	}
 	spin_unlock_irq(&task->sighand->siglock);
@@ -475,8 +483,10 @@ static int ptrace_attach(struct task_struct *task, long request,
 	 * in and out of STOPPED are protected by siglock.
 	 */
 	if (task_is_stopped(task) &&
-	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
+	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+		task->jobctl &= ~JOBCTL_STOPPED;
 		signal_wake_up_state(task, __TASK_STOPPED);
+	}
 
 	spin_unlock(&task->sighand->siglock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 30cd1ca43bcd..0aea3f0a8002 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -884,7 +884,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
 static void ptrace_trap_notify(struct task_struct *t)
 {
 	WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
-	assert_spin_locked(&t->sighand->siglock);
+	lockdep_assert_held(&t->sighand->siglock);
 
 	task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
 	ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
@@ -930,9 +930,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
 		for_each_thread(p, t) {
 			flush_sigqueue_mask(&flush, &t->pending);
 			task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
-			if (likely(!(t->ptrace & PT_SEIZED)))
+			if (likely(!(t->ptrace & PT_SEIZED))) {
+				t->jobctl &= ~JOBCTL_STOPPED;
 				wake_up_state(t, __TASK_STOPPED);
-			else
+			} else
 				ptrace_trap_notify(t);
 		}
 
@@ -2219,6 +2220,7 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
 	 * schedule() will not sleep if there is a pending signal that
 	 * can awaken the task.
 	 */
+	current->jobctl |= JOBCTL_TRACED;
 	set_special_state(TASK_TRACED);
 
 	/*
@@ -2460,6 +2462,7 @@ static bool do_signal_stop(int signr)
 		if (task_participate_group_stop(current))
 			notify = CLD_STOPPED;
 
+		current->jobctl |= JOBCTL_STOPPED;
 		set_special_state(TASK_STOPPED);
 		spin_unlock_irq(&current->sighand->siglock);
 

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-07 12:13               ` Peter Zijlstra
@ 2022-04-07 17:51                 ` Eric W. Biederman
  2022-04-07 22:50                 ` Eric W. Biederman
  1 sibling, 0 replies; 22+ messages in thread
From: Eric W. Biederman @ 2022-04-07 17:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Sebastian Andrzej Siewior, linux-kernel,
	Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Steven Rostedt,
	Thomas Gleixner, Vincent Guittot

Peter Zijlstra <peterz@infradead.org> writes:

> On Tue, Apr 05, 2022 at 01:28:16PM +0200, Peter Zijlstra wrote:
>> On Tue, Apr 05, 2022 at 12:29:03PM +0200, Oleg Nesterov wrote:
>> > On 04/05, Peter Zijlstra wrote:
>> > >
>> > > As is, I think we can write task_is_stopped() like:
>> > >
>> > > #define task_is_stopped(task)	((task)->jobctl & JOBCTL_STOP_PENDING)
>> > >
>> > > Because jobctl is in fact the canonical state.
>> > 
>> > Not really. JOBCTL_STOP_PENDING means that the task should stop.
>> > 
>> > And this flag is cleared right before set_special_state(TASK_STOPPED)
>> > in do_signal_stop(), see task_participate_group_stop().
>> 
>> Argh! More work then :-( Still, I really want to not have p->__state be
>> actual unique state.
>
> How insane is this?
>
> In addition to solving the whole saved_state issue, it also allows my
> freezer rewrite to reconstruct __state. If you don't find the below
> fundamentally broken I can respin that freezer rewrite on top of it.

I am confused about what everyone is trying to fix here,
so I can't comment directly on the fundamentals, except to note that
it looks like more code and more complexity, which could be a problem.

If this actually solves the fundamental issue we need to coordinate,
because I am just about to allocate jobctl bit 31 for JOBCTL_WILL_EXIT
to implement fatal_signal_pending.  Among other things that allows us
to avoid having two meanings for SIGKILL in task->pending.
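
(For illustration only -- this is not posted code, just the shape of
the bit I have in mind:)

	#define JOBCTL_WILL_EXIT_BIT	31	/* fatal signal pending, task will exit */
	#define JOBCTL_WILL_EXIT	(1UL << JOBCTL_WILL_EXIT_BIT)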

There are also some interesting interactions between some of these
states and processes that have fatal_signal_pending that I should
look at as well.

I can say that I recently looked and signal_wake_up_state is
consistently called with siglock held.

Eric


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-07 12:13               ` Peter Zijlstra
  2022-04-07 17:51                 ` Eric W. Biederman
@ 2022-04-07 22:50                 ` Eric W. Biederman
  2022-04-08  9:09                   ` Peter Zijlstra
  1 sibling, 1 reply; 22+ messages in thread
From: Eric W. Biederman @ 2022-04-07 22:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Sebastian Andrzej Siewior, linux-kernel,
	Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Steven Rostedt,
	Thomas Gleixner, Vincent Guittot

Peter Zijlstra <peterz@infradead.org> writes:

> On Tue, Apr 05, 2022 at 01:28:16PM +0200, Peter Zijlstra wrote:
>> On Tue, Apr 05, 2022 at 12:29:03PM +0200, Oleg Nesterov wrote:
>> > On 04/05, Peter Zijlstra wrote:
>> > >
>> > > As is, I think we can write task_is_stopped() like:
>> > >
>> > > #define task_is_stopped(task)	((task)->jobctl & JOBCTL_STOP_PENDING)
>> > >
>> > > Because jobctl is in fact the canonical state.
>> > 
>> > Not really. JOBCTL_STOP_PENDING means that the task should stop.
>> > 
>> > And this flag is cleared right before set_special_state(TASK_STOPPED)
>> > in do_signal_stop(), see task_participate_group_stop().
>> 
>> Argh! More work then :-( Still, I really want to not have p->__state be
>> actual unique state.
>
> How insane is this?
>
> In addition to solving the whole saved_state issue, it also allows my
> freezer rewrite to reconstruct __state. If you don't find the below
> fundamentally broken I can respin that freezer rewrite on top of it.

Let me see if I can describe what is going on.  Commit 5f220be21418
("sched/wakeup: Prepare for RT sleeping spin/rwlocks") is buggy because
it fundamentally depends upon the assumption that only the current
process will change task->state.  That assumption breaks ptrace_attach.
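
For reference, the save/restore that commit does around an rtlock
slowpath looks roughly like this (simplified from memory, details
omitted):

	/* going to sleep on the rtlock: */
	raw_spin_lock(&current->pi_lock);
	current->saved_state = current->__state;	/* e.g. TASK_TRACED */
	WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);
	raw_spin_unlock(&current->pi_lock);

	/* ... block on the lock, schedule() ... */

	/* lock acquired, restore the original state: */
	raw_spin_lock(&current->pi_lock);
	WRITE_ONCE(current->__state, current->saved_state);
	current->saved_state = TASK_RUNNING;
	raw_spin_unlock(&current->pi_lock);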

Given that wake_up_state and wake_up_process fundamentally violate that
assumption I am not at all certain it makes sense to try and fix the
broken commit.  I think the assumption is that it is fine to ignore
wake_up_state and wake_up_process and their wake_ups because the process
will wake up eventually.

Is it possible to simply take pi_lock around what ptrace does and fix
ptrace that way?

Hmmm.

Your ptrace_stop does:

> +	current->jobctl |= JOBCTL_TRACED;
>  	set_special_state(TASK_TRACED);

Your ptrace_freeze_traced does:
>  	    !__fatal_signal_pending(task)) {
> +		task->jobctl |= JOBCTL_TRACED_FROZEN;
>  		WRITE_ONCE(task->__state, __TASK_TRACED);
>  		ret = true;

At which point I say bleep!

Unless I am misreading something, if ptrace_freeze_traced happens while
the tracee is blocked in read_lock(&tasklist_lock), task->__state will
be changed from TASK_RTLOCK_WAIT to __TASK_TRACED.  Then when
read_lock(&tasklist_lock) is acquired, task->__state will be changed
from __TASK_TRACED to TASK_TRACED, making it possible to send
SIGKILL to a process that is being ptraced and introducing all kinds
of unexpected races.

Given that fundamentally TASK_WAKEKILL must be added in ptrace_stop and
removed in ptrace_attach, I don't see how your proposed usage of jobctl
helps anything fundamental.

I suspect somewhere there is a deep trade-off between complicating
the scheduler to have a very special case for what is now
TASK_RTLOCK_WAIT, and complicating the rest of the code with having
TASK_RTLOCK_WAIT in __state and the values that should be in state
stored somewhere else.

Perhaps we should just remove task->saved_state and start all over from a
clean drawing sheet?

Eric

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-07 22:50                 ` Eric W. Biederman
@ 2022-04-08  9:09                   ` Peter Zijlstra
  2022-04-08 19:40                     ` Eric W. Biederman
  0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2022-04-08  9:09 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Oleg Nesterov, Sebastian Andrzej Siewior, linux-kernel,
	Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Steven Rostedt,
	Thomas Gleixner, Vincent Guittot

On Thu, Apr 07, 2022 at 05:50:39PM -0500, Eric W. Biederman wrote:
> Peter Zijlstra <peterz@infradead.org> writes:
> 
> > On Tue, Apr 05, 2022 at 01:28:16PM +0200, Peter Zijlstra wrote:
> >> On Tue, Apr 05, 2022 at 12:29:03PM +0200, Oleg Nesterov wrote:
> >> > On 04/05, Peter Zijlstra wrote:
> >> > >
> >> > > As is, I think we can write task_is_stopped() like:
> >> > >
> >> > > #define task_is_stopped(task)	((task)->jobctl & JOBCTL_STOP_PENDING)
> >> > >
> >> > > Because jobctl is in fact the canonical state.
> >> > 
> >> > Not really. JOBCTL_STOP_PENDING means that the task should stop.
> >> > 
> >> > And this flag is cleared right before set_special_state(TASK_STOPPED)
> >> > in do_signal_stop(), see task_participate_group_stop().
> >> 
> >> Argh! More work then :-( Still, I really want to not have p->__state be
> >> actual unique state.
> >
> > How insane is this?
> >
> > In addition to solving the whole saved_state issue, it also allows my
> > freezer rewrite to reconstruct __state. If you don't find the below
> > fundamentally broken I can respin that freezer rewrite on top of it.
> 
> Let me see if I can describe what is going on.  Commit 5f220be21418
> ("sched/wakeup: Prepare for RT sleeping spin/rwlocks") is buggy because
> it fundamentally depends upon the assumption that only the current
> process will change task->state.  That assumption breaks ptrace_attach.

Right -- although I'll be arguing ptrace does some rather dodgy things
:-)

> Given that wake_up_state and wake_up_process fundamentally violate that
> assumption I am not at all certain it makes sense to try and fix the
> broken commit.  I think the assumption is that it is fine to ignore
> wake_up_state and wake_up_process and their wake_ups because the process
> will wake up eventually.
> 
> Is it possible to simply take pi_lock around what ptrace does and fix
> ptrace that way?

Sorta, see below, but there's an additional problem where I want to
rewrite the freezer to be a special task state. That is, replace:

	freezer_do_not_count();
	for (;;) {
		set_current_state(state);
		if (cond)
			break;
		schedule();
	}
	__set_current_state(RUNNING);
	freezer_count();

with:

	for (;;) {
		set_current_state(state|TASK_FREEZABLE);
		if (cond)
			break;
		schedule()
	}
	__set_current_state(RUNNING);

And have the freezer do (with pi_lock held):

+static int __set_task_frozen(struct task_struct *p, void *arg)
+{
+	unsigned int state = READ_ONCE(p->__state);
+
+	if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
+		return 0;
+
+	/*
+	 * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
+	 * can suffer spurious wakeups.
+	 */
+	if (state & TASK_FREEZABLE)
+		WARN_ON_ONCE(!(state & TASK_NORMAL));
+
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * It's dangerous to freeze with locks held; there be dragons there.
+	 */
+	if (!(state & __TASK_FREEZABLE_UNSAFE))
+		WARN_ON_ONCE(debug_locks && p->lockdep_depth);
+#endif
+
+	WRITE_ONCE(p->__state, TASK_FROZEN);
+	return TASK_FROZEN;
+}

That is, for anything FREEZABLE, STOPPED or TRACED set FROZEN.
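
Presumably the rewrite would invoke it via task_call_func(), which runs
the callback with the task's pi_lock held; something like this sketch
(the wrapper name is made up):

	static bool freeze_task_sketch(struct task_struct *p)
	{
		/* task_call_func() serializes against wakeups via ->pi_lock */
		return task_call_func(p, __set_task_frozen, NULL) != 0;
	}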

The reason is that currently there are all sorts of races between the
whole freezer_do_not_count(); schedule(); freezer_count(); sequence and
thawing, which allow a task to thaw before it really is time.

By moving all that to a special block state, all that insanity goes
away.

*HOWEVER* per the above STOPPED,TRACED->FROZEN transition, that state is
lost. Thawing needs to somehow recover that state if needed. With the
proposed patch, I can write (this time with sighand *and* pi_lock held):

+/*
+ * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
+ * state in p->jobctl. If either of them got a wakeup that was missed because
+ * TASK_FROZEN, then their canonical state reflects that and the below will
+ * refuse to restore the special state and instead issue the wakeup.
+ */
+static int __set_task_special(struct task_struct *p, void *arg)
+{
+	unsigned int state = 0;
+
+	if (p->jobctl & JOBCTL_TRACED) {
+		state = __TASK_TRACED;
+
+		if (!(p->jobctl & JOBCTL_TRACED_FROZEN)) {
+			state |= TASK_WAKEKILL;
+			if (__fatal_signal_pending(p))
+				state = 0;
+		}
+
+	} else if ((p->jobctl & JOBCTL_STOPPED) &&
+		   !__fatal_signal_pending(p)) {
+
+		state = TASK_STOPPED;
+	}
+
+	if (state)
+		WRITE_ONCE(p->__state, state);
+
+	return state;
+}

So recover the special states and not have them experience spurious
wakeups.

So the proposed change moves the state into jobctl, such that:

 - task_is_{stopped,traced}() can look at jobctl and not need to worry
   about ->__state / ->saved_state etc.

 - task->__state can be recovered, it no longer contains unique state.

> Hmmm.
> 
> Your ptrace_stop does:
> 
> > +	current->jobctl |= JOBCTL_TRACED;
> >  	set_special_state(TASK_TRACED);
> 
> Your ptrace_freeze_traced does:
> >  	    !__fatal_signal_pending(task)) {
> > +		task->jobctl |= JOBCTL_TRACED_FROZEN;
> >  		WRITE_ONCE(task->__state, __TASK_TRACED);
> >  		ret = true;
> 
> At which point I say bleep!
> 
> Unless I am misreading something if ptrace_freeze_traced happens
> during read_lock(&tasklist_lock) task->__state will be changed
> from TASK_RTLOCK_WAIT to __TASK_TRACED.   Then when
> read_lock(&tasklist_lock) is acquired task->__state will be changed
> from __TASK_TRACED to TASK_TRACED.  Making it possible to send
> SIGKILL to a process that is being ptraced.  Introducing all kinds
> of unexpected races.

Correct, I 'forgot' about that part but have the below patch on top to
cure that. Mind you, it's not something I'm proud of, but it should
work.

> Given that fundamentally TASK_WAKEKILL must be added in ptrace_stop and
> removed in ptrace_attach I don't see your proposed usage of jobctl helps
> anything fundamental.
> 
> I suspect somewhere there is a deep trade-off between complicating
> the scheduler to have a very special case for what is now
> TASK_RTLOCK_WAIT, and complicating the rest of the code with having
> TASK_RTLOCK_WAIT in __state and the values that should be in state
> stored somewhere else.

The thing is; ptrace is a special case. I feel very strongly we should
not complicate the scheduler/wakeup path for something that 'never'
happens.

> Perhaps we should just remove task->saved_state and start all over from a
> clean drawing sheet?

We've not managed to come up with an alternative scheme for allowing
sleeping spinlocks and not wrecking many other things.


Anyway, let me finish this freezer rewrite thing and write more coherent
Changelogs on that :-)

~ Peter

---

Subject: sched,ptrace: Fix PREEMPT_RT vs TASK_TRACED
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Apr 7 16:38:16 CEST 2022

ptrace_{,un}freeze_traced() attempts a compare-and-change pattern on
a remote task->__state. The correctness of this scheme relies on
__TASK_TRACED being a special state and on special states having
stability guarantees.

Except... PREEMPT_RT has task->saved_state which 'temporarily' holds
the real task->__state while a spinlock_rt does TASK_RTLOCK_WAIT.

This means the remote compare-and-change pattern is busted in the face
of PREEMPT_RT.

Rework it so it is atomic vs state changes and takes task->saved_state
into account.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/ptrace.c |   37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -185,6 +185,29 @@ static bool looks_like_a_spurious_pid(st
 	return true;
 }
 
+static inline bool __set_if(unsigned int *state, unsigned int mask, unsigned int new)
+{
+	if (*state & mask) {
+		WRITE_ONCE(*state, new);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int __set_if_traced(struct task_struct *task, void *arg)
+{
+	unsigned int *state = arg;
+
+	__set_if(&task->__state, __TASK_TRACED, *state)
+#ifdef CONFIG_PREEMPT_RT
+		|| __set_if(&task->saved_state, __TASK_TRACED, *state)
+#endif
+		;
+
+	return 0;
+}
+
 /*
  * Ensure that nothing can wake it up, even SIGKILL
  *
@@ -202,8 +225,10 @@ static bool ptrace_freeze_traced(struct
 	spin_lock_irq(&task->sighand->siglock);
 	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
 	    !__fatal_signal_pending(task)) {
+		unsigned int state = __TASK_TRACED;
+
 		task->jobctl |= JOBCTL_TRACED_FROZEN;
-		WRITE_ONCE(task->__state, __TASK_TRACED);
+		task_call_func(task, __set_if_traced, &state);
 		ret = true;
 	}
 	spin_unlock_irq(&task->sighand->siglock);
@@ -213,7 +238,7 @@ static bool ptrace_freeze_traced(struct
 
 static void ptrace_unfreeze_traced(struct task_struct *task)
 {
-	if (READ_ONCE(task->__state) != __TASK_TRACED)
+	if (!task_is_traced(task))
 		return;
 
 	WARN_ON(!task->ptrace || task->parent != current);
@@ -223,13 +248,15 @@ static void ptrace_unfreeze_traced(struc
 	 * Recheck state under the lock to close this race.
 	 */
 	spin_lock_irq(&task->sighand->siglock);
-	if (READ_ONCE(task->__state) == __TASK_TRACED) {
+	if (task->jobctl & JOBCTL_TRACED_FROZEN) {
 		task->jobctl &= ~JOBCTL_TRACED_FROZEN;
 		if (__fatal_signal_pending(task)) {
 			task->jobctl &= ~JOBCTL_TRACED;
 			wake_up_state(task, __TASK_TRACED);
-		} else
-			WRITE_ONCE(task->__state, TASK_TRACED);
+		} else {
+			unsigned int state = TASK_TRACED;
+			task_call_func(task, __set_if_traced, &state);
+		}
 	}
 	spin_unlock_irq(&task->sighand->siglock);
 }

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-08  9:09                   ` Peter Zijlstra
@ 2022-04-08 19:40                     ` Eric W. Biederman
  2022-04-08 20:06                       ` Peter Zijlstra
  0 siblings, 1 reply; 22+ messages in thread
From: Eric W. Biederman @ 2022-04-08 19:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Sebastian Andrzej Siewior, linux-kernel,
	Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Steven Rostedt,
	Thomas Gleixner, Vincent Guittot

Peter Zijlstra <peterz@infradead.org> writes:

> On Thu, Apr 07, 2022 at 05:50:39PM -0500, Eric W. Biederman wrote:

>> Given that fundamentally TASK_WAKEKILL must be added in ptrace_stop and
>> removed in ptrace_attach I don't see your proposed usage of jobctl helps
>> anything fundamental.
>> 
>> I suspect somewhere there is a deep trade-off between complicating
>> the scheduler to have a very special case for what is now
>> TASK_RTLOCK_WAIT, and complicating the rest of the code with having
>> TASK_RTLOCK_WAIT in __state and the values that should be in state
>> stored somewhere else.
>
> The thing is; ptrace is a special case. I feel very strongly we should
> not complicate the scheduler/wakeup path for something that 'never'
> happens.

I was going to comment that I could not understand how the saved_state
mechanism under PREEMPT_RT works.  Then I realized that wake_up_process
and wake_up_state call try_to_wake_up which calls ttwu_state_match which
modifies saved_state.
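
Roughly (simplified, and from memory) the interesting part of
ttwu_state_match() looks like:

	static __always_inline
	bool ttwu_state_match_sketch(struct task_struct *p, unsigned int state,
				     int *success)
	{
		if (READ_ONCE(p->__state) & state) {
			*success = 1;
			return true;		/* proceed with the wakeup */
		}

	#ifdef CONFIG_PREEMPT_RT
		/*
		 * __state currently holds TASK_RTLOCK_WAIT and the "real"
		 * state lives in saved_state.  Record the wakeup there and
		 * report success, but do not actually wake the task; the
		 * rtlock wakeup does that and the restore path then sees
		 * TASK_RUNNING.
		 */
		if (p->saved_state & state) {
			p->saved_state = TASK_RUNNING;
			*success = 1;
		}
	#endif
		return false;
	}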


The options appear to be that either ptrace_freeze_traced modifies
__state/state to remove TASK_KILLABLE, or that something clever happens
in ptrace_freeze_traced that guarantees the task does not wake
up -- something living in kernel/sched/* like wait_task_inactive.


I can imagine adding a loop around freezable_schedule in
ptrace_stop.  That does something like:

	do {
        	freezable_schedule();
        } while (current->jobctl & JOBCTL_PTRACE_FREEZE);

Unfortunately after a SIGKILL is delivered the process will never sleep
unless there is a higher priority process to preempt it.  So I don't
think that is a viable solution.


What ptrace_freeze_traced and ptrace_unfreeze_traced fundamentally need
is for the process to not do anything interesting, so that the tracer
process can modify the process and its task_struct.


That need is the entire reason ptrace does questionable things with
__state.

So if we can do something better perhaps with a rewritten freezer it
would be a general code improvement.


The ptrace code really does want TASK_KILLABLE semantics the entire time
a task is being manipulated by the ptrace system call.   The code in
ptrace_unfreeze_traced goes through some gymnastics to detect if a
process was killed while traced (AKA to detect a missed SIGKILL)
and to use wake_up_state to make the task runnable instead of putting
it back in TASK_TRACED.

So really all that is required is a way to ask the scheduler to just
not schedule the process until the ptrace syscall completes and calls
ptrace_unfreeze_traced.

Eric

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-08 19:40                     ` Eric W. Biederman
@ 2022-04-08 20:06                       ` Peter Zijlstra
  2022-04-11 11:35                         ` Peter Zijlstra
  0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2022-04-08 20:06 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Oleg Nesterov, Sebastian Andrzej Siewior, linux-kernel,
	Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Steven Rostedt,
	Thomas Gleixner, Vincent Guittot

On Fri, Apr 08, 2022 at 02:40:42PM -0500, Eric W. Biederman wrote:
> Peter Zijlstra <peterz@infradead.org> writes:
> 
> > On Thu, Apr 07, 2022 at 05:50:39PM -0500, Eric W. Biederman wrote:
> 
> >> Given that fundamentally TASK_WAKEKILL must be added in ptrace_stop and
> >> removed in ptrace_attach I don't see your proposed usage of jobctl helps
> >> anything fundamental.
> >> 
> >> I suspect somewhere there is a deep trade-off between complicating
> >> the scheduler to have a very special case for what is now
> >> TASK_RTLOCK_WAIT, and complicating the rest of the code with having
> >> TASK_RTLOCK_WAIT in __state and the values that should be in state
> >> stored somewhere else.
> >
> > The thing is; ptrace is a special case. I feel very strongly we should
> > not complicate the scheduler/wakeup path for something that 'never'
> > happens.
> 
> I was going to comment that I could not understand how the saved_state
> mechanism under PREEMPT_RT works.  Then I realized that wake_up_process
> and wake_up_state call try_to_wake_up which calls ttwu_state_match which
> modifies saved_state.

Correct.

> The options appear to be that either ptrace_freeze_traced modifies
> __state/state to remove TASK_KILLABLE.  Or that something clever happens
> in ptrace_freeze_traced that guarantees the task does not wake
> up.  Something living in kernel/sched/* like wait_task_inactive.

The code I posted in the parent will attempt to strip (and re-instate)
WAKEKILL from __state and then saved_state, all under pi_lock.

I think that preserves the current constraints.

> I can imagine adding add a loop around freezable_schedule in
> ptrace_stop.  That does something like:
> 
> 	do {
>         	freezable_schedule();
>         } while (current->jobctl & JOBCTL_PTRACE_FREEZE);

I'm not entirely sure where you're headin with this; but my goal is to
get rid of freezable_*() everything.

I'll ponder if wait_task_inactive() can simplify things..

> What ptrace_freeze_traced and ptrace_unfreeze_traced fundamentally need
> is that the process to not do anything interesting, so that the tracer
> process can modify the process and it's task_struct.

Agreed, I understand this need. I think I've done this, but I'll
certainly look hard at it again Monday -- the weekend hopefully
clearing my brain of preconceptions enough so that I can see my own code
afresh.

Anyway, my current set lives here:

  https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=sched/wip.freezer

I meant to post earlier today, but stuff got in between and I've not
even done build-tests yet :/

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-08 20:06                       ` Peter Zijlstra
@ 2022-04-11 11:35                         ` Peter Zijlstra
  2022-04-11 13:44                           ` Eric W. Biederman
  0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2022-04-11 11:35 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Oleg Nesterov, Sebastian Andrzej Siewior, linux-kernel,
	Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Steven Rostedt,
	Thomas Gleixner, Vincent Guittot

On Fri, Apr 08, 2022 at 10:06:30PM +0200, Peter Zijlstra wrote:

> I'll ponder if wait_task_inactive() can simplify things..

This,.. so ptrace_check_attach(), which does ptrace_freeze_traced()
already does wait_task_inactive(), but on the 'wrong' side of things.

AFAICT, if we move that up, we're almost there, except that opens up a
detach+attach race. That could be fixed by doing another
wait_task_inactive(), but we can't due to locking :/

Let's see if I can make that work without making a mess of things.
Because ensuring the task is stuck in schedule() makes the whole
saved_state thing go away -- as you noted.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-11 11:35                         ` Peter Zijlstra
@ 2022-04-11 13:44                           ` Eric W. Biederman
  2022-04-11 17:07                             ` Peter Zijlstra
  0 siblings, 1 reply; 22+ messages in thread
From: Eric W. Biederman @ 2022-04-11 13:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Sebastian Andrzej Siewior, linux-kernel,
	Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Steven Rostedt,
	Thomas Gleixner, Vincent Guittot

Peter Zijlstra <peterz@infradead.org> writes:

> On Fri, Apr 08, 2022 at 10:06:30PM +0200, Peter Zijlstra wrote:
>
>> I'll ponder if wait_task_inactive() can simplify things..
>
> This,.. so ptrace_check_attach(), which does ptrace_freeze_traced()
> already does wait_task_inactive(), but on the 'wrong' side of things.
>
> AFAICT, if we move that up, we're almost there, except that opens up a
> detach+attach race. That could be fixed by doing another
> wait_task_inactive(), but we can't due to locking :/
>
> Let's see if I can make that work without making a mess of things.
> Because ensuring the task is stuck in schedule() makes the whole
> saved_state thing go away -- as you noted.

The code can perhaps synchronize on a bit using the full locking and
then drop the locks and call wait_task_inactive or whatever.

The challenge as I see it is that after the traced task is inactive we
need to allow "wake_up_state(t, TASK_WAKEKILL)" on the traced task, have
the traced task's state change to TASK_RUNNING, and yet not allow the
traced task to run until what is today ptrace_unfreeze_traced is called.

I just don't know how to get something stuck and not allow it to run.

Eric

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-11 13:44                           ` Eric W. Biederman
@ 2022-04-11 17:07                             ` Peter Zijlstra
  2022-04-12 11:59                               ` Peter Zijlstra
  0 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2022-04-11 17:07 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Oleg Nesterov, Sebastian Andrzej Siewior, linux-kernel,
	Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Steven Rostedt,
	Thomas Gleixner, Vincent Guittot

On Mon, Apr 11, 2022 at 08:44:24AM -0500, Eric W. Biederman wrote:
> Peter Zijlstra <peterz@infradead.org> writes:
> 
> > On Fri, Apr 08, 2022 at 10:06:30PM +0200, Peter Zijlstra wrote:
> >
> >> I'll ponder if wait_task_inactive() can simplify things..
> >
> > This,.. so ptrace_check_attach(), which does ptrace_freeze_traced()
> > already does wait_task_inactive(), but on the 'wrong' side of things.
> >
> > AFAICT, if we move that up, we're almost there, except that opens up a
> > detach+attach race. That could be fixed by doing another
> > wait_task_inactive(), but we can't due to locking :/
> >
> > Let's see if I can make that work without making a mess of things.
> > Because ensuring the task is stuck in schedule() makes the whole
> > saved_state thing go away -- as you noted.
> 
> The code can perhaps synchronize on a bit using the the full locking and
> then drop the locks and call the wait_task_inactive or whatever.
> 
> The challenge as I see it is after the traced task is inactive to allow
> "wake_up_state(t, TASK_WAKEKILL)" on the traced task, have the traced
> tasks state change to TASK_RUNNING and not allow the traced task to run
> until what is today ptrace_unfreeze_task is called.
> 
> I just don't know how to get something stuck and not allow it to run.

Same as today? Clear TASK_WAKEKILL from __state and check
__fatal_signal_pending() before putting it back again.

The thing is, once we hit schedule() with TASK_TRACED, there's only two
possible ways for that task to wake up:

  wake_up_state(t, TASK_WAKEKILL)

and

  wake_up_state(t, __TASK_TRACED)

both are issued while holding sighand lock, so provided we hold sighand
lock, we can actually frob __state as we do today, we just need to know
the task really has scheduled out first.

That is, the problem today, for PREEMPT_RT, is:

ptrace_stop():					ptrace_freeze_traced()

  set_special_state(TASK_TRACED)

  ...

  spin_lock(&foo)
    current->saved_state = current->__state;
    current->__state = TASK_RTLOCK_WAIT

						READ_ONCE(t->__state)
						  // whoopsie, not
						  // TRACED

  ...

  schedule()


But if wait_task_inactive() ensures our @t is actually in schedule(),
we're good again, nothing will then ever change __state as long as we
hold sighand lock.

The only fun bit is that wait_task_inactive() likes to schedule, so we
can't do that with sighand lock held. What we need to do is call it
first, and then re-check stuff is still sane once we (re)acquire all the
locks.
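
Roughly the shape I'm thinking of -- very much a sketch, not the actual
patch, and the match_state / failure handling is hand-waved:

	static bool ptrace_freeze_traced_sketch(struct task_struct *task)
	{
		bool ret = false;

		/* First make sure @task has really scheduled out; may sleep. */
		wait_task_inactive(task, 0);

		/* Then (re)take the locks and re-check that nothing changed. */
		spin_lock_irq(&task->sighand->siglock);
		if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
		    !__fatal_signal_pending(task)) {
			/* @task is off the CPU; safe to frob __state. */
			WRITE_ONCE(task->__state, __TASK_TRACED);
			ret = true;
		}
		spin_unlock_irq(&task->sighand->siglock);

		return ret;
	}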

This is certainly possible -- and not in fact too hard; the only thing
I'm really concerned about is not making it more ugly than dealing with
saved_state in the first place (and *that* is turning out to be somewhat
hard).

But while going over all this I think there might be an additional
problem; wait_task_inactive() is stubbed for SMP=n...



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT.
  2022-04-11 17:07                             ` Peter Zijlstra
@ 2022-04-12 11:59                               ` Peter Zijlstra
  0 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2022-04-12 11:59 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Oleg Nesterov, Sebastian Andrzej Siewior, linux-kernel,
	Ben Segall, Daniel Bristot de Oliveira, Dietmar Eggemann,
	Ingo Molnar, Juri Lelli, Mel Gorman, Steven Rostedt,
	Thomas Gleixner, Vincent Guittot


For all who I forgot on the Cc list.. continued here:

  https://lkml.kernel.org/r/20220412114421.691372568@infradead.org

(once it shows up, the web seems congested just now)

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2022-04-12 12:41 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-02 21:04 [PATCH] ptrace: fix ptrace vs tasklist_lock race on PREEMPT_RT Sebastian Andrzej Siewior
2022-03-14  9:27 ` Sebastian Andrzej Siewior
2022-03-14 18:54 ` Oleg Nesterov
2022-03-15  8:31   ` Sebastian Andrzej Siewior
2022-03-15 14:29     ` Oleg Nesterov
2022-03-16  8:23       ` Sebastian Andrzej Siewior
2022-03-31 14:25       ` [PATCH v2] " Sebastian Andrzej Siewior
2022-04-04 16:13         ` Oleg Nesterov
2022-04-05  8:34           ` Oleg Nesterov
2022-04-05 10:10         ` Peter Zijlstra
2022-04-05 10:29           ` Oleg Nesterov
2022-04-05 11:28             ` Peter Zijlstra
2022-04-07 12:13               ` Peter Zijlstra
2022-04-07 17:51                 ` Eric W. Biederman
2022-04-07 22:50                 ` Eric W. Biederman
2022-04-08  9:09                   ` Peter Zijlstra
2022-04-08 19:40                     ` Eric W. Biederman
2022-04-08 20:06                       ` Peter Zijlstra
2022-04-11 11:35                         ` Peter Zijlstra
2022-04-11 13:44                           ` Eric W. Biederman
2022-04-11 17:07                             ` Peter Zijlstra
2022-04-12 11:59                               ` Peter Zijlstra
