All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] kernel/hung_task.c: Monitor killed tasks.
@ 2019-05-13 11:02 Tetsuo Handa
  2019-05-13 11:11 ` Dmitry Vyukov
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Tetsuo Handa @ 2019-05-13 11:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Petr Mladek,
	Vitaly Kuznetsov, Liu Chuansheng, Valdis Kletnieks, linux-kernel,
	Tetsuo Handa, Dmitry Vyukov

syzbot's second top report is "no output from test machine" where the
userspace process failed to spawn a new test process for 300 seconds
for some reason. One of the reasons which can result in this report is that
an already spawned test process was unable to terminate (e.g. trapped in
an unkillable retry loop due to some bug) after SIGKILL was sent to that
process. Therefore, reporting when a thread is failing to terminate
even though a fatal signal is pending would give us more useful information.

This version shares existing sysctl settings (e.g. check interval,
timeout, whether to panic) used for detecting TASK_UNINTERRUPTIBLE
threads, for I don't know whether people want to use a new kernel
config option and different sysctl settings for monitoring killed
threads.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Dmitry Vyukov <dvyukov@google.com>
---
 include/linux/sched.h |  1 +
 kernel/hung_task.c    | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a2cd1585..d42bdd7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -850,6 +850,7 @@ struct task_struct {
 #ifdef CONFIG_DETECT_HUNG_TASK
 	unsigned long			last_switch_count;
 	unsigned long			last_switch_time;
+	unsigned long			killed_time;
 #endif
 	/* Filesystem information: */
 	struct fs_struct		*fs;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index f108a95..34e7b84 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -141,6 +141,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 	touch_nmi_watchdog();
 }
 
+static void check_killed_task(struct task_struct *t, unsigned long timeout)
+{
+	unsigned long stamp = t->killed_time;
+
+	/*
+	 * Ensure the task is not frozen.
+	 * Also, skip vfork and any other user process that freezer should skip.
+	 */
+	if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+		return;
+	/*
+	 * Skip threads which are already inside do_exit(), for exit_mm() etc.
+	 * might take many seconds.
+	 */
+	if (t->flags & PF_EXITING)
+		return;
+	if (!stamp) {
+		stamp = jiffies;
+		if (!stamp)
+			stamp++;
+		t->killed_time = stamp;
+		return;
+	}
+	if (time_is_after_jiffies(stamp + timeout * HZ))
+		return;
+	trace_sched_process_hang(t);
+	if (sysctl_hung_task_panic) {
+		console_verbose();
+		hung_task_call_panic = true;
+	}
+	/*
+	 * This thread failed to terminate for more than
+	 * sysctl_hung_task_timeout_secs seconds, complain:
+	 */
+	pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
+	       t->comm, t->pid, (jiffies - stamp) / HZ);
+	sched_show_task(t);
+	hung_task_show_lock = true;
+	touch_nmi_watchdog();
+}
+
 /*
  * To avoid extending the RCU grace period for an unbounded amount of time,
  * periodically exit the critical section and enter a new one.
@@ -192,6 +233,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 				goto unlock;
 			last_break = jiffies;
 		}
+		/* Check threads which are about to terminate. */
+		if (unlikely(fatal_signal_pending(t)))
+			check_killed_task(t, timeout);
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, timeout);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-13 11:02 [PATCH] kernel/hung_task.c: Monitor killed tasks Tetsuo Handa
@ 2019-05-13 11:11 ` Dmitry Vyukov
  2019-05-14 22:28 ` Paul E. McKenney
  2019-05-15 10:55 ` Petr Mladek
  2 siblings, 0 replies; 15+ messages in thread
From: Dmitry Vyukov @ 2019-05-13 11:11 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Andrew Morton, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Petr Mladek, Vitaly Kuznetsov, Liu Chuansheng, Valdis Kletnieks,
	LKML, syzkaller

From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Mon, May 13, 2019 at 1:04 PM
To: Andrew Morton
Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Petr Mladek, Vitaly
Kuznetsov, Liu Chuansheng, Valdis Kletnieks,
<linux-kernel@vger.kernel.org>, Tetsuo Handa, Dmitry Vyukov

> syzbot's second top report is "no output from test machine" where the
> userspace process failed to spawn a new test process for 300 seconds
> for some reason. One of reasons which can result in this report is that
> an already spawned test process was unable to terminate (e.g. trapped at
> an unkillable retry loop due to some bug) after SIGKILL was sent to that
> process. Therefore, reporting when a thread is failing to terminate
> despite a fatal signal is pending would give us more useful information.
>
> This version shares existing sysctl settings (e.g. check interval,
> timeout, whether to panic) used for detecting TASK_UNINTERRUPTIBLE
> threads, for I don't know whether people want to use a new kernel
> config option and different sysctl settings for monitoring killed
> threads.
>
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Cc: Dmitry Vyukov <dvyukov@google.com>
> ---
>  include/linux/sched.h |  1 +
>  kernel/hung_task.c    | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 45 insertions(+)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a2cd1585..d42bdd7 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -850,6 +850,7 @@ struct task_struct {
>  #ifdef CONFIG_DETECT_HUNG_TASK
>         unsigned long                   last_switch_count;
>         unsigned long                   last_switch_time;
> +       unsigned long                   killed_time;
>  #endif
>         /* Filesystem information: */
>         struct fs_struct                *fs;
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index f108a95..34e7b84 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -141,6 +141,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
>         touch_nmi_watchdog();
>  }
>
> +static void check_killed_task(struct task_struct *t, unsigned long timeout)
> +{
> +       unsigned long stamp = t->killed_time;
> +
> +       /*
> +        * Ensure the task is not frozen.
> +        * Also, skip vfork and any other user process that freezer should skip.
> +        */
> +       if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
> +               return;
> +       /*
> +        * Skip threads which are already inside do_exit(), for exit_mm() etc.
> +        * might take many seconds.
> +        */
> +       if (t->flags & PF_EXITING)
> +               return;
> +       if (!stamp) {
> +               stamp = jiffies;
> +               if (!stamp)
> +                       stamp++;
> +               t->killed_time = stamp;
> +               return;
> +       }
> +       if (time_is_after_jiffies(stamp + timeout * HZ))
> +               return;
> +       trace_sched_process_hang(t);
> +       if (sysctl_hung_task_panic) {
> +               console_verbose();
> +               hung_task_call_panic = true;
> +       }
> +       /*
> +        * This thread failed to terminate for more than
> +        * sysctl_hung_task_timeout_secs seconds, complain:
> +        */
> +       pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
> +              t->comm, t->pid, (jiffies - stamp) / HZ);
> +       sched_show_task(t);
> +       hung_task_show_lock = true;
> +       touch_nmi_watchdog();
> +}
> +
>  /*
>   * To avoid extending the RCU grace period for an unbounded amount of time,
>   * periodically exit the critical section and enter a new one.
> @@ -192,6 +233,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>                                 goto unlock;
>                         last_break = jiffies;
>                 }
> +               /* Check threads which are about to terminate. */
> +               if (unlikely(fatal_signal_pending(t)))
> +                       check_killed_task(t, timeout);
>                 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
>                 if (t->state == TASK_UNINTERRUPTIBLE)
>                         check_hung_task(t, timeout);
> --
> 1.8.3.1


For background context:
syzkaller has found a number of cases with unkillable tasks ([1]),
but they were always poorly diagnosed by the kernel. In all cases
everything just hangs with no additional diagnosis and in all cases
it required manual work just to identify that there is an instance
of an unkillable task in the pile.
With syzbot all such bugs end up in the "no output from test machine" pile ([2])
with hundreds of thousands of crashes, which nobody usually looks at
(except for Tetsuo). Most likely we now have many more of these in the pile
so it would be very useful to make it possible to auto-diagnose and auto-bucket
these bugs.


[1] search for "unkillable" at
https://github.com/google/syzkaller/blob/master/docs/linux/found_bugs.md
[2] https://syzkaller.appspot.com/bug?id=0b210638616bb68109e9642158d4c0072770ae1c

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-13 11:02 [PATCH] kernel/hung_task.c: Monitor killed tasks Tetsuo Handa
  2019-05-13 11:11 ` Dmitry Vyukov
@ 2019-05-14 22:28 ` Paul E. McKenney
  2019-05-15 10:55 ` Petr Mladek
  2 siblings, 0 replies; 15+ messages in thread
From: Paul E. McKenney @ 2019-05-14 22:28 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Andrew Morton, Ingo Molnar, Peter Zijlstra, Petr Mladek,
	Vitaly Kuznetsov, Liu Chuansheng, Valdis Kletnieks, linux-kernel,
	Dmitry Vyukov

On Mon, May 13, 2019 at 08:02:11PM +0900, Tetsuo Handa wrote:
> syzbot's second top report is "no output from test machine" where the
> userspace process failed to spawn a new test process for 300 seconds
> for some reason. One of reasons which can result in this report is that
> an already spawned test process was unable to terminate (e.g. trapped at
> an unkillable retry loop due to some bug) after SIGKILL was sent to that
> process. Therefore, reporting when a thread is failing to terminate
> despite a fatal signal is pending would give us more useful information.
> 
> This version shares existing sysctl settings (e.g. check interval,
> timeout, whether to panic) used for detecting TASK_UNINTERRUPTIBLE
> threads, for I don't know whether people want to use a new kernel
> config option and different sysctl settings for monitoring killed
> threads.
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Cc: Dmitry Vyukov <dvyukov@google.com>

Looks good to me.

Acked-by: Paul E. McKenney <paulmck@linux.ibm.com>

A few inconsequential comments below.

> ---
>  include/linux/sched.h |  1 +
>  kernel/hung_task.c    | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 45 insertions(+)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a2cd1585..d42bdd7 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -850,6 +850,7 @@ struct task_struct {
>  #ifdef CONFIG_DETECT_HUNG_TASK
>  	unsigned long			last_switch_count;
>  	unsigned long			last_switch_time;
> +	unsigned long			killed_time;
>  #endif
>  	/* Filesystem information: */
>  	struct fs_struct		*fs;
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index f108a95..34e7b84 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -141,6 +141,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
>  	touch_nmi_watchdog();
>  }
>  
> +static void check_killed_task(struct task_struct *t, unsigned long timeout)
> +{
> +	unsigned long stamp = t->killed_time;
> +
> +	/*
> +	 * Ensure the task is not frozen.
> +	 * Also, skip vfork and any other user process that freezer should skip.
> +	 */
> +	if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
> +		return;
> +	/*
> +	 * Skip threads which are already inside do_exit(), for exit_mm() etc.
> +	 * might take many seconds.
> +	 */
> +	if (t->flags & PF_EXITING)
> +		return;
> +	if (!stamp) {
> +		stamp = jiffies;
> +		if (!stamp)
> +			stamp++;

Cute trick to avoid issues with jiffy overflow on 32-bit systems.  ;-)

> +		t->killed_time = stamp;
> +		return;
> +	}
> +	if (time_is_after_jiffies(stamp + timeout * HZ))

And if I understand correctly, timeout of zero disables everything, so
we don't get the backwards false-positive comparison above.

> +		return;
> +	trace_sched_process_hang(t);
> +	if (sysctl_hung_task_panic) {
> +		console_verbose();
> +		hung_task_call_panic = true;
> +	}
> +	/*
> +	 * This thread failed to terminate for more than
> +	 * sysctl_hung_task_timeout_secs seconds, complain:
> +	 */
> +	pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
> +	       t->comm, t->pid, (jiffies - stamp) / HZ);
> +	sched_show_task(t);
> +	hung_task_show_lock = true;
> +	touch_nmi_watchdog();
> +}
> +
>  /*
>   * To avoid extending the RCU grace period for an unbounded amount of time,
>   * periodically exit the critical section and enter a new one.
> @@ -192,6 +233,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>  				goto unlock;
>  			last_break = jiffies;
>  		}
> +		/* Check threads which are about to terminate. */
> +		if (unlikely(fatal_signal_pending(t)))
> +			check_killed_task(t, timeout);
>  		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
>  		if (t->state == TASK_UNINTERRUPTIBLE)
>  			check_hung_task(t, timeout);
> -- 
> 1.8.3.1
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-13 11:02 [PATCH] kernel/hung_task.c: Monitor killed tasks Tetsuo Handa
  2019-05-13 11:11 ` Dmitry Vyukov
  2019-05-14 22:28 ` Paul E. McKenney
@ 2019-05-15 10:55 ` Petr Mladek
  2019-05-16  8:19   ` Tetsuo Handa
  2 siblings, 1 reply; 15+ messages in thread
From: Petr Mladek @ 2019-05-15 10:55 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Andrew Morton, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Vitaly Kuznetsov, Liu Chuansheng, Valdis Kletnieks, linux-kernel,
	Dmitry Vyukov

On Mon 2019-05-13 20:02:11, Tetsuo Handa wrote:
> syzbot's second top report is "no output from test machine" where the
> userspace process failed to spawn a new test process for 300 seconds
> for some reason. One of reasons which can result in this report is that
> an already spawned test process was unable to terminate (e.g. trapped at
> an unkillable retry loop due to some bug) after SIGKILL was sent to that
> process. Therefore, reporting when a thread is failing to terminate
> despite a fatal signal is pending would give us more useful information.
> 
> This version shares existing sysctl settings (e.g. check interval,
> timeout, whether to panic) used for detecting TASK_UNINTERRUPTIBLE
> threads, for I don't know whether people want to use a new kernel
> config option and different sysctl settings for monitoring killed
> threads.
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Cc: Dmitry Vyukov <dvyukov@google.com>
> ---
>  include/linux/sched.h |  1 +
>  kernel/hung_task.c    | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 45 insertions(+)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a2cd1585..d42bdd7 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -850,6 +850,7 @@ struct task_struct {
>  #ifdef CONFIG_DETECT_HUNG_TASK
>  	unsigned long			last_switch_count;
>  	unsigned long			last_switch_time;
> +	unsigned long			killed_time;

I would call this fatal_signal_time to make the meaning more clear.

>  #endif
>  	/* Filesystem information: */
>  	struct fs_struct		*fs;
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index f108a95..34e7b84 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -141,6 +141,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
>  	touch_nmi_watchdog();
>  }
>  
> +static void check_killed_task(struct task_struct *t, unsigned long timeout)
> +{
> +	unsigned long stamp = t->killed_time;
> +
> +	/*
> +	 * Ensure the task is not frozen.
> +	 * Also, skip vfork and any other user process that freezer should skip.
> +	 */
> +	if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
> +		return;
> +	/*
> +	 * Skip threads which are already inside do_exit(), for exit_mm() etc.
> +	 * might take many seconds.
> +	 */
> +	if (t->flags & PF_EXITING)
> +		return;
> +	if (!stamp) {
> +		stamp = jiffies;
> +		if (!stamp)
> +			stamp++;
> +		t->killed_time = stamp;
> +		return;
> +	}

I might be too dumb but the above code looks pretty tricky to me.
It would deserve a comment. Or better, I would remove
trick to handle overflow. If it happens, we would just
lose one check period.

Alternative solution would be to set the timestamp in
complete_signal(). Then we would know that the timestamp
is always valid when a fatal signal is pending.


> +	if (time_is_after_jiffies(stamp + timeout * HZ))
> +		return;
> +	trace_sched_process_hang(t);
> +	if (sysctl_hung_task_panic) {
> +		console_verbose();
> +		hung_task_call_panic = true;

IMHO, the delayed task exit is much less fatal than sleeping
in an uninterruptible state.

Anyway, the check is much less reliable. In case of hung_task,
it is enough when the task gets scheduled. In the new check,
the task has to do some amount of work until the signal
gets handled and do_exit() is called.

The panic should either get enabled separately or we should
never panic in this case.

Best Regards,
Petr

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-15 10:55 ` Petr Mladek
@ 2019-05-16  8:19   ` Tetsuo Handa
  2019-05-16 11:57     ` Petr Mladek
  0 siblings, 1 reply; 15+ messages in thread
From: Tetsuo Handa @ 2019-05-16  8:19 UTC (permalink / raw)
  To: Petr Mladek
  Cc: Andrew Morton, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Vitaly Kuznetsov, Liu Chuansheng, Valdis Kletnieks, linux-kernel,
	Dmitry Vyukov

On 2019/05/15 19:55, Petr Mladek wrote:
>> +	if (!stamp) {
>> +		stamp = jiffies;
>> +		if (!stamp)
>> +			stamp++;
>> +		t->killed_time = stamp;
>> +		return;
>> +	}
> 
> I might be too dumb but the above code looks pretty tricky to me.
> It would deserve a comment. Or better, I would remove
> trick to handle overflow. If it happens, we would just
> lose one check period.

We can use

  static inline unsigned long jiffies_nonzero(void)
  {
      const unsigned long stamp = jiffies;

      return stamp ? stamp : -1;
  }

or even the shortcut "jiffies | 1", because a difference of one jiffy
is within measurement error for a timeout of multiple HZ.

> 
> Alternative solution would be to set the timestamp in
> complete_signal(). Then we would know that the timestamp
> is always valid when a fatal signal is pending.

Yes. But I guess that since a signal might be delivered just before
setting PF_FROZEN and the thread might be kept frozen for longer
than the timeout, we will need to reset the timestamp just before
clearing PF_FROZEN.



>> +	if (time_is_after_jiffies(stamp + timeout * HZ))
>> +		return;
>> +	trace_sched_process_hang(t);
>> +	if (sysctl_hung_task_panic) {
>> +		console_verbose();
>> +		hung_task_call_panic = true;
> 
> IMHO, the delayed task exit is much less fatal than sleeping
> in an uninterruptible state.
> 
> Anyway, the check is much less reliable. In case of hung_task,
> it is enough when the task gets scheduled. In the new check,
> the task has to do some amount of work until the signal
> gets handled and do_exit() is called.
> 
> The panic should either get enabled separately or we should
> never panic in this case.

OK, we should not share existing sysctl settings.

But in the context of syzbot's testing where there are only 2 CPUs
in the target VM (which means only a small number of threads and
not so much memory) and threads get SIGKILL 5 seconds after fork(),
being unable to reach do_exit() within 10 seconds is likely a sign that
something went wrong. For example, 6 out of 7 trials of a reproducer for
https://syzkaller.appspot.com/bug?id=835a0b9e75b14b55112661cbc61ca8b8f0edf767
resulted in "no output from test machine" rather than "task hung".
This patch is revealing that such killed threads are failing to reach
do_exit() because they are trapped in an unkillable retry loop due to a
race bug.

Therefore, I would like to try this patch in linux-next.git for feasibility
testing whether this patch helps finding more bugs and reproducers for such
bugs, by bringing "unable to terminate threads" reports out of "no output from
test machine" reports. We can add sysctl settings before sending to linux.git.

Any questions?


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-16  8:19   ` Tetsuo Handa
@ 2019-05-16 11:57     ` Petr Mladek
  2019-05-16 12:38       ` Tetsuo Handa
  0 siblings, 1 reply; 15+ messages in thread
From: Petr Mladek @ 2019-05-16 11:57 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Andrew Morton, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Vitaly Kuznetsov, Liu Chuansheng, Valdis Kletnieks, linux-kernel,
	Dmitry Vyukov, Stephen Rothwell

CCed Stephen to discuss linux-next related question at the bottom
of the mail.

On Thu 2019-05-16 17:19:12, Tetsuo Handa wrote:
> On 2019/05/15 19:55, Petr Mladek wrote:
> >> +	if (!stamp) {
> >> +		stamp = jiffies;
> >> +		if (!stamp)
> >> +			stamp++;
> >> +		t->killed_time = stamp;
> >> +		return;
> >> +	}
> > 
> > I might be too dumb but the above code looks pretty tricky to me.
> > It would deserve a comment. Or better, I would remove
> > trick to handle overflow. If it happens, we would just
> > lose one check period.
> 
> We can use
> 
>   static inline unsigned long jiffies_nonzero(void)
>   {
>       const unsigned long stamp = jiffies;
> 
>       return stamp ? stamp : -1;
>   }
> 
> or even shortcut "jiffies | 1" because difference by one jiffie
> is an measurement error for multiple HZ of timeout.

I would just ignore the overflow. We would just start measuring
the timeout in the next check_hung_task() call. It is not
a big deal and removes few lines of a tricky code.

> >> +	if (time_is_after_jiffies(stamp + timeout * HZ))
> >> +		return;
> >> +	trace_sched_process_hang(t);
> >> +	if (sysctl_hung_task_panic) {
> >> +		console_verbose();
> >> +		hung_task_call_panic = true;
> > 
> > IMHO, the delayed task exit is much less fatal than sleeping
> > in an uninterruptible state.
> > 
> > Anyway, the check is much less reliable. In case of hung_task,
> > it is enough when the task gets scheduled. In the new check,
> > the task has to do some amount of work until the signal
> > gets handled and do_exit() is called.
> > 
> > The panic should either get enabled separately or we should
> > never panic in this case.
> 
> OK, we should not share existing sysctl settings.
> 
> But in the context of syzbot's testing where there are only 2 CPUs
> in the target VM (which means that only small number of threads and
> not so much memory) and threads get SIGKILL after 5 seconds from fork(),
> being unable to reach do_exit() within 10 seconds is likely a sign of
> something went wrong. For example, 6 out of 7 trials of a reproducer for
> https://syzkaller.appspot.com/bug?id=835a0b9e75b14b55112661cbc61ca8b8f0edf767
> resulted in "no output from test machine" rather than "task hung".
> This patch is revealing that such killed threads are failing to reach
> do_exit() because they are trapped at unkillable retry loop due to a
> race bug.
> 
> Therefore, I would like to try this patch in linux-next.git for feasibility
> testing whether this patch helps finding more bugs and reproducers for such
> bugs, by bringing "unable to terminate threads" reports out of "no output from
> test machine" reports. We can add sysctl settings before sending to linux.git.

In this case, the watchdog should get enabled only with
CONFIG_DEBUG_AID_FOR_SYZBOT

Also we should ask/inform Stephen about this. I am not sure
if he is willing to resolve eventual conflicts for these
syzbot-specific patches that are not upstream candidates.

A solution might be to create a syzbot-specific for-next branch
that Stephen might simply ignore when there are conflicts.
And you would be responsible for updating it.

Best Regards,
Petr

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-16 11:57     ` Petr Mladek
@ 2019-05-16 12:38       ` Tetsuo Handa
  2019-05-22 12:38         ` Tetsuo Handa
  2019-05-27 14:12         ` Tetsuo Handa
  0 siblings, 2 replies; 15+ messages in thread
From: Tetsuo Handa @ 2019-05-16 12:38 UTC (permalink / raw)
  To: Petr Mladek
  Cc: Andrew Morton, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Vitaly Kuznetsov, Liu Chuansheng, Valdis Kletnieks, linux-kernel,
	Dmitry Vyukov, Stephen Rothwell

On 2019/05/16 20:57, Petr Mladek wrote:
> CCed Stephen to discuss linux-next related question at the bottom
> of the mail.
> 
> On Thu 2019-05-16 17:19:12, Tetsuo Handa wrote:
>> On 2019/05/15 19:55, Petr Mladek wrote:
>> But in the context of syzbot's testing where there are only 2 CPUs
>> in the target VM (which means that only small number of threads and
>> not so much memory) and threads get SIGKILL after 5 seconds from fork(),
>> being unable to reach do_exit() within 10 seconds is likely a sign of
>> something went wrong. For example, 6 out of 7 trials of a reproducer for
>> https://syzkaller.appspot.com/bug?id=835a0b9e75b14b55112661cbc61ca8b8f0edf767
>> resulted in "no output from test machine" rather than "task hung".
>> This patch is revealing that such killed threads are failing to reach
>> do_exit() because they are trapped at unkillable retry loop due to a
>> race bug.
>>
>> Therefore, I would like to try this patch in linux-next.git for feasibility
>> testing whether this patch helps finding more bugs and reproducers for such
>> bugs, by bringing "unable to terminate threads" reports out of "no output from
>> test machine" reports. We can add sysctl settings before sending to linux.git.
> 
> In this case, the watchdog should get enabled on with
> CONFIG_DEBUG_AID_FOR_SYZBOT

Since "[PATCH] printk: Monitor change of console loglevel." is a one-time patch
(only needed until we find the reason for the silence), testing only on
linux-next.git is sufficient, and it gets enabled only with CONFIG_DEBUG_AID_FOR_SYZBOT.

> 
> Also we should ask/inform Stephen about this. I am not sure
> if he is willing to resolve eventual conflicts for these
> syzboot-specific patches that are not upstream candidates.
> 
> A solution might be to create sysbot-specific for-next branch
> that Stephen might simply ignore when there are conflicts.
> And you would be responsible for updating it.

syzbot tests not only linux-next.git but also various other trees, and the
tests attempted depend on the target git tree. Therefore, apart from whether we
can introduce a kernel config option for fuzz testing,
"[PATCH] kernel/hung_task.c: Monitor killed tasks." is expected to be
in linux.git. This patch will eventually become an upstream candidate.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-16 12:38       ` Tetsuo Handa
@ 2019-05-22 12:38         ` Tetsuo Handa
  2019-05-22 13:41           ` Stephen Rothwell
  2019-05-27 14:12         ` Tetsuo Handa
  1 sibling, 1 reply; 15+ messages in thread
From: Tetsuo Handa @ 2019-05-22 12:38 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Petr Mladek, Andrew Morton, Ingo Molnar, Peter Zijlstra,
	Paul E. McKenney, Vitaly Kuznetsov, Liu Chuansheng,
	Valdis Kletnieks, linux-kernel, Dmitry Vyukov

Hello, Stephen.

I want to send debug printk() patches to linux-next.git. Petr Mladek
is suggesting that I have a git tree for debug printk() patches.
But it seems that there is a "git quiltimport" command, and I prefer
"subversion + quilt", and I don't have trees for sending "git pull"
requests. Therefore, just ignoring "git quiltimport" failures is fine.
What do you think?

On 2019/05/16 21:38, Tetsuo Handa wrote:
> On 2019/05/16 20:57, Petr Mladek wrote:
>> CCed Stephen to discuss linux-next related question at the bottom
>> of the mail.
>>
>> On Thu 2019-05-16 17:19:12, Tetsuo Handa wrote:
>>> On 2019/05/15 19:55, Petr Mladek wrote:
>>> But in the context of syzbot's testing where there are only 2 CPUs
>>> in the target VM (which means that only small number of threads and
>>> not so much memory) and threads get SIGKILL after 5 seconds from fork(),
>>> being unable to reach do_exit() within 10 seconds is likely a sign of
>>> something went wrong. For example, 6 out of 7 trials of a reproducer for
>>> https://syzkaller.appspot.com/bug?id=835a0b9e75b14b55112661cbc61ca8b8f0edf767
>>> resulted in "no output from test machine" rather than "task hung".
>>> This patch is revealing that such killed threads are failing to reach
>>> do_exit() because they are trapped at unkillable retry loop due to a
>>> race bug.
>>>
>>> Therefore, I would like to try this patch in linux-next.git for feasibility
>>> testing whether this patch helps finding more bugs and reproducers for such
>>> bugs, by bringing "unable to terminate threads" reports out of "no output from
>>> test machine" reports. We can add sysctl settings before sending to linux.git.
>>
>> In this case, the watchdog should get enabled on with
>> CONFIG_DEBUG_AID_FOR_SYZBOT
> 
> Since "[PATCH] printk: Monitor change of console loglevel." is one time (only
> needed until we find the reason of silence), testing on only linux-next.git
> is sufficient and it gets enabled on with CONFIG_DEBUG_AID_FOR_SYZBOT.
> 
>>
>> Also we should ask/inform Stephen about this. I am not sure
>> if he is willing to resolve eventual conflicts for these
>> syzboot-specific patches that are not upstream candidates.
>>
>> A solution might be to create sysbot-specific for-next branch
>> that Stephen might simply ignore when there are conflicts.
>> And you would be responsible for updating it.
> 
> syzbot tests not only linux-next.git but also various trees, and tests
> attempted depends on target git tree. Therefore, apart from whether we
> can introduce a kernel config option for fuzzing testing,
> "[PATCH] kernel/hung_task.c: Monitor killed tasks." is expected to be
> in linux.git. This patch will eventually become upstream candidate.
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-22 12:38         ` Tetsuo Handa
@ 2019-05-22 13:41           ` Stephen Rothwell
  2019-05-22 14:58             ` Tetsuo Handa
  0 siblings, 1 reply; 15+ messages in thread
From: Stephen Rothwell @ 2019-05-22 13:41 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Petr Mladek, Andrew Morton, Ingo Molnar, Peter Zijlstra,
	Paul E. McKenney, Vitaly Kuznetsov, Liu Chuansheng,
	Valdis Kletnieks, linux-kernel, Dmitry Vyukov

[-- Attachment #1: Type: text/plain, Size: 883 bytes --]

Hi Tetsuo,

On Wed, 22 May 2019 21:38:45 +0900 Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> I want to send debug printk() patches to linux-next.git. Petr Mladek
> is suggesting me to have a git tree for debug printk() patches.
> But it seems that there is "git quiltimport" command, and I prefer
> "subversion + quilt", and I don't have trees for sending "git pull"
> requests. Therefore, just ignoring "git quiltimport" failure is fine.
> What do you think?

Sure, we can try.  I already have one quilt tree (besides Andrew's) in
linux-next, but much prefer a git tree.  If you have to use a quilt
tree, I will import it into a local branch on the base you tell me to
and then fetch it every morning and reimport it if it changes.  I will
then merge it like any other git branch.  Let me know what you can deal
with.
-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-22 13:41           ` Stephen Rothwell
@ 2019-05-22 14:58             ` Tetsuo Handa
  2019-05-22 21:09               ` Tetsuo Handa
  0 siblings, 1 reply; 15+ messages in thread
From: Tetsuo Handa @ 2019-05-22 14:58 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Petr Mladek, Andrew Morton, Ingo Molnar, Peter Zijlstra,
	Paul E. McKenney, Vitaly Kuznetsov, Liu Chuansheng,
	Valdis Kletnieks, linux-kernel, Dmitry Vyukov

On 2019/05/22 22:41, Stephen Rothwell wrote:
> Hi Tetsuo,
> 
> On Wed, 22 May 2019 21:38:45 +0900 Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
>>
>> I want to send debug printk() patches to linux-next.git. Petr Mladek
>> is suggesting me to have a git tree for debug printk() patches.
>> But it seems that there is "git quiltimport" command, and I prefer
>> "subversion + quilt", and I don't have trees for sending "git pull"
>> requests. Therefore, just ignoring "git quiltimport" failure is fine.
>> What do you think?
> 
> Sure, we can try.  I already have one quilt tree (besides Andrew's) in
> linux-next, but much prefer a git tree.  If you have to use a quilt
> tree, I will import it into a local branch on the base you tell me to
> and then fetch it every morning and reimport it if it changes.  I will
> then merge it like any other git branch.  Let me know what you can deal
> with.
> 

What I do for making patches is:

  git fetch --tags
  git reset --hard next-$date
  edit files
  git commit -a -s
  git format-patch -1
  git send-email --to=$recipient 0001-*.patch

I'm sure I will confuse git history/repository everyday if
I try to send changes using git. For my skill level, managing
0001-*.patch in a subversion repository is the simplest and safest.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-22 14:58             ` Tetsuo Handa
@ 2019-05-22 21:09               ` Tetsuo Handa
  2019-05-22 21:39                 ` Stephen Rothwell
  0 siblings, 1 reply; 15+ messages in thread
From: Tetsuo Handa @ 2019-05-22 21:09 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Petr Mladek, Andrew Morton, Ingo Molnar, Peter Zijlstra,
	Paul E. McKenney, Vitaly Kuznetsov, Liu Chuansheng,
	Valdis Kletnieks, linux-kernel, Dmitry Vyukov

On 2019/05/22 23:58, Tetsuo Handa wrote:
> On 2019/05/22 22:41, Stephen Rothwell wrote:
>> Hi Tetsuo,
>>
>> On Wed, 22 May 2019 21:38:45 +0900 Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
>>>
>>> I want to send debug printk() patches to linux-next.git. Petr Mladek
>>> is suggesting me to have a git tree for debug printk() patches.
>>> But it seems that there is "git quiltimport" command, and I prefer
>>> "subversion + quilt", and I don't have trees for sending "git pull"
>>> requests. Therefore, just ignoring "git quiltimport" failure is fine.
>>> What do you think?
>>
>> Sure, we can try.  I already have one quilt tree (besides Andrew's) in
>> linux-next, but much prefer a git tree.  If you have to use a quilt
>> tree, I will import it into a local branch on the base you tell me to
>> and then fetch it every morning and reimport it if it changes.  I will
>> then merge it like any other git branch.  Let me know what you can deal
>> with.
>>
> 
> What I do for making patches is:
> 
>   git fetch --tags
>   git reset --hard next-$date
>   edit files
>   git commit -a -s
>   git format-patch -1
>   git send-email --to=$recipient 0001-*.patch
> 
> I'm sure I will confuse git history/repository everyday if
> I try to send changes using git. For my skill level, managing
> 0001-*.patch in a subversion repository is the simplest and safest.
> 

I put an example patch into my subversion repository:

  svn checkout https://svn.osdn.net/svnroot/tomoyo/branches/syzbot-patches/

To fetch up-to-date debug printk() patches:

  cd syzbot-patches
  svn update

Does this work for you?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-22 21:09               ` Tetsuo Handa
@ 2019-05-22 21:39                 ` Stephen Rothwell
  2019-05-22 21:43                   ` Andrew Morton
  0 siblings, 1 reply; 15+ messages in thread
From: Stephen Rothwell @ 2019-05-22 21:39 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Petr Mladek, Andrew Morton, Ingo Molnar, Peter Zijlstra,
	Paul E. McKenney, Vitaly Kuznetsov, Liu Chuansheng,
	Valdis Kletnieks, linux-kernel, Dmitry Vyukov

[-- Attachment #1: Type: text/plain, Size: 1101 bytes --]

Hi Tetsuo,

On Thu, 23 May 2019 06:09:07 +0900 Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> > What I do for making patches is:
> > 
> >   git fetch --tags
> >   git reset --hard next-$date
> >   edit files
> >   git commit -a -s
> >   git format-patch -1
> >   git send-email --to=$recipient 0001-*.patch
> > 
> > I'm sure I will confuse git history/repository everyday if
> > I try to send changes using git. For my skill level, managing
> > 0001-*.patch in a subversion repository is the simplest and safest.
> >   
> 
> I put an example patch into my subversion repository:
> 
>   svn checkout https://svn.osdn.net/svnroot/tomoyo/branches/syzbot-patches/
> 
> To fetch up-to-date debug printk() patches:
> 
>   cd syzbot-patches
>   svn update
> 
> Does this work for you?

Neither will fit into my normal workflow.

So, tell me, what are you trying to do?  What does your work depend on?
Just Linus' tree, or something already in linux-next?  Why would you
want to keep moving your patch(es) on top of linux-next?

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-22 21:39                 ` Stephen Rothwell
@ 2019-05-22 21:43                   ` Andrew Morton
  2019-05-22 23:46                     ` Tetsuo Handa
  0 siblings, 1 reply; 15+ messages in thread
From: Andrew Morton @ 2019-05-22 21:43 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Tetsuo Handa, Petr Mladek, Ingo Molnar, Peter Zijlstra,
	Paul E. McKenney, Vitaly Kuznetsov, Liu Chuansheng,
	Valdis Kletnieks, linux-kernel, Dmitry Vyukov

On Thu, 23 May 2019 07:39:25 +1000 Stephen Rothwell <sfr@canb.auug.org.au> wrote:

> > I put an example patch into my subversion repository:
> > 
> >   svn checkout https://svn.osdn.net/svnroot/tomoyo/branches/syzbot-patches/
> > 
> > To fetch up-to-date debug printk() patches:
> > 
> >   cd syzbot-patches
> >   svn update
> > 
> > Does this work for you?
> 
> Neither will fit into my normal workflow.
> 
> So, tell me, what are you trying to do?  What does your work depend on?
> Just Linus' tree, or something already in linux-next?  Why would you
> want to keep moving your patch(es) on top of linux-next?

um, I can carry developer-only linux-next debug patches.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-22 21:43                   ` Andrew Morton
@ 2019-05-22 23:46                     ` Tetsuo Handa
  0 siblings, 0 replies; 15+ messages in thread
From: Tetsuo Handa @ 2019-05-22 23:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Stephen Rothwell, Petr Mladek, Ingo Molnar, Peter Zijlstra,
	Paul E. McKenney, Vitaly Kuznetsov, Liu Chuansheng,
	Valdis Kletnieks, linux-kernel, Dmitry Vyukov

Andrew Morton wrote:
> On Thu, 23 May 2019 07:39:25 +1000 Stephen Rothwell <sfr@canb.auug.org.au> wrote:
> 
> > > I put an example patch into my subversion repository:
> > > 
> > >   svn checkout https://svn.osdn.net/svnroot/tomoyo/branches/syzbot-patches/
> > > 
> > > To fetch up-to-date debug printk() patches:
> > > 
> > >   cd syzbot-patches
> > >   svn update
> > > 
> > > Does this work for you?
> > 
> > Neither will fit into my normal workflow.
> > 
> > So, tell me, what are you trying to do?  What does your work depend on?
> > Just Linus' tree, or something already in linux-next?  Why would you
> > want to keep moving your patch(es) on top of linux-next?

"[PATCH] printk: Monitor change of console loglevel." is targeted for
linux-next only, and I estimate that this patch will be removed in a
week or so, for syzbot can reproduce this problem using linux-next and
syzbot will blacklist testcases causing this problem.

"[PATCH] kernel/hung_task.c: Monitor killed tasks." is targeted for upstream, for
syzbot is hitting this problem in any tree and this will be a kernel's problem.
But for feasibility check, for now I want to try this patch on only linux-next.
I guess we need to tune (e.g. add sysctl) before sending to linux.git tree.

I am seeking an approach which is less of a burden for both of you. It
seems that using Andrew's route fits better with Stephen's workflow.

> 
> um, I can carry developer-only linux-next debug patches.
> 

OK. Then, will you carry these patches?

Regards.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kernel/hung_task.c: Monitor killed tasks.
  2019-05-16 12:38       ` Tetsuo Handa
  2019-05-22 12:38         ` Tetsuo Handa
@ 2019-05-27 14:12         ` Tetsuo Handa
  1 sibling, 0 replies; 15+ messages in thread
From: Tetsuo Handa @ 2019-05-27 14:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Petr Mladek, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Vitaly Kuznetsov, Liu Chuansheng, Valdis Kletnieks, linux-kernel,
	Dmitry Vyukov, Stephen Rothwell, Linus Torvalds

Andrew, I updated the description part. Please carry this patch.
----------
[PATCH] kernel/hung_task.c: Monitor killed tasks.

syzbot's current top report is "no output from test machine" where the
userspace process failed to spawn a new test process for 300 seconds
for some reason. One of reasons which can result in this report is that
an already spawned test process was unable to terminate (e.g. trapped at
an unkillable retry loop due to some bug) after SIGKILL was sent to that
process. Therefore, reporting when a thread is failing to terminate
despite a fatal signal is pending would give us more useful information.

In the context of syzbot's testing where there are only 2 CPUs in the
target VM (which means only a small number of threads and not so much
memory) and threads get SIGKILL 5 seconds after fork(), being unable
to reach do_exit() within 10 seconds is likely a sign that something went
wrong. Therefore, I would like to try this patch in linux-next.git for
feasibility testing whether this patch helps finding more bugs and
reproducers for such bugs, by bringing "unable to terminate threads"
reports out of "no output from test machine" reports.

A potential bad effect of this patch is that kernel code may be made
killable without addressing the root cause of being unable to terminate.
Use of killable waits bypasses both the TASK_UNINTERRUPTIBLE stall test
and the SIGKILL-after-5-seconds behavior, so such problems would go
undetected in real systems, where SIGKILL is not sent after 5 seconds
when something goes wrong.

This version shares existing sysctl settings (e.g. check interval,
timeout, whether to panic) used for detecting TASK_UNINTERRUPTIBLE
threads. We will likely want to use different sysctl settings for
monitoring killed threads. But let's start as linux-next.git patch
without introducing new sysctl settings. We can add sysctl settings
before sending to linux.git.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Dmitry Vyukov <dvyukov@google.com>
---

 include/linux/sched.h |  1 +
 kernel/hung_task.c    | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a2cd1585..d42bdd7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -850,6 +850,7 @@ struct task_struct {
 #ifdef CONFIG_DETECT_HUNG_TASK
 	unsigned long			last_switch_count;
 	unsigned long			last_switch_time;
+	unsigned long			killed_time;
 #endif
 	/* Filesystem information: */
 	struct fs_struct		*fs;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index f108a95..34e7b84 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -141,6 +141,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 	touch_nmi_watchdog();
 }
 
+static void check_killed_task(struct task_struct *t, unsigned long timeout)
+{
+	unsigned long stamp = t->killed_time;
+
+	/*
+	 * Ensure the task is not frozen.
+	 * Also, skip vfork and any other user process that freezer should skip.
+	 */
+	if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+		return;
+	/*
+	 * Skip threads which are already inside do_exit(), for exit_mm() etc.
+	 * might take many seconds.
+	 */
+	if (t->flags & PF_EXITING)
+		return;
+	if (!stamp) {
+		stamp = jiffies;
+		if (!stamp)
+			stamp++;
+		t->killed_time = stamp;
+		return;
+	}
+	if (time_is_after_jiffies(stamp + timeout * HZ))
+		return;
+	trace_sched_process_hang(t);
+	if (sysctl_hung_task_panic) {
+		console_verbose();
+		hung_task_call_panic = true;
+	}
+	/*
+	 * This thread failed to terminate for more than
+	 * sysctl_hung_task_timeout_secs seconds, complain:
+	 */
+	pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
+	       t->comm, t->pid, (jiffies - stamp) / HZ);
+	sched_show_task(t);
+	hung_task_show_lock = true;
+	touch_nmi_watchdog();
+}
+
 /*
  * To avoid extending the RCU grace period for an unbounded amount of time,
  * periodically exit the critical section and enter a new one.
@@ -192,6 +233,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 				goto unlock;
 			last_break = jiffies;
 		}
+		/* Check threads which are about to terminate. */
+		if (unlikely(fatal_signal_pending(t)))
+			check_killed_task(t, timeout);
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, timeout);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2019-05-27 14:14 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-13 11:02 [PATCH] kernel/hung_task.c: Monitor killed tasks Tetsuo Handa
2019-05-13 11:11 ` Dmitry Vyukov
2019-05-14 22:28 ` Paul E. McKenney
2019-05-15 10:55 ` Petr Mladek
2019-05-16  8:19   ` Tetsuo Handa
2019-05-16 11:57     ` Petr Mladek
2019-05-16 12:38       ` Tetsuo Handa
2019-05-22 12:38         ` Tetsuo Handa
2019-05-22 13:41           ` Stephen Rothwell
2019-05-22 14:58             ` Tetsuo Handa
2019-05-22 21:09               ` Tetsuo Handa
2019-05-22 21:39                 ` Stephen Rothwell
2019-05-22 21:43                   ` Andrew Morton
2019-05-22 23:46                     ` Tetsuo Handa
2019-05-27 14:12         ` Tetsuo Handa

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.