linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.
@ 2021-04-17 14:13 zhouchuangao
  2021-04-17 14:36 ` Peter Zijlstra
  2021-04-17 14:44 ` Tetsuo Handa
  0 siblings, 2 replies; 7+ messages in thread
From: zhouchuangao @ 2021-04-17 14:13 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Andrew Morton, Kees Cook,
	Tetsuo Handa, Michal Hocko, Guilherme G. Piccoli,
	Vlastimil Babka, Lukas Bulwahn, zhouchuangao, linux-kernel

The main purpose of this patch is to add a whitelist and blacklist
mechanism to the hung task thread.

1. Add a /sys/module/hung_task/parameters/hung_seconds interface,
so you can write a specific time to 'hung_seconds' to hang a shell
process for a while. This interface is convenient for us to debug
the function of hung task thread.
eg:
echo 100 > /sys/module/hung_task/parameters/hung_seconds
This will put the current shell process into D state for 100s.

2. Add whitelist and blacklist. If a D state process is on the
whitelist, it will skip checking for that process. In contrast, if
the process is on a blacklist, panic is triggered.
Different use scenarios can make different whitelist and blacklist.

eg:
In Android system, we usually and some processes to the whitelist.
static task_t task_whitelist[] = {
	{"mdrt_thread", HUNG_TASK_WHITELIST},
	{"chre_kthread", HUNG_TASK_WHITELIST},
	{"scp_power_reset", HUNG_TASK_WHITELIST},
	{"ccci_fsm1", HUNG_TASK_WHITELIST},
	{"qos_ipi_recv", HUNG_TASK_WHITELIST},
	{NULL, 0},
};

3. Add a new member hung_state to task_struct to identify the type
of the D state process being detected.
A value of 0x1(HUNG_TASK_WHITELIST) indicates that the process is
on the whitelist.
A value of 0x2(HUNG_TASK_BLACKLIST) indicates that the process is
on the blacklist.
The remaining bits are reserved, and there may be other scenarios
entering the D state that need to be added to hung_state.

Signed-off-by: zhouchuangao <zhouchuangao@vivo.com>
---
 include/linux/sched.h |  1 +
 kernel/hung_task.c    | 88 +++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d5264b..8ffbdf6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -973,6 +973,7 @@ struct task_struct {
 	unsigned long			last_switch_count;
 	unsigned long			last_switch_time;
 	unsigned long			killed_time;
+	unsigned long			hung_state;
 #endif
 	/* Filesystem information: */
 	struct fs_struct		*fs;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index bb2e3e1..952a44c 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -72,6 +72,57 @@ unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
 unsigned int __read_mostly sysctl_hung_task_panic =
 				CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
 
+#define HUNG_TASK_WHITELIST		0x1
+#define HUNG_TASK_BLACKLIST		0x2
+
+typedef struct {
+	char *task_comm;
+	unsigned long flag;
+} task_t;
+
+static task_t task_whitelist[] = {
+	{NULL, 0},
+};
+
+static task_t task_blacklist[] = {
+	{"init", HUNG_TASK_BLACKLIST},
+	{NULL, 0},
+};
+
+static bool task_in_blacklist(struct task_struct *tsk)
+{
+	task_t *info = NULL;
+
+	if (tsk->hung_state & HUNG_TASK_BLACKLIST)
+		return true;
+
+	for (info = task_blacklist; info->task_comm; info++) {
+		if (!strcmp(tsk->comm, info->task_comm)) {
+			tsk->hung_state |= HUNG_TASK_BLACKLIST;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static bool task_in_whitelist(struct task_struct *tsk)
+{
+	task_t *info = NULL;
+
+	if (tsk->hung_state & HUNG_TASK_WHITELIST)
+		return true;
+
+	for (info = task_whitelist; info->task_comm; info++) {
+		if (!strcmp(tsk->comm, info->task_comm)) {
+			tsk->hung_state |= HUNG_TASK_WHITELIST;
+			return true;
+		}
+	}
+
+	return false;
+}
+
 static int
 hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
 {
@@ -111,6 +162,12 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 	if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
 		return;
 
+	/* Check if process 't' is in the whitelist. */
+	if (task_in_whitelist(t)) {
+		pr_info("skip hung task: %s\n", t->comm);
+		return;
+	}
+
 	trace_sched_process_hang(t);
 
 	if (sysctl_hung_task_panic) {
@@ -118,7 +175,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 		hung_task_show_lock = true;
 		hung_task_call_panic = true;
 	}
-
 	/*
 	 * Ok, the task did not get scheduled for more than 2 minutes,
 	 * complain:
@@ -141,6 +197,12 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 			hung_task_show_all_bt = true;
 	}
 
+	/* Check if process 't' is in the blacklist. */
+	if (task_in_blacklist(t)) {
+		pr_err("critical task blocked: (%d)%s\n", t->pid, t->comm);
+		hung_task_call_panic = true;
+	}
+
 	touch_nmi_watchdog();
 }
 
@@ -253,8 +315,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 		trigger_all_cpu_backtrace();
 	}
 
-	if (hung_task_call_panic)
+	if (hung_task_call_panic) {
+		show_state_filter(TASK_UNINTERRUPTIBLE);
 		panic("hung_task: blocked tasks");
+	}
 }
 
 static long hung_timeout_jiffies(unsigned long last_checked,
@@ -322,6 +386,7 @@ static int watchdog(void *dummy)
 	unsigned long hung_last_checked = jiffies;
 
 	set_user_nice(current, 0);
+	pr_info("khungtaskd started...\n");
 
 	for ( ; ; ) {
 		unsigned long timeout = sysctl_hung_task_timeout_secs;
@@ -357,3 +422,22 @@ static int __init hung_task_init(void)
 	return 0;
 }
 subsys_initcall(hung_task_init);
+
+static int hung_task_test(const char *val, const struct kernel_param *kp)
+{
+	unsigned long sec = 0;
+	if (kstrtoul(val, 10, &sec))
+		return -EINVAL;
+
+	pr_info("Hung task Dsleep %ld s, start!\n", sec);
+	msleep(sec * 1000);
+	pr_info("Hung task Dsleep %ld s, end!\n", sec);
+
+	return 0;
+}
+
+static const struct kernel_param_ops hung_task_test_ops = {
+	.set = hung_task_test,
+};
+
+module_param_cb(hung_seconds, &hung_task_test_ops, NULL, 0600);
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.
  2021-04-17 14:13 [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism zhouchuangao
@ 2021-04-17 14:36 ` Peter Zijlstra
  2021-04-19  1:46   ` 周传高
  2021-04-17 14:44 ` Tetsuo Handa
  1 sibling, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2021-04-17 14:36 UTC (permalink / raw)
  To: zhouchuangao
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Andrew Morton, Kees Cook,
	Tetsuo Handa, Michal Hocko, Guilherme G. Piccoli,
	Vlastimil Babka, Lukas Bulwahn, linux-kernel

On Sat, Apr 17, 2021 at 07:13:01AM -0700, zhouchuangao wrote:
> eg:
> In Android system, we usually and some processes to the whitelist.
> static task_t task_whitelist[] = {
> 	{"mdrt_thread", HUNG_TASK_WHITELIST},
> 	{"chre_kthread", HUNG_TASK_WHITELIST},
> 	{"scp_power_reset", HUNG_TASK_WHITELIST},
> 	{"ccci_fsm1", HUNG_TASK_WHITELIST},
> 	{"qos_ipi_recv", HUNG_TASK_WHITELIST},
> 	{NULL, 0},
> };

What are these tasks doing that the hung-task detector fires on them?
Should you fix that instead?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.
  2021-04-17 14:13 [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism zhouchuangao
  2021-04-17 14:36 ` Peter Zijlstra
@ 2021-04-17 14:44 ` Tetsuo Handa
  2021-04-19  2:22   ` 周传高
  1 sibling, 1 reply; 7+ messages in thread
From: Tetsuo Handa @ 2021-04-17 14:44 UTC (permalink / raw)
  To: zhouchuangao
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Andrew Morton, Kees Cook,
	Michal Hocko, Guilherme G. Piccoli, Vlastimil Babka,
	Lukas Bulwahn, linux-kernel

On 2021/04/17 23:13, zhouchuangao wrote:
> The main purpose of this patch is to add a whitelist and blacklist
> mechanism to the hung task thread.

We stopped using the term 'whitelist'/'blacklist' for new code in Linux kernel,
and what you are proposing is something like 'ignorelist'/'fatallist'.

I think that matching based on comm name is poor, for comm name is subjected to
impersonation by malicious user processes.

Moreover, speak of syzkaller testing, most of hang task reports are reaction to
somebody else consuming too much CPU resources (e.g. printk() flooding, too many
pending workqueue requests). Even if some process is in 'ignorelist', it is
possible that some problem that should be reported is already happening. Even if
some process is in 'fatallist', it is possible that the cause of hang is simply
somebody else is consuming too much CPU.

By the way, I wish that khungtaskd can report recent top CPU consumers, for it is
rare that the cause of hung is locking dependency problems / hardware problems.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re:Re: [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.
  2021-04-17 14:36 ` Peter Zijlstra
@ 2021-04-19  1:46   ` 周传高
  2021-04-19  7:45     ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: 周传高 @ 2021-04-19  1:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Andrew Morton, Kees Cook,
	Tetsuo Handa, Michal Hocko, Guilherme G. Piccoli,
	Vlastimil Babka, Lukas Bulwahn, linux-kernel


>On Sat, Apr 17, 2021 at 07:13:01AM -0700, zhouchuangao wrote:
>> eg:
>> In Android system, we usually and some processes to the whitelist.
>> static task_t task_whitelist[] = {
>> 	{"mdrt_thread", HUNG_TASK_WHITELIST},
>> 	{"chre_kthread", HUNG_TASK_WHITELIST},
>> 	{"scp_power_reset", HUNG_TASK_WHITELIST},
>> 	{"ccci_fsm1", HUNG_TASK_WHITELIST},
>> 	{"qos_ipi_recv", HUNG_TASK_WHITELIST},
>> 	{NULL, 0},
>> };
>
>What are these tasks doing that the hung-task detector fires on them?
>Should you fix that instead?

These tasks are implemented by the SoC vendor, and normally they do
not configure HUNG TASK, so we need to ignore these tasks if we use
HUNG TASK. 
Here I want to provide a ignorelist and fatallist mechanism, different
users or use scenarios can be modified accordingly. 
The default ignorelist or fatallist is empty.

Thanks,
zhouchuangao


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re:Re: [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.
  2021-04-17 14:44 ` Tetsuo Handa
@ 2021-04-19  2:22   ` 周传高
  2021-04-19  2:38     ` Tetsuo Handa
  0 siblings, 1 reply; 7+ messages in thread
From: 周传高 @ 2021-04-19  2:22 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Andrew Morton, Kees Cook,
	Michal Hocko, Guilherme G. Piccoli, Vlastimil Babka,
	Lukas Bulwahn, linux-kernel


>On 2021/04/17 23:13, zhouchuangao wrote:
>> The main purpose of this patch is to add a whitelist and blacklist
>> mechanism to the hung task thread.
>
>We stopped using the term 'whitelist'/'blacklist' for new code in Linux kernel,
>and what you are proposing is something like 'ignorelist'/'fatallist'.
>
>I think that matching based on comm name is poor, for comm name is subjected to
>impersonation by malicious user processes.
>
>Moreover, speak of syzkaller testing, most of hang task reports are reaction to
>somebody else consuming too much CPU resources (e.g. printk() flooding, too many
>pending workqueue requests). Even if some process is in 'ignorelist', it is
>possible that some problem that should be reported is already happening. Even if
>some process is in 'fatallist', it is possible that the cause of hang is simply
>somebody else is consuming too much CPU.
>
>By the way, I wish that khungtaskd can report recent top CPU consumers, for it is
>rare that the cause of hung is locking dependency problems / hardware problems.
>

Thank you for your suggestions,

Some SOC vendors' drivers or user-mode processes may be in D state for a long time,
and normally they do not configure HUNG TASK, so we need to ignore these tasks if
we use HUNG TASK. 

By default, ignorelist and fatallist are empty and can be configured by the user
(assuming the user knows what they are doing).

I will try to implement the function of khungtaskd to report top CPU consumers.

Thanks,
zhouchaungao


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.
  2021-04-19  2:22   ` 周传高
@ 2021-04-19  2:38     ` Tetsuo Handa
  0 siblings, 0 replies; 7+ messages in thread
From: Tetsuo Handa @ 2021-04-19  2:38 UTC (permalink / raw)
  To: 周传高
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Andrew Morton, Kees Cook,
	Michal Hocko, Guilherme G. Piccoli, Vlastimil Babka,
	Lukas Bulwahn, linux-kernel

On 2021/04/19 11:22, 周传高 wrote:
> Some SOC vendors' drivers or user-mode processes may be in D state for a long time,
> and normally they do not configure HUNG TASK, so we need to ignore these tasks if
> we use HUNG TASK. 

Isn't that a sign that the quality of the drivers and user-mode processes is poor?
Wait should be killable/interruptible if possible (for better response to e.g. OOM-killer).

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: Re: [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.
  2021-04-19  1:46   ` 周传高
@ 2021-04-19  7:45     ` Peter Zijlstra
  0 siblings, 0 replies; 7+ messages in thread
From: Peter Zijlstra @ 2021-04-19  7:45 UTC (permalink / raw)
  To: 周传高
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Andrew Morton, Kees Cook,
	Tetsuo Handa, Michal Hocko, Guilherme G. Piccoli,
	Vlastimil Babka, Lukas Bulwahn, linux-kernel

On Mon, Apr 19, 2021 at 09:46:26AM +0800, 周传高 wrote:
> 
> >On Sat, Apr 17, 2021 at 07:13:01AM -0700, zhouchuangao wrote:
> >> eg:
> >> In Android system, we usually and some processes to the whitelist.
> >> static task_t task_whitelist[] = {
> >> 	{"mdrt_thread", HUNG_TASK_WHITELIST},
> >> 	{"chre_kthread", HUNG_TASK_WHITELIST},
> >> 	{"scp_power_reset", HUNG_TASK_WHITELIST},
> >> 	{"ccci_fsm1", HUNG_TASK_WHITELIST},
> >> 	{"qos_ipi_recv", HUNG_TASK_WHITELIST},
> >> 	{NULL, 0},
> >> };
> >
> >What are these tasks doing that the hung-task detector fires on them?
> >Should you fix that instead?
> 
> These tasks are implemented by the SoC vendor, and normally they do
> not configure HUNG TASK, so we need to ignore these tasks if we use
> HUNG TASK. 

Then raise a bug against their crap software, don't try and work around
it in the kernel.

We're not going to upstream workarounds for crap vendor software.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-04-19  7:46 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-17 14:13 [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism zhouchuangao
2021-04-17 14:36 ` Peter Zijlstra
2021-04-19  1:46   ` 周传高
2021-04-19  7:45     ` Peter Zijlstra
2021-04-17 14:44 ` Tetsuo Handa
2021-04-19  2:22   ` 周传高
2021-04-19  2:38     ` Tetsuo Handa

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).