All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state
@ 2017-08-21 10:25 Imran Khan
  2017-08-22  1:54 ` Zhang, Shile (NSB - CN/Hangzhou)
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Imran Khan @ 2017-08-21 10:25 UTC (permalink / raw)
  To: mingo
  Cc: imrank140517, Imran Khan, Luis R. Rodriguez, Kees Cook,
	Peter Zijlstra (Intel),
	Shile Zhang, Matt Fleming, Andrew Morton, Vegard Nossum,
	Tetsuo Handa, John Siddle, open list, open list:PROC SYSCTL

khungtask by default monitors either all tasks or no tasks at all
for long unterruptible sleeps.
For Android like environments this arrangement is not optimal because
on one hand it may be permissible to have some background(bg) task
in uninterruptible sleep state for long duration while on the other
hand it may not be permissible to have some foreground(fg) task like
surfaceflinger in uninterruptible sleep state for long duration.
So it would be good to have some arrangement so that few specified tasks
can be monitored by khungtaskd, on a need basis.
This change introduces a sysctl option, /proc/sys/kernel/
hung_task_check_selected, to enable monitoring of selected tasks using
khungtask daemon. If this sysctl option is enabled then only the tasks
specified in /proc/hung_task_monitor_list are monitored otherwise all
tasks are monitored, just like the default case.

Signed-off-by: Imran Khan <kimran@codeaurora.org>
---
 include/linux/sched/sysctl.h |   1 +
 kernel/hung_task.c           | 121 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/sysctl.c              |   8 +++
 3 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 0f5ecd4..05892f1 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -10,6 +10,7 @@
 extern unsigned int  sysctl_hung_task_panic;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern int sysctl_hung_task_warnings;
+extern int sysctl_hung_task_check_selected;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 					 void __user *buffer,
 					 size_t *lenp, loff_t *ppos);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 751593e..49f13fb 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -16,12 +16,28 @@
 #include <linux/export.h>
 #include <linux/sysctl.h>
 #include <linux/utsname.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
 
 #include <trace/events/sched.h>
 
 /*
+ * Hung task that needs monitoring
+ */
+struct hung_task {
+	struct list_head list;
+	char comm[TASK_COMM_LEN];
+};
+
+static struct hung_task	*monitor_list;
+int sysctl_hung_task_check_selected;
+
+
+
+/*
  * The number of tasks checked:
  */
 int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
@@ -76,6 +92,92 @@ static int __init hung_task_panic_setup(char *str)
 	.notifier_call = hung_task_panic,
 };
 
+static void hung_task_monitor_setup(void)
+{
+	monitor_list = kmalloc(sizeof(*monitor_list), GFP_KERNEL);
+	if (monitor_list) {
+		INIT_LIST_HEAD(&monitor_list->list);
+		memset(monitor_list->comm, 0, TASK_COMM_LEN);
+	}
+}
+
+
+static  int hung_task_info_show(struct seq_file *m, void *v)
+{
+	struct hung_task *ht;
+
+	ht = list_entry(v, struct hung_task, list);
+	seq_puts(m, ht->comm);
+
+	return 0;
+}
+
+static void *hung_task_info_start(struct seq_file *m, loff_t *pos)
+{
+	return seq_list_start_head(&monitor_list->list, *pos);
+}
+
+static void *hung_task_info_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &monitor_list->list, pos);
+}
+
+static void hung_task_info_stop(struct seq_file *m, void *v)
+{
+}
+
+const struct seq_operations hung_task_info_op = {
+	.start	= hung_task_info_start,
+	.next	= hung_task_info_next,
+	.stop	= hung_task_info_stop,
+	.show	= hung_task_info_show
+};
+
+static int hung_task_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &hung_task_info_op);
+}
+
+static ssize_t
+hung_task_info_write(struct file *file, const char __user *buf, size_t count,
+	     loff_t *offs)
+{
+	struct task_struct *g, *t;
+	struct hung_task *ht = kmalloc(sizeof(*ht), GFP_KERNEL);
+
+	if (!ht)
+		return -ENOMEM;
+
+	if (copy_from_user(ht->comm, buf, count))
+		return -EFAULT;
+	ht->comm[count] = '\0';
+
+	for_each_process_thread(g, t) {
+		if (!strncmp(t->comm, ht->comm, strlen(t->comm))) {
+			list_add_tail(&ht->list, &monitor_list->list);
+			return count;
+		}
+	}
+
+	pr_err("Non-existing task: %s can't be monitored\n", ht->comm);
+	return count;
+}
+
+static const struct file_operations hung_task_info_operations = {
+	.open		= hung_task_info_open,
+	.read		= seq_read,
+	.write		= hung_task_info_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_hung_task_info_init(void)
+{
+	proc_create("hung_task_monitor_list", 0644, NULL,
+		    &hung_task_info_operations);
+	return 0;
+}
+
 static void check_hung_task(struct task_struct *t, unsigned long timeout)
 {
 	unsigned long switch_count = t->nvcsw + t->nivcsw;
@@ -167,6 +269,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	int max_count = sysctl_hung_task_check_count;
 	int batch_count = HUNG_TASK_BATCHING;
 	struct task_struct *g, *t;
+	struct hung_task *ht, *tmp;
 
 	/*
 	 * If the system crashed already then all bets are off,
@@ -186,9 +289,21 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 				goto unlock;
 		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
-		if (t->state == TASK_UNINTERRUPTIBLE)
-			check_hung_task(t, timeout);
+		if (t->state == TASK_UNINTERRUPTIBLE) {
+			if (sysctl_hung_task_check_selected) {
+				list_for_each_entry_safe(ht, tmp,
+							 &monitor_list->list,
+							 list)
+					if (!strncmp(ht->comm, t->comm,
+					    strlen(t->comm)))
+					/* Task belongs to the selected group */
+						check_hung_task(t, timeout);
+			} else {
+				check_hung_task(t, timeout);
+			}
+		}
 	}
+
  unlock:
 	rcu_read_unlock();
 	if (hung_task_show_lock)
@@ -259,6 +374,8 @@ static int watchdog(void *dummy)
 static int __init hung_task_init(void)
 {
 	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+	hung_task_monitor_setup();
+	proc_hung_task_info_init();
 	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
 
 	return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbb..ab45774 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1096,6 +1096,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &neg_one,
 	},
+	{
+		.procname	= "hung_task_check_selected",
+		.data		= &sysctl_hung_task_check_selected,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &neg_one,
+	},
 #endif
 #ifdef CONFIG_RT_MUTEXES
 	{
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* RE: [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state
  2017-08-21 10:25 [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state Imran Khan
@ 2017-08-22  1:54 ` Zhang, Shile (NSB - CN/Hangzhou)
  2017-08-22  6:03   ` Imran Khan
  2017-08-22 20:36 ` Peter Zijlstra
  2017-11-08  6:28 ` [RFC] hung task: check " Lingutla Chandrasekhar
  2 siblings, 1 reply; 10+ messages in thread
From: Zhang, Shile (NSB - CN/Hangzhou) @ 2017-08-22  1:54 UTC (permalink / raw)
  To: Imran Khan, mingo
  Cc: imrank140517, Luis R. Rodriguez, Kees Cook,
	Peter Zijlstra (Intel),
	Matt Fleming, Andrew Morton, Vegard Nossum, Tetsuo Handa,
	John Siddle, open list, open list:PROC SYSCTL

Hi, Imran,

I think a "unmonitored list" is better than "monitor list", because we want khungtaskd can find out the "unexpected" hung task, but not few in a list.
Then, for the fg tasks, which can put it in the "unmonitored list", for the bg tasks, I think we can tweak the timeout to control the duration.

Thanks!

BR, Shile

-----Original Message-----
From: Imran Khan [mailto:kimran@codeaurora.org] 
Sent: Monday, August 21, 2017 6:26 PM
To: mingo@kernel.org
Cc: imrank140517@gmail.com; Imran Khan <kimran@codeaurora.org>; Luis R. Rodriguez <mcgrof@kernel.org>; Kees Cook <keescook@chromium.org>; Peter Zijlstra (Intel) <peterz@infradead.org>; Zhang, Shile (NSB - CN/Hangzhou) <shile.zhang@nokia-sbell.com>; Matt Fleming <matt@codeblueprint.co.uk>; Andrew Morton <akpm@linux-foundation.org>; Vegard Nossum <vegard.nossum@oracle.com>; Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>; John Siddle <jsiddle@redhat.com>; open list <linux-kernel@vger.kernel.org>; open list:PROC SYSCTL <linux-fsdevel@vger.kernel.org>
Subject: [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state

khungtask by default monitors either all tasks or no tasks at all
for long unterruptible sleeps.
For Android like environments this arrangement is not optimal because
on one hand it may be permissible to have some background(bg) task
in uninterruptible sleep state for long duration while on the other
hand it may not be permissible to have some foreground(fg) task like
surfaceflinger in uninterruptible sleep state for long duration.
So it would be good to have some arrangement so that few specified tasks
can be monitored by khungtaskd, on a need basis.
This change introduces a sysctl option, /proc/sys/kernel/
hung_task_check_selected, to enable monitoring of selected tasks using
khungtask daemon. If this sysctl option is enabled then only the tasks
specified in /proc/hung_task_monitor_list are monitored otherwise all
tasks are monitored, just like the default case.

Signed-off-by: Imran Khan <kimran@codeaurora.org>
---
 include/linux/sched/sysctl.h |   1 +
 kernel/hung_task.c           | 121 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/sysctl.c              |   8 +++
 3 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 0f5ecd4..05892f1 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -10,6 +10,7 @@
 extern unsigned int  sysctl_hung_task_panic;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern int sysctl_hung_task_warnings;
+extern int sysctl_hung_task_check_selected;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 					 void __user *buffer,
 					 size_t *lenp, loff_t *ppos);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 751593e..49f13fb 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -16,12 +16,28 @@
 #include <linux/export.h>
 #include <linux/sysctl.h>
 #include <linux/utsname.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
 
 #include <trace/events/sched.h>
 
 /*
+ * Hung task that needs monitoring
+ */
+struct hung_task {
+	struct list_head list;
+	char comm[TASK_COMM_LEN];
+};
+
+static struct hung_task	*monitor_list;
+int sysctl_hung_task_check_selected;
+
+
+
+/*
  * The number of tasks checked:
  */
 int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
@@ -76,6 +92,92 @@ static int __init hung_task_panic_setup(char *str)
 	.notifier_call = hung_task_panic,
 };
 
+static void hung_task_monitor_setup(void)
+{
+	monitor_list = kmalloc(sizeof(*monitor_list), GFP_KERNEL);
+	if (monitor_list) {
+		INIT_LIST_HEAD(&monitor_list->list);
+		memset(monitor_list->comm, 0, TASK_COMM_LEN);
+	}
+}
+
+
+static  int hung_task_info_show(struct seq_file *m, void *v)
+{
+	struct hung_task *ht;
+
+	ht = list_entry(v, struct hung_task, list);
+	seq_puts(m, ht->comm);
+
+	return 0;
+}
+
+static void *hung_task_info_start(struct seq_file *m, loff_t *pos)
+{
+	return seq_list_start_head(&monitor_list->list, *pos);
+}
+
+static void *hung_task_info_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &monitor_list->list, pos);
+}
+
+static void hung_task_info_stop(struct seq_file *m, void *v)
+{
+}
+
+const struct seq_operations hung_task_info_op = {
+	.start	= hung_task_info_start,
+	.next	= hung_task_info_next,
+	.stop	= hung_task_info_stop,
+	.show	= hung_task_info_show
+};
+
+static int hung_task_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &hung_task_info_op);
+}
+
+static ssize_t
+hung_task_info_write(struct file *file, const char __user *buf, size_t count,
+	     loff_t *offs)
+{
+	struct task_struct *g, *t;
+	struct hung_task *ht = kmalloc(sizeof(*ht), GFP_KERNEL);
+
+	if (!ht)
+		return -ENOMEM;
+
+	if (copy_from_user(ht->comm, buf, count))
+		return -EFAULT;
+	ht->comm[count] = '\0';
+
+	for_each_process_thread(g, t) {
+		if (!strncmp(t->comm, ht->comm, strlen(t->comm))) {
+			list_add_tail(&ht->list, &monitor_list->list);
+			return count;
+		}
+	}
+
+	pr_err("Non-existing task: %s can't be monitored\n", ht->comm);
+	return count;
+}
+
+static const struct file_operations hung_task_info_operations = {
+	.open		= hung_task_info_open,
+	.read		= seq_read,
+	.write		= hung_task_info_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_hung_task_info_init(void)
+{
+	proc_create("hung_task_monitor_list", 0644, NULL,
+		    &hung_task_info_operations);
+	return 0;
+}
+
 static void check_hung_task(struct task_struct *t, unsigned long timeout)
 {
 	unsigned long switch_count = t->nvcsw + t->nivcsw;
@@ -167,6 +269,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	int max_count = sysctl_hung_task_check_count;
 	int batch_count = HUNG_TASK_BATCHING;
 	struct task_struct *g, *t;
+	struct hung_task *ht, *tmp;
 
 	/*
 	 * If the system crashed already then all bets are off,
@@ -186,9 +289,21 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 				goto unlock;
 		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
-		if (t->state == TASK_UNINTERRUPTIBLE)
-			check_hung_task(t, timeout);
+		if (t->state == TASK_UNINTERRUPTIBLE) {
+			if (sysctl_hung_task_check_selected) {
+				list_for_each_entry_safe(ht, tmp,
+							 &monitor_list->list,
+							 list)
+					if (!strncmp(ht->comm, t->comm,
+					    strlen(t->comm)))
+					/* Task belongs to the selected group */
+						check_hung_task(t, timeout);
+			} else {
+				check_hung_task(t, timeout);
+			}
+		}
 	}
+
  unlock:
 	rcu_read_unlock();
 	if (hung_task_show_lock)
@@ -259,6 +374,8 @@ static int watchdog(void *dummy)
 static int __init hung_task_init(void)
 {
 	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+	hung_task_monitor_setup();
+	proc_hung_task_info_init();
 	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
 
 	return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbb..ab45774 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1096,6 +1096,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &neg_one,
 	},
+	{
+		.procname	= "hung_task_check_selected",
+		.data		= &sysctl_hung_task_check_selected,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &neg_one,
+	},
 #endif
 #ifdef CONFIG_RT_MUTEXES
 	{
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state
  2017-08-22  1:54 ` Zhang, Shile (NSB - CN/Hangzhou)
@ 2017-08-22  6:03   ` Imran Khan
  2017-08-22  6:49     ` Zhang, Shile (NSB - CN/Hangzhou)
  0 siblings, 1 reply; 10+ messages in thread
From: Imran Khan @ 2017-08-22  6:03 UTC (permalink / raw)
  To: Zhang, Shile (NSB - CN/Hangzhou), mingo
  Cc: imrank140517, Luis R. Rodriguez, Kees Cook,
	Peter Zijlstra (Intel),
	Matt Fleming, Andrew Morton, Vegard Nossum, Tetsuo Handa,
	John Siddle, open list, open list:PROC SYSCTL

On 8/22/2017 7:24 AM, Zhang, Shile (NSB - CN/Hangzhou) wrote:

> Hi, Imran,
> 
Hi Shile,
Thanks for getting back on this.
> I think a "unmonitored list" is better than "monitor list", because we want khungtaskd can find out the "unexpected" hung task, but not few in a list.
> Then, for the fg tasks, which can put it in the "unmonitored list", for the bg tasks, I think we can tweak the timeout to control the duration.
> 
Here fg and bg tasks were just an example and may not be applicable for all environments (non-android). But main intention here is to have
some mechanism that would allow to monitor only specific tasks. As we have a sysctl option to enable/disable this mechanism we can at any time
fall back to default mechanism of monitoring all tasks.

Alternatively this can be done on task priority basis also, where we have a minimum priority and only tasks with priorities higher than
this min priority would be monitored and if we keep this min priority to the least priority , all tasks would be monitored like the 
default case. Please let me know your feedback regarding this approach.

Thanks and Regards,
Imran 
> Thanks!
> 
> BR, Shile
> 
> -----Original Message-----
> From: Imran Khan [mailto:kimran@codeaurora.org] 
> Sent: Monday, August 21, 2017 6:26 PM
> To: mingo@kernel.org
> Cc: imrank140517@gmail.com; Imran Khan <kimran@codeaurora.org>; Luis R. Rodriguez <mcgrof@kernel.org>; Kees Cook <keescook@chromium.org>; Peter Zijlstra (Intel) <peterz@infradead.org>; Zhang, Shile (NSB - CN/Hangzhou) <shile.zhang@nokia-sbell.com>; Matt Fleming <matt@codeblueprint.co.uk>; Andrew Morton <akpm@linux-foundation.org>; Vegard Nossum <vegard.nossum@oracle.com>; Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>; John Siddle <jsiddle@redhat.com>; open list <linux-kernel@vger.kernel.org>; open list:PROC SYSCTL <linux-fsdevel@vger.kernel.org>
> Subject: [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state
> 
> khungtask by default monitors either all tasks or no tasks at all
> for long unterruptible sleeps.
> For Android like environments this arrangement is not optimal because
> on one hand it may be permissible to have some background(bg) task
> in uninterruptible sleep state for long duration while on the other
> hand it may not be permissible to have some foreground(fg) task like
> surfaceflinger in uninterruptible sleep state for long duration.
> So it would be good to have some arrangement so that few specified tasks
> can be monitored by khungtaskd, on a need basis.
> This change introduces a sysctl option, /proc/sys/kernel/
> hung_task_check_selected, to enable monitoring of selected tasks using
> khungtask daemon. If this sysctl option is enabled then only the tasks
> specified in /proc/hung_task_monitor_list are monitored otherwise all
> tasks are monitored, just like the default case.
> 
> Signed-off-by: Imran Khan <kimran@codeaurora.org>
> ---
>  include/linux/sched/sysctl.h |   1 +
>  kernel/hung_task.c           | 121 ++++++++++++++++++++++++++++++++++++++++++-
>  kernel/sysctl.c              |   8 +++
>  3 files changed, 128 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index 0f5ecd4..05892f1 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -10,6 +10,7 @@
>  extern unsigned int  sysctl_hung_task_panic;
>  extern unsigned long sysctl_hung_task_timeout_secs;
>  extern int sysctl_hung_task_warnings;
> +extern int sysctl_hung_task_check_selected;
>  extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
>  					 void __user *buffer,
>  					 size_t *lenp, loff_t *ppos);
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index 751593e..49f13fb 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -16,12 +16,28 @@
>  #include <linux/export.h>
>  #include <linux/sysctl.h>
>  #include <linux/utsname.h>
> +#include <linux/uaccess.h>
> +#include <linux/slab.h>
> +#include <linux/proc_fs.h>
>  #include <linux/sched/signal.h>
>  #include <linux/sched/debug.h>
>  
>  #include <trace/events/sched.h>
>  
>  /*
> + * Hung task that needs monitoring
> + */
> +struct hung_task {
> +	struct list_head list;
> +	char comm[TASK_COMM_LEN];
> +};
> +
> +static struct hung_task	*monitor_list;
> +int sysctl_hung_task_check_selected;
> +
> +
> +
> +/*
>   * The number of tasks checked:
>   */
>  int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
> @@ -76,6 +92,92 @@ static int __init hung_task_panic_setup(char *str)
>  	.notifier_call = hung_task_panic,
>  };
>  
> +static void hung_task_monitor_setup(void)
> +{
> +	monitor_list = kmalloc(sizeof(*monitor_list), GFP_KERNEL);
> +	if (monitor_list) {
> +		INIT_LIST_HEAD(&monitor_list->list);
> +		memset(monitor_list->comm, 0, TASK_COMM_LEN);
> +	}
> +}
> +
> +
> +static  int hung_task_info_show(struct seq_file *m, void *v)
> +{
> +	struct hung_task *ht;
> +
> +	ht = list_entry(v, struct hung_task, list);
> +	seq_puts(m, ht->comm);
> +
> +	return 0;
> +}
> +
> +static void *hung_task_info_start(struct seq_file *m, loff_t *pos)
> +{
> +	return seq_list_start_head(&monitor_list->list, *pos);
> +}
> +
> +static void *hung_task_info_next(struct seq_file *m, void *v, loff_t *pos)
> +{
> +	return seq_list_next(v, &monitor_list->list, pos);
> +}
> +
> +static void hung_task_info_stop(struct seq_file *m, void *v)
> +{
> +}
> +
> +const struct seq_operations hung_task_info_op = {
> +	.start	= hung_task_info_start,
> +	.next	= hung_task_info_next,
> +	.stop	= hung_task_info_stop,
> +	.show	= hung_task_info_show
> +};
> +
> +static int hung_task_info_open(struct inode *inode, struct file *file)
> +{
> +	return seq_open(file, &hung_task_info_op);
> +}
> +
> +static ssize_t
> +hung_task_info_write(struct file *file, const char __user *buf, size_t count,
> +	     loff_t *offs)
> +{
> +	struct task_struct *g, *t;
> +	struct hung_task *ht = kmalloc(sizeof(*ht), GFP_KERNEL);
> +
> +	if (!ht)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(ht->comm, buf, count))
> +		return -EFAULT;
> +	ht->comm[count] = '\0';
> +
> +	for_each_process_thread(g, t) {
> +		if (!strncmp(t->comm, ht->comm, strlen(t->comm))) {
> +			list_add_tail(&ht->list, &monitor_list->list);
> +			return count;
> +		}
> +	}
> +
> +	pr_err("Non-existing task: %s can't be monitored\n", ht->comm);
> +	return count;
> +}
> +
> +static const struct file_operations hung_task_info_operations = {
> +	.open		= hung_task_info_open,
> +	.read		= seq_read,
> +	.write		= hung_task_info_write,
> +	.llseek		= seq_lseek,
> +	.release	= seq_release,
> +};
> +
> +static int __init proc_hung_task_info_init(void)
> +{
> +	proc_create("hung_task_monitor_list", 0644, NULL,
> +		    &hung_task_info_operations);
> +	return 0;
> +}
> +
>  static void check_hung_task(struct task_struct *t, unsigned long timeout)
>  {
>  	unsigned long switch_count = t->nvcsw + t->nivcsw;
> @@ -167,6 +269,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>  	int max_count = sysctl_hung_task_check_count;
>  	int batch_count = HUNG_TASK_BATCHING;
>  	struct task_struct *g, *t;
> +	struct hung_task *ht, *tmp;
>  
>  	/*
>  	 * If the system crashed already then all bets are off,
> @@ -186,9 +289,21 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>  				goto unlock;
>  		}
>  		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
> -		if (t->state == TASK_UNINTERRUPTIBLE)
> -			check_hung_task(t, timeout);
> +		if (t->state == TASK_UNINTERRUPTIBLE) {
> +			if (sysctl_hung_task_check_selected) {
> +				list_for_each_entry_safe(ht, tmp,
> +							 &monitor_list->list,
> +							 list)
> +					if (!strncmp(ht->comm, t->comm,
> +					    strlen(t->comm)))
> +					/* Task belongs to the selected group */
> +						check_hung_task(t, timeout);
> +			} else {
> +				check_hung_task(t, timeout);
> +			}
> +		}
>  	}
> +
>   unlock:
>  	rcu_read_unlock();
>  	if (hung_task_show_lock)
> @@ -259,6 +374,8 @@ static int watchdog(void *dummy)
>  static int __init hung_task_init(void)
>  {
>  	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
> +	hung_task_monitor_setup();
> +	proc_hung_task_info_init();
>  	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
>  
>  	return 0;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 6648fbb..ab45774 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1096,6 +1096,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
>  		.proc_handler	= proc_dointvec_minmax,
>  		.extra1		= &neg_one,
>  	},
> +	{
> +		.procname	= "hung_task_check_selected",
> +		.data		= &sysctl_hung_task_check_selected,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &neg_one,
> +	},
>  #endif
>  #ifdef CONFIG_RT_MUTEXES
>  	{
> 


-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a\nmember of the Code Aurora Forum, hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state
  2017-08-22  6:03   ` Imran Khan
@ 2017-08-22  6:49     ` Zhang, Shile (NSB - CN/Hangzhou)
  0 siblings, 0 replies; 10+ messages in thread
From: Zhang, Shile (NSB - CN/Hangzhou) @ 2017-08-22  6:49 UTC (permalink / raw)
  To: Imran Khan, mingo
  Cc: imrank140517, Luis R. Rodriguez, Kees Cook,
	Peter Zijlstra (Intel),
	Matt Fleming, Andrew Morton, Vegard Nossum, Tetsuo Handa,
	John Siddle, open list, open list:PROC SYSCTL

Hi, Imran,

Seems the alternatively approach is better than former.

But sorry, I still think, like this kind of "monitoring", its better/easy to mark some "privileged task" not be monitored, but not list the one should be monitored.
Because It's easy to know which task should NOT be monitored (for some special purpose), but it's hard to which one should be monitored, right? Likes some task created runtime.

Just for your info.

BR, Shile

-----Original Message-----
From: Imran Khan [mailto:kimran@codeaurora.org] 
Sent: Tuesday, August 22, 2017 2:04 PM
To: Zhang, Shile (NSB - CN/Hangzhou) <shile.zhang@nokia-sbell.com>; mingo@kernel.org
Cc: imrank140517@gmail.com; Luis R. Rodriguez <mcgrof@kernel.org>; Kees Cook <keescook@chromium.org>; Peter Zijlstra (Intel) <peterz@infradead.org>; Matt Fleming <matt@codeblueprint.co.uk>; Andrew Morton <akpm@linux-foundation.org>; Vegard Nossum <vegard.nossum@oracle.com>; Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>; John Siddle <jsiddle@redhat.com>; open list <linux-kernel@vger.kernel.org>; open list:PROC SYSCTL <linux-fsdevel@vger.kernel.org>
Subject: Re: [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state

On 8/22/2017 7:24 AM, Zhang, Shile (NSB - CN/Hangzhou) wrote:

> Hi, Imran,
> 
Hi Shile,
Thanks for getting back on this.
> I think a "unmonitored list" is better than "monitor list", because we want khungtaskd can find out the "unexpected" hung task, but not few in a list.
> Then, for the fg tasks, which can put it in the "unmonitored list", for the bg tasks, I think we can tweak the timeout to control the duration.
> 
Here fg and bg tasks were just an example and may not be applicable for all environments (non-android). But main intention here is to have
some mechanism that would allow to monitor only specific tasks. As we have a sysctl option to enable/disable this mechanism we can at any time
fall back to default mechanism of monitoring all tasks.

Alternatively this can be done on task priority basis also, where we have a minimum priority and only tasks with priorities higher than
this min priority would be monitored and if we keep this min priority to the least priority , all tasks would be monitored like the 
default case. Please let me know your feedback regarding this approach.

Thanks and Regards,
Imran 
> Thanks!
> 
> BR, Shile
> 
> -----Original Message-----
> From: Imran Khan [mailto:kimran@codeaurora.org] 
> Sent: Monday, August 21, 2017 6:26 PM
> To: mingo@kernel.org
> Cc: imrank140517@gmail.com; Imran Khan <kimran@codeaurora.org>; Luis R. Rodriguez <mcgrof@kernel.org>; Kees Cook <keescook@chromium.org>; Peter Zijlstra (Intel) <peterz@infradead.org>; Zhang, Shile (NSB - CN/Hangzhou) <shile.zhang@nokia-sbell.com>; Matt Fleming <matt@codeblueprint.co.uk>; Andrew Morton <akpm@linux-foundation.org>; Vegard Nossum <vegard.nossum@oracle.com>; Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>; John Siddle <jsiddle@redhat.com>; open list <linux-kernel@vger.kernel.org>; open list:PROC SYSCTL <linux-fsdevel@vger.kernel.org>
> Subject: [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state
> 
> khungtask by default monitors either all tasks or no tasks at all
> for long unterruptible sleeps.
> For Android like environments this arrangement is not optimal because
> on one hand it may be permissible to have some background(bg) task
> in uninterruptible sleep state for long duration while on the other
> hand it may not be permissible to have some foreground(fg) task like
> surfaceflinger in uninterruptible sleep state for long duration.
> So it would be good to have some arrangement so that few specified tasks
> can be monitored by khungtaskd, on a need basis.
> This change introduces a sysctl option, /proc/sys/kernel/
> hung_task_check_selected, to enable monitoring of selected tasks using
> khungtask daemon. If this sysctl option is enabled then only the tasks
> specified in /proc/hung_task_monitor_list are monitored otherwise all
> tasks are monitored, just like the default case.
> 
> Signed-off-by: Imran Khan <kimran@codeaurora.org>
> ---
>  include/linux/sched/sysctl.h |   1 +
>  kernel/hung_task.c           | 121 ++++++++++++++++++++++++++++++++++++++++++-
>  kernel/sysctl.c              |   8 +++
>  3 files changed, 128 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index 0f5ecd4..05892f1 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -10,6 +10,7 @@
>  extern unsigned int  sysctl_hung_task_panic;
>  extern unsigned long sysctl_hung_task_timeout_secs;
>  extern int sysctl_hung_task_warnings;
> +extern int sysctl_hung_task_check_selected;
>  extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
>  					 void __user *buffer,
>  					 size_t *lenp, loff_t *ppos);
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index 751593e..49f13fb 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -16,12 +16,28 @@
>  #include <linux/export.h>
>  #include <linux/sysctl.h>
>  #include <linux/utsname.h>
> +#include <linux/uaccess.h>
> +#include <linux/slab.h>
> +#include <linux/proc_fs.h>
>  #include <linux/sched/signal.h>
>  #include <linux/sched/debug.h>
>  
>  #include <trace/events/sched.h>
>  
>  /*
> + * Hung task that needs monitoring
> + */
> +struct hung_task {
> +	struct list_head list;
> +	char comm[TASK_COMM_LEN];
> +};
> +
> +static struct hung_task	*monitor_list;
> +int sysctl_hung_task_check_selected;
> +
> +
> +
> +/*
>   * The number of tasks checked:
>   */
>  int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
> @@ -76,6 +92,92 @@ static int __init hung_task_panic_setup(char *str)
>  	.notifier_call = hung_task_panic,
>  };
>  
> +static void hung_task_monitor_setup(void)
> +{
> +	monitor_list = kmalloc(sizeof(*monitor_list), GFP_KERNEL);
> +	if (monitor_list) {
> +		INIT_LIST_HEAD(&monitor_list->list);
> +		memset(monitor_list->comm, 0, TASK_COMM_LEN);
> +	}
> +}
> +
> +
> +static  int hung_task_info_show(struct seq_file *m, void *v)
> +{
> +	struct hung_task *ht;
> +
> +	ht = list_entry(v, struct hung_task, list);
> +	seq_puts(m, ht->comm);
> +
> +	return 0;
> +}
> +
> +static void *hung_task_info_start(struct seq_file *m, loff_t *pos)
> +{
> +	return seq_list_start_head(&monitor_list->list, *pos);
> +}
> +
> +static void *hung_task_info_next(struct seq_file *m, void *v, loff_t *pos)
> +{
> +	return seq_list_next(v, &monitor_list->list, pos);
> +}
> +
> +static void hung_task_info_stop(struct seq_file *m, void *v)
> +{
> +}
> +
> +const struct seq_operations hung_task_info_op = {
> +	.start	= hung_task_info_start,
> +	.next	= hung_task_info_next,
> +	.stop	= hung_task_info_stop,
> +	.show	= hung_task_info_show
> +};
> +
> +static int hung_task_info_open(struct inode *inode, struct file *file)
> +{
> +	return seq_open(file, &hung_task_info_op);
> +}
> +
> +static ssize_t
> +hung_task_info_write(struct file *file, const char __user *buf, size_t count,
> +	     loff_t *offs)
> +{
> +	struct task_struct *g, *t;
> +	struct hung_task *ht = kmalloc(sizeof(*ht), GFP_KERNEL);
> +
> +	if (!ht)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(ht->comm, buf, count))
> +		return -EFAULT;
> +	ht->comm[count] = '\0';
> +
> +	for_each_process_thread(g, t) {
> +		if (!strncmp(t->comm, ht->comm, strlen(t->comm))) {
> +			list_add_tail(&ht->list, &monitor_list->list);
> +			return count;
> +		}
> +	}
> +
> +	pr_err("Non-existing task: %s can't be monitored\n", ht->comm);
> +	return count;
> +}
> +
> +static const struct file_operations hung_task_info_operations = {
> +	.open		= hung_task_info_open,
> +	.read		= seq_read,
> +	.write		= hung_task_info_write,
> +	.llseek		= seq_lseek,
> +	.release	= seq_release,
> +};
> +
> +static int __init proc_hung_task_info_init(void)
> +{
> +	proc_create("hung_task_monitor_list", 0644, NULL,
> +		    &hung_task_info_operations);
> +	return 0;
> +}
> +
>  static void check_hung_task(struct task_struct *t, unsigned long timeout)
>  {
>  	unsigned long switch_count = t->nvcsw + t->nivcsw;
> @@ -167,6 +269,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>  	int max_count = sysctl_hung_task_check_count;
>  	int batch_count = HUNG_TASK_BATCHING;
>  	struct task_struct *g, *t;
> +	struct hung_task *ht, *tmp;
>  
>  	/*
>  	 * If the system crashed already then all bets are off,
> @@ -186,9 +289,21 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>  				goto unlock;
>  		}
>  		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
> -		if (t->state == TASK_UNINTERRUPTIBLE)
> -			check_hung_task(t, timeout);
> +		if (t->state == TASK_UNINTERRUPTIBLE) {
> +			if (sysctl_hung_task_check_selected) {
> +				list_for_each_entry_safe(ht, tmp,
> +							 &monitor_list->list,
> +							 list)
> +					if (!strncmp(ht->comm, t->comm,
> +					    strlen(t->comm)))
> +					/* Task belongs to the selected group */
> +						check_hung_task(t, timeout);
> +			} else {
> +				check_hung_task(t, timeout);
> +			}
> +		}
>  	}
> +
>   unlock:
>  	rcu_read_unlock();
>  	if (hung_task_show_lock)
> @@ -259,6 +374,8 @@ static int watchdog(void *dummy)
>  static int __init hung_task_init(void)
>  {
>  	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
> +	hung_task_monitor_setup();
> +	proc_hung_task_info_init();
>  	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
>  
>  	return 0;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 6648fbb..ab45774 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1096,6 +1096,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
>  		.proc_handler	= proc_dointvec_minmax,
>  		.extra1		= &neg_one,
>  	},
> +	{
> +		.procname	= "hung_task_check_selected",
> +		.data		= &sysctl_hung_task_check_selected,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &neg_one,
> +	},
>  #endif
>  #ifdef CONFIG_RT_MUTEXES
>  	{
> 


-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a\nmember of the Code Aurora Forum, hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state
  2017-08-21 10:25 [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state Imran Khan
  2017-08-22  1:54 ` Zhang, Shile (NSB - CN/Hangzhou)
@ 2017-08-22 20:36 ` Peter Zijlstra
  2017-08-23 12:25   ` Imran Khan
  2017-11-08  6:28 ` [RFC] hung task: check " Lingutla Chandrasekhar
  2 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2017-08-22 20:36 UTC (permalink / raw)
  To: Imran Khan
  Cc: mingo, imrank140517, Luis R. Rodriguez, Kees Cook, Shile Zhang,
	Matt Fleming, Andrew Morton, Vegard Nossum, Tetsuo Handa,
	John Siddle, open list, open list:PROC SYSCTL

On Mon, Aug 21, 2017 at 03:55:53PM +0530, Imran Khan wrote:
> khungtask by default monitors either all tasks or no tasks at all
> for long unterruptible sleeps.
> For Android like environments this arrangement is not optimal because
> on one hand it may be permissible to have some background(bg) task
> in uninterruptible sleep state for long duration while on the other
> hand it may not be permissible to have some foreground(fg) task like
> surfaceflinger in uninterruptible sleep state for long duration.

How are you getting tasks in UNINTERRUPTIBLE state for a long time to
begin with? That's not a good thing, background or not.

> So it would be good to have some arrangement so that few specified tasks
> can be monitored by khungtaskd, on a need basis.
> This change introduces a sysctl option, /proc/sys/kernel/
> hung_task_check_selected, to enable monitoring of selected tasks using
> khungtask daemon. If this sysctl option is enabled then only the tasks
> specified in /proc/hung_task_monitor_list are monitored otherwise all
> tasks are monitored, just like the default case.

That's horrible. Why not a file in /proc/$PID/ that excludes it from
things?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state
  2017-08-22 20:36 ` Peter Zijlstra
@ 2017-08-23 12:25   ` Imran Khan
  0 siblings, 0 replies; 10+ messages in thread
From: Imran Khan @ 2017-08-23 12:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, imrank140517, Luis R. Rodriguez, Kees Cook, Shile Zhang,
	Matt Fleming, Andrew Morton, Vegard Nossum, Tetsuo Handa,
	John Siddle, open list, open list:PROC SYSCTL

On 8/23/2017 2:06 AM, Peter Zijlstra wrote:
> On Mon, Aug 21, 2017 at 03:55:53PM +0530, Imran Khan wrote:
>> khungtask by default monitors either all tasks or no tasks at all
>> for long unterruptible sleeps.
>> For Android like environments this arrangement is not optimal because
>> on one hand it may be permissible to have some background(bg) task
>> in uninterruptible sleep state for long duration while on the other
>> hand it may not be permissible to have some foreground(fg) task like
>> surfaceflinger in uninterruptible sleep state for long duration.
> 
> How are you getting tasks in UNINTERRUPTIBLE state for a long time to
> begin with? That's not a good thing, background or not.
> 
As of now, I don't have the details of exact test case as most of these issues are
coming in android aging and stress tests but I see even in a non-android
setup I get few tasks in uninterruptible sleep state for long duration (60 secs).
For example, I see there is a thread waiting for mmc requests to appear from block
layer. Of course this can be a faulty design on the driver side too but my main
intention here is to know if we can enable/disable this monitoring for 
specific tasks and whether such an option would be useful and agreeable to
community or not.

Also in current scheme of things we use same timeout for all tasks which may
not be optimal. For example if some task is in disk sleep state it may be okay
for it to wait for 2 minutes but if surfaceflinger (android) is stuck even for
1 minute, it results in a very poor UI experience for user of the phone.
>> So it would be good to have some arrangement so that few specified tasks
>> can be monitored by khungtaskd, on a need basis.
>> This change introduces a sysctl option, /proc/sys/kernel/
>> hung_task_check_selected, to enable monitoring of selected tasks using
>> khungtask daemon. If this sysctl option is enabled then only the tasks
>> specified in /proc/hung_task_monitor_list are monitored otherwise all
>> tasks are monitored, just like the default case.
> 
> That's horrible. Why not a file in /proc/$PID/ that excludes it from
> things?
> 
To be honest, I also had apprehension about my approach and did not think
it through. I agree that /proc/$PID/ based approach is much better. If it is
agreed that selective monitoring of hung tasks has valid use case(s), I can
send the /proc/$PID based patch for review.
Thanks for your time and feedback.

Regards,
Imran


-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a\nmember of the Code Aurora Forum, hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [RFC] hung task: check specific tasks for long uninterruptible sleep state
  2017-08-21 10:25 [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state Imran Khan
  2017-08-22  1:54 ` Zhang, Shile (NSB - CN/Hangzhou)
  2017-08-22 20:36 ` Peter Zijlstra
@ 2017-11-08  6:28 ` Lingutla Chandrasekhar
  2017-11-08 11:57   ` Tetsuo Handa
  2017-11-09  1:54   ` Luis R. Rodriguez
  2 siblings, 2 replies; 10+ messages in thread
From: Lingutla Chandrasekhar @ 2017-11-08  6:28 UTC (permalink / raw)
  To: mingo, peterz
  Cc: mcgrof, keescook, shile.zhang, matt, akpm, vegard.nossum,
	penguin-kernel, jsiddle, linux-kernel, linux-fsdevel,
	Lingutla Chandrasekhar, Imran Khan

khungtask by default monitors all tasks for long unterruptible sleep.
This change introduces a sysctl option, /proc/sys/kernel/
hung_task_selective_monitoring, to enable monitoring selected tasks.
If this sysctl option is enabled then only the tasks with
/proc/$PID/hang_detection_enabled set are to be monitored,
otherwise all tasks are monitored as default case.

Some tasks may intentionally moves to uninterruptable sleep state,
which shouldn't leads to khungtask panics, as those are recoverable
hungs. So to avoid false hung reports, add an option to select tasks
to be monitored and report/panic them only.

Signed-off-by: Imran Khan <kimran@codeaurora.org>
Signed-off-by: Lingutla Chandrasekhar <clingutla@codeaurora.org>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9d357b2ea6cb..810f9a5e209e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2733,6 +2733,52 @@ static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
 }
 #endif /* CONFIG_TASK_IO_ACCOUNTING */
 
+#ifdef CONFIG_DETECT_HUNG_TASK
+static ssize_t proc_hung_task_detection_enabled_read(struct file *file,
+				char __user *buf, size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	char buffer[PROC_NUMBUF];
+	size_t len;
+	bool hang_detection_enabled;
+
+	if (!task)
+		return -ESRCH;
+	hang_detection_enabled = task->hang_detection_enabled;
+	put_task_struct(task);
+
+	len = snprintf(buffer, sizeof(buffer), "%d\n", hang_detection_enabled);
+
+	return simple_read_from_buffer(buf, sizeof(buffer), ppos, buffer, len);
+}
+
+static ssize_t proc_hung_task_detection_enabled_write(struct file *file,
+			const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	bool hang_detection_enabled;
+	int rv;
+
+	rv = kstrtobool_from_user(buf, count, &hang_detection_enabled);
+	if (rv < 0)
+		return rv;
+
+	task = get_proc_task(file_inode(file));
+	if (!task)
+		return -ESRCH;
+	task->hang_detection_enabled = hang_detection_enabled;
+	put_task_struct(task);
+
+	return count;
+}
+
+static const struct file_operations proc_hung_task_detection_enabled_operations = {
+	.read		= proc_hung_task_detection_enabled_read,
+	.write		= proc_hung_task_detection_enabled_write,
+	.llseek		= generic_file_llseek,
+};
+#endif
+
 #ifdef CONFIG_USER_NS
 static int proc_id_map_open(struct inode *inode, struct file *file,
 	const struct seq_operations *seq_ops)
@@ -2976,6 +3022,10 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_HARDWALL
 	ONE("hardwall",   S_IRUGO, proc_pid_hardwall),
 #endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+	REG("hang_detection_enabled", 0666,
+		proc_hung_task_detection_enabled_operations),
+#endif
 #ifdef CONFIG_USER_NS
 	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
 	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
@@ -3367,6 +3417,10 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_HARDWALL
 	ONE("hardwall",   S_IRUGO, proc_pid_hardwall),
 #endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+	REG("hang_detection_enabled", 0666,
+		proc_hung_task_detection_enabled_operations),
+#endif
 #ifdef CONFIG_USER_NS
 	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
 	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdf74f27acf1..1280df0a6708 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -767,6 +767,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_DETECT_HUNG_TASK
 	unsigned long			last_switch_count;
+	bool				hang_detection_enabled;
 #endif
 	/* Filesystem information: */
 	struct fs_struct		*fs;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d6a18a3839cc..09d9f1c65fd2 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -11,6 +11,7 @@ extern int	     sysctl_hung_task_check_count;
 extern unsigned int  sysctl_hung_task_panic;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern int sysctl_hung_task_warnings;
+extern int sysctl_hung_task_selective_monitoring;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 					 void __user *buffer,
 					 size_t *lenp, loff_t *ppos);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 751593ed7c0b..321bcfa2e34a 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -20,12 +20,21 @@
 #include <linux/sched/debug.h>
 
 #include <trace/events/sched.h>
+#include <linux/sched/sysctl.h>
 
 /*
  * The number of tasks checked:
  */
 int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
 
+/*
+ * Selective monitoring of hung tasks.
+ *
+ * if set to 1, khungtaskd only monitors tasks with 'hang_detection_enabled'
+ * value set, else monitors all tasks.
+ */
+int sysctl_hung_task_selective_monitoring;
+
 /*
  * Limit number of tasks checked in a batch.
  *
@@ -187,7 +196,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
-			check_hung_task(t, timeout);
+			/* Check for selective monitoring */
+			if (!sysctl_hung_task_selective_monitoring ||
+			    t->hang_detection_enabled)
+				check_hung_task(t, timeout);
 	}
  unlock:
 	rcu_read_unlock();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d9c31bc2eaea..30cc108ce2cc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1093,6 +1093,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &neg_one,
 	},
+	{
+		.procname	= "hung_task_selective_monitoring",
+		.data		= &sysctl_hung_task_selective_monitoring,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+
 #endif
 #ifdef CONFIG_RT_MUTEXES
 	{
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
 a Linux Foundation Collaborative Project.

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC] hung task: check specific tasks for long uninterruptible sleep state
  2017-11-08  6:28 ` [RFC] hung task: check " Lingutla Chandrasekhar
@ 2017-11-08 11:57   ` Tetsuo Handa
  2017-11-08 23:44     ` Andrew Morton
  2017-11-09  1:54   ` Luis R. Rodriguez
  1 sibling, 1 reply; 10+ messages in thread
From: Tetsuo Handa @ 2017-11-08 11:57 UTC (permalink / raw)
  To: clingutla, kimran
  Cc: mingo, peterz, mcgrof, keescook, shile.zhang, matt, akpm,
	vegard.nossum, jsiddle, linux-kernel, linux-fsdevel

Lingutla Chandrasekhar wrote:
> Some tasks may intentionally moves to uninterruptable sleep state,
> which shouldn't leads to khungtask panics, as those are recoverable
> hungs. So to avoid false hung reports, add an option to select tasks
> to be monitored and report/panic them only.

What are backtraces of such tasks? Please point the locations in the code.

If they are absolutely recoverable, why can't we let themselves declare that
"I'm intentionally in uninterruptible state. But there is no dependency that
prevents me from recovering. So, please ignore me." using per "struct
task_struct" flags rather than introducing userspace controlled interface?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] hung task: check specific tasks for long uninterruptible sleep state
  2017-11-08 11:57   ` Tetsuo Handa
@ 2017-11-08 23:44     ` Andrew Morton
  0 siblings, 0 replies; 10+ messages in thread
From: Andrew Morton @ 2017-11-08 23:44 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: clingutla, kimran, mingo, peterz, mcgrof, keescook, shile.zhang,
	matt, vegard.nossum, jsiddle, linux-kernel, linux-fsdevel

On Wed, 8 Nov 2017 20:57:42 +0900 Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> wrote:

> Lingutla Chandrasekhar wrote:
> > Some tasks may intentionally moves to uninterruptable sleep state,
> > which shouldn't leads to khungtask panics, as those are recoverable
> > hungs. So to avoid false hung reports, add an option to select tasks
> > to be monitored and report/panic them only.
> 
> What are backtraces of such tasks? Please point the locations in the code.
> 
> If they are absolutely recoverable, why can't we let themselves declare that
> "I'm intentionally in uninterruptible state. But there is no dependency that
> prevents me from recovering. So, please ignore me." using per "struct
> task_struct" flags rather than introducing userspace controlled interface?

Yes.  Please tell us much much much more about the use case and
scenarios which inspired this change.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] hung task: check specific tasks for long uninterruptible sleep state
  2017-11-08  6:28 ` [RFC] hung task: check " Lingutla Chandrasekhar
  2017-11-08 11:57   ` Tetsuo Handa
@ 2017-11-09  1:54   ` Luis R. Rodriguez
  1 sibling, 0 replies; 10+ messages in thread
From: Luis R. Rodriguez @ 2017-11-09  1:54 UTC (permalink / raw)
  To: Lingutla Chandrasekhar
  Cc: mingo, peterz, mcgrof, keescook, shile.zhang, matt, akpm,
	vegard.nossum, penguin-kernel, jsiddle, linux-kernel,
	linux-fsdevel, Imran Khan

On Wed, Nov 08, 2017 at 11:58:13AM +0530, Lingutla Chandrasekhar wrote:
> khungtask by default monitors all tasks for long unterruptible sleep.
> This change introduces a sysctl option, /proc/sys/kernel/
> hung_task_selective_monitoring, to enable monitoring selected tasks.
> If this sysctl option is enabled then only the tasks with
> /proc/$PID/hang_detection_enabled 

You did precisely the opposite as to what Peter had suggested, which was to
have an per pid exclude flag [0] and this is precisely what I was going to
suggest as well. Exclusion should be an exemption, not the norm, the way
you currently have it right now each process would have to be white-listed,
and that's a lot of work.

I'll in your previous iteration you had tried adding a
/proc/hung_task_monitor_list and only PIDs added there were monitored
otherwise all tasks are monitored, just like the default case, so this
is your V2 attempt, please clarify that in your next iteration it will
be v3.

Upon suspend we try to address putting to sleep such processes, in fact we have
a CAP_BLOCK_SUSPEND which can enable to employ features that can block system
suspend (epoll(7) EPOLLWAKEUP, /proc/sys/wake_lock), so shouldn't we
have a check for CAP_BLOCK_SUSPEND before enabling this?

This gets me wondering if this should instead be controlled via capabilities
and prtctl() such as with PR_CAPBSET_DROP.

What if the process forks?

[0] https://lkml.kernel.org/r/20170822203632.GQ32112@worktop.programming.kicks-ass.net

  Luis

> set are to be monitored,
> otherwise all tasks are monitored as default case.
> 
> Some tasks may intentionally moves to uninterruptable sleep state,
> which shouldn't leads to khungtask panics, as those are recoverable
> hungs. So to avoid false hung reports, add an option to select tasks
> to be monitored and report/panic them only.
> 
> Signed-off-by: Imran Khan <kimran@codeaurora.org>
> Signed-off-by: Lingutla Chandrasekhar <clingutla@codeaurora.org>
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 9d357b2ea6cb..810f9a5e209e 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2733,6 +2733,52 @@ static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
>  }
>  #endif /* CONFIG_TASK_IO_ACCOUNTING */
>  
> +#ifdef CONFIG_DETECT_HUNG_TASK
> +static ssize_t proc_hung_task_detection_enabled_read(struct file *file,
> +				char __user *buf, size_t count, loff_t *ppos)
> +{
> +	struct task_struct *task = get_proc_task(file_inode(file));
> +	char buffer[PROC_NUMBUF];
> +	size_t len;
> +	bool hang_detection_enabled;
> +
> +	if (!task)
> +		return -ESRCH;
> +	hang_detection_enabled = task->hang_detection_enabled;
> +	put_task_struct(task);
> +
> +	len = snprintf(buffer, sizeof(buffer), "%d\n", hang_detection_enabled);
> +
> +	return simple_read_from_buffer(buf, sizeof(buffer), ppos, buffer, len);
> +}
> +
> +static ssize_t proc_hung_task_detection_enabled_write(struct file *file,
> +			const char __user *buf, size_t count, loff_t *ppos)
> +{
> +	struct task_struct *task;
> +	bool hang_detection_enabled;
> +	int rv;
> +
> +	rv = kstrtobool_from_user(buf, count, &hang_detection_enabled);
> +	if (rv < 0)
> +		return rv;
> +
> +	task = get_proc_task(file_inode(file));
> +	if (!task)
> +		return -ESRCH;
> +	task->hang_detection_enabled = hang_detection_enabled;
> +	put_task_struct(task);
> +
> +	return count;
> +}
> +
> +static const struct file_operations proc_hung_task_detection_enabled_operations = {
> +	.read		= proc_hung_task_detection_enabled_read,
> +	.write		= proc_hung_task_detection_enabled_write,
> +	.llseek		= generic_file_llseek,
> +};
> +#endif
> +
>  #ifdef CONFIG_USER_NS
>  static int proc_id_map_open(struct inode *inode, struct file *file,
>  	const struct seq_operations *seq_ops)
> @@ -2976,6 +3022,10 @@ static const struct pid_entry tgid_base_stuff[] = {
>  #ifdef CONFIG_HARDWALL
>  	ONE("hardwall",   S_IRUGO, proc_pid_hardwall),
>  #endif
> +#ifdef CONFIG_DETECT_HUNG_TASK
> +	REG("hang_detection_enabled", 0666,
> +		proc_hung_task_detection_enabled_operations),
> +#endif
>  #ifdef CONFIG_USER_NS
>  	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
>  	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
> @@ -3367,6 +3417,10 @@ static const struct pid_entry tid_base_stuff[] = {
>  #ifdef CONFIG_HARDWALL
>  	ONE("hardwall",   S_IRUGO, proc_pid_hardwall),
>  #endif
> +#ifdef CONFIG_DETECT_HUNG_TASK
> +	REG("hang_detection_enabled", 0666,
> +		proc_hung_task_detection_enabled_operations),
> +#endif
>  #ifdef CONFIG_USER_NS
>  	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
>  	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index fdf74f27acf1..1280df0a6708 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -767,6 +767,7 @@ struct task_struct {
>  #endif
>  #ifdef CONFIG_DETECT_HUNG_TASK
>  	unsigned long			last_switch_count;
> +	bool				hang_detection_enabled;
>  #endif
>  	/* Filesystem information: */
>  	struct fs_struct		*fs;
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index d6a18a3839cc..09d9f1c65fd2 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -11,6 +11,7 @@ extern int	     sysctl_hung_task_check_count;
>  extern unsigned int  sysctl_hung_task_panic;
>  extern unsigned long sysctl_hung_task_timeout_secs;
>  extern int sysctl_hung_task_warnings;
> +extern int sysctl_hung_task_selective_monitoring;
>  extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
>  					 void __user *buffer,
>  					 size_t *lenp, loff_t *ppos);
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index 751593ed7c0b..321bcfa2e34a 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -20,12 +20,21 @@
>  #include <linux/sched/debug.h>
>  
>  #include <trace/events/sched.h>
> +#include <linux/sched/sysctl.h>
>  
>  /*
>   * The number of tasks checked:
>   */
>  int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
>  
> +/*
> + * Selective monitoring of hung tasks.
> + *
> + * if set to 1, khungtaskd only monitors tasks with 'hang_detection_enabled'
> + * value set, else monitors all tasks.
> + */
> +int sysctl_hung_task_selective_monitoring;
> +
>  /*
>   * Limit number of tasks checked in a batch.
>   *
> @@ -187,7 +196,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>  		}
>  		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
>  		if (t->state == TASK_UNINTERRUPTIBLE)
> -			check_hung_task(t, timeout);
> +			/* Check for selective monitoring */
> +			if (!sysctl_hung_task_selective_monitoring ||
> +			    t->hang_detection_enabled)
> +				check_hung_task(t, timeout);
>  	}
>   unlock:
>  	rcu_read_unlock();
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index d9c31bc2eaea..30cc108ce2cc 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1093,6 +1093,16 @@ static struct ctl_table kern_table[] = {
>  		.proc_handler	= proc_dointvec_minmax,
>  		.extra1		= &neg_one,
>  	},
> +	{
> +		.procname	= "hung_task_selective_monitoring",
> +		.data		= &sysctl_hung_task_selective_monitoring,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &one,
> +	},
> +
>  #endif
>  #ifdef CONFIG_RT_MUTEXES
>  	{
> -- 
> QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
>  a Linux Foundation Collaborative Project.
> 
> 

-- 
Luis Rodriguez, SUSE LINUX GmbH
Maxfeldstrasse 5; D-90409 Nuernberg

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-11-09  1:54 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-21 10:25 [PATCH] RFC: hung task: Check specific tasks for long uninterruptible sleep state Imran Khan
2017-08-22  1:54 ` Zhang, Shile (NSB - CN/Hangzhou)
2017-08-22  6:03   ` Imran Khan
2017-08-22  6:49     ` Zhang, Shile (NSB - CN/Hangzhou)
2017-08-22 20:36 ` Peter Zijlstra
2017-08-23 12:25   ` Imran Khan
2017-11-08  6:28 ` [RFC] hung task: check " Lingutla Chandrasekhar
2017-11-08 11:57   ` Tetsuo Handa
2017-11-08 23:44     ` Andrew Morton
2017-11-09  1:54   ` Luis R. Rodriguez

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.