* [PATCH v2] sched: Warn on long periods of pending need_resched
@ 2021-03-23  3:57 Josh Don
  2021-03-24  9:37 ` Peter Zijlstra
  2021-03-24 11:27 ` [PATCH v2] sched: Warn on long periods of pending need_resched Mel Gorman
  0 siblings, 2 replies; 18+ messages in thread
From: Josh Don @ 2021-03-23  3:57 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot
  Cc: Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner, Josh Don

From: Paul Turner <pjt@google.com>

The CPU scheduler sets the need_resched flag to signal that schedule()
should run on a particular CPU. However, schedule() may not happen
immediately if the current task is executing in kernel mode (in a
non-preemptible state) for an extended period of time.

This patch adds a warning that fires if need_resched has been pending
for longer than the time specified by the sysctl
resched_latency_warn_ms. If the warning fires, a cond_resched() call is
likely missing somewhere. Monitoring is done via the tick, so accuracy
is limited to jiffy scale; this also means the warning will not trigger
if the tick is disabled.

This feature is disabled by default. It can be enabled via the sysctl
resched_latency_warn_enabled.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Josh Don <joshdon@google.com>
---
Delta from v1:
- separate sysctl for enabling/disabling and triggering warn_once
  behavior
- add documentation
- static branch for the enable
 Documentation/admin-guide/sysctl/kernel.rst | 23 ++++++
 include/linux/sched/sysctl.h                |  4 ++
 kernel/sched/core.c                         | 78 ++++++++++++++++++++-
 kernel/sched/debug.c                        | 10 +++
 kernel/sched/sched.h                        | 10 +++
 kernel/sysctl.c                             | 24 +++++++
 6 files changed, 148 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 1d56a6b73a4e..2d4a21d3b79f 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1077,6 +1077,29 @@ ROM/Flash boot loader. Maybe to tell it what to do after
 rebooting. ???
 
 
+resched_latency_warn_enabled
+============================
+
+Enables/disables a warning that will trigger if need_resched is set for
+longer than sysctl ``resched_latency_warn_ms``. This warning likely
+indicates a kernel bug, such as a failure to call cond_resched().
+
+Requires ``CONFIG_SCHED_DEBUG``.
+
+
+resched_latency_warn_ms
+=======================
+
+See ``resched_latency_warn_enabled``.
+
+
+resched_latency_warn_once
+=========================
+
+If set, ``resched_latency_warn_enabled`` will only trigger one warning
+per boot.
+
+
 sched_energy_aware
 ==================
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3c31ba88aca5..43a1f5ab819a 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,10 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
 
+extern struct static_key_false resched_latency_warn_enabled;
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
+
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *length, loff_t *ppos);
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 98191218d891..d69ae342b450 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -58,7 +58,21 @@ const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
 #undef SCHED_FEAT
-#endif
+
+/*
+ * Print a warning if need_resched is set for the given duration (if
+ * resched_latency_warn_enabled is set).
+ *
+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
+ * per boot.
+ *
+ * Resched latency will be ignored for the first resched_boot_quiet_sec, to
+ * reduce false alarms.
+ */
+int sysctl_resched_latency_warn_ms = 100;
+int sysctl_resched_latency_warn_once = 1;
+const long resched_boot_quiet_sec = 600;
+#endif /* CONFIG_SCHED_DEBUG */
 
 /*
  * Number of tasks to iterate in a single balance run.
@@ -4520,6 +4534,58 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
+#ifdef CONFIG_SCHED_DEBUG
+static u64 resched_latency_check(struct rq *rq)
+{
+	int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+	u64 need_resched_latency, now = rq_clock(rq);
+	static bool warned_once;
+
+	if (sysctl_resched_latency_warn_once && warned_once)
+		return 0;
+
+	if (!need_resched() || WARN_ON_ONCE(latency_warn_ms < 2))
+		return 0;
+
+	/* Disable this warning for the first few mins after boot */
+	if (now < resched_boot_quiet_sec * NSEC_PER_SEC)
+		return 0;
+
+	if (!rq->last_seen_need_resched_ns) {
+		rq->last_seen_need_resched_ns = now;
+		rq->ticks_without_resched = 0;
+		return 0;
+	}
+
+	rq->ticks_without_resched++;
+	need_resched_latency = now - rq->last_seen_need_resched_ns;
+	if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+		return 0;
+
+	warned_once = true;
+
+	return need_resched_latency;
+}
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+	long val;
+
+	if ((kstrtol(str, 0, &val))) {
+		pr_warn("Unable to set resched_latency_warn_ms\n");
+		return 1;
+	}
+
+	sysctl_resched_latency_warn_ms = val;
+	return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 resched_latency_check(struct rq *rq) { return 0; }
+#endif /* CONFIG_SCHED_DEBUG */
+
+DEFINE_STATIC_KEY_FALSE(resched_latency_warn_enabled);
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -4531,6 +4597,7 @@ void scheduler_tick(void)
 	struct task_struct *curr = rq->curr;
 	struct rq_flags rf;
 	unsigned long thermal_pressure;
+	u64 resched_latency = 0;
 
 	arch_scale_freq_tick();
 	sched_clock_tick();
@@ -4541,11 +4608,17 @@ void scheduler_tick(void)
 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
+	if (static_branch_unlikely(&resched_latency_warn_enabled))
+		resched_latency = resched_latency_check(rq);
 	calc_global_load_tick(rq);
 	psi_task_tick(rq);
 
 	rq_unlock(rq, &rf);
 
+	if (static_branch_unlikely(&resched_latency_warn_enabled) &&
+	    resched_latency)
+		resched_latency_warn(cpu, resched_latency);
+
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
@@ -5040,6 +5113,9 @@ static void __sched notrace __schedule(bool preempt)
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+	rq->last_seen_need_resched_ns = 0;
+#endif
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 486f403a778b..39fe8c7851f7 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1033,3 +1033,13 @@ void proc_sched_set_task(struct task_struct *p)
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 }
+
+void resched_latency_warn(int cpu, u64 latency)
+{
+	static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+	WARN(__ratelimit(&latency_check_ratelimit),
+	     "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
+	     "without schedule\n",
+	     cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522b1e30..ae2a99098388 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -57,6 +57,7 @@
 #include <linux/prefetch.h>
 #include <linux/profile.h>
 #include <linux/psi.h>
+#include <linux/ratelimit.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/security.h>
 #include <linux/stop_machine.h>
@@ -963,6 +964,11 @@ struct rq {
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_SCHED_DEBUG
+	u64 last_seen_need_resched_ns;
+	int ticks_without_resched;
+#endif
+
 #ifdef CONFIG_MEMBARRIER
 	int membarrier_state;
 #endif
@@ -2366,6 +2372,8 @@ extern void print_dl_stats(struct seq_file *m, int cpu);
 extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+extern void resched_latency_warn(int cpu, u64 latency);
 #ifdef CONFIG_NUMA_BALANCING
 extern void
 show_numa_stats(struct task_struct *p, struct seq_file *m);
@@ -2373,6 +2381,8 @@ extern void
 print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
 	unsigned long tpf, unsigned long gsf, unsigned long gpf);
 #endif /* CONFIG_NUMA_BALANCING */
+#else
+static inline void resched_latency_warn(int cpu, u64 latency) {}
 #endif /* CONFIG_SCHED_DEBUG */
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09b5dc1..b784a1c98c5e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -193,6 +193,7 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
 #endif /* CONFIG_SMP */
+static int min_resched_latency_warn_ms = 2;
 #endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
@@ -1763,6 +1764,29 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
+	{
+		.procname	= "resched_latency_warn_enabled",
+		.data		= &resched_latency_warn_enabled,
+		.mode		= 0644,
+		.proc_handler	= proc_do_static_key,
+	},
+	{
+		.procname	= "resched_latency_warn_ms",
+		.data		= &sysctl_resched_latency_warn_ms,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_resched_latency_warn_ms,
+	},
+	{
+		.procname	= "resched_latency_warn_once",
+		.data		= &sysctl_resched_latency_warn_once,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 #endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
-- 
2.31.0.rc2.261.g7f71774620-goog



* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-23  3:57 [PATCH v2] sched: Warn on long periods of pending need_resched Josh Don
@ 2021-03-24  9:37 ` Peter Zijlstra
  2021-03-24 10:54   ` Peter Zijlstra
  2021-03-24 11:27 ` [PATCH v2] sched: Warn on long periods of pending need_resched Mel Gorman
  1 sibling, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2021-03-24  9:37 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Mon, Mar 22, 2021 at 08:57:06PM -0700, Josh Don wrote:
> From: Paul Turner <pjt@google.com>
> 
> The CPU scheduler sets the need_resched flag to signal that schedule()
> should run on a particular CPU. However, schedule() may not happen
> immediately if the current task is executing in kernel mode (in a
> non-preemptible state) for an extended period of time.
> 
> This patch adds a warning that fires if need_resched has been pending
> for longer than the time specified by the sysctl
> resched_latency_warn_ms. If the warning fires, a cond_resched() call is
> likely missing somewhere. Monitoring is done via the tick, so accuracy
> is limited to jiffy scale; this also means the warning will not trigger
> if the tick is disabled.
> 
> This feature is disabled by default. It can be enabled via the sysctl
> resched_latency_warn_enabled.
> 
> Signed-off-by: Paul Turner <pjt@google.com>
> Signed-off-by: Josh Don <joshdon@google.com>
> ---
> Delta from v1:
> - separate sysctl for enabling/disabling and triggering warn_once
>   behavior
> - add documentation
> - static branch for the enable
>  Documentation/admin-guide/sysctl/kernel.rst | 23 ++++++
>  include/linux/sched/sysctl.h                |  4 ++
>  kernel/sched/core.c                         | 78 ++++++++++++++++++++-
>  kernel/sched/debug.c                        | 10 +++
>  kernel/sched/sched.h                        | 10 +++
>  kernel/sysctl.c                             | 24 +++++++
>  6 files changed, 148 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index 1d56a6b73a4e..2d4a21d3b79f 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -1077,6 +1077,29 @@ ROM/Flash boot loader. Maybe to tell it what to do after
>  rebooting. ???
>  
>  
> +resched_latency_warn_enabled
> +============================
> +
> +Enables/disables a warning that will trigger if need_resched is set for
> +longer than sysctl ``resched_latency_warn_ms``. This warning likely
> +indicates a kernel bug, such as a failure to call cond_resched().
> +
> +Requires ``CONFIG_SCHED_DEBUG``.
> +
> +
> +resched_latency_warn_ms
> +=======================
> +
> +See ``resched_latency_warn_enabled``.
> +
> +
> +resched_latency_warn_once
> +=========================
> +
> +If set, ``resched_latency_warn_enabled`` will only trigger one warning
> +per boot.
> +
> +
>  sched_energy_aware
>  ==================

Should we perhaps take out all SCHED_DEBUG sysctls and move them to
/debug/sched/ ? (along with the existing /debug/sched_{debug,features,preempt}
files)

Having all that in sysctl and documented gives them far too much sheen
of ABI.

Not saying this patch should do that, just as a general observation.


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-24  9:37 ` Peter Zijlstra
@ 2021-03-24 10:54   ` Peter Zijlstra
  2021-03-24 10:55     ` Peter Zijlstra
  2021-03-24 11:42     ` Mel Gorman
  0 siblings, 2 replies; 18+ messages in thread
From: Peter Zijlstra @ 2021-03-24 10:54 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Wed, Mar 24, 2021 at 10:37:43AM +0100, Peter Zijlstra wrote:
> Should we perhaps take out all SCHED_DEBUG sysctls and move them to
> /debug/sched/ ? (along with the existing /debug/sched_{debug,features,preempt}
> files)
> 
> Having all that in sysctl and documented gives them far too much sheen
> of ABI.

... a little something like this ...

---
Subject: sched: Move SCHED_DEBUG to debugfs
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed Mar 24 11:43:21 CET 2021

Stop polluting sysctl with undocumented knobs that really are debug
only, move them all to /debug/sched/.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched/sysctl.h |   12 ---
 kernel/sched/core.c          |   48 +-----------
 kernel/sched/debug.c         |  169 +++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/fair.c          |    9 --
 kernel/sysctl.c              |  116 -----------------------------
 5 files changed, 174 insertions(+), 180 deletions(-)

--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -26,10 +26,11 @@ int proc_dohung_task_timeout_secs(struct
 enum { sysctl_hung_task_timeout_secs = 0 };
 #endif
 
+extern unsigned int sysctl_sched_child_runs_first;
+
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_child_runs_first;
 
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
@@ -37,7 +38,7 @@ enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_LINEAR,
 	SCHED_TUNABLESCALING_END,
 };
-extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_sched_tunable_scaling;
 
 extern unsigned int sysctl_numa_balancing_scan_delay;
 extern unsigned int sysctl_numa_balancing_scan_period_min;
@@ -47,9 +48,6 @@ extern unsigned int sysctl_numa_balancin
 #ifdef CONFIG_SCHED_DEBUG
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
-		void *buffer, size_t *length, loff_t *ppos);
 #endif
 
 /*
@@ -87,10 +85,6 @@ int sched_rt_handler(struct ctl_table *t
 		size_t *lenp, loff_t *ppos);
 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
-int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
-		size_t *lenp, loff_t *ppos);
-int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
-		size_t *lenp, loff_t *ppos);
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 extern unsigned int sysctl_sched_energy_aware;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3612,27 +3612,6 @@ void set_numabalancing_state(bool enable
 		static_branch_disable(&sched_numa_balancing);
 }
 
-#ifdef CONFIG_PROC_SYSCTL
-int sysctl_numa_balancing(struct ctl_table *table, int write,
-			  void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table t;
-	int err;
-	int state = static_branch_likely(&sched_numa_balancing);
-
-	if (write && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	t = *table;
-	t.data = &state;
-	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
-	if (err < 0)
-		return err;
-	if (write)
-		set_numabalancing_state(state);
-	return err;
-}
-#endif
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
@@ -3640,7 +3619,7 @@ int sysctl_numa_balancing(struct ctl_tab
 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
 static bool __initdata __sched_schedstats = false;
 
-static void set_schedstats(bool enabled)
+void set_schedstats(bool enabled)
 {
 	if (enabled)
 		static_branch_enable(&sched_schedstats);
@@ -3687,27 +3666,6 @@ static void __init init_schedstats(void)
 	set_schedstats(__sched_schedstats);
 }
 
-#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
-		size_t *lenp, loff_t *ppos)
-{
-	struct ctl_table t;
-	int err;
-	int state = static_branch_likely(&sched_schedstats);
-
-	if (write && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	t = *table;
-	t.data = &state;
-	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
-	if (err < 0)
-		return err;
-	if (write)
-		set_schedstats(state);
-	return err;
-}
-#endif /* CONFIG_PROC_SYSCTL */
 #else  /* !CONFIG_SCHEDSTATS */
 static inline void init_schedstats(void) {}
 #endif /* CONFIG_SCHEDSTATS */
@@ -5504,9 +5462,11 @@ static const struct file_operations sche
 	.release	= single_release,
 };
 
+extern struct dentry *debugfs_sched;
+
 static __init int sched_init_debug_dynamic(void)
 {
-	debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
+	debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
 	return 0;
 }
 late_initcall(sched_init_debug_dynamic);
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -169,15 +169,176 @@ static const struct file_operations sche
 	.release	= single_release,
 };
 
+#ifdef CONFIG_SMP
+
+extern int sched_update_scaling(void);
+
+static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
+				   size_t cnt, loff_t *ppos)
+{
+	char buf[16];
+
+	if (cnt > 15)
+		cnt = 15;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+		return -EINVAL;
+
+	if (sched_update_scaling())
+		return -EINVAL;
+
+	*ppos += cnt;
+	return cnt;
+}
+
+static int sched_scaling_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
+	return 0;
+}
+
+static int sched_scaling_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_scaling_show, NULL);
+}
+
+static const struct file_operations sched_scaling_fops = {
+	.open		= sched_scaling_open,
+	.write		= sched_scaling_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#ifdef CONFIG_SCHEDSTATS
+
+extern void set_schedstats(bool enabled);
+
+static ssize_t sched_stats_write(struct file *filp, const char __user *ubuf,
+				   size_t cnt, loff_t *ppos)
+{
+	char buf[16];
+	bool enabled;
+
+	if (cnt > 15)
+		cnt = 15;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	if (kstrtobool(buf, &enabled))
+		return -EINVAL;
+
+	set_schedstats(enabled);
+
+	*ppos += cnt;
+	return cnt;
+}
+
+static int sched_stats_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", static_key_enabled(&sched_schedstats));
+	return 0;
+}
+
+static int sched_stats_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_stats_show, NULL);
+}
+
+static const struct file_operations sched_stats_fops = {
+	.open		= sched_stats_open,
+	.write		= sched_stats_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif /* SCHEDSTATS */
+#endif /* SMP */
+
+#ifdef CONFIG_NUMA_BALANCING
+
+static ssize_t sched_numa_write(struct file *filp, const char __user *ubuf,
+				   size_t cnt, loff_t *ppos)
+{
+	char buf[16];
+	bool enabled;
+
+	if (cnt > 15)
+		cnt = 15;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	if (kstrtobool(buf, &enabled))
+		return -EINVAL;
+
+	set_numabalancing_state(enabled);
+
+	*ppos += cnt;
+	return cnt;
+}
+
+static int sched_numa_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", static_key_enabled(&sched_numa_balancing));
+	return 0;
+}
+
+static int sched_numa_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_numa_show, NULL);
+}
+
+static const struct file_operations sched_numa_fops = {
+	.open		= sched_numa_open,
+	.write		= sched_numa_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif
+
 __read_mostly bool sched_debug_enabled;
 
+struct dentry *debugfs_sched;
+
 static __init int sched_init_debug(void)
 {
-	debugfs_create_file("sched_features", 0644, NULL, NULL,
-			&sched_feat_fops);
+	struct dentry __maybe_unused *numa;
+
+	debugfs_sched = debugfs_create_dir("sched", NULL);
+
+	debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
+	debugfs_create_bool("debug", 0644, debugfs_sched, &sched_debug_enabled);
+
+	debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
+	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
+	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
+
+#ifdef CONFIG_SMP
+	debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
+	debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
+	debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
+#ifdef CONFIG_SCHEDSTATS
+	debugfs_create_file("stats", 0644, debugfs_sched, NULL, &sched_stats_fops);
+#endif
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	numa = debugfs_create_dir("numa", debugfs_sched);
 
-	debugfs_create_bool("sched_debug", 0644, NULL,
-			&sched_debug_enabled);
+	debugfs_create_file("balancing", 0644, numa, NULL, &sched_numa_fops);
+	debugfs_create_u32("balancing_scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay);
+	debugfs_create_u32("balancing_scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
+	debugfs_create_u32("balancing_scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
+	debugfs_create_u32("balancing_scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+#endif
 
 	return 0;
 }
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -49,7 +49,7 @@ static unsigned int normalized_sysctl_sc
  *
  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  */
-enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
@@ -627,15 +627,10 @@ struct sched_entity *__pick_last_entity(
  * Scheduling class statistics methods:
  */
 
-int sched_proc_update_handler(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
+int sched_update_scaling(void)
 {
-	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	unsigned int factor = get_update_sysctl_factor();
 
-	if (ret || !write)
-		return ret;
-
 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 					sysctl_sched_min_granularity);
 
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -184,17 +184,6 @@ static enum sysctl_writes_mode sysctl_wr
 int sysctl_legacy_va_layout;
 #endif
 
-#ifdef CONFIG_SCHED_DEBUG
-static int min_sched_granularity_ns = 100000;		/* 100 usecs */
-static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
-static int min_wakeup_granularity_ns;			/* 0 usecs */
-static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
-#ifdef CONFIG_SMP
-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif /* CONFIG_SMP */
-#endif /* CONFIG_SCHED_DEBUG */
-
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
 static int max_extfrag_threshold = 1000;
@@ -1659,111 +1648,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-#ifdef CONFIG_SCHED_DEBUG
-	{
-		.procname	= "sched_min_granularity_ns",
-		.data		= &sysctl_sched_min_granularity,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_proc_update_handler,
-		.extra1		= &min_sched_granularity_ns,
-		.extra2		= &max_sched_granularity_ns,
-	},
-	{
-		.procname	= "sched_latency_ns",
-		.data		= &sysctl_sched_latency,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_proc_update_handler,
-		.extra1		= &min_sched_granularity_ns,
-		.extra2		= &max_sched_granularity_ns,
-	},
-	{
-		.procname	= "sched_wakeup_granularity_ns",
-		.data		= &sysctl_sched_wakeup_granularity,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_proc_update_handler,
-		.extra1		= &min_wakeup_granularity_ns,
-		.extra2		= &max_wakeup_granularity_ns,
-	},
-#ifdef CONFIG_SMP
-	{
-		.procname	= "sched_tunable_scaling",
-		.data		= &sysctl_sched_tunable_scaling,
-		.maxlen		= sizeof(enum sched_tunable_scaling),
-		.mode		= 0644,
-		.proc_handler	= sched_proc_update_handler,
-		.extra1		= &min_sched_tunable_scaling,
-		.extra2		= &max_sched_tunable_scaling,
-	},
-	{
-		.procname	= "sched_migration_cost_ns",
-		.data		= &sysctl_sched_migration_cost,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "sched_nr_migrate",
-		.data		= &sysctl_sched_nr_migrate,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#ifdef CONFIG_SCHEDSTATS
-	{
-		.procname	= "sched_schedstats",
-		.data		= NULL,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sysctl_schedstats,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif /* CONFIG_SCHEDSTATS */
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_NUMA_BALANCING
-	{
-		.procname	= "numa_balancing_scan_delay_ms",
-		.data		= &sysctl_numa_balancing_scan_delay,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "numa_balancing_scan_period_min_ms",
-		.data		= &sysctl_numa_balancing_scan_period_min,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "numa_balancing_scan_period_max_ms",
-		.data		= &sysctl_numa_balancing_scan_period_max,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "numa_balancing_scan_size_mb",
-		.data		= &sysctl_numa_balancing_scan_size,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ONE,
-	},
-	{
-		.procname	= "numa_balancing",
-		.data		= NULL, /* filled in by handler */
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sysctl_numa_balancing,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-24 10:54   ` Peter Zijlstra
@ 2021-03-24 10:55     ` Peter Zijlstra
  2021-03-24 11:42     ` Mel Gorman
  1 sibling, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2021-03-24 10:55 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Wed, Mar 24, 2021 at 11:54:24AM +0100, Peter Zijlstra wrote:
> On Wed, Mar 24, 2021 at 10:37:43AM +0100, Peter Zijlstra wrote:
> > Should we perhaps take out all SCHED_DEBUG sysctls and move them to
> > /debug/sched/ ? (along with the existing /debug/sched_{debug,features,preemp}
> > files)
> > 
> > Having all that in sysctl and documented gives them far too much sheen
> > of ABI.
> 
> ... a little something like this ...
> 

And then the parent post becomes something like this...

---
Subject: sched: Warn on long periods of pending need_resched
From: Paul Turner <pjt@google.com>
Date: Mon, 22 Mar 2021 20:57:06 -0700

From: Paul Turner <pjt@google.com>

The CPU scheduler sets the need_resched flag to signal that schedule()
should run on a particular CPU. However, schedule() may not happen
immediately if the current task is executing in kernel mode (in a
non-preemptible state) for an extended period of time.

This patch adds a warning that fires if need_resched has been pending
for longer than the time specified by the sysctl
resched_latency_warn_ms. If the warning fires, a cond_resched() call is
likely missing somewhere. Monitoring is done via the tick, so accuracy
is limited to jiffy scale; this also means the warning will not trigger
if the tick is disabled.

This feature is disabled by default.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210323035706.572953-1-joshdon@google.com
---
 include/linux/sched/sysctl.h |    3 +
 kernel/sched/core.c          |   75 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/debug.c         |   13 +++++++
 kernel/sched/features.h      |    2 +
 kernel/sched/sched.h         |   10 +++++
 5 files changed, 102 insertions(+), 1 deletion(-)

--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,9 @@ extern unsigned int sysctl_numa_balancin
 #ifdef CONFIG_SCHED_DEBUG
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
 #endif
 
 /*
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -58,7 +58,21 @@ const_debug unsigned int sysctl_sched_fe
 #include "features.h"
 	0;
 #undef SCHED_FEAT
-#endif
+
+/*
+ * Print a warning if need_resched is set for the given duration (if
+ * resched_latency_warn_enabled is set).
+ *
+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
+ * per boot.
+ *
+ * Resched latency will be ignored for the first resched_boot_quiet_sec, to
+ * reduce false alarms.
+ */
+int sysctl_resched_latency_warn_ms = 100;
+int sysctl_resched_latency_warn_once = 1;
+const long resched_boot_quiet_sec = 600;
+#endif /* CONFIG_SCHED_DEBUG */
 
 /*
  * Number of tasks to iterate in a single balance run.
@@ -4485,6 +4499,56 @@ unsigned long long task_sched_runtime(st
 	return ns;
 }
 
+#ifdef CONFIG_SCHED_DEBUG
+static u64 resched_latency_check(struct rq *rq)
+{
+	int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+	u64 need_resched_latency, now = rq_clock(rq);
+	static bool warned_once;
+
+	if (sysctl_resched_latency_warn_once && warned_once)
+		return 0;
+
+	if (!need_resched() || WARN_ON_ONCE(latency_warn_ms < 2))
+		return 0;
+
+	/* Disable this warning for the first few mins after boot */
+	if (now < resched_boot_quiet_sec * NSEC_PER_SEC)
+		return 0;
+
+	if (!rq->last_seen_need_resched_ns) {
+		rq->last_seen_need_resched_ns = now;
+		rq->ticks_without_resched = 0;
+		return 0;
+	}
+
+	rq->ticks_without_resched++;
+	need_resched_latency = now - rq->last_seen_need_resched_ns;
+	if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+		return 0;
+
+	warned_once = true;
+
+	return need_resched_latency;
+}
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+	long val;
+
+	if ((kstrtol(str, 0, &val))) {
+		pr_warn("Unable to set resched_latency_warn_ms\n");
+		return 1;
+	}
+
+	sysctl_resched_latency_warn_ms = val;
+	return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 resched_latency_check(struct rq *rq) { return 0; }
+#endif /* CONFIG_SCHED_DEBUG */
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -4496,6 +4560,7 @@ void scheduler_tick(void)
 	struct task_struct *curr = rq->curr;
 	struct rq_flags rf;
 	unsigned long thermal_pressure;
+	u64 resched_latency;
 
 	arch_scale_freq_tick();
 	sched_clock_tick();
@@ -4506,10 +4571,15 @@ void scheduler_tick(void)
 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
+	if (sched_feat(LATENCY_WARN))
+		resched_latency = resched_latency_check(rq);
 	calc_global_load_tick(rq);
 
 	rq_unlock(rq, &rf);
 
+	if (sched_feat(LATENCY_WARN) && resched_latency)
+		resched_latency_warn(cpu, resched_latency);
+
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
@@ -5004,6 +5074,9 @@ static void __sched notrace __schedule(b
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+	rq->last_seen_need_resched_ns = 0;
+#endif
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,9 @@ static __init int sched_init_debug(void)
 	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
 	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
 
+	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
+	debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
+
 #ifdef CONFIG_SMP
 	debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
 	debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
@@ -1194,3 +1197,13 @@ void proc_sched_set_task(struct task_str
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 }
+
+void resched_latency_warn(int cpu, u64 latency)
+{
+	static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+	WARN(__ratelimit(&latency_check_ratelimit),
+	     "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
+	     "without schedule\n",
+	     cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -90,3 +90,5 @@ SCHED_FEAT(WA_BIAS, true)
  */
 SCHED_FEAT(UTIL_EST, true)
 SCHED_FEAT(UTIL_EST_FASTUP, true)
+
+SCHED_FEAT(LATENCY_WARN, false)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -58,6 +58,7 @@
 #include <linux/prefetch.h>
 #include <linux/profile.h>
 #include <linux/psi.h>
+#include <linux/ratelimit.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/security.h>
 #include <linux/stop_machine.h>
@@ -971,6 +972,11 @@ struct rq {
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_SCHED_DEBUG
+	u64 last_seen_need_resched_ns;
+	int ticks_without_resched;
+#endif
+
 #ifdef CONFIG_MEMBARRIER
 	int membarrier_state;
 #endif
@@ -2374,6 +2380,8 @@ extern void print_dl_stats(struct seq_fi
 extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+extern void resched_latency_warn(int cpu, u64 latency);
 #ifdef CONFIG_NUMA_BALANCING
 extern void
 show_numa_stats(struct task_struct *p, struct seq_file *m);
@@ -2381,6 +2389,8 @@ extern void
 print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
 	unsigned long tpf, unsigned long gsf, unsigned long gpf);
 #endif /* CONFIG_NUMA_BALANCING */
+#else
+static inline void resched_latency_warn(int cpu, u64 latency) {}
 #endif /* CONFIG_SCHED_DEBUG */
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-23  3:57 [PATCH v2] sched: Warn on long periods of pending need_resched Josh Don
  2021-03-24  9:37 ` Peter Zijlstra
@ 2021-03-24 11:27 ` Mel Gorman
  2021-03-25 21:50   ` Josh Don
  1 sibling, 1 reply; 18+ messages in thread
From: Mel Gorman @ 2021-03-24 11:27 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Mon, Mar 22, 2021 at 08:57:06PM -0700, Josh Don wrote:
> From: Paul Turner <pjt@google.com>
> 
> CPU scheduler marks need_resched flag to signal a schedule() on a
> particular CPU. But, schedule() may not happen immediately in cases
> where the current task is executing in the kernel mode (no
> preemption state) for extended periods of time.
> 
> This patch adds a warn_on if need_resched is pending for more than the
> time specified in sysctl resched_latency_warn_ms. If it goes off, it is
> likely that there is a missing cond_resched() somewhere. Monitoring is
> done via the tick and the accuracy is hence limited to jiffy scale. This
> also means that we won't trigger the warning if the tick is disabled.
> 
> This feature is default disabled. It can be toggled on using sysctl
> resched_latency_warn_enabled.
> 
> Signed-off-by: Paul Turner <pjt@google.com>
> Signed-off-by: Josh Don <joshdon@google.com>
> ---
> Delta from v1:
> - separate sysctl for enabling/disabling and triggering warn_once
>   behavior
> - add documentation
> - static branch for the enable
>  Documentation/admin-guide/sysctl/kernel.rst | 23 ++++++
>  include/linux/sched/sysctl.h                |  4 ++
>  kernel/sched/core.c                         | 78 ++++++++++++++++++++-
>  kernel/sched/debug.c                        | 10 +++
>  kernel/sched/sched.h                        | 10 +++
>  kernel/sysctl.c                             | 24 +++++++
>  6 files changed, 148 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index 1d56a6b73a4e..2d4a21d3b79f 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -1077,6 +1077,29 @@ ROM/Flash boot loader. Maybe to tell it what to do after
>  rebooting. ???
>  
>  
> +resched_latency_warn_enabled
> +============================
> +
> +Enables/disables a warning that will trigger if need_resched is set for
> +longer than sysctl ``resched_latency_warn_ms``. This warning likely
> +indicates a kernel bug, such as a failure to call cond_resched().
> +
> +Requires ``CONFIG_SCHED_DEBUG``.
> +

I'm not a fan of the name. I know other sysctls have _enabled in the
name but it's redundant. If you say the name out loud, it sounds weird.
I would suggest an alternative but see below.

> +
> +resched_latency_warn_ms
> +=======================
> +
> +See ``resched_latency_warn_enabled``.
> +
> +
> +resched_latency_warn_once
> +=========================
> +
> +If set, ``resched_latency_warn_enabled`` will only trigger one warning
> +per boot.
> +

I suggest semantics and naming similar to hung_task_warnings
because it's sort of similar. resched_latency_warnings would combine
resched_latency_warn_enabled and resched_latency_warn_once. 0 would mean
"never warn", -1 would mean always warn and any positive value means
"warn X number of times".

Internally, you could still use the static label
resched_latency_warn_enabled, it would simply be false if
resched_latency_warnings == 0.

Obviously though sysctl_resched_latency_warn_once would need to change.
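
For illustration, the table entry could then mirror the
hung_task_warnings one. A rough, untested sketch (the name is only a
suggestion, and it assumes the neg_one minimum that hung_task_warnings
already uses in kernel/sysctl.c):

int sysctl_resched_latency_warnings = -1; /* -1: always, 0: never, >0: N more */

	{
		.procname	= "resched_latency_warnings",
		.data		= &sysctl_resched_latency_warnings,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &neg_one,
	},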

> +
>  sched_energy_aware
>  ==================
>  
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index 3c31ba88aca5..43a1f5ab819a 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -48,6 +48,10 @@ extern unsigned int sysctl_numa_balancing_scan_size;
>  extern __read_mostly unsigned int sysctl_sched_migration_cost;
>  extern __read_mostly unsigned int sysctl_sched_nr_migrate;
>  
> +extern struct static_key_false resched_latency_warn_enabled;
> +extern int sysctl_resched_latency_warn_ms;
> +extern int sysctl_resched_latency_warn_once;
> +
>  int sched_proc_update_handler(struct ctl_table *table, int write,
>  		void *buffer, size_t *length, loff_t *ppos);
>  #endif
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 98191218d891..d69ae342b450 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -58,7 +58,21 @@ const_debug unsigned int sysctl_sched_features =
>  #include "features.h"
>  	0;
>  #undef SCHED_FEAT
> -#endif
> +
> +/*
> + * Print a warning if need_resched is set for the given duration (if
> + * resched_latency_warn_enabled is set).
> + *
> + * If sysctl_resched_latency_warn_once is set, only one warning will be shown
> + * per boot.
> + *
> + * Resched latency will be ignored for the first resched_boot_quiet_sec, to
> + * reduce false alarms.
> + */
> +int sysctl_resched_latency_warn_ms = 100;
> +int sysctl_resched_latency_warn_once = 1;

Use __read_mostly

> +const long resched_boot_quiet_sec = 600;

This seems arbitrary but could also be a #define. More on this later
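
I.e. something like (sketch):

	/* Quiet period after boot before warning: 10 minutes. */
	#define RESCHED_BOOT_QUIET_SEC	(10 * 60)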

> +#endif /* CONFIG_SCHED_DEBUG */
>  


>  /*
>   * Number of tasks to iterate in a single balance run.
> @@ -4520,6 +4534,58 @@ unsigned long long task_sched_runtime(struct task_struct *p)
>  	return ns;
>  }
>  
> +#ifdef CONFIG_SCHED_DEBUG
> +static u64 resched_latency_check(struct rq *rq)
> +{
> +	int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
> +	u64 need_resched_latency, now = rq_clock(rq);
> +	static bool warned_once;
> +
> +	if (sysctl_resched_latency_warn_once && warned_once)
> +		return 0;
> +

That is a global variable that can be modified in parallel and I do not
think it is properly locked (scheduler_tick holds the rq lock, which does
not protect this).

Consider making resched_latency_warnings atomic and use
atomic_dec_if_positive. If it drops to zero in this path, disable the
static branch.
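
Roughly like this (untested sketch; note that static_branch_disable()
may sleep, so actually disabling the branch from the tick path would
have to be deferred to process context):

static atomic_t resched_latency_warnings = ATOMIC_INIT(-1);

/* Returns true if the caller is allowed to emit one warning. */
static bool resched_latency_warn_take(void)
{
	int remaining;

	if (atomic_read(&resched_latency_warnings) < 0)
		return true;	/* -1 means always warn */

	/* Atomically consume one warning from the budget. */
	remaining = atomic_dec_if_positive(&resched_latency_warnings);

	/*
	 * remaining == 0 means we just took the last warning; this is
	 * where the static branch would get disabled (deferred).
	 */
	return remaining >= 0;
}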

That said, it may be overkill. hung_task_warnings does not appear to have
special protection that prevents it going to -1 or lower values by accident
either. Maybe it can afford to be a bit more relaxed because a system that
is spamming hung task warnings is probably dead or might as well be dead.

> +	if (!need_resched() || WARN_ON_ONCE(latency_warn_ms < 2))
> +		return 0;
> +

Why is 1ms special? Regardless of the answer, if the sysctl should not
be 1 then the user should not be able to set it to 1.

> +	/* Disable this warning for the first few mins after boot */
> +	if (now < resched_boot_quiet_sec * NSEC_PER_SEC)
> +		return 0;
> +

Check system_state == SYSTEM_BOOTING instead?
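
E.g. (sketch):

	if (system_state == SYSTEM_BOOTING)
		return 0;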

> +	if (!rq->last_seen_need_resched_ns) {
> +		rq->last_seen_need_resched_ns = now;
> +		rq->ticks_without_resched = 0;
> +		return 0;
> +	}
> +
> +	rq->ticks_without_resched++;
> +	need_resched_latency = now - rq->last_seen_need_resched_ns;
> +	if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
> +		return 0;
> +

The naming need_resched_latency implies it's a boolean but it's not.
Maybe just resched_latency?

Similarly, resched_latency_check implies it returns a boolean but it
returns an excessive latency value. At this point I've been reading the
patch for a long time so I've run out of naming suggestions :)

> +	warned_once = true;
> +
> +	return need_resched_latency;
> +}
> +

I note that you split detecting when a warning is needed from printing
the warning, but it's not clear why. Sure, you are under the RQ lock, but
there are other places that warn under the RQ lock. I suppose for
consistency it could use SCHED_WARN_ON even though all this code is
under SCHED_DEBUG already.

> +static int __init setup_resched_latency_warn_ms(char *str)
> +{
> +	long val;
> +
> +	if ((kstrtol(str, 0, &val))) {
> +		pr_warn("Unable to set resched_latency_warn_ms\n");
> +		return 1;
> +	}
> +
> +	sysctl_resched_latency_warn_ms = val;
> +	return 1;
> +}
> +__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
> +#else
> +static inline u64 resched_latency_check(struct rq *rq) { return 0; }
> +#endif /* CONFIG_SCHED_DEBUG */
> +
> +DEFINE_STATIC_KEY_FALSE(resched_latency_warn_enabled);
> +
>  /*
>   * This function gets called by the timer code, with HZ frequency.
>   * We call it with interrupts disabled.
> @@ -4531,6 +4597,7 @@ void scheduler_tick(void)
>  	struct task_struct *curr = rq->curr;
>  	struct rq_flags rf;
>  	unsigned long thermal_pressure;
> +	u64 resched_latency = 0;
>  
>  	arch_scale_freq_tick();
>  	sched_clock_tick();
> @@ -4541,11 +4608,17 @@ void scheduler_tick(void)
>  	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
>  	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
>  	curr->sched_class->task_tick(rq, curr, 0);
> +	if (static_branch_unlikely(&resched_latency_warn_enabled))
> +		resched_latency = resched_latency_check(rq);
>  	calc_global_load_tick(rq);
>  	psi_task_tick(rq);
>  
>  	rq_unlock(rq, &rf);
>  
> +	if (static_branch_unlikely(&resched_latency_warn_enabled) &&
> +	    resched_latency)
> +		resched_latency_warn(cpu, resched_latency);
> +
>  	perf_event_task_tick();
>  

I don't see the need to split latency detection from the display of the
warning. As resched_latency_check is static with a single caller, it should
be inlined so you can move all the logic, including the static branch
check, there. Maybe to be on the safe side, explicitly mark it inline.

That allows you to delete resched_latency_warn and avoid advertising it
through sched.h
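
I.e. roughly (untested sketch, folding the patch's existing logic and
the ratelimited WARN into one helper called under the rq lock):

static inline void resched_latency_check(struct rq *rq)
{
	static DEFINE_RATELIMIT_STATE(latency_ratelimit, 60 * 60 * HZ, 1);
	int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
	static bool warned_once;
	u64 latency, now;

	if (!static_branch_unlikely(&resched_latency_warn_enabled))
		return;
	if (sysctl_resched_latency_warn_once && warned_once)
		return;
	if (!need_resched() || WARN_ON_ONCE(latency_warn_ms < 2))
		return;

	now = rq_clock(rq);
	/* Disable this warning for the first few mins after boot */
	if (now < resched_boot_quiet_sec * NSEC_PER_SEC)
		return;

	if (!rq->last_seen_need_resched_ns) {
		rq->last_seen_need_resched_ns = now;
		rq->ticks_without_resched = 0;
		return;
	}

	rq->ticks_without_resched++;
	latency = now - rq->last_seen_need_resched_ns;
	if (latency <= latency_warn_ms * NSEC_PER_MSEC)
		return;

	warned_once = true;
	WARN(__ratelimit(&latency_ratelimit),
	     "sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n",
	     cpu_of(rq), latency, rq->ticks_without_resched);
}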

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-24 10:54   ` Peter Zijlstra
  2021-03-24 10:55     ` Peter Zijlstra
@ 2021-03-24 11:42     ` Mel Gorman
  2021-03-24 12:12       ` Peter Zijlstra
  1 sibling, 1 reply; 18+ messages in thread
From: Mel Gorman @ 2021-03-24 11:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Josh Don, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Wed, Mar 24, 2021 at 11:54:24AM +0100, Peter Zijlstra wrote:
> On Wed, Mar 24, 2021 at 10:37:43AM +0100, Peter Zijlstra wrote:
> > Should we perhaps take out all SCHED_DEBUG sysctls and move them to
> > /debug/sched/ ? (along with the existing /debug/sched_{debug,features,preempt}
> > files)
> > 
> > Having all that in sysctl and documented gives them far too much sheen
> > of ABI.
> 
> ... a little something like this ...
> 

I did not read this particularly carefully or boot it to check but some
of the sysctls moved are expected to exist and should never have been
under SCHED_DEBUG.

For example, I'm surprised that numa_balancing is under the SCHED_DEBUG
sysctl because there are legitimate reasons to disable that at runtime.
For example, HPC clusters running various workloads may disable NUMA
balancing globally for particular jobs without wanting to reboot and
reenable it when finished.

Moving something like sched_min_granularity_ns will break a number of
tuning guides as well as the "tuned" tool which ships by default with
some distros and I believe some of the default profiles used for tuned
tweak kernel.sched_min_granularity_ns

Whether there are legitimate reasons to modify those values or not,
removing them may generate fun bug reports.

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-24 11:42     ` Mel Gorman
@ 2021-03-24 12:12       ` Peter Zijlstra
  2021-03-24 13:39         ` Mel Gorman
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2021-03-24 12:12 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Josh Don, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Wed, Mar 24, 2021 at 11:42:24AM +0000, Mel Gorman wrote:
> On Wed, Mar 24, 2021 at 11:54:24AM +0100, Peter Zijlstra wrote:
> > On Wed, Mar 24, 2021 at 10:37:43AM +0100, Peter Zijlstra wrote:
> > > Should we perhaps take out all SCHED_DEBUG sysctls and move them to
> > > /debug/sched/ ? (along with the existing /debug/sched_{debug,features,preempt}
> > > files)
> > > 
> > > Having all that in sysctl and documented gives them far too much sheen
> > > of ABI.
> > 
> > ... a little something like this ...
> > 
> 
> I did not read this particularly carefully or boot it to check but some
> of the sysctls moved are expected to exist and should never have been
> under SCHED_DEBUG.
> 
> For example, I'm surprised that numa_balancing is under the SCHED_DEBUG
> sysctl because there are legitimate reasons to disable that at runtime.
> For example, HPC clusters running various workloads may disable NUMA
> balancing globally for particular jobs without wanting to reboot and
> reenable it when finished.

Yeah, let's say I was pleasantly surprised to find it there :-)

> Moving something like sched_min_granularity_ns will break a number of
> tuning guides as well as the "tuned" tool which ships by default with
> some distros and I believe some of the default profiles used for tuned
> tweak kernel.sched_min_granularity_ns

Yeah, can't say I care. I suppose some people with PREEMPT=n kernels
increase that to make their server workloads 'go fast'. But it'll
absolutely suck rocks on anything desktop.

These knobs really shouldn't have been as widely available as they are.

And guides, well, the writers have to earn a living too, right.

> Whether there are legimiate reasons to modify those values or not,
> removing them may generate fun bug reports.

Which I'll close with -EDONTCARE, userspace has to cope with
SCHED_DEBUG=n in any case.


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-24 12:12       ` Peter Zijlstra
@ 2021-03-24 13:39         ` Mel Gorman
  2021-03-24 14:36           ` Peter Zijlstra
  2021-04-16 15:53           ` [tip: sched/core] sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG tip-bot2 for Mel Gorman
  0 siblings, 2 replies; 18+ messages in thread
From: Mel Gorman @ 2021-03-24 13:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Josh Don, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Wed, Mar 24, 2021 at 01:12:16PM +0100, Peter Zijlstra wrote:
> On Wed, Mar 24, 2021 at 11:42:24AM +0000, Mel Gorman wrote:
> > On Wed, Mar 24, 2021 at 11:54:24AM +0100, Peter Zijlstra wrote:
> > > On Wed, Mar 24, 2021 at 10:37:43AM +0100, Peter Zijlstra wrote:
> > > > Should we perhaps take out all SCHED_DEBUG sysctls and move them to
> > > > /debug/sched/ ? (along with the existing /debug/sched_{debug,features,preempt}
> > > > files)
> > > > 
> > > > Having all that in sysctl and documented gives them far too much sheen
> > > > of ABI.
> > > 
> > > ... a little something like this ...
> > > 
> > 
> > I did not read this particularly carefully or boot it to check but some
> > of the sysctls moved are expected to exist and should never have been
> > under SCHED_DEBUG.
> > 
> > For example, I'm surprised that numa_balancing is under the SCHED_DEBUG
> > sysctl because there are legitimate reasons to disable that at runtime.
> > For example, HPC clusters running various workloads may disable NUMA
> > balancing globally for particular jobs without wanting to reboot and
> > reenable it when finished.
> 
> Yeah, let's say I was pleasantly surprised to find it there :-)
> 

Minimally, let's move that out before it gets kicked out. Patch below.

> > Moving something like sched_min_granularity_ns will break a number of
> > tuning guides as well as the "tuned" tool which ships by default with
> > some distros and I believe some of the default profiles used for tuned
> > tweak kernel.sched_min_granularity_ns
> 
> Yeah, can't say I care. I suppose some people with PREEMPT=n kernels
> increase that to make their server workloads 'go fast'. But it'll
> absolutely suck rocks on anything desktop.
> 

Broadly speaking yes and despite the lack of documentation, enough people
think of that parameter when tuning for throughput vs latency depending on
the expected use of the machine.  kernel.sched_wakeup_granularity_ns might
get tuned if preemption is causing overscheduling. Same potentially with
kernel.sched_min_granularity_ns and kernel.sched_latency_ns. That said, I'm
struggling to think of an instance where I've seen tuning recommendations
properly quantified other than the impact on microbenchmarks, but I
think there will be complaints if they disappear. I suspect that some
recommended tuning is based on "I tried a number of different values and
this seemed to work reasonably well".

kernel.sched_schedstats probably should not depend on SCHED_DEBUG because
it has value for workload analysis which is not necessarily about debugging
per-se. It might simply be informing whether another variable should be
tuned or useful for debugging applications rather than the kernel.

The others I'm less concerned with. kernel.sched_tunable_scaling is very
specific. sysctl_sched_migration_cost is subtle because it affects lots
of things, including whether tasks are considered cache hot and how load
balancing behaves, and is best left alone. I wonder how many people can
accurately predict how
workloads will behave when that is tuned? sched_nr_migrate is also a hard
one to tune in a sensible fashion.

As an aside, I wonder how often SCHED_DEBUG has been enabled simply
because LATENCYTOP selects it -- no idea offhand why LATENCYTOP even
needs SCHED_DEBUG.

> These knobs really shouldn't have been as widely available as they are.
> 

Probably not. Worse, some of the tuning is probably based on "this worked
for workload X 10 years ago so I'll just keep doing that"

> And guides, well, the writers have to earn a living too, right.
> 

For most of the guides I've seen they either specify values without
explaining why or just describe roughly what the parameter does and it's
not always that accurate a description.

> > Whether there are legitimate reasons to modify those values or not,
> > removing them may generate fun bug reports.
> 
> Which I'll close with -EDONTCARE, userspace has to cope with
> SCHED_DEBUG=n in any case.

True, but removing the throughput vs latency parameters is likely to
generate a lot of noise even if the reasons for tuning are bad ones.
Some definitely should not depend on SCHED_DEBUG; others may need to be
moved to debugfs one patch at a time so they can be reverted individually
if the complaints are excessive and there is a legitimate reason why a
parameter should be tunable. It's possible that complaints will be based
on a workload regression that really depended on tuned changing parameters.

Anyway, I definitely want to save kernel.numa_balancing from the firing
line so....

--8<--
sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG

From: Mel Gorman <mgorman@suse.de>

The ability to enable/disable NUMA balancing is not a debugging feature
and should not depend on CONFIG_SCHED_DEBUG.  For example, machines within
a HPC cluster may disable NUMA balancing temporarily for some jobs and
re-enable it for other jobs without needing to reboot.

This patch removes the dependency on CONFIG_SCHED_DEBUG for
kernel.numa_balancing sysctl. The other numa balancing related sysctls
are left as-is because if they need to be tuned then it is more likely
that NUMA balancing needs to be fixed instead.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 kernel/sysctl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09b5dc1..8042098ae080 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1753,6 +1753,9 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ONE,
 	},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_NUMA_BALANCING
 	{
 		.procname	= "numa_balancing",
 		.data		= NULL, /* filled in by handler */
@@ -1763,7 +1766,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
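
(For the HPC case above, the runtime toggle is then simply
"sysctl kernel.numa_balancing=0" before the job and "=1" afterwards.)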


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-24 13:39         ` Mel Gorman
@ 2021-03-24 14:36           ` Peter Zijlstra
  2021-03-24 15:52             ` Mel Gorman
  2021-04-16 15:53           ` [tip: sched/core] sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG tip-bot2 for Mel Gorman
  1 sibling, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2021-03-24 14:36 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Josh Don, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Wed, Mar 24, 2021 at 01:39:16PM +0000, Mel Gorman wrote:

> > Yeah, let's say I was pleasantly surprised to find it there :-)
> > 
> 
> Minimally, let's move that out before it gets kicked out. Patch below.

OK, stuck that in front.

> > > Moving something like sched_min_granularity_ns will break a number of
> > > tuning guides as well as the "tuned" tool, which ships by default with
> > > some distros; I believe some of the default profiles used for tuned
> > > tweak kernel.sched_min_granularity_ns.
> > 
> > Yeah, can't say I care. I suppose some people with PREEMPT=n kernels
> > increase that to make their server workloads 'go fast'. But it'll
> > absolutely suck rocks on anything desktop.
> > 
> 
> Broadly speaking, yes. Despite the lack of documentation, enough people
> think of that parameter when tuning for throughput vs latency depending
> on the expected use of the machine. kernel.sched_wakeup_granularity_ns
> might get tuned if preemption is causing overscheduling; the same goes,
> potentially, for kernel.sched_min_granularity_ns and kernel.sched_latency_ns.
> That said, I'm struggling to think of an instance where I've seen tuning
> recommendations properly quantified other than by their impact on
> microbenchmarks, but I think there will be complaints if the parameters
> disappear. I suspect that some recommended tuning is based on "I tried a
> number of different values and this seemed to work reasonably well".

Right, except that due to that scaling thing, you'd have to re-evaluate
when you change machines.

Also, do you have any indication of the perf difference we're talking
about? (I should probably ask Google and not you...)

> kernel.sched_schedstats probably should not depend on SCHED_DEBUG because
> it has value for workload analysis which is not necessarily about debugging
> per se. It might simply inform whether another variable should be tuned,
> or be useful for debugging applications rather than the kernel.

Dubious, if you're that far down the rabbit hole, you're dang near
debugging.

> As an aside, I wonder how often SCHED_DEBUG has been enabled simply
> because LATENCYTOP selects it -- no idea offhand why LATENCYTOP even
> needs SCHED_DEBUG.

Perhaps schedstats used to rely on debug? I can't remember. I don't
think I've used latencytop in at least 10 years. ftrace and perf sorta
killed the need for it.

> > These knobs really shouldn't have been as widely available as they are.
> > 
> 
> Probably not. Worse, some of the tuning is probably based on "this worked
> for workload X 10 years ago so I'll just keep doing that"

That sounds like an excellent reason to disrupt ;-)

> > And guides, well, the writers have to earn a living too, right.
> > 
> 
> For most of the guides I've seen, they either specify values without
> explaining why or just describe roughly what the parameter does, and it's
> not always that accurate a description.

Another good reason.

> > > Whether there are legitimate reasons to modify those values or not,
> > > removing them may generate fun bug reports.
> > 
> > Which I'll close with -EDONTCARE, userspace has to cope with
> > SCHED_DEBUG=n in any case.
> 
> True, but removing the throughput vs latency parameters is likely to
> generate a lot of noise even if the reasons for tuning are bad ones.
> Some definitely should not depend on SCHED_DEBUG; others may need to be
> moved to debugfs one patch at a time so they can be reverted individually
> if the complaints are excessive and there is a legitimate reason why a
> parameter should be tunable. It's possible that complaints will be based
> on a workload regression that really depended on tuned changing parameters.

The way I've done it, you can simply reinstate the sysctl table entry
and it'll work again, except for the entries that had a custom handler.

I'm ready to disrupt :-)


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-24 14:36           ` Peter Zijlstra
@ 2021-03-24 15:52             ` Mel Gorman
  2021-03-25 21:58               ` Josh Don
  0 siblings, 1 reply; 18+ messages in thread
From: Mel Gorman @ 2021-03-24 15:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Josh Don, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Wed, Mar 24, 2021 at 03:36:14PM +0100, Peter Zijlstra wrote:
> On Wed, Mar 24, 2021 at 01:39:16PM +0000, Mel Gorman wrote:
> 
> > > Yeah, let's say I was pleasantly surprised to find it there :-)
> > > 
> > 
> > Minimally, let's move that out before it gets kicked out. Patch below.
> 
> OK, stuck that in front.
> 

Thanks.

> > > > Moving something like sched_min_granularity_ns will break a number of
> > > > tuning guides as well as the "tuned" tool, which ships by default with
> > > > some distros; I believe some of the default profiles used for tuned
> > > > tweak kernel.sched_min_granularity_ns.
> > > 
> > > Yeah, can't say I care. I suppose some people with PREEMPT=n kernels
> > > increase that to make their server workloads 'go fast'. But it'll
> > > absolutely suck rocks on anything desktop.
> > > 
> > 
> > Broadly speaking, yes. Despite the lack of documentation, enough people
> > think of that parameter when tuning for throughput vs latency depending
> > on the expected use of the machine. kernel.sched_wakeup_granularity_ns
> > might get tuned if preemption is causing overscheduling; the same goes,
> > potentially, for kernel.sched_min_granularity_ns and kernel.sched_latency_ns.
> > That said, I'm struggling to think of an instance where I've seen tuning
> > recommendations properly quantified other than by their impact on
> > microbenchmarks, but I think there will be complaints if the parameters
> > disappear. I suspect that some recommended tuning is based on "I tried a
> > number of different values and this seemed to work reasonably well".
> 
> Right, except that due to that scaling thing, you'd have to re-evaluate
> when you change machines.
> 

Yes, although in practice I've rarely seen that happen. What I have seen
is tuning parameters being copied across machines or kernel versions and
turning out to be the source of a "regression", because something changed
in the scheduler that invalidated the tuning.
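
To make the scaling concrete, here is a minimal userspace sketch of the
default logarithmic scaling, assuming factor = 1 + ilog2(min(ncpus, 8))
as in get_update_sysctl_factor() and the normalized 6ms sched_latency;
the numbers are illustrative, not measurements:

#include <stdio.h>

/* Userspace approximation of SCHED_TUNABLESCALING_LOG. */
static unsigned int factor(unsigned int ncpus)
{
	unsigned int cpus = ncpus < 8 ? ncpus : 8;
	unsigned int f = 0;

	while (cpus >>= 1)
		f++;		/* integer log2 */
	return 1 + f;
}

int main(void)
{
	unsigned int cpus;

	for (cpus = 1; cpus <= 64; cpus *= 4)
		printf("%2u cpus -> sched_latency ~%ums\n",
		       cpus, 6 * factor(cpus));
	return 0;
}

A 6ms target tuned on a 4-CPU guest is really 18ms; copied verbatim to a
16-CPU host, where the kernel default would have been 24ms, the same
number silently means something different.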

> Also, do you have any indication of the perf difference we're talking
> about? (I should probably ask Google and not you...)
> 

I don't have good data on hand and I don't trust Google for performance
data. However, I know for certain that there are "Enterprise Applications"
whose tuning relies on modifying kernel.sched_min_granularity_ns and
kernel.sched_wakeup_granularity_ns at the very least (might be others,
I'd have to check). The issue was severe enough to fail acceptance testing
for OS upgrades and it generated bugs.

I did not see the raw data, but even if I had, it would have been based on
a battery of tests across multiple platforms and generations, so at best I
would have a vague range. For the vendors in question, it is unlikely they
would release detailed information because it can be seen as commercially
sensitive. I don't really agree that this is useful behaviour, but it is
the reality, so don't shoot the messenger :(

The last I checked, hackbench figures could be changed in the 10-15%
range in either direction depending on group counts, but in itself that
is not useful.

> > kernel.sched_schedstats probably should not depend on SCHED_DEBUG because
> > it has value for workload analysis which is not necessarily about debugging
> > per se. It might simply inform whether another variable should be tuned,
> > or be useful for debugging applications rather than the kernel.
> 
> Dubious, if you're that far down the rabbit hole, you're dang near
> debugging.
> 

Yes, but not necessarily the kernel. For example, the workload analysis
might be to see if the maximum number of threads in a worker pool should
be tuned (either up or down).

> > As an aside, I wonder how often SCHED_DEBUG has been enabled simply
> > because LATENCYTOP selects it -- no idea offhand why LATENCYTOP even
> > needs SCHED_DEBUG.
> 
> Perhaps schedstats used to rely on debug? I can't remember. I don't
> think I've used latencytop in at least 10 years. ftrace and perf sorta
> killed the need for it.
> 

I don't think schedstats used to rely on SCHED_DEBUG. LATENCYTOP appears
to build even if SCHED_DEBUG is disabled, so the select was either an
accident or is no longer necessary.

> > > These knobs really shouldn't have been as widely available as they are.
> > > 
> > 
> > Probably not. Worse, some of the tuning is probably based on "this worked
> > for workload X 10 years ago so I'll just keep doing that"
> 
> That sounds like an excellent reason to disrupt ;-)
> 

The same logic applies to all tuning, unfortunately :P

> > > > Whether there are legitimate reasons to modify those values or not,
> > > > removing them may generate fun bug reports.
> > > 
> > > Which I'll close with -EDONTCARE, userspace has to cope with
> > > SCHED_DEBUG=n in any case.
> > 
> > True, but removing the throughput vs latency parameters is likely to
> > generate a lot of noise even if the reasons for tuning are bad ones.
> > Some definitely should not depend on SCHED_DEBUG; others may need to be
> > moved to debugfs one patch at a time so they can be reverted individually
> > if the complaints are excessive and there is a legitimate reason why a
> > parameter should be tunable. It's possible that complaints will be based
> > on a workload regression that really depended on tuned changing parameters.
> 
> The way I've done it, you can simply reinstate the sysctl table entry
> and it'll work again, except for the entries that had a custom handler.
> 

True.

> I'm ready to disrupt :-)

I'm not going to NAK because I do not have hard data that shows they must
exist. However, I won't ACK either, because I'd bet a lot of tasty
beverages, the next time we meet, that the following parameters will
generate reports if removed.

kernel.sched_latency_ns
kernel.sched_migration_cost_ns
kernel.sched_min_granularity_ns
kernel.sched_wakeup_granularity_ns

I know they are altered by tuned for different profiles and some people
do go to the effort of creating custom profiles for specific applications.
They also show up in "Official Benchmarking" such as SPEC CPU 2017, and
some vendors put a *lot* of effort into SPEC CPU results for bragging
rights. They show up in technical books and best practice guides for
applications. Finally, they show up in Google when searching for "tuning
sched_foo". I'm not saying that any of these are even accurate or a good
idea, just that they show up near the top of the results and they are
sufficiently popular that they might as well be an ABI.

kernel.sched_latency_ns
 https://www.scylladb.com/2016/06/10/read-latency-and-scylla-jmx-process/
 https://github.com/tikv/tikv/issues/2473

kernel.sched_migration_cost_ns
 https://developer.ibm.com/technologies/systems/tutorials/postgresql-experiences-tuning-recomendations-linux-on-ibm-z/
 https://hunleyd.github.io/posts/tuned-PG-and-you/
 https://www.postgresql.org/message-id/50E4AAB1.9040902@optionshouse.com

kernel.sched_min_granularity_ns
 https://community.mellanox.com/s/article/rivermax-linux-performance-tuning-guide--1-x

kernel.sched_wakeup_granularity_ns
 https://www.droidviews.com/boost-performance-on-android-kernels-task-scheduler-part-1/
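
(For context, and from memory rather than a checked copy of the profiles:
tuned's throughput-performance profile sets values along the lines of
kernel.sched_min_granularity_ns = 10000000 and
kernel.sched_wakeup_granularity_ns = 15000000, i.e. several times the
scaled defaults.)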

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-24 11:27 ` [PATCH v2] sched: Warn on long periods of pending need_resched Mel Gorman
@ 2021-03-25 21:50   ` Josh Don
  2021-03-30 22:44     ` Josh Don
  0 siblings, 1 reply; 18+ messages in thread
From: Josh Don @ 2021-03-25 21:50 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Wed, Mar 24, 2021 at 4:27 AM Mel Gorman <mgorman@suse.de> wrote:
>
> I'm not a fan of the name. I know other sysctls have _enabled in the
> name but it's redundant. If you say the name out loud, it sounds weird.
> I would suggest an alternative but see below.

Now using the version rebased by Peter; this control has gone away and
we simply have the scheduling feature LATENCY_WARN.

> I suggest semantics and naming similar to hung_task_warnings
> because it's sort of similar. resched_latency_warnings would combine
> resched_latency_warn_enabled and resched_latency_warn_once. 0 would mean
> "never warn", -1 would mean always warn and any positive value means
> "warn X number of times".

See above. I'm happy with the enabled bit being toggled separately by
a sched feature; the warn_once behavior is not overloaded with the
enabling/disabling. Also, I don't see value in "warn X number of
times", given the warning is rate limited anyway.

> > +int sysctl_resched_latency_warn_ms = 100;
> > +int sysctl_resched_latency_warn_once = 1;
>
> Use __read_mostly

Good point, done.

> > +#ifdef CONFIG_SCHED_DEBUG
> > +static u64 resched_latency_check(struct rq *rq)
> > +{
> > +     int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
> > +     u64 need_resched_latency, now = rq_clock(rq);
> > +     static bool warned_once;
> > +
> > +     if (sysctl_resched_latency_warn_once && warned_once)
> > +             return 0;
> > +
>
> That is a global variable that can be modified in parallel and I do not
> think it's properly locked (scheduler_tick is holding the rq lock, which
> does not protect this).
>
> Consider making resched_latency_warnings atomic and use
> atomic_dec_if_positive. If it drops to zero in this path, disable the
> static branch.
>
> That said, it may be overkill. hung_task_warnings does not appear to have
> special protection that prevents it going to -1 or lower values by accident
> either. Maybe it can afford to be a bit more relaxed because a system that
> is spamming hung task warnings is probably dead or might as well be dead.

There's no real issue if we race on a modification to that sysctl; it is
intentionally not more strongly synchronized for that reason.
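
As an aside, a kernel-style sketch (not the merged code, and the names
are hypothetical) of the hung_task_warnings-like semantics Mel suggested,
where 0 means never warn, -1 means always warn and N > 0 means warn N
times:

/* needs <linux/atomic.h>; hypothetical counter, not in the patch */
static atomic_t resched_latency_warnings = ATOMIC_INIT(-1);

static bool resched_latency_warn_allowed(void)
{
	int budget = atomic_read(&resched_latency_warnings);

	if (budget < 0)		/* -1: unlimited warnings */
		return true;
	if (budget == 0)	/* 0: disabled, or budget spent */
		return false;
	/*
	 * N > 0: consume one warning. atomic_dec_if_positive() never
	 * drops the counter below zero, so racing ticks cannot
	 * underflow the budget.
	 */
	return atomic_dec_if_positive(&resched_latency_warnings) >= 0;
}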

> > +     if (!need_resched() || WARN_ON_ONCE(latency_warn_ms < 2))
> > +             return 0;
> > +
>
> Why is 1ms special? Regardless of the answer, if the sysctl should not
> be 1 then the user should not be able to set it to 1.

Yeah, let me change that to !latency_warn_ms so it isn't HZ-specific.

>
> > +     /* Disable this warning for the first few mins after boot */
> > +     if (now < resched_boot_quiet_sec * NSEC_PER_SEC)
> > +             return 0;
> > +
>
> Check system_state == SYSTEM_BOOTING instead?

Yeah, that might be better; let me test that.

> > +     if (!rq->last_seen_need_resched_ns) {
> > +             rq->last_seen_need_resched_ns = now;
> > +             rq->ticks_without_resched = 0;
> > +             return 0;
> > +     }
> > +
> > +     rq->ticks_without_resched++;
> > +     need_resched_latency = now - rq->last_seen_need_resched_ns;
> > +     if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
> > +             return 0;
> > +
>
> The naming need_resched_latency implies it's a boolean but it's not.
> Maybe just resched_latency?
>
> Similarly, resched_latency_check implies it returns a boolean but it
> returns an excessive latency value. At this point I've been reading the
> patch for a long time, so I've run out of naming suggestions :)

The "need_" part does confuse it a bit; I reworded these to hopefully
make it more clear.

> > +     warned_once = true;
> > +
> > +     return need_resched_latency;
> > +}
> > +
>
> I note that you split detecting that a warning is needed from printing
> the warning, but it's not clear why. Sure, you are under the rq lock, but
> there are other places that warn under the rq lock. I suppose for
> consistency it could use SCHED_WARN_ON even though all this code is under
> SCHED_DEBUG already.

We had seen a circular lock dependency warning (console_sem, pi lock,
rq lock), since printing might need to wake a waiter. However, I do
see plenty of warns under rq->lock, so maybe I missed a patch to
address this?
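
For reference, the reason the split still helps: in the rebased patch the
latency is computed while scheduler_tick() holds the rq lock, but the
warning itself is only printed after rq_unlock(), so printk() can take
console_sem and wake a waiter without the rq lock held. Condensed from
the patch:

	rq_lock(rq, &rf);
	/* ... tick work ... */
	if (sched_feat(LATENCY_WARN))
		resched_latency = resched_latency_check(rq);
	rq_unlock(rq, &rf);

	/* rq lock dropped; safe for the console path to wake waiters */
	if (sched_feat(LATENCY_WARN) && resched_latency)
		resched_latency_warn(cpu, resched_latency);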


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-24 15:52             ` Mel Gorman
@ 2021-03-25 21:58               ` Josh Don
  2021-03-26  8:58                 ` Peter Zijlstra
  0 siblings, 1 reply; 18+ messages in thread
From: Josh Don @ 2021-03-25 21:58 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Peter Zijlstra, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

> On Wed, Mar 24, 2021 at 01:39:16PM +0000, Mel Gorman wrote:
> I'm not going to NAK because I do not have hard data that shows they must
> exist. However, I won't ACK either, because I'd bet a lot of tasty
> beverages, the next time we meet, that the following parameters will
> generate reports if removed.
>
> kernel.sched_latency_ns
> kernel.sched_migration_cost_ns
> kernel.sched_min_granularity_ns
> kernel.sched_wakeup_granularity_ns
>
> I know they are altered by tuned for different profiles and some people
> do go to the effort of creating custom profiles for specific applications.
> They also show up in "Official Benchmarking" such as SPEC CPU 2017, and
> some vendors put a *lot* of effort into SPEC CPU results for bragging
> rights. They show up in technical books and best practice guides for
> applications. Finally, they show up in Google when searching for "tuning
> sched_foo". I'm not saying that any of these are even accurate or a good
> idea, just that they show up near the top of the results and they are
> sufficiently popular that they might as well be an ABI.

+1, these seem like sufficiently well-known scheduler tunables, and
not really SCHED_DEBUG.


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-25 21:58               ` Josh Don
@ 2021-03-26  8:58                 ` Peter Zijlstra
  0 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2021-03-26  8:58 UTC (permalink / raw)
  To: Josh Don
  Cc: Mel Gorman, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Thu, Mar 25, 2021 at 02:58:52PM -0700, Josh Don wrote:
> > On Wed, Mar 24, 2021 at 01:39:16PM +0000, Mel Gorman wrote:
> > I'm not going to NAK because I do not have hard data that shows they must
> > exist. However, I won't ACK either, because I'd bet a lot of tasty
> > beverages, the next time we meet, that the following parameters will
> > generate reports if removed.
> >
> > kernel.sched_latency_ns
> > kernel.sched_migration_cost_ns
> > kernel.sched_min_granularity_ns
> > kernel.sched_wakeup_granularity_ns
> >
> > I know they are altered by tuned for different profiles and some people
> > do go to the effort of creating custom profiles for specific applications.
> > They also show up in "Official Benchmarking" such as SPEC CPU 2017, and
> > some vendors put a *lot* of effort into SPEC CPU results for bragging
> > rights. They show up in technical books and best practice guides for
> > applications. Finally, they show up in Google when searching for "tuning
> > sched_foo". I'm not saying that any of these are even accurate or a good
> > idea, just that they show up near the top of the results and they are
> > sufficiently popular that they might as well be an ABI.
> 
> +1, these seem like sufficiently well-known scheduler tunables, and
> not really SCHED_DEBUG.

So we've never made any guarantees on their behaviour, nor am I willing
to make any.

In fact, I propose we merge the below along with the debugfs move. Just
to make absolutely sure any 'tuning' is broken.



---
Subject: sched,fair: Alternative sched_slice()
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Mar 25 13:44:46 CET 2021

The current sched_slice() seems to have issues; there are two possible
things that could be improved:

 - the 'nr_running' used for __sched_period() is daft when cgroups are
   considered. Using the RQ wide h_nr_running seems like a much more
   consistent number.

 - (esp) cgroups can slice it real fine (pun intended), which makes for
   easy over-scheduling; ensure min_gran is what the name says.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c     |   15 ++++++++++++++-
 kernel/sched/features.h |    3 +++
 2 files changed, 17 insertions(+), 1 deletion(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -680,7 +680,16 @@ static u64 __sched_period(unsigned long
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
+	unsigned int nr_running = cfs_rq->nr_running;
+	u64 slice;
+
+	if (sched_feat(ALT_PERIOD))
+		nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
+
+	slice = __sched_period(nr_running + !se->on_rq);
+
+	if (sched_feat(BASE_SLICE))
+		slice -= sysctl_sched_min_granularity;
 
 	for_each_sched_entity(se) {
 		struct load_weight *load;
@@ -697,6 +706,10 @@ static u64 sched_slice(struct cfs_rq *cf
 		}
 		slice = __calc_delta(slice, se->load.weight, load);
 	}
+
+	if (sched_feat(BASE_SLICE))
+		slice += sysctl_sched_min_granularity;
+
 	return slice;
 }
 
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -90,3 +90,6 @@ SCHED_FEAT(WA_BIAS, true)
  */
 SCHED_FEAT(UTIL_EST, true)
 SCHED_FEAT(UTIL_EST_FASTUP, true)
+
+SCHED_FEAT(ALT_PERIOD, true)
+SCHED_FEAT(BASE_SLICE, true)
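
(Back-of-envelope, not from the changelog: with an 8-CPU machine's scaled
defaults of sched_latency = 24ms and min_granularity = 3ms, an entity
whose weight works out to 1/64th of the rq load previously got a slice of
24ms/64 ~= 0.38ms; with BASE_SLICE it gets (24ms - 3ms)/64 + 3ms ~= 3.3ms,
so no entity is ever sliced finer than min_granularity.)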


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-25 21:50   ` Josh Don
@ 2021-03-30 22:44     ` Josh Don
  2021-04-16 15:04       ` Peter Zijlstra
  0 siblings, 1 reply; 18+ messages in thread
From: Josh Don @ 2021-03-30 22:44 UTC (permalink / raw)
  To: Mel Gorman, Peter Zijlstra
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Daniel Bristot de Oliveira,
	Luis Chamberlain, Kees Cook, Iurii Zaikin, linux-kernel,
	linux-fsdevel, David Rientjes, Oleg Rombakh, linux-doc,
	Paul Turner

Peter,

Since you've already pulled the need_resched warning patch into your
tree, I'm including just the diff based on that patch (in response to
Mel's comments) below. This should be squashed into the original
patch.

Thanks,
Josh

---
From 85796b4d299b1cf3f99bde154a356ce1061221b7 Mon Sep 17 00:00:00 2001
From: Josh Don <joshdon@google.com>
Date: Mon, 22 Mar 2021 20:57:06 -0700
Subject: [PATCH] fixup: sched: Warn on long periods of pending need_resched

---
 kernel/sched/core.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6fdf15eebc0d..c07a4c17205f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -61,17 +61,13 @@ const_debug unsigned int sysctl_sched_features =

 /*
  * Print a warning if need_resched is set for the given duration (if
- * resched_latency_warn_enabled is set).
+ * LATENCY_WARN is enabled).
  *
  * If sysctl_resched_latency_warn_once is set, only one warning will be shown
  * per boot.
- *
- * Resched latency will be ignored for the first resched_boot_quiet_sec, to
- * reduce false alarms.
  */
-int sysctl_resched_latency_warn_ms = 100;
-int sysctl_resched_latency_warn_once = 1;
-static const long resched_boot_quiet_sec = 600;
+__read_mostly int sysctl_resched_latency_warn_ms = 100;
+__read_mostly int sysctl_resched_latency_warn_once = 1;
 #endif /* CONFIG_SCHED_DEBUG */

 /*
@@ -4542,20 +4538,19 @@ unsigned long long task_sched_runtime(struct
task_struct *p)
 }

 #ifdef CONFIG_SCHED_DEBUG
-static u64 resched_latency_check(struct rq *rq)
+static u64 cpu_resched_latency(struct rq *rq)
 {
  int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
- u64 need_resched_latency, now = rq_clock(rq);
+ u64 resched_latency, now = rq_clock(rq);
  static bool warned_once;

  if (sysctl_resched_latency_warn_once && warned_once)
  return 0;

- if (!need_resched() || WARN_ON_ONCE(latency_warn_ms < 2))
+ if (!need_resched() || !latency_warn_ms)
  return 0;

- /* Disable this warning for the first few mins after boot */
- if (now < resched_boot_quiet_sec * NSEC_PER_SEC)
+ if (system_state == SYSTEM_BOOTING)
  return 0;

  if (!rq->last_seen_need_resched_ns) {
@@ -4565,13 +4560,13 @@ static u64 resched_latency_check(struct rq *rq)
  }

  rq->ticks_without_resched++;
- need_resched_latency = now - rq->last_seen_need_resched_ns;
- if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+ resched_latency = now - rq->last_seen_need_resched_ns;
+ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
  return 0;

  warned_once = true;

- return need_resched_latency;
+ return resched_latency;
 }

 static int __init setup_resched_latency_warn_ms(char *str)
@@ -4588,7 +4583,7 @@ static int __init setup_resched_latency_warn_ms(char *str)
 }
 __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
 #else
-static inline u64 resched_latency_check(struct rq *rq) { return 0; }
+static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
 #endif /* CONFIG_SCHED_DEBUG */

 /*
@@ -4614,7 +4609,7 @@ void scheduler_tick(void)
  update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
  curr->sched_class->task_tick(rq, curr, 0);
  if (sched_feat(LATENCY_WARN))
- resched_latency = resched_latency_check(rq);
+ resched_latency = cpu_resched_latency(rq);
  calc_global_load_tick(rq);

  rq_unlock(rq, &rf);
-- 
2.31.0.291.g576ba9dcdaf-goog


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-03-30 22:44     ` Josh Don
@ 2021-04-16 15:04       ` Peter Zijlstra
  2021-04-16 21:33         ` Josh Don
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2021-04-16 15:04 UTC (permalink / raw)
  To: Josh Don
  Cc: Mel Gorman, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Tue, Mar 30, 2021 at 03:44:12PM -0700, Josh Don wrote:
> Peter,
> 
> Since you've already pulled the need_resched warning patch into your
> tree, I'm including just the diff based on that patch (in response to
> Mel's comments) below. This should be squashed into the original
> patch.
> 

Sorry, I seem to have missed this. The patch is completely whitespace
mangled throughout.

I've attached my latest copy of the patch and held it back for now,
please resubmit.

---
Subject: sched: Warn on long periods of pending need_resched
From: Paul Turner <pjt@google.com>
Date: Mon, 22 Mar 2021 20:57:06 -0700

From: Paul Turner <pjt@google.com>

CPU scheduler marks need_resched flag to signal a schedule() on a
particular CPU. But, schedule() may not happen immediately in cases
where the current task is executing in the kernel mode (no
preemption state) for extended periods of time.

This patch adds a warn_on if need_resched is pending for more than the
time specified in sysctl resched_latency_warn_ms. If it goes off, it is
likely that there is a missing cond_resched() somewhere. Monitoring is
done via the tick and the accuracy is hence limited to jiffy scale. This
also means that we won't trigger the warning if the tick is disabled.

This feature is default disabled. It can be toggled on using sysctl
resched_latency_warn_enabled.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210323035706.572953-1-joshdon@google.com
---

 include/linux/sched/sysctl.h |    3 +
 kernel/sched/core.c          |   75 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/debug.c         |   13 +++++++
 kernel/sched/features.h      |    2 +
 kernel/sched/sched.h         |   10 +++++
 5 files changed, 102 insertions(+), 1 deletion(-)

--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,9 @@ extern unsigned int sysctl_numa_balancin
 #ifdef CONFIG_SCHED_DEBUG
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
 #endif
 
 /*
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -58,7 +58,21 @@ const_debug unsigned int sysctl_sched_fe
 #include "features.h"
 	0;
 #undef SCHED_FEAT
-#endif
+
+/*
+ * Print a warning if need_resched is set for the given duration (if
+ * resched_latency_warn_enabled is set).
+ *
+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
+ * per boot.
+ *
+ * Resched latency will be ignored for the first resched_boot_quiet_sec, to
+ * reduce false alarms.
+ */
+int sysctl_resched_latency_warn_ms = 100;
+int sysctl_resched_latency_warn_once = 1;
+static const long resched_boot_quiet_sec = 600;
+#endif /* CONFIG_SCHED_DEBUG */
 
 /*
  * Number of tasks to iterate in a single balance run.
@@ -4527,6 +4541,56 @@ unsigned long long task_sched_runtime(st
 	return ns;
 }
 
+#ifdef CONFIG_SCHED_DEBUG
+static u64 resched_latency_check(struct rq *rq)
+{
+	int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+	u64 need_resched_latency, now = rq_clock(rq);
+	static bool warned_once;
+
+	if (sysctl_resched_latency_warn_once && warned_once)
+		return 0;
+
+	if (!need_resched() || WARN_ON_ONCE(latency_warn_ms < 2))
+		return 0;
+
+	/* Disable this warning for the first few mins after boot */
+	if (now < resched_boot_quiet_sec * NSEC_PER_SEC)
+		return 0;
+
+	if (!rq->last_seen_need_resched_ns) {
+		rq->last_seen_need_resched_ns = now;
+		rq->ticks_without_resched = 0;
+		return 0;
+	}
+
+	rq->ticks_without_resched++;
+	need_resched_latency = now - rq->last_seen_need_resched_ns;
+	if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+		return 0;
+
+	warned_once = true;
+
+	return need_resched_latency;
+}
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+	long val;
+
+	if ((kstrtol(str, 0, &val))) {
+		pr_warn("Unable to set resched_latency_warn_ms\n");
+		return 1;
+	}
+
+	sysctl_resched_latency_warn_ms = val;
+	return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 resched_latency_check(struct rq *rq) { return 0; }
+#endif /* CONFIG_SCHED_DEBUG */
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -4538,6 +4602,7 @@ void scheduler_tick(void)
 	struct task_struct *curr = rq->curr;
 	struct rq_flags rf;
 	unsigned long thermal_pressure;
+	u64 resched_latency;
 
 	arch_scale_freq_tick();
 	sched_clock_tick();
@@ -4548,10 +4613,15 @@ void scheduler_tick(void)
 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
+	if (sched_feat(LATENCY_WARN))
+		resched_latency = resched_latency_check(rq);
 	calc_global_load_tick(rq);
 
 	rq_unlock(rq, &rf);
 
+	if (sched_feat(LATENCY_WARN) && resched_latency)
+		resched_latency_warn(cpu, resched_latency);
+
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
@@ -5046,6 +5116,9 @@ static void __sched notrace __schedule(b
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+	rq->last_seen_need_resched_ns = 0;
+#endif
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -297,6 +297,9 @@ static __init int sched_init_debug(void)
 	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
 	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
 
+	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
+	debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
+
 #ifdef CONFIG_SMP
 	debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
 	debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
@@ -1027,3 +1030,13 @@ void proc_sched_set_task(struct task_str
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 }
+
+void resched_latency_warn(int cpu, u64 latency)
+{
+	static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+	WARN(__ratelimit(&latency_check_ratelimit),
+	     "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
+	     "without schedule\n",
+	     cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -91,5 +91,7 @@ SCHED_FEAT(WA_BIAS, true)
 SCHED_FEAT(UTIL_EST, true)
 SCHED_FEAT(UTIL_EST_FASTUP, true)
 
+SCHED_FEAT(LATENCY_WARN, false)
+
 SCHED_FEAT(ALT_PERIOD, true)
 SCHED_FEAT(BASE_SLICE, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -58,6 +58,7 @@
 #include <linux/prefetch.h>
 #include <linux/profile.h>
 #include <linux/psi.h>
+#include <linux/ratelimit.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/security.h>
 #include <linux/stop_machine.h>
@@ -971,6 +972,11 @@ struct rq {
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_SCHED_DEBUG
+	u64 last_seen_need_resched_ns;
+	int ticks_without_resched;
+#endif
+
 #ifdef CONFIG_MEMBARRIER
 	int membarrier_state;
 #endif
@@ -2362,6 +2368,8 @@ extern void print_dl_stats(struct seq_fi
 extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+extern void resched_latency_warn(int cpu, u64 latency);
 #ifdef CONFIG_NUMA_BALANCING
 extern void
 show_numa_stats(struct task_struct *p, struct seq_file *m);
@@ -2369,6 +2377,8 @@ extern void
 print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
 	unsigned long tpf, unsigned long gsf, unsigned long gpf);
 #endif /* CONFIG_NUMA_BALANCING */
+#else
+static inline void resched_latency_warn(int cpu, u64 latency) {}
 #endif /* CONFIG_SCHED_DEBUG */
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
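
(A quick sanity check on the jiffy-scale accuracy mentioned in the
changelog, simple arithmetic rather than a measurement: at HZ=250 the
default 100ms threshold corresponds to roughly 25 ticks, so
ticks_without_resched in the warning output should read about that when
the warning first fires.)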



* [tip: sched/core] sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG
  2021-03-24 13:39         ` Mel Gorman
  2021-03-24 14:36           ` Peter Zijlstra
@ 2021-04-16 15:53           ` tip-bot2 for Mel Gorman
  1 sibling, 0 replies; 18+ messages in thread
From: tip-bot2 for Mel Gorman @ 2021-04-16 15:53 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Mel Gorman, Peter Zijlstra (Intel),
	Valentin Schneider, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     b7cc6ec744b307db59568c654a8904a5928aa855
Gitweb:        https://git.kernel.org/tip/b7cc6ec744b307db59568c654a8904a5928aa855
Author:        Mel Gorman <mgorman@suse.de>
AuthorDate:    Wed, 24 Mar 2021 13:39:16 
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 16 Apr 2021 17:06:33 +02:00

sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG

The ability to enable/disable NUMA balancing is not a debugging feature
and should not depend on CONFIG_SCHED_DEBUG.  For example, machines within
a HPC cluster may disable NUMA balancing temporarily for some jobs and
re-enable it for other jobs without needing to reboot.

This patch removes the dependency on CONFIG_SCHED_DEBUG for
kernel.numa_balancing sysctl. The other numa balancing related sysctls
are left as-is because if they need to be tuned then it is more likely
that NUMA balancing needs to be fixed instead.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Valentin Schneider <valentin.schneider@arm.com>
Link: https://lkml.kernel.org/r/20210324133916.GQ15768@suse.de
---
 kernel/sysctl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09..8042098 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1753,6 +1753,9 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ONE,
 	},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_NUMA_BALANCING
 	{
 		.procname	= "numa_balancing",
 		.data		= NULL, /* filled in by handler */
@@ -1763,7 +1766,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-04-16 15:04       ` Peter Zijlstra
@ 2021-04-16 21:33         ` Josh Don
  2021-04-19  7:52           ` Peter Zijlstra
  0 siblings, 1 reply; 18+ messages in thread
From: Josh Don @ 2021-04-16 21:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mel Gorman, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Fri, Apr 16, 2021 at 8:05 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Mar 30, 2021 at 03:44:12PM -0700, Josh Don wrote:
> > Peter,
> >
> > Since you've already pulled the need_resched warning patch into your
> > tree, I'm including just the diff based on that patch (in response to
> > Mel's comments) below. This should be squashed into the original
> > patch.
> >
>
> Sorry, I seem to have missed this. The patch is completely whitespace
> mangled throughout.
>
> I've attached my latest copy of the patch and held it back for now,
> please resubmit.

Yikes, sorry about that. I've squashed the fixup and sent the updated
patch (with trimmed cc list): https://lkml.org/lkml/2021/4/16/1125

Thanks,
Josh


* Re: [PATCH v2] sched: Warn on long periods of pending need_resched
  2021-04-16 21:33         ` Josh Don
@ 2021-04-19  7:52           ` Peter Zijlstra
  0 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2021-04-19  7:52 UTC (permalink / raw)
  To: Josh Don
  Cc: Mel Gorman, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Daniel Bristot de Oliveira, Luis Chamberlain, Kees Cook,
	Iurii Zaikin, linux-kernel, linux-fsdevel, David Rientjes,
	Oleg Rombakh, linux-doc, Paul Turner

On Fri, Apr 16, 2021 at 02:33:27PM -0700, Josh Don wrote:
> Yikes, sorry about that. I've squashed the fixup and sent the updated
> patch (with trimmed cc list): https://lkml.org/lkml/2021/4/16/1125

For future reference, please use: https://lkml.kernel.org/r/MSGID

Then I can search for MSGID in my local mailer (mutt: ~h) and instantly
find the actual message (now I'll search for all email from you and it
should obviously be easily found as well).
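
(For example, the Link: tag on the patch above resolves as
https://lkml.kernel.org/r/20210323035706.572953-1-joshdon@google.com.)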

