linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Josh Don <joshdon@google.com>
To: Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Daniel Bristot de Oliveira <bristot@redhat.com>,
	Luis Chamberlain <mcgrof@kernel.org>,
	Kees Cook <keescook@chromium.org>,
	Iurii Zaikin <yzaikin@google.com>,
	linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	David Rientjes <rientjes@google.com>,
	Oleg Rombakh <olegrom@google.com>, Paul Turner <pjt@google.com>,
	Josh Don <joshdon@google.com>
Subject: [PATCH] sched: Warn on long periods of pending need_resched
Date: Tue, 16 Mar 2021 21:59:49 -0700	[thread overview]
Message-ID: <20210317045949.1584952-1-joshdon@google.com> (raw)

From: Paul Turner <pjt@google.com>

CPU scheduler marks need_resched flag to signal a schedule() on a
particular CPU. But, schedule() may not happen immediately in cases
where the current task is executing in the kernel mode (no
preemption state) for extended periods of time.

This patch adds a warn_on if need_resched is pending for more than the
time specified in sysctl resched_latency_warn_ms. Monitoring is done via
the tick and the accuracy is hence limited to jiffy scale. This also
means that we won't trigger the warning if the tick is disabled.

If resched_latency_warn_ms is set to the default value, only one warning
will be produced per boot.

This warning only exists under CONFIG_SCHED_DEBUG. If it goes off, it is
likely that there is a missing cond_resched() somewhere.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Josh Don <joshdon@google.com>
---
We've caught various bugs using this patch. In fact, a followup patch to
this one will be a patch introducing a missing cond_resched(). However,
this may be too noisy to have as default enabled (especially given that
it requires some tuning); I'm open to having this default disabled
instead.
 kernel/sched/core.c  | 91 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |  6 +++
 kernel/sysctl.c      |  8 ++++
 3 files changed, 105 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 98191218d891..ac037fc87d5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -58,7 +58,25 @@ const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
 #undef SCHED_FEAT
+
+/*
+ * Print a warning if need_resched is set for at least this long. At the
+ * default value, only a single warning will be printed per boot.
+ *
+ * Values less than 2 disable the feature.
+ *
+ * A kernel compiled with CONFIG_KASAN tends to run more slowly on average.
+ * Increase the need_resched timeout to reduce false alarms.
+ */
+#ifdef CONFIG_KASAN
+#define RESCHED_DEFAULT_WARN_LATENCY_MS 101
+#define RESCHED_BOOT_QUIET_SEC 600
+#else
+#define RESCHED_DEFAULT_WARN_LATENCY_MS 51
+#define RESCHED_BOOT_QUIET_SEC 300
 #endif
+int sysctl_resched_latency_warn_ms = RESCHED_DEFAULT_WARN_LATENCY_MS;
+#endif /* CONFIG_SCHED_DEBUG */
 
 /*
  * Number of tasks to iterate in a single balance run.
@@ -4520,6 +4538,71 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
+#ifdef CONFIG_SCHED_DEBUG
+
+static inline u64 resched_latency_check(struct rq *rq)
+{
+	int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+	bool warn_only_once = (latency_warn_ms == RESCHED_DEFAULT_WARN_LATENCY_MS);
+	u64 need_resched_latency, now = rq_clock(rq);
+	static bool warned_once;
+
+	if (warn_only_once && warned_once)
+		return 0;
+
+	if (!need_resched() || latency_warn_ms < 2)
+		return 0;
+
+	/* Disable this warning for the first few mins after boot */
+	if (now < RESCHED_BOOT_QUIET_SEC * NSEC_PER_SEC)
+		return 0;
+
+	if (!rq->last_seen_need_resched_ns) {
+		rq->last_seen_need_resched_ns = now;
+		rq->ticks_without_resched = 0;
+		return 0;
+	}
+
+	rq->ticks_without_resched++;
+	need_resched_latency = now - rq->last_seen_need_resched_ns;
+	if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+		return 0;
+
+	warned_once = true;
+
+	return need_resched_latency;
+}
+
+static inline void resched_latency_warn(int cpu, u64 latency)
+{
+	static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+	WARN(__ratelimit(&latency_check_ratelimit),
+	     "CPU %d: need_resched set for > %llu ns (%d ticks) "
+	     "without schedule\n",
+	     cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}
+
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+	long val;
+
+	if ((kstrtol(str, 0, &val))) {
+		pr_warn("Unable to set resched_latency_warn_ms\n");
+		return 1;
+	}
+
+	sysctl_resched_latency_warn_ms = val;
+	return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 resched_latency_check(struct rq *rq) { return 0; }
+static inline void resched_latency_warn(int cpu, u64 latency) {}
+#endif /* CONFIG_SCHED_DEBUG */
+
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -4531,6 +4614,7 @@ void scheduler_tick(void)
 	struct task_struct *curr = rq->curr;
 	struct rq_flags rf;
 	unsigned long thermal_pressure;
+	u64 resched_latency;
 
 	arch_scale_freq_tick();
 	sched_clock_tick();
@@ -4541,11 +4625,15 @@ void scheduler_tick(void)
 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
+	resched_latency = resched_latency_check(rq);
 	calc_global_load_tick(rq);
 	psi_task_tick(rq);
 
 	rq_unlock(rq, &rf);
 
+	if (resched_latency)
+		resched_latency_warn(cpu, resched_latency);
+
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
@@ -5040,6 +5128,9 @@ static void __sched notrace __schedule(bool preempt)
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+	rq->last_seen_need_resched_ns = 0;
+#endif
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522b1e30..912a8886bc7f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -57,6 +57,7 @@
 #include <linux/prefetch.h>
 #include <linux/profile.h>
 #include <linux/psi.h>
+#include <linux/ratelimit.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/security.h>
 #include <linux/stop_machine.h>
@@ -963,6 +964,11 @@ struct rq {
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_SCHED_DEBUG
+	u64 last_seen_need_resched_ns;
+	int ticks_without_resched;
+#endif
+
 #ifdef CONFIG_MEMBARRIER
 	int membarrier_state;
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09b5dc1..526094fc364e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -193,6 +193,7 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
 #endif /* CONFIG_SMP */
+extern int sysctl_resched_latency_warn_ms;
 #endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
@@ -1763,6 +1764,13 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
+	{
+		.procname	= "resched_latency_warn_ms",
+		.data		= &sysctl_resched_latency_warn_ms,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
-- 
2.31.0.rc2.261.g7f71774620-goog


             reply	other threads:[~2021-03-17  5:01 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-17  4:59 Josh Don [this message]
2021-03-17  8:25 ` [PATCH] sched: Warn on long periods of pending need_resched Ingo Molnar
2021-03-18  0:06   ` Josh Don
2021-03-19  9:02     ` Mel Gorman
2021-03-23  3:35       ` Josh Don
2021-03-17  8:31 ` Ingo Molnar
2021-03-18  0:17   ` Josh Don
2021-03-18 14:31 ` [sched] 663017c554: WARNING:at_kernel/sched/core.c:#scheduler_tick kernel test robot
2021-03-18 20:41   ` Josh Don

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210317045949.1584952-1-joshdon@google.com \
    --to=joshdon@google.com \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=juri.lelli@redhat.com \
    --cc=keescook@chromium.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mcgrof@kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=olegrom@google.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=rientjes@google.com \
    --cc=rostedt@goodmis.org \
    --cc=vincent.guittot@linaro.org \
    --cc=yzaikin@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).