linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Phil Auld <pauld@redhat.com>
To: linux-kernel@vger.kernel.org
Cc: Juri Lelli <juri.lelli@redhat.com>,
	Ingo Molnar <mingo@redhat.com>,
	Daniel Bristot de Oliveira <bristot@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Valentin Schneider <vschneid@redhat.com>,
	Ben Segall <bsegall@google.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Mel Gorman <mgorman@suse.de>, Phil Auld <pauld@redhat.com>
Subject: [PATCH v2] sched/nohz: Add HRTICK_BW for using cfs bandwidth with nohz_full
Date: Thu,  8 Jun 2023 08:52:28 -0400	[thread overview]
Message-ID: <20230608125228.56097-1-pauld@redhat.com> (raw)

CFS bandwidth limits and NOHZ full don't play well together.  Tasks
can easily run well past their quotas before a remote tick does
accounting.  This leads to long, multi-period stalls before such
tasks can run again.  Use the hrtick mechanism to set a sched
tick to fire at remaining_runtime in the future if we are on
a nohz full cpu, if the task has quota and if we are likely to
disable the tick (nr_running == 1).  This allows for bandwidth
accounting before tasks go too far over quota.

A number of container workloads use a dynamic number of real
nohz tasks but also have other work that is limited which ends
up running on the "spare" nohz cpus.  This is an artifact of
having to specify nohz_full cpus at boot. Adding this hrtick
resolves the issue of long stalls on these tasks. Currently
the scheduler, when faced with these conflicting requirements
choosed to favor nohz_full even though that is already best
effort. Here we make it favor respecting the bandwidth
limitations which are not supposed to be best effort.

Add the sched_feat HRTICK_BW off by default to allow users to
enable this only when needed.

Signed-off-by: Phil Auld <pauld@redhat.com>
Suggested-by: Juri Lelli <jlelli@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Ben Segall <bsegall@google.com>
---

v2: Clean up building issues with various related CONFIG changes. Add a
check to start the hrtick in __account_cfs_rq_runtime() for when the
task gets more runtime.

 kernel/sched/core.c     |  2 +-
 kernel/sched/fair.c     | 28 ++++++++++++++++++++++++++--
 kernel/sched/features.h |  1 +
 kernel/sched/sched.h    | 12 ++++++++++++
 4 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a68d1276bab0..76425c377245 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6562,7 +6562,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 
 	schedule_debug(prev, !!sched_mode);
 
-	if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
+	if (sched_feat(HRTICK) || sched_feat(HRTICK_DL) || sched_feat(HRTICK_BW))
 		hrtick_clear(rq);
 
 	local_irq_disable();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 373ff5f55884..95e4b70ebb0a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5190,6 +5190,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 		check_preempt_tick(cfs_rq, curr);
 }
 
+#if !defined(CONFIG_SCHED_HRTICK) || !defined(CONFIG_NO_HZ_FULL) || !defined(CONFIG_CFS_BANDWIDTH)
+static void start_hrtick_cfs_bw(struct rq *rq, struct cfs_rq *cfs_rq)
+{
+}
+#endif
 
 /**************************************************
  * CFS bandwidth control machinery
@@ -5309,6 +5314,18 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	return ret;
 }
 
+#if defined(CONFIG_SCHED_HRTICK) && defined(CONFIG_NO_HZ_FULL)
+static void start_hrtick_cfs_bw(struct rq *rq, struct cfs_rq *cfs_rq)
+{
+	if (!tick_nohz_full_cpu(rq->cpu) || !cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
+		return;
+
+	/* runtime_remaining should never be negative here but just in case */
+	if (rq->nr_running == 1 && cfs_rq->runtime_remaining > 0)
+		hrtick_start(rq, cfs_rq->runtime_remaining);
+}
+#endif
+
 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
@@ -5323,8 +5340,12 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	 * if we're unable to extend our runtime we resched so that the active
 	 * hierarchy can be throttled
 	 */
-	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-		resched_curr(rq_of(cfs_rq));
+	if (likely(cfs_rq->curr)) {
+		if (!assign_cfs_rq_runtime(cfs_rq))
+			resched_curr(rq_of(cfs_rq));
+		else if (hrtick_enabled_bw(rq_of(cfs_rq)))
+			start_hrtick_cfs_bw(rq_of(cfs_rq), cfs_rq);
+	}
 }
 
 static __always_inline
@@ -8096,6 +8117,9 @@ done: __maybe_unused;
 	if (hrtick_enabled_fair(rq))
 		hrtick_start_fair(rq, p);
 
+	if (hrtick_enabled_bw(rq))
+		start_hrtick_cfs_bw(rq, task_cfs_rq(p));
+
 	update_misfit_status(p, rq);
 
 	return p;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..c9d4fa0c1430 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
 
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(HRTICK_DL, false)
+SCHED_FEAT(HRTICK_BW, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec7b3e0a2b20..db3a3b4b5746 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2507,6 +2507,13 @@ static inline int hrtick_enabled_dl(struct rq *rq)
 	return hrtick_enabled(rq);
 }
 
+static inline int hrtick_enabled_bw(struct rq *rq)
+{
+	if (!sched_feat(HRTICK_BW))
+		return 0;
+	return hrtick_enabled(rq);
+}
+
 void hrtick_start(struct rq *rq, u64 delay);
 
 #else
@@ -2521,6 +2528,11 @@ static inline int hrtick_enabled_dl(struct rq *rq)
 	return 0;
 }
 
+static inline int hrtick_enabled_bw(struct rq *rq)
+{
+	return 0;
+}
+
 static inline int hrtick_enabled(struct rq *rq)
 {
 	return 0;
-- 
2.31.1


             reply	other threads:[~2023-06-08 12:53 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-08 12:52 Phil Auld [this message]
2023-06-12 20:43 ` [PATCH v2] sched/nohz: Add HRTICK_BW for using cfs bandwidth with nohz_full Benjamin Segall
2023-06-12 21:38   ` Phil Auld

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230608125228.56097-1-pauld@redhat.com \
    --to=pauld@redhat.com \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).