linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Abel Wu <wuyun.abel@bytedance.com>
To: Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: linux-kernel@vger.kernel.org
Subject: [RFC PATCH 1/5] sched/fair: record overloaded cpus
Date: Thu, 17 Feb 2022 23:43:57 +0800	[thread overview]
Message-ID: <20220217154403.6497-2-wuyun.abel@bytedance.com> (raw)
In-Reply-To: <20220217154403.6497-1-wuyun.abel@bytedance.com>

A CFS runqueue is considered overloaded when there is
more than one pullable non-idle task on it (since sched-
idle cpus are treated as idle cpus). Idle tasks are
counted towards rq->cfs.idle_h_nr_running: tasks either
assigned the SCHED_IDLE policy or placed under idle cgroups.

The overloaded cfs rqs can cause performance issues to
both task types:

  - for latency critical tasks like SCHED_NORMAL,
    time spent waiting in the rq will increase,
    resulting in higher pct99 latency, and

  - batch tasks may not be able to make full use
    of cpu capacity if sched-idle rqs exist, thus
    presenting poorer throughput.

The mask of overloaded cpus is updated in periodic tick
and the idle path at the LLC domain basis. This cpumask
will also be used in SIS as a filter, improving idle cpu
searching.

Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
---
 include/linux/sched/topology.h | 10 ++++++++++
 kernel/sched/core.c            |  1 +
 kernel/sched/fair.c            | 43 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h           |  6 ++++++
 kernel/sched/topology.c        |  4 +++-
 5 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 56cffe42abbc..03c9c81dc886 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -81,6 +81,16 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+
+	/*
+	 * The above variables are used in idle path and
+	 * select_task_rq, and the following two are
+	 * mainly updated in tick. They are all hot but
+	 * for different usage, so start a new cacheline
+	 * to avoid false sharing.
+	 */
+	atomic_t	nr_overloaded	____cacheline_aligned;
+	unsigned long	overloaded[];	/* Must be last */
 };
 
 struct sched_domain {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1d863d7f6ad7..a6da2998ec49 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9423,6 +9423,7 @@ void __init sched_init(void)
 		rq->wake_stamp = jiffies;
 		rq->wake_avg_idle = rq->avg_idle;
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+		rq->overloaded = 0;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c4bfffe8c2c..0a0438c3319b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6968,6 +6968,46 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 	return newidle_balance(rq, rf) != 0;
 }
+
+static inline int cfs_rq_overloaded(struct rq *rq)
+{
+	return rq->cfs.h_nr_running - rq->cfs.idle_h_nr_running > 1;
+}
+
+/* Must be called with rq locked */
+static void update_overload_status(struct rq *rq)
+{
+	struct sched_domain_shared *sds;
+	int overloaded = cfs_rq_overloaded(rq);
+	int cpu = cpu_of(rq);
+
+	lockdep_assert_rq_held(rq);
+
+	if (rq->overloaded == overloaded)
+		return;
+
+	rcu_read_lock();
+	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	if (unlikely(!sds))
+		goto unlock;
+
+	if (overloaded) {
+		cpumask_set_cpu(cpu, sdo_mask(sds));
+		atomic_inc(&sds->nr_overloaded);
+	} else {
+		cpumask_clear_cpu(cpu, sdo_mask(sds));
+		atomic_dec(&sds->nr_overloaded);
+	}
+
+	rq->overloaded = overloaded;
+unlock:
+	rcu_read_unlock();
+}
+
+#else
+
+static inline void update_overload_status(struct rq *rq) { }
+
 #endif /* CONFIG_SMP */
 
 static unsigned long wakeup_gran(struct sched_entity *se)
@@ -7315,6 +7355,8 @@ done: __maybe_unused;
 	if (new_tasks > 0)
 		goto again;
 
+	update_overload_status(rq);
+
 	/*
 	 * rq is about to be idle, check if we need to update the
 	 * lost_idle_time of clock_pelt
@@ -11131,6 +11173,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 	if (static_branch_unlikely(&sched_numa_balancing))
 		task_tick_numa(rq, curr);
 
+	update_overload_status(rq);
 	update_misfit_status(curr, rq);
 	update_overutilized_status(task_rq(curr));
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9b33ba9c3c42..c81a87082b8b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1012,6 +1012,7 @@ struct rq {
 
 	unsigned char		nohz_idle_balance;
 	unsigned char		idle_balance;
+	unsigned char		overloaded;
 
 	unsigned long		misfit_task_load;
 
@@ -1762,6 +1763,11 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 	return sd;
 }
 
+static inline struct cpumask *sdo_mask(struct sched_domain_shared *sds)
+{
+	return to_cpumask(sds->overloaded);
+}
+
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e6cd55951304..641f11415819 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1623,6 +1623,8 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
 		atomic_inc(&sd->shared->ref);
 		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+		atomic_set(&sd->shared->nr_overloaded, 0);
+		cpumask_clear(sdo_mask(sd->shared));
 	}
 
 	sd->private = sdd;
@@ -2050,7 +2052,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
 			*per_cpu_ptr(sdd->sd, j) = sd;
 
-			sds = kzalloc_node(sizeof(struct sched_domain_shared),
+			sds = kzalloc_node(sizeof(struct sched_domain_shared) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sds)
 				return -ENOMEM;
-- 
2.11.0


  reply	other threads:[~2022-02-17 15:44 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-02-17 15:43 [RFC PATCH 0/5] introduce sched-idle balancing Abel Wu
2022-02-17 15:43 ` Abel Wu [this message]
2022-02-24  7:10   ` [RFC PATCH 1/5] sched/fair: record overloaded cpus Gautham R. Shenoy
2022-02-24 14:36     ` Abel Wu
2022-02-27  8:08     ` Aubrey Li
2022-02-17 15:43 ` [RFC PATCH 2/5] sched/fair: introduce sched-idle balance Abel Wu
2022-02-17 15:43 ` [RFC PATCH 3/5] sched/fair: add stats for sched-idle balancing Abel Wu
2022-02-17 15:44 ` [RFC PATCH 4/5] sched/fair: filter out overloaded cpus in sis Abel Wu
2022-02-17 15:44 ` [RFC PATCH 5/5] sched/fair: favor cpu capacity for idle tasks Abel Wu
2022-02-24  3:19 ` [RFC PATCH 0/5] introduce sched-idle balancing Abel Wu
2022-02-24 15:20 ` Peter Zijlstra
2022-02-24 15:29   ` Vincent Guittot
2022-02-25  6:51     ` Abel Wu
2022-02-25  6:46   ` Abel Wu
2022-02-25  8:29     ` Vincent Guittot
2022-02-25 10:46       ` Abel Wu
2022-02-25 13:15         ` Vincent Guittot
2022-02-24 16:47 ` Mel Gorman
2022-02-25  8:15   ` Abel Wu
2022-02-25 10:16     ` Mel Gorman
2022-02-25 13:20       ` Abel Wu
2022-03-02  0:41         ` Josh Don

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220217154403.6497-2-wuyun.abel@bytedance.com \
    --to=wuyun.abel@bytedance.com \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=vincent.guittot@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).