[PATCH 2/2] arm64: Add complex scheduler level for arm64

From: Qing Wang <wangqing@vivo.com>
To: Catalin Marinas <catalin.marinas@arm.com>,
	Will Deacon <will@kernel.org>,
	Sudeep Holla <sudeep.holla@arm.com>,
	Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	"Rafael J. Wysocki" <rafael@kernel.org>,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org
Cc: Wang Qing <wangqing@vivo.com>
Subject: [PATCH 2/2] arm64: Add complex scheduler level for arm64
Date: Thu, 21 Apr 2022 07:55:58 -0700	[thread overview]
Message-ID: <1650552960-60165-3-git-send-email-wangqing@vivo.com> (raw)
In-Reply-To: <1650552960-60165-1-git-send-email-wangqing@vivo.com>

From: Wang Qing <wangqing@vivo.com>

The DSU-110 DynamIQ™ cluster supports blocks that are called complexes
which contain up to two cores of the same type and some shared logic.
Sharing some logic between the cores can make a complex area efficient.

This patch adds a complex level to the scheduler topology and automatically
enables load balancing among complexes. This directly benefits workloads
that are sensitive to shared resources such as memory bandwidth and caches.

Testing has been done on a Qualcomm SM8450 with the STREAM benchmark:
8-thread STREAM (2 little cores * 2 (complex) + 3 middle cores + 1 big core)
                stream                 stream
                w/o patch              w/ patch
MB/sec copy     37579.2 (   0.00%)    39127.3 (   4.12%)
MB/sec scale    38261.1 (   0.00%)    39195.4 (   2.44%)
MB/sec add      39497.0 (   0.00%)    41101.5 (   4.06%)
MB/sec triad    39885.6 (   0.00%)    40772.7 (   2.22%)

Signed-off-by: Wang Qing <wangqing@vivo.com>
---
 arch/arm64/Kconfig      | 13 +++++++++++
 arch/arm64/kernel/smp.c | 48 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index edbe035cb0e3..4063de8c6153 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1207,6 +1207,19 @@ config SCHED_CLUSTER
 	  by sharing mid-level caches, last-level cache tags or internal
 	  busses.
 
+config SCHED_COMPLEX
+	bool "Complex scheduler support"
+	help
+	  DSU supports blocks that are called complexes which contain up to
+	  two cores of the same type and some shared logic. Sharing some logic
+	  between the cores can make a complex area efficient.
+
+	  A complex can also be considered a shared cache group smaller
+	  than a cluster.
+
+	  Complex scheduler support improves the CPU scheduler's decision
+	  making when dealing with machines that have complexes of CPUs.
+
 config SCHED_SMT
 	bool "SMT scheduler support"
 	help
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 3b46041f2b97..526765112146 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -14,6 +14,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/topology.h>
 #include <linux/interrupt.h>
 #include <linux/cache.h>
 #include <linux/profile.h>
@@ -57,6 +58,10 @@
 DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
 EXPORT_PER_CPU_SYMBOL(cpu_number);
 
+#ifdef CONFIG_SCHED_COMPLEX
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_t, cpu_complex_map);
+#endif /* CONFIG_SCHED_COMPLEX */
+
 /*
  * as from 2.5, kernels no longer have an init_tasks structure
  * so we need some other way of telling a new secondary core
@@ -715,6 +720,47 @@ void __init smp_init_cpus(void)
 	}
 }
 
+#ifdef CONFIG_SCHED_COMPLEX
+/* Flags for the CPL topology level: SD_SHARE_PKG_RESOURCES only. */
+static int arm64_complex_flags(void)
+{
+	return SD_SHARE_PKG_RESOURCES;
+}
+
+/* Return @cpu's cpu_complex_map after passing it to find_max_sub_sc(). */
+const struct cpumask *arm64_complex_mask(int cpu)
+{
+	const struct cpumask *core_mask = cpu_cpu_mask(cpu);
+
+	/* Search inside the last level set below: CLUSTER, else MC, else DIE. */
+#ifdef CONFIG_SCHED_MC
+	core_mask = cpu_coregroup_mask(cpu);
+#endif
+#ifdef CONFIG_SCHED_CLUSTER
+	core_mask = cpu_clustergroup_mask(cpu);
+#endif
+	find_max_sub_sc(core_mask, cpu, &per_cpu(cpu_complex_map, cpu));
+	return &per_cpu(cpu_complex_map, cpu);
+}
+#endif /* CONFIG_SCHED_COMPLEX */
+
+static struct sched_domain_topology_level arm64_topology[] = { /* set in smp_prepare_cpus() */
+#ifdef CONFIG_SCHED_SMT
+	{ .mask = cpu_smt_mask, .sd_flags = cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif /* CONFIG_SCHED_SMT */
+#ifdef CONFIG_SCHED_COMPLEX
+	{ .mask = arm64_complex_mask, .sd_flags = arm64_complex_flags, SD_INIT_NAME(CPL) },
+#endif /* CONFIG_SCHED_COMPLEX */
+#ifdef CONFIG_SCHED_CLUSTER
+	{ .mask = cpu_clustergroup_mask, .sd_flags = cpu_cluster_flags, SD_INIT_NAME(CLS) },
+#endif /* CONFIG_SCHED_CLUSTER */
+#ifdef CONFIG_SCHED_MC
+	{ .mask = cpu_coregroup_mask, .sd_flags = cpu_core_flags, SD_INIT_NAME(MC) },
+#endif /* CONFIG_SCHED_MC */
+	{ .mask = cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ .mask = NULL },
+};
+
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	const struct cpu_operations *ops;
@@ -723,9 +769,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	unsigned int this_cpu;
 
 	init_cpu_topology();
-
 	this_cpu = smp_processor_id();
 	store_cpu_topology(this_cpu);
+	set_sched_topology(arm64_topology);
 	numa_store_cpu_info(this_cpu);
 	numa_add_cpu(this_cpu);
 
-- 
2.27.0.windows.1