All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-10 12:58 ` Qing Wang
  0 siblings, 0 replies; 36+ messages in thread
From: Qing Wang @ 2022-03-10 12:58 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira, linux-arm-kernel,
	linux-kernel
  Cc: Wang Qing

From: Wang Qing <wangqing@vivo.com>

On some architectures (e.g. ARM64), caches are implemented as below:
cluster:              ****** cluster 0 *****      ****** cluster 1 *****
core:                 0      1      2      3      4      5      6      7
cache(Leveln):        **cache0**   **cache1**	 **cache2**   **cache3**
sd_llc_id(current):   0      0      0      0      4      4      4      4
sd_llc_id(should be): 0      0      2      2      4      4      6      6

Caches and CPUs have different topologies; this causes cpus_share_cache()
to return the wrong value, which will affect CPU load balancing.

Cache topology should be separated from CPU topology; it can preferentially
be obtained from "next-level-cache" in the DTS.

Signed-off-by: Wang Qing <wangqing@vivo.com>
---
 arch/arm64/kernel/smp.c       |  1 +
 drivers/base/arch_topology.c  | 23 +++++++++++++++++++++++
 include/linux/arch_topology.h |  3 +++
 kernel/sched/topology.c       | 33 +++++++++++++++++++++++++++++++--
 4 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 27df5c1..94cf649
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	unsigned int this_cpu;
 
 	init_cpu_topology();
+	init_cpu_cache_topology();
 
 	this_cpu = smp_processor_id();
 	store_cpu_topology(this_cpu);
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 9761541..613213f
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
  */
 struct cpu_topology cpu_topology[NR_CPUS];
 EXPORT_SYMBOL_GPL(cpu_topology);
+struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
 
 const struct cpumask *cpu_coregroup_mask(int cpu)
 {
@@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
 	else if (of_have_populated_dt() && parse_dt_topology())
 		reset_cpu_topology();
 }
+
+void __init init_cpu_cache_topology(void)
+{
+	struct device_node *node_cpu, *node_cache;
+	int cpu, level;
+
+	for_each_possible_cpu(cpu) {
+		node_cpu = of_get_cpu_node(cpu, NULL);
+		if (!node_cpu)
+			continue;
+
+		level = 0;
+		node_cache = node_cpu;
+		while (level < MAX_CPU_CACHE_LEVEL) {
+			node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
+			if (!node_cache)
+				break;
+			cache_topology[cpu][level++] = node_cache;
+		}
+		of_node_put(node_cpu);
+	}
+}
 #endif
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index cce6136b..d37f47d
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -72,6 +72,8 @@ struct cpu_topology {
 };
 
 #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
+#define MAX_CPU_CACHE_LEVEL 7
+extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
 extern struct cpu_topology cpu_topology[NR_CPUS];
 
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].package_id)
@@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
 #define topology_cluster_cpumask(cpu)	(&cpu_topology[cpu].cluster_sibling)
 #define topology_llc_cpumask(cpu)	(&cpu_topology[cpu].llc_sibling)
 void init_cpu_topology(void);
+void init_cpu_cache_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 const struct cpumask *cpu_clustergroup_mask(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a70..10850d6
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
+static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
+{
+	int cache_level, cpu_id;
+	int first, last;
+	int id = cpumask_first(sched_domain_span(sd));
+	int size = cpumask_weight(sched_domain_span(sd));
+
+	*first_cpu = id;
+	*cpu_num = size;
+
+	for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
+		if (!cache_topology[cpu][cache_level])
+			break;
+
+		first = -1;
+		last = id;
+		for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
+			if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
+				if (cpu_id >= id && cpu_id < id + size) {
+					first = (first == -1)?cpu_id:first;
+					last = cpu_id;
+				} else
+					return;
+			}
+		}
+		*first_cpu = first;
+		*cpu_num = last - first + 1;
+	}
+}
+
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain_shared *sds = NULL;
@@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 	if (sd) {
-		id = cpumask_first(sched_domain_span(sd));
-		size = cpumask_weight(sched_domain_span(sd));
+		set_sd_llc(cpu, sd, &id, &size);
 		sds = sd->shared;
 	}
 
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-10 12:58 ` Qing Wang
  0 siblings, 0 replies; 36+ messages in thread
From: Qing Wang @ 2022-03-10 12:58 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira, linux-arm-kernel,
	linux-kernel
  Cc: Wang Qing

From: Wang Qing <wangqing@vivo.com>

On some architectures (e.g. ARM64), caches are implemented as below:
cluster:              ****** cluster 0 *****      ****** cluster 1 *****
core:                 0      1      2      3      4      5      6      7
cache(Leveln):        **cache0**   **cache1**	 **cache2**   **cache3**
sd_llc_id(current):   0      0      0      0      4      4      4      4
sd_llc_id(should be): 0      0      2      2      4      4      6      6

Caches and CPUs have different topologies; this causes cpus_share_cache()
to return the wrong value, which will affect CPU load balancing.

Cache topology should be separated from CPU topology; it can preferentially
be obtained from "next-level-cache" in the DTS.

Signed-off-by: Wang Qing <wangqing@vivo.com>
---
 arch/arm64/kernel/smp.c       |  1 +
 drivers/base/arch_topology.c  | 23 +++++++++++++++++++++++
 include/linux/arch_topology.h |  3 +++
 kernel/sched/topology.c       | 33 +++++++++++++++++++++++++++++++--
 4 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 27df5c1..94cf649
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	unsigned int this_cpu;
 
 	init_cpu_topology();
+	init_cpu_cache_topology();
 
 	this_cpu = smp_processor_id();
 	store_cpu_topology(this_cpu);
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 9761541..613213f
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
  */
 struct cpu_topology cpu_topology[NR_CPUS];
 EXPORT_SYMBOL_GPL(cpu_topology);
+struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
 
 const struct cpumask *cpu_coregroup_mask(int cpu)
 {
@@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
 	else if (of_have_populated_dt() && parse_dt_topology())
 		reset_cpu_topology();
 }
+
+void __init init_cpu_cache_topology(void)
+{
+	struct device_node *node_cpu, *node_cache;
+	int cpu, level;
+
+	for_each_possible_cpu(cpu) {
+		node_cpu = of_get_cpu_node(cpu, NULL);
+		if (!node_cpu)
+			continue;
+
+		level = 0;
+		node_cache = node_cpu;
+		while (level < MAX_CPU_CACHE_LEVEL) {
+			node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
+			if (!node_cache)
+				break;
+			cache_topology[cpu][level++] = node_cache;
+		}
+		of_node_put(node_cpu);
+	}
+}
 #endif
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index cce6136b..d37f47d
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -72,6 +72,8 @@ struct cpu_topology {
 };
 
 #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
+#define MAX_CPU_CACHE_LEVEL 7
+extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
 extern struct cpu_topology cpu_topology[NR_CPUS];
 
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].package_id)
@@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
 #define topology_cluster_cpumask(cpu)	(&cpu_topology[cpu].cluster_sibling)
 #define topology_llc_cpumask(cpu)	(&cpu_topology[cpu].llc_sibling)
 void init_cpu_topology(void);
+void init_cpu_cache_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 const struct cpumask *cpu_clustergroup_mask(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a70..10850d6
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
+static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
+{
+	int cache_level, cpu_id;
+	int first, last;
+	int id = cpumask_first(sched_domain_span(sd));
+	int size = cpumask_weight(sched_domain_span(sd));
+
+	*first_cpu = id;
+	*cpu_num = size;
+
+	for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
+		if (!cache_topology[cpu][cache_level])
+			break;
+
+		first = -1;
+		last = id;
+		for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
+			if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
+				if (cpu_id >= id && cpu_id < id + size) {
+					first = (first == -1)?cpu_id:first;
+					last = cpu_id;
+				} else
+					return;
+			}
+		}
+		*first_cpu = first;
+		*cpu_num = last - first + 1;
+	}
+}
+
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain_shared *sds = NULL;
@@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 	if (sd) {
-		id = cpumask_first(sched_domain_span(sd));
-		size = cpumask_weight(sched_domain_span(sd));
+		set_sd_llc(cpu, sd, &id, &size);
 		sds = sd->shared;
 	}
 
-- 
2.7.4


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-10 12:58 ` Qing Wang
@ 2022-03-10 14:18   ` Vincent Guittot
  -1 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-03-10 14:18 UTC (permalink / raw)
  To: Qing Wang
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>
> From: Wang Qing <wangqing@vivo.com>
>
> Some architectures(e.g. ARM64), caches are implemented below:
> cluster:              ****** cluster 0 *****      ****** cluster 1 *****
> core:                 0      1      2      3      4      5      6      7
> cache(Leveln):        **cache0**   **cache1**    **cache2**   **cache3**
> sd_llc_id(current):   0      0      0      0      4      4      4      4
> sd_llc_id(should be): 0      0      2      2      4      4      6      6
>
> Caches and cpus have different topology, this causes cpus_share_cache()
> return the wrong value, which will affect the CPU load balance.

What does your current scheduler topology  look like?

For CPU 0 to 3, do you have the below ?
DIE [0     -     3] [4-7]
MC  [0] [1] [2] [3]

But you would like something like below for cpu 0-1 instead ?
DIE [0     -     3] [4-7]
CLS [0 - 1] [2 - 3]
MC  [0] [1]

with SD_SHARE_PKG_RESOURCES only set to MC level ?

>
> Cache topology should be separated with CPU topology, it can be obtained
> from "next-level-cache" in DTS preferentially.
>
> Signed-off-by: Wang Qing <wangqing@vivo.com>
> ---
>  arch/arm64/kernel/smp.c       |  1 +
>  drivers/base/arch_topology.c  | 23 +++++++++++++++++++++++
>  include/linux/arch_topology.h |  3 +++
>  kernel/sched/topology.c       | 33 +++++++++++++++++++++++++++++++--
>  4 files changed, 58 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 27df5c1..94cf649
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>         unsigned int this_cpu;
>
>         init_cpu_topology();
> +       init_cpu_cache_topology();
>
>         this_cpu = smp_processor_id();
>         store_cpu_topology(this_cpu);
> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> index 9761541..613213f
> --- a/drivers/base/arch_topology.c
> +++ b/drivers/base/arch_topology.c
> @@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
>   */
>  struct cpu_topology cpu_topology[NR_CPUS];
>  EXPORT_SYMBOL_GPL(cpu_topology);
> +struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];

AFAICT, arch_topology.c is only used by arm/arm64 and riscv so this is
not initialized for other archs

>
>  const struct cpumask *cpu_coregroup_mask(int cpu)
>  {
> @@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
>         else if (of_have_populated_dt() && parse_dt_topology())
>                 reset_cpu_topology();
>  }
> +
> +void __init init_cpu_cache_topology(void)
> +{
> +       struct device_node *node_cpu, *node_cache;
> +       int cpu, level;
> +
> +       for_each_possible_cpu(cpu) {
> +               node_cpu = of_get_cpu_node(cpu, NULL);
> +               if (!node_cpu)
> +                       continue;
> +
> +               level = 0;
> +               node_cache = node_cpu;
> +               while (level < MAX_CPU_CACHE_LEVEL) {
> +                       node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
> +                       if (!node_cache)
> +                               break;
> +                       cache_topology[cpu][level++] = node_cache;
> +               }
> +               of_node_put(node_cpu);
> +       }
> +}
>  #endif
> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> index cce6136b..d37f47d
> --- a/include/linux/arch_topology.h
> +++ b/include/linux/arch_topology.h
> @@ -72,6 +72,8 @@ struct cpu_topology {
>  };
>
>  #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
> +#define MAX_CPU_CACHE_LEVEL 7
> +extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
>  extern struct cpu_topology cpu_topology[NR_CPUS];
>
>  #define topology_physical_package_id(cpu)      (cpu_topology[cpu].package_id)
> @@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
>  #define topology_cluster_cpumask(cpu)  (&cpu_topology[cpu].cluster_sibling)
>  #define topology_llc_cpumask(cpu)      (&cpu_topology[cpu].llc_sibling)
>  void init_cpu_topology(void);
> +void init_cpu_cache_topology(void);
>  void store_cpu_topology(unsigned int cpuid);
>  const struct cpumask *cpu_coregroup_mask(int cpu);
>  const struct cpumask *cpu_clustergroup_mask(int cpu);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index d201a70..10850d6
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
>  DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>
> +static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
> +{
> +       int cache_level, cpu_id;
> +       int first, last;
> +       int id = cpumask_first(sched_domain_span(sd));
> +       int size = cpumask_weight(sched_domain_span(sd));
> +
> +       *first_cpu = id;
> +       *cpu_num = size;
> +
> +       for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
> +               if (!cache_topology[cpu][cache_level])
> +                       break;
> +
> +               first = -1;
> +               last = id;
> +               for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
> +                       if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
> +                               if (cpu_id >= id && cpu_id < id + size) {
> +                                       first = (first == -1)?cpu_id:first;
> +                                       last = cpu_id;
> +                               } else
> +                                       return;
> +                       }
> +               }
> +               *first_cpu = first;
> +               *cpu_num = last - first + 1;
> +       }
> +}
> +
>  static void update_top_cache_domain(int cpu)
>  {
>         struct sched_domain_shared *sds = NULL;
> @@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
>
>         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>         if (sd) {
> -               id = cpumask_first(sched_domain_span(sd));
> -               size = cpumask_weight(sched_domain_span(sd));
> +               set_sd_llc(cpu, sd, &id, &size);

In scheduler, we look for the last level of SD_SHARE_PKG_RESOURCES to
find shared memory. It seems that cpu_coregroup_mask doesn't return
the correct cpumask in your case as it returns a full cluster instead
of a subset

>                 sds = sd->shared;

sds must  stay aligned with id and size so instead of modifying id and
size you should returns a cpumask that reflects your topology

>         }
>
> --
> 2.7.4
>

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-10 14:18   ` Vincent Guittot
  0 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-03-10 14:18 UTC (permalink / raw)
  To: Qing Wang
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>
> From: Wang Qing <wangqing@vivo.com>
>
> Some architectures(e.g. ARM64), caches are implemented below:
> cluster:              ****** cluster 0 *****      ****** cluster 1 *****
> core:                 0      1      2      3      4      5      6      7
> cache(Leveln):        **cache0**   **cache1**    **cache2**   **cache3**
> sd_llc_id(current):   0      0      0      0      4      4      4      4
> sd_llc_id(should be): 0      0      2      2      4      4      6      6
>
> Caches and cpus have different topology, this causes cpus_share_cache()
> return the wrong value, which will affect the CPU load balance.

What does your current scheduler topology  look like?

For CPU 0 to 3, do you have the below ?
DIE [0     -     3] [4-7]
MC  [0] [1] [2] [3]

But you would like something like below for cpu 0-1 instead ?
DIE [0     -     3] [4-7]
CLS [0 - 1] [2 - 3]
MC  [0] [1]

with SD_SHARE_PKG_RESOURCES only set to MC level ?

>
> Cache topology should be separated with CPU topology, it can be obtained
> from "next-level-cache" in DTS preferentially.
>
> Signed-off-by: Wang Qing <wangqing@vivo.com>
> ---
>  arch/arm64/kernel/smp.c       |  1 +
>  drivers/base/arch_topology.c  | 23 +++++++++++++++++++++++
>  include/linux/arch_topology.h |  3 +++
>  kernel/sched/topology.c       | 33 +++++++++++++++++++++++++++++++--
>  4 files changed, 58 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 27df5c1..94cf649
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>         unsigned int this_cpu;
>
>         init_cpu_topology();
> +       init_cpu_cache_topology();
>
>         this_cpu = smp_processor_id();
>         store_cpu_topology(this_cpu);
> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> index 9761541..613213f
> --- a/drivers/base/arch_topology.c
> +++ b/drivers/base/arch_topology.c
> @@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
>   */
>  struct cpu_topology cpu_topology[NR_CPUS];
>  EXPORT_SYMBOL_GPL(cpu_topology);
> +struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];

AFAICT, arch_topology.c is only used by arm/arm64 and riscv so this is
not initialized for other archs

>
>  const struct cpumask *cpu_coregroup_mask(int cpu)
>  {
> @@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
>         else if (of_have_populated_dt() && parse_dt_topology())
>                 reset_cpu_topology();
>  }
> +
> +void __init init_cpu_cache_topology(void)
> +{
> +       struct device_node *node_cpu, *node_cache;
> +       int cpu, level;
> +
> +       for_each_possible_cpu(cpu) {
> +               node_cpu = of_get_cpu_node(cpu, NULL);
> +               if (!node_cpu)
> +                       continue;
> +
> +               level = 0;
> +               node_cache = node_cpu;
> +               while (level < MAX_CPU_CACHE_LEVEL) {
> +                       node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
> +                       if (!node_cache)
> +                               break;
> +                       cache_topology[cpu][level++] = node_cache;
> +               }
> +               of_node_put(node_cpu);
> +       }
> +}
>  #endif
> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> index cce6136b..d37f47d
> --- a/include/linux/arch_topology.h
> +++ b/include/linux/arch_topology.h
> @@ -72,6 +72,8 @@ struct cpu_topology {
>  };
>
>  #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
> +#define MAX_CPU_CACHE_LEVEL 7
> +extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
>  extern struct cpu_topology cpu_topology[NR_CPUS];
>
>  #define topology_physical_package_id(cpu)      (cpu_topology[cpu].package_id)
> @@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
>  #define topology_cluster_cpumask(cpu)  (&cpu_topology[cpu].cluster_sibling)
>  #define topology_llc_cpumask(cpu)      (&cpu_topology[cpu].llc_sibling)
>  void init_cpu_topology(void);
> +void init_cpu_cache_topology(void);
>  void store_cpu_topology(unsigned int cpuid);
>  const struct cpumask *cpu_coregroup_mask(int cpu);
>  const struct cpumask *cpu_clustergroup_mask(int cpu);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index d201a70..10850d6
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
>  DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>
> +static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
> +{
> +       int cache_level, cpu_id;
> +       int first, last;
> +       int id = cpumask_first(sched_domain_span(sd));
> +       int size = cpumask_weight(sched_domain_span(sd));
> +
> +       *first_cpu = id;
> +       *cpu_num = size;
> +
> +       for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
> +               if (!cache_topology[cpu][cache_level])
> +                       break;
> +
> +               first = -1;
> +               last = id;
> +               for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
> +                       if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
> +                               if (cpu_id >= id && cpu_id < id + size) {
> +                                       first = (first == -1)?cpu_id:first;
> +                                       last = cpu_id;
> +                               } else
> +                                       return;
> +                       }
> +               }
> +               *first_cpu = first;
> +               *cpu_num = last - first + 1;
> +       }
> +}
> +
>  static void update_top_cache_domain(int cpu)
>  {
>         struct sched_domain_shared *sds = NULL;
> @@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
>
>         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>         if (sd) {
> -               id = cpumask_first(sched_domain_span(sd));
> -               size = cpumask_weight(sched_domain_span(sd));
> +               set_sd_llc(cpu, sd, &id, &size);

In scheduler, we look for the last level of SD_SHARE_PKG_RESOURCES to
find shared memory. It seems that cpu_coregroup_mask doesn't return
the correct cpumask in your case as it returns a full cluster instead
of a subset

>                 sds = sd->shared;

sds must  stay aligned with id and size so instead of modifying id and
size you should returns a cpumask that reflects your topology

>         }
>
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-10 12:58 ` Qing Wang
@ 2022-03-10 17:35   ` kernel test robot
  -1 siblings, 0 replies; 36+ messages in thread
From: kernel test robot @ 2022-03-10 17:35 UTC (permalink / raw)
  To: Qing Wang, Catalin Marinas, Will Deacon, Sudeep Holla,
	Greg Kroah-Hartman, Rafael J. Wysocki, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira, linux-arm-kernel,
	linux-kernel
  Cc: kbuild-all, Wang Qing

Hi Qing,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on arm64/for-next/core]
[also build test WARNING on driver-core/driver-core-testing tip/sched/core linus/master v5.17-rc7 next-20220310]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Qing-Wang/sched-topology-make-cache-topology-separate-from-cpu-topology/20220310-210139
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: i386-randconfig-a003 (https://download.01.org/0day-ci/archive/20220311/202203110141.qsdgjdyC-lkp@intel.com/config)
compiler: gcc-9 (Ubuntu 9.4.0-1ubuntu1~20.04) 9.4.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/035814a878b0c4cb313ebd35464e2149d8592d89
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Qing-Wang/sched-topology-make-cache-topology-separate-from-cpu-topology/20220310-210139
        git checkout 035814a878b0c4cb313ebd35464e2149d8592d89
        # save the config file to linux build tree
        mkdir build_dir
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash kernel/sched/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   kernel/sched/topology.c: In function 'set_sd_llc':
   kernel/sched/topology.c:663:38: error: 'MAX_CPU_CACHE_LEVEL' undeclared (first use in this function)
     663 |  for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
         |                                      ^~~~~~~~~~~~~~~~~~~
   kernel/sched/topology.c:663:38: note: each undeclared identifier is reported only once for each function it appears in
   kernel/sched/topology.c:664:8: error: 'cache_topology' undeclared (first use in this function); did you mean 'cpu_topology'?
     664 |   if (!cache_topology[cpu][cache_level])
         |        ^~~~~~~~~~~~~~
         |        cpu_topology
>> kernel/sched/topology.c:653:28: warning: parameter 'cpu' set but not used [-Wunused-but-set-parameter]
     653 | static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
         |                        ~~~~^~~


vim +/cpu +653 kernel/sched/topology.c

   652	
 > 653	static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
   654	{
   655		int cache_level, cpu_id;
   656		int first, last;
   657		int id = cpumask_first(sched_domain_span(sd));
   658		int size = cpumask_weight(sched_domain_span(sd));
   659	
   660		*first_cpu = id;
   661		*cpu_num = size;
   662	
 > 663		for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
   664			if (!cache_topology[cpu][cache_level])
   665				break;
   666	
   667			first = -1;
   668			last = id;
   669			for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
   670				if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
   671					if (cpu_id >= id && cpu_id < id + size) {
   672						first = (first == -1)?cpu_id:first;
   673						last = cpu_id;
   674					} else
   675						return;
   676				}
   677			}
   678			*first_cpu = first;
   679			*cpu_num = last - first + 1;
   680		}
   681	}
   682	

---
0-DAY CI Kernel Test Service
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-10 17:35   ` kernel test robot
  0 siblings, 0 replies; 36+ messages in thread
From: kernel test robot @ 2022-03-10 17:35 UTC (permalink / raw)
  To: Qing Wang, Catalin Marinas, Will Deacon, Sudeep Holla,
	Greg Kroah-Hartman, Rafael J. Wysocki, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira, linux-arm-kernel,
	linux-kernel
  Cc: kbuild-all, Wang Qing

Hi Qing,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on arm64/for-next/core]
[also build test WARNING on driver-core/driver-core-testing tip/sched/core linus/master v5.17-rc7 next-20220310]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Qing-Wang/sched-topology-make-cache-topology-separate-from-cpu-topology/20220310-210139
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: i386-randconfig-a003 (https://download.01.org/0day-ci/archive/20220311/202203110141.qsdgjdyC-lkp@intel.com/config)
compiler: gcc-9 (Ubuntu 9.4.0-1ubuntu1~20.04) 9.4.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/035814a878b0c4cb313ebd35464e2149d8592d89
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Qing-Wang/sched-topology-make-cache-topology-separate-from-cpu-topology/20220310-210139
        git checkout 035814a878b0c4cb313ebd35464e2149d8592d89
        # save the config file to linux build tree
        mkdir build_dir
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash kernel/sched/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   kernel/sched/topology.c: In function 'set_sd_llc':
   kernel/sched/topology.c:663:38: error: 'MAX_CPU_CACHE_LEVEL' undeclared (first use in this function)
     663 |  for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
         |                                      ^~~~~~~~~~~~~~~~~~~
   kernel/sched/topology.c:663:38: note: each undeclared identifier is reported only once for each function it appears in
   kernel/sched/topology.c:664:8: error: 'cache_topology' undeclared (first use in this function); did you mean 'cpu_topology'?
     664 |   if (!cache_topology[cpu][cache_level])
         |        ^~~~~~~~~~~~~~
         |        cpu_topology
>> kernel/sched/topology.c:653:28: warning: parameter 'cpu' set but not used [-Wunused-but-set-parameter]
     653 | static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
         |                        ~~~~^~~


vim +/cpu +653 kernel/sched/topology.c

   652	
 > 653	static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
   654	{
   655		int cache_level, cpu_id;
   656		int first, last;
   657		int id = cpumask_first(sched_domain_span(sd));
   658		int size = cpumask_weight(sched_domain_span(sd));
   659	
   660		*first_cpu = id;
   661		*cpu_num = size;
   662	
 > 663		for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
   664			if (!cache_topology[cpu][cache_level])
   665				break;
   666	
   667			first = -1;
   668			last = id;
   669			for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
   670				if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
   671					if (cpu_id >= id && cpu_id < id + size) {
   672						first = (first == -1)?cpu_id:first;
   673						last = cpu_id;
   674					} else
   675						return;
   676				}
   677			}
   678			*first_cpu = first;
   679			*cpu_num = last - first + 1;
   680		}
   681	}
   682	

---
0-DAY CI Kernel Test Service
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-10 14:18   ` Vincent Guittot
@ 2022-03-11  2:03     ` 王擎
  -1 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-03-11  2:03 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

 
>On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>>
>> From: Wang Qing <wangqing@vivo.com>
>>
>> Some architectures(e.g. ARM64), caches are implemented below:
>> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> core:                         0      1          2      3          4      5           6      7
(add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
(add cache level 3)        *************share level 3 cache ***************
>> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>>
Here, n is always 2 on ARM64, but other values are also possible.
core[0,1] form a complex (ARMv9) which shares the L2 cache; core[2,3] is the same.

>> Caches and cpus have different topology, this causes cpus_share_cache()
>> return the wrong value, which will affect the CPU load balance.
>>
>What does your current scheduler topology  look like?
>
>For CPU 0 to 3, do you have the below ?
>DIE [0     -     3] [4-7]
>MC  [0] [1] [2] [3]

The current scheduler topology is consistent with the CPU topology:
DIE  [0-7]           
MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
Most Android phones have this topology.
>
>But you would like something like below for cpu 0-1 instead ?
>DIE [0     -     3] [4-7]
>CLS [0 - 1] [2 - 3]
>MC  [0] [1]
>
>with SD_SHARE_PKG_RESOURCES only set to MC level ?

We don't change the current scheduler topology, but the
cache topology should be separated as below:
[0-7]                          (shared level 3 cache )
[0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>
>>
>> Cache topology should be separated with CPU topology, it can be obtained
>> from "next-level-cache" in DTS preferentially.
>>
>> Signed-off-by: Wang Qing <wangqing@vivo.com>
>> ---
>>  arch/arm64/kernel/smp.c       |  1 +
>>  drivers/base/arch_topology.c  | 23 +++++++++++++++++++++++
>>  include/linux/arch_topology.h |  3 +++
>>  kernel/sched/topology.c       | 33 +++++++++++++++++++++++++++++++--
>>  4 files changed, 58 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
>> index 27df5c1..94cf649
>> --- a/arch/arm64/kernel/smp.c
>> +++ b/arch/arm64/kernel/smp.c
>> @@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>>         unsigned int this_cpu;
>>
>>         init_cpu_topology();
>> +       init_cpu_cache_topology();
>>
>>         this_cpu = smp_processor_id();
>>         store_cpu_topology(this_cpu);
>> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
>> index 9761541..613213f
>> --- a/drivers/base/arch_topology.c
>> +++ b/drivers/base/arch_topology.c
>> @@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
>>   */
>>  struct cpu_topology cpu_topology[NR_CPUS];
>>  EXPORT_SYMBOL_GPL(cpu_topology);
>> +struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
>
>AFAICT, arch_topology.c is only used by arm/arm64 and riscv so this is
>not initialized for other archs

I see, will be fixed in V2.

Thanks,
Wang
>
>>
>>  const struct cpumask *cpu_coregroup_mask(int cpu)
>>  {
>> @@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
>>         else if (of_have_populated_dt() && parse_dt_topology())
>>                 reset_cpu_topology();
>>  }
>> +
>> +void __init init_cpu_cache_topology(void)
>> +{
>> +       struct device_node *node_cpu, *node_cache;
>> +       int cpu, level;
>> +
>> +       for_each_possible_cpu(cpu) {
>> +               node_cpu = of_get_cpu_node(cpu, NULL);
>> +               if (!node_cpu)
>> +                       continue;
>> +
>> +               level = 0;
>> +               node_cache = node_cpu;
>> +               while (level < MAX_CPU_CACHE_LEVEL) {
>> +                       node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
>> +                       if (!node_cache)
>> +                               break;
>> +                       cache_topology[cpu][level++] = node_cache;
>> +               }
>> +               of_node_put(node_cpu);
>> +       }
>> +}
>>  #endif
>> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
>> index cce6136b..d37f47d
>> --- a/include/linux/arch_topology.h
>> +++ b/include/linux/arch_topology.h
>> @@ -72,6 +72,8 @@ struct cpu_topology {
>>  };
>>
>>  #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
>> +#define MAX_CPU_CACHE_LEVEL 7
>> +extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
>>  extern struct cpu_topology cpu_topology[NR_CPUS];
>>
>>  #define topology_physical_package_id(cpu)      (cpu_topology[cpu].package_id)
>> @@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
>>  #define topology_cluster_cpumask(cpu)  (&cpu_topology[cpu].cluster_sibling)
>>  #define topology_llc_cpumask(cpu)      (&cpu_topology[cpu].llc_sibling)
>>  void init_cpu_topology(void);
>> +void init_cpu_cache_topology(void);
>>  void store_cpu_topology(unsigned int cpuid);
>>  const struct cpumask *cpu_coregroup_mask(int cpu);
>>  const struct cpumask *cpu_clustergroup_mask(int cpu);
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index d201a70..10850d6
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
>>  DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>>  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>>
>> +static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
>> +{
>> +       int cache_level, cpu_id;
>> +       int first, last;
>> +       int id = cpumask_first(sched_domain_span(sd));
>> +       int size = cpumask_weight(sched_domain_span(sd));
>> +
>> +       *first_cpu = id;
>> +       *cpu_num = size;
>> +
>> +       for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
>> +               if (!cache_topology[cpu][cache_level])
>> +                       break;
>> +
>> +               first = -1;
>> +               last = id;
>> +               for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
>> +                       if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
>> +                               if (cpu_id >= id && cpu_id < id + size) {
>> +                                       first = (first == -1)?cpu_id:first;
>> +                                       last = cpu_id;
>> +                               } else
>> +                                       return;
>> +                       }
>> +               }
>> +               *first_cpu = first;
>> +               *cpu_num = last - first + 1;
>> +       }
>> +}
>> +
>>  static void update_top_cache_domain(int cpu)
>>  {
>>         struct sched_domain_shared *sds = NULL;
>> @@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
>>
>>         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>>         if (sd) {
>> -               id = cpumask_first(sched_domain_span(sd));
>> -               size = cpumask_weight(sched_domain_span(sd));
>> +               set_sd_llc(cpu, sd, &id, &size);
>
>In scheduler, we look for the last level of SD_SHARE_PKG_RESOURCES to
>find shared memory. It seems that cpu_coregroup_mask doesn't return
>the correct cpumask in your case as it returns a full cluster instead
>of a subset
>
>>                 sds = sd->shared;
>
>sds must  stay aligned with id and size so instead of modifying id and
>size you should returns a cpumask that reflects your topology
>
>>         }
>>
>> --
>> 2.7.4
>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-11  2:03     ` 王擎
  0 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-03-11  2:03 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

 
>On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>>
>> From: Wang Qing <wangqing@vivo.com>
>>
>> Some architectures(e.g. ARM64), caches are implemented below:
>> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> core:                         0      1          2      3          4      5           6      7
(add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
(add cache level 3)        *************share level 3 cache ***************
>> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>>
Here, n is always 2 on ARM64, but other values are also possible.
core[0,1] form a complex (ARMv9) which shares the L2 cache; core[2,3] is the same.

>> Caches and cpus have different topology, this causes cpus_share_cache()
>> return the wrong value, which will affect the CPU load balance.
>>
>What does your current scheduler topology  look like?
>
>For CPU 0 to 3, do you have the below ?
>DIE [0     -     3] [4-7]
>MC  [0] [1] [2] [3]

The current scheduler topology is consistent with the CPU topology:
DIE  [0-7]           
MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
Most Android phones have this topology.
>
>But you would like something like below for cpu 0-1 instead ?
>DIE [0     -     3] [4-7]
>CLS [0 - 1] [2 - 3]
>MC  [0] [1]
>
>with SD_SHARE_PKG_RESOURCES only set to MC level ?

We don't change the current scheduler topology, but the
cache topology should be separated as below:
[0-7]                          (shared level 3 cache )
[0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>
>>
>> Cache topology should be separated with CPU topology, it can be obtained
>> from "next-level-cache" in DTS preferentially.
>>
>> Signed-off-by: Wang Qing <wangqing@vivo.com>
>> ---
>>  arch/arm64/kernel/smp.c       |  1 +
>>  drivers/base/arch_topology.c  | 23 +++++++++++++++++++++++
>>  include/linux/arch_topology.h |  3 +++
>>  kernel/sched/topology.c       | 33 +++++++++++++++++++++++++++++++--
>>  4 files changed, 58 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
>> index 27df5c1..94cf649
>> --- a/arch/arm64/kernel/smp.c
>> +++ b/arch/arm64/kernel/smp.c
>> @@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>>         unsigned int this_cpu;
>>
>>         init_cpu_topology();
>> +       init_cpu_cache_topology();
>>
>>         this_cpu = smp_processor_id();
>>         store_cpu_topology(this_cpu);
>> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
>> index 9761541..613213f
>> --- a/drivers/base/arch_topology.c
>> +++ b/drivers/base/arch_topology.c
>> @@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
>>   */
>>  struct cpu_topology cpu_topology[NR_CPUS];
>>  EXPORT_SYMBOL_GPL(cpu_topology);
>> +struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
>
>AFAICT, arch_topology.c is only used by arm/arm64 and riscv so this is
>not initialized for other archs

I see, will be fixed in V2.

Thanks,
Wang
>
>>
>>  const struct cpumask *cpu_coregroup_mask(int cpu)
>>  {
>> @@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
>>         else if (of_have_populated_dt() && parse_dt_topology())
>>                 reset_cpu_topology();
>>  }
>> +
>> +void __init init_cpu_cache_topology(void)
>> +{
>> +       struct device_node *node_cpu, *node_cache;
>> +       int cpu, level;
>> +
>> +       for_each_possible_cpu(cpu) {
>> +               node_cpu = of_get_cpu_node(cpu, NULL);
>> +               if (!node_cpu)
>> +                       continue;
>> +
>> +               level = 0;
>> +               node_cache = node_cpu;
>> +               while (level < MAX_CPU_CACHE_LEVEL) {
>> +                       node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
>> +                       if (!node_cache)
>> +                               break;
>> +                       cache_topology[cpu][level++] = node_cache;
>> +               }
>> +               of_node_put(node_cpu);
>> +       }
>> +}
>>  #endif
>> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
>> index cce6136b..d37f47d
>> --- a/include/linux/arch_topology.h
>> +++ b/include/linux/arch_topology.h
>> @@ -72,6 +72,8 @@ struct cpu_topology {
>>  };
>>
>>  #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
>> +#define MAX_CPU_CACHE_LEVEL 7
>> +extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
>>  extern struct cpu_topology cpu_topology[NR_CPUS];
>>
>>  #define topology_physical_package_id(cpu)      (cpu_topology[cpu].package_id)
>> @@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
>>  #define topology_cluster_cpumask(cpu)  (&cpu_topology[cpu].cluster_sibling)
>>  #define topology_llc_cpumask(cpu)      (&cpu_topology[cpu].llc_sibling)
>>  void init_cpu_topology(void);
>> +void init_cpu_cache_topology(void);
>>  void store_cpu_topology(unsigned int cpuid);
>>  const struct cpumask *cpu_coregroup_mask(int cpu);
>>  const struct cpumask *cpu_clustergroup_mask(int cpu);
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index d201a70..10850d6
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
>>  DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
>>  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
>>
>> +static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
>> +{
>> +       int cache_level, cpu_id;
>> +       int first, last;
>> +       int id = cpumask_first(sched_domain_span(sd));
>> +       int size = cpumask_weight(sched_domain_span(sd));
>> +
>> +       *first_cpu = id;
>> +       *cpu_num = size;
>> +
>> +       for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
>> +               if (!cache_topology[cpu][cache_level])
>> +                       break;
>> +
>> +               first = -1;
>> +               last = id;
>> +               for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
>> +                       if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
>> +                               if (cpu_id >= id && cpu_id < id + size) {
>> +                                       first = (first == -1)?cpu_id:first;
>> +                                       last = cpu_id;
>> +                               } else
>> +                                       return;
>> +                       }
>> +               }
>> +               *first_cpu = first;
>> +               *cpu_num = last - first + 1;
>> +       }
>> +}
>> +
>>  static void update_top_cache_domain(int cpu)
>>  {
>>         struct sched_domain_shared *sds = NULL;
>> @@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
>>
>>         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>>         if (sd) {
>> -               id = cpumask_first(sched_domain_span(sd));
>> -               size = cpumask_weight(sched_domain_span(sd));
>> +               set_sd_llc(cpu, sd, &id, &size);
>
>In scheduler, we look for the last level of SD_SHARE_PKG_RESOURCES to
>find shared memory. It seems that cpu_coregroup_mask doesn't return
>the correct cpumask in your case as it returns a full cluster instead
>of a subset
>
>>                 sds = sd->shared;
>
>sds must  stay aligned with id and size so instead of modifying id and
>size you should returns a cpumask that reflects your topology
>
>>         }
>>
>> --
>> 2.7.4
>
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-11  2:03     ` 王擎
@ 2022-03-11  7:59       ` Vincent Guittot
  -1 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-03-11  7:59 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Fri, 11 Mar 2022 at 03:03, 王擎 <wangqing@vivo.com> wrote:
>
>
> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
> >>
> >> From: Wang Qing <wangqing@vivo.com>
> >>
> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
> >> core:                         0      1          2      3          4      5           6      7
> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
> (add cache level 3)        *************share level 3 cache ***************
> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
> >>
> Here, n always be 2 in ARM64, but others are also possible.
> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>
> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> return the wrong value, which will affect the CPU load balance.
> >>
> >What does your current scheduler topology  look like?
> >
> >For CPU 0 to 3, do you have the below ?
> >DIE [0     -     3] [4-7]
> >MC  [0] [1] [2] [3]
>
> The current scheduler topology consistent with CPU topology:
> DIE  [0-7]
> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
> Most Android phones have this topology.
> >
> >But you would like something like below for cpu 0-1 instead ?
> >DIE [0     -     3] [4-7]
> >CLS [0 - 1] [2 - 3]
> >MC  [0] [1]
> >
> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>
> We don't change the current scheduler topology, but the
> cache topology should be separated like below:

The scheduler topology is not only the CPU topology but a mix of CPU and
cache/memory topology.

> [0-7]                          (shared level 3 cache )
> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )

So you don't bother with the intermediate cluster level, which is even simpler.
You have to modify the generic arch topology code so that cpu_coregroup_mask
returns the correct cpu mask directly.

You will notice an llc_sibling field that is currently used by ACPI, but
not by DT, to return the LLC cpu mask.

> >
> >>
> >> Cache topology should be separated with CPU topology, it can be obtained
> >> from "next-level-cache" in DTS preferentially.
> >>
> >> Signed-off-by: Wang Qing <wangqing@vivo.com>
> >> ---
> >>  arch/arm64/kernel/smp.c       |  1 +
> >>  drivers/base/arch_topology.c  | 23 +++++++++++++++++++++++
> >>  include/linux/arch_topology.h |  3 +++
> >>  kernel/sched/topology.c       | 33 +++++++++++++++++++++++++++++++--
> >>  4 files changed, 58 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> >> index 27df5c1..94cf649
> >> --- a/arch/arm64/kernel/smp.c
> >> +++ b/arch/arm64/kernel/smp.c
> >> @@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
> >>         unsigned int this_cpu;
> >>
> >>         init_cpu_topology();
> >> +       init_cpu_cache_topology();
> >>
> >>         this_cpu = smp_processor_id();
> >>         store_cpu_topology(this_cpu);
> >> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> >> index 9761541..613213f
> >> --- a/drivers/base/arch_topology.c
> >> +++ b/drivers/base/arch_topology.c
> >> @@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
> >>   */
> >>  struct cpu_topology cpu_topology[NR_CPUS];
> >>  EXPORT_SYMBOL_GPL(cpu_topology);
> >> +struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
> >
> >AFAICT, arch_topology.c is only used by arm/arm64 and riscv so this is
> >not initialized for other archs
>
> I see, will be fixed in V2.
>
> Thanks,
> Wang
> >
> >>
> >>  const struct cpumask *cpu_coregroup_mask(int cpu)
> >>  {
> >> @@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
> >>         else if (of_have_populated_dt() && parse_dt_topology())
> >>                 reset_cpu_topology();
> >>  }
> >> +
> >> +void __init init_cpu_cache_topology(void)
> >> +{
> >> +       struct device_node *node_cpu, *node_cache;
> >> +       int cpu, level;
> >> +
> >> +       for_each_possible_cpu(cpu) {
> >> +               node_cpu = of_get_cpu_node(cpu, NULL);
> >> +               if (!node_cpu)
> >> +                       continue;
> >> +
> >> +               level = 0;
> >> +               node_cache = node_cpu;
> >> +               while (level < MAX_CPU_CACHE_LEVEL) {
> >> +                       node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
> >> +                       if (!node_cache)
> >> +                               break;
> >> +                       cache_topology[cpu][level++] = node_cache;
> >> +               }
> >> +               of_node_put(node_cpu);
> >> +       }
> >> +}
> >>  #endif
> >> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> >> index cce6136b..d37f47d
> >> --- a/include/linux/arch_topology.h
> >> +++ b/include/linux/arch_topology.h
> >> @@ -72,6 +72,8 @@ struct cpu_topology {
> >>  };
> >>
> >>  #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
> >> +#define MAX_CPU_CACHE_LEVEL 7
> >> +extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
> >>  extern struct cpu_topology cpu_topology[NR_CPUS];
> >>
> >>  #define topology_physical_package_id(cpu)      (cpu_topology[cpu].package_id)
> >> @@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
> >>  #define topology_cluster_cpumask(cpu)  (&cpu_topology[cpu].cluster_sibling)
> >>  #define topology_llc_cpumask(cpu)      (&cpu_topology[cpu].llc_sibling)
> >>  void init_cpu_topology(void);
> >> +void init_cpu_cache_topology(void);
> >>  void store_cpu_topology(unsigned int cpuid);
> >>  const struct cpumask *cpu_coregroup_mask(int cpu);
> >>  const struct cpumask *cpu_clustergroup_mask(int cpu);
> >> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> >> index d201a70..10850d6
> >> --- a/kernel/sched/topology.c
> >> +++ b/kernel/sched/topology.c
> >> @@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> >>  DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> >>  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> >>
> >> +static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
> >> +{
> >> +       int cache_level, cpu_id;
> >> +       int first, last;
> >> +       int id = cpumask_first(sched_domain_span(sd));
> >> +       int size = cpumask_weight(sched_domain_span(sd));
> >> +
> >> +       *first_cpu = id;
> >> +       *cpu_num = size;
> >> +
> >> +       for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
> >> +               if (!cache_topology[cpu][cache_level])
> >> +                       break;
> >> +
> >> +               first = -1;
> >> +               last = id;
> >> +               for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
> >> +                       if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
> >> +                               if (cpu_id >= id && cpu_id < id + size) {
> >> +                                       first = (first == -1)?cpu_id:first;
> >> +                                       last = cpu_id;
> >> +                               } else
> >> +                                       return;
> >> +                       }
> >> +               }
> >> +               *first_cpu = first;
> >> +               *cpu_num = last - first + 1;
> >> +       }
> >> +}
> >> +
> >>  static void update_top_cache_domain(int cpu)
> >>  {
> >>         struct sched_domain_shared *sds = NULL;
> >> @@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
> >>
> >>         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
> >>         if (sd) {
> >> -               id = cpumask_first(sched_domain_span(sd));
> >> -               size = cpumask_weight(sched_domain_span(sd));
> >> +               set_sd_llc(cpu, sd, &id, &size);
> >
> >In scheduler, we look for the last level of SD_SHARE_PKG_RESOURCES to
> >find shared memory. It seems that cpu_coregroup_mask doesn't return
> >the correct cpumask in your case as it returns a full cluster instead
> >of a subset
> >
> >>                 sds = sd->shared;
> >
> >sds must  stay aligned with id and size so instead of modifying id and
> >size you should returns a cpumask that reflects your topology
> >
> >>         }
> >>
> >> --
> >> 2.7.4
> >

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-11  7:59       ` Vincent Guittot
  0 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-03-11  7:59 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Fri, 11 Mar 2022 at 03:03, 王擎 <wangqing@vivo.com> wrote:
>
>
> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
> >>
> >> From: Wang Qing <wangqing@vivo.com>
> >>
> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
> >> core:                         0      1          2      3          4      5           6      7
> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
> (add cache level 3)        *************share level 3 cache ***************
> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
> >>
> Here, n is always 2 on ARM64, but other values are also possible.
> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>
> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> return the wrong value, which will affect the CPU load balance.
> >>
> >What does your current scheduler topology  look like?
> >
> >For CPU 0 to 3, do you have the below ?
> >DIE [0     -     3] [4-7]
> >MC  [0] [1] [2] [3]
>
> The current scheduler topology consistent with CPU topology:
> DIE  [0-7]
> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
> Most Android phones have this topology.
> >
> >But you would like something like below for cpu 0-1 instead ?
> >DIE [0     -     3] [4-7]
> >CLS [0 - 1] [2 - 3]
> >MC  [0] [1]
> >
> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>
> We don't change the current scheduler topology, but the
> cache topology should be separated like below:

The scheduler topology is not only cpu topology but a mixed of cpu and
cache/memory cache topology

> [0-7]                          (shared level 3 cache )
> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )

So you don't bother with the intermediate cluster level, which is even simpler.
you have to modify generic arch topology so that cpu_coregroup_mask
returns the correct cpu mask directly.

You will notice a llc_sibling field that is currently used by acpi but
not DT to return llc cpu mask

> >
> >>
> >> Cache topology should be separated with CPU topology, it can be obtained
> >> from "next-level-cache" in DTS preferentially.
> >>
> >> Signed-off-by: Wang Qing <wangqing@vivo.com>
> >> ---
> >>  arch/arm64/kernel/smp.c       |  1 +
> >>  drivers/base/arch_topology.c  | 23 +++++++++++++++++++++++
> >>  include/linux/arch_topology.h |  3 +++
> >>  kernel/sched/topology.c       | 33 +++++++++++++++++++++++++++++++--
> >>  4 files changed, 58 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> >> index 27df5c1..94cf649
> >> --- a/arch/arm64/kernel/smp.c
> >> +++ b/arch/arm64/kernel/smp.c
> >> @@ -723,6 +723,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
> >>         unsigned int this_cpu;
> >>
> >>         init_cpu_topology();
> >> +       init_cpu_cache_topology();
> >>
> >>         this_cpu = smp_processor_id();
> >>         store_cpu_topology(this_cpu);
> >> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> >> index 9761541..613213f
> >> --- a/drivers/base/arch_topology.c
> >> +++ b/drivers/base/arch_topology.c
> >> @@ -613,6 +613,7 @@ static int __init parse_dt_topology(void)
> >>   */
> >>  struct cpu_topology cpu_topology[NR_CPUS];
> >>  EXPORT_SYMBOL_GPL(cpu_topology);
> >> +struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
> >
> >AFAICT, arch_topology.c is only used by arm/arm64 and riscv so this is
> >not initialized for other archs
>
> I see, will be fixed in V2.
>
> Thanks,
> Wang
> >
> >>
> >>  const struct cpumask *cpu_coregroup_mask(int cpu)
> >>  {
> >> @@ -738,4 +739,26 @@ void __init init_cpu_topology(void)
> >>         else if (of_have_populated_dt() && parse_dt_topology())
> >>                 reset_cpu_topology();
> >>  }
> >> +
> >> +void __init init_cpu_cache_topology(void)
> >> +{
> >> +       struct device_node *node_cpu, *node_cache;
> >> +       int cpu, level;
> >> +
> >> +       for_each_possible_cpu(cpu) {
> >> +               node_cpu = of_get_cpu_node(cpu, NULL);
> >> +               if (!node_cpu)
> >> +                       continue;
> >> +
> >> +               level = 0;
> >> +               node_cache = node_cpu;
> >> +               while (level < MAX_CPU_CACHE_LEVEL) {
> >> +                       node_cache = of_parse_phandle(node_cache, "next-level-cache", 0);
> >> +                       if (!node_cache)
> >> +                               break;
> >> +                       cache_topology[cpu][level++] = node_cache;
> >> +               }
> >> +               of_node_put(node_cpu);
> >> +       }
> >> +}
> >>  #endif
> >> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> >> index cce6136b..d37f47d
> >> --- a/include/linux/arch_topology.h
> >> +++ b/include/linux/arch_topology.h
> >> @@ -72,6 +72,8 @@ struct cpu_topology {
> >>  };
> >>
> >>  #ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
> >> +#define MAX_CPU_CACHE_LEVEL 7
> >> +extern struct device_node *cache_topology[NR_CPUS][MAX_CPU_CACHE_LEVEL];
> >>  extern struct cpu_topology cpu_topology[NR_CPUS];
> >>
> >>  #define topology_physical_package_id(cpu)      (cpu_topology[cpu].package_id)
> >> @@ -82,6 +84,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
> >>  #define topology_cluster_cpumask(cpu)  (&cpu_topology[cpu].cluster_sibling)
> >>  #define topology_llc_cpumask(cpu)      (&cpu_topology[cpu].llc_sibling)
> >>  void init_cpu_topology(void);
> >> +void init_cpu_cache_topology(void);
> >>  void store_cpu_topology(unsigned int cpuid);
> >>  const struct cpumask *cpu_coregroup_mask(int cpu);
> >>  const struct cpumask *cpu_clustergroup_mask(int cpu);
> >> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> >> index d201a70..10850d6
> >> --- a/kernel/sched/topology.c
> >> +++ b/kernel/sched/topology.c
> >> @@ -650,6 +650,36 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
> >>  DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
> >>  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
> >>
> >> +static void set_sd_llc(int cpu, struct sched_domain *sd, int *first_cpu, int *cpu_num)
> >> +{
> >> +       int cache_level, cpu_id;
> >> +       int first, last;
> >> +       int id = cpumask_first(sched_domain_span(sd));
> >> +       int size = cpumask_weight(sched_domain_span(sd));
> >> +
> >> +       *first_cpu = id;
> >> +       *cpu_num = size;
> >> +
> >> +       for (cache_level = 0; cache_level < MAX_CPU_CACHE_LEVEL; cache_level++) {
> >> +               if (!cache_topology[cpu][cache_level])
> >> +                       break;
> >> +
> >> +               first = -1;
> >> +               last = id;
> >> +               for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) {
> >> +                       if (cache_topology[cpu][cache_level] == cache_topology[cpu_id][cache_level]) {
> >> +                               if (cpu_id >= id && cpu_id < id + size) {
> >> +                                       first = (first == -1)?cpu_id:first;
> >> +                                       last = cpu_id;
> >> +                               } else
> >> +                                       return;
> >> +                       }
> >> +               }
> >> +               *first_cpu = first;
> >> +               *cpu_num = last - first + 1;
> >> +       }
> >> +}
> >> +
> >>  static void update_top_cache_domain(int cpu)
> >>  {
> >>         struct sched_domain_shared *sds = NULL;
> >> @@ -659,8 +689,7 @@ static void update_top_cache_domain(int cpu)
> >>
> >>         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
> >>         if (sd) {
> >> -               id = cpumask_first(sched_domain_span(sd));
> >> -               size = cpumask_weight(sched_domain_span(sd));
> >> +               set_sd_llc(cpu, sd, &id, &size);
> >
> >In scheduler, we look for the last level of SD_SHARE_PKG_RESOURCES to
> >find shared memory. It seems that cpu_coregroup_mask doesn't return
> >the correct cpumask in your case as it returns a full cluster instead
> >of a subset
> >
> >>                 sds = sd->shared;
> >
> >sds must  stay aligned with id and size so instead of modifying id and
> >size you should returns a cpumask that reflects your topology
> >
> >>         }
> >>
> >> --
> >> 2.7.4
> >

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-11  7:59       ` Vincent Guittot
@ 2022-03-11  8:18         ` 王擎
  -1 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-03-11  8:18 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel


>>
>>
>> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>> >>
>> >> From: Wang Qing <wangqing@vivo.com>
>> >>
>> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> >> core:                         0      1          2      3          4      5           6      7
>> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>> (add cache level 3)        *************share level 3 cache ***************
>> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>> >>
>> Here, n always be 2 in ARM64, but others are also possible.
>> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>>
>> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> return the wrong value, which will affect the CPU load balance.
>> >>
>> >What does your current scheduler topology  look like?
>> >
>> >For CPU 0 to 3, do you have the below ?
>> >DIE [0     -     3] [4-7]
>> >MC  [0] [1] [2] [3]
>>
>> The current scheduler topology consistent with CPU topology:
>> DIE  [0-7]
>> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>> Most Android phones have this topology.
>> >
>> >But you would like something like below for cpu 0-1 instead ?
>> >DIE [0     -     3] [4-7]
>> >CLS [0 - 1] [2 - 3]
>> >MC  [0] [1]
>> >
>> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>>
>> We don't change the current scheduler topology, but the
>> cache topology should be separated like below:
>
>The scheduler topology is not only cpu topology but a mixed of cpu and
>cache/memory cache topology
>
>> [0-7]                          (shared level 3 cache )
>> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>
>So you don't  bother the intermediate cluster level which is even simpler.
>you have to modify generic arch topology so that cpu_coregroup_mask
>returns the correct cpu mask directly.
>
>You will notice a llc_sibling field that is currently used by acpi but
>not DT to return llc cpu mask
>
cpu_topology[].llc_sibling describes the last level cache of the whole system,
not the one within the sched_domain.

in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
the L3 cache sibling, but sd_llc_id describes the maximum shared cache
in sd, which should be [0-1] instead of [0-3].

Thanks,
Wang

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-11  8:18         ` 王擎
  0 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-03-11  8:18 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel


>>
>>
>> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>> >>
>> >> From: Wang Qing <wangqing@vivo.com>
>> >>
>> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> >> core:                         0      1          2      3          4      5           6      7
>> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>> (add cache level 3)        *************share level 3 cache ***************
>> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>> >>
>> Here, n always be 2 in ARM64, but others are also possible.
>> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>>
>> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> return the wrong value, which will affect the CPU load balance.
>> >>
>> >What does your current scheduler topology  look like?
>> >
>> >For CPU 0 to 3, do you have the below ?
>> >DIE [0     -     3] [4-7]
>> >MC  [0] [1] [2] [3]
>>
>> The current scheduler topology consistent with CPU topology:
>> DIE  [0-7]
>> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>> Most Android phones have this topology.
>> >
>> >But you would like something like below for cpu 0-1 instead ?
>> >DIE [0     -     3] [4-7]
>> >CLS [0 - 1] [2 - 3]
>> >MC  [0] [1]
>> >
>> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>>
>> We don't change the current scheduler topology, but the
>> cache topology should be separated like below:
>
>The scheduler topology is not only cpu topology but a mixed of cpu and
>cache/memory cache topology
>
>> [0-7]                          (shared level 3 cache )
>> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>
>So you don't  bother the intermediate cluster level which is even simpler.
>you have to modify generic arch topology so that cpu_coregroup_mask
>returns the correct cpu mask directly.
>
>You will notice a llc_sibling field that is currently used by acpi but
>not DT to return llc cpu mask
>
cpu_topology[].llc_sibling describes the last level cache of the whole system,
not the one within the sched_domain.

in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
the L3 cache sibling, but sd_llc_id describes the maximum shared cache
in sd, which should be [0-1] instead of [0-3].

Thanks,
Wang
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-11  8:18         ` 王擎
@ 2022-03-11  8:25           ` Vincent Guittot
  -1 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-03-11  8:25 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Fri, 11 Mar 2022 at 09:18, 王擎 <wangqing@vivo.com> wrote:
>
>
> >>
> >>
> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
> >> >>
> >> >> From: Wang Qing <wangqing@vivo.com>
> >> >>
> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
> >> >> core:                         0      1          2      3          4      5           6      7
> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
> >> (add cache level 3)        *************share level 3 cache ***************
> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
> >> >>
> >> Here, n always be 2 in ARM64, but others are also possible.
> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
> >>
> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> return the wrong value, which will affect the CPU load balance.
> >> >>
> >> >What does your current scheduler topology  look like?
> >> >
> >> >For CPU 0 to 3, do you have the below ?
> >> >DIE [0     -     3] [4-7]
> >> >MC  [0] [1] [2] [3]
> >>
> >> The current scheduler topology consistent with CPU topology:
> >> DIE  [0-7]
> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
> >> Most Android phones have this topology.
> >> >
> >> >But you would like something like below for cpu 0-1 instead ?
> >> >DIE [0     -     3] [4-7]
> >> >CLS [0 - 1] [2 - 3]
> >> >MC  [0] [1]
> >> >
> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >>
> >> We don't change the current scheduler topology, but the
> >> cache topology should be separated like below:
> >
> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >cache/memory cache topology
> >
> >> [0-7]                          (shared level 3 cache )
> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
> >
> >So you don't  bother the intermediate cluster level which is even simpler.
> >you have to modify generic arch topology so that cpu_coregroup_mask
> >returns the correct cpu mask directly.
> >
> >You will notice a llc_sibling field that is currently used by acpi but
> >not DT to return llc cpu mask
> >
> cpu_topology[].llc_sibling describe the last level cache of whole system,
> not in the sched_domain.
>
> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes

If llc_sibling was 0xff([0-7] on your system, you would have only one level:
MC[0-7]

> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> in sd, which should be [0-1] instead of [0-3].

sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
If you want llc to be [0-3] make sure that the
sched_domain_topology_level array returns the correct cpumask with
this flag


>
> Thanks,
> Wang

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-11  8:25           ` Vincent Guittot
  0 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-03-11  8:25 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Fri, 11 Mar 2022 at 09:18, 王擎 <wangqing@vivo.com> wrote:
>
>
> >>
> >>
> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
> >> >>
> >> >> From: Wang Qing <wangqing@vivo.com>
> >> >>
> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
> >> >> core:                         0      1          2      3          4      5           6      7
> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
> >> (add cache level 3)        *************share level 3 cache ***************
> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
> >> >>
> >> Here, n always be 2 in ARM64, but others are also possible.
> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
> >>
> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> return the wrong value, which will affect the CPU load balance.
> >> >>
> >> >What does your current scheduler topology  look like?
> >> >
> >> >For CPU 0 to 3, do you have the below ?
> >> >DIE [0     -     3] [4-7]
> >> >MC  [0] [1] [2] [3]
> >>
> >> The current scheduler topology consistent with CPU topology:
> >> DIE  [0-7]
> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
> >> Most Android phones have this topology.
> >> >
> >> >But you would like something like below for cpu 0-1 instead ?
> >> >DIE [0     -     3] [4-7]
> >> >CLS [0 - 1] [2 - 3]
> >> >MC  [0] [1]
> >> >
> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >>
> >> We don't change the current scheduler topology, but the
> >> cache topology should be separated like below:
> >
> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >cache/memory cache topology
> >
> >> [0-7]                          (shared level 3 cache )
> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
> >
> >So you don't  bother the intermediate cluster level which is even simpler.
> >you have to modify generic arch topology so that cpu_coregroup_mask
> >returns the correct cpu mask directly.
> >
> >You will notice a llc_sibling field that is currently used by acpi but
> >not DT to return llc cpu mask
> >
> cpu_topology[].llc_sibling describe the last level cache of whole system,
> not in the sched_domain.
>
> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes

If llc_sibling was 0xff([0-7] on your system, you would have only one level:
MC[0-7]

> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> in sd, which should be [0-1] instead of [0-3].

sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
If you want llc to be [0-3] make sure that the
sched_domain_topology_level array returns the correct cpumask with
this flag


>
> Thanks,
> Wang

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-11  8:25           ` Vincent Guittot
@ 2022-03-11  9:30             ` 王擎
  -1 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-03-11  9:30 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel


>>
>>
>> >>
>> >>
>> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>> >> >>
>> >> >> From: Wang Qing <wangqing@vivo.com>
>> >> >>
>> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> >> >> core:                         0      1          2      3          4      5           6      7
>> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>> >> (add cache level 3)        *************share level 3 cache ***************
>> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>> >> >>
>> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>> >>
>> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >>
>> >> >What does your current scheduler topology  look like?
>> >> >
>> >> >For CPU 0 to 3, do you have the below ?
>> >> >DIE [0     -     3] [4-7]
>> >> >MC  [0] [1] [2] [3]
>> >>
>> >> The current scheduler topology consistent with CPU topology:
>> >> DIE  [0-7]
>> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>> >> Most Android phones have this topology.
>> >> >
>> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >DIE [0     -     3] [4-7]
>> >> >CLS [0 - 1] [2 - 3]
>> >> >MC  [0] [1]
>> >> >
>> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >>
>> >> We don't change the current scheduler topology, but the
>> >> cache topology should be separated like below:
>> >
>> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >cache/memory cache topology
>> >
>> >> [0-7]                          (shared level 3 cache )
>> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>> >
>> >So you don't  bother the intermediate cluster level which is even simpler.
>> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >returns the correct cpu mask directly.
>> >
>> >You will notice a llc_sibling field that is currently used by acpi but
>> >not DT to return llc cpu mask
>> >
>> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> not in the sched_domain.
>>
>> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>
>If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>MC[0-7]

Sorry, but I don't get why llc_sibling being 0xff ([0-7]) means MC[0-7].
In our system (Android), llc_sibling is indeed 0xff ([0-7]), because they
share the llc (L3), but we also have two levels:
DIE [0-7]
MC [0-3][4-7]
It makes sense: [0-3] are little cores, [4-7] are big cores, so we only up-migrate
on misfit. We won't change it.

>
>> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> in sd, which should be [0-1] instead of [0-3].
>
>sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>If you want llc to be [0-3] make sure that the
>sched_domain_topology_level array returns the correct cpumask with
>this flag

Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have the
SD_SHARE_PKG_RESOURCES flag, sd_llc will be [0][1][2][3]. That's not right.

So we must separate sd_llc from the sd topology, otherwise the requirement
cannot be met under the existing mechanism.

Thanks,
Wang


^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-11  9:30             ` 王擎
  0 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-03-11  9:30 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel


>>
>>
>> >>
>> >>
>> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>> >> >>
>> >> >> From: Wang Qing <wangqing@vivo.com>
>> >> >>
>> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> >> >> core:                         0      1          2      3          4      5           6      7
>> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>> >> (add cache level 3)        *************share level 3 cache ***************
>> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>> >> >>
>> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>> >>
>> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >>
>> >> >What does your current scheduler topology  look like?
>> >> >
>> >> >For CPU 0 to 3, do you have the below ?
>> >> >DIE [0     -     3] [4-7]
>> >> >MC  [0] [1] [2] [3]
>> >>
>> >> The current scheduler topology consistent with CPU topology:
>> >> DIE  [0-7]
>> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>> >> Most Android phones have this topology.
>> >> >
>> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >DIE [0     -     3] [4-7]
>> >> >CLS [0 - 1] [2 - 3]
>> >> >MC  [0] [1]
>> >> >
>> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >>
>> >> We don't change the current scheduler topology, but the
>> >> cache topology should be separated like below:
>> >
>> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >cache/memory cache topology
>> >
>> >> [0-7]                          (shared level 3 cache )
>> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>> >
>> >So you don't  bother the intermediate cluster level which is even simpler.
>> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >returns the correct cpu mask directly.
>> >
>> >You will notice a llc_sibling field that is currently used by acpi but
>> >not DT to return llc cpu mask
>> >
>> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> not in the sched_domain.
>>
>> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>
>If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>MC[0-7]

Sorry, but I don't get why llc_sibling being 0xff ([0-7]) means MC[0-7].
In our system (Android), llc_sibling is indeed 0xff ([0-7]), because they
share the llc (L3), but we also have two levels:
DIE [0-7]
MC [0-3][4-7]
It makes sense: [0-3] are little cores, [4-7] are big cores, so we only up-migrate
on misfit. We won't change it.

>
>> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> in sd, which should be [0-1] instead of [0-3].
>
>sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>If you want llc to be [0-3] make sure that the
>sched_domain_topology_level array returns the correct cpumask with
>this flag

Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have the
SD_SHARE_PKG_RESOURCES flag, sd_llc will be [0][1][2][3]. That's not right.

So we must separate sd_llc from the sd topology, otherwise the requirement
cannot be met under the existing mechanism.

Thanks,
Wang

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-11  9:30             ` 王擎
@ 2022-03-11 11:14               ` Vincent Guittot
  -1 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-03-11 11:14 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Fri, 11 Mar 2022 at 10:30, 王擎 <wangqing@vivo.com> wrote:
>
>
> >>
> >>
> >> >>
> >> >>
> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
> >> >> >>
> >> >> >> From: Wang Qing <wangqing@vivo.com>
> >> >> >>
> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
> >> >> >> core:                         0      1          2      3          4      5           6      7
> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
> >> >> (add cache level 3)        *************share level 3 cache ***************
> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
> >> >> >>
> >> >> Here, n always be 2 in ARM64, but others are also possible.
> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
> >> >>
> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> >> return the wrong value, which will affect the CPU load balance.
> >> >> >>
> >> >> >What does your current scheduler topology  look like?
> >> >> >
> >> >> >For CPU 0 to 3, do you have the below ?
> >> >> >DIE [0     -     3] [4-7]
> >> >> >MC  [0] [1] [2] [3]
> >> >>
> >> >> The current scheduler topology consistent with CPU topology:
> >> >> DIE  [0-7]
> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
> >> >> Most Android phones have this topology.
> >> >> >
> >> >> >But you would like something like below for cpu 0-1 instead ?
> >> >> >DIE [0     -     3] [4-7]
> >> >> >CLS [0 - 1] [2 - 3]
> >> >> >MC  [0] [1]
> >> >> >
> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >> >>
> >> >> We don't change the current scheduler topology, but the
> >> >> cache topology should be separated like below:
> >> >
> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >> >cache/memory cache topology
> >> >
> >> >> [0-7]                          (shared level 3 cache )
> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
> >> >
> >> >So you don't  bother the intermediate cluster level which is even simpler.
> >> >you have to modify generic arch topology so that cpu_coregroup_mask
> >> >returns the correct cpu mask directly.
> >> >
> >> >You will notice a llc_sibling field that is currently used by acpi but
> >> >not DT to return llc cpu mask
> >> >
> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
> >> not in the sched_domain.
> >>
> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
> >
> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
> >MC[0-7]
>
> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
> shared the llc(L3), but we also have two level:
> DIE [0-7]
> MC [0-3][4-7]
> It makes sense, [0-3] are little cores, [4-7] are big cores, so we only up-migrate
> when misfit. We won't change it.
>
> >
> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> >> in sd, which should be [0-1] instead of [0-3].
> >
> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
> >If you want llc to be [0-3] make sure that the
> >sched_domain_topology_level array returns the correct cpumask with
> >this flag
>
> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have

sd_llc_id refers to a scheduler domain but your patch breaks this so
if you want a llc that reflects this topo:  [0-1] [2-3] you must
provide a sched_domain level with this topo

side question, why don't you want llc to be the L3 one ?

> SD_SHARE_PKG_RESOURCES flag, the sd_llc will be [0][1][2][3]. It's not true.

The only entry point for describing the scheduler domain is the
sched_domain_topology_level array which provides some cpumask and some
associated flags. By default, SD_SHARE_PKG_RESOURCES is set for
scheduler MC level which implies that the cpus shared their cache. If
this is not the case for your system, you should either remove this
flag or update the cpumask to reflect which CPUs really share their
caches.

>
> So we must separate sd_llc from sd topology, or the demand cannot be
> met in any case under the existing mechanism.

There is a default array with DIE, MC, CLS and SMT levels with
SD_SHARE_PKG_RESOURCES set up to MC which is considered to be the LLC
but a different array than the default one can be provided with
set_sched_topology()

Thanks
Vincent

>
> Thanks,
> Wang
>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-11 11:14               ` Vincent Guittot
  0 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-03-11 11:14 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Fri, 11 Mar 2022 at 10:30, 王擎 <wangqing@vivo.com> wrote:
>
>
> >>
> >>
> >> >>
> >> >>
> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
> >> >> >>
> >> >> >> From: Wang Qing <wangqing@vivo.com>
> >> >> >>
> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
> >> >> >> core:                         0      1          2      3          4      5           6      7
> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
> >> >> (add cache level 3)        *************share level 3 cache ***************
> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
> >> >> >>
> >> >> Here, n always be 2 in ARM64, but others are also possible.
> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
> >> >>
> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> >> return the wrong value, which will affect the CPU load balance.
> >> >> >>
> >> >> >What does your current scheduler topology  look like?
> >> >> >
> >> >> >For CPU 0 to 3, do you have the below ?
> >> >> >DIE [0     -     3] [4-7]
> >> >> >MC  [0] [1] [2] [3]
> >> >>
> >> >> The current scheduler topology consistent with CPU topology:
> >> >> DIE  [0-7]
> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
> >> >> Most Android phones have this topology.
> >> >> >
> >> >> >But you would like something like below for cpu 0-1 instead ?
> >> >> >DIE [0     -     3] [4-7]
> >> >> >CLS [0 - 1] [2 - 3]
> >> >> >MC  [0] [1]
> >> >> >
> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >> >>
> >> >> We don't change the current scheduler topology, but the
> >> >> cache topology should be separated like below:
> >> >
> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >> >cache/memory cache topology
> >> >
> >> >> [0-7]                          (shared level 3 cache )
> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
> >> >
> >> >So you don't  bother the intermediate cluster level which is even simpler.
> >> >you have to modify generic arch topology so that cpu_coregroup_mask
> >> >returns the correct cpu mask directly.
> >> >
> >> >You will notice a llc_sibling field that is currently used by acpi but
> >> >not DT to return llc cpu mask
> >> >
> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
> >> not in the sched_domain.
> >>
> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
> >
> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
> >MC[0-7]
>
> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
> shared the llc(L3), but we also have two level:
> DIE [0-7]
> MC [0-3][4-7]
> It makes sense, [0-3] are little cores, [4-7] are big cores, so we only up-migrate
> when misfit. We won't change it.
>
> >
> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> >> in sd, which should be [0-1] instead of [0-3].
> >
> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
> >If you want llc to be [0-3] make sure that the
> >sched_domain_topology_level array returns the correct cpumask with
> >this flag
>
> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have

sd_llc_id refers to a scheduler domain but your patch breaks this so
if you want a llc that reflects this topo:  [0-1] [2-3] you must
provide a sched_domain level with this topo

side question, why don't you want llc to be the L3 one ?

> SD_SHARE_PKG_RESOURCES flag, the sd_llc will be [0][1][2][3]. It's not true.

The only entry point for describing the scheduler domain is the
sched_domain_topology_level array which provides some cpumask and some
associated flags. By default, SD_SHARE_PKG_RESOURCES is set for
scheduler MC level which implies that the cpus shared their cache. If
this is not the case for your system, you should either remove this
flag or update the cpumask to reflect which CPUs really share their
caches.

>
> So we must separate sd_llc from sd topology, or the demand cannot be
> met in any case under the existing mechanism.

There is a default array with DIE, MC, CLS and SMT levels with
SD_SHARE_PKG_RESOURCES set up to MC which is considered to be the LLC
but a different array than the default one can be provided with
set_sched_topology()

Thanks
Vincent

>
> Thanks,
> Wang
>

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-10 12:58 ` Qing Wang
@ 2022-03-11 11:24   ` Peter Zijlstra
  -1 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2022-03-11 11:24 UTC (permalink / raw)
  To: Qing Wang
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Thu, Mar 10, 2022 at 04:58:44AM -0800, Qing Wang wrote:
> From: Wang Qing <wangqing@vivo.com>
> 
> Some architectures(e.g. ARM64), caches are implemented below:
> cluster:              ****** cluster 0 *****      ****** cluster 1 *****
> core:                 0      1      2      3      4      5      6      7
> cache(Leveln):        **cache0**   **cache1**	 **cache2**   **cache3**
> sd_llc_id(current):   0      0      0      0      4      4      4      4
> sd_llc_id(should be): 0      0      2      2      4      4      6      6
> 
> Caches and cpus have different topology, this causes cpus_share_cache()
> return the wrong value, which will affect the CPU load balance.
> 
> Cache topology should be separated with CPU topology, it can be obtained
> from "next-level-cache" in DTS preferentially.

If your clusters do not have cache, then you're currently setting
SD_SHARE_PKG_RESOURCES wrong, if they do, things are correct.

If you want to represent L2, use the new fangled cluster level or
something, that's what it's there for.

That is, you can represent the above like:

	DIE:	0-7
	MC:	0-3,		4-7
	CLS:	0-1,2-3,	4-5,6-7

But if there is cache at MC, LLC is what it is.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-11 11:24   ` Peter Zijlstra
  0 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2022-03-11 11:24 UTC (permalink / raw)
  To: Qing Wang
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Thu, Mar 10, 2022 at 04:58:44AM -0800, Qing Wang wrote:
> From: Wang Qing <wangqing@vivo.com>
> 
> Some architectures(e.g. ARM64), caches are implemented below:
> cluster:              ****** cluster 0 *****      ****** cluster 1 *****
> core:                 0      1      2      3      4      5      6      7
> cache(Leveln):        **cache0**   **cache1**	 **cache2**   **cache3**
> sd_llc_id(current):   0      0      0      0      4      4      4      4
> sd_llc_id(should be): 0      0      2      2      4      4      6      6
> 
> Caches and cpus have different topology, this causes cpus_share_cache()
> return the wrong value, which will affect the CPU load balance.
> 
> Cache topology should be separated with CPU topology, it can be obtained
> from "next-level-cache" in DTS preferentially.

If your clusters do not have cache, then you're currently setting
SD_SHARE_PKG_RESOURCES wrong, if they do, things are correct.

If you want to represent L2, use the new fangled cluster level or
something, that's what it's there for.

That is, you can represent the above like:

	DIE:	0-7
	MC:	0-3,		4-7
	CLS:	0-1,2-3,	4-5,6-7

But if there is cache at MC, LLC is what it is.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-11 11:24   ` Peter Zijlstra
@ 2022-03-14  2:37     ` 王擎
  -1 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-03-14  2:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

 
> > From: Wang Qing <wangqing@vivo.com>
> > 
> > Some architectures(e.g. ARM64), caches are implemented below:
> > cluster:              ****** cluster 0 *****      ****** cluster 1 *****
> > core:                 0      1      2      3      4      5      6      7
> > cache(Leveln):        **cache0**   **cache1**  **cache2**   **cache3**
> > sd_llc_id(current):   0      0      0      0      4      4      4      4
> > sd_llc_id(should be): 0      0      2      2      4      4      6      6
> > 
> > Caches and cpus have different topology, this causes cpus_share_cache()
> > return the wrong value, which will affect the CPU load balance.
> > 
> > Cache topology should be separated with CPU topology, it can be obtained
> > from "next-level-cache" in DTS preferentially.
> 
> If your clusters do not have cache, then you're currently setting
> SD_SHARE_PKG_RESOURCES wrong, if they do, things are correct.

If there is a shared cache(L3) between clusters(cls 0 and cls 1) for all cores, 
but not within the cluster like above, should we set SD_SHARE_PKG_RESOURCES 
for MC0(cls 0), or just set SD_SHARE_PKG_RESOURCES for CLS?
> 
> If you want to represent L2, use the new fangled cluster level or
> something, that's what it's there for.
> 
> That is, you can represent the above like:
> 
>        DIE:    0-7
>        MC:     0-3,            4-7
>         CLS:    0-1,2-3,        4-5,6-7
> 
> But if there is cache at MC, LLC is what it is.

There is no CLS support for LTS now, any plans to backport?

Thanks,
Wang

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-14  2:37     ` 王擎
  0 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-03-14  2:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

 
> > From: Wang Qing <wangqing@vivo.com>
> > 
> > Some architectures(e.g. ARM64), caches are implemented below:
> > cluster:              ****** cluster 0 *****      ****** cluster 1 *****
> > core:                 0      1      2      3      4      5      6      7
> > cache(Leveln):        **cache0**   **cache1**  **cache2**   **cache3**
> > sd_llc_id(current):   0      0      0      0      4      4      4      4
> > sd_llc_id(should be): 0      0      2      2      4      4      6      6
> > 
> > Caches and cpus have different topology, this causes cpus_share_cache()
> > return the wrong value, which will affect the CPU load balance.
> > 
> > Cache topology should be separated with CPU topology, it can be obtained
> > from "next-level-cache" in DTS preferentially.
> 
> If your clusters do not have cache, then you're currently setting
> SD_SHARE_PKG_RESOURCES wrong, if they do, things are correct.

If there is a shared cache(L3) between clusters(cls 0 and cls 1) for all cores, 
but not within the cluster like above, should we set SD_SHARE_PKG_RESOURCES 
for MC0(cls 0), or just set SD_SHARE_PKG_RESOURCES for CLS?
> 
> If you want to represent L2, use the new fangled cluster level or
> something, that's what it's there for.
> 
> That is, you can represent the above like:
> 
>        DIE:    0-7
>        MC:     0-3,            4-7
>         CLS:    0-1,2-3,        4-5,6-7
> 
> But if there is cache at MC, LLC is what it is.

There is no CLS support for LTS now, any plans to backport?

Thanks,
Wang
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-14  2:37     ` 王擎
@ 2022-03-14 10:19       ` Peter Zijlstra
  -1 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2022-03-14 10:19 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Mon, Mar 14, 2022 at 02:37:48AM +0000, 王擎 wrote:

> > If you want to represent L2, use the new fangled cluster level or
> > something, that's what it's there for.
> > 
> > That is, you can represent the above like:
> > 
> >        DIE:    0-7
> >        MC:     0-3,            4-7
> >         CLS:    0-1,2-3,        4-5,6-7
> > 
> > But if there is cache at MC, LLC is what it is.
> 
> There is no CLS support for LTS now, any plans to backport?

-ENOTMYPROBLEM

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-14 10:19       ` Peter Zijlstra
  0 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2022-03-14 10:19 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Mon, Mar 14, 2022 at 02:37:48AM +0000, 王擎 wrote:

> > If you want to represent L2, use the new fangled cluster level or
> > something, that's what it's there for.
> > 
> > That is, you can represent the above like:
> > 
> >        DIE:    0-7
> >        MC:     0-3,            4-7
> >         CLS:    0-1,2-3,        4-5,6-7
> > 
> > But if there is cache at MC, LLC is what it is.
> 
> There is no CLS support for LTS now, any plans to backport?

-ENOTMYPROBLEM

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-11 11:14               ` Vincent Guittot
@ 2022-03-15  1:58                 ` 王擎
  -1 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-03-15  1:58 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel


>>
>>
>> >>
>> >>
>> >> >>
>> >> >>
>> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>> >> >> >>
>> >> >> >> From: Wang Qing <wangqing@vivo.com>
>> >> >> >>
>> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> >> >> >> core:                         0      1          2      3          4      5           6      7
>> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>> >> >> (add cache level 3)        *************share level 3 cache ***************
>> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>> >> >> >>
>> >> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>> >> >>
>> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >> >>
>> >> >> >What does your current scheduler topology  look like?
>> >> >> >
>> >> >> >For CPU 0 to 3, do you have the below ?
>> >> >> >DIE [0     -     3] [4-7]
>> >> >> >MC  [0] [1] [2] [3]
>> >> >>
>> >> >> The current scheduler topology consistent with CPU topology:
>> >> >> DIE  [0-7]
>> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>> >> >> Most Android phones have this topology.
>> >> >> >
>> >> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >> >DIE [0     -     3] [4-7]
>> >> >> >CLS [0 - 1] [2 - 3]
>> >> >> >MC  [0] [1]
>> >> >> >
>> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >> >>
>> >> >> We don't change the current scheduler topology, but the
>> >> >> cache topology should be separated like below:
>> >> >
>> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >> >cache/memory cache topology
>> >> >
>> >> >> [0-7]                          (shared level 3 cache )
>> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>> >> >
>> >> >So you don't  bother the intermediate cluster level which is even simpler.
>> >> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >> >returns the correct cpu mask directly.
>> >> >
>> >> >You will notice a llc_sibling field that is currently used by acpi but
>> >> >not DT to return llc cpu mask
>> >> >
>> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> >> not in the sched_domain.
>> >>
>> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>> >
>> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>> >MC[0-7]
>>
>> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
>> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>> shared the llc(L3), but we also have two level:
>> DIE [0-7]
>> MC [0-3][4-7]
>> It makes sense, [0-3] are little cores, [4-7] are big cores, so we only up-migrate
>> when misfit. We won't change it.
>>
>> >
>> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> >> in sd, which should be [0-1] instead of [0-3].
>> >
>> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>> >If you want llc to be [0-3] make sure that the
>> >sched_domain_topology_level array returns the correct cpumask with
>> >this flag
>>
>> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have
>
>sd_llc_id refers to a scheduler domain but your patch breaks this so
>if you want a llc that reflects this topo:  [0-1] [2-3] you must
>provide a sched_domain level with this topo

Maybe we should add a shared-cache level(SC), like what CLS does:

DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
CLS  (if necessary)
SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
SMT (if necessary)

SC means a couple of CPUs which are placed closely by sharing 
mid-level caches, but not enough to be a cluster.
>
>side question, why don't you want llc to be the L3 one ?

Yes, we should set SD_SHARE_PKG_RESOURCES to DIE, but we also want to
represent the mid-level caches to improve throughput.

Thanks,
Wang
>
>> SD_SHARE_PKG_RESOURCES flag, the sd_llc will be [0][1][2][3]. It's not true.
>
>The only entry point for describing the scheduler domain is the
>sched_domain_topology_level array which provides some cpumask and some
>associated flags. By default, SD_SHARE_PKG_RESOURCES is set for
>scheduler MC level which implies that the cpus shared their cache. If
>this is not the case for your system, you should either remove this
>flag or update the cpumask to reflect which CPUs really share their
>caches.
>
>>
>> So we must separate sd_llc from sd topology, or the demand cannot be
>> met in any case under the existing mechanism.
>
>There is a default array with DIE, MC, CLS and SMT levels with
>SD_SHARE_PKG_RESOURCES set up to MC which is considered to be the LLC
>but a different array than the default one can be provided with
>set_sched_topology()
>
>Thanks
>Vincent
>
>>
>> Thanks,
>> Wang
>>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-15  1:58                 ` 王擎
  0 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-03-15  1:58 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel


>>
>>
>> >>
>> >>
>> >> >>
>> >> >>
>> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>> >> >> >>
>> >> >> >> From: Wang Qing <wangqing@vivo.com>
>> >> >> >>
>> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> >> >> >> core:                         0      1          2      3          4      5           6      7
>> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>> >> >> (add cache level 3)        *************share level 3 cache ***************
>> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>> >> >> >>
>> >> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>> >> >>
>> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >> >>
>> >> >> >What does your current scheduler topology  look like?
>> >> >> >
>> >> >> >For CPU 0 to 3, do you have the below ?
>> >> >> >DIE [0     -     3] [4-7]
>> >> >> >MC  [0] [1] [2] [3]
>> >> >>
>> >> >> The current scheduler topology consistent with CPU topology:
>> >> >> DIE  [0-7]
>> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>> >> >> Most Android phones have this topology.
>> >> >> >
>> >> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >> >DIE [0     -     3] [4-7]
>> >> >> >CLS [0 - 1] [2 - 3]
>> >> >> >MC  [0] [1]
>> >> >> >
>> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >> >>
>> >> >> We don't change the current scheduler topology, but the
>> >> >> cache topology should be separated like below:
>> >> >
>> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >> >cache/memory cache topology
>> >> >
>> >> >> [0-7]                          (shared level 3 cache )
>> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>> >> >
>> >> >So you don't  bother the intermediate cluster level which is even simpler.
>> >> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >> >returns the correct cpu mask directly.
>> >> >
>> >> >You will notice a llc_sibling field that is currently used by acpi but
>> >> >not DT to return llc cpu mask
>> >> >
>> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> >> not in the sched_domain.
>> >>
>> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>> >
>> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>> >MC[0-7]
>>
>> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
>> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>> shared the llc(L3), but we also have two level:
>> DIE [0-7]
>> MC [0-3][4-7]
>> It makes sense, [0-3] are little cores, [4-7] are big cores, so we only up-migrate
>> when misfit. We won't change it.
>>
>> >
>> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> >> in sd, which should be [0-1] instead of [0-3].
>> >
>> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>> >If you want llc to be [0-3] make sure that the
>> >sched_domain_topology_level array returns the correct cpumask with
>> >this flag
>>
>> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have
>
>sd_llc_id refers to a scheduler domain but your patch breaks this so
>if you want a llc that reflects this topo:  [0-1] [2-3] you must
>provide a sched_domain level with this topo

Maybe we should add a shared-cache level(SC), like what CLS does:

DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
CLS  (if necessary)
SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
SMT (if necessary)

SC means a couple of CPUs which are placed closely by sharing 
mid-level caches, but not enough to be a cluster.
>
>side question, why don't you want llc to be the L3 one ?

Yes, we should set SD_SHARE_PKG_RESOURCES to DIE, but we also want to
represent the mid-level caches to improve throughput.

Thanks,
Wang
>
>> SD_SHARE_PKG_RESOURCES flag, the sd_llc will be [0][1][2][3]. It's not true.
>
>The only entry point for describing the scheduler domain is the
>sched_domain_topology_level array which provides some cpumask and some
>associated flags. By default, SD_SHARE_PKG_RESOURCES is set for
>scheduler MC level which implies that the cpus shared their cache. If
>this is not the case for your system, you should either remove this
>flag or update the cpumask to reflect which CPUs really share their
>caches.
>
>>
>> So we must separate sd_llc from sd topology, or the demand cannot be
>> met in any case under the existing mechanism.
>
>There is a default array with DIE, MC, CLS and SMT levels with
>SD_SHARE_PKG_RESOURCES set up to MC which is considered to be the LLC
>but a different array than the default one can be provided with
>set_sched_topology()
>
>Thanks
>Vincent
>
>>
>> Thanks,
>> Wang
>>
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-15  1:58                 ` 王擎
@ 2022-03-15 16:51                   ` Vincent Guittot
  -1 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-03-15 16:51 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Tue, 15 Mar 2022 at 02:58, 王擎 <wangqing@vivo.com> wrote:
>
>
> >>
> >>
> >> >>
> >> >>
> >> >> >>
> >> >> >>
> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
> >> >> >> >>
> >> >> >> >> From: Wang Qing <wangqing@vivo.com>
> >> >> >> >>
> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
> >> >> >> >> core:                         0      1          2      3          4      5           6      7
> >> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
> >> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
> >> >> >> (add cache level 3)        *************share level 3 cache ***************
> >> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
> >> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
> >> >> >> >>
> >> >> >> Here, n always be 2 in ARM64, but others are also possible.
> >> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
> >> >> >>
> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> >> >> return the wrong value, which will affect the CPU load balance.
> >> >> >> >>
> >> >> >> >What does your current scheduler topology  look like?
> >> >> >> >
> >> >> >> >For CPU 0 to 3, do you have the below ?
> >> >> >> >DIE [0     -     3] [4-7]
> >> >> >> >MC  [0] [1] [2] [3]
> >> >> >>
> >> >> >> The current scheduler topology consistent with CPU topology:
> >> >> >> DIE  [0-7]
> >> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
> >> >> >> Most Android phones have this topology.
> >> >> >> >
> >> >> >> >But you would like something like below for cpu 0-1 instead ?
> >> >> >> >DIE [0     -     3] [4-7]
> >> >> >> >CLS [0 - 1] [2 - 3]
> >> >> >> >MC  [0] [1]
> >> >> >> >
> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >> >> >>
> >> >> >> We don't change the current scheduler topology, but the
> >> >> >> cache topology should be separated like below:
> >> >> >
> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >> >> >cache/memory cache topology
> >> >> >
> >> >> >> [0-7]                          (shared level 3 cache )
> >> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
> >> >> >
> >> >> >So you don't  bother the intermediate cluster level which is even simpler.
> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
> >> >> >returns the correct cpu mask directly.
> >> >> >
> >> >> >You will notice a llc_sibling field that is currently used by acpi but
> >> >> >not DT to return llc cpu mask
> >> >> >
> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
> >> >> not in the sched_domain.
> >> >>
> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
> >> >
> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
> >> >MC[0-7]
> >>
> >> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
> >> shared the llc(L3), but we also have two level:
> >> DIE [0-7]
> >> MC [0-3][4-7]
> >> It makes sense, [0-3] are little cores, [4-7] are big cores, se only up migrate
> >> when misfit. We won't change it.
> >>
> >> >
> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> >> >> in sd, which should be [0-1] instead of [0-3].
> >> >
> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
> >> >If you want llc to be [0-3] make sure that the
> >> >sched_domain_topology_level array returns the correct cpumask with
> >> >this flag
> >>
> >> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have
> >
> >sd_llc_id refers to a scheduler domain but your patch breaks this so
> >if you want a llc that reflects this topo:  [0-1] [2-3] you must
> >provide a sched_domain level with this topo
>
> Maybe we should add a shared-cache level(SC), like what CLS does:
>
> DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
> MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
> CLS  (if necessary)
> SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
> SMT (if necessary)
>
> SC means a couple of CPUs which are placed closely by sharing
> mid-level caches, but not enough to be a cluster.

what you name SC above looks the same as CLS which should not be mixed
with Arm cluster terminology

> >
> >side question, why don't you want llc to be the L3 one ?
>
> Yes, we should set SD_SHARE_PKG_RESOURCES to DIE, but we also want to
> represent the mid-level caches to improve throughput.

so your topology (from a scheduler poV) should be for cpu0:
DIE [0   -       3] [4   -   7] (SD_SHARE_PKG_RESOURCES)
MC  [0 - 1] [2 - 3]             (SD_SHARE_PKG_RESOURCES)
CLS [0] [1]                     (SD_SHARE_PKG_RESOURCES)

so the llc will be the DIE level and load balance will spread tasks in
the different  group of cpus

And regarding EAS, it will look at DIE level for task placement

And of course, this doesn't need any change in scheduler but the
arch_topology.c to return the correct cpumask for your system

>
> Thanks,
> Wang
> >
> >> SD_SHARE_PKG_RESOURCES flag, the sd_llc will be [0][1][2][3]. It's not true.
> >
> >The only entry point for describing the scheduler domain is the
> >sched_domain_topology_level array which provides some cpumask and some
> >associated flags. By default, SD_SHARE_PKG_RESOURCES is set for
> >scheduler MC level which implies that the cpus shared their cache. If
> >this is not the case for your system, you should either remove this
> >flag or update the cpumask to reflect which CPUs really share their
> >caches.
> >
> >>
> >> So we must separate sd_llc from sd topology, or the demand cannot be
> >> met in any case under the existing mechanism.
> >
> >There is a default array with DIE, MC, CLS and SMT levels with
> >SD_SHARE_PKG_RESOURCES set up to MC which is considered to be the LLC
> >but a different array than the default one can be provided with
> >set_sched_topology()
> >
> >Thanks
> >Vincent
> >
> >>
> >> Thanks,
> >> Wang
> >>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-03-15 16:51                   ` Vincent Guittot
  0 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-03-15 16:51 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Tue, 15 Mar 2022 at 02:58, 王擎 <wangqing@vivo.com> wrote:
>
>
> >>
> >>
> >> >>
> >> >>
> >> >> >>
> >> >> >>
> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
> >> >> >> >>
> >> >> >> >> From: Wang Qing <wangqing@vivo.com>
> >> >> >> >>
> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
> >> >> >> >> core:                         0      1          2      3          4      5           6      7
> >> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
> >> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
> >> >> >> (add cache level 3)        *************share level 3 cache ***************
> >> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
> >> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
> >> >> >> >>
> >> >> >> Here, n always be 2 in ARM64, but others are also possible.
> >> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
> >> >> >>
> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> >> >> return the wrong value, which will affect the CPU load balance.
> >> >> >> >>
> >> >> >> >What does your current scheduler topology  look like?
> >> >> >> >
> >> >> >> >For CPU 0 to 3, do you have the below ?
> >> >> >> >DIE [0     -     3] [4-7]
> >> >> >> >MC  [0] [1] [2] [3]
> >> >> >>
> >> >> >> The current scheduler topology consistent with CPU topology:
> >> >> >> DIE  [0-7]
> >> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
> >> >> >> Most Android phones have this topology.
> >> >> >> >
> >> >> >> >But you would like something like below for cpu 0-1 instead ?
> >> >> >> >DIE [0     -     3] [4-7]
> >> >> >> >CLS [0 - 1] [2 - 3]
> >> >> >> >MC  [0] [1]
> >> >> >> >
> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >> >> >>
> >> >> >> We don't change the current scheduler topology, but the
> >> >> >> cache topology should be separated like below:
> >> >> >
> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >> >> >cache/memory cache topology
> >> >> >
> >> >> >> [0-7]                          (shared level 3 cache )
> >> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
> >> >> >
> >> >> >So you don't  bother the intermediate cluster level which is even simpler.
> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
> >> >> >returns the correct cpu mask directly.
> >> >> >
> >> >> >You will notice a llc_sibling field that is currently used by acpi but
> >> >> >not DT to return llc cpu mask
> >> >> >
> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
> >> >> not in the sched_domain.
> >> >>
> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
> >> >
> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
> >> >MC[0-7]
> >>
> >> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
> >> shared the llc(L3), but we also have two level:
> >> DIE [0-7]
> >> MC [0-3][4-7]
> >> It makes sense, [0-3] are little cores, [4-7] are big cores, se only up migrate
> >> when misfit. We won't change it.
> >>
> >> >
> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> >> >> in sd, which should be [0-1] instead of [0-3].
> >> >
> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
> >> >If you want llc to be [0-3] make sure that the
> >> >sched_domain_topology_level array returns the correct cpumask with
> >> >this flag
> >>
> >> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have
> >
> >sd_llc_id refers to a scheduler domain but your patch breaks this so
> >if you want a llc that reflects this topo:  [0-1] [2-3] you must
> >provide a sched_domain level with this topo
>
> Maybe we should add a shared-cache level(SC), like what CLS does:
>
> DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
> MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
> CLS  (if necessary)
> SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
> SMT (if necessary)
>
> SC means a couple of CPUs which are placed closely by sharing
> mid-level caches, but not enough to be a cluster.

what you name SC above looks the same as CLS which should not be mixed
with Arm cluster terminology

> >
> >side question, why don't you want llc to be the L3 one ?
>
> Yes, we should set SD_SHARE_PKG_RESOURCES to DIE, but we also want to
> represent the mid-level caches to improve throughput.

so your topology (from a scheduler poV) should be for cpu0:
DIE [0   -       3] [4   -   7] (SD_SHARE_PKG_RESOURCES)
MC  [0 - 1] [2 - 3]             (SD_SHARE_PKG_RESOURCES)
CLS [0] [1]                     (SD_SHARE_PKG_RESOURCES)

so the llc will be the DIE level and load balance will spread tasks in
the different  group of cpus

And regarding EAS, it will look at DIE level for task placement

And of course, this doesn't need any change in scheduler but the
arch_topology.c to return the correct cpumask for your system

>
> Thanks,
> Wang
> >
> >> SD_SHARE_PKG_RESOURCES flag, the sd_llc will be [0][1][2][3]. It's not true.
> >
> >The only entry point for describing the scheduler domain is the
> >sched_domain_topology_level array which provides some cpumask and some
> >associated flags. By default, SD_SHARE_PKG_RESOURCES is set for
> >scheduler MC level which implies that the cpus shared their cache. If
> >this is not the case for your system, you should either remove this
> >flag or update the cpumask to reflect which CPUs really share their
> >caches.
> >
> >>
> >> So we must separate sd_llc from sd topology, or the demand cannot be
> >> met in any case under the existing mechanism.
> >
> >There is a default array with DIE, MC, CLS and SMT levels with
> >SD_SHARE_PKG_RESOURCES set up to MC which is considered to be the LLC
> >but a different array than the default one can be provided with
> >set_sched_topology()
> >
> >Thanks
> >Vincent
> >
> >>
> >> Thanks,
> >> Wang
> >>

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-03-15 16:51                   ` Vincent Guittot
@ 2022-04-02  9:34                     ` 王擎
  -1 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-04-02  9:34 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel


>>
>>
>> >>
>> >>
>> >> >>
>> >> >>
>> >> >> >>
>> >> >> >>
>> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>> >> >> >> >>
>> >> >> >> >> From: Wang Qing <wangqing@vivo.com>
>> >> >> >> >>
>> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> >> >> >> >> core:                         0      1          2      3          4      5           6      7
>> >> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> >> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>> >> >> >> (add cache level 3)        *************share level 3 cache ***************
>> >> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> >> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>> >> >> >> >>
>> >> >> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>> >> >> >>
>> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >> >> >>
>> >> >> >> >What does your current scheduler topology  look like?
>> >> >> >> >
>> >> >> >> >For CPU 0 to 3, do you have the below ?
>> >> >> >> >DIE [0     -     3] [4-7]
>> >> >> >> >MC  [0] [1] [2] [3]
>> >> >> >>
>> >> >> >> The current scheduler topology consistent with CPU topology:
>> >> >> >> DIE  [0-7]
>> >> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>> >> >> >> Most Android phones have this topology.
>> >> >> >> >
>> >> >> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >> >> >DIE [0     -     3] [4-7]
>> >> >> >> >CLS [0 - 1] [2 - 3]
>> >> >> >> >MC  [0] [1]
>> >> >> >> >
>> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >> >> >>
>> >> >> >> We don't change the current scheduler topology, but the
>> >> >> >> cache topology should be separated like below:
>> >> >> >
>> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >> >> >cache/memory cache topology
>> >> >> >
>> >> >> >> [0-7]                          (shared level 3 cache )
>> >> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>> >> >> >
>> >> >> >So you don't  bother the intermediate cluster level which is even simpler.
>> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >> >> >returns the correct cpu mask directly.
>> >> >> >
>> >> >> >You will notice a llc_sibling field that is currently used by acpi but
>> >> >> >not DT to return llc cpu mask
>> >> >> >
>> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> >> >> not in the sched_domain.
>> >> >>
>> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>> >> >
>> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>> >> >MC[0-7]
>> >>
>> >> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
>> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>> >> shared the llc(L3), but we also have two level:
>> >> DIE [0-7]
>> >> MC [0-3][4-7]
>> >> It makes sense, [0-3] are little cores, [4-7] are big cores, se only up migrate
>> >> when misfit. We won't change it.
>> >>
>> >> >
>> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> >> >> in sd, which should be [0-1] instead of [0-3].
>> >> >
>> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>> >> >If you want llc to be [0-3] make sure that the
>> >> >sched_domain_topology_level array returns the correct cpumask with
>> >> >this flag
>> >>
>> >> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have
>> >
>> >sd_llc_id refers to a scheduler domain but your patch breaks this so
>> >if you want a llc that reflects this topo:  [0-1] [2-3] you must
>> >provide a sched_domain level with this topo
>>
>> Maybe we should add a shared-cache level(SC), like what CLS does:
>>
>> DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
>> MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
>> CLS  (if necessary)
>> SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
>> SMT (if necessary)
>>
>> SC means a couple of CPUs which are placed closely by sharing
>> mid-level caches, but not enough to be a cluster.
>
>what you name SC above looks the same as CLS which should not be mixed
>with Arm cluster terminology

Do you mean a cluster is equal to a shared cache rather than containing one? SC just
means shared cache, but does not form a cluster; a CLS can contain many SCs.

If as you said, SC looks the same as CLS, should we rename CLS to SC to 
avoid confusion?

Thanks,
Wang

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-04-02  9:34                     ` 王擎
  0 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-04-02  9:34 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel


>>
>>
>> >>
>> >>
>> >> >>
>> >> >>
>> >> >> >>
>> >> >> >>
>> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>> >> >> >> >>
>> >> >> >> >> From: Wang Qing <wangqing@vivo.com>
>> >> >> >> >>
>> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> >> >> >> >> core:                         0      1          2      3          4      5           6      7
>> >> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> >> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>> >> >> >> (add cache level 3)        *************share level 3 cache ***************
>> >> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> >> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>> >> >> >> >>
>> >> >> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>> >> >> >>
>> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >> >> >>
>> >> >> >> >What does your current scheduler topology  look like?
>> >> >> >> >
>> >> >> >> >For CPU 0 to 3, do you have the below ?
>> >> >> >> >DIE [0     -     3] [4-7]
>> >> >> >> >MC  [0] [1] [2] [3]
>> >> >> >>
>> >> >> >> The current scheduler topology consistent with CPU topology:
>> >> >> >> DIE  [0-7]
>> >> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>> >> >> >> Most Android phones have this topology.
>> >> >> >> >
>> >> >> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >> >> >DIE [0     -     3] [4-7]
>> >> >> >> >CLS [0 - 1] [2 - 3]
>> >> >> >> >MC  [0] [1]
>> >> >> >> >
>> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >> >> >>
>> >> >> >> We don't change the current scheduler topology, but the
>> >> >> >> cache topology should be separated like below:
>> >> >> >
>> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >> >> >cache/memory cache topology
>> >> >> >
>> >> >> >> [0-7]                          (shared level 3 cache )
>> >> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>> >> >> >
>> >> >> >So you don't  bother the intermediate cluster level which is even simpler.
>> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >> >> >returns the correct cpu mask directly.
>> >> >> >
>> >> >> >You will notice a llc_sibling field that is currently used by acpi but
>> >> >> >not DT to return llc cpu mask
>> >> >> >
>> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> >> >> not in the sched_domain.
>> >> >>
>> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>> >> >
>> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>> >> >MC[0-7]
>> >>
>> >> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
>> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>> >> shared the llc(L3), but we also have two level:
>> >> DIE [0-7]
>> >> MC [0-3][4-7]
>> >> It makes sense, [0-3] are little cores, [4-7] are big cores, se only up migrate
>> >> when misfit. We won't change it.
>> >>
>> >> >
>> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> >> >> in sd, which should be [0-1] instead of [0-3].
>> >> >
>> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>> >> >If you want llc to be [0-3] make sure that the
>> >> >sched_domain_topology_level array returns the correct cpumask with
>> >> >this flag
>> >>
>> >> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have
>> >
>> >sd_llc_id refers to a scheduler domain but your patch breaks this so
>> >if you want a llc that reflects this topo:  [0-1] [2-3] you must
>> >provide a sched_domain level with this topo
>>
>> Maybe we should add a shared-cache level(SC), like what CLS does:
>>
>> DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
>> MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
>> CLS  (if necessary)
>> SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
>> SMT (if necessary)
>>
>> SC means a couple of CPUs which are placed closely by sharing
>> mid-level caches, but not enough to be a cluster.
>
>what you name SC above looks the same as CLS which should not be mixed
>with Arm cluster terminology

Do you mean a cluster is equal to a shared cache rather than containing one? SC just
means shared cache, but does not form a cluster; a CLS can contain many SCs.

If as you said, SC looks the same as CLS, should we rename CLS to SC to 
avoid confusion?

Thanks,
Wang
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-04-02  9:34                     ` 王擎
@ 2022-04-02 10:29                       ` Yicong Yang
  -1 siblings, 0 replies; 36+ messages in thread
From: Yicong Yang @ 2022-04-02 10:29 UTC (permalink / raw)
  To: 王擎, Vincent Guittot
  Cc: yangyicong, Catalin Marinas, Will Deacon, Sudeep Holla,
	Greg Kroah-Hartman, Rafael J. Wysocki, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Daniel Bristot de Oliveira,
	linux-arm-kernel, linux-kernel

Hi Qing,

On 2022/4/2 17:34, 王擎 wrote:
> 
>>>
>>>
>>>>>
>>>>>
>>>>>>>
>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>> From: Wang Qing <wangqing@vivo.com>
>>>>>>>>>>>
>>>>>>>>>>> Some architectures(e.g. ARM64), caches are implemented below:
>>>>>>>>>>> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>>>>>>>>>>> core:                         0      1          2      3          4      5           6      7
>>>>>>>>> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>>>>>>>>>>> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>>>>>>>>> (add cache level 3)        *************share level 3 cache ***************
>>>>>>>>>>> sd_llc_id(current):     0      0          0      0          4      4           4      4
>>>>>>>>>>> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>>>>>>>>>>>
>>>>>>>>> Here, n always be 2 in ARM64, but others are also possible.
>>>>>>>>> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>>>>>>>>>
>>>>>>>>>>> Caches and cpus have different topology, this causes cpus_share_cache()
>>>>>>>>>>> return the wrong value, which will affect the CPU load balance.
>>>>>>>>>>>
>>>>>>>>>> What does your current scheduler topology  look like?
>>>>>>>>>>
>>>>>>>>>> For CPU 0 to 3, do you have the below ?
>>>>>>>>>> DIE [0     -     3] [4-7]
>>>>>>>>>> MC  [0] [1] [2] [3]
>>>>>>>>>
>>>>>>>>> The current scheduler topology consistent with CPU topology:
>>>>>>>>> DIE  [0-7]
>>>>>>>>> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>>>>>>>>> Most Android phones have this topology.
>>>>>>>>>>
>>>>>>>>>> But you would like something like below for cpu 0-1 instead ?
>>>>>>>>>> DIE [0     -     3] [4-7]
>>>>>>>>>> CLS [0 - 1] [2 - 3]
>>>>>>>>>> MC  [0] [1]
>>>>>>>>>>
>>>>>>>>>> with SD_SHARE_PKG_RESOURCES only set to MC level ?
>>>>>>>>>
>>>>>>>>> We don't change the current scheduler topology, but the
>>>>>>>>> cache topology should be separated like below:
>>>>>>>>
>>>>>>>> The scheduler topology is not only cpu topology but a mixed of cpu and
>>>>>>>> cache/memory cache topology
>>>>>>>>
>>>>>>>>> [0-7]                          (shared level 3 cache )
>>>>>>>>> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>>>>>>>>
>>>>>>>> So you don't  bother the intermediate cluster level which is even simpler.
>>>>>>>> you have to modify generic arch topology so that cpu_coregroup_mask
>>>>>>>> returns the correct cpu mask directly.
>>>>>>>>
>>>>>>>> You will notice a llc_sibling field that is currently used by acpi but
>>>>>>>> not DT to return llc cpu mask
>>>>>>>>
>>>>>>> cpu_topology[].llc_sibling describe the last level cache of whole system,
>>>>>>> not in the sched_domain.
>>>>>>>
>>>>>>> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>>>>>>
>>>>>> If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>>>>>> MC[0-7]
>>>>>
>>>>> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
>>>>> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>>>>> shared the llc(L3), but we also have two level:
>>>>> DIE [0-7]
>>>>> MC [0-3][4-7]
>>>>> It makes sense, [0-3] are little cores, [4-7] are big cores, se only up migrate
>>>>> when misfit. We won't change it.
>>>>>
>>>>>>
>>>>>>> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>>>>>>> in sd, which should be [0-1] instead of [0-3].
>>>>>>
>>>>>> sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>>>>>> If you want llc to be [0-3] make sure that the
>>>>>> sched_domain_topology_level array returns the correct cpumask with
>>>>>> this flag
>>>>>
>>>>> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have
>>>>
>>>> sd_llc_id refers to a scheduler domain but your patch breaks this so
>>>> if you want a llc that reflects this topo:  [0-1] [2-3] you must
>>>> provide a sched_domain level with this topo
>>>
>>> Maybe we should add a shared-cache level(SC), like what CLS does:
>>>
>>> DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
>>> MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
>>> CLS  (if necessary)
>>> SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
>>> SMT (if necessary)
>>>
>>> SC means a couple of CPUs which are placed closely by sharing
>>> mid-level caches, but not enough to be a cluster.
>>
>> what you name SC above looks the same as CLS which should not be mixed
>> with Arm cluster terminology
> 
> Do you mean a cluster is equal to a shared cache rather than containing one? SC just
> means shared cache, but does not form a cluster; a CLS can contain many SCs.
> 

The cluster is a topology level above the CPUs but under LLC. On Kunpeng 920 the cpus
in a CLS will share L3T and on Intel's Jacobsville cpus in a CLS will share L2[1].

Seems you're using a DT based system. I think the parsing of cluster level is not
supported on DT yet so you cannot see it. Otherwise with right cpu topology reported
you will have a CLS level in which the cpus share L2 cache, just like Jacobsville.

[1] https://lore.kernel.org/all/20210924085104.44806-4-21cnbao@gmail.com/

> If as you said, SC looks the same as CLS, should we rename CLS to SC to 
> avoid confusion?
> 
> Thanks,
> Wang
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-04-02 10:29                       ` Yicong Yang
  0 siblings, 0 replies; 36+ messages in thread
From: Yicong Yang @ 2022-04-02 10:29 UTC (permalink / raw)
  To: 王擎, Vincent Guittot
  Cc: yangyicong, Catalin Marinas, Will Deacon, Sudeep Holla,
	Greg Kroah-Hartman, Rafael J. Wysocki, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Daniel Bristot de Oliveira,
	linux-arm-kernel, linux-kernel

Hi Qing,

On 2022/4/2 17:34, 王擎 wrote:
> 
>>>
>>>
>>>>>
>>>>>
>>>>>>>
>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>> From: Wang Qing <wangqing@vivo.com>
>>>>>>>>>>>
>>>>>>>>>>> Some architectures(e.g. ARM64), caches are implemented below:
>>>>>>>>>>> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>>>>>>>>>>> core:                         0      1          2      3          4      5           6      7
>>>>>>>>> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>>>>>>>>>>> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>>>>>>>>> (add cache level 3)        *************share level 3 cache ***************
>>>>>>>>>>> sd_llc_id(current):     0      0          0      0          4      4           4      4
>>>>>>>>>>> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>>>>>>>>>>>
>>>>>>>>> Here, n always be 2 in ARM64, but others are also possible.
>>>>>>>>> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>>>>>>>>>
>>>>>>>>>>> Caches and cpus have different topology, this causes cpus_share_cache()
>>>>>>>>>>> return the wrong value, which will affect the CPU load balance.
>>>>>>>>>>>
>>>>>>>>>> What does your current scheduler topology  look like?
>>>>>>>>>>
>>>>>>>>>> For CPU 0 to 3, do you have the below ?
>>>>>>>>>> DIE [0     -     3] [4-7]
>>>>>>>>>> MC  [0] [1] [2] [3]
>>>>>>>>>
>>>>>>>>> The current scheduler topology consistent with CPU topology:
>>>>>>>>> DIE  [0-7]
>>>>>>>>> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>>>>>>>>> Most Android phones have this topology.
>>>>>>>>>>
>>>>>>>>>> But you would like something like below for cpu 0-1 instead ?
>>>>>>>>>> DIE [0     -     3] [4-7]
>>>>>>>>>> CLS [0 - 1] [2 - 3]
>>>>>>>>>> MC  [0] [1]
>>>>>>>>>>
>>>>>>>>>> with SD_SHARE_PKG_RESOURCES only set to MC level ?
>>>>>>>>>
>>>>>>>>> We don't change the current scheduler topology, but the
>>>>>>>>> cache topology should be separated like below:
>>>>>>>>
>>>>>>>> The scheduler topology is not only cpu topology but a mixed of cpu and
>>>>>>>> cache/memory cache topology
>>>>>>>>
>>>>>>>>> [0-7]                          (shared level 3 cache )
>>>>>>>>> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>>>>>>>>
>>>>>>>> So you don't  bother the intermediate cluster level which is even simpler.
>>>>>>>> you have to modify generic arch topology so that cpu_coregroup_mask
>>>>>>>> returns the correct cpu mask directly.
>>>>>>>>
>>>>>>>> You will notice a llc_sibling field that is currently used by acpi but
>>>>>>>> not DT to return llc cpu mask
>>>>>>>>
>>>>>>> cpu_topology[].llc_sibling describe the last level cache of whole system,
>>>>>>> not in the sched_domain.
>>>>>>>
>>>>>>> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>>>>>>
>>>>>> If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>>>>>> MC[0-7]
>>>>>
>>>>> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
>>>>> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>>>>> shared the llc(L3), but we also have two level:
>>>>> DIE [0-7]
>>>>> MC [0-3][4-7]
>>>>> It makes sense: [0-3] are little cores, [4-7] are big cores, so we only up-migrate
>>>>> when misfit. We won't change it.
>>>>>
>>>>>>
>>>>>>> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>>>>>>> in sd, which should be [0-1] instead of [0-3].
>>>>>>
>>>>>> sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>>>>>> If you want llc to be [0-3] make sure that the
>>>>>> sched_domain_topology_level array returns the correct cpumask with
>>>>>> this flag
>>>>>
>>>>> Acturely, we want sd_llc to be [0-1] [2-3], but if the MC domain don't have
>>>>
>>>> sd_llc_id refers to a scheduler domain but your patch breaks this so
>>>> if you want a llc that reflects this topo:  [0-1] [2-3] you must
>>>> provide a sched_domain level with this topo
>>>
>>> Maybe we should add a shared-cache level(SC), like what CLS does:
>>>
>>> DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
>>> MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
>>> CLS  (if necessary)
>>> SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
>>> SMT (if necessary)
>>>
>>> SC means a couple of CPUs which are placed closely by sharing
>>> mid-level caches, but not enough to be a cluster.
>>
>> what you name SC above looks the same as CLS which should not be mixed
>> with Arm cluster terminology
> 
> Do you mean cluster is equal to shared cache instead of containing, SC just 
> means shared cache, but not form a cluster, a CLS can contain many SCs.
> 

The cluster is a topology level above the CPUs but under LLC. On Kunpeng 920 the cpus
in a CLS will share L3T and on Intel's Jacobsville cpus in a CLS will share L2[1].

Seems you're using a DT based system. I think the parsing of cluster level is not
supported on DT yet so you cannot see it. Otherwise with right cpu topology reported
you will have a CLS level in which the cpus share L2 cache, just like Jacobsville.

[1] https://lore.kernel.org/all/20210924085104.44806-4-21cnbao@gmail.com/

> If as you said, SC looks the same as CLS, should we rename CLS to SC to 
> avoid confusion?
> 
> Thanks,
> Wang
> 

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-04-02  9:34                     ` 王擎
@ 2022-04-04 12:31                       ` Vincent Guittot
  -1 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-04-04 12:31 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Sat, 2 Apr 2022 at 11:34, 王擎 <wangqing@vivo.com> wrote:
>
>
> >>
> >>
> >> >>
> >> >>
> >> >> >>
> >> >> >>
> >> >> >> >>
> >> >> >> >>
> >> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
> >> >> >> >> >>
> >> >> >> >> >> From: Wang Qing <wangqing@vivo.com>
> >> >> >> >> >>
> >> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
> >> >> >> >> >> core:                         0      1          2      3          4      5           6      7
> >> >> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
> >> >> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
> >> >> >> >> (add cache level 3)        *************share level 3 cache ***************
> >> >> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
> >> >> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
> >> >> >> >> >>
> >> >> >> >> Here, n always be 2 in ARM64, but others are also possible.
> >> >> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
> >> >> >> >>
> >> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> >> >> >> return the wrong value, which will affect the CPU load balance.
> >> >> >> >> >>
> >> >> >> >> >What does your current scheduler topology  look like?
> >> >> >> >> >
> >> >> >> >> >For CPU 0 to 3, do you have the below ?
> >> >> >> >> >DIE [0     -     3] [4-7]
> >> >> >> >> >MC  [0] [1] [2] [3]
> >> >> >> >>
> >> >> >> >> The current scheduler topology consistent with CPU topology:
> >> >> >> >> DIE  [0-7]
> >> >> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
> >> >> >> >> Most Android phones have this topology.
> >> >> >> >> >
> >> >> >> >> >But you would like something like below for cpu 0-1 instead ?
> >> >> >> >> >DIE [0     -     3] [4-7]
> >> >> >> >> >CLS [0 - 1] [2 - 3]
> >> >> >> >> >MC  [0] [1]
> >> >> >> >> >
> >> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >> >> >> >>
> >> >> >> >> We don't change the current scheduler topology, but the
> >> >> >> >> cache topology should be separated like below:
> >> >> >> >
> >> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >> >> >> >cache/memory cache topology
> >> >> >> >
> >> >> >> >> [0-7]                          (shared level 3 cache )
> >> >> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
> >> >> >> >
> >> >> >> >So you don't  bother the intermediate cluster level which is even simpler.
> >> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
> >> >> >> >returns the correct cpu mask directly.
> >> >> >> >
> >> >> >> >You will notice a llc_sibling field that is currently used by acpi but
> >> >> >> >not DT to return llc cpu mask
> >> >> >> >
> >> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
> >> >> >> not in the sched_domain.
> >> >> >>
> >> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
> >> >> >
> >> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
> >> >> >MC[0-7]
> >> >>
> >> >> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
> >> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
> >> >> shared the llc(L3), but we also have two level:
> >> >> DIE [0-7]
> >> >> MC [0-3][4-6]
> >> >> It makes sense, [0-3] are little cores, [4-7] are bit cores, se only up migrate
> >> >> when misfit. We won't change it.
> >> >>
> >> >> >
> >> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> >> >> >> in sd, which should be [0-1] instead of [0-3].
> >> >> >
> >> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
> >> >> >If you want llc to be [0-3] make sure that the
> >> >> >sched_domain_topology_level array returns the correct cpumask with
> >> >> >this flag
> >> >>
> >> >> Acturely, we want sd_llc to be [0-1] [2-3], but if the MC domain don't have
> >> >
> >> >sd_llc_id refers to a scheduler domain but your patch breaks this so
> >> >if you want a llc that reflects this topo:  [0-1] [2-3] you must
> >> >provide a sched_domain level with this topo
> >>
> >> Maybe we should add a shared-cache level(SC), like what CLS does:
> >>
> >> DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
> >> MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
> >> CLS  (if necessary)
> >> SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
> >> SMT (if necessary)
> >>
> >> SC means a couple of CPUs which are placed closely by sharing
> >> mid-level caches, but not enough to be a cluster.
> >
> >what you name SC above looks the same as CLS which should not be mixed
> >with Arm cluster terminology
>
> Do you mean cluster is equal to shared cache instead of containing, SC just
> means shared cache, but not form a cluster, a CLS can contain many SCs.

CLS in the scheduler topology is not strictly tied to the "Arm
cluster" but it's the generic name to describe an intermediate group
of CPUs with common properties. CLS is also used by some intel
platforms as an example. What I mean is that you can use the scheduler
CLS level to describe what you call an Arm SC level.

>
> If as you said, SC looks the same as CLS, should we rename CLS to SC to
> avoid confusion?

CLS is a generic scheduler name and I don't think that we need to
rename it to an Arm-specific label

Thanks,
Vincent

>
> Thanks,
> Wang

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-04-04 12:31                       ` Vincent Guittot
  0 siblings, 0 replies; 36+ messages in thread
From: Vincent Guittot @ 2022-04-04 12:31 UTC (permalink / raw)
  To: 王擎
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel

On Sat, 2 Apr 2022 at 11:34, 王擎 <wangqing@vivo.com> wrote:
>
>
> >>
> >>
> >> >>
> >> >>
> >> >> >>
> >> >> >>
> >> >> >> >>
> >> >> >> >>
> >> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
> >> >> >> >> >>
> >> >> >> >> >> From: Wang Qing <wangqing@vivo.com>
> >> >> >> >> >>
> >> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
> >> >> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
> >> >> >> >> >> core:                         0      1          2      3          4      5           6      7
> >> >> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
> >> >> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
> >> >> >> >> (add cache level 3)        *************share level 3 cache ***************
> >> >> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
> >> >> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
> >> >> >> >> >>
> >> >> >> >> Here, n always be 2 in ARM64, but others are also possible.
> >> >> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
> >> >> >> >>
> >> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
> >> >> >> >> >> return the wrong value, which will affect the CPU load balance.
> >> >> >> >> >>
> >> >> >> >> >What does your current scheduler topology  look like?
> >> >> >> >> >
> >> >> >> >> >For CPU 0 to 3, do you have the below ?
> >> >> >> >> >DIE [0     -     3] [4-7]
> >> >> >> >> >MC  [0] [1] [2] [3]
> >> >> >> >>
> >> >> >> >> The current scheduler topology consistent with CPU topology:
> >> >> >> >> DIE  [0-7]
> >> >> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
> >> >> >> >> Most Android phones have this topology.
> >> >> >> >> >
> >> >> >> >> >But you would like something like below for cpu 0-1 instead ?
> >> >> >> >> >DIE [0     -     3] [4-7]
> >> >> >> >> >CLS [0 - 1] [2 - 3]
> >> >> >> >> >MC  [0] [1]
> >> >> >> >> >
> >> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
> >> >> >> >>
> >> >> >> >> We don't change the current scheduler topology, but the
> >> >> >> >> cache topology should be separated like below:
> >> >> >> >
> >> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
> >> >> >> >cache/memory cache topology
> >> >> >> >
> >> >> >> >> [0-7]                          (shared level 3 cache )
> >> >> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
> >> >> >> >
> >> >> >> >So you don't  bother the intermediate cluster level which is even simpler.
> >> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
> >> >> >> >returns the correct cpu mask directly.
> >> >> >> >
> >> >> >> >You will notice a llc_sibling field that is currently used by acpi but
> >> >> >> >not DT to return llc cpu mask
> >> >> >> >
> >> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
> >> >> >> not in the sched_domain.
> >> >> >>
> >> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
> >> >> >
> >> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
> >> >> >MC[0-7]
> >> >>
> >> >> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
> >> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
> >> >> shared the llc(L3), but we also have two level:
> >> >> DIE [0-7]
> >> >> MC [0-3][4-6]
> >> >> It makes sense, [0-3] are little cores, [4-7] are bit cores, se only up migrate
> >> >> when misfit. We won't change it.
> >> >>
> >> >> >
> >> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
> >> >> >> in sd, which should be [0-1] instead of [0-3].
> >> >> >
> >> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
> >> >> >If you want llc to be [0-3] make sure that the
> >> >> >sched_domain_topology_level array returns the correct cpumask with
> >> >> >this flag
> >> >>
> >> >> Acturely, we want sd_llc to be [0-1] [2-3], but if the MC domain don't have
> >> >
> >> >sd_llc_id refers to a scheduler domain but your patch breaks this so
> >> >if you want a llc that reflects this topo:  [0-1] [2-3] you must
> >> >provide a sched_domain level with this topo
> >>
> >> Maybe we should add a shared-cache level(SC), like what CLS does:
> >>
> >> DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
> >> MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
> >> CLS  (if necessary)
> >> SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
> >> SMT (if necessary)
> >>
> >> SC means a couple of CPUs which are placed closely by sharing
> >> mid-level caches, but not enough to be a cluster.
> >
> >what you name SC above looks the same as CLS which should not be mixed
> >with Arm cluster terminology
>
> Do you mean cluster is equal to shared cache instead of containing, SC just
> means shared cache, but not form a cluster, a CLS can contain many SCs.

CLS in the scheduler topology is not strictly tied to the "Arm
cluster" but it's the generic name to describe an intermediate group
of CPUs with common properties. CLS is also used by some intel
platforms as an example. What I mean is that you can use the scheduler
CLS level to describe what you call an Arm SC level.

>
> If as you said, SC looks the same as CLS, should we rename CLS to SC to
> avoid confusion?

CLS is a generic scheduler name and I don't think that we need to
rename it to an Arm-specific label

Thanks,
Vincent

>
> Thanks,
> Wang

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH] sched: topology: make cache topology separate from cpu topology
  2022-04-04 12:31                       ` Vincent Guittot
@ 2022-04-07  2:31                         ` 王擎
  -1 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-04-07  2:31 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel


>>
>>
>> >>
>> >>
>> >> >>
>> >> >>
>> >> >> >>
>> >> >> >>
>> >> >> >> >>
>> >> >> >> >>
>> >> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>> >> >> >> >> >>
>> >> >> >> >> >> From: Wang Qing <wangqing@vivo.com>
>> >> >> >> >> >>
>> >> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> >> >> >> >> >> core:                         0      1          2      3          4      5           6      7
>> >> >> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> >> >> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>> >> >> >> >> (add cache level 3)        *************share level 3 cache ***************
>> >> >> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> >> >> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>> >> >> >> >> >>
>> >> >> >> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> >> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>> >> >> >> >>
>> >> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> >> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >> >> >> >>
>> >> >> >> >> >What does your current scheduler topology  look like?
>> >> >> >> >> >
>> >> >> >> >> >For CPU 0 to 3, do you have the below ?
>> >> >> >> >> >DIE [0     -     3] [4-7]
>> >> >> >> >> >MC  [0] [1] [2] [3]
>> >> >> >> >>
>> >> >> >> >> The current scheduler topology consistent with CPU topology:
>> >> >> >> >> DIE  [0-7]
>> >> >> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>> >> >> >> >> Most Android phones have this topology.
>> >> >> >> >> >
>> >> >> >> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >> >> >> >DIE [0     -     3] [4-7]
>> >> >> >> >> >CLS [0 - 1] [2 - 3]
>> >> >> >> >> >MC  [0] [1]
>> >> >> >> >> >
>> >> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >> >> >> >>
>> >> >> >> >> We don't change the current scheduler topology, but the
>> >> >> >> >> cache topology should be separated like below:
>> >> >> >> >
>> >> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >> >> >> >cache/memory cache topology
>> >> >> >> >
>> >> >> >> >> [0-7]                          (shared level 3 cache )
>> >> >> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>> >> >> >> >
>> >> >> >> >So you don't  bother the intermediate cluster level which is even simpler.
>> >> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >> >> >> >returns the correct cpu mask directly.
>> >> >> >> >
>> >> >> >> >You will notice a llc_sibling field that is currently used by acpi but
>> >> >> >> >not DT to return llc cpu mask
>> >> >> >> >
>> >> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> >> >> >> not in the sched_domain.
>> >> >> >>
>> >> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>> >> >> >
>> >> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>> >> >> >MC[0-7]
>> >> >>
>> >> >> Sorry, but I don't get it, why llc_sibling was 0xff([0-7] means MC[0-7]?
>> >> >> In our system(Android), llc_sibling is indeed 0xff([0-7]) , because they
>> >> >> shared the llc(L3), but we also have two level:
>> >> >> DIE [0-7]
>> >> >> MC [0-3][4-6]
>> >> >> It makes sense, [0-3] are little cores, [4-7] are bit cores, se only up migrate
>> >> >> when misfit. We won't change it.
>> >> >>
>> >> >> >
>> >> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> >> >> >> in sd, which should be [0-1] instead of [0-3].
>> >> >> >
>> >> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>> >> >> >If you want llc to be [0-3] make sure that the
>> >> >> >sched_domain_topology_level array returns the correct cpumask with
>> >> >> >this flag
>> >> >>
>> >> >> Acturely, we want sd_llc to be [0-1] [2-3], but if the MC domain don't have
>> >> >
>> >> >sd_llc_id refers to a scheduler domain but your patch breaks this so
>> >> >if you want a llc that reflects this topo:  [0-1] [2-3] you must
>> >> >provide a sched_domain level with this topo
>> >>
>> >> Maybe we should add a shared-cache level(SC), like what CLS does:
>> >>
>> >> DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
>> >> MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
>> >> CLS  (if necessary)
>> >> SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
>> >> SMT (if necessary)
>> >>
>> >> SC means a couple of CPUs which are placed closely by sharing
>> >> mid-level caches, but not enough to be a cluster.
>> >
>> >what you name SC above looks the same as CLS which should not be mixed
>> >with Arm cluster terminology
>>
>> Do you mean cluster is equal to shared cache instead of containing, SC just
>> means shared cache, but not form a cluster, a CLS can contain many SCs.
>
>CLS in the scheduler topology is not strictly tied to the "Arm
>cluster" but it's the generic name to describe an intermediate group
>of CPUs with common properties. CLS is also used by some intel
>platforms as an example. What I mean is that you can use the scheduler
>CLS level to describe what you call an Arm SC level.

It won't work, because cluster_sibling is assigned according to cluster_id,
which is strictly tied to the "Arm cluster".
And if we have already used CLS to describe the cluster sched_domain, how do we
describe the shared-cache sched_domain — such as a complex, which shares its own
cache within a cluster?
>
>>
>> If as you said, SC looks the same as CLS, should we rename CLS to SC to
>> avoid confusion?
>
>CLS is a generic scheduler name and I don't think that we need to
>rename it to a Arm specific label

I still insist on adding an SC level within the CLS, because we may have
already used CLS to describe the cluster sched_domain; please consider it.

Thanks,
Wang

>
>Thanks,
>Vincent
>
>>
>> Thanks,
>> Wang

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH] sched: topology: make cache topology separate from cpu topology
@ 2022-04-07  2:31                         ` 王擎
  0 siblings, 0 replies; 36+ messages in thread
From: 王擎 @ 2022-04-07  2:31 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Catalin Marinas, Will Deacon, Sudeep Holla, Greg Kroah-Hartman,
	Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-arm-kernel, linux-kernel


>>
>>
>> >>
>> >>
>> >> >>
>> >> >>
>> >> >> >>
>> >> >> >>
>> >> >> >> >>
>> >> >> >> >>
>> >> >> >> >> >On Thu, 10 Mar 2022 at 13:59, Qing Wang <wangqing@vivo.com> wrote:
>> >> >> >> >> >>
>> >> >> >> >> >> From: Wang Qing <wangqing@vivo.com>
>> >> >> >> >> >>
>> >> >> >> >> >> Some architectures(e.g. ARM64), caches are implemented below:
>> >> >> >> >> >> cluster:                     ****** cluster 0 *****      ****** cluster 1 *****
>> >> >> >> >> >> core:                         0      1          2      3          4      5           6      7
>> >> >> >> >> (add cache level 1)        c0    c1        c2    c3         c4    c5         c6    c7
>> >> >> >> >> >> cache(Leveln):         **cache0**  **cache1**  **cache2**  **cache3**
>> >> >> >> >> (add cache level 3)        *************share level 3 cache ***************
>> >> >> >> >> >> sd_llc_id(current):     0      0          0      0          4      4           4      4
>> >> >> >> >> >> sd_llc_id(should be): 0      0          2      2          4      4           6      6
>> >> >> >> >> >>
>> >> >> >> >> Here, n always be 2 in ARM64, but others are also possible.
>> >> >> >> >> core[0,1] form a complex(ARMV9),  which share L2 cache, core[2,3] is the same.
>> >> >> >> >>
>> >> >> >> >> >> Caches and cpus have different topology, this causes cpus_share_cache()
>> >> >> >> >> >> return the wrong value, which will affect the CPU load balance.
>> >> >> >> >> >>
>> >> >> >> >> >What does your current scheduler topology  look like?
>> >> >> >> >> >
>> >> >> >> >> >For CPU 0 to 3, do you have the below ?
>> >> >> >> >> >DIE [0     -     3] [4-7]
>> >> >> >> >> >MC  [0] [1] [2] [3]
>> >> >> >> >>
>> >> >> >> >> The current scheduler topology consistent with CPU topology:
>> >> >> >> >> DIE  [0-7]
>> >> >> >> >> MC  [0-3] [4-7]  (SD_SHARE_PKG_RESOURCES)
>> >> >> >> >> Most Android phones have this topology.
>> >> >> >> >> >
>> >> >> >> >> >But you would like something like below for cpu 0-1 instead ?
>> >> >> >> >> >DIE [0     -     3] [4-7]
>> >> >> >> >> >CLS [0 - 1] [2 - 3]
>> >> >> >> >> >MC  [0] [1]
>> >> >> >> >> >
>> >> >> >> >> >with SD_SHARE_PKG_RESOURCES only set to MC level ?
>> >> >> >> >>
>> >> >> >> >> We don't change the current scheduler topology, but the
>> >> >> >> >> cache topology should be separated like below:
>> >> >> >> >
>> >> >> >> >The scheduler topology is not only cpu topology but a mixed of cpu and
>> >> >> >> >cache/memory cache topology
>> >> >> >> >
>> >> >> >> >> [0-7]                          (shared level 3 cache )
>> >> >> >> >> [0-1] [2-3][4-5][6-7]   (shared level 2 cache )
>> >> >> >> >
>> >> >> >> >So you don't  bother the intermediate cluster level which is even simpler.
>> >> >> >> >you have to modify generic arch topology so that cpu_coregroup_mask
>> >> >> >> >returns the correct cpu mask directly.
>> >> >> >> >
>> >> >> >> >You will notice a llc_sibling field that is currently used by acpi but
>> >> >> >> >not DT to return llc cpu mask
>> >> >> >> >
>> >> >> >> cpu_topology[].llc_sibling describe the last level cache of whole system,
>> >> >> >> not in the sched_domain.
>> >> >> >>
>> >> >> >> in the above cache topology, llc_sibling is 0xff([0-7]) , it describes
>> >> >> >
>> >> >> >If llc_sibling was 0xff([0-7] on your system, you would have only one level:
>> >> >> >MC[0-7]
>> >> >>
>> >> >> Sorry, but I don't get it: why does llc_sibling being 0xff ([0-7]) mean MC[0-7]?
>> >> >> In our system (Android), llc_sibling is indeed 0xff ([0-7]), because they
>> >> >> share the llc (L3), but we also have two levels:
>> >> >> DIE [0-7]
>> >> >> MC [0-3][4-7]
>> >> >> It makes sense: [0-3] are little cores, [4-7] are big cores, so a se only migrates up
>> >> >> when misfit. We won't change it.
>> >> >>
>> >> >> >
>> >> >> >> the L3 cache sibling, but sd_llc_id describes the maximum shared cache
>> >> >> >> in sd, which should be [0-1] instead of [0-3].
>> >> >> >
>> >> >> >sd_llc_id describes the last sched_domain with SD_SHARE_PKG_RESOURCES.
>> >> >> >If you want llc to be [0-3] make sure that the
>> >> >> >sched_domain_topology_level array returns the correct cpumask with
>> >> >> >this flag
>> >> >>
>> >> >> Actually, we want sd_llc to be [0-1] [2-3], but if the MC domain doesn't have
>> >> >
>> >> >sd_llc_id refers to a scheduler domain but your patch breaks this so
>> >> >if you want a llc that reflects this topo:  [0-1] [2-3] you must
>> >> >provide a sched_domain level with this topo
>> >>
>> >> Maybe we should add a shared-cache level(SC), like what CLS does:
>> >>
>> >> DIE  [0-7] (shared level 3 cache, SD_SHARE_PKG_RESOURCES)
>> >> MC  [0-3] [4-7]  (not SD_SHARE_PKG_RESOURCES)
>> >> CLS  (if necessary)
>> >> SC    [0-1][2-3][4-5][6-7] (shared level 2 cache, SD_SHARE_PKG_RESOURCES)
>> >> SMT (if necessary)
>> >>
>> >> SC means a couple of CPUs which are placed closely by sharing
>> >> mid-level caches, but not enough to be a cluster.
>> >
>> >what you name SC above looks the same as CLS which should not be mixed
>> >with Arm cluster terminology
>>
>> Do you mean cluster is equal to shared cache instead of containing, SC just
>> means shared cache, but not form a cluster, a CLS can contain many SCs.
>
>CLS in the scheduler topology is not strictly tied to the "Arm
>cluster" but it's the generic name to describe an intermediate group
>of CPUs with common properties. CLS is also used by some intel
>platforms as an example. What I mean is that you can use the scheduler
>CLS level to describe what you call an Arm SC level.

It won't work, because cluster_sibling is assigned according to cluster_id, 
which is strictly tied to the "Arm cluster".
And if we have used CLS to describe the cluster sd, how do we describe the
shared-cache sd, like complex, which shares its own cache within a cluster?
>
>>
>> If as you said, SC looks the same as CLS, should we rename CLS to SC to
>> avoid confusion?
>
>CLS is a generic scheduler name and I don't think that we need to
>rename it to a Arm specific label

I still insist on adding a SC level within the CLS, because we may have 
already used CLS to describe the cluster sd; please consider it.

Thanks,
Wang

>
>Thanks,
>Vincent
>
>>
>> Thanks,
>> Wang
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2022-04-07  2:32 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-10 12:58 [PATCH] sched: topology: make cache topology separate from cpu topology Qing Wang
2022-03-10 12:58 ` Qing Wang
2022-03-10 14:18 ` Vincent Guittot
2022-03-10 14:18   ` Vincent Guittot
2022-03-11  2:03   ` 王擎
2022-03-11  2:03     ` 王擎
2022-03-11  7:59     ` Vincent Guittot
2022-03-11  7:59       ` Vincent Guittot
2022-03-11  8:18       ` 王擎
2022-03-11  8:18         ` 王擎
2022-03-11  8:25         ` Vincent Guittot
2022-03-11  8:25           ` Vincent Guittot
2022-03-11  9:30           ` 王擎
2022-03-11  9:30             ` 王擎
2022-03-11 11:14             ` Vincent Guittot
2022-03-11 11:14               ` Vincent Guittot
2022-03-15  1:58               ` 王擎
2022-03-15  1:58                 ` 王擎
2022-03-15 16:51                 ` Vincent Guittot
2022-03-15 16:51                   ` Vincent Guittot
2022-04-02  9:34                   ` 王擎
2022-04-02  9:34                     ` 王擎
2022-04-02 10:29                     ` Yicong Yang
2022-04-02 10:29                       ` Yicong Yang
2022-04-04 12:31                     ` Vincent Guittot
2022-04-04 12:31                       ` Vincent Guittot
2022-04-07  2:31                       ` 王擎
2022-04-07  2:31                         ` 王擎
2022-03-10 17:35 ` kernel test robot
2022-03-10 17:35   ` kernel test robot
2022-03-11 11:24 ` Peter Zijlstra
2022-03-11 11:24   ` Peter Zijlstra
2022-03-14  2:37   ` 王擎
2022-03-14  2:37     ` 王擎
2022-03-14 10:19     ` Peter Zijlstra
2022-03-14 10:19       ` Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.