* [PATCH v3 1/7] sched: limit cpu search in select_idle_cpu
From: subhra mazumdar @ 2019-06-09 1:49 UTC (permalink / raw)
To: linux-kernel
Cc: peterz, mingo, steven.sistare, dhaval.giani, daniel.lezcano,
vincent.guittot, viresh.kumar, tim.c.chen, mgorman
Put an upper and a lower limit on the cpu search in select_idle_cpu. The
lower limit is the number of cpus in a core while the upper limit is twice
that. This ensures that on any architecture we will usually search beyond a
core. The upper limit also keeps the search cost low and bounded.
Signed-off-by: subhra mazumdar <subhra.mazumdar@oracle.com>
---
kernel/sched/fair.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f..b58f08f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6188,7 +6188,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
- int cpu, nr = INT_MAX;
+ int cpu, limit, floor, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6206,10 +6206,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
if (sched_feat(SIS_PROP)) {
u64 span_avg = sd->span_weight * avg_idle;
- if (span_avg > 4*avg_cost)
+ floor = cpumask_weight(topology_sibling_cpumask(target));
+ if (floor < 2)
+ floor = 2;
+ limit = floor << 1;
+ if (span_avg > floor*avg_cost) {
nr = div_u64(span_avg, avg_cost);
- else
- nr = 4;
+ if (nr > limit)
+ nr = limit;
+ } else {
+ nr = floor;
+ }
}
time = local_clock();
--
2.9.3
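For illustration, the bounds above reduce to the following standalone sketch
(a userspace re-expression with hypothetical inputs, not the kernel code
itself; smt_weight stands in for
cpumask_weight(topology_sibling_cpumask(target))):

/* Sketch of the search-budget computation introduced by this hunk. */
static int compute_nr(unsigned int smt_weight, unsigned long long span_avg,
		      unsigned long long avg_cost)
{
	int floor = smt_weight < 2 ? 2 : smt_weight;	/* at least a core */
	int limit = floor << 1;				/* at most two cores */
	int nr;

	if (span_avg > floor * avg_cost) {
		nr = span_avg / avg_cost;	/* scale with average idleness */
		if (nr > limit)
			nr = limit;
	} else {
		nr = floor;
	}
	return nr;
}

On an SMT-2 part this keeps nr within [2, 4]; on SMT-8 hardware it would be
[8, 16].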
* [PATCH v3 2/7] sched: introduce per-cpu var next_cpu to track search limit
From: subhra mazumdar @ 2019-06-09 1:49 UTC (permalink / raw)
To: linux-kernel
Cc: peterz, mingo, steven.sistare, dhaval.giani, daniel.lezcano,
vincent.guittot, viresh.kumar, tim.c.chen, mgorman
Introduce a per-cpu variable to track the limit up to which the idle cpu
search was done in select_idle_cpu(). This lets the next search start from
there. It is necessary for rotating the search window over the entire LLC
domain.
Signed-off-by: subhra mazumdar <subhra.mazumdar@oracle.com>
---
kernel/sched/core.c | 2 ++
kernel/sched/sched.h | 1 +
2 files changed, 3 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 874c427..80657fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -24,6 +24,7 @@
#include <trace/events/sched.h>
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU_SHARED_ALIGNED(int, next_cpu);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
/*
@@ -5966,6 +5967,7 @@ void __init sched_init(void)
for_each_possible_cpu(i) {
struct rq *rq;
+ per_cpu(next_cpu, i) = -1;
rq = cpu_rq(i);
raw_spin_lock_init(&rq->lock);
rq->nr_running = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1a..4cecfa2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -994,6 +994,7 @@ static inline void update_idle_core(struct rq *rq) { }
#endif
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU_SHARED_ALIGNED(int, next_cpu);
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() this_cpu_ptr(&runqueues)
--
2.9.3
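A note on the flavor used above: DEFINE_PER_CPU_SHARED_ALIGNED places the
variable in the cacheline-aligned per-cpu section, the usual choice for
per-cpu data that is also written from other cpus (here, whichever cpu runs
select_idle_cpu updates next_cpu of the target), so that such writes do not
false-share with neighbouring per-cpu data. The consumer of the variable is
added in the next patch.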
* [PATCH v3 3/7] sched: rotate the cpu search window for better spread
From: subhra mazumdar @ 2019-06-09 1:49 UTC (permalink / raw)
To: linux-kernel
Cc: peterz, mingo, steven.sistare, dhaval.giani, daniel.lezcano,
vincent.guittot, viresh.kumar, tim.c.chen, mgorman
Rotate the cpu search window for a better spread of threads. This ensures
that an idle cpu is found quickly if one exists.
Signed-off-by: subhra mazumdar <subhra.mazumdar@oracle.com>
---
kernel/sched/fair.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b58f08f..c1ca88e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6188,7 +6188,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
- int cpu, limit, floor, nr = INT_MAX;
+ int cpu, limit, floor, target_tmp, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6219,9 +6219,15 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
}
}
+ if (per_cpu(next_cpu, target) != -1)
+ target_tmp = per_cpu(next_cpu, target);
+ else
+ target_tmp = target;
+
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target_tmp) {
+ per_cpu(next_cpu, target) = cpu;
if (!--nr)
return -1;
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
--
2.9.3
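A self-contained userspace simulation of the rotating window (hypothetical
8-cpu LLC and a search budget of 3; 'next' stands in for
per_cpu(next_cpu, target)):

#include <stdio.h>

#define NCPUS 8

int main(void)
{
	int next = -1;		/* per_cpu(next_cpu, target), -1 = unset */
	int target = 2, nr = 3;	/* hypothetical target cpu and budget */

	for (int wakeup = 0; wakeup < 3; wakeup++) {
		int cpu = (next == -1) ? target : next;

		printf("wakeup %d scans:", wakeup);
		for (int i = 0; i < nr; i++, cpu = (cpu + 1) % NCPUS) {
			next = cpu;	/* record progress, as the patch does */
			printf(" %d", cpu);
		}
		printf("\n");
	}
	return 0;
}

This prints "2 3 4", then "4 5 6", then "6 7 0": each search resumes where
the previous one stopped, so successive wakeups sweep the whole LLC domain
instead of repeatedly probing the cpus nearest to target.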
* [PATCH v3 4/7] sched: add sched feature to disable idle core search
From: subhra mazumdar @ 2019-06-09 1:49 UTC (permalink / raw)
To: linux-kernel
Cc: peterz, mingo, steven.sistare, dhaval.giani, daniel.lezcano,
vincent.guittot, viresh.kumar, tim.c.chen, mgorman
Add a new sched feature SIS_CORE to provide an option to disable the idle
core search (select_idle_core).
Signed-off-by: subhra mazumdar <subhra.mazumdar@oracle.com>
---
kernel/sched/features.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 858589b..de4d506 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
*/
SCHED_FEAT(SIS_AVG_CPU, false)
SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_CORE, true)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
--
2.9.3
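With CONFIG_SCHED_DEBUG enabled, the new bit can then be flipped at runtime
by writing SIS_CORE or NO_SIS_CORE to /sys/kernel/debug/sched_features, like
any other sched feature.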
* [PATCH v3 5/7] sched: SIS_CORE to disable idle core search
From: subhra mazumdar @ 2019-06-09 1:49 UTC (permalink / raw)
To: linux-kernel
Cc: peterz, mingo, steven.sistare, dhaval.giani, daniel.lezcano,
vincent.guittot, viresh.kumar, tim.c.chen, mgorman
Use SIS_CORE to disable the idle core search. For some workloads
select_idle_core becomes a scalability bottleneck, and removing it improves
throughput. There are also workloads where disabling it can hurt latency,
so an option is needed.
Signed-off-by: subhra mazumdar <subhra.mazumdar@oracle.com>
---
kernel/sched/fair.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c1ca88e..6a74808 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6280,9 +6280,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (!sd)
return target;
- i = select_idle_core(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ if (sched_feat(SIS_CORE)) {
+ i = select_idle_core(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+ }
i = select_idle_cpu(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
--
2.9.3
* [PATCH v3 6/7] x86/smpboot: introduce per-cpu variable for HT siblings
From: subhra mazumdar @ 2019-06-09 1:49 UTC (permalink / raw)
To: linux-kernel
Cc: peterz, mingo, steven.sistare, dhaval.giani, daniel.lezcano,
vincent.guittot, viresh.kumar, tim.c.chen, mgorman
Introduce a per-cpu variable to keep the number of HT siblings of a cpu.
This will be used for quick lookup in select_idle_cpu to determine the
limits of search. This patch does it only for x86.
Signed-off-by: subhra mazumdar <subhra.mazumdar@oracle.com>
---
arch/x86/include/asm/smp.h | 1 +
arch/x86/include/asm/topology.h | 1 +
arch/x86/kernel/smpboot.c | 17 ++++++++++++++++-
include/linux/topology.h | 4 ++++
4 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index da545df..1e90cbd 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -22,6 +22,7 @@ extern int smp_num_siblings;
extern unsigned int num_processors;
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
+DECLARE_PER_CPU_READ_MOSTLY(unsigned int, cpumask_weight_sibling);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
/* cpus sharing the last level cache: */
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 453cf38..dd19c71 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -111,6 +111,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
#ifdef CONFIG_SMP
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_weight(cpu) (per_cpu(cpumask_weight_sibling, cpu))
extern unsigned int __max_logical_packages;
#define topology_max_packages() (__max_logical_packages)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 362dd89..20bf676 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -85,6 +85,10 @@
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
+/* representing number of HT siblings of each CPU */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned int, cpumask_weight_sibling);
+EXPORT_PER_CPU_SYMBOL(cpumask_weight_sibling);
+
/* representing HT and core siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);
@@ -520,6 +524,8 @@ void set_cpu_sibling_map(int cpu)
if (!has_mp) {
cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
+ per_cpu(cpumask_weight_sibling, cpu) =
+ cpumask_weight(topology_sibling_cpumask(cpu));
cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
c->booted_cores = 1;
@@ -529,8 +535,12 @@ void set_cpu_sibling_map(int cpu)
for_each_cpu(i, cpu_sibling_setup_mask) {
o = &cpu_data(i);
- if ((i == cpu) || (has_smt && match_smt(c, o)))
+ if ((i == cpu) || (has_smt && match_smt(c, o))) {
link_mask(topology_sibling_cpumask, cpu, i);
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ per_cpu(cpumask_weight_sibling, cpu) = threads;
+ per_cpu(cpumask_weight_sibling, i) = threads;
+ }
if ((i == cpu) || (has_mp && match_llc(c, o)))
link_mask(cpu_llc_shared_mask, cpu, i);
@@ -1173,6 +1183,8 @@ static __init void disable_smp(void)
else
physid_set_mask_of_physid(0, &phys_cpu_present_map);
cpumask_set_cpu(0, topology_sibling_cpumask(0));
+ per_cpu(cpumask_weight_sibling, 0) =
+ cpumask_weight(topology_sibling_cpumask(0));
cpumask_set_cpu(0, topology_core_cpumask(0));
}
@@ -1482,6 +1494,8 @@ static void remove_siblinginfo(int cpu)
for_each_cpu(sibling, topology_core_cpumask(cpu)) {
cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
+ per_cpu(cpumask_weight_sibling, sibling) =
+ cpumask_weight(topology_sibling_cpumask(sibling));
/*
* last thread sibling in this cpu core going down
*/
@@ -1495,6 +1509,7 @@ static void remove_siblinginfo(int cpu)
cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
cpumask_clear(cpu_llc_shared_mask(cpu));
cpumask_clear(topology_sibling_cpumask(cpu));
+ per_cpu(cpumask_weight_sibling, cpu) = 0;
cpumask_clear(topology_core_cpumask(cpu));
c->cpu_core_id = 0;
c->booted_cores = 0;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index cb0775e..a85aea1 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -190,6 +190,10 @@ static inline int cpu_to_mem(int cpu)
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu) cpumask_of(cpu)
#endif
+#ifndef topology_sibling_weight
+#define topology_sibling_weight(cpu) \
+ cpumask_weight(topology_sibling_cpumask(cpu))
+#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu) cpumask_of(cpu)
#endif
--
2.9.3
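The cost being avoided is essentially a popcount loop over the sibling mask.
A simplified contrast (sketch only; the real cpumask_weight/hweight
implementation differs in detail):

/*
 * What cpumask_weight() boils down to: one popcount per mask word,
 * i.e. O(nr_cpu_ids / BITS_PER_LONG) work touching potentially
 * several cache lines. Assumes nbits is a multiple of the word size.
 */
static unsigned int mask_weight(const unsigned long *mask, unsigned int nbits)
{
	unsigned int i, w = 0;

	for (i = 0; i < nbits / (8 * sizeof(long)); i++)
		w += __builtin_popcountl(mask[i]);
	return w;
}

/* versus the precomputed value: a single O(1) per-cpu load */
/* unsigned int w = per_cpu(cpumask_weight_sibling, cpu); */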
* [PATCH v3 7/7] sched: use per-cpu variable cpumask_weight_sibling
From: subhra mazumdar @ 2019-06-09 1:49 UTC (permalink / raw)
To: linux-kernel
Cc: peterz, mingo, steven.sistare, dhaval.giani, daniel.lezcano,
vincent.guittot, viresh.kumar, tim.c.chen, mgorman
Use the per-cpu variable cpumask_weight_sibling for quick lookup in
select_idle_cpu. This is the fast path of the scheduler and every cycle is
worth saving; computing the weight with cpumask_weight would instead iterate
over the words of the sibling mask on every search.
Signed-off-by: subhra mazumdar <subhra.mazumdar@oracle.com>
---
kernel/sched/fair.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a74808..878f11c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6206,7 +6206,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
if (sched_feat(SIS_PROP)) {
u64 span_avg = sd->span_weight * avg_idle;
- floor = cpumask_weight(topology_sibling_cpumask(target));
+ floor = topology_sibling_weight(target);
if (floor < 2)
floor = 2;
limit = floor << 1;
--
2.9.3
* [PATCH v3 6/7] x86/smpboot: introduce per-cpu variable for HT siblings
From: subhra mazumdar @ 2019-06-27 1:29 UTC (permalink / raw)
To: linux-kernel
Cc: peterz, mingo, tglx, steven.sistare, dhaval.giani,
daniel.lezcano, vincent.guittot, viresh.kumar, tim.c.chen,
mgorman
Introduce a per-cpu variable to keep the number of HT siblings of a cpu.
This will be used for quick lookup in select_idle_cpu to determine the
limits of search. This patch does it only for x86.
Signed-off-by: subhra mazumdar <subhra.mazumdar@oracle.com>
---
arch/x86/include/asm/smp.h | 1 +
arch/x86/include/asm/topology.h | 1 +
arch/x86/kernel/smpboot.c | 17 ++++++++++++++++-
include/linux/topology.h | 4 ++++
4 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index da545df..1e90cbd 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -22,6 +22,7 @@ extern int smp_num_siblings;
extern unsigned int num_processors;
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
+DECLARE_PER_CPU_READ_MOSTLY(unsigned int, cpumask_weight_sibling);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
/* cpus sharing the last level cache: */
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 453cf38..dd19c71 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -111,6 +111,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
#ifdef CONFIG_SMP
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_weight(cpu) (per_cpu(cpumask_weight_sibling, cpu))
extern unsigned int __max_logical_packages;
#define topology_max_packages() (__max_logical_packages)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 362dd89..20bf676 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -85,6 +85,10 @@
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
+/* representing number of HT siblings of each CPU */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned int, cpumask_weight_sibling);
+EXPORT_PER_CPU_SYMBOL(cpumask_weight_sibling);
+
/* representing HT and core siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);
@@ -520,6 +524,8 @@ void set_cpu_sibling_map(int cpu)
if (!has_mp) {
cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
+ per_cpu(cpumask_weight_sibling, cpu) =
+ cpumask_weight(topology_sibling_cpumask(cpu));
cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
c->booted_cores = 1;
@@ -529,8 +535,12 @@ void set_cpu_sibling_map(int cpu)
for_each_cpu(i, cpu_sibling_setup_mask) {
o = &cpu_data(i);
- if ((i == cpu) || (has_smt && match_smt(c, o)))
+ if ((i == cpu) || (has_smt && match_smt(c, o))) {
link_mask(topology_sibling_cpumask, cpu, i);
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ per_cpu(cpumask_weight_sibling, cpu) = threads;
+ per_cpu(cpumask_weight_sibling, i) = threads;
+ }
if ((i == cpu) || (has_mp && match_llc(c, o)))
link_mask(cpu_llc_shared_mask, cpu, i);
@@ -1173,6 +1183,8 @@ static __init void disable_smp(void)
else
physid_set_mask_of_physid(0, &phys_cpu_present_map);
cpumask_set_cpu(0, topology_sibling_cpumask(0));
+ per_cpu(cpumask_weight_sibling, 0) =
+ cpumask_weight(topology_sibling_cpumask(0));
cpumask_set_cpu(0, topology_core_cpumask(0));
}
@@ -1482,6 +1494,8 @@ static void remove_siblinginfo(int cpu)
for_each_cpu(sibling, topology_core_cpumask(cpu)) {
cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
+ per_cpu(cpumask_weight_sibling, sibling) =
+ cpumask_weight(topology_sibling_cpumask(sibling));
/*
* last thread sibling in this cpu core going down
*/
@@ -1495,6 +1509,7 @@ static void remove_siblinginfo(int cpu)
cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
cpumask_clear(cpu_llc_shared_mask(cpu));
cpumask_clear(topology_sibling_cpumask(cpu));
+ per_cpu(cpumask_weight_sibling, cpu) = 0;
cpumask_clear(topology_core_cpumask(cpu));
c->cpu_core_id = 0;
c->booted_cores = 0;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index cb0775e..a85aea1 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -190,6 +190,10 @@ static inline int cpu_to_mem(int cpu)
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu) cpumask_of(cpu)
#endif
+#ifndef topology_sibling_weight
+#define topology_sibling_weight(cpu) \
+ cpumask_weight(topology_sibling_cpumask(cpu))
+#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu) cpumask_of(cpu)
#endif
--
2.9.3
* Re: [PATCH v3 6/7] x86/smpboot: introduce per-cpu variable for HT siblings
From: Thomas Gleixner @ 2019-06-27 6:51 UTC (permalink / raw)
To: subhra mazumdar
Cc: linux-kernel, peterz, mingo, steven.sistare, dhaval.giani,
daniel.lezcano, vincent.guittot, viresh.kumar, tim.c.chen,
mgorman
On Wed, 26 Jun 2019, subhra mazumdar wrote:
> Introduce a per-cpu variable to keep the number of HT siblings of a cpu.
> This will be used for quick lookup in select_idle_cpu to determine the
> limits of search.
Why? The number of siblings is constant at least today unless you play
silly cpu hotplug games. A bit more justification for adding yet another
random storage would be appreciated.
> This patch does it only for x86.
# grep 'This patch' Documentation/process/submitting-patches.rst
IOW, we all know already that this is a patch and from the subject prefix
and the diffstat it's pretty obvious that this is x86 only.
So instead of documenting the obvious, please add proper context to justify
the change.
> +/* representing number of HT siblings of each CPU */
> +DEFINE_PER_CPU_READ_MOSTLY(unsigned int, cpumask_weight_sibling);
> +EXPORT_PER_CPU_SYMBOL(cpumask_weight_sibling);
Why does this need an export? No module has any reason to access this.
> /* representing HT and core siblings of each logical CPU */
> DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
> EXPORT_PER_CPU_SYMBOL(cpu_core_map);
> @@ -520,6 +524,8 @@ void set_cpu_sibling_map(int cpu)
>
> if (!has_mp) {
> cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
> + per_cpu(cpumask_weight_sibling, cpu) =
> + cpumask_weight(topology_sibling_cpumask(cpu));
> cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
> cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
> c->booted_cores = 1;
> @@ -529,8 +535,12 @@ void set_cpu_sibling_map(int cpu)
> for_each_cpu(i, cpu_sibling_setup_mask) {
> o = &cpu_data(i);
>
> - if ((i == cpu) || (has_smt && match_smt(c, o)))
> + if ((i == cpu) || (has_smt && match_smt(c, o))) {
> link_mask(topology_sibling_cpumask, cpu, i);
> + threads = cpumask_weight(topology_sibling_cpumask(cpu));
> + per_cpu(cpumask_weight_sibling, cpu) = threads;
> + per_cpu(cpumask_weight_sibling, i) = threads;
This only works for SMT=2, but fails to update the rest for SMT=4.
> @@ -1482,6 +1494,8 @@ static void remove_siblinginfo(int cpu)
>
> for_each_cpu(sibling, topology_core_cpumask(cpu)) {
> cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
> + per_cpu(cpumask_weight_sibling, sibling) =
> + cpumask_weight(topology_sibling_cpumask(sibling));
While remove does the right thing.
Thanks,
tglx
* Re: [PATCH v3 6/7] x86/smpboot: introduce per-cpu variable for HT siblings
From: Thomas Gleixner @ 2019-06-27 6:54 UTC (permalink / raw)
To: subhra mazumdar
Cc: linux-kernel, peterz, mingo, steven.sistare, dhaval.giani,
daniel.lezcano, vincent.guittot, viresh.kumar, tim.c.chen,
mgorman
On Thu, 27 Jun 2019, Thomas Gleixner wrote:
> On Wed, 26 Jun 2019, subhra mazumdar wrote:
>
> > Introduce a per-cpu variable to keep the number of HT siblings of a cpu.
> > This will be used for quick lookup in select_idle_cpu to determine the
> > limits of search.
>
> Why? The number of siblings is constant at least today unless you play
> silly cpu hotplug games. A bit more justification for adding yet another
> random storage would be appreciated.
>
> > This patch does it only for x86.
>
> # grep 'This patch' Documentation/process/submitting-patches.rst
>
> IOW, we all know already that this is a patch and from the subject prefix
> and the diffstat it's pretty obvious that this is x86 only.
>
> So instead of documenting the obvious, please add proper context to justify
> the change.
Aside of that the right ordering is to introduce the default fallback in a
separate patch, which explains the reasoning and then in the next one add
the x86 optimized version.
Thanks,
tglx
* Re: [PATCH v3 6/7] x86/smpboot: introduce per-cpu variable for HT siblings
From: Subhra Mazumdar @ 2019-06-28 1:06 UTC (permalink / raw)
To: Thomas Gleixner
Cc: linux-kernel, peterz, mingo, steven.sistare, dhaval.giani,
daniel.lezcano, vincent.guittot, viresh.kumar, tim.c.chen,
mgorman
On 6/26/19 11:54 PM, Thomas Gleixner wrote:
> On Thu, 27 Jun 2019, Thomas Gleixner wrote:
>
>> On Wed, 26 Jun 2019, subhra mazumdar wrote:
>>
>>> Introduce a per-cpu variable to keep the number of HT siblings of a cpu.
>>> This will be used for quick lookup in select_idle_cpu to determine the
>>> limits of search.
>> Why? The number of siblings is constant at least today unless you play
>> silly cpu hotplug games. A bit more justification for adding yet another
>> random storage would be appreciated.
>>
>>> This patch does it only for x86.
>> # grep 'This patch' Documentation/process/submitting-patches.rst
>>
>> IOW, we all know already that this is a patch and from the subject prefix
>> and the diffstat it's pretty obvious that this is x86 only.
>>
>> So instead of documenting the obvious, please add proper context to justify
>> the change.
> Aside of that the right ordering is to introduce the default fallback in a
> separate patch, which explains the reasoning and then in the next one add
> the x86 optimized version.
OK. I will also add the extra optimization for other architectures.
Thanks,
Subhra
>
> Thanks,
>
> tglx
* Re: [PATCH v3 6/7] x86/smpboot: introduce per-cpu variable for HT siblings
From: Subhra Mazumdar @ 2019-06-28 1:02 UTC (permalink / raw)
To: Thomas Gleixner
Cc: linux-kernel, peterz, mingo, steven.sistare, dhaval.giani,
daniel.lezcano, vincent.guittot, viresh.kumar, tim.c.chen,
mgorman
On 6/26/19 11:51 PM, Thomas Gleixner wrote:
> On Wed, 26 Jun 2019, subhra mazumdar wrote:
>
>> Introduce a per-cpu variable to keep the number of HT siblings of a cpu.
>> This will be used for quick lookup in select_idle_cpu to determine the
>> limits of search.
> Why? The number of siblings is constant at least today unless you play
> silly cpu hotplug games. A bit more justification for adding yet another
> random storage would be appreciated.
Using cpumask_weight every time in select_idle_cpu to compute the number of
SMT siblings can be costly, as cpumask_weight may not be O(1) for systems
with a large number of CPUs (e.g. 8 sockets, each socket having many cores).
Beyond 512 CPUs the bitmask spans multiple cache lines (512 bits being
exactly one 64-byte line), and touching multiple cache lines in the fast
path of the scheduler can cost more than we save from this optimization.
Even within a single cache line it loops over longs. We want to touch O(1)
cache lines and do O(1) operations, hence pre-compute it in a per-CPU
variable.
>
>> This patch does it only for x86.
> # grep 'This patch' Documentation/process/submitting-patches.rst
>
> IOW, we all know already that this is a patch and from the subject prefix
> and the diffstat it's pretty obvious that this is x86 only.
>
> So instead of documenting the obvious, please add proper context to justify
> the change.
OK. The extra per-CPU optimization was done only for x86 as we cared about
it the most and wanted to make it future proof. I will add it for the other
architectures.
>
>> +/* representing number of HT siblings of each CPU */
>> +DEFINE_PER_CPU_READ_MOSTLY(unsigned int, cpumask_weight_sibling);
>> +EXPORT_PER_CPU_SYMBOL(cpumask_weight_sibling);
> Why does this need an export? No module has any reason to access this.
I will remove it
>
>> /* representing HT and core siblings of each logical CPU */
>> DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
>> EXPORT_PER_CPU_SYMBOL(cpu_core_map);
>> @@ -520,6 +524,8 @@ void set_cpu_sibling_map(int cpu)
>>
>> if (!has_mp) {
>> cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
>> + per_cpu(cpumask_weight_sibling, cpu) =
>> + cpumask_weight(topology_sibling_cpumask(cpu));
>> cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
>> cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
>> c->booted_cores = 1;
>> @@ -529,8 +535,12 @@ void set_cpu_sibling_map(int cpu)
>> for_each_cpu(i, cpu_sibling_setup_mask) {
>> o = &cpu_data(i);
>>
>> - if ((i == cpu) || (has_smt && match_smt(c, o)))
>> + if ((i == cpu) || (has_smt && match_smt(c, o))) {
>> link_mask(topology_sibling_cpumask, cpu, i);
>> + threads = cpumask_weight(topology_sibling_cpumask(cpu));
>> + per_cpu(cpumask_weight_sibling, cpu) = threads;
>> + per_cpu(cpumask_weight_sibling, i) = threads;
> This only works for SMT=2, but fails to update the rest for SMT=4.
I guess I assumed that x86 will always be SMT2; I will fix this.
Thanks,
Subhra
>
>> @@ -1482,6 +1494,8 @@ static void remove_siblinginfo(int cpu)
>>
>> for_each_cpu(sibling, topology_core_cpumask(cpu)) {
>> cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
>> + per_cpu(cpumask_weight_sibling, sibling) =
>> + cpumask_weight(topology_sibling_cpumask(sibling));
> While remove does the right thing.
>
> Thanks,
>
> tglx